diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake index 9ff908a4c87d55e87468a06ae0e6085ac165a1b1..a5d3d572181bcd7555d112961eac497e1195cfe3 100644 --- a/cmake/cuda.cmake +++ b/cmake/cuda.cmake @@ -174,15 +174,26 @@ if(NOT WITH_DSO) endif(WIN32) endif(NOT WITH_DSO) -get_filename_component(CUDA_LIB_PATH ${CUDA_curand_LIBRARY} DIRECTORY) -function(import_static_library alias path) +function(add_cuda_static_lib alias cuda_lib_paths file_name) + unset(ABS_PATH CACHE) + find_library(ABS_PATH NAMES ${file_name} PATHS ${${cuda_lib_paths}} NO_DEFAULT_PATH) add_library(${alias} STATIC IMPORTED GLOBAL) - set_property(TARGET ${alias} PROPERTY IMPORTED_LOCATION ${path}) + set_property(TARGET ${alias} PROPERTY IMPORTED_LOCATION ${ABS_PATH}) + set(CUDA_STATIC_MODULES ${CUDA_STATIC_MODULES} ${alias} PARENT_SCOPE) + if (NOT ABS_PATH) + message(FATAL_ERROR "Can not find CUDA static library: ${file_name}") + endif() endfunction() -import_static_library(cudart_static ${CUDA_LIB_PATH}/libcudart_static.a) -import_static_library(cublas_static ${CUDA_LIB_PATH}/libcublas_static.a) -import_static_library(curand_static ${CUDA_LIB_PATH}/libcurand_static.a) -import_static_library(culibos_static ${CUDA_LIB_PATH}/libculibos.a) + +add_cuda_static_lib(cudart_static CUDNN_CHECK_LIBRARY_DIRS libcudart_static.a) +add_cuda_static_lib(cublas_static CUDNN_CHECK_LIBRARY_DIRS libcublas_static.a) +add_cuda_static_lib(curand_static CUDNN_CHECK_LIBRARY_DIRS libcurand_static.a) +add_cuda_static_lib(culibos_static CUDNN_CHECK_LIBRARY_DIRS libculibos.a) +if(NOT ${CUDA_VERSION} LESS 10.1) + add_cuda_static_lib(cublasLt_static CUDNN_CHECK_LIBRARY_DIRS libcublasLt_static.a) +endif() + +set_property(GLOBAL PROPERTY CUDA_STATIC_MODULES cudnn_static ${CUDA_STATIC_MODULES}) # setting nvcc arch flags select_nvcc_arch_flags(NVCC_FLAGS_EXTRA) diff --git a/cmake/cudnn.cmake b/cmake/cudnn.cmake index 842b94d47e75b4bab577a1150cb3d198eb42ebaf..574baa86a82963ffa76795e029a6ba14f537c80a 100644 --- a/cmake/cudnn.cmake +++ b/cmake/cudnn.cmake @@ -26,13 +26,15 @@ list(APPEND CUDNN_CHECK_LIBRARY_DIRS ${CUDNN_ROOT}/lib64 ${CUDNN_ROOT}/lib ${CUDNN_ROOT}/lib/${TARGET_ARCH}-linux-gnu - ${CUDNN_ROOT}/local/cuda-${CUDA_VERSION}/targets/${TARGET_ARCH}-linux/lib/ + /usr/local/cuda-${CUDA_VERSION}/targets/${TARGET_ARCH}-linux/lib/ + /usr/lib/${TARGET_ARCH}-linux-gnu/ $ENV{CUDNN_ROOT} $ENV{CUDNN_ROOT}/lib64 $ENV{CUDNN_ROOT}/lib /usr/lib ${CUDA_TOOLKIT_ROOT_DIR} - ${CUDA_TOOLKIT_ROOT_DIR}/lib/x64 + ${CUDA_TOOLKIT_ROOT_DIR}/lib/x64 + ${CUDA_TOOLKIT_ROOT_DIR}/lib64 ) if((${CUDA_VERSION} GREATER 10.0) OR (${CUDA_VERSION} EQUAL 10.0)) diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake index bd0d117a633824d93c403b8167ff49505160069b..599e7bba7eaf12da7506ce44e706bd9f50ec6998 100644 --- a/cmake/external/eigen.cmake +++ b/cmake/external/eigen.cmake @@ -1,5 +1,6 @@ INCLUDE(ExternalProject) +SET(EIGEN_SOURCECODE_DIR ${CMAKE_SOURCE_DIR}/third-party/eigen3) SET(EIGEN_SOURCE_DIR ${THIRD_PARTY_PATH}/eigen3) SET(EIGEN_INCLUDE_DIR ${EIGEN_SOURCE_DIR}/src/extern_eigen3) INCLUDE_DIRECTORIES(${EIGEN_INCLUDE_DIR}) @@ -16,9 +17,12 @@ if(WITH_AMD_GPU) ExternalProject_Add( extern_eigen3 ${EXTERNAL_PROJECT_LOG_ARGS} - GIT_REPOSITORY "https://github.com/sabreshao/hipeigen.git" - GIT_TAG 7cb2b6e5a4b4a1efe658abb215cd866c6fb2275e + GIT_TAG + URL http://paddle-inference-dist.bj.bcebos.com/PaddleLite_ThirdParty%2Fhipeigen-upstream-702834151eaebcf955fd09ed0ad83c06.zip + DOWNLOAD_DIR ${EIGEN_SOURCECODE_DIR} + DOWNLOAD_NO_PROGRESS 1 PREFIX ${EIGEN_SOURCE_DIR} + DOWNLOAD_NAME 
"hipeigen-upstream-702834151eaebcf955fd09ed0ad83c06.zip" UPDATE_COMMAND "" CONFIGURE_COMMAND "" BUILD_COMMAND "" @@ -29,12 +33,14 @@ else() ExternalProject_Add( extern_eigen3 ${EXTERNAL_PROJECT_LOG_ARGS} - GIT_REPOSITORY "https://github.com/eigenteam/eigen-git-mirror" # eigen on cuda9.1 missing header of math_funtions.hpp # https://stackoverflow.com/questions/43113508/math-functions-hpp-not-found-when-using-cuda-with-eigen - GIT_TAG 917060c364181f33a735dc023818d5a54f60e54c + GIT_TAG + URL http://paddle-inference-dist.bj.bcebos.com/PaddleLite_ThirdParty%2Feigen-git-mirror-master-9ab917e9db99f5907d086aa73d5f9103.zip + DOWNLOAD_DIR ${EIGEN_SOURCECODE_DIR} + DOWNLOAD_NO_PROGRESS 1 PREFIX ${EIGEN_SOURCE_DIR} - DOWNLOAD_NAME "eigen" + DOWNLOAD_NAME "eigen-git-mirror-master-9ab917e9db99f5907d086aa73d5f9103.zip" UPDATE_COMMAND "" CONFIGURE_COMMAND "" BUILD_COMMAND "" diff --git a/cmake/external/xbyak.cmake b/cmake/external/xbyak.cmake index 1d61154c0d45dea795902d6544deb796693db263..5166b494c489e25c970c7dbfe72fa1404302009f 100644 --- a/cmake/external/xbyak.cmake +++ b/cmake/external/xbyak.cmake @@ -20,6 +20,7 @@ endif() include(ExternalProject) +SET(XBYAK_SOURCECODE_DIR ${CMAKE_SOURCE_DIR}/third-party/xbyak) set(XBYAK_PROJECT extern_xbyak) set(XBYAK_PREFIX_DIR ${THIRD_PARTY_PATH}/xbyak) set(XBYAK_INSTALL_ROOT ${THIRD_PARTY_PATH}/install/xbyak) @@ -38,8 +39,11 @@ ExternalProject_Add( ${XBYAK_PROJECT} ${EXTERNAL_PROJECT_LOG_ARGS} DEPENDS "" - GIT_REPOSITORY "https://github.com/herumi/xbyak.git" GIT_TAG "v5.661" # Jul 26th + URL http://paddle-inference-dist.bj.bcebos.com/PaddleLite_ThirdParty%2Fxbyak-5.66.zip + DOWNLOAD_DIR ${XBYAK_SOURCECODE_DIR} + DOWNLOAD_NAME "xbyak-5.66.zip" + DOWNLOAD_NO_PROGRESS 1 PREFIX ${XBYAK_PREFIX_DIR} UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${XBYAK_INSTALL_ROOT} diff --git a/cmake/external/xxhash.cmake b/cmake/external/xxhash.cmake index 23b1e02108642df561948a6faa3152effb7ca932..fdc20351e8bcdf5fe8e95db3516f4c6f607611db 100644 --- a/cmake/external/xxhash.cmake +++ b/cmake/external/xxhash.cmake @@ -1,5 +1,6 @@ INCLUDE(ExternalProject) +SET(XXHASH_SOURCECODE_DIR ${CMAKE_SOURCE_DIR}/third-party/xxhash) set(XXHASH_SOURCE_DIR ${THIRD_PARTY_PATH}/xxhash) set(XXHASH_INSTALL_DIR ${THIRD_PARTY_PATH}/install/xxhash) set(XXHASH_INCLUDE_DIR "${XXHASH_INSTALL_DIR}/include") @@ -18,10 +19,12 @@ if(WIN32) ExternalProject_Add( extern_xxhash ${EXTERNAL_PROJECT_LOG_ARGS} - GIT_REPOSITORY "https://github.com/Cyan4973/xxHash" GIT_TAG "v0.6.5" + URL http://paddle-inference-dist.bj.bcebos.com/PaddleLite_ThirdParty%2FxxHash-0.6.5.zip + DOWNLOAD_DIR ${XXHASH_SOURCECODE_DIR} + DOWNLOAD_NAME "xxHash-0.6.5.zip" + DOWNLOAD_NO_PROGRESS 1 PREFIX ${XXHASH_SOURCE_DIR} - DOWNLOAD_NAME "xxhash" UPDATE_COMMAND "" BUILD_IN_SOURCE 1 PATCH_COMMAND @@ -41,10 +44,12 @@ else() ExternalProject_Add( extern_xxhash ${EXTERNAL_PROJECT_LOG_ARGS} - GIT_REPOSITORY "https://github.com/Cyan4973/xxHash" GIT_TAG "v0.6.5" + URL http://paddle-inference-dist.bj.bcebos.com/PaddleLite_ThirdParty%2FxxHash-0.6.5.zip + DOWNLOAD_DIR ${XXHASH_SOURCECODE_DIR} + DOWNLOAD_NO_PROGRESS 1 PREFIX ${XXHASH_SOURCE_DIR} - DOWNLOAD_NAME "xxhash" + DOWNLOAD_NAME "xxHash-0.6.5.zip" UPDATE_COMMAND "" CONFIGURE_COMMAND "" BUILD_IN_SOURCE 1 diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 415eb451a986cd7e59829b9a8f2c744ecf464bd6..225a3c19a16435c4df6403ff7d1bdd01e628dd72 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -490,6 +490,9 @@ function(nv_binary TARGET_NAME) set(multiValueArgs SRCS DEPS) 
cmake_parse_arguments(nv_binary "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) cuda_add_executable(${TARGET_NAME} ${nv_binary_SRCS}) + target_link_libraries(${TARGET_NAME} ${CUDNN_LIBRARY} ${CUBLAS_LIBRARIES}) + get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) + target_link_libraries(${TARGET_NAME} ${os_dependency_modules}) if(nv_binary_DEPS) target_link_libraries(${TARGET_NAME} ${nv_binary_DEPS}) add_dependencies(${TARGET_NAME} ${nv_binary_DEPS}) @@ -507,7 +510,7 @@ function(nv_test TARGET_NAME) cuda_add_executable(${TARGET_NAME} ${nv_test_SRCS}) get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} lite_gtest_main gtest -gflags glog ${os_dependency_modules} ${CUDNN_LIBRARY} ${CUBLAS_LIBRARIES} ) + gflags glog ${os_dependency_modules} ${CUDNN_LIBRARY} ${CUBLAS_LIBRARIES} ) add_dependencies(${TARGET_NAME} ${nv_test_DEPS} lite_gtest_main gtest gflags glog) common_link(${TARGET_NAME}) add_test(${TARGET_NAME} ${TARGET_NAME}) diff --git a/cmake/lite.cmake b/cmake/lite.cmake index 4423e27e1af4d7bf0f0cc9e60858b8144fc3648d..3b9b4ece23266ce818e02c50ac2cd53c8771762a 100644 --- a/cmake/lite.cmake +++ b/cmake/lite.cmake @@ -164,7 +164,9 @@ function(lite_cc_library TARGET) endfunction() function(lite_cc_binary TARGET) - set(options "") + if ("${CMAKE_BUILD_TYPE}" STREQUAL "Debug") + set(options " -g ") + endif() set(oneValueArgs "") set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS) @@ -255,6 +257,7 @@ endfunction() set(arm_kernels CACHE INTERNAL "arm kernels") set(x86_kernels CACHE INTERNAL "x86 kernels") +set(cuda_kernels CACHE INTERNAL "cuda kernels") set(fpga_kernels CACHE INTERNAL "fpga kernels") set(npu_kernels CACHE INTERNAL "npu kernels") set(xpu_kernels CACHE INTERNAL "xpu kernels") diff --git a/lite/CMakeLists.txt b/lite/CMakeLists.txt index 173f04126eccbe0ec324c6e19ea8f21f278fd539..036df2a824c3b696c892cb7462f9afb4a3e2a10a 100644 --- a/lite/CMakeLists.txt +++ b/lite/CMakeLists.txt @@ -5,6 +5,7 @@ message(STATUS "LIGHT_FRAMEWORK:\t${LITE_WITH_LIGHT_WEIGHT_FRAMEWORK}") message(STATUS "LITE_WITH_CUDA:\t${LITE_WITH_CUDA}") message(STATUS "LITE_WITH_X86:\t${LITE_WITH_X86}") message(STATUS "LITE_WITH_ARM:\t${LITE_WITH_ARM}") +message(STATUS "LITE_WITH_OPENCL:\t${LITE_WITH_OPENCL}") message(STATUS "LITE_WITH_NPU:\t${LITE_WITH_NPU}") message(STATUS "LITE_WITH_XPU:\t${LITE_WITH_XPU}") message(STATUS "LITE_WITH_FPGA:\t${LITE_WITH_FPGA}") @@ -121,6 +122,9 @@ if (LITE_WITH_X86) add_dependencies(publish_inference_x86_cxx_demos paddle_full_api_shared eigen3) endif() +if(LITE_WITH_CUDA) + add_dependencies(publish_inference paddle_full_api_shared) +endif(LITE_WITH_CUDA) if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM) if (NOT LITE_ON_TINY_PUBLISH) # add cxx lib @@ -161,7 +165,7 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM) COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/include" COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/include" COMMAND cp "${CMAKE_BINARY_DIR}/libpaddle_api_light_bundled.a" "${INFER_LITE_PUBLISH_ROOT}/lib" - COMMAND cp "${CMAKE_SOURCE_DIR}/lite/utils/cv/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include" + COMMAND cp "${CMAKE_SOURCE_DIR}/lite/utils/cv/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/include" ) add_dependencies(tiny_publish_lib bundle_light_api) add_dependencies(publish_inference tiny_publish_lib) @@ -177,6 +181,8 @@ if 
(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM) ) add_dependencies(tiny_publish_cxx_lib paddle_light_api_shared) add_dependencies(publish_inference tiny_publish_cxx_lib) + add_custom_command(TARGET tiny_publish_cxx_lib POST_BUILD + COMMAND ${CMAKE_STRIP} "-s" ${INFER_LITE_PUBLISH_ROOT}/cxx/lib/libpaddle_light_api_shared.so) endif() endif() endif() @@ -199,7 +205,7 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM) endif() endif() - if ((ARM_TARGET_OS STREQUAL "android") AND (NOT LITE_WITH_OPENCL) AND + if ((ARM_TARGET_OS STREQUAL "android") AND ((ARM_TARGET_ARCH_ABI STREQUAL armv7) OR (ARM_TARGET_ARCH_ABI STREQUAL armv8))) if (NOT LITE_ON_TINY_PUBLISH) # copy @@ -214,6 +220,9 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM) COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_full/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_full/Makefile" COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mobile_light" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_light/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_light/Makefile" + COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mobile_detection" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" + COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_detection/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_detection/Makefile" + COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/include" ) add_dependencies(publish_inference_android_cxx_demos logging gflags) add_dependencies(publish_inference_cxx_lib publish_inference_android_cxx_demos) @@ -225,6 +234,9 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM) COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/README.md" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mobile_light" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_light/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_light/Makefile" + COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mobile_detection" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" + COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_detection/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_detection/Makefile" + ) add_dependencies(tiny_publish_cxx_lib publish_inference_android_cxx_demos) endif() diff --git a/lite/api/CMakeLists.txt b/lite/api/CMakeLists.txt index aef0fc396e6eb35a7ef85a8f2fc13651237e19a3..e660bbcdd606133db4e7891b6973f26983b4dd79 100644 --- a/lite/api/CMakeLists.txt +++ b/lite/api/CMakeLists.txt @@ -9,7 +9,7 @@ if (LITE_ON_TINY_PUBLISH) set(CMAKE_C_FLAGS_RELEASE "-Os -DNDEBUG") endif() set(light_lib_DEPS light_api paddle_api paddle_api_light optimizer) -if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_X86 OR ARM_TARGET_OS STREQUAL "android" OR ARM_TARGET_OS STREQUAL "armlinux")) +if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR ARM_TARGET_OS STREQUAL "android" OR ARM_TARGET_OS STREQUAL "armlinux")) #full api dynamic library add_library(paddle_full_api_shared SHARED "") target_sources(paddle_full_api_shared PUBLIC ${__lite_cc_files} paddle_api.cc light_api.cc cxx_api.cc cxx_api_impl.cc light_api_impl.cc) @@ -19,7 +19,9 @@ if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_X86 OR ARM_TARGET_OS STREQUAL "and 
add_dependencies(paddle_full_api_shared xxhash) target_link_libraries(paddle_full_api_shared xxhash) endif() - + if(LITE_WITH_CUDA) + target_link_libraries(paddle_full_api_shared ${math_cuda} "-Wl,--whole-archive" ${cuda_kernels} "-Wl,--no-whole-archive") + endif(LITE_WITH_CUDA) #light api dynamic library lite_cc_library(paddle_light_api_shared MODULE SRCS light_api_shared.cc @@ -65,6 +67,7 @@ endif() message(STATUS "get ops ${ops}") message(STATUS "get X86 kernels ${x86_kernels}") +message(STATUS "get CUDA kernels ${cuda_kernels}") message(STATUS "get Host kernels ${host_kernels}") message(STATUS "get ARM kernels ${arm_kernels}") message(STATUS "get NPU kernels ${npu_kernels}") @@ -83,18 +86,17 @@ if (NOT LITE_ON_TINY_PUBLISH) ARM_DEPS ${arm_kernels} NPU_DEPS ${npu_kernels} ${npu_bridges} npu_pass XPU_DEPS ${xpu_kernels} ${xpu_bridges} xpu_pass - CL_DEPS ${opencl_kenrels} - FPGA_DEPS ${fpga_kenrels} - BM_DEPS ${bm_kenrels}) + CL_DEPS ${opencl_kernels} + FPGA_DEPS ${fpga_kernels} + BM_DEPS ${bm_kernels}) endif() # for light api set(light_api_deps scope target_wrapper_host model_parser program) if(LITE_WITH_CUDA) + get_property(cuda_static_deps GLOBAL PROPERTY CUDA_STATIC_MODULES) set(light_api_deps ${light_api_deps} target_wrapper_cuda) - set(cuda_static_deps cudart_static cublas_static curand_static - cudnn_static culibos_static) endif() lite_cc_library(light_api SRCS light_api.cc DEPS scope target_wrapper_host model_parser @@ -104,9 +106,9 @@ lite_cc_library(light_api SRCS light_api.cc ARM_DEPS ${arm_kernels} NPU_DEPS ${npu_kernels} XPU_DEPS ${xpu_kernels} - CL_DEPS ${opencl_kenrels} - FPGA_DEPS ${fpga_kenrels} - BM_DEPS ${bm_kenrels}) + CL_DEPS ${opencl_kernels} + FPGA_DEPS ${fpga_kernels} + BM_DEPS ${bm_kernels}) include(ExternalProject) set(LITE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING @@ -305,9 +307,10 @@ if(NOT IOS) NPU_DEPS ${npu_kernels} XPU_DEPS ${xpu_kernels} CL_DEPS ${opencl_kernels} - FPGA_DEPS ${fpga_kernels} BM_DEPS ${bm_kernels} - X86_DEPS ${x86_kernels}) + FPGA_DEPS ${fpga_kernels} + X86_DEPS ${x86_kernels} + CUDA_DEPS ${cuda_kernels}) lite_cc_binary(benchmark_bin SRCS benchmark.cc DEPS paddle_api_full paddle_api_light gflags utils ${ops} ${host_kernels} ARM_DEPS ${arm_kernels} @@ -316,7 +319,9 @@ if(NOT IOS) CL_DEPS ${opencl_kernels} BM_DEPS ${bm_kernels} FPGA_DEPS ${fpga_kernels} - X86_DEPS ${x86_kernels}) + X86_DEPS ${x86_kernels} + CUDA_DEPS ${cuda_kernels}) + endif() #lite_cc_binary(cxx_api_bin SRCS cxx_api_bin.cc diff --git a/lite/api/benchmark.cc b/lite/api/benchmark.cc index 462a5e2381acf3cc86ca81002a282933f01ee049..c137324b576f9f9399669a5e68d948b9921e4866 100644 --- a/lite/api/benchmark.cc +++ b/lite/api/benchmark.cc @@ -44,9 +44,10 @@ void OutputOptModel(const std::string& load_model_dir, const std::vector<std::vector<int64_t>>& input_shapes) { lite_api::CxxConfig config; config.set_model_dir(load_model_dir); - std::vector<Place> vaild_places = {Place{TARGET(kARM), PRECISION(kFloat)}, - Place{TARGET(kX86), PRECISION(kFloat)}, - Place{TARGET(kOpenCL), PRECISION(kFloat)}}; + std::vector<Place> vaild_places = { + Place{TARGET(kARM), PRECISION(kFloat)}, + Place{TARGET(kX86), PRECISION(kFloat)}, + }; if (FLAGS_is_quantized_model) { vaild_places.insert(vaild_places.begin(), Place{TARGET(kARM), PRECISION(kInt8)}); diff --git a/lite/api/cxx_api.cc b/lite/api/cxx_api.cc index a2b538aa77e0603f439b6b23aab875103fdbbff0..4647f20bbe476d8763f94f707f3d88da7c7544df 100644 --- a/lite/api/cxx_api.cc +++ b/lite/api/cxx_api.cc @@ -24,13 +24,6 @@ namespace paddle { namespace lite { -static 
const char TAILORD_OPS_SOURCE_LIST_FILENAME[] = - ".tailored_ops_source_list"; -static const char TAILORD_OPS_LIST_NAME[] = ".tailored_ops_list"; -static const char TAILORD_KERNELS_SOURCE_LIST_FILENAME[] = - ".tailored_kernels_source_list"; -static const char TAILORD_KERNELS_LIST_NAME[] = ".tailored_kernels_list"; - void Predictor::SaveModel(const std::string &dir, lite_api::LiteModelType model_type, bool record_info) { @@ -140,21 +133,35 @@ lite::Tensor *Predictor::GetInput(size_t offset) { // get inputs names std::vector Predictor::GetInputNames() { return input_names_; } + // get outputnames std::vector Predictor::GetOutputNames() { return output_names_; } + // append the names of inputs and outputs into input_names_ and output_names_ void Predictor::PrepareFeedFetch() { + std::vector feeds; + std::vector fetchs; +#if defined(LITE_WITH_NPU) || defined(LITE_WITH_XPU) + // The shape of input tensors must be determined before generating NPU and XPU + // program. auto current_block = program_desc_.GetBlock(0); - std::vector feeds; - std::vector fetchs; for (size_t i = 0; i < current_block->OpsSize(); i++) { auto op = current_block->GetOp(i); +#else + if (!program_) { + GenRuntimeProgram(); + } + const auto &insts = program_->instructions(); + for (size_t i = 0; i < program_->num_instructions(); i++) { + const auto &op = insts[i].op()->op_info(); +#endif if (op->Type() == "feed") { feeds.push_back(op); } else if (op->Type() == "fetch") { fetchs.push_back(op); } } + input_names_.resize(feeds.size()); output_names_.resize(fetchs.size()); for (size_t i = 0; i < feeds.size(); i++) { @@ -190,6 +197,7 @@ std::vector Predictor::GetOutputs() const { const cpp::ProgramDesc &Predictor::program_desc() const { return program_desc_; } + const RuntimeProgram &Predictor::runtime_program() const { return *program_; } void Predictor::Build(const lite_api::CxxConfig &config, @@ -246,16 +254,18 @@ void Predictor::Build(const cpp::ProgramDesc &desc, const std::vector &valid_places, const std::vector &passes) { program_desc_ = desc; + // `inner_places` is used to optimize passes std::vector inner_places = valid_places; inner_places.emplace_back(TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny)); inner_places.emplace_back( TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW)); Program program(desc, scope_, inner_places); - /// The first place in valid_places is + core::KernelPickFactor factor; factor.ConsiderTarget(); factor.ConsiderPrecision(); factor.ConsiderDataLayout(); + optimizer_.Run(std::move(program), inner_places, factor, passes); exec_scope_ = optimizer_.exec_scope(); PrepareFeedFetch(); @@ -271,6 +281,7 @@ const lite::Tensor *Predictor::GetTensor(const std::string &name) const { auto *var = exec_scope_->FindVar(name); return &var->Get(); } + // get input by name lite::Tensor *Predictor::GetInputByName(const std::string &name) { auto element = std::find(input_names_.begin(), input_names_.end(), name); diff --git a/lite/api/cxx_api.h b/lite/api/cxx_api.h index 502ce812e1f4a7f520e89e6eaff020c5853f5308..504710d9fa29420b8762f31e0c675b59c6c626bd 100644 --- a/lite/api/cxx_api.h +++ b/lite/api/cxx_api.h @@ -29,6 +29,13 @@ namespace paddle { namespace lite { +static const char TAILORD_OPS_SOURCE_LIST_FILENAME[] = + ".tailored_ops_source_list"; +static const char TAILORD_OPS_LIST_NAME[] = ".tailored_ops_list"; +static const char TAILORD_KERNELS_SOURCE_LIST_FILENAME[] = + ".tailored_kernels_source_list"; +static const char TAILORD_KERNELS_LIST_NAME[] = ".tailored_kernels_list"; + /* * Predictor for inference, 
input a model, it will optimize and execute it. */ diff --git a/lite/api/mobilenetv1_test.cc b/lite/api/mobilenetv1_test.cc index 63a401745b325654f81c3af93402703395264c0d..79f9bea762e099b249f597dddb7df790361edc2a 100644 --- a/lite/api/mobilenetv1_test.cc +++ b/lite/api/mobilenetv1_test.cc @@ -123,8 +123,11 @@ TEST(MobileNetV1, test_arm) { #ifdef LITE_WITH_OPENCL TEST(MobileNetV1, test_opencl) { std::vector valid_places({ - Place{TARGET(kOpenCL), PRECISION(kFloat)}, - Place{TARGET(kARM), PRECISION(kFloat)}, + Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kNCHW)}, + Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kNHWC)}, + Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)}, + Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNHWC)}, + TARGET(kARM), // enable kARM CPU kernel when no opencl kernel }); TestModel(valid_places); diff --git a/lite/api/model_optimize_tool.cc b/lite/api/model_optimize_tool.cc index 1aef522b2a6bb95f895449469f3c13e4a713179a..1c426e8568cf71b6f48edbbeb8a93fec2e89c594 100644 --- a/lite/api/model_optimize_tool.cc +++ b/lite/api/model_optimize_tool.cc @@ -20,6 +20,7 @@ // model_optimize_tool's compiling period #include "all_kernel_faked.cc" // NOLINT #include "kernel_src_map.h" // NOLINT +#include "lite/api/cxx_api.h" #include "lite/api/paddle_api.h" #include "lite/api/paddle_use_ops.h" #include "lite/api/paddle_use_passes.h" @@ -31,6 +32,18 @@ DEFINE_string(model_dir, "", "path of the model. This option will be ignored if model_file " "and param_file are exist"); +DEFINE_string(model_filename, + "", + "model topo filename of the model in models set. This option" + " will be used to specific tailoring"); +DEFINE_string(param_filename, + "", + "model param filename of the model in models set. This option" + " will be used to specific tailoring"); +DEFINE_string(model_set_dir, + "", + "path of the models set. This option will be used to specific" + " tailoring"); DEFINE_string(model_file, "", "model file path of the combined-param model"); DEFINE_string(param_file, "", "param file path of the combined-param model"); DEFINE_string( @@ -58,29 +71,23 @@ void DisplayKernels() { LOG(INFO) << ::paddle::lite::KernelRegistry::Global().DebugString(); } -void Main() { - if (!FLAGS_model_file.empty() && !FLAGS_param_file.empty()) { - LOG(WARNING) - << "Load combined-param model. 
Option model_dir will be ignored"; - } - - if (FLAGS_display_kernels) { - DisplayKernels(); - exit(0); - } - - lite_api::CxxConfig config; - config.set_model_dir(FLAGS_model_dir); - config.set_model_file(FLAGS_model_file); - config.set_param_file(FLAGS_param_file); - +std::vector ParserValidPlaces() { std::vector valid_places; - auto target_reprs = lite::Split(FLAGS_valid_targets, " "); + auto target_reprs = lite::Split(FLAGS_valid_targets, ","); for (auto& target_repr : target_reprs) { if (target_repr == "arm") { valid_places.emplace_back(TARGET(kARM)); } else if (target_repr == "opencl") { - valid_places.emplace_back(TARGET(kOpenCL)); + valid_places.emplace_back( + Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kNCHW)}); + valid_places.emplace_back( + Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kNHWC)}); + valid_places.emplace_back( + Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)}); + valid_places.emplace_back( + Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNHWC)}); + valid_places.emplace_back( + TARGET(kARM)); // enable kARM CPU kernel when no opencl kernel } else if (target_repr == "x86") { valid_places.emplace_back(TARGET(kX86)); } else { @@ -100,26 +107,130 @@ void Main() { valid_places.insert(valid_places.begin(), Place{TARGET(kARM), PRECISION(kInt8)}); } + return valid_places; +} + +void RunOptimize(const std::string& model_dir, + const std::string& model_file, + const std::string& param_file, + const std::string& optimize_out, + const std::string& optimize_out_type, + const std::vector& valid_places, + bool record_tailoring_info) { + if (!model_file.empty() && !param_file.empty()) { + LOG(WARNING) + << "Load combined-param model. Option model_dir will be ignored"; + } + + lite_api::CxxConfig config; + config.set_model_dir(model_dir); + config.set_model_file(model_file); + config.set_param_file(param_file); + config.set_valid_places(valid_places); auto predictor = lite_api::CreatePaddlePredictor(config); LiteModelType model_type; - if (FLAGS_optimize_out_type == "protobuf") { + if (optimize_out_type == "protobuf") { model_type = LiteModelType::kProtobuf; - } else if (FLAGS_optimize_out_type == "naive_buffer") { + } else if (optimize_out_type == "naive_buffer") { model_type = LiteModelType::kNaiveBuffer; } else { - LOG(FATAL) << "Unsupported Model type :" << FLAGS_optimize_out_type; + LOG(FATAL) << "Unsupported Model type :" << optimize_out_type; } - OpKernelInfoCollector::Global().SetKernel2path(kernel2path_map); + OpKernelInfoCollector::Global().SetKernel2path(kernel2path_map); predictor->SaveOptimizedModel( - FLAGS_optimize_out, model_type, FLAGS_record_tailoring_info); - if (FLAGS_record_tailoring_info) { + optimize_out, model_type, record_tailoring_info); + if (record_tailoring_info) { LOG(INFO) << "Record the information of tailored model into :" - << FLAGS_optimize_out; + << optimize_out; + } +} + +void CollectModelMetaInfo(const std::string& output_dir, + const std::vector& models, + const std::string& filename) { + std::set total; + for (const auto& name : models) { + std::string model_path = + lite::Join({output_dir, name, filename}, "/"); + auto lines = lite::ReadLines(model_path); + total.insert(lines.begin(), lines.end()); + } + std::string output_path = + lite::Join({output_dir, filename}, "/"); + lite::WriteLines(std::vector(total.begin(), total.end()), + output_path); +} + +void Main() { + if (FLAGS_display_kernels) { + DisplayKernels(); + exit(0); } + + auto valid_places = ParserValidPlaces(); + if (FLAGS_model_set_dir == "") { + 
RunOptimize(FLAGS_model_dir, + FLAGS_model_file, + FLAGS_param_file, + FLAGS_optimize_out, + FLAGS_optimize_out_type, + valid_places, + FLAGS_record_tailoring_info); + return; + } + + if (!FLAGS_record_tailoring_info) { + LOG(WARNING) << "--model_set_dir option only be used with " + "--record_tailoring_info=true together"; + return; + } + + auto model_dirs = lite::ListDir(FLAGS_model_set_dir, true); + if (model_dirs.size() == 0) { + LOG(FATAL) << "[" << FLAGS_model_set_dir << "] does not contain any model"; + } + // Optimize models in FLAGS_model_set_dir + for (const auto& name : model_dirs) { + std::string input_model_dir = + lite::Join({FLAGS_model_set_dir, name}, "/"); + std::string output_model_dir = + lite::Join({FLAGS_optimize_out, name}, "/"); + + std::string model_file = ""; + std::string param_file = ""; + + if (FLAGS_model_filename != "" && FLAGS_param_filename != "") { + model_file = + lite::Join({input_model_dir, FLAGS_model_filename}, "/"); + param_file = + lite::Join({input_model_dir, FLAGS_param_filename}, "/"); + } + + LOG(INFO) << "Start optimize model: " << input_model_dir; + RunOptimize(input_model_dir, + model_file, + param_file, + output_model_dir, + FLAGS_optimize_out_type, + valid_places, + FLAGS_record_tailoring_info); + LOG(INFO) << "Optimize done. "; + } + + // Collect all models information + CollectModelMetaInfo( + FLAGS_optimize_out, model_dirs, lite::TAILORD_OPS_SOURCE_LIST_FILENAME); + CollectModelMetaInfo( + FLAGS_optimize_out, model_dirs, lite::TAILORD_OPS_LIST_NAME); + CollectModelMetaInfo(FLAGS_optimize_out, + model_dirs, + lite::TAILORD_KERNELS_SOURCE_LIST_FILENAME); + CollectModelMetaInfo( + FLAGS_optimize_out, model_dirs, lite::TAILORD_KERNELS_LIST_NAME); } } // namespace lite_api diff --git a/lite/api/model_test.cc b/lite/api/model_test.cc index 1358267000991c81b80453669cf46638449b8a7b..a04e86b7d2a1e06a52c38b5f00e9c07966be1bfe 100644 --- a/lite/api/model_test.cc +++ b/lite/api/model_test.cc @@ -21,14 +21,14 @@ #include "lite/api/paddle_use_passes.h" #include "lite/api/test_helper.h" #include "lite/core/device_info.h" -#include "lite/tests/utils/timer.h" +#include "lite/core/profile/timer.h" #include "lite/utils/cp_logging.h" #include "lite/utils/string.h" #ifdef LITE_WITH_PROFILE #include "lite/core/profile/basic_profiler.h" #endif // LITE_WITH_PROFILE -using paddle::lite::Timer; +using paddle::lite::profile::Timer; DEFINE_string(input_shape, "1,3,224,224", @@ -102,20 +102,20 @@ void Run(const std::vector>& input_shapes, Timer ti; for (int j = 0; j < repeat; ++j) { - ti.start(); + ti.Start(); predictor->Run(); - ti.end(); - LOG(INFO) << "iter: " << j << ", time: " << ti.latest_time() << " ms"; + float t = ti.Stop(); + LOG(INFO) << "iter: " << j << ", time: " << t << " ms"; } LOG(INFO) << "================== Speed Report ==================="; LOG(INFO) << "Model: " << model_dir << ", power_mode: " << static_cast(power_mode) << ", threads num " << thread_num << ", warmup: " << warmup_times - << ", repeats: " << repeat << ", avg time: " << ti.get_average_ms() + << ", repeats: " << repeat << ", avg time: " << ti.LapTimes().Avg() << " ms" - << ", min time: " << ti.get_min_time() << " ms" - << ", max time: " << ti.get_max_time() << " ms."; + << ", min time: " << ti.LapTimes().Min() << " ms" + << ", max time: " << ti.LapTimes().Max() << " ms."; auto output = predictor->GetOutput(0); auto out = output->data(); diff --git a/lite/api/paddle_api.cc b/lite/api/paddle_api.cc index f148096bb69a3a249521bcb847d5beae3f8297f9..aabb53529221bde53b6b2ee27b2efefee2e6054d 
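Note on the model_test.cc hunk above: the benchmark now times with the Timer from lite/core/profile/timer.h (namespace paddle::lite::profile), calling Start()/Stop() per iteration and reporting LapTimes().Avg()/Min()/Max(). The standalone sketch below reproduces that lap-timing pattern with std::chrono only, so the reporting logic can be followed outside the Lite tree; the LapTimer class here is written for this example and is not the Lite API.

```cpp
// Standalone illustration of the lap-timer pattern used by the updated
// benchmark loop (Start/Stop per iteration, then Avg/Min/Max reporting).
#include <algorithm>
#include <chrono>
#include <iostream>
#include <numeric>
#include <thread>
#include <vector>

class LapTimer {
 public:
  void Start() { begin_ = std::chrono::steady_clock::now(); }
  // Ends the current lap, records it, and returns its duration in ms.
  float Stop() {
    auto end = std::chrono::steady_clock::now();
    float ms = std::chrono::duration<float, std::milli>(end - begin_).count();
    laps_.push_back(ms);
    return ms;
  }
  float Avg() const {
    return laps_.empty() ? 0.f
                         : std::accumulate(laps_.begin(), laps_.end(), 0.f) /
                               laps_.size();
  }
  float Min() const {
    return laps_.empty() ? 0.f : *std::min_element(laps_.begin(), laps_.end());
  }
  float Max() const {
    return laps_.empty() ? 0.f : *std::max_element(laps_.begin(), laps_.end());
  }

 private:
  std::chrono::steady_clock::time_point begin_;
  std::vector<float> laps_;
};

int main() {
  LapTimer ti;
  for (int j = 0; j < 10; ++j) {
    ti.Start();
    std::this_thread::sleep_for(std::chrono::milliseconds(5));  // stand-in for predictor->Run()
    float t = ti.Stop();
    std::cout << "iter: " << j << ", time: " << t << " ms\n";
  }
  std::cout << "avg: " << ti.Avg() << ", min: " << ti.Min()
            << ", max: " << ti.Max() << " ms\n";
  return 0;
}
```

In the patched benchmark the equivalent calls are ti.Start(), ti.Stop(), and ti.LapTimes().Avg()/Min()/Max().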
100644 --- a/lite/api/paddle_api.cc +++ b/lite/api/paddle_api.cc @@ -93,7 +93,7 @@ void Tensor::CopyFromCpu(const T *src_data) { } } template -void Tensor::CopyToCpu(T *data) { +void Tensor::CopyToCpu(T *data) const { const T *src_data = tensor(raw_tensor_)->data(); int64_t num = tensor(raw_tensor_)->numel(); CHECK(num > 0) << "You should call Resize interface first"; @@ -121,12 +121,13 @@ template void Tensor::CopyFromCpu(const int *); template void Tensor::CopyFromCpu(const float *); template void Tensor::CopyFromCpu(const int8_t *); template void Tensor::CopyFromCpu(const int *); +template void Tensor::CopyFromCpu(const int64_t *); template void Tensor::CopyFromCpu(const float *); template void Tensor::CopyFromCpu(const int8_t *); -template void Tensor::CopyToCpu(int8_t *); -template void Tensor::CopyToCpu(float *); -template void Tensor::CopyToCpu(int *); +template void Tensor::CopyToCpu(int8_t *) const; +template void Tensor::CopyToCpu(float *) const; +template void Tensor::CopyToCpu(int *) const; shape_t Tensor::shape() const { return ctensor(raw_tensor_)->dims().Vectorize(); diff --git a/lite/api/paddle_api.h b/lite/api/paddle_api.h index 42b455da811fe1a21277d38f2e1237000276b1ff..c578769bd5159d27ad43e4e93de33f601223004b 100644 --- a/lite/api/paddle_api.h +++ b/lite/api/paddle_api.h @@ -49,7 +49,7 @@ struct LITE_API Tensor { void CopyFromCpu(const T* data); template - void CopyToCpu(T* data); + void CopyToCpu(T* data) const; /// Shape of the tensor. shape_t shape() const; TargetType target() const; diff --git a/lite/api/paddle_place.cc b/lite/api/paddle_place.cc index 3d7d496afbc55e1dfdfe83d123c7e41dd59bf1ff..894d839185ea9e1b6b47b87c398f249f044c2b51 100644 --- a/lite/api/paddle_place.cc +++ b/lite/api/paddle_place.cc @@ -55,8 +55,7 @@ const std::string& TargetToStr(TargetType target) { "any", "fpga", "npu", - "xpu", - "bm"}; + "xpu"}; auto x = static_cast(target); CHECK_LT(x, static_cast(TARGET(NUM))); return target2string[x]; @@ -94,8 +93,7 @@ const std::string& TargetRepr(TargetType target) { "kAny", "kFPGA", "kNPU", - "kXPU", - "kBM"}; + "kXPU"}; auto x = static_cast(target); CHECK_LT(x, static_cast(TARGET(NUM))); return target2string[x]; @@ -131,8 +129,7 @@ std::set ExpandValidTargets(TargetType target) { TARGET(kOpenCL), TARGET(kNPU), TARGET(kXPU), - TARGET(kFPGA), - TARGET(kBM)}); + TARGET(kFPGA)}); if (target == TARGET(kAny)) { return valid_set; } diff --git a/lite/api/paddle_place.h b/lite/api/paddle_place.h index a13abb699cea36ba53e430668e8dcd6d19d46d9e..07284be095c05e5dfa069b0973d5982cf1f07c8a 100644 --- a/lite/api/paddle_place.h +++ b/lite/api/paddle_place.h @@ -52,9 +52,8 @@ enum class TargetType : int { kFPGA = 7, kNPU = 8, kXPU = 9, - kBM = 10, kAny = 6, // any target - NUM = 11, // number of fields. + NUM = 10, // number of fields. 
}; enum class PrecisionType : int { kUnk = 0, diff --git a/lite/api/paddle_use_passes.h b/lite/api/paddle_use_passes.h index 70355fdf890eb63cd5bedd5bab42a2dd69af0927..9d56d262abf549584819ab893144e41fc399439f 100644 --- a/lite/api/paddle_use_passes.h +++ b/lite/api/paddle_use_passes.h @@ -20,7 +20,12 @@ USE_MIR_PASS(static_kernel_pick_pass); USE_MIR_PASS(variable_place_inference_pass); USE_MIR_PASS(type_target_cast_pass); USE_MIR_PASS(generate_program_pass); -USE_MIR_PASS(subgraph_program_pass); +#ifdef LITE_WITH_NPU +USE_MIR_PASS(generate_npu_program_pass); +#endif +#ifdef LITE_WITH_XPU +USE_MIR_PASS(generate_xpu_program_pass); +#endif USE_MIR_PASS(io_copy_kernel_pick_pass); USE_MIR_PASS(argument_type_display_pass); diff --git a/lite/api/test_step_rnn_lite_x86.cc b/lite/api/test_step_rnn_lite_x86.cc index c483373dc745f6520d51ece3936448ada71990d3..5314c5ed75d862635a1b87cdad33bf3c58dcd6cc 100644 --- a/lite/api/test_step_rnn_lite_x86.cc +++ b/lite/api/test_step_rnn_lite_x86.cc @@ -12,20 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - #include #include #include diff --git a/lite/backends/arm/math/CMakeLists.txt b/lite/backends/arm/math/CMakeLists.txt index cbbcf49a5fd55dabd6b072bc6b3b2e3f9bb91a13..076c791daab182c4eff477a621ecd2ec52a0c3e7 100644 --- a/lite/backends/arm/math/CMakeLists.txt +++ b/lite/backends/arm/math/CMakeLists.txt @@ -57,9 +57,10 @@ endif() if (NOT HAS_ARM_MATH_LIB_DIR) # TODO(xxx): seperate them and do not deps proto, eigen3 - cc_library(math_arm SRCS - funcs.cc + cc_library(math_arm SRCS + funcs.cc packed_sgemm.cc + packed_sgemm_c4.cc sgemm.cc gemm_prepacked_int8.cc gemm_s8.cc @@ -67,8 +68,10 @@ if (NOT HAS_ARM_MATH_LIB_DIR) gemv_arm_int8.cc conv3x3s1_direct_fp32.cc conv3x3s2_direct_fp32.cc - conv3x3s1_depthwise_fp32.cc - conv3x3s2_depthwise_fp32.cc + conv3x3s1p01_depthwise_fp32.cc + conv3x3s2p01_depthwise_fp32.cc + conv3x3s1px_depthwise_fp32.cc + conv3x3s2px_depthwise_fp32.cc conv3x3s1_direct_int8.cc conv3x3s2_direct_int8.cc conv3x3s1_depthwise_int8.cc @@ -76,16 +79,14 @@ if (NOT HAS_ARM_MATH_LIB_DIR) conv5x5s1_depthwise_int8.cc conv5x5s1_depthwise_fp32.cc conv5x5s2_depthwise_fp32.cc - conv_depthwise_3x3p0.cc - conv_depthwise_3x3p1.cc - conv_depthwise_3x3s1.cc - conv_depthwise_3x3s2.cc + conv3x3_winograd_fp32_c4.cc conv_winograd_3x3.cc conv_impl.cc - softmax.cc + softmax.cc scale.cc pooling.cc elementwise.cc + layout.cc lrn.cc decode_bboxes.cc concat.cc @@ -121,4 +122,3 @@ if (NOT HAS_ARM_MATH_LIB_DIR) anchor_generator.cc DEPS ${lite_kernel_deps} context tensor) endif() - diff --git a/lite/backends/arm/math/col_im_transform.cc b/lite/backends/arm/math/col_im_transform.cc index b5d2c6af13cc1dd864eaac6cb6589cc879f029fe..38be1d689dd47ab59baf417e40989a91bb6366e0 100644 --- a/lite/backends/arm/math/col_im_transform.cc +++ b/lite/backends/arm/math/col_im_transform.cc @@ -32,8 +32,10 @@ void 
col2im(const float* data_col, const int width, const int kernel_h, const int kernel_w, - const int pad_h, - const int pad_w, + const int pad_h0, + const int pad_h1, + const int pad_w0, + const int pad_w1, const int stride_h, const int stride_w, const int dilation_h, @@ -41,19 +43,22 @@ void col2im(const float* data_col, float* data_im) { memset(data_im, 0, height * width * channels * sizeof(float)); const int output_h = - (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; + (height + pad_h0 + pad_h1 - (dilation_h * (kernel_h - 1) + 1)) / + stride_h + + 1; const int output_w = - (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; + (width + pad_w0 + pad_w1 - (dilation_w * (kernel_w - 1) + 1)) / stride_w + + 1; const int channel_size = height * width; for (int channel = channels; channel--; data_im += channel_size) { for (int kernel_row = 0; kernel_row < kernel_h; kernel_row++) { for (int kernel_col = 0; kernel_col < kernel_w; kernel_col++) { - int input_row = -pad_h + kernel_row * dilation_h; + int input_row = -pad_h0 + kernel_row * dilation_h; for (int output_rows = output_h; output_rows; output_rows--) { if (!is_a_ge_zero_and_a_lt_b(input_row, height)) { data_col += output_w; } else { - int input_col = -pad_w + kernel_col * dilation_w; + int input_col = -pad_w0 + kernel_col * dilation_w; for (int output_col = output_w; output_col; output_col--) { if (is_a_ge_zero_and_a_lt_b(input_col, width)) { data_im[input_row * width + input_col] += *data_col; diff --git a/lite/backends/arm/math/col_im_transform.h b/lite/backends/arm/math/col_im_transform.h index 8560679d7f4091c4cb424b54e54a42cf6e7e8905..e3e32c4715ade10972f77e0c4d5a2cd4d16b4725 100644 --- a/lite/backends/arm/math/col_im_transform.h +++ b/lite/backends/arm/math/col_im_transform.h @@ -26,8 +26,10 @@ void col2im(const Dtype* data_col, const int width, const int kernel_h, const int kernel_w, - const int pad_h, - const int pad_w, + const int pad_h0, + const int pad_h1, + const int pad_w0, + const int pad_w1, const int stride_h, const int stride_w, const int dilation_h, diff --git a/lite/backends/arm/math/conv3x3_winograd_fp32_c4.cc b/lite/backends/arm/math/conv3x3_winograd_fp32_c4.cc new file mode 100644 index 0000000000000000000000000000000000000000..5834461b8fe0b2d37f174d5f66269fb58f2504a1 --- /dev/null +++ b/lite/backends/arm/math/conv3x3_winograd_fp32_c4.cc @@ -0,0 +1,564 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
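The new file conv3x3_winograd_fp32_c4.cc, which continues below, implements a Winograd F(6x6, 3x3) convolution on channel-blocked (c4) data: each 8x8 input tile is transformed (input_trans_c4), the 64 per-position products are evaluated as small GEMMs (the gi loop over 64 calling sgemm_prepack_c4_small), and a 6x6 output tile is recovered (output_trans_c4 / output_trans_c4_post), which is why tiles are counted as (wout + 5) / 6. For reference, the underlying transform, assuming the standard Winograd formulation (the fixed matrices B, G, A are the usual F(6,3) ones and are not spelled out in the patch):

```latex
% Winograd F(6x6, 3x3): d is an 8x8 input tile, g a 3x3 kernel, Y the 6x6 output tile.
% B^T (8x8), G (8x3) and A^T (6x8) are fixed transform matrices; \odot is the
% element-wise (Hadamard) product. Batched over channels and tiles, the 8x8 = 64
% pointwise products become the 64 small GEMMs in the gi loop.
Y = A^{\top}\left[\,\bigl(G\,g\,G^{\top}\bigr)\odot\bigl(B^{\top} d\,B\bigr)\right]A
```

The coeff table in weight_trans_c4 plays the role of G here, applied to every 3x3 kernel ahead of time so only the input and output transforms run per tile.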
+ +#include "lite/backends/arm/math/conv_block_utils.h" +#include "lite/backends/arm/math/conv_impl.h" +#include "lite/backends/arm/math/packed_sgemm_c4.h" +#ifdef ARM_WITH_OMP +#include +#endif +#include + +namespace paddle { +namespace lite { +namespace arm { +namespace math { +void input_trans_c4(const float* src, + int src_stride, + float* dest, + int dest_stride); +void output_trans_c4(const float* src, + int src_stride, + float* dest, + int dest_stride); +void output_trans_c4_post(const float* src, + int src_stride, + float* dest, + int dest_stride, + float* bias_value, + bool has_relu); +void weight_trans_c4( + float* dest, const float* src, int ic, int oc, void* workspace); + +/* +*The following function conv_compute_6x6_3x3 is base on +*MNN[https://github.com/alibaba/MNN] +* +*Copyright © 2018, Alibaba Group Holding Limited +*/ +void conv_compute_6x6_3x3(const float* input, + float* output, + int num, + int chout, + int hout, + int wout, + int chin, + int hin, + int win, + const float* weight, + const float* bias, + const operators::ConvParam& param, + ARMContext* ctx) { + const int pad_h = (*param.paddings)[0]; + const int pad_w = (*param.paddings)[2]; + float* tmp_work_space = + ctx->workspace_data() + ctx->llc_size() / sizeof(float); + + int in_n_stride = chin * hin * win; + int out_n_stride = chout * hout * wout; + int ic_stride = win * hin; + int oc_stride = wout * hout; + int ic_4 = (chin + 3) / 4; + int oc_4 = (chout + 3) / 4; + + int tile_w = (wout + 5) / 6; + int tile_h = (hout + 5) / 6; + int size_tile = tile_h * tile_w; + float zero_ptr[8]; + memset(zero_ptr, 0, 8 * sizeof(float)); + + int w_pad = win + pad_w * 2; + int h_pad = hin + pad_h * 2; + float* input_c4 = tmp_work_space; + int new_h_stride = w_pad * 4; + int new_c_stride = new_h_stride * h_pad; + + int ic_4_stride = w_pad * h_pad * 4; + int oc_4_stride = wout * hout * 4; + + int tile_block = 8; +#ifdef __aarch64__ + tile_block = 16; +#endif + int block_count = (size_tile + tile_block - 1) / tile_block; + + int threads = ctx->threads(); + float* g_tmp_data = tmp_work_space + ic_4 * new_c_stride; + int tmp_data_thread_stride = tile_block * (oc_4 + ic_4) * 256; + memset(g_tmp_data, 0, threads * tmp_data_thread_stride * sizeof(float)); + float* g_trans_tmp_data = g_tmp_data + threads * tmp_data_thread_stride; + float* g_trans_remain_tmp_data = g_trans_tmp_data + threads * 256; + + // begin compute + for (int ni = 0; ni < num; ++ni) { + // trans input to c4 + for (int i = 0; i < ic_4; ++i) { + prepack_input_nxwc4_dw(input + ni * in_n_stride, + input_c4 + i * new_c_stride, + i * 4, + -pad_h, + hin + pad_h, + -pad_w, + win + pad_w, + chin, + win, + hin, + zero_ptr); + } + float* output_ptr = output + ni * out_n_stride; + + const float* weight_ptr = weight; + const float* bias_ptr = bias; +#pragma omp parallel for num_threads(threads) + for (int tbi = 0; tbi < block_count; ++tbi) { +#ifdef ARM_WITH_OMP + float* tmp_data = + g_tmp_data + omp_get_thread_num() * tmp_data_thread_stride; + float* trans_tmp_data = g_trans_tmp_data + omp_get_thread_num() * 256; + float* trans_remain_tmp_data = + g_trans_remain_tmp_data + omp_get_thread_num() * 256; +#else + float* tmp_data = g_tmp_data; + float* trans_tmp_data = g_trans_tmp_data; + float* trans_remain_tmp_data = g_trans_remain_tmp_data; +#endif + int tile_index = tbi * tile_block; + int tile_remain = size_tile - tile_index; + int tile_count = tile_remain > tile_block ? 
tile_block : tile_remain; + + // input trans + int c_gi_stride = tile_count * oc_4 * 4; + int b_gi_stride = tile_count * ic_4 * 4; + //* + for (int ti = 0; ti < tile_count; ++ti) { + int index = tile_index + ti; + + int tw_index = index % tile_w; + int th_index = index / tile_w; + + int src_x = tw_index * 6; + int src_y = th_index * 6; + int ex = src_x + 8 > w_pad ? w_pad - src_x : 8; + int ey = src_y + 8 > h_pad ? h_pad - src_y : 8; + + float* dst_ptr = tmp_data + ti * 4; + const float* src_ptr = input_c4 + (src_y * w_pad + src_x) * 4; + + if (ex == 8 && ey == 8) { + // trans input + for (int ci = 0; ci < ic_4; ++ci) { + const float* src_ci = src_ptr + ci * ic_4_stride; + for (int i = 0; i < 8; ++i) { + const float* ci_ptr = src_ci + i * w_pad * 4; + input_trans_c4(ci_ptr, 4, trans_tmp_data + i * 4, 32); + } + float* dst_ci = dst_ptr + ci * tile_count * 4; + for (int i = 0; i < 8; ++i) { + input_trans_c4(trans_tmp_data + i * 32, + 4, + dst_ci + i * b_gi_stride * 8, + b_gi_stride); + } + } + } else { + // trans remain input + int x_size = ex; + for (int ci = 0; ci < ic_4; ++ci) { + const float* src_ci = src_ptr + ci * ic_4_stride; + // pad + memset(trans_remain_tmp_data, 0, 256 * sizeof(float)); + if (x_size > 0) { + for (int yi = 0; yi < ey; ++yi) { + float* dst_yi = trans_remain_tmp_data + yi * 32; + const float* src_yi = src_ci + w_pad * yi * 4; + memcpy(dst_yi, src_yi, x_size * sizeof(float) * 4); + } + } + + // trans + for (int i = 0; i < 8; ++i) { + float* ci_ptr = trans_remain_tmp_data + i * 32; + input_trans_c4(ci_ptr, 4, trans_tmp_data + i * 4, 32); + } + float* dst_ci = dst_ptr + ci * tile_count * 4; + for (int i = 0; i < 8; ++i) { + input_trans_c4(trans_tmp_data + i * 32, + 4, + dst_ci + i * b_gi_stride * 8, + b_gi_stride); + } + } // for ci_4 + } + } + //*/ + // input trans end + // *begin compute dot + // * + //* + float* dst_temp_data = tmp_data + tile_block * ic_4 * 256; + float* b_ptr = tmp_data; + int w_gi_stride = ic_4 * oc_4 * 16; + for (int gi = 0; gi < 64; ++gi) { + float* origin_C = dst_temp_data + gi * c_gi_stride; + float* origin_B = b_ptr + gi * b_gi_stride; + const float* origin_A = weight + gi * w_gi_stride; + sgemm_prepack_c4_small(oc_4 * 4, + tile_count, + ic_4 * 4, + origin_A, + origin_B, + origin_C, + nullptr, + false, + false, + ctx); + } + //*/ + //* + // output trans + float bias_value[4]; + memset(bias_value, 0, 4 * sizeof(float)); + + for (int ti = 0; ti < tile_count; ++ti) { + int index = tile_index + ti; + + int tw_index = index % tile_w; + int th_index = index / tile_w; + + int dst_x = tw_index * 6; + int dst_y = th_index * 6; + + int ex = dst_x + 6 > wout ? wout - dst_x : 6; + int ey = dst_y + 6 > hout ? 
hout - dst_y : 6; + + float* dst_ptr = output + (dst_y * wout + dst_x) * 4; + float* src_ptr = dst_temp_data + ti * 4; + + if (ex == 6) { + // trans output + for (int ci = 0; ci < oc_4; ++ci) { + if (param.bias) { + bias_value[0] = bias[ci * 4]; + bias_value[1] = bias[ci * 4 + 1]; + bias_value[2] = bias[ci * 4 + 2]; + bias_value[3] = bias[ci * 4 + 3]; + } + + float* dst_ci = dst_ptr + ci * oc_4_stride; + float* src_ci = src_ptr + ci * tile_count * 4; + for (int i = 0; i < 8; ++i) { + output_trans_c4(src_ci + i * c_gi_stride * 8, + c_gi_stride, + trans_tmp_data + i * 4, + 32); + } + for (int i = 0; i < ey; ++i) { + output_trans_c4_post(trans_tmp_data + i * 32, + 4, + trans_remain_tmp_data + i * 24, + 4, + bias_value, + param.fuse_relu); + } + write_to_output_c4_fp32(trans_remain_tmp_data, + output_ptr, + ci * 4, + ci * 4 + 4, + dst_y, + dst_y + ey, + dst_x, + dst_x + ex, + chout, + hout, + wout, + false, + zero_ptr); + } + } else { + for (int ci = 0; ci < oc_4; ++ci) { + if (param.bias) { + bias_value[0] = bias[ci * 4]; + bias_value[1] = bias[ci * 4 + 1]; + bias_value[2] = bias[ci * 4 + 2]; + bias_value[3] = bias[ci * 4 + 3]; + } + // trans output + float* dst_ci = dst_ptr + ci * oc_4_stride; + float* src_ci = src_ptr + ci * tile_count * 4; + for (int i = 0; i < 8; ++i) { + output_trans_c4(src_ci + i * c_gi_stride * 8, + c_gi_stride, + trans_tmp_data + i * 4, + 32); + } + for (int i = 0; i < ey; ++i) { + output_trans_c4_post(trans_tmp_data + i * 32, + 4, + trans_remain_tmp_data + i * 24, + 4, + bias_value, + param.fuse_relu); + } + // copy to dest + memset(trans_tmp_data, 0, 144 * sizeof(float)); + for (int i = 0; i < ey; ++i) { + memcpy(trans_tmp_data + i * ex * 4, + trans_remain_tmp_data + i * 24, + ex * sizeof(float) * 4); + } + write_to_output_c4_fp32(trans_tmp_data, + output_ptr, + ci * 4, + ci * 4 + 4, + dst_y, + dst_y + ey, + dst_x, + dst_x + ex, + chout, + hout, + wout, + false, + zero_ptr); + } + } + } + //*/ + } // for block_count + } // for num +} // conv_compute + +void output_trans_c4(const float* src, + int src_stride, + float* dest, + int dest_stride) { + const float32x4_t src0 = vld1q_f32(src); + const float32x4_t src1 = vld1q_f32(src + src_stride); + const float32x4_t src2 = vld1q_f32(src + src_stride * 2); + const float32x4_t src3 = vld1q_f32(src + src_stride * 3); + const float32x4_t src4 = vld1q_f32(src + src_stride * 4); + const float32x4_t src5 = vld1q_f32(src + src_stride * 5); + const float32x4_t src6 = vld1q_f32(src + src_stride * 6); + const float32x4_t src7 = vld1q_f32(src + src_stride * 7); + + float32x4_t tmp024a = vaddq_f32(src1, src2); + float32x4_t tmp135a = vsubq_f32(src1, src2); + float32x4_t tmp024b = vaddq_f32(src3, src4); + float32x4_t tmp135b = vsubq_f32(src3, src4); + float32x4_t tmp024c = vaddq_f32(src5, src6); + float32x4_t tmp135c = vsubq_f32(src5, src6); + + float32x4_t dest0 = + vaddq_f32(vaddq_f32(vaddq_f32(src0, tmp024a), tmp024b), tmp024c); + float32x4_t dest2 = vaddq_f32(vaddq_f32(tmp024a, vmulq_n_f32(tmp024b, 4)), + vmulq_n_f32(tmp024c, 0.25f)); + float32x4_t dest4 = vaddq_f32(vaddq_f32(tmp024a, vmulq_n_f32(tmp024b, 16)), + vmulq_n_f32(tmp024c, 0.0625f)); + + float32x4_t dest1 = vaddq_f32(vaddq_f32(tmp135a, vmulq_n_f32(tmp135b, 2)), + vmulq_n_f32(tmp135c, 0.5f)); + float32x4_t dest3 = vaddq_f32(vaddq_f32(tmp135a, vmulq_n_f32(tmp135b, 8)), + vmulq_n_f32(tmp135c, 0.125f)); + float32x4_t dest5 = + vaddq_f32(src7, + vaddq_f32(vaddq_f32(tmp135a, vmulq_n_f32(tmp135b, 32)), + vmulq_n_f32(tmp135c, 0.03125f))); + + vst1q_f32(dest, dest0); + 
vst1q_f32(dest + dest_stride, dest1); + vst1q_f32(dest + dest_stride * 2, dest2); + vst1q_f32(dest + dest_stride * 3, dest3); + vst1q_f32(dest + dest_stride * 4, dest4); + vst1q_f32(dest + dest_stride * 5, dest5); +} +void output_trans_c4_post(const float* src, + int src_stride, + float* dest, + int dest_stride, + float* bias_value, + bool has_relu = false) { + const float32x4_t src0 = vld1q_f32(src); + const float32x4_t src1 = vld1q_f32(src + src_stride); + const float32x4_t src2 = vld1q_f32(src + src_stride * 2); + const float32x4_t src3 = vld1q_f32(src + src_stride * 3); + const float32x4_t src4 = vld1q_f32(src + src_stride * 4); + const float32x4_t src5 = vld1q_f32(src + src_stride * 5); + const float32x4_t src6 = vld1q_f32(src + src_stride * 6); + const float32x4_t src7 = vld1q_f32(src + src_stride * 7); + + float32x4_t tmp024a = vaddq_f32(src1, src2); + float32x4_t tmp135a = vsubq_f32(src1, src2); + float32x4_t tmp024b = vaddq_f32(src3, src4); + float32x4_t tmp135b = vsubq_f32(src3, src4); + float32x4_t tmp024c = vaddq_f32(src5, src6); + float32x4_t tmp135c = vsubq_f32(src5, src6); + + float32x4_t dest0 = + vaddq_f32(vaddq_f32(vaddq_f32(src0, tmp024a), tmp024b), tmp024c); + float32x4_t dest2 = vaddq_f32(vaddq_f32(tmp024a, vmulq_n_f32(tmp024b, 4)), + vmulq_n_f32(tmp024c, 0.25f)); + float32x4_t dest4 = vaddq_f32(vaddq_f32(tmp024a, vmulq_n_f32(tmp024b, 16)), + vmulq_n_f32(tmp024c, 0.0625f)); + + float32x4_t dest1 = vaddq_f32(vaddq_f32(tmp135a, vmulq_n_f32(tmp135b, 2)), + vmulq_n_f32(tmp135c, 0.5f)); + float32x4_t dest3 = vaddq_f32(vaddq_f32(tmp135a, vmulq_n_f32(tmp135b, 8)), + vmulq_n_f32(tmp135c, 0.125f)); + float32x4_t dest5 = + vaddq_f32(src7, + vaddq_f32(vaddq_f32(tmp135a, vmulq_n_f32(tmp135b, 32)), + vmulq_n_f32(tmp135c, 0.03125f))); + + if (bias_value) { + float32x4_t bias = vld1q_f32(bias_value); + dest0 = vaddq_f32(dest0, bias); + dest1 = vaddq_f32(dest1, bias); + dest2 = vaddq_f32(dest2, bias); + dest3 = vaddq_f32(dest3, bias); + dest4 = vaddq_f32(dest4, bias); + dest5 = vaddq_f32(dest5, bias); + } + + if (has_relu) { + float32x4_t zeros = vdupq_n_f32(0); + dest0 = vmaxq_f32(dest0, zeros); + dest1 = vmaxq_f32(dest1, zeros); + dest2 = vmaxq_f32(dest2, zeros); + dest3 = vmaxq_f32(dest3, zeros); + dest4 = vmaxq_f32(dest4, zeros); + dest5 = vmaxq_f32(dest5, zeros); + } + + vst1q_f32(dest, dest0); + vst1q_f32(dest + dest_stride, dest1); + vst1q_f32(dest + dest_stride * 2, dest2); + vst1q_f32(dest + dest_stride * 3, dest3); + vst1q_f32(dest + dest_stride * 4, dest4); + vst1q_f32(dest + dest_stride * 5, dest5); +} + +void input_trans_c4(const float* src, + int src_stride, + float* dest, + int dest_stride) { + float32x4_t src0 = vld1q_f32(src); + float32x4_t src1 = vld1q_f32(src + src_stride); + float32x4_t src2 = vld1q_f32(src + src_stride * 2); + float32x4_t src3 = vld1q_f32(src + src_stride * 3); + float32x4_t src4 = vld1q_f32(src + src_stride * 4); + float32x4_t src5 = vld1q_f32(src + src_stride * 5); + float32x4_t src6 = vld1q_f32(src + src_stride * 6); + float32x4_t src7 = vld1q_f32(src + src_stride * 7); + + float32x4_t dst0 = vaddq_f32(vsubq_f32(src0, src6), + vmulq_n_f32(vsubq_f32(src4, src2), 5.25)); + float32x4_t dst7 = vaddq_f32(vsubq_f32(src7, src1), + vmulq_n_f32(vsubq_f32(src3, src5), 5.25)); + + float32x4_t tmp12a = + vsubq_f32(vaddq_f32(src2, src6), vmulq_n_f32(src4, 4.25)); + float32x4_t tmp12b = + vsubq_f32(vaddq_f32(src1, src5), vmulq_n_f32(src3, 4.25)); + float32x4_t dst1 = vaddq_f32(tmp12a, tmp12b); + float32x4_t dst2 = vsubq_f32(tmp12a, tmp12b); + + float32x4_t 
tmp34a = vsubq_f32(vaddq_f32(src6, vmulq_n_f32(src2, 0.25)), + vmulq_n_f32(src4, 1.25)); + float32x4_t tmp34b = + vaddq_f32(vsubq_f32(vmulq_n_f32(src1, 0.5), vmulq_n_f32(src3, 2.5)), + vmulq_n_f32(src5, 2)); + float32x4_t dst3 = vaddq_f32(tmp34a, tmp34b); + float32x4_t dst4 = vsubq_f32(tmp34a, tmp34b); + + float32x4_t tmp56a = + vaddq_f32(src6, vmulq_n_f32(vsubq_f32(src2, vmulq_n_f32(src4, 1.25)), 4)); + float32x4_t tmp56b = + vaddq_f32(vsubq_f32(vmulq_n_f32(src1, 2), vmulq_n_f32(src3, 2.5)), + vmulq_n_f32(src5, 0.5)); + float32x4_t dst5 = vaddq_f32(tmp56a, tmp56b); + float32x4_t dst6 = vsubq_f32(tmp56a, tmp56b); + + vst1q_f32(dest, dst0); + vst1q_f32(dest + dest_stride, dst1); + vst1q_f32(dest + dest_stride * 2, dst2); + vst1q_f32(dest + dest_stride * 3, dst3); + vst1q_f32(dest + dest_stride * 4, dst4); + vst1q_f32(dest + dest_stride * 5, dst5); + vst1q_f32(dest + dest_stride * 6, dst6); + vst1q_f32(dest + dest_stride * 7, dst7); +} +void weight_trans_c4( + float* dest, const float* din, int ch_in, int ch_out, void* workspace) { + const float coeff[8][3] = {{1.0f, 0.0f, 0.0f}, + {-2.0f / 9, -2.0f / 9, -2.0f / 9}, + {-2.0f / 9, 2.0f / 9, -2.0f / 9}, + {1.0f / 90, 1.0f / 45, 2.0f / 45}, + {1.0f / 90, -1.0f / 45, 2.0f / 45}, + {32.0f / 45, 16.0f / 45, 8.0f / 45}, + {32.0f / 45, -16.0f / 45, 8.0f / 45}, + {0.0f, 0.0f, 1.0f}}; + + float* ptr_out = static_cast(workspace); + + for (int i = 0; i < ch_out; i++) { + for (int j = 0; j < ch_in; j++) { + const float* kernel0 = + static_cast(din) + (i * ch_in + j) * 9; + float* ptr_channel = ptr_out + (i * ch_in + j) * 64; + + //! transform kernel, transposed + const float* k0 = kernel0; + const float* k1 = kernel0 + 3; + const float* k2 = kernel0 + 6; + + //! h + float tmp[8][3]; + for (int i = 0; i < 8; i++) { + tmp[i][0] = + k0[0] * coeff[i][0] + k0[1] * coeff[i][1] + k0[2] * coeff[i][2]; + tmp[i][1] = + k1[0] * coeff[i][0] + k1[1] * coeff[i][1] + k1[2] * coeff[i][2]; + tmp[i][2] = + k2[0] * coeff[i][0] + k2[1] * coeff[i][1] + k2[2] * coeff[i][2]; + } + + //! 
v + for (int j = 0; j < 8; j++) { + float* tmpp = &tmp[j][0]; + for (int i = 0; i < 8; i++) { + ptr_channel[j * 8 + i] = tmpp[0] * coeff[i][0] + + tmpp[1] * coeff[i][1] + + tmpp[2] * coeff[i][2]; + } + } + } + } + + int oc_pad = (ch_out + 3) / 4 * 4; + int ic_pad = (ch_in + 3) / 4 * 4; + int c_stride = ic_pad * oc_pad; + for (int i = 0; i < ch_out * ch_in * 64; ++i) { + int new_c = i % 64; + int new_oc = i / ch_in / 64 / 4; + int new_ic = i / 64 % (ch_in * 4) % ch_in; + int new_inner = i / ch_in / 64 % 4; + int dest_ind = + new_c * c_stride + new_oc * ic_pad * 4 + new_ic * 4 + new_inner; + dest[dest_ind] = ptr_out[i]; + } +} + +} // namespace math +} // namespace arm +} // namespace lite +} // namespace paddle diff --git a/lite/backends/arm/math/conv3x3s1_direct_fp32.cc b/lite/backends/arm/math/conv3x3s1_direct_fp32.cc index 6a1fa37681585883280625a22c15aec43c6554af..b4972a1ecab151947f8aaa7d6db0f6e82a08e5e4 100644 --- a/lite/backends/arm/math/conv3x3s1_direct_fp32.cc +++ b/lite/backends/arm/math/conv3x3s1_direct_fp32.cc @@ -35,9 +35,10 @@ size_t conv3x3s1_direct_workspace_size(const operators::ConvParam& param, auto dim_in = param.x->dims(); auto dim_out = param.output->dims(); const int threads = ctx->threads(); + auto paddings = *param.paddings; int llc_size = ctx->llc_size() / sizeof(float); - const int pad_w = param.paddings[1]; - const int pad_h = param.paddings[0]; + const int pad_w = paddings[2]; + const int pad_h = paddings[0]; int ow = dim_out[3]; int oh = dim_out[2]; int ic = dim_in[1]; @@ -74,9 +75,10 @@ void conv_3x3s1_direct_fp32(const float* i_data, ARMContext* ctx) { const int threads = ctx->threads(); int l2_size = ctx->llc_size() / sizeof(float); + auto paddings = *param.paddings; - const int pad_h = param.paddings[0]; - const int pad_w = param.paddings[1]; + const int pad_h = paddings[0]; + const int pad_w = paddings[2]; const int wout_round = ROUNDUP(ow, OUT_W_BLOCK); const int win_round = wout_round + 2; bool flag_relu = param.fuse_relu; diff --git a/lite/backends/arm/math/conv3x3s1_direct_int8.cc b/lite/backends/arm/math/conv3x3s1_direct_int8.cc index f966313e118acf3f74124aca1d16aa3c50009bb8..64e72bc441bb93fa955e12ff53ce17f0e37b4830 100644 --- a/lite/backends/arm/math/conv3x3s1_direct_int8.cc +++ b/lite/backends/arm/math/conv3x3s1_direct_int8.cc @@ -41,10 +41,11 @@ void conv_3x3s1_direct_int8(const int8_t* din, const operators::ConvParam& param, Context* ctx, const float* scale) { + auto paddings = *param.paddings; bool flag_relu = param.fuse_relu; bool flag_bias = param.bias; - int pad_h = param.paddings[0]; - int pad_w = param.paddings[1]; + int pad_h = paddings[0]; + int pad_w = paddings[2]; const int threads = ctx->threads(); int llc_size = ctx->llc_size() / 4; diff --git a/lite/backends/arm/math/conv3x3s1p01_depthwise_fp32.cc b/lite/backends/arm/math/conv3x3s1p01_depthwise_fp32.cc new file mode 100644 index 0000000000000000000000000000000000000000..e4c9fb99ef9a6b5d3987a1efd5a644f322ea043c --- /dev/null +++ b/lite/backends/arm/math/conv3x3s1p01_depthwise_fp32.cc @@ -0,0 +1,2539 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include "lite/backends/arm/math/conv_depthwise.h" + +namespace paddle { +namespace lite { +namespace arm { +namespace math { + +void conv_depthwise_3x3s1p0_bias(float *dout, + const float *din, + const float *weights, + const float *bias, + bool flag_bias, + bool flag_relu, + const int num, + const int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + ARMContext *ctx); + +void conv_depthwise_3x3s1p0_bias_s(float *dout, + const float *din, + const float *weights, + const float *bias, + bool flag_bias, + bool flag_relu, + const int num, + const int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + ARMContext *ctx); + +void conv_depthwise_3x3s1p1_bias(float *dout, + const float *din, + const float *weights, + const float *bias, + bool flag_bias, + bool flag_relu, + const int num, + const int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + ARMContext *ctx); + +void conv_depthwise_3x3s1p1_bias_s(float *dout, + const float *din, + const float *weights, + const float *bias, + bool flag_bias, + bool flag_relu, + const int num, + const int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + ARMContext *ctx); + +void conv_depthwise_3x3s1_fp32(const float *din, + float *dout, + int num, + int ch_out, + int h_out, + int w_out, + int ch_in, + int h_in, + int w_in, + const float *weights, + const float *bias, + int pad, + bool flag_bias, + bool flag_relu, + ARMContext *ctx) { + if (pad == 0) { + if (w_in > 5) { + conv_depthwise_3x3s1p0_bias(dout, + din, + weights, + bias, + flag_bias, + flag_relu, + num, + ch_in, + h_in, + w_in, + h_out, + w_out, + ctx); + } else { + conv_depthwise_3x3s1p0_bias_s(dout, + din, + weights, + bias, + flag_bias, + flag_relu, + num, + ch_in, + h_in, + w_in, + h_out, + w_out, + ctx); + } + } + if (pad == 1) { + if (w_in > 4) { + conv_depthwise_3x3s1p1_bias(dout, + din, + weights, + bias, + flag_bias, + flag_relu, + num, + ch_in, + h_in, + w_in, + h_out, + w_out, + ctx); + } else { + conv_depthwise_3x3s1p1_bias_s(dout, + din, + weights, + bias, + flag_bias, + flag_relu, + num, + ch_in, + h_in, + w_in, + h_out, + w_out, + ctx); + } + } +} + +#ifdef __aarch64__ +#define INIT_S1 \ + "PRFM PLDL1KEEP, [%[din_ptr0]] \n" \ + "PRFM PLDL1KEEP, [%[din_ptr1]] \n" \ + "PRFM PLDL1KEEP, [%[din_ptr2]] \n" \ + "PRFM PLDL1KEEP, [%[din_ptr3]] \n" \ + "PRFM PLDL1KEEP, [%[din_ptr4]] \n" \ + "PRFM PLDL1KEEP, [%[din_ptr5]] \n" \ + "movi v21.4s, #0x0\n" /* out0 = 0 */ \ + \ + "ld1 {v0.4s}, [%[din_ptr0]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v2.4s}, [%[din_ptr1]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v4.4s}, [%[din_ptr2]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "ld1 {v1.4s}, [%[din_ptr0]] \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v3.4s}, [%[din_ptr1]] \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v5.4s}, [%[din_ptr2]] \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + "ld1 
{v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ + +#define LEFT_COMPUTE_S1 \ + "ext v16.16b, %[vzero].16b, v0.16b, #12 \n" /* v16 = 00123*/ \ + "ext v17.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234 */ /* r0 */ \ + "fmla v12.4s, v0.4s, %[w0].s[1]\n" /* outr00 += din0_0123 * w0[1]*/ \ + \ + "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + "sub %[din_ptr0], %[din_ptr0], #4 \n" /* din_ptr0-- */ \ + "sub %[din_ptr1], %[din_ptr1], #4 \n" /* din_ptr0-- */ \ + \ + "fmla v12.4s , v16.4s, %[w0].s[0]\n" /* outr00 += din0_0012 * w0[0]*/ \ + \ + "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ \ + "sub %[din_ptr2], %[din_ptr2], #4 \n" /* din_ptr0-- */ \ + "sub %[din_ptr3], %[din_ptr3], #4 \n" /* din_ptr0-- */ \ + \ + "fmla v12.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_1234 * w0[2]*/ \ + \ + "ext v16.16b, %[vzero].16b, v2.16b, #12 \n" /* v16 = 00123*/ \ + "ext v17.16b, v2.16b, v3.16b, #4 \n" /* v16 = 1234 */ /* r1 */ \ + "fmla v13.4s , v2.4s, %[w0].s[1]\n" /* outr00 += din1_0123 * w0[1]*/ \ + "fmla v12.4s , v2.4s, %[w1].s[1]\n" /* outr00 += din1_0123 * w1[1]*/ \ + "sub %[din_ptr4], %[din_ptr4], #4 \n" /* din_ptr0-- */ \ + "sub %[din_ptr5], %[din_ptr5], #4 \n" /* din_ptr0-- */ \ + \ + "fmla v13.4s , v16.4s, %[w0].s[0]\n" /* outr00 += din1_0123 * w0[1]*/ \ + "fmla v12.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din1_0123 * w1[1]*/ \ + \ + "ld1 {v0.4s}, [%[din_ptr0]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v2.4s}, [%[din_ptr1]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v13.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ + "fmla v12.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * w1[1]*/ \ + \ + "ext v17.16b, v4.16b, v5.16b, #4 \n" /* v16=1234 */ \ + "ext v16.16b, %[vzero].16b, v4.16b, #12 \n" /* v16 = 00123*/ \ + \ + /* r2 */ \ + "fmla v14.4s , v4.4s, %[w0].s[1]\n" /* outr00 += din2_0123 * w0[1]*/ \ + "fmla v13.4s , v4.4s, %[w1].s[1]\n" /* outr00 += din2_0123 * w1[1]*/ \ + "fmla v12.4s , v4.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * w2[1]*/ \ + \ + "ld1 {v1.4s}, [%[din_ptr0]] \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v3.4s}, [%[din_ptr1]] \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v14.4s , v16.4s, %[w0].s[0]\n" /* outr00 += din2_0123 * w0[1]*/ \ + "fmla v13.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din2_0123 * w0[1]*/ \ + "fmla v12.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * w1[1]*/ \ + \ + "ld1 {v4.4s}, [%[din_ptr2]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v14.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ + "fmla v13.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ + "fmla v12.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * w1[1]*/ \ + \ + "ext v16.16b, %[vzero].16b, v6.16b, #12 \n" /* v16 = 00123*/ \ + "ext v17.16b, v6.16b, v7.16b, #4 \n" /* v16 = 1234 */ /* r3 */ \ + "fmla v15.4s , v6.4s, %[w0].s[1]\n" /*outr00 += din2_0123 * w0[1]*/ \ + "fmla v14.4s , v6.4s, %[w1].s[1]\n" /* outr00 += din2_0123 * w1[1]*/ \ + "fmla v13.4s , v6.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * w2[1]*/ \ + \ + "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v15.4s , v16.4s, %[w0].s[0]\n" /* outr00 += din2_0123 * w0[1]*/ \ + "fmla v14.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din2_0123 * w0[1]*/ \ + "fmla v13.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * 
w1[1]*/ \ + \ + "ld1 {v5.4s}, [%[din_ptr2]] \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ + "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ + "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * w1[1]*/ \ + \ + "ext v16.16b, %[vzero].16b, v8.16b, #12 \n" /* v16 = 00123*/ \ + "ext v17.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234 */ + +#define LEFT_RESULT_S1 \ + /* r4 */ \ + "fmla v15.4s , v8.4s, %[w1].s[1]\n" /* outr00 += din2_0123 * w1[1]*/ \ + "fmla v14.4s , v8.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * w2[1]*/ \ + \ + "st1 {v12.4s}, [%[doutr0]], #16 \n" /* vst1q_f32() */ \ + "st1 {v13.4s}, [%[doutr1]], #16 \n" /* vst1q_f32() */ \ + "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v15.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din2_0123 * w0[1]*/ \ + "fmla v14.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * w1[1]*/ \ + \ + "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + \ + "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ + "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * w1[1]*/ \ + \ + "ext v16.16b, %[vzero].16b, v10.16b, #12 \n" /* v16 = 00123*/ \ + "ext v17.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234 */ /* r5 */ \ + "fmla v15.4s , v10.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * w1[1]*/ \ + \ + "st1 {v14.4s}, [%[doutr2]], #16 \n" /* vst1q_f32() */ \ + "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v15.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * w0[1]*/ \ + \ + "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + \ + "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ + \ + "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ \ + \ + "st1 {v15.4s}, [%[doutr3]], #16 \n" /* vst1q_f32() */ \ + "cmp %w[cnt], #1 \n" \ + "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + \ + "blt 3f \n" + +#define MID_COMPUTE_S1 \ + "1: \n" /* r0 */ \ + "fmla v12.4s , v0.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "ld1 {v0.4s}, [%[din_ptr0]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v12.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "ld1 {v1.4s}, [%[din_ptr0]] \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v12.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v2.16b, v3.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v2.16b, v3.16b, #8 \n" /* v16 = 2345 */ /* r1 */ \ + "fmla v13.4s , v2.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v12.4s , v2.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "ld1 {v2.4s}, [%[din_ptr1]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v13.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v12.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "ld1 {v3.4s}, [%[din_ptr1]] \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v13.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v12.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v4.16b, v5.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v4.16b, v5.16b, #8 \n" /* v16 = 2345 */ /* r2 */ \ + "fmla v14.4s , v4.4s, %[w0].s[0]\n" /* 
outr00 += din0_0123 * w0[0]*/ \ + "fmla v13.4s , v4.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v12.4s , v4.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "ld1 {v4.4s}, [%[din_ptr2]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v14.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v13.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v12.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "ld1 {v5.4s}, [%[din_ptr2]] \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v14.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v13.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v12.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v6.16b, v7.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v6.16b, v7.16b, #8 \n" /* v16 = 2345 */ + +#define MID_RESULT_S1 \ + /* r3 */ \ + "fmla v15.4s , v6.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v14.4s , v6.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v13.4s , v6.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + "st1 {v12.4s}, [%[doutr0]], #16 \n" \ + \ + "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v14.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + \ + "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ /* r3 */ \ + "fmla v15.4s , v8.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + "st1 {v13.4s}, [%[doutr1]], #16 \n" \ + \ + "fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + \ + "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ /* r3 */ \ + "fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + "st1 {v14.4s}, [%[doutr2]], #16 \n" \ + \ + "fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + \ + "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ \ + \ + "subs %w[cnt], %w[cnt], #1 \n" \ + \ + "st1 {v15.4s}, [%[doutr3]], #16 \n" \ + "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + \ + "bne 1b \n" + +#define RIGHT_COMPUTE_S1 \ 
+ "3: \n" \ + "ld1 {v18.4s, v19.4s}, [%[vmask]] \n" \ + "ld1 {v22.4s}, [%[doutr0]] \n" \ + "ld1 {v23.4s}, [%[doutr1]] \n" \ + "ld1 {v24.4s}, [%[doutr2]] \n" \ + "ld1 {v25.4s}, [%[doutr3]] \n" \ + \ + "bif v0.16b, %[vzero].16b, v18.16b \n" \ + "bif v1.16b, %[vzero].16b, v19.16b \n" \ + "bif v2.16b, %[vzero].16b, v18.16b \n" \ + "bif v3.16b, %[vzero].16b, v19.16b \n" \ + \ + "bif v4.16b, %[vzero].16b, v18.16b \n" \ + "bif v5.16b, %[vzero].16b, v19.16b \n" \ + "bif v6.16b, %[vzero].16b, v18.16b \n" \ + "bif v7.16b, %[vzero].16b, v19.16b \n" \ + \ + "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ /* r0 */ \ + "fmla v12.4s, v0.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "bif v8.16b, %[vzero].16b, v18.16b \n" \ + "bif v9.16b, %[vzero].16b, v19.16b \n" \ + "bif v10.16b, %[vzero].16b, v18.16b \n" \ + "bif v11.16b, %[vzero].16b, v19.16b \n" \ + \ + "fmla v12.4s, v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "ld1 {v18.4s}, [%[rmask]] \n" \ + \ + "fmla v12.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v2.16b, v3.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v2.16b, v3.16b, #8 \n" /* v16 = 2345 */ /* r1 */ \ + "fmla v13.4s , v2.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v12.4s , v2.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "fmla v13.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v12.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "fmla v13.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v12.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v4.16b, v5.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v4.16b, v5.16b, #8 \n" /* v16 = 2345 */ /* r2 */ \ + "fmla v14.4s , v4.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v13.4s , v4.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v12.4s , v4.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "fmla v14.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v13.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v12.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "fmla v14.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v13.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v12.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v6.16b, v7.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v6.16b, v7.16b, #8 \n" /* v16 = 2345 */ + +#define RIGHT_RESULT_S1 \ + /* r3 */ \ + "fmla v15.4s , v6.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v14.4s , v6.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v13.4s , v6.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "bif v12.16b, v22.16b, v18.16b \n" \ + \ + "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v14.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "st1 {v12.4s}, [%[doutr0]], #16 \n" \ + \ + "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ /* r3 */ \ + "fmla v15.4s 
, v8.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "bif v13.16b, v23.16b, v18.16b \n" \ + \ + "fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "st1 {v13.4s}, [%[doutr1]], #16 \n" \ + \ + "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ /* r3 */ \ + "fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "bif v14.16b, v24.16b, v18.16b \n" \ + \ + "fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "st1 {v14.4s}, [%[doutr2]], #16 \n" \ + \ + "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "bif v15.16b, v25.16b, v18.16b \n" \ + \ + "st1 {v15.4s}, [%[doutr3]], #16 \n" + +#define LEFT_RESULT_S1_RELU \ + /* r4 */ \ + "fmla v15.4s , v8.4s, %[w1].s[1]\n" /* outr00 += din2_0123 * w1[1]*/ \ + "fmla v14.4s , v8.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * w2[1]*/ \ + \ + "fmax v12.4s, v12.4s, %[vzero].4s \n" /*relu*/ \ + "fmax v13.4s, v13.4s, %[vzero].4s \n" /*relu*/ \ + \ + "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v15.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din2_0123 * w0[1]*/ \ + "fmla v14.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * w1[1]*/ \ + \ + "st1 {v12.4s}, [%[doutr0]], #16 \n" /* vst1q_f32() */ \ + "st1 {v13.4s}, [%[doutr1]], #16 \n" /* vst1q_f32() */ \ + \ + "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ + "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * w1[1]*/ \ + \ + "ext v16.16b, %[vzero].16b, v10.16b, #12 \n" /* v16 = 00123*/ \ + "ext v17.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234 */ \ + "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ /* r5*/ \ + "fmla v15.4s , v10.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * w1[1]*/ \ + \ + "fmax v14.4s, v14.4s, %[vzero].4s \n" /*relu*/ \ + \ + "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v15.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * w0[1]*/ \ + \ + "st1 {v14.4s}, [%[doutr2]], #16 \n" /* vst1q_f32() */ \ + \ + "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ + \ + "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + \ + "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ \ + \ + "fmax v15.4s, v15.4s, %[vzero].4s \n" /*relu*/ \ + \ + "st1 {v15.4s}, [%[doutr3]], #16 \n" /* vst1q_f32() */ \ + "cmp %w[cnt], #1 \n" \ + "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + "blt 3f \n" + +#define MID_RESULT_S1_RELU \ + /* r3 */ \ + "fmla v15.4s , v6.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v14.4s , v6.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v13.4s , v6.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + "fmax v12.4s, v12.4s, %[vzero].4s \n" /*relu*/ \ + \ + "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v14.4s , v16.4s, 
%[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "st1 {v12.4s}, [%[doutr0]], #16 \n" \ + \ + "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + \ + "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ /* r3 */ \ + "fmla v15.4s , v8.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + "fmax v13.4s, v13.4s, %[vzero].4s \n" /*relu*/ \ + \ + "fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "st1 {v13.4s}, [%[doutr1]], #16 \n" \ + \ + "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + \ + "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ \ + \ + /* r3 */ \ + "fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + "fmax v14.4s, v14.4s, %[vzero].4s \n" /*relu*/ \ + \ + "fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "st1 {v14.4s}, [%[doutr2]], #16 \n" \ + \ + "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + \ + "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ \ + \ + "subs %w[cnt], %w[cnt], #1 \n" \ + \ + "fmax v15.4s, v15.4s, %[vzero].4s \n" /*relu*/ \ + \ + "st1 {v15.4s}, [%[doutr3]], #16 \n" \ + "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + \ + "bne 1b \n" + +#define RIGHT_RESULT_S1_RELU \ + /* r3 */ \ + "fmla v15.4s , v6.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v14.4s , v6.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v13.4s , v6.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "fmax v12.4s, v12.4s, %[vzero].4s \n" /*relu*/ \ + \ + "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v14.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "bif v12.16b, v22.16b, v18.16b \n" \ + \ + "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ /* r3 */ \ + "fmla v15.4s , v8.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "st1 {v12.4s}, [%[doutr0]], #16 \n" \ + "fmax v13.4s, v13.4s, 
%[vzero].4s \n" /*relu*/ \ + \ + "fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "bif v13.16b, v23.16b, v18.16b \n" \ + \ + "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ \ + \ + "st1 {v13.4s}, [%[doutr1]], #16 \n" /* r3 */ \ + "fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "fmax v14.4s, v14.4s, %[vzero].4s \n" /*relu*/ \ + \ + "fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "bif v14.16b, v24.16b, v18.16b \n" \ + \ + "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "st1 {v14.4s}, [%[doutr2]], #16 \n" \ + \ + "fmax v15.4s, v15.4s, %[vzero].4s \n" /*relu*/ \ + \ + "bif v15.16b, v25.16b, v18.16b \n" \ + \ + "st1 {v15.4s}, [%[doutr3]], #16 \n" + +#define COMPUTE_S_S1 \ + "prfm pldl1keep, [%[din0]]\n" \ + "prfm pldl1keep, [%[din1]]\n" \ + "prfm pldl1keep, [%[din2]]\n" \ + "prfm pldl1keep, [%[din3]]\n" \ + \ + "ld1 {v0.4s}, [%[din0]], #16\n" \ + "ld1 {v1.4s}, [%[din1]], #16\n" \ + "ld1 {v2.4s}, [%[din2]], #16\n" \ + "ld1 {v3.4s}, [%[din3]], #16\n" \ + \ + "bif v0.16b, %[zero].16b, %[mask].16b\n" \ + "bif v1.16b, %[zero].16b, %[mask].16b\n" \ + "bif v2.16b, %[zero].16b, %[mask].16b\n" \ + "bif v3.16b, %[zero].16b, %[mask].16b\n" \ + \ + "ext v4.16b, %[zero].16b, v0.16b, #12\n" \ + "ext v5.16b, %[zero].16b, v1.16b, #12\n" \ + "ext v6.16b, %[zero].16b, v2.16b, #12\n" \ + "ext v7.16b, %[zero].16b, v3.16b, #12\n" \ + \ + "ext v8.16b, v0.16b, %[zero].16b, #4\n" \ + "ext v9.16b, v1.16b, %[zero].16b, #4\n" \ + "ext v10.16b, v2.16b, %[zero].16b, #4\n" \ + "ext v11.16b, v3.16b, %[zero].16b, #4\n" \ + \ + "fmul v12.4s, v0.4s, %[wr0].s[1]\n" \ + "fmul v13.4s, v1.4s, %[wr0].s[1]\n" \ + \ + "fmul v14.4s, v1.4s, %[wr1].s[1]\n" \ + "fmul v15.4s, v2.4s, %[wr1].s[1]\n" \ + \ + "fmul v16.4s, v2.4s, %[wr2].s[1]\n" \ + "fmul v17.4s, v3.4s, %[wr2].s[1]\n" \ + \ + "fmla v12.4s, v4.4s, %[wr0].s[0]\n" \ + "fmla v13.4s, v5.4s, %[wr0].s[0]\n" \ + \ + "fmla v14.4s, v5.4s, %[wr1].s[0]\n" \ + "fmla v15.4s, v6.4s, %[wr1].s[0]\n" \ + \ + "fmla v16.4s, v6.4s, %[wr2].s[0]\n" \ + "fmla v17.4s, v7.4s, %[wr2].s[0]\n" \ + \ + "fmla v12.4s, v8.4s, %[wr0].s[2]\n" \ + "fmla v13.4s, v9.4s, %[wr0].s[2]\n" \ + \ + "fmla v14.4s, v9.4s, %[wr1].s[2]\n" \ + "fmla v15.4s, v10.4s, %[wr1].s[2]\n" \ + \ + "fmla v16.4s, v10.4s, %[wr2].s[2]\n" \ + "fmla v17.4s, v11.4s, %[wr2].s[2]\n" \ + \ + "fadd v12.4s, v12.4s, v14.4s\n" \ + "fadd v12.4s, v12.4s, v16.4s\n" \ + \ + "fadd v13.4s, v13.4s, v15.4s\n" \ + "fadd v13.4s, v13.4s, v17.4s\n" \ + \ + "fadd v12.4s, v12.4s, %[bias].4s\n" \ + "fadd v13.4s, v13.4s, %[bias].4s\n" + +#define RESULT_S_S1 \ + "prfm pldl1keep, [%[out1]]\n" \ + "prfm pldl1keep, [%[out2]]\n" \ + \ + "st1 {v12.4s}, [%[out1]]\n" \ + "st1 {v13.4s}, [%[out2]]\n" + +#define RESULT_S_S1_RELU \ + "prfm pldl1keep, [%[out1]]\n" \ + "prfm pldl1keep, [%[out2]]\n" \ + \ + "fmax v12.4s, v12.4s, %[zero].4s\n" \ + "fmax v13.4s, v13.4s, %[zero].4s\n" \ + \ + "st1 {v12.4s}, [%[out1]]\n" \ + "st1 {v13.4s}, [%[out2]]\n" + +#define COMPUTE_S_S1_P0 \ + "prfm pldl1keep, [%[din0]]\n" \ + "prfm pldl1keep, [%[din1]]\n" \ + "prfm pldl1keep, [%[din2]]\n" \ + "prfm pldl1keep, [%[din3]]\n" \ + \ + "ld1 {v0.4s, v1.4s}, [%[din0]]\n" \ + "ld1 {v2.4s, v3.4s}, 
[%[din1]]\n" \ + "ld1 {v4.4s, v5.4s}, [%[din2]]\n" \ + "ld1 {v6.4s, v7.4s}, [%[din3]]\n" \ + \ + "bif v0.16b, %[zero].16b, %[mask1].16b\n" \ + "bif v1.16b, %[zero].16b, %[mask2].16b\n" \ + \ + "bif v2.16b, %[zero].16b, %[mask1].16b\n" \ + "bif v3.16b, %[zero].16b, %[mask2].16b\n" \ + \ + "bif v4.16b, %[zero].16b, %[mask1].16b\n" \ + "bif v5.16b, %[zero].16b, %[mask2].16b\n" \ + \ + "bif v6.16b, %[zero].16b, %[mask1].16b\n" \ + "bif v7.16b, %[zero].16b, %[mask2].16b\n" \ + \ + "ext v8.16b, v0.16b, v1.16b, #4\n" \ + "ext v9.16b, v0.16b, v1.16b, #8\n" \ + \ + "and v12.16b, %[vbias].16b, %[vbias].16b \n" \ + "and v13.16b, %[vbias].16b, %[vbias].16b \n" /* r0 */ \ + "fmul v10.4s, v0.4s, %[wr0].s[0]\n" \ + "fmul v11.4s, v8.4s, %[wr0].s[1]\n" \ + "fmla v12.4s, v9.4s, %[wr0].s[2]\n" \ + \ + "ext v8.16b, v2.16b, v3.16b, #4\n" \ + "ext v9.16b, v2.16b, v3.16b, #8\n" /* r1 */ \ + "fmul v14.4s, v2.4s, %[wr0].s[0]\n" \ + "fmla v10.4s, v2.4s, %[wr1].s[0]\n" \ + \ + "fmul v15.4s, v8.4s, %[wr0].s[1]\n" \ + "fmla v11.4s, v8.4s, %[wr1].s[1]\n" \ + \ + "fmla v13.4s, v9.4s, %[wr0].s[2]\n" \ + "fmla v12.4s, v9.4s, %[wr1].s[2]\n" \ + \ + "ext v8.16b, v4.16b, v5.16b, #4\n" \ + "ext v9.16b, v4.16b, v5.16b, #8\n" /* r2 */ \ + "fmla v14.4s, v4.4s, %[wr1].s[0]\n" \ + "fmla v10.4s, v4.4s, %[wr2].s[0]\n" \ + \ + "fmla v15.4s, v8.4s, %[wr1].s[1]\n" \ + "fmla v11.4s, v8.4s, %[wr2].s[1]\n" \ + \ + "fmla v13.4s, v9.4s, %[wr1].s[2]\n" \ + "fmla v12.4s, v9.4s, %[wr2].s[2]\n" \ + \ + "ext v8.16b, v6.16b, v7.16b, #4\n" \ + "ext v9.16b, v6.16b, v7.16b, #8\n" \ + \ + "fmla v14.4s, v6.4s, %[wr2].s[0]\n" \ + \ + "fmla v15.4s, v8.4s, %[wr2].s[1]\n" \ + \ + "fadd v12.4s, v12.4s, v10.4s\n" \ + \ + "fmla v13.4s, v9.4s, %[wr2].s[2]\n" \ + \ + "fadd v12.4s, v12.4s, v11.4s\n" \ + "fadd v13.4s, v13.4s, v14.4s\n" \ + "fadd v13.4s, v13.4s, v15.4s\n" // \ + // "prfm pldl1keep, [%[out1]]\n" \ + // "prfm pldl1keep, [%[out2]]\n" \ + // \ + // "st1 {v12.4s}, [%[out1]]\n" \ + // "st1 {v13.4s}, [%[out2]]\n" \ + + +#else +#define INIT_S1 \ + "pld [%[din0_ptr]] @ preload data\n" \ + "pld [%[din1_ptr]] @ preload data\n" \ + "pld [%[din2_ptr]] @ preload data\n" \ + "pld [%[din3_ptr]] @ preload data\n" \ + \ + "vld1.32 {d16-d18}, [%[din0_ptr]]! @ load din r0\n" \ + "vld1.32 {d20-d22}, [%[din1_ptr]]! @ load din r1\n" \ + "vld1.32 {d24-d26}, [%[din2_ptr]]! @ load din r2\n" \ + "vld1.32 {d28-d30}, [%[din3_ptr]]! @ load din r3\n" \ + \ + "vdup.32 q4, %[bias_val] @ and \n" \ + "vdup.32 q5, %[bias_val] @ and \n" + +#define LEFT_COMPUTE_S1 \ + "vext.32 q6, %q[vzero], q8, #3 @ 0012\n" \ + "vext.32 q7, q8, q9, #1 @ 1234\n" /* r0 */ \ + "vmla.f32 q4, q8, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "sub %[din0_ptr], #12 @ 1pad + 2 float data overlap\n" \ + "sub %[din1_ptr], #12 @ 1pad + 2 float data overlap\n" \ + "sub %[din2_ptr], #12 @ 1pad + 2 float data overlap\n" \ + "sub %[din3_ptr], #12 @ 1pad + 2 float data overlap\n" \ + \ + "vmla.f32 q4, q6, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" \ + \ + "pld [%[din0_ptr]] @ preload data\n" \ + "pld [%[din1_ptr]] @ preload data\n" \ + "pld [%[din2_ptr]] @ preload data\n" \ + "pld [%[din3_ptr]] @ preload data\n" \ + \ + "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 1234 * wr0[2]\n" \ + \ + "vext.32 q6, %q[vzero], q10, #3 @ 0012\n" \ + "vext.32 q7, q10, q11, #1 @ 1234\n" \ + \ + /* r1 */ \ + "vmla.f32 q5, q10, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" \ + "vmla.f32 q4, q10, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vld1.32 {d16-d17}, [%[din0_ptr]]! @ load din r0\n" \ + "vld1.32 {d20-d21}, [%[din1_ptr]]! 
@ load din r0\n" \ + \ + "vmla.f32 q5, q6, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" \ + "vmla.f32 q4, q6, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" \ + \ + "vld1.32 {d18}, [%[din0_ptr]] @ load din r0\n" \ + "vld1.32 {d22}, [%[din1_ptr]] @ load din r0\n" \ + \ + "vmla.f32 q5, q7, %f[wr0][0] @ q4 += 1234 * wr0[2]\n" \ + "vmla.f32 q4, q7, %f[wr1][0] @ q4 += 1234 * wr0[2]\n" \ + \ + "vext.32 q6, %q[vzero], q12, #3 @ 0012\n" \ + "vext.32 q7, q12, q13, #1 @ 1234\n" \ + \ + /* r2 */ \ + "vmla.f32 q5, q12, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" \ + "vmla.f32 q4, q12, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vld1.32 {d24-d25}, [%[din2_ptr]]! @ load din r0\n" \ + \ + "vmla.f32 q5, q6, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" \ + "vmla.f32 q4, q6, %e[wr2][0] @ q4 += 1234 * wr0[0]\n" \ + \ + "vld1.32 {d26}, [%[din2_ptr]] @ load din r0\n" \ + \ + "vmla.f32 q5, q7, %f[wr1][0] @ q4 += 1234 * wr0[2]\n" \ + "vmla.f32 q4, q7, %f[wr2][0] @ q4 += 1234 * wr0[2]\n" \ + \ + "vext.32 q6, %q[vzero], q14, #3 @ 0012\n" \ + "vext.32 q7, q14, q15, #1 @ 1234\n" + +#define LEFT_RESULT_S1 \ + /* r3 */ \ + "vmla.f32 q5, q14, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vld1.32 {d28-d29}, [%[din3_ptr]]! @ load din r0\n" \ + "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" \ + \ + "vmla.f32 q5, q6, %e[wr2][0] @ q4 += 1234 * wr0[0]\n" \ + \ + "vld1.32 {d30}, [%[din3_ptr]] @ load din r0\n" \ + "vdup.32 q4, %[bias_val] @ and \n" \ + \ + "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 1234 * wr0[2]\n" \ + \ + "vext.32 q6, q8, q9, #1 @ 1234\n" \ + "vext.32 q7, q8, q9, #2 @ 2345\n" \ + "cmp %[cnt], #1 @ check whether has mid cols\n" \ + \ + "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add pointer\n" \ + \ + "vdup.32 q5, %[bias_val] @ and \n" \ + "blt 3f @ jump to main loop start point\n" + +#define MID_COMPUTE_S1 \ + "1: @ right pad entry\n" /* r0 */ \ + "vmla.f32 q4, q8, %e[wr0][0] @ q4 += 0123 * wr0[0]\n" \ + \ + "pld [%[din0_ptr]] @ preload data\n" \ + "pld [%[din1_ptr]] @ preload data\n" \ + "pld [%[din2_ptr]] @ preload data\n" \ + "pld [%[din3_ptr]] @ preload data\n" \ + \ + "vmla.f32 q4, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vld1.32 {d16-d17}, [%[din0_ptr]]! @ load din r0\n" \ + \ + "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 2345 * wr0[2]\n" \ + \ + "vld1.32 {d18}, [%[din0_ptr]] @ load din r0\n" \ + \ + "vext.32 q6, q10, q11, #1 @ 1234\n" \ + "vext.32 q7, q10, q11, #2 @ 2345\n" /* r1 */ \ + "vmla.f32 q5, q10, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" \ + "vmla.f32 q4, q10, %e[wr1][0] @ q4 += 1234 * wr0[1]\n" \ + \ + "vld1.32 {d20-d21}, [%[din1_ptr]]! @ load din r0\n" \ + \ + "vmla.f32 q5, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" \ + "vmla.f32 q4, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vld1.32 {d22}, [%[din1_ptr]] @ load din r0\n" \ + \ + "vmla.f32 q5, q7, %f[wr0][0] @ q4 += 1234 * wr0[1]\n" \ + "vmla.f32 q4, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" \ + \ + "vext.32 q6, q12, q13, #1 @ 1234\n" \ + "vext.32 q7, q12, q13, #2 @ 2345\n" /* r2 */ \ + "vmla.f32 q5, q12, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" \ + "vmla.f32 q4, q12, %e[wr2][0] @ q4 += 1234 * wr0[1]\n" \ + \ + "vld1.32 {d24-d25}, [%[din2_ptr]]! 
@ load din r0\n" \ + \ + "vmla.f32 q5, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" \ + "vmla.f32 q4, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vld1.32 {d26}, [%[din2_ptr]] @ load din r0\n" \ + \ + "vmla.f32 q5, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" \ + "vmla.f32 q4, q7, %f[wr2][0] @ q4 += 1234 * wr0[1]\n" \ + \ + "vext.32 q6, q14, q15, #1 @ 1234\n" \ + "vext.32 q7, q14, q15, #2 @ 2345\n" + +#define MID_RESULT_S1 \ + /* r3 */ \ + "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" \ + \ + "vld1.32 {d28-d29}, [%[din3_ptr]]! @ load din r0\n" \ + "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" \ + \ + "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vld1.32 {d30}, [%[din3_ptr]] @ load din r0\n" \ + "vdup.32 q4, %[bias_val] @ and \n" \ + \ + "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" \ + \ + "vext.32 q6, q8, q9, #1 @ 1234\n" \ + "vext.32 q7, q8, q9, #2 @ 2345\n" \ + \ + "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add pointer\n" \ + \ + "subs %[cnt], #1 @ loop count minus 1\n" \ + \ + "vdup.32 q5, %[bias_val] @ and \n" \ + \ + "bne 1b @ jump to main loop start point\n" + +#define RIGHT_COMPUTE_S1 \ + "3: @ right pad entry\n" \ + "vld1.32 {d19}, [%[vmask]]! @ load din r0\n" \ + "vld1.32 {d23}, [%[vmask]]! @ load din r0\n" \ + \ + "vld1.32 {d27}, [%[vmask]]! @ load din r0\n" \ + "vld1.32 {d31}, [%[vmask]]! @ load din r0\n" \ + \ + "vbif d16, %e[vzero], d19 @ bit select, deal with right pad\n" \ + "vbif d17, %e[vzero], d23 @ bit select, deal with right pad\n" \ + "vbif d18, %e[vzero], d27 @ bit select, deal with right pad\n" \ + \ + "vbif d20, %e[vzero], d19 @ bit select, deal with right pad\n" \ + "vbif d21, %e[vzero], d23 @ bit select, deal with right pad\n" \ + "vbif d22, %e[vzero], d27 @ bit select, deal with right pad\n" \ + \ + "vext.32 q6, q8, q9, #1 @ 1234\n" \ + "vext.32 q7, q8, q9, #2 @ 2345\n" /* r0 */ \ + "vmla.f32 q4, q8, %e[wr0][0] @ q4 += 0123 * wr0[0]\n" \ + \ + "vbif d24, %e[vzero], d19 @ bit select, deal with right pad\n" \ + "vbif d25, %e[vzero], d23 @ bit select, deal with right pad\n" \ + "vbif d26, %e[vzero], d27 @ bit select, deal with right pad\n" \ + \ + "vmla.f32 q4, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vbif d28, %e[vzero], d19 @ bit select, deal with right pad\n" \ + "vbif d29, %e[vzero], d23 @ bit select, deal with right pad\n" \ + "vbif d30, %e[vzero], d27 @ bit select, deal with right pad\n" \ + \ + "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 2345 * wr0[2]\n" \ + \ + "vext.32 q6, q10, q11, #1 @ 1234\n" \ + "vext.32 q7, q10, q11, #2 @ 2345\n" /* r1 */ \ + "vmla.f32 q5, q10, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" \ + "vmla.f32 q4, q10, %e[wr1][0] @ q4 += 1234 * wr0[1]\n" \ + \ + "vld1.32 {d19}, [%[rmask]]! @ load din r0\n" \ + "vld1.32 {d23}, [%[rmask]]! 
@ load din r0\n" \ + \ + "vmla.f32 q5, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" \ + "vmla.f32 q4, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vld1.32 {d16-d17}, [%[dout_ptr1]] @ load din r0\n" \ + "vld1.32 {d20-d21}, [%[dout_ptr2]] @ load din r0\n" \ + \ + "vmla.f32 q5, q7, %f[wr0][0] @ q4 += 1234 * wr0[1]\n" \ + "vmla.f32 q4, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" \ + \ + "vext.32 q6, q12, q13, #1 @ 1234\n" \ + "vext.32 q7, q12, q13, #2 @ 2345\n" /* r2 */ \ + "vmla.f32 q5, q12, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" \ + "vmla.f32 q4, q12, %e[wr2][0] @ q4 += 1234 * wr0[1]\n" \ + \ + "vmla.f32 q5, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" \ + "vmla.f32 q4, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vmla.f32 q5, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" \ + "vmla.f32 q4, q7, %f[wr2][0] @ q4 += 1234 * wr0[1]\n" \ + \ + "vext.32 q6, q14, q15, #1 @ 1234\n" \ + "vext.32 q7, q14, q15, #2 @ 2345\n" + +#define RIGHT_RESULT_S1 \ + /* r3 */ \ + "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" \ + \ + "vbif d8, d16, d19 @ bit select, deal with right pad\n" \ + "vbif d9, d17, d23 @ bit select, deal with right pad\n" \ + \ + "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" \ + \ + "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" \ + \ + "vbif d10, d20, d19 @ bit select, deal with right pad\n" \ + "vbif d11, d21, d23 @ bit select, deal with right pad\n" \ + \ + "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add pointer\n" + +#define LEFT_RESULT_S1_RELU \ + /* r3 */ \ + "vmla.f32 q5, q14, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vld1.32 {d28-d29}, [%[din3_ptr]]! @ load din r0\n" \ + "vmax.f32 q4, q4, %q[vzero] @ relu \n" \ + \ + "vmla.f32 q5, q6, %e[wr2][0] @ q4 += 1234 * wr0[0]\n" \ + \ + "vld1.32 {d30}, [%[din3_ptr]] @ load din r0\n" \ + "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" \ + \ + "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 1234 * wr0[2]\n" \ + \ + "vext.32 q6, q8, q9, #1 @ 1234\n" \ + "vext.32 q7, q8, q9, #2 @ 2345\n" \ + "vdup.32 q4, %[bias_val] @ and \n" \ + \ + "vmax.f32 q5, q5, %q[vzero] @ relu \n" \ + \ + "cmp %[cnt], #1 @ check whether has mid cols\n" \ + \ + "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add pointer\n" \ + \ + "vdup.32 q5, %[bias_val] @ and \n" \ + "blt 3f @ jump to main loop start point\n" + +#define MID_RESULT_S1_RELU \ + /* r3 */ \ + "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" \ + \ + "vld1.32 {d28-d29}, [%[din3_ptr]]! @ load din r0\n" \ + "vmax.f32 q4, q4, %q[vzero] @ relu \n" \ + \ + "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vld1.32 {d30}, [%[din3_ptr]] @ load din r0\n" \ + "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" \ + \ + "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" \ + \ + "vext.32 q6, q8, q9, #1 @ 1234\n" \ + "vext.32 q7, q8, q9, #2 @ 2345\n" \ + "vdup.32 q4, %[bias_val] @ and \n" \ + \ + "vmax.f32 q5, q5, %q[vzero] @ relu \n" \ + \ + "vst1.32 {d10-d11}, [%[dout_ptr2]]! 
@ store result, add pointer\n" \ + \ + "subs %[cnt], #1 @ loop count minus 1\n" \ + \ + "vdup.32 q5, %[bias_val] @ and \n" \ + \ + "bne 1b @ jump to main loop start point\n" + +#define RIGHT_RESULT_S1_RELU \ + /* r3 */ \ + "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" \ + \ + "vmax.f32 q4, q4, %q[vzero] @ relu \n" \ + \ + "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vbif d8, d16, d19 @ bit select, deal with right pad\n" \ + "vbif d9, d17, d23 @ bit select, deal with right pad\n" \ + \ + "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" \ + "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" \ + \ + "vmax.f32 q5, q5, %q[vzero] @ relu \n" \ + \ + "vbif d10, d20, d19 @ bit select, deal with right pad\n" \ + "vbif d11, d21, d23 @ bit select, deal with right pad\n" \ + \ + "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add pointer\n" + +#define COMPUTE_S_S1 \ + "pld [%[din0]]\n" \ + "pld [%[din1]]\n" \ + "pld [%[din2]]\n" \ + "pld [%[din3]]\n" \ + \ + "vld1.32 {d12-d13}, [%[din0]]!\n" \ + "vld1.32 {d14-d15}, [%[din1]]!\n" \ + "vld1.32 {d16-d17}, [%[din2]]!\n" \ + "vld1.32 {d18-d19}, [%[din3]]!\n" \ + \ + "vbif q6, %q[vzero], %q[mask]\n" \ + "vbif q7, %q[vzero], %q[mask]\n" \ + "vbif q8, %q[vzero], %q[mask]\n" \ + "vbif q9, %q[vzero], %q[mask]\n" \ + \ + "vmul.f32 q14, q6, %e[wr0][1]\n" \ + "vmul.f32 q15, q7, %e[wr0][1]\n" \ + \ + "vmla.f32 q14, q7, %e[wr1][1]\n" \ + "vmla.f32 q15, q8, %e[wr1][1]\n" \ + \ + "vmla.f32 q14, q8, %e[wr2][1]\n" \ + "vmla.f32 q15, q9, %e[wr2][1]\n" \ + \ + "vext.32 q10, %q[vzero], q6, #3\n" \ + "vext.32 q11, %q[vzero], q7, #3\n" \ + "vext.32 q12, %q[vzero], q8, #3\n" \ + "vext.32 q13, %q[vzero], q9, #3\n" \ + \ + "vmla.f32 q14, q10, %e[wr0][0]\n" \ + "vmla.f32 q15, q11, %e[wr0][0]\n" \ + \ + "vmla.f32 q14, q11, %e[wr1][0]\n" \ + "vmla.f32 q15, q12, %e[wr1][0]\n" \ + \ + "vmla.f32 q14, q12, %e[wr2][0]\n" \ + "vmla.f32 q15, q13, %e[wr2][0]\n" \ + \ + "vext.32 q10, q6, %q[vzero], #1\n" \ + "vext.32 q11, q7, %q[vzero], #1\n" \ + "vext.32 q12, q8, %q[vzero], #1\n" \ + "vext.32 q13, q9, %q[vzero], #1\n" \ + \ + "vmla.f32 q14, q10, %f[wr0][0]\n" \ + "vmla.f32 q15, q11, %f[wr0][0]\n" \ + \ + "vmla.f32 q14, q11, %f[wr1][0]\n" \ + "vmla.f32 q15, q12, %f[wr1][0]\n" \ + \ + "vmla.f32 q14, q12, %f[wr2][0]\n" \ + "vmla.f32 q15, q13, %f[wr2][0]\n" \ + \ + "vadd.f32 q14, q14, %q[bias]\n" \ + "vadd.f32 q15, q15, %q[bias]\n" + +#define RESULT_S_S1 \ + "pld [%[out1]]\n" \ + "pld [%[out2]]\n" \ + \ + "vst1.32 {d28-d29}, [%[out1]]\n" \ + "vst1.32 {d30-d31}, [%[out2]]\n" + +#define RESULT_S_S1_RELU \ + "pld [%[out1]]\n" \ + "pld [%[out2]]\n" \ + \ + "vmax.f32 q14, q14, %q[vzero]\n" \ + "vmax.f32 q15, q15, %q[vzero]\n" \ + \ + "vst1.32 {d28-d29}, [%[out1]]\n" \ + "vst1.32 {d30-d31}, [%[out2]]\n" + +#define COMPUTE_S_S1_P0 \ + "pld [%[din0]]\n" \ + "pld [%[din1]]\n" \ + "pld [%[din2]]\n" \ + "pld [%[din3]]\n" \ + "vld1.32 {d16-d18}, [%[din0]] @ load din r0\n" \ + "vld1.32 {d20-d22}, [%[din1]] @ load din r1\n" \ + "vld1.32 {d24-d26}, [%[din2]] @ load din r2\n" \ + "vld1.32 {d28-d30}, [%[din3]] @ load din r3\n" \ + \ + "vdup.32 q4, %[bias_val] @ and \n" \ + "vdup.32 q5, %[bias_val] @ and \n" \ + \ + "vld1.32 {d19}, [%[vmask]]! @ load din r0\n" \ + "vld1.32 {d23}, [%[vmask]]! @ load din r0\n" \ + \ + "vld1.32 {d27}, [%[vmask]]! 
@ load din r0\n" \ + \ + "vbif d16, %e[vzero], d19 @ bit select, deal with right pad\n" \ + "vbif d20, %e[vzero], d19 @ bit select, deal with right pad\n" \ + \ + "vbif d17, %e[vzero], d23 @ bit select, deal with right pad\n" \ + "vbif d21, %e[vzero], d23 @ bit select, deal with right pad\n" \ + \ + "vbif d18, %e[vzero], d27 @ bit select, deal with right pad\n" \ + "vbif d22, %e[vzero], d27 @ bit select, deal with right pad\n" \ + \ + "vext.32 q6, q8, q9, #1 @ 1234\n" \ + "vext.32 q7, q8, q9, #2 @ 2345\n" /* r0 */ \ + "vmla.f32 q4, q8, %e[wr0][0] @ q4 += 0123 * wr0[0]\n" \ + \ + "vbif d24, %e[vzero], d19 @ bit select, deal with right pad\n" \ + "vbif d25, %e[vzero], d23 @ bit select, deal with right pad\n" \ + "vbif d26, %e[vzero], d27 @ bit select, deal with right pad\n" \ + \ + "vmla.f32 q4, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vbif d28, %e[vzero], d19 @ bit select, deal with right pad\n" \ + "vbif d29, %e[vzero], d23 @ bit select, deal with right pad\n" \ + "vbif d30, %e[vzero], d27 @ bit select, deal with right pad\n" \ + \ + "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 2345 * wr0[2]\n" \ + \ + "vext.32 q6, q10, q11, #1 @ 1234\n" \ + "vext.32 q7, q10, q11, #2 @ 2345\n" /* r1 */ \ + "vmla.f32 q5, q10, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" \ + "vmla.f32 q4, q10, %e[wr1][0] @ q4 += 1234 * wr0[1]\n" \ + \ + "vmul.f32 q8, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" \ + "vmul.f32 q10, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vmul.f32 q9, q7, %f[wr0][0] @ q4 += 1234 * wr0[1]\n" \ + "vmul.f32 q11, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" \ + \ + "vext.32 q6, q12, q13, #1 @ 1234\n" \ + "vext.32 q7, q12, q13, #2 @ 2345\n" /* r2 */ \ + "vmla.f32 q5, q12, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" \ + "vmla.f32 q4, q12, %e[wr2][0] @ q4 += 1234 * wr0[1]\n" \ + \ + "vmla.f32 q8, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" \ + "vmla.f32 q10, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vmla.f32 q9, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" \ + "vmla.f32 q11, q7, %f[wr2][0] @ q4 += 1234 * wr0[1]\n" \ + \ + "vext.32 q6, q14, q15, #1 @ 1234\n" \ + "vext.32 q7, q14, q15, #2 @ 2345\n" /* r3 */ \ + "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" \ + \ + "vmla.f32 q8, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ + "vadd.f32 q4, q4, q10 @ q4 += q10 \n" \ + \ + "pld [%[out1]]\n" \ + "pld [%[out2]]\n" \ + \ + "vmla.f32 q9, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" \ + "vadd.f32 q14, q4, q11 @ q4 += q10 \n" \ + \ + "vadd.f32 q5, q5, q8 @ q4 += q10 \n" \ + "vadd.f32 q15, q5, q9 @ q4 += q10 \n" + +#endif +/** + * \brief depthwise convolution, kernel size 3x3, stride 1, pad 1, with bias, + * width > 4 + */ +void conv_depthwise_3x3s1p1_bias(float *dout, + const float *din, + const float *weights, + const float *bias, + bool flag_bias, + bool flag_relu, + const int num, + const int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + ARMContext *ctx) { + //! pad is done implicit + const float zero[8] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; + //! 
for 4x6 convolution window + const unsigned int right_pad_idx[8] = {5, 4, 3, 2, 1, 0, 0, 0}; + + float *zero_ptr = ctx->workspace_data(); + memset(zero_ptr, 0, w_in * sizeof(float)); + float *write_ptr = zero_ptr + w_in; + + int size_in_channel = w_in * h_in; + int size_out_channel = w_out * h_out; + int w_stride = 9; + + int tile_w = (w_in + 3) >> 2; + int cnt_col = tile_w - 2; + + unsigned int size_pad_right = (unsigned int)(1 + (tile_w << 2) - w_in); + + uint32x4_t vmask_rp1 = + vcgeq_u32(vld1q_u32(right_pad_idx), vdupq_n_u32(size_pad_right)); + uint32x4_t vmask_rp2 = + vcgeq_u32(vld1q_u32(right_pad_idx + 4), vdupq_n_u32(size_pad_right)); + uint32x4_t vmask_result = + vcgtq_u32(vld1q_u32(right_pad_idx), vdupq_n_u32(size_pad_right)); + + unsigned int vmask[8]; + vst1q_u32(vmask, vmask_rp1); + vst1q_u32(vmask + 4, vmask_rp2); + + unsigned int rmask[4]; + vst1q_u32(rmask, vmask_result); + + float32x4_t vzero = vdupq_n_f32(0.f); + + for (int n = 0; n < num; ++n) { + const float *din_batch = din + n * ch_in * size_in_channel; + float *dout_batch = dout + n * ch_in * size_out_channel; +#pragma omp parallel for + for (int c = 0; c < ch_in; c++) { + float *dout_ptr = dout_batch + c * size_out_channel; + + const float *din_ch_ptr = din_batch + c * size_in_channel; + + float bias_val = flag_bias ? bias[c] : 0.f; + float vbias[4] = {bias_val, bias_val, bias_val, bias_val}; + + const float *wei_ptr = weights + c * w_stride; + + float32x4_t wr0 = vld1q_f32(wei_ptr); + float32x4_t wr1 = vld1q_f32(wei_ptr + 3); + float32x4_t wr2 = vld1q_f32(wei_ptr + 6); + + float *doutr0 = dout_ptr; + float *doutr1 = doutr0 + w_out; + float *doutr2 = doutr1 + w_out; + float *doutr3 = doutr2 + w_out; + + const float *dr0 = din_ch_ptr; + const float *dr1 = dr0 + w_in; + const float *dr2 = dr1 + w_in; + const float *dr3 = dr2 + w_in; + const float *dr4 = dr3 + w_in; + const float *dr5 = dr4 + w_in; + + const float *din_ptr0 = dr0; + const float *din_ptr1 = dr1; + const float *din_ptr2 = dr2; + const float *din_ptr3 = dr3; + const float *din_ptr4 = dr4; + const float *din_ptr5 = dr5; + float *ptr_zero = const_cast(zero); +#ifdef __aarch64__ + for (int i = 0; i < h_in; i += 4) { + //! process top pad pad_h = 1 + din_ptr0 = dr0; + din_ptr1 = dr1; + din_ptr2 = dr2; + din_ptr3 = dr3; + din_ptr4 = dr4; + din_ptr5 = dr5; + + doutr0 = dout_ptr; + doutr1 = doutr0 + w_out; + doutr2 = doutr1 + w_out; + doutr3 = doutr2 + w_out; + if (i == 0) { + din_ptr0 = zero_ptr; + din_ptr1 = dr0; + din_ptr2 = dr1; + din_ptr3 = dr2; + din_ptr4 = dr3; + din_ptr5 = dr4; + dr0 = dr3; + dr1 = dr4; + dr2 = dr5; + } else { + dr0 = dr4; + dr1 = dr5; + dr2 = dr1 + w_in; + } + dr3 = dr2 + w_in; + dr4 = dr3 + w_in; + dr5 = dr4 + w_in; + + //! process bottom pad + if (i + 5 > h_in) { + switch (i + 5 - h_in) { + case 5: + din_ptr1 = zero_ptr; + case 4: + din_ptr2 = zero_ptr; + case 3: + din_ptr3 = zero_ptr; + case 2: + din_ptr4 = zero_ptr; + case 1: + din_ptr5 = zero_ptr; + default: + break; + } + } + //! 
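+          // Note: the switch above relies on intentional fall-through: every
+          // input row index past h_in is redirected to zero_ptr (a zeroed
+          // scratch row), and the block below likewise redirects output rows
+          // past h_out to write_ptr (a scratch row), so the four-row kernel
+          // always runs unconditionally.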
process bottom remain + if (i + 4 > h_out) { + switch (i + 4 - h_out) { + case 3: + doutr1 = write_ptr; + case 2: + doutr2 = write_ptr; + case 1: + doutr3 = write_ptr; + default: + break; + } + } + + int cnt = cnt_col; + if (flag_relu) { + asm volatile( + INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1_RELU MID_COMPUTE_S1 + MID_RESULT_S1_RELU RIGHT_COMPUTE_S1 RIGHT_RESULT_S1_RELU + : [cnt] "+r"(cnt), + [din_ptr0] "+r"(din_ptr0), + [din_ptr1] "+r"(din_ptr1), + [din_ptr2] "+r"(din_ptr2), + [din_ptr3] "+r"(din_ptr3), + [din_ptr4] "+r"(din_ptr4), + [din_ptr5] "+r"(din_ptr5), + [doutr0] "+r"(doutr0), + [doutr1] "+r"(doutr1), + [doutr2] "+r"(doutr2), + [doutr3] "+r"(doutr3) + : [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [bias_val] "r"(vbias), + [vmask] "r"(vmask), + [rmask] "r"(rmask), + [vzero] "w"(vzero) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21", + "v22", + "v23", + "v24", + "v25"); + } else { + asm volatile(INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1 MID_COMPUTE_S1 + MID_RESULT_S1 RIGHT_COMPUTE_S1 RIGHT_RESULT_S1 + : [cnt] "+r"(cnt), + [din_ptr0] "+r"(din_ptr0), + [din_ptr1] "+r"(din_ptr1), + [din_ptr2] "+r"(din_ptr2), + [din_ptr3] "+r"(din_ptr3), + [din_ptr4] "+r"(din_ptr4), + [din_ptr5] "+r"(din_ptr5), + [doutr0] "+r"(doutr0), + [doutr1] "+r"(doutr1), + [doutr2] "+r"(doutr2), + [doutr3] "+r"(doutr3) + : [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [bias_val] "r"(vbias), + [vmask] "r"(vmask), + [rmask] "r"(rmask), + [vzero] "w"(vzero) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21", + "v22", + "v23", + "v24", + "v25"); + } + dout_ptr = dout_ptr + 4 * w_out; + } +#else + for (int i = 0; i < h_in; i += 2) { + //! process top pad pad_h = 1 + din_ptr0 = dr0; + din_ptr1 = dr1; + din_ptr2 = dr2; + din_ptr3 = dr3; + + doutr0 = dout_ptr; + doutr1 = dout_ptr + w_out; + // unsigned int* rst_mask = rmask; + + if (i == 0) { + din_ptr0 = zero_ptr; + din_ptr1 = dr0; + din_ptr2 = dr1; + din_ptr3 = dr2; + dr0 = dr1; + dr1 = dr2; + dr2 = dr3; + dr3 = dr2 + w_in; + } else { + dr0 = dr2; + dr1 = dr3; + dr2 = dr1 + w_in; + dr3 = dr2 + w_in; + } + //! process bottom pad + if (i + 3 > h_in) { + switch (i + 3 - h_in) { + case 3: + din_ptr1 = zero_ptr; + case 2: + din_ptr2 = zero_ptr; + case 1: + din_ptr3 = zero_ptr; + default: + break; + } + } + //! 
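+          // Note (armv7 path): each pass reads four input rows (din_ptr0..3)
+          // and writes two output rows, instead of the six-in/four-out tiling
+          // of the aarch64 branch above; rows past h_in again fall back to the
+          // zeroed scratch row through the fall-through switch.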
process bottom remain + if (i + 2 > h_out) { + doutr1 = write_ptr; + } + int cnt = cnt_col; + unsigned int *rmask_ptr = rmask; + unsigned int *vmask_ptr = vmask; + if (flag_relu) { + asm volatile( + INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1_RELU MID_COMPUTE_S1 + MID_RESULT_S1_RELU RIGHT_COMPUTE_S1 RIGHT_RESULT_S1_RELU + : [dout_ptr1] "+r"(doutr0), + [dout_ptr2] "+r"(doutr1), + [din0_ptr] "+r"(din_ptr0), + [din1_ptr] "+r"(din_ptr1), + [din2_ptr] "+r"(din_ptr2), + [din3_ptr] "+r"(din_ptr3), + [cnt] "+r"(cnt), + [rmask] "+r"(rmask_ptr), + [vmask] "+r"(vmask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias_val] "r"(bias_val), + [vzero] "w"(vzero) + : "cc", + "memory", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } else { + asm volatile(INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1 MID_COMPUTE_S1 + MID_RESULT_S1 RIGHT_COMPUTE_S1 RIGHT_RESULT_S1 + : [dout_ptr1] "+r"(doutr0), + [dout_ptr2] "+r"(doutr1), + [din0_ptr] "+r"(din_ptr0), + [din1_ptr] "+r"(din_ptr1), + [din2_ptr] "+r"(din_ptr2), + [din3_ptr] "+r"(din_ptr3), + [cnt] "+r"(cnt), + [rmask] "+r"(rmask_ptr), + [vmask] "+r"(vmask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias_val] "r"(bias_val), + [vzero] "w"(vzero) + : "cc", + "memory", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } + dout_ptr += 2 * w_out; + } //! end of processing mid rows +#endif + } + } +} + +/** + * \brief depthwise convolution, kernel size 3x3, stride 1, pad 1, with bias, + * width <= 4 + */ +void conv_depthwise_3x3s1p1_bias_s(float *dout, + const float *din, + const float *weights, + const float *bias, + bool flag_bias, + bool flag_relu, + const int num, + const int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + ARMContext *ctx) { + //! 3x3s1 convolution, implemented by direct algorithm + //! pad is done implicit + //! 
for 4x6 convolution window + const int right_pad_idx[4] = {3, 2, 1, 0}; + const float zero[4] = {0.f, 0.f, 0.f, 0.f}; + + float32x4_t vzero = vdupq_n_f32(0.f); + uint32x4_t vmask_rp = + vcgeq_s32(vld1q_s32(right_pad_idx), vdupq_n_s32(4 - w_in)); + int size_in_channel = w_in * h_in; + int size_out_channel = w_out * h_out; + for (int n = 0; n < num; ++n) { + const float *din_batch = din + n * ch_in * size_in_channel; + float *dout_batch = dout + n * ch_in * size_out_channel; +#pragma omp parallel for + for (int i = 0; i < ch_in; ++i) { + float *dout_channel = dout_batch + i * size_out_channel; + const float *din_channel = din_batch + i * size_in_channel; + const float *weight_ptr = weights + i * 9; + float32x4_t wr0 = vld1q_f32(weight_ptr); + float32x4_t wr1 = vld1q_f32(weight_ptr + 3); + float32x4_t wr2 = vld1q_f32(weight_ptr + 6); + float32x4_t wbias; + if (flag_bias) { + wbias = vdupq_n_f32(bias[i]); + } else { + wbias = vdupq_n_f32(0.f); + } + + int hs = -1; + int he = 3; + + float out_buf1[4]; + float out_buf2[4]; + float trash_buf[4]; + + int h_cnt = (h_out + 1) >> 1; + float *doutr0 = dout_channel; + float *doutr1 = dout_channel + w_out; + + for (int j = 0; j < h_cnt; ++j) { + const float *dr0 = din_channel + hs * w_in; + const float *dr1 = dr0 + w_in; + const float *dr2 = dr1 + w_in; + const float *dr3 = dr2 + w_in; + + if (hs == -1) { + dr0 = zero; + } + + switch (he - h_in) { + case 2: + dr2 = zero; + doutr1 = trash_buf; + case 1: + dr3 = zero; + default: + break; + } +#ifdef __aarch64__ + if (flag_relu) { + asm volatile(COMPUTE_S_S1 RESULT_S_S1_RELU + : [din0] "+r"(dr0), + [din1] "+r"(dr1), + [din2] "+r"(dr2), + [din3] "+r"(dr3) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [zero] "w"(vzero), + [mask] "w"(vmask_rp), + [bias] "w"(wbias), + [out1] "r"(out_buf1), + [out2] "r"(out_buf2) + : "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17"); + } else { + asm volatile(COMPUTE_S_S1 RESULT_S_S1 + : [din0] "+r"(dr0), + [din1] "+r"(dr1), + [din2] "+r"(dr2), + [din3] "+r"(dr3) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [zero] "w"(vzero), + [mask] "w"(vmask_rp), + [bias] "w"(wbias), + [out1] "r"(out_buf1), + [out2] "r"(out_buf2) + : "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17"); + } +#else + if (flag_relu) { + asm volatile(COMPUTE_S_S1 RESULT_S_S1_RELU + : [din0] "+r"(dr0), + [din1] "+r"(dr1), + [din2] "+r"(dr2), + [din3] "+r"(dr3) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [vzero] "w"(vzero), + [mask] "w"(vmask_rp), + [bias] "w"(wbias), + [out1] "r"(out_buf1), + [out2] "r"(out_buf2) + : "cc", + "memory", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } else { + asm volatile(COMPUTE_S_S1 RESULT_S_S1 + : [din0] "+r"(dr0), + [din1] "+r"(dr1), + [din2] "+r"(dr2), + [din3] "+r"(dr3) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [vzero] "w"(vzero), + [mask] "w"(vmask_rp), + [bias] "w"(wbias), + [out1] "r"(out_buf1), + [out2] "r"(out_buf2) + : "cc", + "memory", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } +#endif + for (int w = 0; w < w_out; ++w) { + *doutr0++ = out_buf1[w]; + *doutr1++ = out_buf2[w]; + } + doutr0 = doutr1; + doutr1 += w_out; + hs += 2; + he += 2; + } // end of processing heights + } // end of processing channels + } // end of processing 
batches
+}
+
+/**
+ * \brief depthwise convolution, kernel size 3x3, stride 1, pad 0, with bias,
+ * width > 4
+ */
+void conv_depthwise_3x3s1p0_bias(float *dout,
+                                 const float *din,
+                                 const float *weights,
+                                 const float *bias,
+                                 bool flag_bias,
+                                 bool flag_relu,
+                                 const int num,
+                                 const int ch_in,
+                                 const int h_in,
+                                 const int w_in,
+                                 const int h_out,
+                                 const int w_out,
+                                 ARMContext *ctx) {
+  //! pad is done implicit
+  const float zero[8] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f};
+  //! for 4x6 convolution window
+  const unsigned int right_pad_idx[8] = {5, 4, 3, 2, 1, 0, 0, 0};
+
+  float *zero_ptr = ctx->workspace_data<float>();
+  memset(zero_ptr, 0, w_in * sizeof(float));
+  float *write_ptr = zero_ptr + w_in;
+
+  int size_in_channel = w_in * h_in;
+  int size_out_channel = w_out * h_out;
+  int w_stride = 9;
+
+  int tile_w = w_out >> 2;
+  int remain = w_out % 4;
+
+  unsigned int size_pad_right = (unsigned int)(6 + (tile_w << 2) - w_in);
+  const int remian_idx[4] = {0, 1, 2, 3};
+
+  uint32x4_t vmask_rp1 =
+      vcgeq_u32(vld1q_u32(right_pad_idx), vdupq_n_u32(size_pad_right));
+  uint32x4_t vmask_rp2 =
+      vcgeq_u32(vld1q_u32(right_pad_idx + 4), vdupq_n_u32(size_pad_right));
+  uint32x4_t vmask_result =
+      vcgtq_s32(vdupq_n_s32(remain), vld1q_s32(remian_idx));
+
+  unsigned int vmask[8];
+  vst1q_u32(vmask, vmask_rp1);
+  vst1q_u32(vmask + 4, vmask_rp2);
+
+  unsigned int rmask[4];
+  vst1q_u32(rmask, vmask_result);
+
+  float32x4_t vzero = vdupq_n_f32(0.f);
+
+  for (int n = 0; n < num; ++n) {
+    const float *din_batch = din + n * ch_in * size_in_channel;
+    float *dout_batch = dout + n * ch_in * size_out_channel;
+#pragma omp parallel for
+    for (int c = 0; c < ch_in; c++) {
+      float *dout_ptr = dout_batch + c * size_out_channel;
+
+      const float *din_ch_ptr = din_batch + c * size_in_channel;
+
+      float bias_val = flag_bias ? bias[c] : 0.f;
+      float vbias[4] = {bias_val, bias_val, bias_val, bias_val};
+
+      const float *wei_ptr = weights + c * w_stride;
+
+      float32x4_t wr0 = vld1q_f32(wei_ptr);
+      float32x4_t wr1 = vld1q_f32(wei_ptr + 3);
+      float32x4_t wr2 = vld1q_f32(wei_ptr + 6);
+
+      float *doutr0 = dout_ptr;
+      float *doutr1 = doutr0 + w_out;
+      float *doutr2 = doutr1 + w_out;
+      float *doutr3 = doutr2 + w_out;
+
+      const float *dr0 = din_ch_ptr;
+      const float *dr1 = dr0 + w_in;
+      const float *dr2 = dr1 + w_in;
+      const float *dr3 = dr2 + w_in;
+      const float *dr4 = dr3 + w_in;
+      const float *dr5 = dr4 + w_in;
+
+      const float *din_ptr0 = dr0;
+      const float *din_ptr1 = dr1;
+      const float *din_ptr2 = dr2;
+      const float *din_ptr3 = dr3;
+      const float *din_ptr4 = dr4;
+      const float *din_ptr5 = dr5;
+
+      float *ptr_zero = const_cast<float *>(zero);
+#ifdef __aarch64__
+      for (int i = 0; i < h_out; i += 4) {
+        //! process top pad pad_h = 1
+        din_ptr0 = dr0;
+        din_ptr1 = dr1;
+        din_ptr2 = dr2;
+        din_ptr3 = dr3;
+        din_ptr4 = dr4;
+        din_ptr5 = dr5;
+
+        doutr0 = dout_ptr;
+        doutr1 = doutr0 + w_out;
+        doutr2 = doutr1 + w_out;
+        doutr3 = doutr2 + w_out;
+
+        dr0 = dr4;
+        dr1 = dr5;
+        dr2 = dr1 + w_in;
+        dr3 = dr2 + w_in;
+        dr4 = dr3 + w_in;
+        dr5 = dr4 + w_in;
+
+        //! process bottom pad
+        if (i + 5 >= h_in) {
+          switch (i + 5 - h_in) {
+            case 4:
+              din_ptr1 = zero_ptr;
+            case 3:
+              din_ptr2 = zero_ptr;
+            case 2:
+              din_ptr3 = zero_ptr;
+            case 1:
+              din_ptr4 = zero_ptr;
+            case 0:
+              din_ptr5 = zero_ptr;
+            default:
+              break;
+          }
+        }
+        //! 
process bottom remain + if (i + 4 > h_out) { + switch (i + 4 - h_out) { + case 3: + doutr1 = write_ptr; + case 2: + doutr2 = write_ptr; + case 1: + doutr3 = write_ptr; + default: + break; + } + } + + int cnt = tile_w; + if (flag_relu) { + asm volatile( + INIT_S1 + "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ + "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ + "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234 */ + "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v17 = 2345 */ + "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ + "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ + MID_COMPUTE_S1 MID_RESULT_S1_RELU + "cmp %w[remain], #1 \n" + "blt 0f \n" RIGHT_COMPUTE_S1 + RIGHT_RESULT_S1_RELU "0: \n" + : [cnt] "+r"(cnt), + [din_ptr0] "+r"(din_ptr0), + [din_ptr1] "+r"(din_ptr1), + [din_ptr2] "+r"(din_ptr2), + [din_ptr3] "+r"(din_ptr3), + [din_ptr4] "+r"(din_ptr4), + [din_ptr5] "+r"(din_ptr5), + [doutr0] "+r"(doutr0), + [doutr1] "+r"(doutr1), + [doutr2] "+r"(doutr2), + [doutr3] "+r"(doutr3) + : [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [bias_val] "r"(vbias), + [vmask] "r"(vmask), + [rmask] "r"(rmask), + [vzero] "w"(vzero), + [remain] "r"(remain) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21", + "v22", + "v23", + "v24", + "v25"); + } else { + asm volatile( + INIT_S1 + "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ + "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ + "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234 */ + "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v17 = 2345 */ + "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ + "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ + MID_COMPUTE_S1 MID_RESULT_S1 + "cmp %w[remain], #1 \n" + "blt 0f \n" RIGHT_COMPUTE_S1 + RIGHT_RESULT_S1 "0: \n" + : [cnt] "+r"(cnt), + [din_ptr0] "+r"(din_ptr0), + [din_ptr1] "+r"(din_ptr1), + [din_ptr2] "+r"(din_ptr2), + [din_ptr3] "+r"(din_ptr3), + [din_ptr4] "+r"(din_ptr4), + [din_ptr5] "+r"(din_ptr5), + [doutr0] "+r"(doutr0), + [doutr1] "+r"(doutr1), + [doutr2] "+r"(doutr2), + [doutr3] "+r"(doutr3) + : [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [bias_val] "r"(vbias), + [vmask] "r"(vmask), + [rmask] "r"(rmask), + [vzero] "w"(vzero), + [remain] "r"(remain) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21", + "v22", + "v23", + "v24", + "v25"); + } + dout_ptr = dout_ptr + 4 * w_out; + } +#else + for (int i = 0; i < h_out; i += 2) { + din_ptr0 = dr0; + din_ptr1 = dr1; + din_ptr2 = dr2; + din_ptr3 = dr3; + + doutr0 = dout_ptr; + doutr1 = dout_ptr + w_out; + + dr0 = dr2; + dr1 = dr3; + dr2 = dr1 + w_in; + dr3 = dr2 + w_in; + //! process bottom pad + if (i + 3 >= h_in) { + switch (i + 3 - h_in) { + case 3: + din_ptr1 = zero_ptr; + case 2: + din_ptr2 = zero_ptr; + case 1: + din_ptr3 = zero_ptr; + case 0: + din_ptr3 = zero_ptr; + default: + break; + } + } + //! 
process bottom remain + if (i + 2 > h_out) { + doutr1 = write_ptr; + } + int cnt = tile_w; + unsigned int *rmask_ptr = rmask; + unsigned int *vmask_ptr = vmask; + if (flag_relu) { + asm volatile(INIT_S1 + "sub %[din0_ptr], #8 @ 0pad + 2 float data overlap\n" + "sub %[din1_ptr], #8 @ 0pad + 2 float data overlap\n" + "sub %[din2_ptr], #8 @ 0pad + 2 float data overlap\n" + "sub %[din3_ptr], #8 @ 0pad + 2 float data overlap\n" + "vext.32 q6, q8, q9, #1 @ 0012\n" + "vext.32 q7, q8, q9, #2 @ 1234\n" MID_COMPUTE_S1 + MID_RESULT_S1_RELU + "cmp %[remain], #1 \n" + "blt 0f \n" RIGHT_COMPUTE_S1 + RIGHT_RESULT_S1_RELU "0: \n" + : [dout_ptr1] "+r"(doutr0), + [dout_ptr2] "+r"(doutr1), + [din0_ptr] "+r"(din_ptr0), + [din1_ptr] "+r"(din_ptr1), + [din2_ptr] "+r"(din_ptr2), + [din3_ptr] "+r"(din_ptr3), + [cnt] "+r"(cnt), + [rmask] "+r"(rmask_ptr), + [vmask] "+r"(vmask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias_val] "r"(bias_val), + [vzero] "w"(vzero), + [remain] "r"(remain) + : "cc", + "memory", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } else { + asm volatile(INIT_S1 + "sub %[din0_ptr], #8 @ 0pad + 2 float data overlap\n" + "sub %[din1_ptr], #8 @ 0pad + 2 float data overlap\n" + "sub %[din2_ptr], #8 @ 0pad + 2 float data overlap\n" + "sub %[din3_ptr], #8 @ 0pad + 2 float data overlap\n" + "vext.32 q6, q8, q9, #1 @ 0012\n" + "vext.32 q7, q8, q9, #2 @ 1234\n" MID_COMPUTE_S1 + MID_RESULT_S1 + "cmp %[remain], #1 \n" + "blt 0f \n" RIGHT_COMPUTE_S1 + RIGHT_RESULT_S1 "0: \n" + : [dout_ptr1] "+r"(doutr0), + [dout_ptr2] "+r"(doutr1), + [din0_ptr] "+r"(din_ptr0), + [din1_ptr] "+r"(din_ptr1), + [din2_ptr] "+r"(din_ptr2), + [din3_ptr] "+r"(din_ptr3), + [cnt] "+r"(cnt), + [rmask] "+r"(rmask_ptr), + [vmask] "+r"(vmask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias_val] "r"(bias_val), + [vzero] "w"(vzero), + [remain] "r"(remain) + : "cc", + "memory", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } + dout_ptr += 2 * w_out; + } //! end of processing mid rows +#endif + } + } +} +/** + * \brief depthwise convolution, kernel size 3x3, stride 1, pad 1, with bias, + * width <= 4 + */ +void conv_depthwise_3x3s1p0_bias_s(float *dout, + const float *din, + const float *weights, + const float *bias, + bool flag_bias, + bool flag_relu, + const int num, + const int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + ARMContext *ctx) { + //! 3x3s1 convolution, implemented by direct algorithm + //! pad is done implicit + //! 
for 4x6 convolution window + const int right_pad_idx[8] = {5, 4, 3, 2, 1, 0, 0, 0}; + const float zero_ptr[4] = {0.f, 0.f, 0.f, 0.f}; + + float32x4_t vzero = vdupq_n_f32(0.f); + uint32x4_t vmask_rp1 = + vcgeq_s32(vld1q_s32(right_pad_idx), vdupq_n_s32(6 - w_in)); + uint32x4_t vmask_rp2 = + vcgeq_s32(vld1q_s32(right_pad_idx + 4), vdupq_n_s32(6 - w_in)); + + unsigned int vmask[8]; + vst1q_u32(vmask, vmask_rp1); + vst1q_u32(vmask + 4, vmask_rp2); + + int size_in_channel = w_in * h_in; + int size_out_channel = w_out * h_out; + for (int n = 0; n < num; ++n) { + const float *din_batch = din + n * ch_in * size_in_channel; + float *dout_batch = dout + n * ch_in * size_out_channel; +#pragma omp parallel for + for (int i = 0; i < ch_in; ++i) { + float *dout_channel = dout_batch + i * size_out_channel; + const float *din_channel = din_batch + i * size_in_channel; + const float *weight_ptr = weights + i * 9; + float32x4_t wr0 = vld1q_f32(weight_ptr); + float32x4_t wr1 = vld1q_f32(weight_ptr + 3); + float32x4_t wr2 = vld1q_f32(weight_ptr + 6); + +#ifdef __aarch64__ + float32x4_t wbias; + if (flag_bias) { + wbias = vdupq_n_f32(bias[i]); + } else { + wbias = vdupq_n_f32(0.f); + } +#endif // __aarch64__ + + float out_buf1[4]; + float out_buf2[4]; + float trash_buf[4]; + + float *doutr0 = dout_channel; + float *doutr1 = dout_channel + w_out; + + for (int j = 0; j < h_out; j += 2) { + const float *dr0 = din_channel + j * w_in; + const float *dr1 = dr0 + w_in; + const float *dr2 = dr1 + w_in; + const float *dr3 = dr2 + w_in; + + doutr0 = dout_channel + j * w_out; + doutr1 = doutr0 + w_out; + + if (j + 3 >= h_in) { + switch (j + 3 - h_in) { + case 3: + dr1 = zero_ptr; + case 2: + dr2 = zero_ptr; + case 1: + dr3 = zero_ptr; + doutr1 = trash_buf; + case 0: + dr3 = zero_ptr; + doutr1 = trash_buf; + default: + break; + } + } +#ifdef __aarch64__ + if (flag_relu) { + asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1_RELU + : [din0] "+r"(dr0), + [din1] "+r"(dr1), + [din2] "+r"(dr2), + [din3] "+r"(dr3) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [vbias] "w"(wbias), + [mask1] "w"(vmask_rp1), + [mask2] "w"(vmask_rp2), + [zero] "w"(vzero), + [out1] "r"(out_buf1), + [out2] "r"(out_buf2) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15"); + } else { + asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1 + : [din0] "+r"(dr0), + [din1] "+r"(dr1), + [din2] "+r"(dr2), + [din3] "+r"(dr3) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [vbias] "w"(wbias), + [mask1] "w"(vmask_rp1), + [mask2] "w"(vmask_rp2), + [zero] "w"(vzero), + [out1] "r"(out_buf1), + [out2] "r"(out_buf2) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15"); + } +#else + unsigned int *vmask_ptr = vmask; + float bias_val = flag_bias ? 
bias[i] : 0.f; + if (flag_relu) { + asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1_RELU + : [din0] "+r"(dr0), + [din1] "+r"(dr1), + [din2] "+r"(dr2), + [din3] "+r"(dr3), + [vmask] "+r"(vmask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [vzero] "w"(vzero), + [bias_val] "r"(bias_val), + [out1] "r"(out_buf1), + [out2] "r"(out_buf2) + : "cc", + "memory", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } else { + asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1 + : [din0] "+r"(dr0), + [din1] "+r"(dr1), + [din2] "+r"(dr2), + [din3] "+r"(dr3), + [vmask] "+r"(vmask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [vzero] "w"(vzero), + [bias_val] "r"(bias_val), + [out1] "r"(out_buf1), + [out2] "r"(out_buf2) + : "cc", + "memory", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } +#endif + for (int w = 0; w < w_out; ++w) { + *doutr0++ = out_buf1[w]; + *doutr1++ = out_buf2[w]; + } + } // end of processing heights + } // end of processing channels + } // end of processing batchs +} +} // namespace math +} // namespace arm +} // namespace lite +} // namespace paddle diff --git a/lite/backends/arm/math/conv3x3s1px_depthwise_fp32.cc b/lite/backends/arm/math/conv3x3s1px_depthwise_fp32.cc new file mode 100644 index 0000000000000000000000000000000000000000..08e5efecd751bcca534ba7a47035c5f70fa1f6bf --- /dev/null +++ b/lite/backends/arm/math/conv3x3s1px_depthwise_fp32.cc @@ -0,0 +1,541 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
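+//
+// The kernel below computes a 3x3, stride-1 depthwise convolution on input
+// that is pre-packed 4 channels at a time (NHWC4 layout, see
+// prepack_input_nxwc4_dw). As a rough guide to the arithmetic the
+// NEON/assembly path implements, a scalar sketch for one channel would look
+// like the loop below; the names in, wei, out and bias_value are illustrative
+// only and do not appear in this file:
+//
+//   for (int oh_i = 0; oh_i < oh; ++oh_i) {
+//     for (int ow_i = 0; ow_i < ow; ++ow_i) {
+//       float acc = bias_value;  // per-channel bias, 0 if no bias
+//       for (int kh = 0; kh < 3; ++kh) {
+//         for (int kw = 0; kw < 3; ++kw) {
+//           int ih_i = oh_i - pad_h + kh;
+//           int iw_i = ow_i - pad_w + kw;
+//           if (ih_i >= 0 && ih_i < ih && iw_i >= 0 && iw_i < win) {
+//             acc += in[ih_i * win + iw_i] * wei[kh * 3 + kw];
+//           }
+//         }
+//       }
+//       out[oh_i * ow + ow_i] = flag_relu ? std::max(acc, 0.f) : acc;
+//     }
+//   }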
+
+#include <arm_neon.h>
+#include "lite/backends/arm/math/conv_block_utils.h"
+#include "lite/backends/arm/math/conv_impl.h"
+#include "lite/core/context.h"
+#include "lite/operators/op_params.h"
+#ifdef ARM_WITH_OMP
+#include <omp.h>
+#endif
+
+namespace paddle {
+namespace lite {
+namespace arm {
+namespace math {
+void conv_3x3s1_depthwise_fp32(const float* i_data,
+                               float* o_data,
+                               int bs,
+                               int oc,
+                               int oh,
+                               int ow,
+                               int ic,
+                               int ih,
+                               int win,
+                               const float* weights,
+                               const float* bias,
+                               const operators::ConvParam& param,
+                               ARMContext* ctx) {
+  int threads = ctx->threads();
+
+  auto paddings = *param.paddings;
+  const int pad_h = paddings[0];
+  const int pad_w = paddings[2];
+
+  const int out_c_block = 4;
+  const int out_h_kernel = 2;
+  const int out_w_kernel = 4;
+  const int win_ext = ow + 2;
+  const int ow_round = ROUNDUP(ow, 4);
+  const int win_round = ROUNDUP(win_ext, 4);
+  const int hin_round = oh + 2;
+  const int prein_size = win_round * hin_round * out_c_block;
+  auto workspace_size =
+      threads * prein_size + win_round /*tmp zero*/ + ow_round /*tmp writer*/;
+  ctx->ExtendWorkspace(sizeof(float) * workspace_size);
+
+  bool flag_relu = param.fuse_relu;
+  bool flag_bias = param.bias != nullptr;
+
+  /// get workspace
+  float* ptr_zero = ctx->workspace_data<float>();
+  memset(ptr_zero, 0, sizeof(float) * win_round);
+  float* ptr_write = ptr_zero + win_round;
+
+  int size_in_channel = win * ih;
+  int size_out_channel = ow * oh;
+
+  int ws = -pad_w;
+  int we = ws + win_round;
+  int hs = -pad_h;
+  int he = hs + hin_round;
+  int w_loop = ow_round / 4;
+  auto remain = w_loop * 4 - ow;
+  bool flag_remain = remain > 0;
+  remain = 4 - remain;
+  remain = remain > 0 ? remain : 0;
+  int row_len = win_round * out_c_block;
+
+  for (int n = 0; n < bs; ++n) {
+    const float* din_batch = i_data + n * ic * size_in_channel;
+    float* dout_batch = o_data + n * oc * size_out_channel;
+#pragma omp parallel for num_threads(threads)
+    for (int c = 0; c < oc; c += out_c_block) {
+#ifdef ARM_WITH_OMP
+      float* pre_din = ptr_write + ow_round + omp_get_thread_num() * prein_size;
+#else
+      float* pre_din = ptr_write + ow_round;
+#endif
+      /// const array size
+      float pre_out[out_c_block * out_w_kernel * out_h_kernel];  // NOLINT
+      prepack_input_nxwc4_dw(
+          din_batch, pre_din, c, hs, he, ws, we, ic, win, ih, ptr_zero);
+      const float* weight_c = weights + c * 9;  // kernel_w * kernel_h
+      float* dout_c00 = dout_batch + c * size_out_channel;
+      float bias_local[4] = {0, 0, 0, 0};
+      if (flag_bias) {
+        bias_local[0] = bias[c];
+        bias_local[1] = bias[c + 1];
+        bias_local[2] = bias[c + 2];
+        bias_local[3] = bias[c + 3];
+      }
+      float32x4_t vbias = vld1q_f32(bias_local);
+#ifdef __aarch64__
+      float32x4_t w0 = vld1q_f32(weight_c);       // w0, v23
+      float32x4_t w1 = vld1q_f32(weight_c + 4);   // w1, v24
+      float32x4_t w2 = vld1q_f32(weight_c + 8);   // w2, v25
+      float32x4_t w3 = vld1q_f32(weight_c + 12);  // w3, v26
+      float32x4_t w4 = vld1q_f32(weight_c + 16);  // w4, v27
+      float32x4_t w5 = vld1q_f32(weight_c + 20);  // w5, v28
+      float32x4_t w6 = vld1q_f32(weight_c + 24);  // w6, v29
+      float32x4_t w7 = vld1q_f32(weight_c + 28);  // w7, v30
+      float32x4_t w8 = vld1q_f32(weight_c + 32);  // w8, v31
+#endif
+      for (int h = 0; h < oh; h += out_h_kernel) {
+        float* outc00 = dout_c00 + h * ow;
+        float* outc01 = outc00 + ow;
+        float* outc10 = outc00 + size_out_channel;
+        float* outc11 = outc10 + ow;
+        float* outc20 = outc10 + size_out_channel;
+        float* outc21 = outc20 + ow;
+        float* outc30 = outc20 + size_out_channel;
+        float* outc31 = outc30 + ow;
+        const float* inr0 = 
pre_din + h * row_len; + const float* inr1 = inr0 + row_len; + const float* inr2 = inr1 + row_len; + const float* inr3 = inr2 + row_len; + if (c + out_c_block > oc) { + switch (c + out_c_block - oc) { + case 3: + outc10 = ptr_write; + outc11 = ptr_write; + case 2: + outc20 = ptr_write; + outc21 = ptr_write; + case 1: + outc30 = ptr_write; + outc31 = ptr_write; + default: + break; + } + } + if (h + out_h_kernel > oh) { + outc01 = ptr_write; + outc11 = ptr_write; + outc21 = ptr_write; + outc31 = ptr_write; + } + float* outl[] = {outc00, + outc10, + outc20, + outc30, + outc01, + outc11, + outc21, + outc31, + reinterpret_cast(bias_local), + reinterpret_cast(flag_relu)}; + void* outl_ptr = reinterpret_cast(outl); + for (int w = 0; w < w_loop; ++w) { + bool flag_mask = (w == w_loop - 1) && flag_remain; + float* out0 = pre_out; +// clang-format off +#ifdef __aarch64__ + asm volatile( + "ldp q0, q1, [%[inr0]], #32\n" /* load input r0*/ + "ldp q6, q7, [%[inr1]], #32\n" /* load input r1*/ + "ldp q2, q3, [%[inr0]], #32\n" /* load input r0*/ + "ldp q8, q9, [%[inr1]], #32\n" /* load input r1*/ + "ldp q4, q5, [%[inr0]]\n" /* load input r0*/ + "ldp q10, q11, [%[inr1]]\n" /* load input r1*/ + /* r0, r1, mul w0, get out r0, r1 */ + "fmul v15.4s , %[w0].4s, v0.4s\n" /* outr00 = w0 * r0, 0*/ + "fmul v16.4s , %[w0].4s, v1.4s\n" /* outr01 = w0 * r0, 1*/ + "fmul v17.4s , %[w0].4s, v2.4s\n" /* outr02 = w0 * r0, 2*/ + "fmul v18.4s , %[w0].4s, v3.4s\n" /* outr03 = w0 * r0, 3*/ + "fmul v19.4s , %[w0].4s, v6.4s\n" /* outr10 = w0 * r1, 0*/ + "fmul v20.4s , %[w0].4s, v7.4s\n" /* outr11 = w0 * r1, 1*/ + "fmul v21.4s , %[w0].4s, v8.4s\n" /* outr12 = w0 * r1, 2*/ + "fmul v22.4s , %[w0].4s, v9.4s\n" /* outr13 = w0 * r1, 3*/ + /* r0, r1, mul w1, get out r0, r1 */ + "fmla v15.4s , %[w1].4s, v1.4s\n" /* outr00 = w1 * r0[1]*/ + "ldp q0, q1, [%[inr2]], #32\n" /* load input r2*/ + "fmla v16.4s , %[w1].4s, v2.4s\n" /* outr01 = w1 * r0[2]*/ + "fmla v17.4s , %[w1].4s, v3.4s\n" /* outr02 = w1 * r0[3]*/ + "fmla v18.4s , %[w1].4s, v4.4s\n" /* outr03 = w1 * r0[4]*/ + "fmla v19.4s , %[w1].4s, v7.4s\n" /* outr10 = w1 * r1[1]*/ + "fmla v20.4s , %[w1].4s, v8.4s\n" /* outr11 = w1 * r1[2]*/ + "fmla v21.4s , %[w1].4s, v9.4s\n" /* outr12 = w1 * r1[3]*/ + "fmla v22.4s , %[w1].4s, v10.4s\n"/* outr13 = w1 * r1[4]*/ + /* r0, r1, mul w2, get out r0, r1 */ + "fmla v15.4s , %[w2].4s, v2.4s\n" /* outr00 = w2 * r0[2]*/ + "fmla v16.4s , %[w2].4s, v3.4s\n" /* outr01 = w2 * r0[3]*/ + "ldp q2, q3, [%[inr2]], #32\n" /* load input r2*/ + "fmla v17.4s , %[w2].4s, v4.4s\n" /* outr02 = w2 * r0[4]*/ + "fmla v18.4s , %[w2].4s, v5.4s\n" /* outr03 = w2 * r0[5]*/ + "ldp q4, q5, [%[inr2]]\n" /* load input r2*/ + "fmla v19.4s , %[w2].4s, v8.4s\n" /* outr10 = w2 * r1[2]*/ + "fmla v20.4s , %[w2].4s, v9.4s\n" /* outr11 = w2 * r1[3]*/ + "fmla v21.4s , %[w2].4s, v10.4s\n"/* outr12 = w2 * r1[4]*/ + "fmla v22.4s , %[w2].4s, v11.4s\n"/* outr13 = w2 * r1[5]*/ + /* r1, r2, mul w3, get out r0, r1 */ + "fmla v15.4s , %[w3].4s, v6.4s\n" /* outr00 = w3 * r1[0]*/ + "fmla v16.4s , %[w3].4s, v7.4s\n" /* outr01 = w3 * r1[1]*/ + "fmla v17.4s , %[w3].4s, v8.4s\n" /* outr02 = w3 * r1[2]*/ + "fmla v18.4s , %[w3].4s, v9.4s\n" /* outr03 = w3 * r1[3]*/ + "fmla v19.4s , %[w3].4s, v0.4s\n" /* outr10 = w3 * r2[0]*/ + "fmla v20.4s , %[w3].4s, v1.4s\n" /* outr11 = w3 * r2[1]*/ + "fmla v21.4s , %[w3].4s, v2.4s\n" /* outr12 = w3 * r2[2]*/ + "fmla v22.4s , %[w3].4s, v3.4s\n" /* outr13 = w3 * r2[3]*/ + /* r1, r2, mul w4, get out r0, r1 */ + "fmla v15.4s , %[w4].4s, v7.4s\n" /* outr00 = w4 * r1[1]*/ + 
"ldp q6, q7, [%[inr3]], #32\n" /* load input r3*/ + "fmla v16.4s , %[w4].4s, v8.4s\n" /* outr01 = w4 * r1[2]*/ + "fmla v17.4s , %[w4].4s, v9.4s\n" /* outr02 = w4 * r1[3]*/ + "fmla v18.4s , %[w4].4s, v10.4s\n"/* outr03 = w4 * r1[4]*/ + "ldp x0, x1, [%[outl]] \n" + "fmla v19.4s , %[w4].4s, v1.4s\n" /* outr10 = w4 * r2[1]*/ + "fmla v20.4s , %[w4].4s, v2.4s\n" /* outr11 = w4 * r2[2]*/ + "fmla v21.4s , %[w4].4s, v3.4s\n" /* outr12 = w4 * r2[3]*/ + "fmla v22.4s , %[w4].4s, v4.4s\n" /* outr13 = w4 * r2[4]*/ + /* r1, r2, mul w5, get out r0, r1 */ + "fmla v15.4s , %[w5].4s, v8.4s\n" /* outr00 = w5 * r1[2]*/ + "fmla v16.4s , %[w5].4s, v9.4s\n" /* outr01 = w5 * r1[3]*/ + "ldp q8, q9, [%[inr3]], #32\n" /* load input r3*/ + "fmla v17.4s , %[w5].4s, v10.4s\n"/* outr02 = w5 * r1[4]*/ + "fmla v18.4s , %[w5].4s, v11.4s\n"/* outr03 = w5 * r1[5]*/ + "ldp q10, q11, [%[inr3]]\n" /* load input r3*/ + "fmla v19.4s , %[w5].4s, v2.4s\n" /* outr10 = w5 * r2[2]*/ + "fmla v20.4s , %[w5].4s, v3.4s\n" /* outr11 = w5 * r2[3]*/ + "fmla v21.4s , %[w5].4s, v4.4s\n" /* outr12 = w5 * r2[4]*/ + "fmla v22.4s , %[w5].4s, v5.4s\n" /* outr13 = w5 * r2[5]*/ + /* r2, r3, mul w6, get out r0, r1 */ + "fmla v15.4s , %[w6].4s, v0.4s\n" /* outr00 = w6 * r2[0]*/ + "fmla v16.4s , %[w6].4s, v1.4s\n" /* outr01 = w6 * r2[1]*/ + "fmla v17.4s , %[w6].4s, v2.4s\n" /* outr02 = w6 * r2[2]*/ + "fmla v18.4s , %[w6].4s, v3.4s\n" /* outr03 = w6 * r2[3]*/ + "ldp x2, x3, [%[outl], #16] \n" + "fmla v19.4s , %[w6].4s, v6.4s\n" /* outr10 = w6 * r3[0]*/ + "fmla v20.4s , %[w6].4s, v7.4s\n" /* outr11 = w6 * r3[1]*/ + "fmla v21.4s , %[w6].4s, v8.4s\n" /* outr12 = w6 * r3[2]*/ + "fmla v22.4s , %[w6].4s, v9.4s\n" /* outr13 = w6 * r3[3]*/ + /* r2, r3, mul w7, get out r0, r1 */ + "fmla v15.4s , %[w7].4s, v1.4s\n" /* outr00 = w7 * r2[1]*/ + "fmla v16.4s , %[w7].4s, v2.4s\n" /* outr01 = w7 * r2[2]*/ + "fmla v17.4s , %[w7].4s, v3.4s\n" /* outr02 = w7 * r2[3]*/ + "fmla v18.4s , %[w7].4s, v4.4s\n" /* outr03 = w7 * r2[4]*/ + "ldp x4, x5, [%[outl], #32] \n" + "fmla v19.4s , %[w7].4s, v7.4s\n" /* outr10 = w7 * r3[1]*/ + "fmla v20.4s , %[w7].4s, v8.4s\n" /* outr11 = w7 * r3[2]*/ + "fmla v21.4s , %[w7].4s, v9.4s\n" /* outr12 = w7 * r3[3]*/ + "fmla v22.4s , %[w7].4s, v10.4s\n"/* outr13 = w7 * r3[4]*/ + /* r2, r3, mul w8, get out r0, r1 */ + "fmla v15.4s , %[w8].4s, v2.4s\n" /* outr00 = w8 * r2[2]*/ + "fmla v16.4s , %[w8].4s, v3.4s\n" /* outr01 = w8 * r2[3]*/ + "fmla v17.4s , %[w8].4s, v4.4s\n" /* outr02 = w8 * r2[0]*/ + "fmla v18.4s , %[w8].4s, v5.4s\n" /* outr03 = w8 * r2[1]*/ + "ldp x6, x7, [%[outl], #48] \n" + "fmla v19.4s , %[w8].4s, v8.4s\n" /* outr10 = w8 * r3[2]*/ + "fmla v20.4s , %[w8].4s, v9.4s\n" /* outr11 = w8 * r3[3]*/ + "fmla v21.4s , %[w8].4s, v10.4s\n"/* outr12 = w8 * r3[0]*/ + "fmla v22.4s , %[w8].4s, v11.4s\n"/* outr13 = w8 * r3[1]*/ + + "fadd v15.4s, v15.4s, %[vbias].4s\n"/* add bias */ + "fadd v16.4s, v16.4s, %[vbias].4s\n"/* add bias */ + "fadd v17.4s, v17.4s, %[vbias].4s\n"/* add bias */ + "fadd v18.4s, v18.4s, %[vbias].4s\n"/* add bias */ + "fadd v19.4s, v19.4s, %[vbias].4s\n"/* add bias */ + "fadd v20.4s, v20.4s, %[vbias].4s\n"/* add bias */ + "fadd v21.4s, v21.4s, %[vbias].4s\n"/* add bias */ + "fadd v22.4s, v22.4s, %[vbias].4s\n"/* add bias */ + + /* transpose */ + "trn1 v0.4s, v15.4s, v16.4s\n" /* r0: a0a1c0c1*/ + "trn2 v1.4s, v15.4s, v16.4s\n" /* r0: b0b1d0d1*/ + "trn1 v2.4s, v17.4s, v18.4s\n" /* r0: a2a3c2c3*/ + "trn2 v3.4s, v17.4s, v18.4s\n" /* r0: b2b3d2d3*/ + "trn1 v4.4s, v19.4s, v20.4s\n" /* r1: a0a1c0c1*/ + "trn2 v5.4s, v19.4s, v20.4s\n" /* 
r1: b0b1d0d1*/ + "trn1 v6.4s, v21.4s, v22.4s\n" /* r1: a2a3c2c3*/ + "trn2 v7.4s, v21.4s, v22.4s\n" /* r1: b2b3d2d3*/ + "trn1 v15.2d, v0.2d, v2.2d\n" /* r0: a0a1a2a3*/ + "trn2 v19.2d, v0.2d, v2.2d\n" /* r0: c0c1c2c3*/ + "trn1 v17.2d, v1.2d, v3.2d\n" /* r0: b0b1b2b3*/ + "trn2 v21.2d, v1.2d, v3.2d\n" /* r0: d0d1d2d3*/ + "trn1 v16.2d, v4.2d, v6.2d\n" /* r1: a0a1a2a3*/ + "trn2 v20.2d, v4.2d, v6.2d\n" /* r1: c0c1c2c3*/ + "trn1 v18.2d, v5.2d, v7.2d\n" /* r1: b0b1b2b3*/ + "trn2 v22.2d, v5.2d, v7.2d\n" /* r1: d0d1d2d3*/ + + "cbz %w[flag_relu], 0f\n" /* skip relu*/ + "movi v0.4s, #0\n" /* for relu */ + "fmax v15.4s, v15.4s, v0.4s\n" + "fmax v16.4s, v16.4s, v0.4s\n" + "fmax v17.4s, v17.4s, v0.4s\n" + "fmax v18.4s, v18.4s, v0.4s\n" + "fmax v19.4s, v19.4s, v0.4s\n" + "fmax v20.4s, v20.4s, v0.4s\n" + "fmax v21.4s, v21.4s, v0.4s\n" + "fmax v22.4s, v22.4s, v0.4s\n" + "0:\n" + "cbnz %w[flag_mask], 1f\n" + "str q15, [x0]\n" /* save outc00 */ + "str q16, [x4]\n" /* save outc01 */ + "str q17, [x1]\n" /* save outc10 */ + "str q18, [x5]\n" /* save outc11 */ + "str q19, [x2]\n" /* save outc20 */ + "str q20, [x6]\n" /* save outc21 */ + "str q21, [x3]\n" /* save outc30 */ + "str q22, [x7]\n" /* save outc31 */ + "b 2f\n" + "1:\n" + "str q15, [%[out]], #16 \n" /* save remain to pre_out */ + "str q17, [%[out]], #16 \n" /* save remain to pre_out */ + "str q19, [%[out]], #16 \n" /* save remain to pre_out */ + "str q21, [%[out]], #16 \n" /* save remain to pre_out */ + "str q16, [%[out]], #16 \n" /* save remain to pre_out */ + "str q18, [%[out]], #16 \n" /* save remain to pre_out */ + "str q20, [%[out]], #16 \n" /* save remain to pre_out */ + "str q22, [%[out]], #16 \n" /* save remain to pre_out */ + "2:\n" + :[inr0] "+r"(inr0), [inr1] "+r"(inr1), + [inr2] "+r"(inr2), [inr3] "+r"(inr3), + [out]"+r"(out0) + :[w0] "w"(w0), [w1] "w"(w1), [w2] "w"(w2), + [w3] "w"(w3), [w4] "w"(w4), [w5] "w"(w5), + [w6] "w"(w6), [w7] "w"(w7), [w8] "w"(w8), + [vbias]"w" (vbias), [outl] "r" (outl_ptr), + [flag_mask] "r" (flag_mask), [flag_relu] "r" (flag_relu) + : "cc", "memory", + "v0","v1","v2","v3","v4","v5","v6","v7", + "v8", "v9", "v10", "v11", "v15", + "v16","v17","v18","v19","v20","v21","v22", + "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7" + ); +#else + asm volatile( + /* load weights */ + "vld1.32 {d10-d13}, [%[wc0]]! @ load w0, w1, to q5, q6\n" + "vld1.32 {d14-d15}, [%[wc0]]! @ load w2, to q7\n" + /* load r0, r1 */ + "vld1.32 {d0-d3}, [%[r0]]! @ load r0, q0, q1\n" + "vld1.32 {d4-d7}, [%[r0]]! @ load r0, q2, q3\n" + /* main loop */ + "0: @ main loop\n" + /* mul r0 with w0, w1, w2, get out r0 */ + "vmul.f32 q8, q5, q0 @ w0 * inr00\n" + "vmul.f32 q9, q5, q1 @ w0 * inr01\n" + "vmul.f32 q10, q5, q2 @ w0 * inr02\n" + "vmul.f32 q11, q5, q3 @ w0 * inr03\n" + "vmla.f32 q8, q6, q1 @ w1 * inr01\n" + "vld1.32 {d0-d3}, [%[r0]] @ load r0, q0, q1\n" + "vmla.f32 q9, q6, q2 @ w1 * inr02\n" + "vmla.f32 q10, q6, q3 @ w1 * inr03\n" + "vmla.f32 q11, q6, q0 @ w1 * inr04\n" + "vmla.f32 q8, q7, q2 @ w2 * inr02\n" + "vmla.f32 q9, q7, q3 @ w2 * inr03\n" + "vld1.32 {d4-d7}, [%[r1]]! @ load r0, q2, q3\n" + "vmla.f32 q10, q7, q0 @ w2 * inr04\n" + "vmla.f32 q11, q7, q1 @ w2 * inr05\n" + "vld1.32 {d0-d3}, [%[r1]]! @ load r0, q0, q1\n" + "vld1.32 {d8-d9}, [%[wc0]]! @ load w3 to q4\n" + /* mul r1 with w0-w5, get out r0, r1 */ + "vmul.f32 q12, q5, q2 @ w0 * inr10\n" + "vmul.f32 q13, q5, q3 @ w0 * inr11\n" + "vmul.f32 q14, q5, q0 @ w0 * inr12\n" + "vmul.f32 q15, q5, q1 @ w0 * inr13\n" + "vld1.32 {d10-d11}, [%[wc0]]! 
@ load w4 to q5\n" + "vmla.f32 q8, q4, q2 @ w3 * inr10\n" + "vmla.f32 q9, q4, q3 @ w3 * inr11\n" + "vmla.f32 q10, q4, q0 @ w3 * inr12\n" + "vmla.f32 q11, q4, q1 @ w3 * inr13\n" + /* mul r1 with w1, w4, get out r1, r0 */ + "vmla.f32 q8, q5, q3 @ w4 * inr11\n" + "vmla.f32 q12, q6, q3 @ w1 * inr11\n" + "vld1.32 {d4-d7}, [%[r1]] @ load r1, q2, q3\n" + "vmla.f32 q9, q5, q0 @ w4 * inr12\n" + "vmla.f32 q13, q6, q0 @ w1 * inr12\n" + "vmla.f32 q10, q5, q1 @ w4 * inr13\n" + "vmla.f32 q14, q6, q1 @ w1 * inr13\n" + "vmla.f32 q11, q5, q2 @ w4 * inr14\n" + "vmla.f32 q15, q6, q2 @ w1 * inr14\n" + "vld1.32 {d12-d13}, [%[wc0]]! @ load w5 to q6\n" + /* mul r1 with w2, w5, get out r1, r0 */ + "vmla.f32 q12, q7, q0 @ w2 * inr12\n" + "vmla.f32 q13, q7, q1 @ w2 * inr13\n" + "vmla.f32 q8, q6, q0 @ w5 * inr12\n" + "vmla.f32 q9, q6, q1 @ w5 * inr13\n" + "vld1.32 {d0-d3}, [%[r2]]! @ load r2, q0, q1\n" + "vmla.f32 q14, q7, q2 @ w2 * inr14\n" + "vmla.f32 q15, q7, q3 @ w2 * inr15\n" + "vmla.f32 q10, q6, q2 @ w5 * inr14\n" + "vmla.f32 q11, q6, q3 @ w5 * inr15\n" + "vld1.32 {d4-d7}, [%[r2]]! @ load r2, q0, q1\n" + "vld1.32 {d14-d15}, [%[wc0]]! @ load w6, to q7\n" + /* mul r2 with w3-w8, get out r0, r1 */ + "vmla.f32 q12, q4, q0 @ w3 * inr20\n" + "vmla.f32 q13, q4, q1 @ w3 * inr21\n" + "vmla.f32 q14, q4, q2 @ w3 * inr22\n" + "vmla.f32 q15, q4, q3 @ w3 * inr23\n" + "vld1.32 {d8-d9}, [%[wc0]]! @ load w7, to q4\n" + "vmla.f32 q8, q7, q0 @ w6 * inr20\n" + "vmla.f32 q9, q7, q1 @ w6 * inr21\n" + "vmla.f32 q10, q7, q2 @ w6 * inr22\n" + "vmla.f32 q11, q7, q3 @ w6 * inr23\n" + /* mul r2 with w4, w7, get out r1, r0 */ + "vmla.f32 q8, q4, q1 @ w7 * inr21\n" + "vmla.f32 q12, q5, q1 @ w4 * inr21\n" + "vld1.32 {d0-d3}, [%[r2]] @ load r2, q0, q1\n" + "vmla.f32 q9, q4, q2 @ w7 * inr22\n" + "vmla.f32 q13, q5, q2 @ w4 * inr22\n" + "vmla.f32 q10, q4, q3 @ w7 * inr23\n" + "vmla.f32 q14, q5, q3 @ w4 * inr23\n" + "vmla.f32 q11, q4, q0 @ w7 * inr24\n" + "vmla.f32 q15, q5, q0 @ w4 * inr24\n" + "vld1.32 {d10-d11}, [%[wc0]]! @ load w8 to q5\n" + /* mul r1 with w5, w8, get out r1, r0 */ + "vmla.f32 q12, q6, q2 @ w5 * inr22\n" + "vmla.f32 q13, q6, q3 @ w5 * inr23\n" + "vmla.f32 q8, q5, q2 @ w8 * inr22\n" + "vmla.f32 q9, q5, q3 @ w8 * inr23\n" + "vld1.32 {d4-d7}, [%[r3]]! @ load r3, q2, q3\n" + "ldr r4, [%[outl], #32] @ load bias addr to r4\n" + "vmla.f32 q14, q6, q0 @ w5 * inr24\n" + "vmla.f32 q15, q6, q1 @ w5 * inr25\n" + "vmla.f32 q10, q5, q0 @ w8 * inr24\n" + "vmla.f32 q11, q5, q1 @ w8 * inr25\n" + "vld1.32 {d0-d3}, [%[r3]]! 
@ load r3, q0, q1\n" + "sub %[wc0], %[wc0], #144 @ wc0 - 144 to start address\n" + /* mul r3 with w6, w7, w8, get out r1 */ + "vmla.f32 q12, q7, q2 @ w6 * inr30\n" + "vmla.f32 q13, q7, q3 @ w6 * inr31\n" + "vmla.f32 q14, q7, q0 @ w6 * inr32\n" + "vmla.f32 q15, q7, q1 @ w6 * inr33\n" + "vmla.f32 q12, q4, q3 @ w7 * inr31\n" + "vld1.32 {d4-d7}, [%[r3]] @ load r3, q2, q3\n" + "vld1.32 {d12-d13}, [r4] @ load bias\n" + "vmla.f32 q13, q4, q0 @ w7 * inr32\n" + "vmla.f32 q14, q4, q1 @ w7 * inr33\n" + "vmla.f32 q15, q4, q2 @ w7 * inr34\n" + "ldr r0, [%[outl]] @ load outc00 to r0\n" + "vmla.f32 q12, q5, q0 @ w8 * inr32\n" + "vmla.f32 q13, q5, q1 @ w8 * inr33\n" + "ldr r5, [%[outl], #36] @ load flag_relu to r5\n" + "vmla.f32 q14, q5, q2 @ w8 * inr34\n" + "vmla.f32 q15, q5, q3 @ w8 * inr35\n" + "ldr r1, [%[outl], #4] @ load outc10 to r1\n" + "vadd.f32 q8, q8, q6 @ r00 add bias\n" + "vadd.f32 q9, q9, q6 @ r01 add bias\n" + "vadd.f32 q10, q10, q6 @ r02 add bias\n" + "vadd.f32 q11, q11, q6 @ r03 add bias\n" + "ldr r2, [%[outl], #8] @ load outc20 to r2\n" + "vadd.f32 q12, q12, q6 @ r10 add bias\n" + "vadd.f32 q13, q13, q6 @ r11 add bias\n" + "vadd.f32 q14, q14, q6 @ r12 add bias\n" + "vadd.f32 q15, q15, q6 @ r13 add bias\n" + "ldr r3, [%[outl], #12] @ load outc30 to r3\n" + "vmov.u32 q7, #0 @ mov zero to q7\n" + "cmp r5, #0 @ cmp flag relu\n" + "beq 1f @ skip relu\n" + "vmax.f32 q8, q8, q7 @ r00 relu\n" + "vmax.f32 q9, q9, q7 @ r01 relu\n" + "vmax.f32 q10, q10, q7 @ r02 relu\n" + "vmax.f32 q11, q11, q7 @ r03 relu\n" + "vmax.f32 q12, q12, q7 @ r10 relu\n" + "vmax.f32 q13, q13, q7 @ r11 relu\n" + "vmax.f32 q14, q14, q7 @ r12 relu\n" + "vmax.f32 q15, q15, q7 @ r13 relu\n" + "1:\n" + "ldr r4, [%[outl], #16] @ load outc01 to r4\n" + "vtrn.32 q8, q9 @ r0: q8 : a0a1c0c1, q9 : b0b1d0d1\n" + "vtrn.32 q10, q11 @ r0: q10: a2a3c2c3, q11: b2b3d2d3\n" + "vtrn.32 q12, q13 @ r1: q12: a0a1c0c1, q13: b0b1d0d1\n" + "vtrn.32 q14, q15 @ r1: q14: a2a3c2c3, q15: b2b3d2d3\n" + "ldr r5, [%[outl], #20] @ load outc11 to r5\n" + "vswp d17, d20 @ r0: q8 : a0a1a2a3, q10: c0c1c2c3 \n" + "vswp d19, d22 @ r0: q9 : b0b1b2b3, q11: d0d1d2d3 \n" + "vswp d25, d28 @ r1: q12: a0a1a2a3, q14: c0c1c2c3 \n" + "vswp d27, d30 @ r1: q13: b0b1b2b3, q15: d0d1d2d3 \n" + "cmp %[flag_mask], #0 @ cmp flag mask\n" + "bne 2f\n" + "vst1.32 {d16-d17}, [r0] @ save outc00\n" + "vst1.32 {d18-d19}, [r1] @ save outc10\n" + "vst1.32 {d20-d21}, [r2] @ save outc20\n" + "vst1.32 {d22-d23}, [r3] @ save outc30\n" + "vst1.32 {d24-d25}, [r4] @ save outc01\n" + "vst1.32 {d26-d27}, [r5] @ save outc11\n" + "ldr r0, [%[outl], #24] @ load outc21 to r0\n" + "ldr r1, [%[outl], #28] @ load outc31 to r1\n" + "vst1.32 {d28-d29}, [r0] @ save outc21\n" + "vst1.32 {d30-d31}, [r1] @ save outc31\n" + "b 3f @ branch end\n" + "2: \n" + "vst1.32 {d16-d17}, [%[out0]]! @ save remain to pre_out\n" + "vst1.32 {d18-d19}, [%[out0]]! @ save remain to pre_out\n" + "vst1.32 {d20-d21}, [%[out0]]! @ save remain to pre_out\n" + "vst1.32 {d22-d23}, [%[out0]]! @ save remain to pre_out\n" + "vst1.32 {d24-d25}, [%[out0]]! @ save remain to pre_out\n" + "vst1.32 {d26-d27}, [%[out0]]! @ save remain to pre_out\n" + "vst1.32 {d28-d29}, [%[out0]]! @ save remain to pre_out\n" + "vst1.32 {d30-d31}, [%[out0]]! 
@ save remain to pre_out\n" + "3: \n" + : [r0] "+r"(inr0), [r1] "+r"(inr1), + [r2] "+r"(inr2), [r3] "+r"(inr3), + [out0] "+r"(out0), [wc0] "+r"(weight_c) + : [flag_mask] "r" (flag_mask), [outl] "r" (outl_ptr) + : "cc", "memory", + "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", + "q10", "q11", "q12", "q13","q14", "q15", "r0", "r1", "r2", "r3", "r4", "r5" + ); +#endif // __arch64__ + // clang-format on + outl[0] += 4; + outl[1] += 4; + outl[2] += 4; + outl[3] += 4; + outl[4] += 4; + outl[5] += 4; + outl[6] += 4; + outl[7] += 4; + if (flag_mask) { + memcpy(outl[0] - 4, pre_out, remain * sizeof(float)); + memcpy(outl[1] - 4, pre_out + 4, remain * sizeof(float)); + memcpy(outl[2] - 4, pre_out + 8, remain * sizeof(float)); + memcpy(outl[3] - 4, pre_out + 12, remain * sizeof(float)); + memcpy(outl[4] - 4, pre_out + 16, remain * sizeof(float)); + memcpy(outl[5] - 4, pre_out + 20, remain * sizeof(float)); + memcpy(outl[6] - 4, pre_out + 24, remain * sizeof(float)); + memcpy(outl[7] - 4, pre_out + 28, remain * sizeof(float)); + } + } + } + } + } +} + +} // namespace math +} // namespace arm +} // namespace lite +} // namespace paddle diff --git a/lite/backends/arm/math/conv3x3s2_direct_fp32.cc b/lite/backends/arm/math/conv3x3s2_direct_fp32.cc index 8260718a50f8e2fa8497d41d958e82a45ea0480d..807135f57dfadf690277ab57bd5597e9470ae549 100644 --- a/lite/backends/arm/math/conv3x3s2_direct_fp32.cc +++ b/lite/backends/arm/math/conv3x3s2_direct_fp32.cc @@ -32,10 +32,11 @@ size_t conv3x3s2_direct_workspace_size(const operators::ConvParam& param, ARMContext* ctx) { auto dim_in = param.x->dims(); auto dim_out = param.output->dims(); + auto paddings = *param.paddings; const int threads = ctx->threads(); int llc_size = ctx->llc_size() / sizeof(float); - const int pad_w = param.paddings[1]; - const int pad_h = param.paddings[0]; + const int pad_w = paddings[2]; + const int pad_h = paddings[0]; int ow = dim_out[3]; int oh = dim_out[2]; int ic = dim_in[1]; @@ -73,10 +74,11 @@ void conv_3x3s2_direct_fp32(const float* i_data, //! 3x3s2 convolution, implemented by direct algorithm //! prepack input to tmp buffer //! write output to tmp buffer + auto paddings = *param.paddings; const int threads = ctx->threads(); int l2_size = ctx->llc_size() / sizeof(float); - const int pad_w = param.paddings[1]; - const int pad_h = param.paddings[0]; + const int pad_w = paddings[2]; + const int pad_h = paddings[0]; const int wout_round = ROUNDUP(ow, OUT_W_BLOCK); const int win_round = wout_round * 2 /*stride_w*/ + 1; bool flag_relu = param.fuse_relu; diff --git a/lite/backends/arm/math/conv3x3s2_direct_int8.cc b/lite/backends/arm/math/conv3x3s2_direct_int8.cc index 01b7a812ebc05a054bb9952bf53605ce7aed135a..26829544bfd34d7acfc1d49086e86c3e0edad5f1 100644 --- a/lite/backends/arm/math/conv3x3s2_direct_int8.cc +++ b/lite/backends/arm/math/conv3x3s2_direct_int8.cc @@ -46,10 +46,11 @@ void conv_3x3s2_direct_int8(const int8_t* din, //! 3x3s2 int8 convolution, implemented by direct algorithm //! prepack input to tmp buffer //! write output to tmp buffer + auto paddings = *param.paddings; bool flag_relu = param.fuse_relu; bool flag_bias = param.bias; - int pad_h = param.paddings[0]; - int pad_w = param.paddings[1]; + int pad_h = paddings[0]; + int pad_w = paddings[1]; const int threads = ctx->threads(); int llc_size = ctx->llc_size() / 4; @@ -472,10 +473,11 @@ void conv_3x3s2_direct_int8(const int8_t* din, //! 3x3s2 int8 convolution, implemented by direct algorithm //! prepack input to tmp buffer //! 
write output to tmp buffer + auto paddings = *param.paddings; bool flag_relu = param.fuse_relu; bool flag_bias = param.bias; - int pad_h = param.paddings[0]; - int pad_w = param.paddings[1]; + int pad_h = paddings[0]; + int pad_w = paddings[1]; const int threads = ctx->threads(); //! set 1/4 l2 cache int llc_size = ctx->llc_size() / 4; diff --git a/lite/backends/arm/math/conv3x3s2p01_depthwise_fp32.cc b/lite/backends/arm/math/conv3x3s2p01_depthwise_fp32.cc new file mode 100644 index 0000000000000000000000000000000000000000..455781e37e0747950e6740f6db45c1ce8c0e96c8 --- /dev/null +++ b/lite/backends/arm/math/conv3x3s2p01_depthwise_fp32.cc @@ -0,0 +1,1862 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include "lite/backends/arm/math/conv_depthwise.h" + +namespace paddle { +namespace lite { +namespace arm { +namespace math { +void conv_depthwise_3x3s2p0_bias(float* dout, + const float* din, + const float* weights, + const float* bias, + bool flag_bias, + bool flag_relu, + const int num, + const int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + ARMContext* ctx); + +void conv_depthwise_3x3s2p0_bias_s(float* dout, + const float* din, + const float* weights, + const float* bias, + bool flag_bias, + bool flag_relu, + const int num, + const int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + ARMContext* ctx); + +void conv_depthwise_3x3s2p1_bias(float* dout, + const float* din, + const float* weights, + const float* bias, + bool flag_bias, + bool flag_relu, + const int num, + const int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + ARMContext* ctx); + +void conv_depthwise_3x3s2p1_bias_s(float* dout, + const float* din, + const float* weights, + const float* bias, + bool flag_bias, + bool flag_relu, + const int num, + const int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + ARMContext* ctx); + +void conv_depthwise_3x3s2_fp32(const float* din, + float* dout, + int num, + int ch_out, + int h_out, + int w_out, + int ch_in, + int h_in, + int w_in, + const float* weights, + const float* bias, + int pad, + bool flag_bias, + bool flag_relu, + ARMContext* ctx) { + if (pad == 0) { + if (w_in > 7) { + conv_depthwise_3x3s2p0_bias(dout, + din, + weights, + bias, + flag_bias, + flag_relu, + num, + ch_in, + h_in, + w_in, + h_out, + w_out, + ctx); + } else { + conv_depthwise_3x3s2p0_bias_s(dout, + din, + weights, + bias, + flag_bias, + flag_relu, + num, + ch_in, + h_in, + w_in, + h_out, + w_out, + ctx); + } + } + if (pad == 1) { + if (w_in > 7) { + conv_depthwise_3x3s2p1_bias(dout, + din, + weights, + bias, + flag_bias, + flag_relu, + num, + ch_in, + h_in, + w_in, + h_out, + w_out, + ctx); + } else { + conv_depthwise_3x3s2p1_bias_s(dout, + din, + weights, + bias, + flag_bias, + flag_relu, + num, + ch_in, + h_in, + w_in, + h_out, + w_out, + ctx); + } + } +} +#ifdef __aarch64__ +#define 
INIT_S2 \ + "prfm pldl1keep, [%[inptr0]] \n" \ + "prfm pldl1keep, [%[inptr1]] \n" \ + "prfm pldl1keep, [%[inptr2]] \n" \ + "prfm pldl1keep, [%[inptr3]] \n" \ + "prfm pldl1keep, [%[inptr4]] \n" \ + "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" \ + "ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" \ + "ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" \ + "ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" \ + "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" \ + \ + "and v16.16b, %[vbias].16b, %[vbias].16b \n" \ + "and v17.16b, %[vbias].16b, %[vbias].16b \n" + +#define LEFT_COMPUTE_S2 \ + "ext v10.16b, %[vzero].16b, v1.16b, #12 \n" /* r0 */ \ + "fmul v11.4s, v0.4s, %[w0].s[1] \n" /* {0,2,4,6} * w01 */ \ + "fmul v12.4s, v1.4s, %[w0].s[2] \n" /* {1,3,5,7} * w02 */ \ + "fmla v16.4s, v10.4s, %[w0].s[0] \n" /* {0,1,3,5} * w00*/ \ + \ + "ext v10.16b, %[vzero].16b, v3.16b, #12 \n" /* v10 = {0,1,3,5} */ \ + \ + "sub %[inptr0], %[inptr0], #4 \n" \ + "sub %[inptr1], %[inptr1], #4 \n" /* r1 */ \ + "fmla v11.4s, v2.4s, %[w1].s[1] \n" \ + "fmla v12.4s, v3.4s, %[w1].s[2] \n" \ + "fmla v16.4s, v10.4s, %[w1].s[0] \n" \ + \ + "ext v10.16b, %[vzero].16b, v5.16b, #12 \n" \ + \ + "sub %[inptr2], %[inptr2], #4 \n" \ + "sub %[inptr3], %[inptr3], #4 \n" /* r2 */ \ + "fmul v13.4s, v4.4s, %[w0].s[1] \n" \ + "fmla v11.4s, v4.4s, %[w2].s[1] \n" \ + \ + "fmul v14.4s, v5.4s, %[w0].s[2] \n" \ + "fmla v12.4s, v5.4s, %[w2].s[2] \n" \ + \ + "fmla v17.4s, v10.4s, %[w0].s[0] \n" \ + "fmla v16.4s, v10.4s, %[w2].s[0] \n" \ + \ + "ext v10.16b, %[vzero].16b, v7.16b, #12 \n" \ + \ + "sub %[inptr4], %[inptr4], #4 \n" /* r3 */ \ + "fmla v13.4s, v6.4s, %[w1].s[1] \n" \ + "fmla v14.4s, v7.4s, %[w1].s[2] \n" \ + "fmla v17.4s, v10.4s, %[w1].s[0] \n" \ + \ + "ext v10.16b, %[vzero].16b, v9.16b, #12 \n" \ + "fadd v16.4s, v16.4s, v11.4s \n" \ + "fadd v16.4s, v16.4s, v12.4s \n" + +#define LEFT_RESULT_S2 \ + /* r4 */ \ + "fmla v13.4s, v8.4s, %[w2].s[1] \n" \ + "fmla v14.4s, v9.4s, %[w2].s[2] \n" \ + "fmla v17.4s, v10.4s, %[w2].s[0] \n" \ + \ + "st1 {v16.4s}, [%[outptr0]], #16 \n" \ + \ + "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" \ + "ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" \ + "ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" \ + \ + "fadd v17.4s, v17.4s, v13.4s \n" \ + \ + "ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" \ + "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" \ + "ld1 {v15.4s}, [%[inptr0]] \n" \ + "and v16.16b, %[vbias].16b, %[vbias].16b \n" \ + \ + "fadd v17.4s, v17.4s, v14.4s \n" \ + \ + "ld1 {v18.4s}, [%[inptr1]] \n" \ + "ld1 {v19.4s}, [%[inptr2]] \n" \ + \ + "ext v10.16b, v0.16b, v15.16b, #4 \n" \ + \ + "ld1 {v20.4s}, [%[inptr3]] \n" \ + "ld1 {v21.4s}, [%[inptr4]] \n" \ + \ + "st1 {v17.4s}, [%[outptr1]], #16 \n" \ + \ + "cmp %w[cnt], #1 \n" \ + \ + "and v17.16b, %[vbias].16b, %[vbias].16b \n" \ + \ + "blt 1f \n" + +#define MID_COMPUTE_S2 \ + "2: \n" /* r0 */ \ + "fmul v11.4s, v0.4s, %[w0].s[0] \n" \ + "fmul v12.4s, v1.4s, %[w0].s[1] \n" \ + "fmla v16.4s, v10.4s, %[w0].s[2] \n" \ + \ + "ext v10.16b, v2.16b, v18.16b, #4 \n" \ + "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" /* r1 */ \ + "fmla v11.4s, v2.4s, %[w1].s[0] \n" \ + "fmla v12.4s, v3.4s, %[w1].s[1] \n" \ + "fmla v16.4s, v10.4s, %[w1].s[2] \n" \ + \ + "ext v10.16b, v4.16b, v19.16b, #4 \n" \ + \ + "ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" /* r2 */ \ + "fmul v13.4s, v4.4s, %[w0].s[0] \n" \ + "fmla v11.4s, v4.4s, %[w2].s[0] \n" \ + \ + "fmul v14.4s, v5.4s, %[w0].s[1] \n" \ + "fmla v12.4s, v5.4s, %[w2].s[1] \n" \ + \ + "fmla v17.4s, v10.4s, %[w0].s[2] \n" \ + "fmla v16.4s, v10.4s, %[w2].s[2] \n" \ + \ + "ext v10.16b, v6.16b, v20.16b, 
#4 \n" \ + \ + "ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" /* r3 */ \ + "fmla v13.4s, v6.4s, %[w1].s[0] \n" \ + "fmla v14.4s, v7.4s, %[w1].s[1] \n" \ + "fmla v17.4s, v10.4s, %[w1].s[2] \n" \ + \ + "ext v10.16b, v8.16b, v21.16b, #4 \n" \ + \ + "ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" \ + \ + "fadd v16.4s, v16.4s, v11.4s \n" \ + "fadd v16.4s, v16.4s, v12.4s \n" + +#define MID_RESULT_S2 \ + /* r4 */ \ + "fmla v13.4s, v8.4s, %[w2].s[0] \n" \ + "fmla v14.4s, v9.4s, %[w2].s[1] \n" \ + "fmla v17.4s, v10.4s, %[w2].s[2] \n" \ + \ + "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" \ + "ld1 {v15.4s}, [%[inptr0]] \n" \ + "ld1 {v18.4s}, [%[inptr1]] \n" \ + "st1 {v16.4s}, [%[outptr0]], #16 \n" \ + \ + "fadd v17.4s, v17.4s, v13.4s \n" \ + \ + "ld1 {v19.4s}, [%[inptr2]] \n" \ + "ld1 {v20.4s}, [%[inptr3]] \n" \ + "ld1 {v21.4s}, [%[inptr4]] \n" \ + \ + "fadd v17.4s, v17.4s, v14.4s \n" \ + \ + "ext v10.16b, v0.16b, v15.16b, #4 \n" \ + "and v16.16b, %[vbias].16b, %[vbias].16b \n" \ + "subs %w[cnt], %w[cnt], #1 \n" \ + \ + "st1 {v17.4s}, [%[outptr1]], #16 \n" \ + \ + "and v17.16b, %[vbias].16b, %[vbias].16b \n" \ + \ + "bne 2b \n" + +#define RIGHT_COMPUTE_S2 \ + "1: \n" \ + "cmp %w[remain], #1 \n" \ + "blt 4f \n" \ + "3: \n" \ + "bif v0.16b, %[vzero].16b, %[mask1].16b \n" \ + "bif v1.16b, %[vzero].16b, %[mask2].16b \n" \ + \ + "bif v2.16b, %[vzero].16b, %[mask1].16b \n" \ + "bif v3.16b, %[vzero].16b, %[mask2].16b \n" \ + \ + "bif v4.16b, %[vzero].16b, %[mask1].16b \n" \ + "bif v5.16b, %[vzero].16b, %[mask2].16b \n" \ + \ + "ext v10.16b, v0.16b, %[vzero].16b, #4 \n" \ + \ + "bif v6.16b, %[vzero].16b, %[mask1].16b \n" \ + "bif v7.16b, %[vzero].16b, %[mask2].16b \n" /* r0 */ \ + "fmul v11.4s, v0.4s, %[w0].s[0] \n" \ + "fmul v12.4s, v1.4s, %[w0].s[1] \n" \ + "fmla v16.4s, v10.4s, %[w0].s[2] \n" \ + \ + "ext v10.16b, v2.16b, %[vzero].16b, #4 \n" \ + "bif v8.16b, %[vzero].16b, %[mask1].16b \n" \ + "bif v9.16b, %[vzero].16b, %[mask2].16b \n" /* r1 */ \ + "fmla v11.4s, v2.4s, %[w1].s[0] \n" \ + "fmla v12.4s, v3.4s, %[w1].s[1] \n" \ + "fmla v16.4s, v10.4s, %[w1].s[2] \n" \ + \ + "ext v10.16b, v4.16b, %[vzero].16b, #4 \n" /* r2 */ \ + "fmul v13.4s, v4.4s, %[w0].s[0] \n" \ + "fmla v11.4s, v4.4s, %[w2].s[0] \n" \ + \ + "fmul v14.4s, v5.4s, %[w0].s[1] \n" \ + "fmla v12.4s, v5.4s, %[w2].s[1] \n" \ + \ + "fmla v17.4s, v10.4s, %[w0].s[2] \n" \ + "fmla v16.4s, v10.4s, %[w2].s[2] \n" \ + \ + "ext v10.16b, v6.16b, %[vzero].16b, #4 \n" /* r3 */ \ + "fmla v13.4s, v6.4s, %[w1].s[0] \n" \ + "fmla v14.4s, v7.4s, %[w1].s[1] \n" \ + "fmla v17.4s, v10.4s, %[w1].s[2] \n" \ + \ + "ext v10.16b, v8.16b, %[vzero].16b, #4 \n" \ + "ld1 {v0.4s}, [%[outptr0]] \n" \ + \ + "fadd v16.4s, v16.4s, v11.4s \n" \ + "fadd v16.4s, v16.4s, v12.4s \n" \ + "ld1 {v1.4s}, [%[outptr1]] \n" + +#define RIGHT_RESULT_S2 \ + /* r4 */ \ + "fmla v13.4s, v8.4s, %[w2].s[0] \n" \ + "fmla v14.4s, v9.4s, %[w2].s[1] \n" \ + "fmla v17.4s, v10.4s, %[w2].s[2] \n" \ + \ + "bif v16.16b, v0.16b, %[wmask].16b \n" \ + \ + "fadd v17.4s, v17.4s, v13.4s \n" \ + \ + "st1 {v16.4s}, [%[outptr0]], #16 \n" \ + \ + "fadd v17.4s, v17.4s, v14.4s \n" \ + \ + "bif v17.16b, v1.16b, %[wmask].16b \n" \ + \ + "st1 {v17.4s}, [%[outptr1]], #16 \n" \ + "4: \n" + +#define LEFT_RESULT_S2_RELU \ + /* r4 */ \ + "fmla v13.4s, v8.4s, %[w2].s[1] \n" \ + "fmla v14.4s, v9.4s, %[w2].s[2] \n" \ + "fmla v17.4s, v10.4s, %[w2].s[0] \n" \ + \ + "fmax v16.4s, v16.4s, %[vzero].4s \n" \ + \ + "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" \ + "ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" \ + "ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" \ 
+ \ + "fadd v17.4s, v17.4s, v13.4s \n" \ + \ + "st1 {v16.4s}, [%[outptr0]], #16 \n" \ + \ + "ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" \ + "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" \ + "ld1 {v15.4s}, [%[inptr0]] \n" \ + \ + "fadd v17.4s, v17.4s, v14.4s \n" \ + \ + "and v16.16b, %[vbias].16b, %[vbias].16b \n" \ + \ + "ld1 {v18.4s}, [%[inptr1]] \n" \ + "ld1 {v19.4s}, [%[inptr2]] \n" \ + \ + "ext v10.16b, v0.16b, v15.16b, #4 \n" \ + \ + "fmax v17.4s, v17.4s, %[vzero].4s \n" \ + \ + "ld1 {v20.4s}, [%[inptr3]] \n" \ + "ld1 {v21.4s}, [%[inptr4]] \n" \ + \ + "st1 {v17.4s}, [%[outptr1]], #16 \n" \ + \ + "cmp %w[cnt], #1 \n" \ + \ + "and v17.16b, %[vbias].16b, %[vbias].16b \n" \ + \ + "blt 1f \n" + +#define MID_RESULT_S2_RELU \ + /* r4 */ \ + "fmla v13.4s, v8.4s, %[w2].s[0] \n" \ + "fmla v14.4s, v9.4s, %[w2].s[1] \n" \ + "fmla v17.4s, v10.4s, %[w2].s[2] \n" \ + \ + "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" \ + "ld1 {v15.4s}, [%[inptr0]] \n" \ + "ld1 {v18.4s}, [%[inptr1]] \n" \ + "fmax v16.4s, v16.4s, %[vzero].4s \n" /* relu */ \ + \ + "fadd v17.4s, v17.4s, v13.4s \n" \ + \ + "ld1 {v19.4s}, [%[inptr2]] \n" \ + "ld1 {v20.4s}, [%[inptr3]] \n" \ + "ld1 {v21.4s}, [%[inptr4]] \n" \ + \ + "st1 {v16.4s}, [%[outptr0]], #16 \n" \ + \ + "fadd v17.4s, v17.4s, v14.4s \n" \ + \ + "ext v10.16b, v0.16b, v15.16b, #4 \n" \ + "and v16.16b, %[vbias].16b, %[vbias].16b \n" \ + "subs %w[cnt], %w[cnt], #1 \n" \ + \ + "fmax v17.4s, v17.4s, %[vzero].4s \n" /* relu */ \ + \ + "st1 {v17.4s}, [%[outptr1]], #16 \n" \ + \ + "and v17.16b, %[vbias].16b, %[vbias].16b \n" \ + \ + "bne 2b \n" + +#define RIGHT_RESULT_S2_RELU \ + /* r4 */ \ + "fmla v13.4s, v8.4s, %[w2].s[0] \n" \ + "fmla v14.4s, v9.4s, %[w2].s[1] \n" \ + "fmla v17.4s, v10.4s, %[w2].s[2] \n" \ + \ + "fmax v16.4s, v16.4s, %[vzero].4s \n" /* relu */ \ + \ + "fadd v17.4s, v17.4s, v13.4s \n" \ + \ + "bif v16.16b, v0.16b, %[wmask].16b \n" \ + \ + "fadd v17.4s, v17.4s, v14.4s \n" \ + \ + "st1 {v16.4s}, [%[outptr0]], #16 \n" \ + \ + "fmax v17.4s, v17.4s, %[vzero].4s \n" /* relu */ \ + \ + "bif v17.16b, v1.16b, %[wmask].16b \n" \ + \ + "st1 {v17.4s}, [%[outptr1]], #16 \n" \ + "4: \n" + +#define COMPUTE_S_S2 \ + "movi v9.4s, #0 \n" \ + "ld1 {v6.4s, v7.4s}, [%[mask_ptr]], #32 \n" \ + \ + "ld2 {v10.4s, v11.4s}, [%[din0_ptr]], #32 \n" \ + "ld2 {v12.4s, v13.4s}, [%[din1_ptr]], #32 \n" \ + "ld2 {v14.4s, v15.4s}, [%[din2_ptr]], #32 \n" \ + \ + "bif v10.16b, v9.16b, v6.16b \n" \ + "bif v11.16b, v9.16b, v7.16b \n" \ + "bif v12.16b, v9.16b, v6.16b \n" \ + "bif v13.16b, v9.16b, v7.16b \n" \ + "bif v14.16b, v9.16b, v6.16b \n" \ + "bif v15.16b, v9.16b, v7.16b \n" \ + \ + "ext v6.16b, v9.16b, v11.16b, #12 \n" \ + "ext v7.16b, v9.16b, v13.16b, #12 \n" \ + "ext v8.16b, v9.16b, v15.16b, #12 \n" \ + \ + "fmul v4.4s, v10.4s, %[wr0].s[1] \n" \ + "fmul v5.4s, v11.4s, %[wr0].s[2] \n" \ + "fmul v6.4s, v6.4s, %[wr0].s[0] \n" \ + \ + "fmla v4.4s, v12.4s, %[wr1].s[1] \n" \ + "fmla v5.4s, v13.4s, %[wr1].s[2] \n" \ + "fmla v6.4s, v7.4s, %[wr1].s[0] \n" \ + \ + "fmla v4.4s, v14.4s, %[wr2].s[1] \n" \ + "fmla v5.4s, v15.4s, %[wr2].s[2] \n" \ + "fmla v6.4s, v8.4s, %[wr2].s[0] \n" \ + \ + "fadd v4.4s, v4.4s, v5.4s \n" \ + "fadd v4.4s, v4.4s, v6.4s \n" + +#define RESULT_S_S2 \ + "fadd v4.4s, v4.4s, %[bias].4s \n" \ + \ + "st1 {v4.4s}, [%[out]] \n" + +#define RESULT_S_S2_RELU \ + "fadd v4.4s, v4.4s, %[bias].4s \n" \ + "fmax v4.4s, v4.4s, v9.4s \n" \ + \ + "st1 {v4.4s}, [%[out]] \n" + +#define COMPUTE_S_S2_P0 \ + "movi v9.4s, #0 \n" \ + "ld1 {v6.4s, v7.4s}, [%[mask_ptr]], #32 \n" \ + \ + "ld2 {v10.4s, v11.4s}, 
[%[din0_ptr]], #32 \n" \ + "ld2 {v12.4s, v13.4s}, [%[din1_ptr]], #32 \n" \ + "ld2 {v14.4s, v15.4s}, [%[din2_ptr]], #32 \n" \ + "and v4.16b, %[bias].16b, %[bias].16b \n" \ + \ + "bif v10.16b, v9.16b, v6.16b \n" \ + "bif v11.16b, v9.16b, v7.16b \n" \ + "bif v12.16b, v9.16b, v6.16b \n" \ + "bif v13.16b, v9.16b, v7.16b \n" \ + "bif v14.16b, v9.16b, v6.16b \n" \ + "bif v15.16b, v9.16b, v7.16b \n" \ + \ + "ext v6.16b, v10.16b, v9.16b, #4 \n" \ + "ext v7.16b, v12.16b, v9.16b, #4 \n" \ + "ext v8.16b, v14.16b, v9.16b, #4 \n" \ + \ + "fmla v4.4s, v10.4s, %[wr0].s[0] \n" \ + "fmul v5.4s, v11.4s, %[wr0].s[1] \n" \ + "fmul v16.4s, v6.4s, %[wr0].s[2] \n" \ + \ + "fmla v4.4s, v12.4s, %[wr1].s[0] \n" \ + "fmla v5.4s, v13.4s, %[wr1].s[1] \n" \ + "fmla v16.4s, v7.4s, %[wr1].s[2] \n" \ + \ + "fmla v4.4s, v14.4s, %[wr2].s[0] \n" \ + "fmla v5.4s, v15.4s, %[wr2].s[1] \n" \ + "fmla v16.4s, v8.4s, %[wr2].s[2] \n" \ + \ + "fadd v4.4s, v4.4s, v5.4s \n" \ + "fadd v4.4s, v4.4s, v16.4s \n" + +#define RESULT_S_S2_P0 "st1 {v4.4s}, [%[out]] \n" + +#define RESULT_S_S2_P0_RELU \ + "fmax v4.4s, v4.4s, v9.4s \n" \ + "st1 {v4.4s}, [%[out]] \n" + +#else +#define INIT_S2 \ + "vmov.u32 q9, #0 \n" \ + "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r1\n" \ + "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" \ + "vld2.32 {d28-d31}, [%[din2_ptr]]! @ load din r1\n" \ + "pld [%[din0_ptr]] @ preload data\n" \ + "pld [%[din1_ptr]] @ preload data\n" \ + "pld [%[din2_ptr]] @ preload data\n" \ + \ + "vdup.32 q3, %[bias] @ and \n" + +#define LEFT_COMPUTE_S2 \ + "vext.32 q6, q9, q11, #3 @ shift right 1 data\n" \ + "vext.32 q7, q9, q13, #3 @ shift right 1 data\n" \ + "vext.32 q8, q9, q15, #3 @ shift right 1 data\n" \ + "vmul.f32 q4, q10, %e[wr0][1] @ mul weight 1, out0\n" \ + "vmul.f32 q5, q11, %f[wr0][0] @ mul weight 1, out0\n" \ + "vmla.f32 q3, q6, %e[wr0][0] @ mul weight 1, out0\n" \ + \ + "sub %[din0_ptr], #4 @ inpitr0 - 1\n" \ + "sub %[din1_ptr], #4 @ inpitr1 - 1\n" \ + "sub %[din2_ptr], #4 @ inpitr2 - 1\n" \ + \ + "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r0\n" \ + \ + "vmla.f32 q4, q12, %e[wr1][1] @ mul weight 1, out0\n" \ + "vmla.f32 q5, q13, %f[wr1][0] @ mul weight 1, out0\n" \ + "vmla.f32 q3, q7, %e[wr1][0] @ mul weight 1, out0\n" \ + \ + "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" \ + \ + "vmla.f32 q4, q14, %e[wr2][1] @ mul weight 1, out1\n" \ + "vmla.f32 q5, q15, %f[wr2][0] @ mul weight 1, out1\n" \ + "vmla.f32 q3, q8, %e[wr2][0] @ mul weight 1, out1\n" \ + \ + "vld2.32 {d28-d31}, [%[din2_ptr]]! @ load din r1\n" \ + \ + "vadd.f32 q3, q3, q4 @ add \n" \ + "vadd.f32 q3, q3, q5 @ add \n" + +#define LEFT_RESULT_S2 \ + "vst1.32 {d6-d7}, [%[outptr]]! \n" \ + "cmp %[cnt], #1 \n" \ + "blt 1f \n" + +#define MID_COMPUTE_S2 \ + "2: \n" \ + "vld1.32 {d16}, [%[din0_ptr]] @ load din r0\n" \ + "vdup.32 q3, %[bias] @ and \n" \ + "vext.32 q6, q10, q8, #1 @ shift left 1 \n" \ + "vld1.32 {d16}, [%[din1_ptr]] @ load din r1\n" \ + \ + "vmul.f32 q4, q10, %e[wr0][0] @ mul weight 0, out0\n" \ + "vmul.f32 q5, q11, %e[wr0][1] @ mul weight 0, out0\n" \ + "vmla.f32 q3, q6, %f[wr0][0] @ mul weight 0, out0\n" \ + \ + "vext.32 q7, q12, q8, #1 @ shift left 1 \n" \ + "vld1.32 {d16}, [%[din2_ptr]] @ load din r1\n" \ + \ + "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r0\n" \ + \ + "vmla.f32 q4, q12, %e[wr1][0] @ mul weight 1, out0\n" \ + "vmla.f32 q5, q13, %e[wr1][1] @ mul weight 1, out0\n" \ + "vmla.f32 q3, q7, %f[wr1][0] @ mul weight 1, out0\n" \ + \ + "vext.32 q6, q14, q8, #1 @ shift left 1 \n" \ + \ + "vld2.32 {d24-d27}, [%[din1_ptr]]! 
@ load din r1\n" \ + \ + "vmla.f32 q4, q14, %e[wr2][0] @ mul weight 2, out0\n" \ + "vmla.f32 q5, q15, %e[wr2][1] @ mul weight 2, out0\n" \ + "vmla.f32 q3, q6, %f[wr2][0] @ mul weight 2, out0\n" \ + \ + "vld2.32 {d28-d31}, [%[din2_ptr]]! @ load din r2\n" \ + \ + "vadd.f32 q3, q3, q4 @ add \n" \ + "vadd.f32 q3, q3, q5 @ add \n" + +#define MID_RESULT_S2 \ + "subs %[cnt], #1 \n" \ + \ + "vst1.32 {d6-d7}, [%[outptr]]! \n" \ + "bne 2b \n" + +#define RIGHT_COMPUTE_S2 \ + "1: \n" \ + "cmp %[remain], #1 \n" \ + "blt 3f \n" \ + \ + "vld1.f32 {d12-d15}, [%[mask_ptr]]! @ load mask\n" \ + "vdup.32 q3, %[bias] @ and \n" \ + \ + "vbif q10, q9, q6 @ bit select, deal with " \ + "right pad\n" \ + "vbif q11, q9, q7 @ bit select, deal with " \ + "right pad\n" \ + "vbif q12, q9, q6 @ bit select, deal with " \ + "right pad\n" \ + "vbif q13, q9, q7 @ bit select, deal with " \ + "right pad\n" \ + "vbif q14, q9, q6 @ bit select, deal with " \ + "right pad\n" \ + "vbif q15, q9, q7 @ bit select, deal with " \ + "right pad\n" \ + \ + "vext.32 q6, q10, q9, #1 @ shift left 1 \n" \ + "vext.32 q7, q12, q9, #1 @ shift left 1 \n" \ + \ + "vmul.f32 q4, q10, %e[wr0][0] @ mul weight 0, out0\n" \ + "vmul.f32 q5, q11, %e[wr0][1] @ mul weight 0, out0\n" \ + "vmla.f32 q3, q6, %f[wr0][0] @ mul weight 0, out0\n" \ + \ + "vext.32 q6, q14, q9, #1 @ shift left 1 \n" \ + "vld1.f32 {d20-d21}, [%[outptr]] @ load output\n" \ + \ + "vmla.f32 q4, q12, %e[wr1][0] @ mul weight 1, out0\n" \ + "vmla.f32 q5, q13, %e[wr1][1] @ mul weight 1, out0\n" \ + "vmla.f32 q3, q7, %f[wr1][0] @ mul weight 1, out0\n" \ + \ + "vld1.f32 {d22-d23}, [%[mask_ptr]] @ load mask\n" \ + \ + "vmla.f32 q4, q14, %e[wr2][0] @ mul weight 2, out0\n" \ + "vmla.f32 q5, q15, %e[wr2][1] @ mul weight 2, out0\n" \ + "vmla.f32 q3, q6, %f[wr2][0] @ mul weight 2, out0\n" \ + \ + "vadd.f32 q3, q3, q4 @ add \n" \ + "vadd.f32 q3, q3, q5 @ add \n" + +#define RIGHT_RESULT_S2 \ + "vbif.f32 q3, q10, q11 @ write mask\n" \ + \ + "vst1.32 {d6-d7}, [%[outptr]]! \n" \ + "3: \n" + +#define LEFT_RESULT_S2_RELU \ + "vmax.f32 q3, q3, q9 @ relu \n" \ + "vst1.32 {d6-d7}, [%[outptr]]! \n" \ + "cmp %[cnt], #1 \n" \ + "blt 1f \n" + +#define MID_RESULT_S2_RELU \ + "vmax.f32 q3, q3, q9 @ relu \n" \ + "subs %[cnt], #1 \n" \ + \ + "vst1.32 {d6-d7}, [%[outptr]]! \n" \ + "bne 2b \n" + +#define RIGHT_RESULT_S2_RELU \ + "vmax.f32 q3, q3, q9 @ relu \n" \ + "vbif.f32 q3, q10, q11 @ write mask\n" \ + \ + "vst1.32 {d6-d7}, [%[outptr]]! \n" \ + "3: \n" + +#define COMPUTE_S_S2 \ + "vmov.u32 q9, #0 \n" \ + "vld1.f32 {d12-d15}, [%[mask_ptr]]! @ load mask\n" \ + "vdup.32 q3, %[bias] @ and \n" \ + \ + "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r0\n" \ + "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" \ + "vld2.32 {d28-d31}, [%[din2_ptr]]! 
@ load din r2\n" \ + \ + "vbif q10, q9, q6 @ bit select, deal with " \ + "right pad\n" \ + "vbif q11, q9, q7 @ bit select, deal with " \ + "right pad\n" \ + "vbif q12, q9, q6 @ bit select, deal with " \ + "right pad\n" \ + "vbif q13, q9, q7 @ bit select, deal with " \ + "right pad\n" \ + "vbif q14, q9, q6 @ bit select, deal with " \ + "right pad\n" \ + "vbif q15, q9, q7 @ bit select, deal with " \ + "right pad\n" \ + \ + "vext.32 q6, q9, q11, #3 @ shift left 1 \n" \ + "vext.32 q7, q9, q13, #3 @ shift left 1 \n" \ + "vext.32 q8, q9, q15, #3 @ shift left 1 \n" \ + \ + "vmul.f32 q4, q10, %e[wr0][1] @ mul weight 0, out0\n" \ + "vmul.f32 q5, q11, %f[wr0][0] @ mul weight 0, out0\n" \ + "vmla.f32 q3, q6, %e[wr0][0] @ mul weight 0, out0\n" \ + \ + "vmla.f32 q4, q12, %e[wr1][1] @ mul weight 1, out0\n" \ + "vmla.f32 q5, q13, %f[wr1][0] @ mul weight 1, out0\n" \ + "vmla.f32 q3, q7, %e[wr1][0] @ mul weight 1, out0\n" \ + \ + "vmla.f32 q4, q14, %e[wr2][1] @ mul weight 2, out0\n" \ + "vmla.f32 q5, q15, %f[wr2][0] @ mul weight 2, out0\n" \ + "vmla.f32 q3, q8, %e[wr2][0] @ mul weight 2, out0\n" \ + \ + "vadd.f32 q3, q3, q4 @ add \n" \ + "vadd.f32 q3, q3, q5 @ add \n" + +#define RESULT_S_S2 "vst1.32 {d6-d7}, [%[out]] \n" + +#define RESULT_S_S2_RELU \ + "vmax.f32 q3, q3, q9 @ relu\n" \ + \ + "vst1.32 {d6-d7}, [%[out]] \n" + +#define COMPUTE_S_S2_P0 \ + "vmov.u32 q9, #0 \n" \ + "vld1.f32 {d12-d15}, [%[mask_ptr]] @ load mask\n" \ + "vdup.32 q3, %[bias] @ and \n" \ + \ + "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r0\n" \ + "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" \ + "vld2.32 {d28-d31}, [%[din2_ptr]]! @ load din r2\n" \ + \ + "vbif q10, q9, q6 @ bit select, deal with " \ + "right pad\n" \ + "vbif q11, q9, q7 @ bit select, deal with " \ + "right pad\n" \ + "vbif q12, q9, q6 @ bit select, deal with " \ + "right pad\n" \ + "vbif q13, q9, q7 @ bit select, deal with " \ + "right pad\n" \ + "vbif q14, q9, q6 @ bit select, deal with " \ + "right pad\n" \ + "vbif q15, q9, q7 @ bit select, deal with " \ + "right pad\n" \ + \ + "vext.32 q6, q10, q9, #1 @ shift left 1 \n" \ + "vext.32 q7, q12, q9, #1 @ shift left 1 \n" \ + "vext.32 q8, q14, q9, #1 @ shift left 1 \n" \ + \ + "vmul.f32 q4, q10, %e[wr0][0] @ mul weight 0, out0\n" \ + "vmul.f32 q5, q11, %e[wr0][1] @ mul weight 0, out0\n" \ + "vmla.f32 q3, q6, %f[wr0][0] @ mul weight 0, out0\n" \ + \ + "vmla.f32 q4, q12, %e[wr1][0] @ mul weight 1, out0\n" \ + "vmla.f32 q5, q13, %e[wr1][1] @ mul weight 1, out0\n" \ + "vmla.f32 q3, q7, %f[wr1][0] @ mul weight 1, out0\n" \ + \ + "vmla.f32 q4, q14, %e[wr2][0] @ mul weight 2, out0\n" \ + "vmla.f32 q5, q15, %e[wr2][1] @ mul weight 2, out0\n" \ + "vmla.f32 q3, q8, %f[wr2][0] @ mul weight 2, out0\n" \ + \ + "vadd.f32 q3, q3, q4 @ add \n" \ + "vadd.f32 q3, q3, q5 @ add \n" + +#define RESULT_S_S2_P0 "vst1.32 {d6-d7}, [%[out]] \n" + +#define RESULT_S_S2_P0_RELU \ + "vmax.f32 q3, q3, q9 @ relu \n" \ + "vst1.32 {d6-d7}, [%[out]] \n" + +#endif + +/** + * \brief depthwise convolution kernel 3x3, stride 2 + * w_in > 7 + */ +void conv_depthwise_3x3s2p1_bias(float* dout, + const float* din, + const float* weights, + const float* bias, + bool flag_bias, + bool flag_relu, + const int num, + const int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + ARMContext* ctx) { + int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7}; + int out_pad_idx[4] = {0, 1, 2, 3}; + int size_pad_bottom = h_out * 2 - h_in; + + int cnt_col = (w_out >> 2) - 2; + int size_right_remain = w_in - (7 + cnt_col * 8); + if 
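// Note (inferred from the pointer arithmetic above): each middle-loop
// iteration consumes 8 input columns plus one look-ahead element for the
// shifted tap and emits 4 outputs, while the left tile consumes only 7
// columns because of the left padding; a further full middle iteration
// therefore fits only when at least 9 columns are still available: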
(size_right_remain >= 9) { + cnt_col++; + size_right_remain -= 8; + } + int cnt_remain = (size_right_remain == 8) ? 4 : (w_out % 4); // + + int size_right_pad = w_out * 2 - w_in; + + uint32x4_t vmask_rp1 = vcgtq_s32(vdupq_n_s32(size_right_remain), + vld1q_s32(right_pad_idx)); // 0 2 4 6 + uint32x4_t vmask_rp2 = vcgtq_s32(vdupq_n_s32(size_right_remain), + vld1q_s32(right_pad_idx + 4)); // 1 3 5 7 + uint32x4_t wmask = + vcgtq_s32(vdupq_n_s32(cnt_remain), vld1q_s32(out_pad_idx)); // 0 1 2 3 + int size_in_channel = w_in * h_in; + int size_out_channel = w_out * h_out; + + float* zero_ptr = ctx->workspace_data(); + memset(zero_ptr, 0, w_in * sizeof(float)); + float* write_ptr = zero_ptr + w_in; + + unsigned int dmask[12]; + + vst1q_u32(dmask, vmask_rp1); + vst1q_u32(dmask + 4, vmask_rp2); + vst1q_u32(dmask + 8, wmask); + + for (int n = 0; n < num; ++n) { + const float* din_batch = din + n * ch_in * size_in_channel; + float* dout_batch = dout + n * ch_in * size_out_channel; +#pragma omp parallel for + for (int i = 0; i < ch_in; ++i) { + const float* din_channel = din_batch + i * size_in_channel; + float* dout_channel = dout_batch + i * size_out_channel; + + const float* weight_ptr = weights + i * 9; + float32x4_t wr0 = vld1q_f32(weight_ptr); + float32x4_t wr1 = vld1q_f32(weight_ptr + 3); + float32x4_t wr2 = vld1q_f32(weight_ptr + 6); + + float32x4_t vzero = vdupq_n_f32(0.f); +#ifdef __aarch64__ + float32x4_t wbias; + if (flag_bias) { + wbias = vdupq_n_f32(bias[i]); + } else { + wbias = vdupq_n_f32(0.f); + } +#else + float bias_c = 0.f; + if (flag_bias) { + bias_c = bias[i]; + } +#endif // __aarch64__ + + const float* dr0 = din_channel; + const float* dr1 = dr0 + w_in; + const float* dr2 = dr1 + w_in; + const float* dr3 = dr2 + w_in; + const float* dr4 = dr3 + w_in; + + const float* din0_ptr = dr0; + const float* din1_ptr = dr1; + const float* din2_ptr = dr2; + const float* din3_ptr = dr3; + const float* din4_ptr = dr4; + + float* doutr0 = dout_channel; + float* doutr0_ptr = nullptr; + float* doutr1_ptr = nullptr; + +#ifdef __aarch64__ + for (int i = 0; i < h_in; i += 4) { + din0_ptr = dr0; + din1_ptr = dr1; + din2_ptr = dr2; + din3_ptr = dr3; + din4_ptr = dr4; + + doutr0_ptr = doutr0; + doutr1_ptr = doutr0 + w_out; + + if (i == 0) { + din0_ptr = zero_ptr; + din1_ptr = dr0; + din2_ptr = dr1; + din3_ptr = dr2; + din4_ptr = dr3; + dr0 = dr3; + dr1 = dr4; + } else { + dr0 = dr4; + dr1 = dr0 + w_in; + } + dr2 = dr1 + w_in; + dr3 = dr2 + w_in; + dr4 = dr3 + w_in; + + //! process bottom pad + if (i + 4 > h_in) { + switch (i + 4 - h_in) { + case 4: + din1_ptr = zero_ptr; + case 3: + din2_ptr = zero_ptr; + case 2: + din3_ptr = zero_ptr; + case 1: + din4_ptr = zero_ptr; + default: + break; + } + } + //! 
process output pad + if (i / 2 + 2 > h_out) { + doutr1_ptr = write_ptr; + } + int cnt = cnt_col; + if (flag_relu) { + asm volatile( + INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2_RELU MID_COMPUTE_S2 + MID_RESULT_S2_RELU RIGHT_COMPUTE_S2 RIGHT_RESULT_S2_RELU + : [inptr0] "+r"(din0_ptr), + [inptr1] "+r"(din1_ptr), + [inptr2] "+r"(din2_ptr), + [inptr3] "+r"(din3_ptr), + [inptr4] "+r"(din4_ptr), + [outptr0] "+r"(doutr0_ptr), + [outptr1] "+r"(doutr1_ptr), + [cnt] "+r"(cnt) + : [vzero] "w"(vzero), + [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [remain] "r"(cnt_remain), + [mask1] "w"(vmask_rp1), + [mask2] "w"(vmask_rp2), + [wmask] "w"(wmask), + [vbias] "w"(wbias) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21"); + } else { + asm volatile(INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2 MID_COMPUTE_S2 + MID_RESULT_S2 RIGHT_COMPUTE_S2 RIGHT_RESULT_S2 + : [inptr0] "+r"(din0_ptr), + [inptr1] "+r"(din1_ptr), + [inptr2] "+r"(din2_ptr), + [inptr3] "+r"(din3_ptr), + [inptr4] "+r"(din4_ptr), + [outptr0] "+r"(doutr0_ptr), + [outptr1] "+r"(doutr1_ptr), + [cnt] "+r"(cnt) + : [vzero] "w"(vzero), + [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [remain] "r"(cnt_remain), + [mask1] "w"(vmask_rp1), + [mask2] "w"(vmask_rp2), + [wmask] "w"(wmask), + [vbias] "w"(wbias) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21"); + } + doutr0 = doutr0 + 2 * w_out; + } +#else + for (int i = 0; i < h_in; i += 2) { + din0_ptr = dr0; + din1_ptr = dr1; + din2_ptr = dr2; + + doutr0_ptr = doutr0; + + if (i == 0) { + din0_ptr = zero_ptr; + din1_ptr = dr0; + din2_ptr = dr1; + dr0 = dr1; + dr1 = dr2; + dr2 = dr1 + w_in; + } else { + dr0 = dr2; + dr1 = dr0 + w_in; + dr2 = dr1 + w_in; + } + + //! 
process bottom pad + if (i + 2 > h_in) { + switch (i + 2 - h_in) { + case 2: + din1_ptr = zero_ptr; + case 1: + din2_ptr = zero_ptr; + default: + break; + } + } + int cnt = cnt_col; + unsigned int* mask_ptr = dmask; + if (flag_relu) { + asm volatile( + INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2_RELU MID_COMPUTE_S2 + MID_RESULT_S2_RELU RIGHT_COMPUTE_S2 RIGHT_RESULT_S2_RELU + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [outptr] "+r"(doutr0_ptr), + [cnt] "+r"(cnt), + [mask_ptr] "+r"(mask_ptr) + : [remain] "r"(cnt_remain), + [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias] "r"(bias_c) + : "cc", + "memory", + "q3", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } else { + asm volatile(INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2 MID_COMPUTE_S2 + MID_RESULT_S2 RIGHT_COMPUTE_S2 RIGHT_RESULT_S2 + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [outptr] "+r"(doutr0_ptr), + [cnt] "+r"(cnt), + [mask_ptr] "+r"(mask_ptr) + : [remain] "r"(cnt_remain), + [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias] "r"(bias_c) + : "cc", + "memory", + "q3", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } + doutr0 = doutr0 + w_out; + } +#endif + } + } +} + +/** + * \brief depthwise convolution kernel 3x3, stride 2, width <= 4 + */ +void conv_depthwise_3x3s2p1_bias_s(float* dout, + const float* din, + const float* weights, + const float* bias, + bool flag_bias, + bool flag_relu, + const int num, + const int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + ARMContext* ctx) { + int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7}; + int out_pad_idx[4] = {0, 1, 2, 3}; + float zeros[8] = {0.0f}; + + uint32x4_t vmask_rp1 = + vcgtq_s32(vdupq_n_s32(w_in), vld1q_s32(right_pad_idx)); // 0 2 4 6 + uint32x4_t vmask_rp2 = + vcgtq_s32(vdupq_n_s32(w_in), vld1q_s32(right_pad_idx + 4)); // 1 3 5 7 + + int size_in_channel = w_in * h_in; + int size_out_channel = w_out * h_out; + + unsigned int dmask[8]; + vst1q_u32(dmask, vmask_rp1); + vst1q_u32(dmask + 4, vmask_rp2); + + for (int n = 0; n < num; ++n) { + const float* din_batch = din + n * ch_in * size_in_channel; + float* dout_batch = dout + n * ch_in * size_out_channel; +#pragma omp parallel for + for (int i = 0; i < ch_in; ++i) { + const float* din_channel = din_batch + i * size_in_channel; + float* dout_channel = dout_batch + i * size_out_channel; + + const float* weight_ptr = weights + i * 9; + float32x4_t wr0 = vld1q_f32(weight_ptr); + float32x4_t wr1 = vld1q_f32(weight_ptr + 3); + float32x4_t wr2 = vld1q_f32(weight_ptr + 6); + + float bias_c = 0.f; + + if (flag_bias) { + bias_c = bias[i]; + } + float32x4_t vbias = vdupq_n_f32(bias_c); + int hs = -1; + int he = 2; + float out_buf[4]; + for (int j = 0; j < h_out; ++j) { + const float* dr0 = din_channel + hs * w_in; + const float* dr1 = dr0 + w_in; + const float* dr2 = dr1 + w_in; + if (hs == -1) { + dr0 = zeros; + } + if (he > h_in) { + dr2 = zeros; + } + const float* din0_ptr = dr0; + const float* din1_ptr = dr1; + const float* din2_ptr = dr2; + + unsigned int* mask_ptr = dmask; +#ifdef __aarch64__ + if (flag_relu) { + asm volatile(COMPUTE_S_S2 RESULT_S_S2_RELU + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [mask_ptr] "+r"(mask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias] "w"(vbias), + [out] "r"(out_buf) + : "v4", + "v5", 
+ "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15"); + } else { + asm volatile(COMPUTE_S_S2 RESULT_S_S2 + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [mask_ptr] "+r"(mask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias] "w"(vbias), + [out] "r"(out_buf) + : "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15"); + } +#else + if (flag_relu) { + asm volatile(COMPUTE_S_S2 RESULT_S_S2_RELU + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [mask_ptr] "+r"(mask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias] "r"(bias_c), + [out] "r"(out_buf) + : "cc", + "memory", + "q3", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } else { + asm volatile(COMPUTE_S_S2 RESULT_S_S2 + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [mask_ptr] "+r"(mask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias] "r"(bias_c), + [out] "r"(out_buf) + : "cc", + "memory", + "q3", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } +#endif + for (int w = 0; w < w_out; ++w) { + *dout_channel++ = out_buf[w]; + } + hs += 2; + he += 2; + } + } + } +} + +/** + * \brief depthwise convolution kernel 3x3, stride 2 + */ +// w_in > 7 +void conv_depthwise_3x3s2p0_bias(float* dout, + const float* din, + const float* weights, + const float* bias, + bool flag_bias, + bool flag_relu, + const int num, + const int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + ARMContext* ctx) { + int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7}; + int out_pad_idx[4] = {0, 1, 2, 3}; + + int tile_w = w_out >> 2; + int cnt_remain = w_out % 4; + + unsigned int size_right_remain = (unsigned int)(w_in - (tile_w << 3)); + + uint32x4_t vmask_rp1 = vcgtq_s32(vdupq_n_s32(size_right_remain), + vld1q_s32(right_pad_idx)); // 0 2 4 6 + uint32x4_t vmask_rp2 = vcgtq_s32(vdupq_n_s32(size_right_remain), + vld1q_s32(right_pad_idx + 4)); // 1 3 5 7 + uint32x4_t wmask = + vcgtq_s32(vdupq_n_s32(cnt_remain), vld1q_s32(out_pad_idx)); // 0 1 2 3 + int size_in_channel = w_in * h_in; + int size_out_channel = w_out * h_out; + + float* zero_ptr = ctx->workspace_data(); + memset(zero_ptr, 0, w_in * sizeof(float)); + float* write_ptr = zero_ptr + w_in; + + unsigned int dmask[12]; + + vst1q_u32(dmask, vmask_rp1); + vst1q_u32(dmask + 4, vmask_rp2); + vst1q_u32(dmask + 8, wmask); + + for (int n = 0; n < num; ++n) { + const float* din_batch = din + n * ch_in * size_in_channel; + float* dout_batch = dout + n * ch_in * size_out_channel; +#pragma omp parallel for + for (int i = 0; i < ch_in; ++i) { + const float* din_channel = din_batch + i * size_in_channel; + float* dout_channel = dout_batch + i * size_out_channel; + + const float* weight_ptr = weights + i * 9; + float32x4_t wr0 = vld1q_f32(weight_ptr); + float32x4_t wr1 = vld1q_f32(weight_ptr + 3); + float32x4_t wr2 = vld1q_f32(weight_ptr + 6); + + float32x4_t vzero = vdupq_n_f32(0.f); + +#ifdef __aarch64__ + float32x4_t wbias; + if (flag_bias) { + wbias = vdupq_n_f32(bias[i]); + } else { + wbias = vdupq_n_f32(0.f); + } +#else + float bias_c = 0.f; + if (flag_bias) { + bias_c = bias[i]; + } +#endif // __aarch64__ + + const float* dr0 = din_channel; + const float* dr1 = dr0 + w_in; + const float* dr2 = dr1 + w_in; + const float* dr3 = dr2 
+ w_in; + const float* dr4 = dr3 + w_in; + + const float* din0_ptr = dr0; + const float* din1_ptr = dr1; + const float* din2_ptr = dr2; + const float* din3_ptr = dr3; + const float* din4_ptr = dr4; + + float* doutr0 = dout_channel; + float* doutr0_ptr = nullptr; + float* doutr1_ptr = nullptr; + +#ifdef __aarch64__ + for (int i = 0; i < h_out; i += 2) { + din0_ptr = dr0; + din1_ptr = dr1; + din2_ptr = dr2; + din3_ptr = dr3; + din4_ptr = dr4; + + doutr0_ptr = doutr0; + doutr1_ptr = doutr0 + w_out; + + dr0 = dr4; + dr1 = dr0 + w_in; + dr2 = dr1 + w_in; + dr3 = dr2 + w_in; + dr4 = dr3 + w_in; + + //! process bottom pad + if (i * 2 + 5 > h_in) { + switch (i * 2 + 5 - h_in) { + case 4: + din1_ptr = zero_ptr; + case 3: + din2_ptr = zero_ptr; + case 2: + din3_ptr = zero_ptr; + case 1: + din4_ptr = zero_ptr; + case 0: + din4_ptr = zero_ptr; + default: + break; + } + } + //! process output pad + if (i + 2 > h_out) { + doutr1_ptr = write_ptr; + } + int cnt = tile_w; + if (flag_relu) { + asm volatile( + INIT_S2 + "ld1 {v15.4s}, [%[inptr0]] \n" + "ld1 {v18.4s}, [%[inptr1]] \n" + "ld1 {v19.4s}, [%[inptr2]] \n" + "ld1 {v20.4s}, [%[inptr3]] \n" + "ld1 {v21.4s}, [%[inptr4]] \n" + "ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8} + MID_COMPUTE_S2 MID_RESULT_S2_RELU + "cmp %w[remain], #1 \n" + "blt 4f \n" RIGHT_COMPUTE_S2 + RIGHT_RESULT_S2_RELU + "4: \n" + : [inptr0] "+r"(din0_ptr), + [inptr1] "+r"(din1_ptr), + [inptr2] "+r"(din2_ptr), + [inptr3] "+r"(din3_ptr), + [inptr4] "+r"(din4_ptr), + [outptr0] "+r"(doutr0_ptr), + [outptr1] "+r"(doutr1_ptr), + [cnt] "+r"(cnt) + : [vzero] "w"(vzero), + [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [remain] "r"(cnt_remain), + [mask1] "w"(vmask_rp1), + [mask2] "w"(vmask_rp2), + [wmask] "w"(wmask), + [vbias] "w"(wbias) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21"); + } else { + asm volatile( + INIT_S2 + "ld1 {v15.4s}, [%[inptr0]] \n" + "ld1 {v18.4s}, [%[inptr1]] \n" + "ld1 {v19.4s}, [%[inptr2]] \n" + "ld1 {v20.4s}, [%[inptr3]] \n" + "ld1 {v21.4s}, [%[inptr4]] \n" + "ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8} + MID_COMPUTE_S2 MID_RESULT_S2 + "cmp %w[remain], #1 \n" + "blt 4f \n" RIGHT_COMPUTE_S2 + RIGHT_RESULT_S2 + "4: \n" + : [inptr0] "+r"(din0_ptr), + [inptr1] "+r"(din1_ptr), + [inptr2] "+r"(din2_ptr), + [inptr3] "+r"(din3_ptr), + [inptr4] "+r"(din4_ptr), + [outptr0] "+r"(doutr0_ptr), + [outptr1] "+r"(doutr1_ptr), + [cnt] "+r"(cnt) + : [vzero] "w"(vzero), + [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [remain] "r"(cnt_remain), + [mask1] "w"(vmask_rp1), + [mask2] "w"(vmask_rp2), + [wmask] "w"(wmask), + [vbias] "w"(wbias) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21"); + } + doutr0 = doutr0 + 2 * w_out; + } +#else + for (int i = 0; i < h_out; i++) { + din0_ptr = dr0; + din1_ptr = dr1; + din2_ptr = dr2; + + doutr0_ptr = doutr0; + + dr0 = dr2; + dr1 = dr0 + w_in; + dr2 = dr1 + w_in; + + //! 
process bottom pad + if (i * 2 + 3 > h_in) { + switch (i * 2 + 3 - h_in) { + case 2: + din1_ptr = zero_ptr; + case 1: + din2_ptr = zero_ptr; + default: + break; + } + } + int cnt = tile_w; + unsigned int* mask_ptr = dmask; + if (flag_relu) { + asm volatile(INIT_S2 MID_COMPUTE_S2 MID_RESULT_S2_RELU + RIGHT_COMPUTE_S2 RIGHT_RESULT_S2_RELU + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [outptr] "+r"(doutr0_ptr), + [cnt] "+r"(cnt), + [mask_ptr] "+r"(mask_ptr) + : [remain] "r"(cnt_remain), + [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias] "r"(bias_c) + : "cc", + "memory", + "q3", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } else { + asm volatile(INIT_S2 MID_COMPUTE_S2 MID_RESULT_S2 RIGHT_COMPUTE_S2 + RIGHT_RESULT_S2 + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [outptr] "+r"(doutr0_ptr), + [cnt] "+r"(cnt), + [mask_ptr] "+r"(mask_ptr) + : [remain] "r"(cnt_remain), + [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias] "r"(bias_c) + : "cc", + "memory", + "q3", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } + doutr0 = doutr0 + w_out; + } +#endif + } + } +} + +/** + * \brief depthwise convolution kernel 3x3, stride 2, width <= 4 + */ +void conv_depthwise_3x3s2p0_bias_s(float* dout, + const float* din, + const float* weights, + const float* bias, + bool flag_bias, + bool flag_relu, + const int num, + const int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + ARMContext* ctx) { + int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7}; + int out_pad_idx[4] = {0, 1, 2, 3}; + float zeros[8] = {0.0f}; + const float zero_ptr[4] = {0.f, 0.f, 0.f, 0.f}; + + uint32x4_t vmask_rp1 = + vcgtq_s32(vdupq_n_s32(w_in), vld1q_s32(right_pad_idx)); // 0 2 4 6 + uint32x4_t vmask_rp2 = + vcgtq_s32(vdupq_n_s32(w_in), vld1q_s32(right_pad_idx + 4)); // 1 3 5 7 + + int size_in_channel = w_in * h_in; + int size_out_channel = w_out * h_out; + + unsigned int dmask[8]; + vst1q_u32(dmask, vmask_rp1); + vst1q_u32(dmask + 4, vmask_rp2); + + for (int n = 0; n < num; ++n) { + const float* din_batch = din + n * ch_in * size_in_channel; + float* dout_batch = dout + n * ch_in * size_out_channel; +#pragma omp parallel for + for (int i = 0; i < ch_in; ++i) { + const float* din_channel = din_batch + i * size_in_channel; + float* dout_channel = dout_batch + i * size_out_channel; + + const float* weight_ptr = weights + i * 9; + float32x4_t wr0 = vld1q_f32(weight_ptr); + float32x4_t wr1 = vld1q_f32(weight_ptr + 3); + float32x4_t wr2 = vld1q_f32(weight_ptr + 6); + + float bias_c = 0.f; + + if (flag_bias) { + bias_c = bias[i]; + } + float32x4_t vbias = vdupq_n_f32(bias_c); + float out_buf[4]; + const float* dr0 = din_channel; + const float* dr1 = dr0 + w_in; + const float* dr2 = dr1 + w_in; + for (int j = 0; j < h_out; j++) { + const float* din0_ptr = dr0; + const float* din1_ptr = dr1; + const float* din2_ptr = dr2; + if (j * 2 + 2 >= h_in) { + switch (j + 2 - h_in) { + case 1: + din1_ptr = zero_ptr; + case 0: + din2_ptr = zero_ptr; + default: + break; + } + } + dr0 = dr2; + dr1 = dr0 + w_in; + dr2 = dr1 + w_in; + + unsigned int* mask_ptr = dmask; +#ifdef __aarch64__ + if (flag_relu) { + asm volatile(COMPUTE_S_S2_P0 RESULT_S_S2_P0_RELU + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [mask_ptr] "+r"(mask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + 
[wr2] "w"(wr2), + [bias] "w"(vbias), + [out] "r"(out_buf) + : "cc", + "memory", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16"); + } else { + asm volatile(COMPUTE_S_S2_P0 RESULT_S_S2_P0 + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [mask_ptr] "+r"(mask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias] "w"(vbias), + [out] "r"(out_buf) + : "cc", + "memory", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16"); + } +#else + if (flag_relu) { + asm volatile(COMPUTE_S_S2_P0 RESULT_S_S2_P0_RELU + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias] "r"(bias_c), + [out] "r"(out_buf), + [mask_ptr] "r"(dmask) + : "cc", + "memory", + "q3", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } else { + asm volatile(COMPUTE_S_S2_P0 RESULT_S_S2_P0 + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias] "r"(bias_c), + [out] "r"(out_buf), + [mask_ptr] "r"(dmask) + : "cc", + "memory", + "q3", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } +#endif + for (int w = 0; w < w_out; ++w) { + *dout_channel++ = out_buf[w]; + } + } + } + } +} +} // namespace math +} // namespace arm +} // namespace lite +} // namespace paddle diff --git a/lite/backends/arm/math/conv3x3s2px_depthwise_fp32.cc b/lite/backends/arm/math/conv3x3s2px_depthwise_fp32.cc new file mode 100644 index 0000000000000000000000000000000000000000..9852c0f84eae8451ef795c95faddfc88e833bea8 --- /dev/null +++ b/lite/backends/arm/math/conv3x3s2px_depthwise_fp32.cc @@ -0,0 +1,362 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include "lite/backends/arm/math/conv_block_utils.h" +#include "lite/backends/arm/math/conv_impl.h" +#include "lite/core/context.h" +#include "lite/operators/op_params.h" +#ifdef ARM_WITH_OMP +#include +#endif + +namespace paddle { +namespace lite { +namespace arm { +namespace math { + +void conv_3x3s2_depthwise_fp32(const float* i_data, + float* o_data, + int bs, + int oc, + int oh, + int ow, + int ic, + int ih, + int win, + const float* weights, + const float* bias, + const operators::ConvParam& param, + ARMContext* ctx) { + auto paddings = *param.paddings; + int threads = ctx->threads(); + const int pad_h = paddings[0]; + const int pad_w = paddings[2]; + const int out_c_block = 4; + const int out_h_kernel = 1; + const int out_w_kernel = 4; + const int win_ext = ow * 2 + 1; + const int ow_round = ROUNDUP(ow, 4); + const int win_round = ROUNDUP(win_ext, 4); + const int hin_round = oh * 2 + 1; + const int prein_size = win_round * hin_round * out_c_block; + auto workspace_size = + threads * prein_size + win_round /*tmp zero*/ + ow_round /*tmp writer*/; + ctx->ExtendWorkspace(sizeof(float) * workspace_size); + + bool flag_relu = param.fuse_relu; + bool flag_bias = param.bias != nullptr; + + /// get workspace + auto ptr_zero = ctx->workspace_data(); + memset(ptr_zero, 0, sizeof(float) * win_round); + float* ptr_write = ptr_zero + win_round; + + int size_in_channel = win * ih; + int size_out_channel = ow * oh; + + int ws = -pad_w; + int we = ws + win_round; + int hs = -pad_h; + int he = hs + hin_round; + int w_loop = ow_round / 4; + auto remain = w_loop * 4 - ow; + bool flag_remain = remain > 0; + remain = 4 - remain; + remain = remain > 0 ? remain : 0; + int row_len = win_round * out_c_block; + + for (int n = 0; n < bs; ++n) { + const float* din_batch = i_data + n * ic * size_in_channel; + float* dout_batch = o_data + n * oc * size_out_channel; +#pragma omp parallel for num_threads(threads) + for (int c = 0; c < oc; c += out_c_block) { +#ifdef ARM_WITH_OMP + float* pre_din = ptr_write + ow_round + omp_get_thread_num() * prein_size; +#else + float* pre_din = ptr_write + ow_round; +#endif + /// const array size + prepack_input_nxwc4_dw( + din_batch, pre_din, c, hs, he, ws, we, ic, win, ih, ptr_zero); + const float* weight_c = weights + c * 9; // kernel_w * kernel_h + float* dout_c00 = dout_batch + c * size_out_channel; + float bias_local[4] = {0, 0, 0, 0}; + if (flag_bias) { + bias_local[0] = bias[c]; + bias_local[1] = bias[c + 1]; + bias_local[2] = bias[c + 2]; + bias_local[3] = bias[c + 3]; + } +#ifdef __aarch64__ + float32x4_t w0 = vld1q_f32(weight_c); // w0, v23 + float32x4_t w1 = vld1q_f32(weight_c + 4); // w1, v24 + float32x4_t w2 = vld1q_f32(weight_c + 8); // w2, v25 + float32x4_t w3 = vld1q_f32(weight_c + 12); // w3, v26 + float32x4_t w4 = vld1q_f32(weight_c + 16); // w4, v27 + float32x4_t w5 = vld1q_f32(weight_c + 20); // w5, v28 + float32x4_t w6 = vld1q_f32(weight_c + 24); // w6, v29 + float32x4_t w7 = vld1q_f32(weight_c + 28); // w7, v30 + float32x4_t w8 = vld1q_f32(weight_c + 32); // w8, v31 +#endif + for (int h = 0; h < oh; h += out_h_kernel) { + float* outc0 = dout_c00 + h * ow; + float* outc1 = outc0 + size_out_channel; + float* outc2 = outc1 + size_out_channel; + float* outc3 = outc2 + size_out_channel; + const float* inr0 = pre_din + h * 2 * row_len; + const float* inr1 = inr0 + row_len; + const float* inr2 = inr1 + row_len; + if (c + out_c_block > oc) { + switch (c + out_c_block - oc) { + case 3: + outc1 = ptr_write; + case 2: + outc2 = ptr_write; + case 1: + 
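                  // intentional fall-through: when the last channel block has
                  // fewer than four valid output channels, the surplus channel
                  // pointers are all redirected to the scratch buffer
                  // ptr_write so the vector stores stay in bounds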
outc3 = ptr_write; + default: + break; + } + } + auto c0 = outc0; + auto c1 = outc1; + auto c2 = outc2; + auto c3 = outc3; + float pre_out[16]; + for (int w = 0; w < w_loop; ++w) { + bool flag_mask = (w == w_loop - 1) && flag_remain; + if (flag_mask) { + c0 = outc0; + c1 = outc1; + c2 = outc2; + c3 = outc3; + outc0 = pre_out; + outc1 = pre_out + 4; + outc2 = pre_out + 8; + outc3 = pre_out + 12; + } +// clang-format off +#ifdef __aarch64__ + asm volatile( + "ldr q8, [%[bias]]\n" /* load bias */ + "ldp q0, q1, [%[inr0]], #32\n" /* load input r0*/ + "and v19.16b, v8.16b, v8.16b\n" + "ldp q2, q3, [%[inr0]], #32\n" /* load input r0*/ + "and v20.16b, v8.16b, v8.16b\n" + "ldp q4, q5, [%[inr0]], #32\n" /* load input r0*/ + "and v21.16b, v8.16b, v8.16b\n" + "ldp q6, q7, [%[inr0]], #32\n" /* load input r0*/ + "and v22.16b, v8.16b, v8.16b\n" + "ldr q8, [%[inr0]]\n" /* load input r0*/ + /* r0 mul w0-w2, get out */ + "fmla v19.4s , %[w0].4s, v0.4s\n" /* outr0 = w0 * r0, 0*/ + "fmla v20.4s , %[w0].4s, v2.4s\n" /* outr1 = w0 * r0, 2*/ + "fmla v21.4s , %[w0].4s, v4.4s\n" /* outr2 = w0 * r0, 4*/ + "fmla v22.4s , %[w0].4s, v6.4s\n" /* outr3 = w0 * r0, 6*/ + "fmla v19.4s , %[w1].4s, v1.4s\n" /* outr0 = w1 * r0, 1*/ + "ldp q0, q1, [%[inr1]], #32\n" /* load input r1*/ + "fmla v20.4s , %[w1].4s, v3.4s\n" /* outr1 = w1 * r0, 3*/ + "fmla v21.4s , %[w1].4s, v5.4s\n" /* outr2 = w1 * r0, 5*/ + "fmla v22.4s , %[w1].4s, v7.4s\n" /* outr3 = w1 * r0, 7*/ + "fmla v19.4s , %[w2].4s, v2.4s\n" /* outr0 = w0 * r0, 2*/ + "ldp q2, q3, [%[inr1]], #32\n" /* load input r1*/ + "fmla v20.4s , %[w2].4s, v4.4s\n" /* outr1 = w0 * r0, 4*/ + "ldp q4, q5, [%[inr1]], #32\n" /* load input r1*/ + "fmla v21.4s , %[w2].4s, v6.4s\n" /* outr2 = w0 * r0, 6*/ + "ldp q6, q7, [%[inr1]], #32\n" /* load input r1*/ + "fmla v22.4s , %[w2].4s, v8.4s\n" /* outr3 = w0 * r0, 8*/ + "ldr q8, [%[inr1]]\n" /* load input r1*/ + /* r1, mul w3-w5, get out */ + "fmla v19.4s , %[w3].4s, v0.4s\n" /* outr0 = w3 * r1, 0*/ + "fmla v20.4s , %[w3].4s, v2.4s\n" /* outr1 = w3 * r1, 2*/ + "fmla v21.4s , %[w3].4s, v4.4s\n" /* outr2 = w3 * r1, 4*/ + "fmla v22.4s , %[w3].4s, v6.4s\n" /* outr3 = w3 * r1, 6*/ + "fmla v19.4s , %[w4].4s, v1.4s\n" /* outr0 = w4 * r1, 1*/ + "ldp q0, q1, [%[inr2]], #32\n" /* load input r2*/ + "fmla v20.4s , %[w4].4s, v3.4s\n" /* outr1 = w4 * r1, 3*/ + "fmla v21.4s , %[w4].4s, v5.4s\n" /* outr2 = w4 * r1, 5*/ + "fmla v22.4s , %[w4].4s, v7.4s\n" /* outr3 = w4 * r1, 7*/ + "fmla v19.4s , %[w5].4s, v2.4s\n" /* outr0 = w5 * r1, 2*/ + "ldp q2, q3, [%[inr2]], #32\n" /* load input r2*/ + "fmla v20.4s , %[w5].4s, v4.4s\n" /* outr1 = w5 * r1, 4*/ + "ldp q4, q5, [%[inr2]], #32\n" /* load input r2*/ + "fmla v21.4s , %[w5].4s, v6.4s\n" /* outr2 = w5 * r1, 6*/ + "ldp q6, q7, [%[inr2]], #32\n" /* load input r2*/ + "fmla v22.4s , %[w5].4s, v8.4s\n" /* outr3 = w5 * r1, 8*/ + "ldr q8, [%[inr2]]\n" /* load input r2*/ + /* r2, mul w6-w8, get out r0, r1 */ + "fmla v19.4s , %[w6].4s, v0.4s\n" /* outr0 = w6 * r2, 0*/ + "fmla v20.4s , %[w6].4s, v2.4s\n" /* outr1 = w6 * r2, 2*/ + "fmla v21.4s , %[w6].4s, v4.4s\n" /* outr2 = w6 * r2, 4*/ + "fmla v22.4s , %[w6].4s, v6.4s\n" /* outr3 = w6 * r2, 6*/ + "fmla v19.4s , %[w7].4s, v1.4s\n" /* outr0 = w7 * r2, 1*/ + "fmla v20.4s , %[w7].4s, v3.4s\n" /* outr1 = w7 * r2, 3*/ + "fmla v21.4s , %[w7].4s, v5.4s\n" /* outr2 = w7 * r2, 5*/ + "fmla v22.4s , %[w7].4s, v7.4s\n" /* outr3 = w7 * r2, 7*/ + "fmla v19.4s , %[w8].4s, v2.4s\n" /* outr0 = w8 * r2, 2*/ + "fmla v20.4s , %[w8].4s, v4.4s\n" /* outr1 = w8 * r2, 4*/ + "fmla v21.4s , %[w8].4s, 
v6.4s\n" /* outr2 = w8 * r2, 6*/ + "fmla v22.4s , %[w8].4s, v8.4s\n" /* outr3 = w8 * r2, 8*/ + /* transpose */ + "trn1 v0.4s, v19.4s, v20.4s\n" /* r0: a0a1c0c1*/ + "trn2 v1.4s, v19.4s, v20.4s\n" /* r0: b0b1d0d1*/ + "trn1 v2.4s, v21.4s, v22.4s\n" /* r0: a2a3c2c3*/ + "trn2 v3.4s, v21.4s, v22.4s\n" /* r0: b2b3d2d3*/ + "trn1 v19.2d, v0.2d, v2.2d\n" /* r0: a0a1a2a3*/ + "trn2 v21.2d, v0.2d, v2.2d\n" /* r0: c0c1c2c3*/ + "trn1 v20.2d, v1.2d, v3.2d\n" /* r0: b0b1b2b3*/ + "trn2 v22.2d, v1.2d, v3.2d\n" /* r0: d0d1d2d3*/ + /* relu */ + "cbz %w[flag_relu], 0f\n" /* skip relu*/ + "movi v0.4s, #0\n" /* for relu */ + "fmax v19.4s, v19.4s, v0.4s\n" + "fmax v20.4s, v20.4s, v0.4s\n" + "fmax v21.4s, v21.4s, v0.4s\n" + "fmax v22.4s, v22.4s, v0.4s\n" + /* save result */ + "0:\n" + "str q19, [%[outc0]], #16\n" + "str q20, [%[outc1]], #16\n" + "str q21, [%[outc2]], #16\n" + "str q22, [%[outc3]], #16\n" + :[inr0] "+r"(inr0), [inr1] "+r"(inr1), + [inr2] "+r"(inr2), + [outc0]"+r"(outc0), [outc1]"+r"(outc1), + [outc2]"+r"(outc2), [outc3]"+r"(outc3) + :[w0] "w"(w0), [w1] "w"(w1), [w2] "w"(w2), + [w3] "w"(w3), [w4] "w"(w4), [w5] "w"(w5), + [w6] "w"(w6), [w7] "w"(w7), [w8] "w"(w8), + [bias] "r" (bias_local), [flag_relu]"r"(flag_relu) + : "cc", "memory", + "v0","v1","v2","v3","v4","v5","v6","v7", + "v8", "v19","v20","v21","v22" + ); +#else + asm volatile( + /* fill with bias */ + "vld1.32 {d16-d17}, [%[bias]]\n" /* load bias */ + /* load weights */ + "vld1.32 {d18-d21}, [%[wc0]]!\n" /* load w0-2, to q9-11 */ + "vld1.32 {d0-d3}, [%[r0]]!\n" /* load input r0, 0,1*/ + "vand.i32 q12, q8, q8\n" + "vld1.32 {d4-d7}, [%[r0]]!\n" /* load input r0, 2,3*/ + "vand.i32 q13, q8, q8\n" + "vld1.32 {d8-d11}, [%[r0]]!\n" /* load input r0, 4,5*/ + "vand.i32 q14, q8, q8\n" + "vld1.32 {d12-d15}, [%[r0]]!\n" /* load input r0, 6,7*/ + "vand.i32 q15, q8, q8\n" + "vld1.32 {d16-d17}, [%[r0]]\n" /* load input r0, 8*/ + /* mul r0 with w0, w1, w2 */ + "vmla.f32 q12, q9, q0 @ w0 * inr0\n" + "vmla.f32 q13, q9, q2 @ w0 * inr2\n" + "vld1.32 {d22-d23}, [%[wc0]]!\n" /* load w2, to q11 */ + "vmla.f32 q14, q9, q4 @ w0 * inr4\n" + "vmla.f32 q15, q9, q6 @ w0 * inr6\n" + "vmla.f32 q12, q10, q1 @ w1 * inr1\n" + "vld1.32 {d0-d3}, [%[r1]]! @ load r1, 0, 1\n" + "vmla.f32 q13, q10, q3 @ w1 * inr3\n" + "vmla.f32 q14, q10, q5 @ w1 * inr5\n" + "vmla.f32 q15, q10, q7 @ w1 * inr7\n" + "vld1.32 {d18-d21}, [%[wc0]]!\n" /* load w3-4, to q9-10 */ + "vmla.f32 q12, q11, q2 @ w2 * inr2\n" + "vld1.32 {d4-d7}, [%[r1]]! @ load r1, 2, 3\n" + "vmla.f32 q13, q11, q4 @ w2 * inr4\n" + "vld1.32 {d8-d11}, [%[r1]]! @ load r1, 4, 5\n" + "vmla.f32 q14, q11, q6 @ w2 * inr6\n" + "vld1.32 {d12-d15}, [%[r1]]! @ load r1, 6, 7\n" + "vmla.f32 q15, q11, q8 @ w2 * inr8\n" + /* mul r1 with w3, w4, w5 */ + "vmla.f32 q12, q9, q0 @ w3 * inr0\n" + "vmla.f32 q13, q9, q2 @ w3 * inr2\n" + "vld1.32 {d22-d23}, [%[wc0]]!\n" /* load w5, to q11 */ + "vmla.f32 q14, q9, q4 @ w3 * inr4\n" + "vmla.f32 q15, q9, q6 @ w3 * inr6\n" + "vld1.32 {d16-d17}, [%[r1]]\n" /* load input r1, 8*/ + "vmla.f32 q12, q10, q1 @ w4 * inr1\n" + "vld1.32 {d0-d3}, [%[r2]]! @ load r2, 0, 1\n" + "vmla.f32 q13, q10, q3 @ w4 * inr3\n" + "vmla.f32 q14, q10, q5 @ w4 * inr5\n" + "vmla.f32 q15, q10, q7 @ w4 * inr7\n" + "vld1.32 {d18-d21}, [%[wc0]]!\n" /* load w6-7, to q9-10 */ + "vmla.f32 q12, q11, q2 @ w5 * inr2\n" + "vld1.32 {d4-d7}, [%[r2]]! @ load r2, 2, 3\n" + "vmla.f32 q13, q11, q4 @ w5 * inr4\n" + "vld1.32 {d8-d11}, [%[r2]]! @ load r2, 4, 5\n" + "vmla.f32 q14, q11, q6 @ w5 * inr6\n" + "vld1.32 {d12-d15}, [%[r2]]! 
@ load r2, 6, 7\n" + "vmla.f32 q15, q11, q8 @ w5 * inr8\n" + /* mul r2 with w6, w7, w8 */ + "vmla.f32 q12, q9, q0 @ w6 * inr0\n" + "vmla.f32 q13, q9, q2 @ w6 * inr2\n" + "vld1.32 {d22-d23}, [%[wc0]]!\n" /* load w8, to q11 */ + "vmla.f32 q14, q9, q4 @ w6 * inr4\n" + "vmla.f32 q15, q9, q6 @ w6 * inr6\n" + "vld1.32 {d16-d17}, [%[r2]]\n" /* load input r2, 8*/ + "vmla.f32 q12, q10, q1 @ w7 * inr1\n" + "vmla.f32 q13, q10, q3 @ w7 * inr3\n" + "vmla.f32 q14, q10, q5 @ w7 * inr5\n" + "vmla.f32 q15, q10, q7 @ w7 * inr7\n" + "sub %[wc0], %[wc0], #144 @ wc0 - 144 to start address\n" + "vmla.f32 q12, q11, q2 @ w8 * inr2\n" + "vmla.f32 q13, q11, q4 @ w8 * inr4\n" + "vmla.f32 q14, q11, q6 @ w8 * inr6\n" + "vmla.f32 q15, q11, q8 @ w8 * inr8\n" + /* transpose */ + "vtrn.32 q12, q13\n" /* a0a1c0c1, b0b1d0d1*/ + "vtrn.32 q14, q15\n" /* a2a3c2c3, b2b3d2d3*/ + "vswp d25, d28\n" /* a0a1a2a3, c0c1c2c3*/ + "vswp d27, d30\n" /* b0b1b2b3, d0d1d2d3*/ + "cmp %[flag_relu], #0\n" + "beq 0f\n" /* skip relu*/ + "vmov.u32 q0, #0\n" + "vmax.f32 q12, q12, q0\n" + "vmax.f32 q13, q13, q0\n" + "vmax.f32 q14, q14, q0\n" + "vmax.f32 q15, q15, q0\n" + "0:\n" + "vst1.32 {d24-d25}, [%[outc0]]!\n" /* save outc0*/ + "vst1.32 {d26-d27}, [%[outc1]]!\n" /* save outc1*/ + "vst1.32 {d28-d29}, [%[outc2]]!\n" /* save outc2*/ + "vst1.32 {d30-d31}, [%[outc3]]!\n" /* save outc3*/ + :[r0] "+r"(inr0), [r1] "+r"(inr1), + [r2] "+r"(inr2), [wc0] "+r" (weight_c), + [outc0]"+r"(outc0), [outc1]"+r"(outc1), + [outc2]"+r"(outc2), [outc3]"+r"(outc3) + :[bias] "r" (bias_local), + [flag_relu]"r"(flag_relu) + :"cc", "memory", + "q0","q1","q2","q3","q4","q5","q6","q7", + "q8", "q9","q10","q11","q12","q13","q14","q15" + ); +#endif // __arch64__ + // clang-format off + if (flag_mask) { + for (int i = 0; i < remain; ++i) { + c0[i] = pre_out[i]; + c1[i] = pre_out[i + 4]; + c2[i] = pre_out[i + 8]; + c3[i] = pre_out[i + 12]; + } + } + } + } + } + } +} + +} // namespace math +} // namespace arm +} // namespace lite +} // namespace paddle diff --git a/lite/backends/arm/math/conv_block_utils.h b/lite/backends/arm/math/conv_block_utils.h index b2d16d18d2300ea51de8c8e9f25664ffdf4aebc7..e4279d9a728bc7af0f14a00b781db449fc426582 100644 --- a/lite/backends/arm/math/conv_block_utils.h +++ b/lite/backends/arm/math/conv_block_utils.h @@ -254,6 +254,7 @@ inline void prepack_input_nxwc4_dw(const float* din, LOG(FATAL) << "prepack_dw_input, valid height must > zero"; } float32x4_t vzero = vdupq_n_f32(0.f); + auto out_data = dout; int size_w = we - ws; int w0 = ws < 0 ? 0 : ws; @@ -269,6 +270,7 @@ inline void prepack_input_nxwc4_dw(const float* din, bool flag_ext_l = left_remain > 0; int left_sl = 4 - left_remain; + int left_valid_sl = left_sl > width ? 
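  // clamp the left-edge advance: after the left-padded block the channel
  // pointers step forward by at most `width` columns, so rows narrower than
  // left_sl are not over-stepped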
width : left_sl; uint32x4_t vmask_padl; bool flag_mask_l = false; if (flag_ext_l) { @@ -290,6 +292,7 @@ inline void prepack_input_nxwc4_dw(const float* din, } int size_c = width * height; for (int h = hs; h < he; ++h) { + dout = out_data + (h - hs) * 4 * size_w; auto ptr_c0 = din + cs * size_c + h * width; auto ptr_c1 = ptr_c0 + size_c; auto ptr_c2 = ptr_c1 + size_c; @@ -351,10 +354,10 @@ inline void prepack_input_nxwc4_dw(const float* din, } transpose_4x4(vc0, vc1, vc2, vc3, dout); dout += 16; - ptr_c0 += left_sl; - ptr_c1 += left_sl; - ptr_c2 += left_sl; - ptr_c3 += left_sl; + ptr_c0 += left_valid_sl; + ptr_c1 += left_valid_sl; + ptr_c2 += left_valid_sl; + ptr_c3 += left_valid_sl; } /// valid for (int i = 0; i < cnt_valid; ++i) { @@ -722,7 +725,57 @@ inline bool write_to_output_c1_fp32(const float* din, } return true; } - +#ifdef __aarch64__ +#define NCHWC2_TRANS_FP32_COMPUTE \ + "ldp q0, q1, [%[ptr_din]], #32 \n" /* load data, c0r0, c1r0, c0r1*/ \ + "movi v20.4s, #0 \n" /* for relu */ \ + "1: \n" /* main loop*/ \ + "trn1 v2.4s, v0.4s, v1.4s \n" /* trans q0, q1*/ \ + "trn2 v3.4s, v0.4s, v1.4s \n" /* trans q0, q1*/ \ + "ldp q0, q1, [%[ptr_din]], #32 \n" /* load data, c0r0, c1r0, c0r1*/ \ + "trn1 v4.2d, v2.2d, v3.2d \n" /* trans q8, q10*/ \ + "trn2 v5.2d, v2.2d, v3.2d \n" /* trans q8, q10*/ + +#define NCHWC2_TRANS_FP32_RELU \ + "fmax v2.4s, v4.4s, v20.4s \n" /*relu*/ \ + "fmax v3.4s, v5.4s, v20.4s \n" /*relu*/ + +#define NCHWC2_TRANS_FP32_STORE \ + "subs %w[cnt], %w[cnt], #1 \n" /* loop count -1*/ \ + \ + "str q2, [%[doutc0r0]], #16 \n" /* store c0r0*/ \ + "str q3, [%[doutc1r0]], #16 \n" /* store c2r0*/ \ + \ + "bne 1b \n" /* jump to main loop*/ +#else +#define NCHWC2_TRANS_FP32_COMPUTE \ + "vld1.32 {d0-d3}, [%[ptr_din]]! @ load data, c0r0, " \ + "c1r0, c0r1, c1r1, , c0r2, c1r2, c0r3, c1r3\n" \ + "vmov.u32 q15, #0 @ dump zero\n" \ + "1: @ main loop\n" \ + "vtrn.32 d0, d1 @ trans data:c0r0, c0r1, " \ + "c1r0, c1r1 \n" \ + "vtrn.32 d2, d3 @ trans data:c0r2, c0r3, " \ + "c1r2, c1r3 \n" \ + \ + "vswp d1, d2 @ swap data\n" + +#define NCHWC2_TRANS_FP32_RELU \ + "vmax.f32 q0, q0, q15 @ relu\n" \ + "vmax.f32 q1, q1, q15 @ relu\n" + +#define NCHWC2_TRANS_FP32_STORE \ + "vst1.32 {d0-d1}, [%[doutc0r0]]! @ store result, add " \ + "pointer\n" \ + "vst1.32 {d2-d3}, [%[doutc1r0]]! @ store result, add " \ + "pointer\n" \ + \ + "subs %[cnt], %[cnt], #1 @ loop count - 1\n" \ + \ + "vld1.32 {d0-d3}, [%[ptr_din]]! 
@ load data \n" \ + \ + "bne 1b @ jump to main loop\n" +#endif /*wirte result in outputs * input din: [n, c / 4, h, w * 4], output dout: [n, c, h, w] */ @@ -777,127 +830,41 @@ inline bool write_to_output_c2_fp32(const float* din, int cnt_loop = cnt; if (flag_relu) { #ifdef __aarch64__ - asm volatile( - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load data, c0r0, c1r0, c0r1, - c1r1, , c0r2, c1r2, c0r3, - c1r3 */ - "movi v20.4s, #0 \n" /* for relu */ - "1: \n" /* main loop*/ - "trn1 v2.4s, v0.4s, v1.4s \n" /* trans q0, q1*/ - "trn2 v3.4s, v0.4s, v1.4s \n" /* trans q0, q1*/ - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load data, c0r0, c1r0, c0r1, - c1r1, , c0r2, c1r2, c0r3, - c1r3 */ - "trn1 v4.2d, v2.2d, v3.2d \n" /* trans q8, q10*/ - "trn2 v5.2d, v2.2d, v3.2d \n" /* trans q8, q10*/ - - "fmax v2.4s, v4.4s, v20.4s \n" /*relu*/ - "fmax v3.4s, v5.4s, v20.4s \n" /*relu*/ - - "subs %w[cnt], %w[cnt], #1 \n" /* loop count -1*/ - - "str q2, [%[doutc0r0]], #16 \n" /* store c0r0*/ - "str q3, [%[doutc1r0]], #16 \n" /* store c2r0*/ - - "bne 1b \n" /* jump to main loop*/ - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [cnt] "+r"(cnt_loop), - [ptr_din] "+r"(din_hei_ptr) - : - : "v0", "v1", "v2", "v3", "v4", "v5", "v20"); + asm volatile(NCHWC2_TRANS_FP32_COMPUTE NCHWC2_TRANS_FP32_RELU + NCHWC2_TRANS_FP32_STORE + : [doutc0r0] "+r"(doutc0_ptr), + [doutc1r0] "+r"(doutc1_ptr), + [cnt] "+r"(cnt_loop), + [ptr_din] "+r"(din_hei_ptr) + : + : "v0", "v1", "v2", "v3", "v4", "v5", "v20"); #else - asm volatile( - "vld1.32 {d0-d3}, [%[ptr_din]]! @ load data, c0r0, " - "c1r0, c0r1, c1r1, , c0r2, c1r2, c0r3, c1r3\n" - "vmov.u32 q15, #0 @ dump zero\n" - "1: @ main loop\n" - "vtrn.32 d0, d1 @ trans data:c0r0, c0r1, " - "c1r0, c1r1 \n" - "vtrn.32 d2, d3 @ trans data:c0r2, c0r3, " - "c1r2, c1r3 \n" - - "vswp d1, d2 @ swap data\n" - - "vmax.f32 q0, q0, q15 @ relu\n" - "vmax.f32 q1, q1, q15 @ relu\n" - - "vst1.32 {d0-d1}, [%[doutc0r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d2-d3}, [%[doutc1r0]]! @ store result, add " - "pointer\n" - - "subs %[cnt], %[cnt], #1 @ loop count - 1\n" - - "vld1.32 {d0-d3}, [%[ptr_din]]! 
@ load data \n" - - "bne 1b @ jump to main loop\n" - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [ptr_din] "+r"(din_hei_ptr), - [cnt] "+r"(cnt_loop) - : - : "q0", "q1", "q2", "q3", "q15"); + asm volatile(NCHWC2_TRANS_FP32_COMPUTE NCHWC2_TRANS_FP32_RELU + NCHWC2_TRANS_FP32_STORE + : [doutc0r0] "+r"(doutc0_ptr), + [doutc1r0] "+r"(doutc1_ptr), + [ptr_din] "+r"(din_hei_ptr), + [cnt] "+r"(cnt_loop) + : + : "q0", "q1", "q2", "q3", "q15"); #endif } else { #ifdef __aarch64__ - asm volatile( - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load data, c0r0, c1r0, c0r1, - c1r1, , c0r2, c1r2, c0r3, - c1r3 */ - "1: \n" /* main loop*/ - "trn1 v2.4s, v0.4s, v1.4s \n" /* trans q0, q1*/ - "trn2 v3.4s, v0.4s, v1.4s \n" /* trans q0, q1*/ - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load data, c0r0, c1r0, c0r1, - c1r1, , c0r2, c1r2, c0r3, - c1r3 */ - "trn1 v4.2d, v2.2d, v3.2d \n" /* trans q8, q10*/ - "trn2 v5.2d, v2.2d, v3.2d \n" /* trans q8, q10*/ - - "subs %w[cnt], %w[cnt], #1 \n" /* loop count -1*/ - - "str q4, [%[doutc0r0]], #16 \n" /* store c0r0*/ - "str q5, [%[doutc1r0]], #16 \n" /* store c2r0*/ - - "bne 1b \n" /* jump to main loop*/ - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [cnt] "+r"(cnt_loop), - [ptr_din] "+r"(din_hei_ptr) - : - : "v0", "v1", "v2", "v3", "v4", "v5"); + asm volatile(NCHWC2_TRANS_FP32_COMPUTE NCHWC2_TRANS_FP32_STORE + : [doutc0r0] "+r"(doutc0_ptr), + [doutc1r0] "+r"(doutc1_ptr), + [cnt] "+r"(cnt_loop), + [ptr_din] "+r"(din_hei_ptr) + : + : "v0", "v1", "v2", "v3", "v4", "v5"); #else - asm volatile( - "vld1.32 {d0-d3}, [%[ptr_din]]! @ load data, c0r0, " - "c1r0, c0r1, c1r1, , c0r2, c1r2, c0r3, c1r3\n" - "1: @ main loop\n" - "vtrn.32 d0, d1 @ trans data:c0r0, c0r1, " - "c1r0, c1r1 \n" - "vtrn.32 d2, d3 @ trans data:c0r2, c0r3, " - "c1r2, c1r3 \n" - - "vswp d1, d2 @ swap data\n" - - "vst1.32 {d0-d1}, [%[doutc0r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d2-d3}, [%[doutc1r0]]! @ store result, add " - "pointer\n" - - "subs %[cnt], %[cnt], #1 @ loop count - 1\n" - - "vld1.32 {d0-d3}, [%[ptr_din]]! 
@ load data \n" - - "bne 1b @ jump to main loop\n" - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [ptr_din] "+r"(din_hei_ptr), - [cnt] "+r"(cnt_loop) - : - : "q0", "q1", "q2", "q3", "q15"); + asm volatile(NCHWC2_TRANS_FP32_COMPUTE NCHWC2_TRANS_FP32_STORE + : [doutc0r0] "+r"(doutc0_ptr), + [doutc1r0] "+r"(doutc1_ptr), + [ptr_din] "+r"(din_hei_ptr), + [cnt] "+r"(cnt_loop) + : + : "q0", "q1", "q2", "q3", "q15"); #endif } } @@ -922,6 +889,70 @@ inline bool write_to_output_c2_fp32(const float* din, return true; } +#ifdef __aarch64__ +#define NCHWC4_TRANS_FP32_COMPUTE \ + "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ \ + "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ \ + "movi v20.4s, #0 \n" /* for relu */ \ + "1: \n" /* main loop*/ \ + "trn1 v8.4s, v0.4s, v1.4s \n" /* trans q0, q1*/ \ + "trn2 v9.4s, v0.4s, v1.4s \n" /* trans q0, q1*/ \ + "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ \ + "trn1 v10.4s, v2.4s, v3.4s \n" /* trans q2, q3*/ \ + "trn2 v11.4s, v2.4s, v3.4s \n" /* trans q2, q3*/ \ + "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ \ + "trn1 v16.2d, v8.2d, v10.2d \n" /* trans q8, q10*/ \ + "trn2 v17.2d, v8.2d, v10.2d \n" /* trans q8, q10*/ \ + "trn1 v18.2d, v9.2d, v11.2d \n" /* trans q9, q11*/ \ + "trn2 v19.2d, v9.2d, v11.2d \n" /* trans q9, q11*/ + +#define NCHWC4_TRANS_FP32_RELU \ + "fmax v16.4s, v16.4s, v20.4s \n" /*relu*/ \ + "fmax v17.4s, v17.4s, v20.4s \n" /*relu*/ \ + "fmax v18.4s, v18.4s, v20.4s \n" /*relu*/ \ + "fmax v19.4s, v19.4s, v20.4s \n" /*relu*/ + +#define NCHWC4_TRANS_FP32_STORE \ + "str q16, [%[doutc0r0]], #16 \n" /* store c0r0*/ \ + "str q17, [%[doutc2r0]], #16 \n" /* store c2r0*/ \ + "str q18, [%[doutc1r0]], #16 \n" /* store c1r0*/ \ + "str q19, [%[doutc3r0]], #16 \n" /* store c3r0*/ \ + \ + "subs %w[cnt], %w[cnt], #1 \n" /* loop count -1*/ \ + "bne 1b \n" /* jump to main loop*/ +#else +#define NCHWC4_TRANS_FP32_COMPUTE \ + "vld1.32 {d0-d3}, [%[ptr_din]]! @load data \n" \ + "vld1.32 {d4-d7}, [%[ptr_din]]! @load data \n" \ + "vmov.u32 q15, #0 @ dump zero\n" \ + "1: @ main loop\n" \ + "vtrn.32 q0, q1 @ trans data:c00c01c20c21 " \ + "\n" \ + "vtrn.32 q2, q3 @ trans data:c02c03c22c23 " \ + "\n" \ + \ + "vswp d1, d4 @ swap data\n" \ + "vswp d3, d6 @ swap data\n" + +#define NCHWC4_TRANS_FP32_RELU \ + "vmax.f32 q0, q0, q15 @ relu\n" \ + "vmax.f32 q1, q1, q15 @ relu\n" \ + "vmax.f32 q2, q2, q15 @ relu\n" \ + "vmax.f32 q3, q3, q15 @ relu\n" + +#define NCHWC4_TRANS_FP32_STORE \ + "vst1.32 {d0-d1}, [%[doutc0r0]]! @ store result, add pointer\n" \ + "vst1.32 {d2-d3}, [%[doutc1r0]]! @ store result, add pointer\n" \ + "vst1.32 {d4-d5}, [%[doutc2r0]]! @ store result, add pointer\n" \ + "vst1.32 {d6-d7}, [%[doutc3r0]]! @ store result, add pointer\n" \ + \ + "subs %[cnt], %[cnt], #1 @ loop count - 1\n" \ + \ + "vld1.32 {d0-d3}, [%[ptr_din]]! @load data \n" \ + "vld1.32 {d4-d7}, [%[ptr_din]]! @load data \n" \ + \ + "bne 1b @ jump to main loop\n" +#endif /*wirte result in outputs * input din: [n, c / 4, h, w * 4], output dout: [n, c, h, w] */ @@ -958,7 +989,9 @@ inline bool write_to_output_c4_fp32(const float* din, int size_h = (he > height ? height : he) - hs; // size_h == hei_n - int cnt = (width - ws) / w4; + int valid_we = we > width ? 
width : we; + int cnt = (valid_we - ws) / w4; + int remain = valid_we - ws - cnt * w4; for (int i = 0; i < size_h; i++) { int size_w = i * width; @@ -983,185 +1016,88 @@ inline bool write_to_output_c4_fp32(const float* din, int cnt_loop = cnt; if (flag_relu) { #ifdef __aarch64__ - asm volatile( - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - "movi v20.4s, #0 \n" /* for relu */ - "1: \n" /* main loop*/ - "trn1 v8.4s, v0.4s, v1.4s \n" /* trans q0, q1*/ - "trn2 v9.4s, v0.4s, v1.4s \n" /* trans q0, q1*/ - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - "trn1 v10.4s, v2.4s, v3.4s \n" /* trans q2, q3*/ - "trn2 v11.4s, v2.4s, v3.4s \n" /* trans q2, q3*/ - "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - "trn1 v16.2d, v8.2d, v10.2d \n" /* trans q8, q10*/ - "trn2 v17.2d, v8.2d, v10.2d \n" /* trans q8, q10*/ - "trn1 v18.2d, v9.2d, v11.2d \n" /* trans q9, q11*/ - "trn2 v19.2d, v9.2d, v11.2d \n" /* trans q9, q11*/ - "fmax v16.4s, v16.4s, v20.4s \n" /*relu*/ - "fmax v17.4s, v17.4s, v20.4s \n" /*relu*/ - "fmax v18.4s, v18.4s, v20.4s \n" /*relu*/ - "fmax v19.4s, v19.4s, v20.4s \n" /*relu*/ - "str q16, [%[doutc0r0]], #16 \n" /* store c0r0*/ - "str q17, [%[doutc2r0]], #16 \n" /* store c2r0*/ - "str q18, [%[doutc1r0]], #16 \n" /* store c1r0*/ - "str q19, [%[doutc3r0]], #16 \n" /* store c3r0*/ - - "subs %w[cnt], %w[cnt], #1 \n" /* loop count -1*/ - "bne 1b \n" /* jump to main loop*/ - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [doutc2r0] "+r"(doutc2_ptr), - [doutc3r0] "+r"(doutc3_ptr), - [cnt] "+r"(cnt_loop), - [ptr_din] "+r"(din_hei_ptr) - : - : "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v16", - "v17", - "v18", - "v19", - "v20"); + asm volatile(NCHWC4_TRANS_FP32_COMPUTE NCHWC4_TRANS_FP32_RELU + NCHWC4_TRANS_FP32_STORE + : [doutc0r0] "+r"(doutc0_ptr), + [doutc1r0] "+r"(doutc1_ptr), + [doutc2r0] "+r"(doutc2_ptr), + [doutc3r0] "+r"(doutc3_ptr), + [cnt] "+r"(cnt_loop), + [ptr_din] "+r"(din_hei_ptr) + : + : "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v16", + "v17", + "v18", + "v19", + "v20"); #else - asm volatile( - "vld1.32 {d0-d3}, [%[ptr_din]]! @load data \n" - "vld1.32 {d4-d7}, [%[ptr_din]]! @load data \n" - "vmov.u32 q15, #0 @ dump zero\n" - "1: @ main loop\n" - "vtrn.32 q0, q1 @ trans data:c00c01c20c21 " - "\n" - "vtrn.32 q2, q3 @ trans data:c02c03c22c23 " - "\n" - - "vswp d1, d4 @ swap data\n" - "vswp d3, d6 @ swap data\n" - - "vmax.f32 q0, q0, q15 @ relu\n" - "vmax.f32 q1, q1, q15 @ relu\n" - "vmax.f32 q2, q2, q15 @ relu\n" - "vmax.f32 q3, q3, q15 @ relu\n" - - "vst1.32 {d0-d1}, [%[doutc0r0]]! @ store result, add pointer\n" - "vst1.32 {d2-d3}, [%[doutc1r0]]! @ store result, add pointer\n" - "vst1.32 {d4-d5}, [%[doutc2r0]]! @ store result, add pointer\n" - "vst1.32 {d6-d7}, [%[doutc3r0]]! @ store result, add pointer\n" - - "subs %[cnt], %[cnt], #1 @ loop count - 1\n" - - "vld1.32 {d0-d3}, [%[ptr_din]]! @load data \n" - "vld1.32 {d4-d7}, [%[ptr_din]]! 
@load data \n" - - "bne 1b @ jump to main loop\n" - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [doutc2r0] "+r"(doutc2_ptr), - [doutc3r0] "+r"(doutc3_ptr), - [ptr_din] "+r"(din_hei_ptr), - [cnt] "+r"(cnt_loop) - : - : "q0", "q1", "q2", "q3", "q15"); + asm volatile(NCHWC4_TRANS_FP32_COMPUTE NCHWC4_TRANS_FP32_RELU + NCHWC4_TRANS_FP32_STORE + : [doutc0r0] "+r"(doutc0_ptr), + [doutc1r0] "+r"(doutc1_ptr), + [doutc2r0] "+r"(doutc2_ptr), + [doutc3r0] "+r"(doutc3_ptr), + [ptr_din] "+r"(din_hei_ptr), + [cnt] "+r"(cnt_loop) + : + : "q0", "q1", "q2", "q3", "q15"); #endif } else { #ifdef __aarch64__ - asm volatile( - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - "1: \n" /* main loop*/ - "trn1 v8.4s, v0.4s, v1.4s \n" /* trans q0, q1*/ - "trn2 v9.4s, v0.4s, v1.4s \n" /* trans q0, q1*/ - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - "trn1 v10.4s, v2.4s, v3.4s \n" /* trans q2, q3*/ - "trn2 v11.4s, v2.4s, v3.4s \n" /* trans q2, q3*/ - "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - "trn1 v16.2d, v8.2d, v10.2d \n" /* trans q8, q10*/ - "trn2 v17.2d, v8.2d, v10.2d \n" /* trans q8, q10*/ - "trn1 v18.2d, v9.2d, v11.2d \n" /* trans q9, q11*/ - "trn2 v19.2d, v9.2d, v11.2d \n" /* trans q9, q11*/ - "str q16, [%[doutc0r0]], #16 \n" /* store c0r0*/ - "str q17, [%[doutc2r0]], #16 \n" /* store c2r0*/ - "str q18, [%[doutc1r0]], #16 \n" /* store c1r0*/ - "str q19, [%[doutc3r0]], #16 \n" /* store c3r0*/ - - "subs %w[cnt], %w[cnt], #1 \n" /* loop count -1*/ - "bne 1b \n" /* jump to main loop*/ - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [doutc2r0] "+r"(doutc2_ptr), - [doutc3r0] "+r"(doutc3_ptr), - [cnt] "+r"(cnt_loop), - [ptr_din] "+r"(din_hei_ptr) - : - : "v0", - "v1", - "v2", - "v3", - "v8", - "v9", - "v10", - "v11", - "v16", - "v17", - "v18", - "v19"); + asm volatile(NCHWC4_TRANS_FP32_COMPUTE NCHWC4_TRANS_FP32_STORE + : [doutc0r0] "+r"(doutc0_ptr), + [doutc1r0] "+r"(doutc1_ptr), + [doutc2r0] "+r"(doutc2_ptr), + [doutc3r0] "+r"(doutc3_ptr), + [cnt] "+r"(cnt_loop), + [ptr_din] "+r"(din_hei_ptr) + : + : "v0", + "v1", + "v2", + "v3", + "v8", + "v9", + "v10", + "v11", + "v16", + "v17", + "v18", + "v19"); #else - asm volatile( - "vld1.32 {d0-d3}, [%[ptr_din]]! @load data \n" - "vld1.32 {d4-d7}, [%[ptr_din]]! @load data \n" - "1: @ main loop\n" - "vtrn.32 q0, q1 @ trans data:c00c01c20c21 " - "\n" - "vtrn.32 q2, q3 @ trans data:c02c03c22c23 " - "\n" - - "vswp d1, d4 @ swap data\n" - "vswp d3, d6 @ swap data\n" - - "vst1.32 {d0-d1}, [%[doutc0r0]]! @ store result, add pointer\n" - "vst1.32 {d2-d3}, [%[doutc1r0]]! @ store result, add pointer\n" - "vst1.32 {d4-d5}, [%[doutc2r0]]! @ store result, add pointer\n" - "vst1.32 {d6-d7}, [%[doutc3r0]]! @ store result, add pointer\n" - - "subs %[cnt], %[cnt], #1 @ loop count - 1\n" - - "vld1.32 {d0-d3}, [%[ptr_din]]! @load data \n" - "vld1.32 {d4-d7}, [%[ptr_din]]! 
@load data \n" - - "bne 1b @ jump to main loop\n" - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [doutc2r0] "+r"(doutc2_ptr), - [doutc3r0] "+r"(doutc3_ptr), - [ptr_din] "+r"(din_hei_ptr), - [cnt] "+r"(cnt_loop) - : - : "q0", "q1", "q2", "q3"); + asm volatile(NCHWC4_TRANS_FP32_COMPUTE NCHWC4_TRANS_FP32_STORE + : [doutc0r0] "+r"(doutc0_ptr), + [doutc1r0] "+r"(doutc1_ptr), + [doutc2r0] "+r"(doutc2_ptr), + [doutc3r0] "+r"(doutc3_ptr), + [ptr_din] "+r"(din_hei_ptr), + [cnt] "+r"(cnt_loop) + : + : "q0", "q1", "q2", "q3"); #endif } } - if (we > width) { + if (remain > 0) { int offset = i * w_round * c4 + c4 * w4 * cnt; din_hei_ptr = ptr_din + offset; - int j = we - w4; + int j = 0; if (flag_relu) { - for (; j < width; ++j) { + for (; j < remain; ++j) { *(doutc0_ptr++) = LITEMAX(din_hei_ptr[0], 0.f); *(doutc1_ptr++) = LITEMAX(din_hei_ptr[1], 0.f); *(doutc2_ptr++) = LITEMAX(din_hei_ptr[2], 0.f); @@ -1169,7 +1105,7 @@ inline bool write_to_output_c4_fp32(const float* din, din_hei_ptr += w4; } } else { - for (; j < width; ++j) { + for (; j < remain; ++j) { *(doutc0_ptr++) = din_hei_ptr[0]; *(doutc1_ptr++) = din_hei_ptr[1]; *(doutc2_ptr++) = din_hei_ptr[2]; @@ -1182,6 +1118,120 @@ inline bool write_to_output_c4_fp32(const float* din, return true; } +#ifdef __aarch64__ +#define NCHWC8_TRANS_FP32_COMPUTE \ + "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ \ + "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ \ + "ldp q4, q5, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ \ + "ldp q6, q7, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ \ + "movi v20.4s, #0 \n" /* for relu */ \ + "1: \n" /* main loop*/ \ + "trn1 v8.4s, v0.4s, v2.4s \n" /* trans q0, q1*/ \ + "trn2 v9.4s, v0.4s, v2.4s \n" /* trans q0, q1*/ \ + "trn1 v10.4s, v1.4s, v3.4s \n" /* trans q2, q3*/ \ + "trn2 v11.4s, v1.4s, v3.4s \n" /* trans q2, q3*/ \ + "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ \ + \ + "trn1 v12.4s, v4.4s, v6.4s \n" /* trans q0, q1*/ \ + "trn2 v13.4s, v4.4s, v6.4s \n" /* trans q0, q1*/ \ + "trn1 v14.4s, v5.4s, v7.4s \n" /* trans q2, q3*/ \ + "trn2 v15.4s, v5.4s, v7.4s \n" /* trans q2, q3*/ \ + "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ \ + \ + "trn1 v16.2d, v8.2d, v12.2d \n" /* trans q8, q10 00 01 02 03*/ \ + "trn2 v17.2d, v8.2d, v12.2d \n" /* trans q8, q10 20 21 22 23*/ \ + "trn1 v18.2d, v9.2d, v13.2d \n" /* trans q9, q11 10 11 12 13*/ \ + "trn2 v19.2d, v9.2d, v13.2d \n" /* trans q9, q11 30 31 32 33*/ \ + "ldp q4, q5, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ \ + \ + "trn1 v8.2d, v10.2d, v14.2d \n" /* trans q8, q10 40 41 42 43*/ \ + "trn2 v9.2d, v10.2d, v14.2d \n" /* trans q8, q10 60 61 62 63*/ \ + "trn1 v12.2d, v11.2d, v15.2d \n" /* trans q9, q11 50 51 52 53*/ \ + "trn2 v13.2d, v11.2d, v15.2d \n" /* trans q9, q11 70 71 72 73*/ \ + "ldp q6, q7, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ + +#define NCHWC8_TRANS_FP32_RELU \ + "fmax v16.4s, v16.4s, v20.4s \n" /*relu*/ \ + "fmax v17.4s, v17.4s, v20.4s \n" /*relu*/ \ + "fmax v18.4s, v18.4s, v20.4s \n" /*relu*/ \ + "fmax v19.4s, v19.4s, v20.4s \n" /*relu*/ \ + \ + "fmax v8.4s, v8.4s, v20.4s \n" /*relu*/ \ + "fmax v9.4s, v9.4s, v20.4s \n" /*relu*/ \ + "fmax v12.4s, v12.4s, v20.4s \n" /*relu*/ \ + "fmax v13.4s, v13.4s, v20.4s \n" /*relu*/ + +#define NCHWC8_TRANS_FP32_STORE \ + "str q16, [%[doutc0r0]], #16 \n" /* store c0r0*/ \ + "str q17, [%[doutc2r0]], #16 \n" /* store c2r0*/ \ + "str q18, [%[doutc1r0]], #16 \n" /* store c1r0*/ \ + "str q19, [%[doutc3r0]], #16 
\n" /* store c3r0*/ \ + \ + "subs %w[cnt], %w[cnt], #1 \n" /* loop count -1*/ \ + "str q8, [%[doutc4r0]], #16 \n" /* store c0r0*/ \ + "str q9, [%[doutc6r0]], #16 \n" /* store c2r0*/ \ + "str q12, [%[doutc5r0]], #16 \n" /* store c1r0*/ \ + "str q13, [%[doutc7r0]], #16 \n" /* store c3r0*/ \ + \ + "bne 1b \n" /* jump to main loop*/ +#else +#define NCHWC8_TRANS_FP32_COMPUTE \ + "vld1.32 {d0-d3}, [%[ptr_din]]! @load data \n" \ + "vld1.32 {d4-d7}, [%[ptr_din]]! @load data \n" \ + "vld1.32 {d8-d11}, [%[ptr_din]]! @load data \n" \ + "vld1.32 {d12-d15}, [%[ptr_din]]! @load data \n" \ + "vmov.u32 q15, #0 @ dump zero\n" \ + "1: @ main loop\n" \ + "vtrn.32 q0, q2 @ trans q0, q2 \n" \ + "vtrn.32 q4, q6 @ trans q4, q6 \n" \ + "vswp.32 d1, d8 @ swap d1, d8 \n" \ + "vswp.32 d5, d12 @ swap d5, d12\n" \ + \ + "vtrn.32 q1, q3 @ trans q1, q3 \n" \ + "vtrn.32 q5, q7 @ trans q5, q7 \n" \ + "vswp.32 d3, d10 @ swap d3, d10\n" \ + "vswp.32 d7, d14 @ swap d7, d14\n" + +#define NCHWC8_TRANS_FP32_RELU \ + "vmax.f32 q0, q0, q15 @ relu\n" \ + "vmax.f32 q1, q1, q15 @ relu\n" \ + "vmax.f32 q2, q2, q15 @ relu\n" \ + "vmax.f32 q3, q3, q15 @ relu\n" \ + \ + "vmax.f32 q4, q4, q15 @ relu\n" \ + "vmax.f32 q5, q5, q15 @ relu\n" \ + "vmax.f32 q6, q6, q15 @ relu\n" \ + "vmax.f32 q7, q7, q15 @ relu\n" + +#define NCHWC8_TRANS_FP32_STORE \ + "subs %[cnt], %[cnt], #1 @ loop count - 1\n" \ + "vst1.32 {d0-d1}, [%[doutc0r0]]! @ store result, add " \ + "pointer\n" \ + "vst1.32 {d2-d3}, [%[doutc4r0]]! @ store result, add " \ + "pointer\n" \ + "vst1.32 {d4-d5}, [%[doutc1r0]]! @ store result, add " \ + "pointer\n" \ + "vst1.32 {d6-d7}, [%[doutc5r0]]! @ store result, add " \ + "pointer\n" \ + \ + "vld1.32 {d0-d3}, [%[ptr_din]]! @load data \n" \ + "vld1.32 {d4-d7}, [%[ptr_din]]! @load data \n" \ + \ + "vst1.32 {d8-d9}, [%[doutc2r0]]! @ store result, add " \ + "pointer\n" \ + "vst1.32 {d10-d11}, [%[doutc6r0]]! @ store result, add " \ + "pointer\n" \ + "vst1.32 {d12-d13}, [%[doutc3r0]]! @ store result, add " \ + "pointer\n" \ + "vst1.32 {d14-d15}, [%[doutc7r0]]! @ store result, add " \ + "pointer\n" \ + \ + "vld1.32 {d8-d11}, [%[ptr_din]]! @load data \n" \ + "vld1.32 {d12-d15}, [%[ptr_din]]! 
@load data \n" \ + \ + "bne 1b @ jump to main loop\n" + +#endif /*wirte result in outputs * input din: [n, c / 8, h, w * 8], output dout: [n, c, h, w] */ @@ -1261,158 +1311,54 @@ inline bool write_to_output_c8_fp32(const float* din, if (cnt > 0) { int cnt_loop = cnt; #ifdef __aarch64__ - asm volatile( - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - "ldp q4, q5, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - "ldp q6, q7, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - "movi v20.4s, #0 \n" /* for relu */ - "1: \n" /* main loop*/ - "trn1 v8.4s, v0.4s, v2.4s \n" /* trans q0, q1*/ - "trn2 v9.4s, v0.4s, v2.4s \n" /* trans q0, q1*/ - "trn1 v10.4s, v1.4s, v3.4s \n" /* trans q2, q3*/ - "trn2 v11.4s, v1.4s, v3.4s \n" /* trans q2, q3*/ - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - - "trn1 v12.4s, v4.4s, v6.4s \n" /* trans q0, q1*/ - "trn2 v13.4s, v4.4s, v6.4s \n" /* trans q0, q1*/ - "trn1 v14.4s, v5.4s, v7.4s \n" /* trans q2, q3*/ - "trn2 v15.4s, v5.4s, v7.4s \n" /* trans q2, q3*/ - "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - - "trn1 v16.2d, v8.2d, v12.2d \n" /* trans q8, q10 00 01 02 03*/ - "trn2 v17.2d, v8.2d, v12.2d \n" /* trans q8, q10 20 21 22 23*/ - "trn1 v18.2d, v9.2d, v13.2d \n" /* trans q9, q11 10 11 12 13*/ - "trn2 v19.2d, v9.2d, v13.2d \n" /* trans q9, q11 30 31 32 33*/ - "ldp q4, q5, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - - "trn1 v8.2d, v10.2d, v14.2d \n" /* trans q8, q10 40 41 42 43*/ - "trn2 v9.2d, v10.2d, v14.2d \n" /* trans q8, q10 60 61 62 63*/ - "trn1 v12.2d, v11.2d, v15.2d \n" /* trans q9, q11 50 51 52 53*/ - "trn2 v13.2d, v11.2d, v15.2d \n" /* trans q9, q11 70 71 72 73*/ - "ldp q6, q7, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - - "fmax v16.4s, v16.4s, v20.4s \n" /*relu*/ - "fmax v17.4s, v17.4s, v20.4s \n" /*relu*/ - "fmax v18.4s, v18.4s, v20.4s \n" /*relu*/ - "fmax v19.4s, v19.4s, v20.4s \n" /*relu*/ - - "fmax v8.4s, v8.4s, v20.4s \n" /*relu*/ - "fmax v9.4s, v9.4s, v20.4s \n" /*relu*/ - "fmax v12.4s, v12.4s, v20.4s \n" /*relu*/ - "fmax v13.4s, v13.4s, v20.4s \n" /*relu*/ - - "str q16, [%[doutc0r0]], #16 \n" /* store c0r0*/ - "str q17, [%[doutc2r0]], #16 \n" /* store c2r0*/ - "str q18, [%[doutc1r0]], #16 \n" /* store c1r0*/ - "str q19, [%[doutc3r0]], #16 \n" /* store c3r0*/ - - "subs %w[cnt], %w[cnt], #1 \n" /* loop count -1*/ - "str q8, [%[doutc4r0]], #16 \n" /* store c0r0*/ - "str q9, [%[doutc6r0]], #16 \n" /* store c2r0*/ - "str q12, [%[doutc5r0]], #16 \n" /* store c1r0*/ - "str q13, [%[doutc7r0]], #16 \n" /* store c3r0*/ - - "bne 1b \n" /* jump to main loop*/ - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [doutc2r0] "+r"(doutc2_ptr), - [doutc3r0] "+r"(doutc3_ptr), - [doutc4r0] "+r"(doutc4_ptr), - [doutc5r0] "+r"(doutc5_ptr), - [doutc6r0] "+r"(doutc6_ptr), - [doutc7r0] "+r"(doutc7_ptr), - [cnt] "+r"(cnt_loop), - [ptr_din] "+r"(din_hei_ptr) - : - : "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20"); + asm volatile(NCHWC8_TRANS_FP32_COMPUTE NCHWC8_TRANS_FP32_RELU + NCHWC8_TRANS_FP32_STORE + : [doutc0r0] "+r"(doutc0_ptr), + [doutc1r0] "+r"(doutc1_ptr), + [doutc2r0] "+r"(doutc2_ptr), + [doutc3r0] "+r"(doutc3_ptr), + [doutc4r0] "+r"(doutc4_ptr), + [doutc5r0] "+r"(doutc5_ptr), + [doutc6r0] "+r"(doutc6_ptr), + [doutc7r0] "+r"(doutc7_ptr), + [cnt] "+r"(cnt_loop), + [ptr_din] 
"+r"(din_hei_ptr) + : + : "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20"); #else - asm volatile( - "vld1.32 {d0-d3}, [%[ptr_din]]! @load data \n" - "vld1.32 {d4-d7}, [%[ptr_din]]! @load data \n" - "vld1.32 {d8-d11}, [%[ptr_din]]! @load data \n" - "vld1.32 {d12-d15}, [%[ptr_din]]! @load data \n" - "vmov.u32 q15, #0 @ dump zero\n" - "1: @ main loop\n" - "vtrn.32 q0, q2 @ trans q0, q2 \n" - "vtrn.32 q4, q6 @ trans q4, q6 \n" - "vswp.32 d1, d8 @ swap d1, d8 \n" - "vswp.32 d5, d12 @ swap d5, d12\n" - - "vtrn.32 q1, q3 @ trans q1, q3 \n" - "vtrn.32 q5, q7 @ trans q5, q7 \n" - "vswp.32 d3, d10 @ swap d3, d10\n" - "vswp.32 d7, d14 @ swap d7, d14\n" - - "vmax.f32 q0, q0, q15 @ relu\n" - "vmax.f32 q1, q1, q15 @ relu\n" - "vmax.f32 q2, q2, q15 @ relu\n" - "vmax.f32 q3, q3, q15 @ relu\n" - - "vmax.f32 q4, q4, q15 @ relu\n" - "vmax.f32 q5, q5, q15 @ relu\n" - "vmax.f32 q6, q6, q15 @ relu\n" - "vmax.f32 q7, q7, q15 @ relu\n" - - "subs %[cnt], %[cnt], #1 @ loop count - 1\n" - "vst1.32 {d0-d1}, [%[doutc0r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d2-d3}, [%[doutc4r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d4-d5}, [%[doutc1r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d6-d7}, [%[doutc5r0]]! @ store result, add " - "pointer\n" - - "vld1.32 {d0-d3}, [%[ptr_din]]! @load data \n" - "vld1.32 {d4-d7}, [%[ptr_din]]! @load data \n" - - "vst1.32 {d8-d9}, [%[doutc2r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d10-d11}, [%[doutc6r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d12-d13}, [%[doutc3r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d14-d15}, [%[doutc7r0]]! @ store result, add " - "pointer\n" - - "vld1.32 {d8-d11}, [%[ptr_din]]! @load data \n" - "vld1.32 {d12-d15}, [%[ptr_din]]! 
@load data \n" - - "bne 1b @ jump to main loop\n" - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [doutc2r0] "+r"(doutc2_ptr), - [doutc3r0] "+r"(doutc3_ptr), - [doutc4r0] "+r"(doutc4_ptr), - [doutc5r0] "+r"(doutc5_ptr), - [doutc6r0] "+r"(doutc6_ptr), - [doutc7r0] "+r"(doutc7_ptr), - [ptr_din] "+r"(din_hei_ptr), - [cnt] "+r"(cnt_loop) - : - : "q0", "q1", "q2", "q3", "q4", "q15"); + asm volatile(NCHWC8_TRANS_FP32_COMPUTE NCHWC8_TRANS_FP32_RELU + NCHWC8_TRANS_FP32_STORE + : [doutc0r0] "+r"(doutc0_ptr), + [doutc1r0] "+r"(doutc1_ptr), + [doutc2r0] "+r"(doutc2_ptr), + [doutc3r0] "+r"(doutc3_ptr), + [doutc4r0] "+r"(doutc4_ptr), + [doutc5r0] "+r"(doutc5_ptr), + [doutc6r0] "+r"(doutc6_ptr), + [doutc7r0] "+r"(doutc7_ptr), + [ptr_din] "+r"(din_hei_ptr), + [cnt] "+r"(cnt_loop) + : + : "q0", "q1", "q2", "q3", "q4", "q15"); #endif } if (we > width) { @@ -1468,138 +1414,53 @@ inline bool write_to_output_c8_fp32(const float* din, if (cnt > 0) { int cnt_loop = cnt; #ifdef __aarch64__ - asm volatile( - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - "ldp q4, q5, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - "ldp q6, q7, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - "1: \n" /* main loop*/ - "trn1 v8.4s, v0.4s, v2.4s \n" /* trans q0, q1*/ - "trn2 v9.4s, v0.4s, v2.4s \n" /* trans q0, q1*/ - "trn1 v10.4s, v1.4s, v3.4s \n" /* trans q2, q3*/ - "trn2 v11.4s, v1.4s, v3.4s \n" /* trans q2, q3*/ - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - - "trn1 v12.4s, v4.4s, v6.4s \n" /* trans q0, q1*/ - "trn2 v13.4s, v4.4s, v6.4s \n" /* trans q0, q1*/ - "trn1 v14.4s, v5.4s, v7.4s \n" /* trans q2, q3*/ - "trn2 v15.4s, v5.4s, v7.4s \n" /* trans q2, q3*/ - "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - - "trn1 v16.2d, v8.2d, v12.2d \n" /* trans q8, q10 00 01 02 03*/ - "trn2 v17.2d, v8.2d, v12.2d \n" /* trans q8, q10 20 21 22 23*/ - "trn1 v18.2d, v9.2d, v13.2d \n" /* trans q9, q11 10 11 12 13*/ - "trn2 v19.2d, v9.2d, v13.2d \n" /* trans q9, q11 30 31 32 33*/ - "ldp q4, q5, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - - "trn1 v8.2d, v10.2d, v14.2d \n" /* trans q8, q10 40 41 42 43*/ - "trn2 v9.2d, v10.2d, v14.2d \n" /* trans q8, q10 60 61 62 63*/ - "trn1 v12.2d, v11.2d, v15.2d \n" /* trans q9, q11 50 51 52 53*/ - "trn2 v13.2d, v11.2d, v15.2d \n" /* trans q9, q11 70 71 72 73*/ - "ldp q6, q7, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - - "str q16, [%[doutc0r0]], #16 \n" /* store c0r0*/ - "str q17, [%[doutc2r0]], #16 \n" /* store c2r0*/ - "str q18, [%[doutc1r0]], #16 \n" /* store c1r0*/ - "str q19, [%[doutc3r0]], #16 \n" /* store c3r0*/ - - "subs %w[cnt], %w[cnt], #1 \n" /* loop count -1*/ - "str q8, [%[doutc4r0]], #16 \n" /* store c0r0*/ - "str q9, [%[doutc6r0]], #16 \n" /* store c2r0*/ - "str q12, [%[doutc5r0]], #16 \n" /* store c1r0*/ - "str q13, [%[doutc7r0]], #16 \n" /* store c3r0*/ - - "bne 1b \n" /* jump to main loop*/ - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [doutc2r0] "+r"(doutc2_ptr), - [doutc3r0] "+r"(doutc3_ptr), - [doutc4r0] "+r"(doutc4_ptr), - [doutc5r0] "+r"(doutc5_ptr), - [doutc6r0] "+r"(doutc6_ptr), - [doutc7r0] "+r"(doutc7_ptr), - [cnt] "+r"(cnt_loop), - [ptr_din] "+r"(din_hei_ptr) - : - : "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20"); + asm volatile(NCHWC8_TRANS_FP32_COMPUTE 
NCHWC8_TRANS_FP32_STORE + : [doutc0r0] "+r"(doutc0_ptr), + [doutc1r0] "+r"(doutc1_ptr), + [doutc2r0] "+r"(doutc2_ptr), + [doutc3r0] "+r"(doutc3_ptr), + [doutc4r0] "+r"(doutc4_ptr), + [doutc5r0] "+r"(doutc5_ptr), + [doutc6r0] "+r"(doutc6_ptr), + [doutc7r0] "+r"(doutc7_ptr), + [cnt] "+r"(cnt_loop), + [ptr_din] "+r"(din_hei_ptr) + : + : "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20"); #else - asm volatile( - "vld1.32 {d0-d3}, [%[ptr_din]]! @load data \n" - "vld1.32 {d4-d7}, [%[ptr_din]]! @load data \n" - "vld1.32 {d8-d11}, [%[ptr_din]]! @load data \n" - "vld1.32 {d12-d15}, [%[ptr_din]]! @load data \n" - "1: @ main loop\n" - "vtrn.32 q0, q2 @ trans q0, q2 \n" - "vtrn.32 q4, q6 @ trans q4, q6 \n" - "vswp.32 d1, d8 @ swap d1, d8 \n" - "vswp.32 d5, d12 @ swap d5, d12\n" - - "vtrn.32 q1, q3 @ trans q1, q3 \n" - "vtrn.32 q5, q7 @ trans q5, q7 \n" - "vswp.32 d3, d10 @ swap d3, d10\n" - "vswp.32 d7, d14 @ swap d7, d14\n" - - "subs %[cnt], %[cnt], #1 @ loop count - 1\n" - - "vst1.32 {d0-d1}, [%[doutc0r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d2-d3}, [%[doutc4r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d4-d5}, [%[doutc1r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d6-d7}, [%[doutc5r0]]! @ store result, add " - "pointer\n" - - "vld1.32 {d0-d3}, [%[ptr_din]]! @load data \n" - "vld1.32 {d4-d7}, [%[ptr_din]]! @load data \n" - - "vst1.32 {d8-d9}, [%[doutc2r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d10-d11}, [%[doutc6r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d12-d13}, [%[doutc3r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d14-d15}, [%[doutc7r0]]! @ store result, add " - "pointer\n" - - "vld1.32 {d8-d11}, [%[ptr_din]]! @load data \n" - "vld1.32 {d12-d15}, [%[ptr_din]]! 
@load data \n" - - "bne 1b @ jump to main loop\n" - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [doutc2r0] "+r"(doutc2_ptr), - [doutc3r0] "+r"(doutc3_ptr), - [doutc4r0] "+r"(doutc4_ptr), - [doutc5r0] "+r"(doutc5_ptr), - [doutc6r0] "+r"(doutc6_ptr), - [doutc7r0] "+r"(doutc7_ptr), - [ptr_din] "+r"(din_hei_ptr), - [cnt] "+r"(cnt_loop) - : - : "q0", "q1", "q2", "q3", "q4"); + asm volatile(NCHWC8_TRANS_FP32_COMPUTE NCHWC8_TRANS_FP32_STORE + : [doutc0r0] "+r"(doutc0_ptr), + [doutc1r0] "+r"(doutc1_ptr), + [doutc2r0] "+r"(doutc2_ptr), + [doutc3r0] "+r"(doutc3_ptr), + [doutc4r0] "+r"(doutc4_ptr), + [doutc5r0] "+r"(doutc5_ptr), + [doutc6r0] "+r"(doutc6_ptr), + [doutc7r0] "+r"(doutc7_ptr), + [ptr_din] "+r"(din_hei_ptr), + [cnt] "+r"(cnt_loop) + : + : "q0", "q1", "q2", "q3", "q4"); #endif } if (we > width) { diff --git a/lite/backends/arm/math/conv_depthwise.h b/lite/backends/arm/math/conv_depthwise.h index 1a23982cd575afb6b249390de7081165c03414b9..b6c3478880d5cb59999d23ff03e2e342708ca95b 100644 --- a/lite/backends/arm/math/conv_depthwise.h +++ b/lite/backends/arm/math/conv_depthwise.h @@ -85,38 +85,6 @@ void conv_depthwise_3x3s2_fp32(const float* din, bool flag_relu, ARMContext* ctx); -void conv_depthwise_3x3p0_fp32(const float* din, - float* dout, - int num, - int ch_out, - int h_out, - int w_out, - int ch_in, - int h_in, - int w_in, - const float* weights, - const float* bias, - int stride, - bool flag_bias, - bool flag_relu, - ARMContext* ctx); - -void conv_depthwise_3x3p1_fp32(const float* din, - float* dout, - int num, - int ch_out, - int h_out, - int w_out, - int ch_in, - int h_in, - int w_in, - const float* weights, - const float* bias, - int stride, - bool flag_bias, - bool flag_relu, - ARMContext* ctx); - template void conv_depthwise_3x3s1_int8(Dtype* dout, const int8_t* din, diff --git a/lite/backends/arm/math/conv_impl.cc b/lite/backends/arm/math/conv_impl.cc index 010563bf936c2f8454162c8aad48cd8815c5f7af..dc68e65f42a799d7fa7e8be75f5afcf3166b1df3 100644 --- a/lite/backends/arm/math/conv_impl.cc +++ b/lite/backends/arm/math/conv_impl.cc @@ -107,29 +107,35 @@ void im2col(const Dtype* data_im, int width, int kernel_h, int kernel_w, - int pad_h, - int pad_w, + int pad_top, + int pad_bottom, + int pad_left, + int pad_right, int stride_h, int stride_w, int dilation_h, int dilation_w, Dtype* data_col) { const int output_h = - (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; + (height + pad_top + pad_bottom - (dilation_h * (kernel_h - 1) + 1)) / + stride_h + + 1; const int output_w = - (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; + (width + pad_left + pad_right - (dilation_w * (kernel_w - 1) + 1)) / + stride_w + + 1; const int channel_size = height * width; for (int channel = channels; channel--; data_im += channel_size) { for (int kernel_row = 0; kernel_row < kernel_h; kernel_row++) { for (int kernel_col = 0; kernel_col < kernel_w; kernel_col++) { - int input_row = -pad_h + kernel_row * dilation_h; + int input_row = -pad_top + kernel_row * dilation_h; for (int output_rows = output_h; output_rows; output_rows--) { if (!is_a_ge_zero_and_a_lt_b(input_row, height)) { for (int output_cols = output_w; output_cols; output_cols--) { *(data_col++) = 0; } } else { - int input_col = -pad_w + kernel_col * dilation_w; + int input_col = -pad_left + kernel_col * dilation_w; for (int output_col = output_w; output_col; output_col--) { if (is_a_ge_zero_and_a_lt_b(input_col, width)) { *(data_col++) = data_im[input_row * width + input_col]; @@ 
-202,7 +208,8 @@ void conv1x1s1_gemm(const float* i_data, k, flag_bias, bias_group, - flag_relu); + flag_relu, + ctx); } else { sgemm_prepack(false, m, @@ -361,6 +368,8 @@ void conv_im2col_gemm(const float* i_data, float* tmp_work_space = ctx->workspace_data() + ctx->llc_size() / sizeof(float); + auto paddings = *param.paddings; + auto dilations = *param.dilations; //! use gemv when the output channel size = 1 for (int b = 0; b < num; ++b) { // dC @@ -378,12 +387,14 @@ void conv_im2col_gemm(const float* i_data, win, kernel_h, kernel_w, - param.paddings[0], - param.paddings[1], + paddings[0], + paddings[1], + paddings[2], + paddings[3], param.strides[0], param.strides[1], - param.dilations[0], - param.dilations[1], + dilations[0], + dilations[1], dB); if (n == 1) { @@ -395,7 +406,8 @@ void conv_im2col_gemm(const float* i_data, k, flag_bias, bias_group, - flag_relu); + flag_relu, + ctx); } else { int ldb = n; sgemm_prepack(false, @@ -434,14 +446,16 @@ void conv_im2col_gemm_int8(const int8_t* i_data, const float* scale) { int group = param.groups; auto filter_dims = param.filter->dims(); + auto paddings = *param.paddings; + auto dilations = *param.dilations; int kernel_h = filter_dims[2]; int kernel_w = filter_dims[3]; int stride_h = param.strides[0]; int stride_w = param.strides[1]; - int dila_h = param.dilations[0]; - int dila_w = param.dilations[1]; - int pad_h = param.paddings[0]; - int pad_w = param.paddings[1]; + int dila_h = dilations[0]; + int dila_w = dilations[1]; + int pad_h = paddings[0]; + int pad_w = paddings[2]; const int m = oc / group; const int n = oh * ow; const int k = ic * kernel_h * kernel_w / group; @@ -482,7 +496,9 @@ void conv_im2col_gemm_int8(const int8_t* i_data, kernel_h, kernel_w, pad_h, + paddings[1], pad_w, + paddings[3], stride_h, stride_w, dila_h, @@ -562,90 +578,83 @@ void conv_depthwise_3x3_fp32(const void* din, const operators::ConvParam& param, ARMContext* ctx, const float* scale) { - const int pad_h = param.paddings[0]; - const int pad_w = param.paddings[1]; - if (pad_w != pad_h) { - LOG(FATAL) << "fp32 depthwise conv3x3 pad_w: " << pad_w - << ", pad_h: " << pad_h << " must be equal"; - return; - } + auto paddings = *param.paddings; + const int pad_h = paddings[0]; + const int pad_w = paddings[2]; int stride = param.strides[1]; int pad = pad_w; bool flag_relu = param.fuse_relu; bool flag_bias = param.bias != nullptr; - if (stride == 1 && pad < 2) { // support pad = [0, 1] - conv_depthwise_3x3s1_fp32(reinterpret_cast(din), - reinterpret_cast(dout), - num, - ch_out, - h_out, - w_out, - ch_in, - h_in, - w_in, - reinterpret_cast(weights), - bias, - pad, - flag_bias, - flag_relu, - ctx); - } else if (stride == 2 && pad < 2) { // support pad = [0, 1] - conv_depthwise_3x3s2_fp32(reinterpret_cast(din), - reinterpret_cast(dout), - num, - ch_out, - h_out, - w_out, - ch_in, - h_in, - w_in, - reinterpret_cast(weights), - bias, - pad, - flag_bias, - flag_relu, - ctx); - } else { - LOG(FATAL) << "fp32 depthwise conv3x3 stride: " << stride - << " or pad(<2): " << pad << " unsupported"; - } -#if 0 - if (pad == 1) { - conv_depthwise_3x3p1_fp32(reinterpret_cast(din), - reinterpret_cast(dout), - num, - ch_out, - h_out, - w_out, - ch_in, - h_in, - w_in, - reinterpret_cast(weights), - bias, - stride, - flag_bias, - flag_relu, - ctx); - } else if (pad == 0 && h_in > 2) { - conv_depthwise_3x3p0_fp32(reinterpret_cast(din), - reinterpret_cast(dout), - num, - ch_out, - h_out, - w_out, - ch_in, - h_in, - w_in, - reinterpret_cast(weights), - bias, - stride, - flag_bias, - flag_relu, 
- ctx); + bool pads_equal = + ((paddings[0] == paddings[1]) && (paddings[2] == paddings[3])); + if (stride == 1) { + if (pads_equal && (pad_h == pad_w) && (pad < 2)) { // support pad = [0, 1] + conv_depthwise_3x3s1_fp32(reinterpret_cast(din), + reinterpret_cast(dout), + num, + ch_out, + h_out, + w_out, + ch_in, + h_in, + w_in, + reinterpret_cast(weights), + bias, + pad, + flag_bias, + flag_relu, + ctx); + } else { + conv_3x3s1_depthwise_fp32(reinterpret_cast(din), + reinterpret_cast(dout), + num, + ch_out, + h_out, + w_out, + ch_in, + h_in, + w_in, + reinterpret_cast(weights), + bias, + param, + ctx); + } + + } else if (stride == 2) { + if (pad_h == pad_w && (pad < 2)) { // support pad = [0, 1] + conv_depthwise_3x3s2_fp32(reinterpret_cast(din), + reinterpret_cast(dout), + num, + ch_out, + h_out, + w_out, + ch_in, + h_in, + w_in, + reinterpret_cast(weights), + bias, + pad, + flag_bias, + flag_relu, + ctx); + } else { + conv_3x3s2_depthwise_fp32(reinterpret_cast(din), + reinterpret_cast(dout), + num, + ch_out, + h_out, + w_out, + ch_in, + h_in, + w_in, + reinterpret_cast(weights), + bias, + param, + ctx); + } } else { - LOG(FATAL) << "unsupport this type 3x3 dw conv"; + LOG(FATAL) << "fp32 depthwise conv3x3 stride: " << stride << " unsupported"; } -#endif } void conv_depthwise_5x5_fp32(const void* din, @@ -662,7 +671,8 @@ void conv_depthwise_5x5_fp32(const void* din, const operators::ConvParam& param, ARMContext* ctx, const float* scale) { - int pad = param.paddings[1]; + auto paddings = *param.paddings; + int pad = paddings[0]; int stride = param.strides[1]; bool flag_relu = param.fuse_relu; bool flag_bias = param.bias != nullptr; @@ -718,8 +728,9 @@ void conv_depthwise_3x3_int8_fp32(const void* din, const operators::ConvParam& param, ARMContext* ctx, const float* scale) { - int pad_h = param.paddings[0]; - int pad_w = param.paddings[1]; + auto paddings = *param.paddings; + int pad_h = paddings[0]; + int pad_w = paddings[2]; int stride = param.strides[1]; bool flag_relu = param.fuse_relu; bool flag_bias = param.bias != nullptr; @@ -776,8 +787,9 @@ void conv_depthwise_3x3_int8_int8(const void* din, const operators::ConvParam& param, ARMContext* ctx, const float* scale) { - int pad_h = param.paddings[0]; - int pad_w = param.paddings[1]; + auto paddings = *param.paddings; + int pad_h = paddings[0]; + int pad_w = paddings[2]; int stride = param.strides[1]; bool flag_relu = param.fuse_relu; bool flag_bias = param.bias != nullptr; @@ -834,8 +846,9 @@ void conv_depthwise_5x5_int8_fp32(const void* din, const operators::ConvParam& param, ARMContext* ctx, const float* scale) { - int pad_h = param.paddings[0]; - int pad_w = param.paddings[1]; + auto paddings = *param.paddings; + int pad_h = paddings[0]; + int pad_w = paddings[2]; int stride = param.strides[1]; bool flag_relu = param.fuse_relu; bool flag_bias = param.bias != nullptr; @@ -875,8 +888,9 @@ void conv_depthwise_5x5_int8_int8(const void* din, const operators::ConvParam& param, ARMContext* ctx, const float* scale) { - int pad_h = param.paddings[0]; - int pad_w = param.paddings[1]; + auto paddings = *param.paddings; + int pad_h = paddings[0]; + int pad_w = paddings[2]; int stride = param.strides[1]; bool flag_relu = param.fuse_relu; bool flag_bias = param.bias != nullptr; diff --git a/lite/backends/arm/math/conv_impl.h b/lite/backends/arm/math/conv_impl.h index c5baa31e1414c4a7a0c926728e5c150c0fc3e21c..f4d00039aaa635d0ffb31846fd9ff9077ac0c621 100644 --- a/lite/backends/arm/math/conv_impl.h +++ b/lite/backends/arm/math/conv_impl.h @@ -314,7 
+314,23 @@ void fill_bias_int8(int* tensor, const int* bias, int channel, int channel_size); +// new winograd +void weight_trans_c4( + float* dest, const float* src, int ic, int oc, void* workspace); +void conv_compute_6x6_3x3(const float* input, + float* output, + int num, + int chout, + int hout, + int wout, + int chin, + int hin, + int win, + const float* weight, + const float* bias, + const operators::ConvParam& param, + ARMContext* ctx); } // namespace math } // namespace arm } // namespace lite diff --git a/lite/backends/arm/math/conv_winograd_3x3.cc b/lite/backends/arm/math/conv_winograd_3x3.cc index 87b08f63102104b325e95c093fe0fc0aaef243e0..894b946a32ccb7c487087291894a01a1d79334fa 100644 --- a/lite/backends/arm/math/conv_winograd_3x3.cc +++ b/lite/backends/arm/math/conv_winograd_3x3.cc @@ -37,9 +37,9 @@ void conv_winograd3x3(const float* din, const operators::ConvParam& param, ARMContext* ctx) { int threads = ctx->threads(); - - const int pad_h = param.paddings[0]; - const int pad_w = param.paddings[1]; + auto paddings = *param.paddings; + const int pad_h = paddings[0]; + const int pad_w = paddings[1]; int size_in_channel = win * hin; int size_out_channel = wout * hout; bool flag_relu = param.fuse_relu; diff --git a/lite/backends/arm/math/funcs.h b/lite/backends/arm/math/funcs.h index d8ef6ff47d0392ac15caf2d94b7c53ff63659da2..8977b5712c13dec0088d83db4cbfef8494785301 100644 --- a/lite/backends/arm/math/funcs.h +++ b/lite/backends/arm/math/funcs.h @@ -39,10 +39,12 @@ #include "lite/backends/arm/math/im2sequence.h" #include "lite/backends/arm/math/increment.h" #include "lite/backends/arm/math/interpolate.h" +#include "lite/backends/arm/math/layout.h" #include "lite/backends/arm/math/lrn.h" #include "lite/backends/arm/math/negative.h" #include "lite/backends/arm/math/norm.h" #include "lite/backends/arm/math/packed_sgemm.h" +#include "lite/backends/arm/math/packed_sgemm_c4.h" #include "lite/backends/arm/math/pad2d.h" #include "lite/backends/arm/math/pooling.h" #include "lite/backends/arm/math/power.h" diff --git a/lite/backends/arm/math/interpolate.cc b/lite/backends/arm/math/interpolate.cc index f89410ad11590c60bf5542702b60fa883298d3e6..e9e18043dfc09001ebba23f952a59474630e54aa 100644 --- a/lite/backends/arm/math/interpolate.cc +++ b/lite/backends/arm/math/interpolate.cc @@ -22,6 +22,28 @@ namespace lite { namespace arm { namespace math { +inline std::vector get_new_shape( + std::vector list_new_shape_tensor) { + // get tensor from + std::vector vec_new_shape; + for (size_t i = 0; i < list_new_shape_tensor.size(); ++i) { + auto tensor = list_new_shape_tensor[i]; + vec_new_shape.push_back(static_cast(*tensor->data())); + } + + return vec_new_shape; +} + +template +inline std::vector get_new_data_from_tensor(const Tensor* new_data_tensor) { + std::vector vec_new_data; + auto* new_data = new_data_tensor->data(); + lite::Tensor cpu_starts_tensor; + vec_new_data = + std::vector(new_data, new_data + new_data_tensor->dims().production()); + return vec_new_data; +} + // The following function bilinear_interp is partially base on // https://github.com/Tencent/ncnn/blob/master/src/layer/arm/interp_arm.cpp // Tencent is pleased to support the open source community by making ncnn @@ -472,33 +494,52 @@ void nearest_interp(const float* src, void interpolate(lite::Tensor* X, lite::Tensor* OutSize, + std::vector SizeTensor, + lite::Tensor* Scale, lite::Tensor* Out, int out_height, int out_width, - float height_scale, - float width_scale, + float scale, bool with_align, std::string interpolate_type) { + 
int in_h = X->dims()[2]; + int in_w = X->dims()[3]; + if (SizeTensor.size() > 0) { + auto new_size = get_new_shape(SizeTensor); + out_height = new_size[0]; + out_width = new_size[1]; + } else { + auto scale_tensor = Scale; + if (scale_tensor != nullptr) { + auto scale_data = get_new_data_from_tensor(scale_tensor); + scale = scale_data[0]; + } + if (scale > 0) { + out_height = static_cast(in_h * scale); + out_width = static_cast(in_w * scale); + } + auto out_size = OutSize; + if (out_size != nullptr) { + auto out_size_data = get_new_data_from_tensor(out_size); + out_height = static_cast(out_size_data[0]); + out_width = static_cast(out_size_data[1]); + } + } + float height_scale = scale; + float width_scale = scale; if (out_width > 0 && out_height > 0) { height_scale = static_cast(out_height / X->dims()[2]); width_scale = static_cast(out_width / X->dims()[3]); } - if (OutSize != nullptr) { - auto OutSize_data = OutSize->data(); - int h_out = OutSize_data[0]; // HW - int w_out = OutSize_data[1]; // HW - int num_cout = Out->dims()[0]; - int c_cout = Out->dims()[1]; - Out->Resize({num_cout, c_cout, h_out, w_out}); - } + int num_cout = X->dims()[0]; + int c_cout = X->dims()[1]; + Out->Resize({num_cout, c_cout, out_height, out_width}); float* dout = Out->mutable_data(); const float* din = X->data(); int out_num = Out->dims()[0]; int out_c = Out->dims()[1]; int count = out_num * out_c; - int in_h = X->dims()[2]; - int in_w = X->dims()[3]; int out_h = Out->dims()[2]; int out_w = Out->dims()[3]; int spatial_in = in_h * in_w; diff --git a/lite/backends/arm/math/interpolate.h b/lite/backends/arm/math/interpolate.h index be250f6a5e7581ba70809362d169167fea1d1c11..e9c41c5bc86c8f00d57e096e3cd2b5f37df3a474 100644 --- a/lite/backends/arm/math/interpolate.h +++ b/lite/backends/arm/math/interpolate.h @@ -44,11 +44,12 @@ void nearest_interp(const float* src, void interpolate(lite::Tensor* X, lite::Tensor* OutSize, + std::vector SizeTensor, + lite::Tensor* Scale, lite::Tensor* Out, int out_height, int out_width, - float height_scale, - float width_scale, + float scale, bool with_align, std::string interpolate_type); diff --git a/lite/backends/arm/math/layout.cc b/lite/backends/arm/math/layout.cc new file mode 100644 index 0000000000000000000000000000000000000000..fd9126ab48c8f829c82d0c78a338074c695f0b9c --- /dev/null +++ b/lite/backends/arm/math/layout.cc @@ -0,0 +1,668 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
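For orientation, the new lite/backends/arm/math/layout.cc introduced below provides NEON-accelerated NCHW2NHWC / NHWC2NCHW repacks; the scalar tail loops in that file define the element mapping the assembly reproduces. A minimal scalar sketch of that mapping follows as an editor's illustration only — it is not part of the patch, and the helper name is hypothetical:

// Reference semantics of NCHW2NHWC(N, C, HxW, X, Y): for each sample n,
// element (c, s) of the NCHW input lands at position (s, c) of the NHWC output.
template <typename T>
void nchw_to_nhwc_reference(int N, int C, int HxW, const T* X, T* Y) {
  for (int n = 0; n < N; ++n) {
    for (int c = 0; c < C; ++c) {
      for (int s = 0; s < HxW; ++s) {
        // NCHW offset: (n*C + c)*HxW + s ; NHWC offset: (n*HxW + s)*C + c
        Y[(n * HxW + s) * C + c] = X[(n * C + c) * HxW + s];
      }
    }
  }
}
// NHWC2NCHW is the inverse transform: swap the two index expressions above.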
+ +#include "lite/backends/arm/math/layout.h" +#include +#include +#include "lite/backends/arm/math/funcs.h" + +namespace paddle { +namespace lite { +namespace arm { +namespace math { +#ifdef __aarch64__ +#define TRANS_C4 \ + "ld1 {v0.4s}, [%[din0_ptr]] \n" \ + "ld1 {v1.4s}, [%[din1_ptr]] \n" \ + "ld1 {v2.4s}, [%[din2_ptr]] \n" \ + "ld1 {v3.4s}, [%[din3_ptr]] \n" \ + \ + "1: \n" \ + "trn1 v4.4s, v0.4s, v1.4s \n" /*00 10 02 12 */ \ + "trn1 v5.4s, v2.4s, v3.4s \n" /*20 30 22 32 */ \ + "trn2 v6.4s, v0.4s, v1.4s \n" /*01 11 03 13 */ \ + "trn2 v7.4s, v2.4s, v3.4s \n" /*21 31 23 33 */ \ + \ + "add %[din0_ptr], %[din0_ptr], %[stride] \n" /* din+=c*size*/ \ + "add %[din1_ptr], %[din1_ptr], %[stride] \n" /* din+=c*size*/ \ + "add %[din2_ptr], %[din2_ptr], %[stride] \n" /* din+=c*size*/ \ + "add %[din3_ptr], %[din3_ptr], %[stride] \n" /* din+=c*size*/ \ + \ + "trn1 v8.2d, v4.2d, v5.2d \n" /*00 10 20 30 */ \ + "trn1 v9.2d, v6.2d, v7.2d \n" /*01 11 21 31 */ \ + "trn2 v10.2d, v4.2d, v5.2d \n" /*02 12 22 32 */ \ + "trn2 v11.2d, v6.2d, v7.2d \n" /*03 13 23 33 */ \ + \ + "ld1 {v0.4s}, [%[din0_ptr]] \n" \ + "ld1 {v1.4s}, [%[din1_ptr]] \n" \ + "ld1 {v2.4s}, [%[din2_ptr]] \n" \ + "ld1 {v3.4s}, [%[din3_ptr]] \n" \ + \ + "subs %w[cnt], %w[cnt], #1 \n" \ + "str q8, [%[out0_ptr]], #16 \n" \ + "str q9, [%[out1_ptr]], #16 \n" \ + "str q10, [%[out2_ptr]], #16 \n" \ + "str q11, [%[out3_ptr]], #16 \n" \ + "bne 1b \n" + +#define TRANS_C8 \ + "1: \n" \ + "ld1 {v0.8b}, [%[din0_ptr]] \n" \ + "ld1 {v1.8b}, [%[din1_ptr]] \n" \ + "ld1 {v2.8b}, [%[din2_ptr]] \n" \ + "ld1 {v3.8b}, [%[din3_ptr]] \n" \ + \ + "add %[din0_ptr], %[din0_ptr], %[stride_w] \n" /* din+=c*size*/ \ + "add %[din1_ptr], %[din1_ptr], %[stride_w] \n" /* din+=c*size*/ \ + "add %[din2_ptr], %[din2_ptr], %[stride_w] \n" /* din+=c*size*/ \ + "add %[din3_ptr], %[din3_ptr], %[stride_w] \n" /* din+=c*size*/ \ + \ + "trn1 v8.8b, v0.8b, v1.8b \n" /*00 10 02 12 04 14 06 16 */ \ + "trn1 v9.8b, v2.8b, v3.8b \n" /*20 30 22 32 */ \ + "trn2 v12.8b, v0.8b, v1.8b \n" /*01 11 03 13 05 15 07 17 */ \ + "trn2 v13.8b, v2.8b, v3.8b \n" /*21 31 23 33 */ \ + \ + "ld1 {v4.8b}, [%[din0_ptr]] \n" \ + "ld1 {v5.8b}, [%[din1_ptr]] \n" \ + "ld1 {v6.8b}, [%[din2_ptr]] \n" \ + "ld1 {v7.8b}, [%[din3_ptr]] \n" \ + \ + "trn1 v10.8b, v4.8b, v5.8b \n" /*40 50 42 52 */ \ + "trn1 v11.8b, v6.8b, v7.8b \n" /*60 70 62 72 */ \ + "trn2 v14.8b, v4.8b, v5.8b \n" /*41 51 43 53 */ \ + "trn2 v15.8b, v6.8b, v7.8b \n" /*61 71 63 73 */ \ + \ + "trn1 v0.4h, v8.4h, v9.4h \n" /*00 10 20 30 04 14 24 34*/ \ + "trn1 v2.4h, v12.4h, v13.4h \n" /*01 11 21 31 05 15 25 35*/ \ + "trn1 v1.4h, v10.4h, v11.4h \n" /*40 50 60 70 44 54 64 74*/ \ + "trn1 v3.4h, v14.4h, v15.4h \n" /*41 51 61 71 45 55 65 75*/ \ + \ + "trn2 v4.4h, v8.4h, v9.4h \n" /*02 10 20 30 06 14 24 34*/ \ + "trn2 v6.4h, v12.4h, v13.4h \n" /*03 11 21 31 07 15 25 35*/ \ + "trn2 v5.4h, v10.4h, v11.4h \n" /*42 50 60 70 46 54 64 74*/ \ + "trn2 v7.4h, v14.4h, v15.4h \n" /*43 51 61 71 47 55 65 75*/ \ + \ + "trn1 v8.2s, v0.2s, v1.2s \n" /*00 10 20 30 40 50 60 70*/ \ + "trn1 v9.2s, v2.2s, v3.2s \n" /*01 11 21 31 41 51 61 71*/ \ + "trn1 v10.2s, v4.2s, v5.2s \n" /*02 12 22 32 42 50 60 70*/ \ + "trn1 v11.2s, v6.2s, v7.2s \n" /*03 13 23 33 41 51 61 71*/ \ + \ + "trn2 v12.2s, v0.2s, v1.2s \n" /*04 14 24 34 44 54 64 74*/ \ + "trn2 v13.2s, v2.2s, v3.2s \n" /*05 15 25 35 45 55 65 75*/ \ + "trn2 v14.2s, v4.2s, v5.2s \n" /*06 16 22 32 42 50 60 70*/ \ + "trn2 v15.2s, v6.2s, v7.2s \n" /*07 17 23 33 41 51 61 71*/ \ + \ + "add %[din0_ptr], %[din0_ptr], %[stride_w] \n" /* 
din+=c*size*/ \ + "add %[din1_ptr], %[din1_ptr], %[stride_w] \n" /* din+=c*size*/ \ + "add %[din2_ptr], %[din2_ptr], %[stride_w] \n" /* din+=c*size*/ \ + "add %[din3_ptr], %[din3_ptr], %[stride_w] \n" /* din+=c*size*/ \ + \ + "subs %w[cnt], %w[cnt], #1 \n" \ + "st1 {v8.8b}, [%[out0_ptr]], #8 \n" \ + "st1 {v9.8b}, [%[out1_ptr]], #8 \n" \ + "st1 {v10.8b}, [%[out2_ptr]], #8 \n" \ + "st1 {v11.8b}, [%[out3_ptr]], #8 \n" \ + \ + "st1 {v11.8b}, [%[out4_ptr]], #8 \n" \ + "st1 {v12.8b}, [%[out5_ptr]], #8 \n" \ + "st1 {v13.8b}, [%[out6_ptr]], #8 \n" \ + "st1 {v14.8b}, [%[out7_ptr]], #8 \n" \ + "bne 1b \n" + +#else +#define TRANS_C4 \ + "1: \n" \ + "vld1.32 {d0-d1}, [%[din0_ptr]] \n" \ + "vld1.32 {d2-d3}, [%[din1_ptr]] \n" \ + "vld1.32 {d4-d5}, [%[din2_ptr]] \n" \ + "vld1.32 {d6-d7}, [%[din3_ptr]] \n" \ + \ + "vtrn.32 q0, q1 \n" /*00 10 02 12 01 11 03 13*/ \ + "vtrn.32 q2, q3 \n" /*20 30 22 32 21 31 23 33 */ \ + \ + "add %[din0_ptr], %[din0_ptr], %[stride] \n" /* din+=c*size*/ \ + "add %[din1_ptr], %[din1_ptr], %[stride] \n" /* din+=c*size*/ \ + "add %[din2_ptr], %[din2_ptr], %[stride] \n" /* din+=c*size*/ \ + "add %[din3_ptr], %[din3_ptr], %[stride] \n" /* din+=c*size*/ \ + "vswp d1, d4 \n" \ + "vswp d3, d6 \n" \ + \ + "subs %[cnt], %[cnt], #1 \n" \ + "vst1.32 {d0-d1}, [%[out0_ptr]]! \n" \ + "vst1.32 {d2-d3}, [%[out1_ptr]]! \n" \ + "vst1.32 {d4-d5}, [%[out2_ptr]]! \n" \ + "vst1.32 {d6-d7}, [%[out3_ptr]]! \n" \ + "bne 1b \n" + +#define TRANS_C8 \ + "1: \n" \ + "vld1.8 d0, [%[din0_ptr]] \n" \ + "vld1.8 d1, [%[din1_ptr]] \n" \ + "vld1.8 d2, [%[din2_ptr]] \n" \ + "vld1.8 d3, [%[din3_ptr]] \n" \ + \ + "add %[din0_ptr], %[din0_ptr], %[stride_w] \n" /* din+=c*size*/ \ + "add %[din1_ptr], %[din1_ptr], %[stride_w] \n" /* din+=c*size*/ \ + "add %[din2_ptr], %[din2_ptr], %[stride_w] \n" /* din+=c*size*/ \ + "add %[din3_ptr], %[din3_ptr], %[stride_w] \n" /* din+=c*size*/ \ + \ + "vtrn.8 d0, d1 \n" /*00 10 02 12 04 14 06 16*/ \ + "vtrn.8 d2, d3 \n" /*20 30 22 32 24 34 26 36 */ \ + \ + "vld1.8 d4, [%[din0_ptr]] \n" \ + "vld1.8 d5, [%[din1_ptr]] \n" \ + "vld1.8 d6, [%[din2_ptr]] \n" \ + "vld1.8 d7, [%[din3_ptr]] \n" \ + \ + "vtrn.16 d0, d2 \n" /*00 10 20 30 04 14 24 34*/ \ + "vtrn.16 d1, d3 \n" /* 01 11 21 31 05 15 25 35 */ \ + "vtrn.8 d4, d5 \n" /*40 50 02 12 04 14 06 16*/ \ + "vtrn.8 d6, d7 \n" /*60 70 22 32 24 34 26 36 */ \ + \ + "add %[din0_ptr], %[din0_ptr], %[stride_w] \n" /* din+=c*size*/ \ + "add %[din1_ptr], %[din1_ptr], %[stride_w] \n" /* din+=c*size*/ \ + "add %[din2_ptr], %[din2_ptr], %[stride_w] \n" /* din+=c*size*/ \ + "add %[din3_ptr], %[din3_ptr], %[stride_w] \n" /* din+=c*size*/ \ + \ + "vtrn.16 d4, d6 \n" /*40 50 60 70 04 14 24 34*/ \ + "vtrn.16 d5, d7 \n" /* 41 51 61 71 05 15 25 35 */ \ + \ + "vtrn.32 d0, d4 \n" /*00 10 20 30 40 50 60 70*/ \ + "vtrn.32 d1, d5 \n" /* 01 11 21 31 41 51 61 71 */ \ + "vtrn.32 d2, d6 \n" /*02 12 22 32 42 52 62 72*/ \ + "vtrn.32 d3, d7 \n" /* 03 11 21 33 43 53 63 73 */ \ + \ + "subs %[cnt], %[cnt], #1 \n" \ + "vst1.8 {d0}, [%[out0_ptr]]! \n" \ + "vst1.8 {d1}, [%[out1_ptr]]! \n" \ + "vst1.8 {d2}, [%[out2_ptr]]! \n" \ + "vst1.8 {d3}, [%[out3_ptr]]! \n" \ + "vst1.8 {d4}, [%[out4_ptr]]! \n" \ + "vst1.8 {d5}, [%[out5_ptr]]! \n" \ + "vst1.8 {d6}, [%[out6_ptr]]! \n" \ + "vst1.8 {d7}, [%[out7_ptr]]! 
\n" \ + "bne 1b \n" + +#endif +template <> +void NCHW2NHWC(int N, int C, int size, const float* X, float* Y) { + int cnt = C >> 2; + int remain = C % 4; + int sum = C * size; + int stride = size << 4; // 4 * size + int stride_w = stride >> 2; + for (int n = 0; n < N; n++) { + const float* din = X + n * sum; + float* dout = Y + n * sum; + int s = 0; +#pragma omp parallel for + for (s = 0; s < size - 3; s += 4) { + const float* din0_ptr = din + s; + const float* din1_ptr = din0_ptr + size; + const float* din2_ptr = din1_ptr + size; + const float* din3_ptr = din2_ptr + size; + float* out0_ptr = dout + s * C; + float* out1_ptr = out0_ptr + C; + float* out2_ptr = out1_ptr + C; + float* out3_ptr = out2_ptr + C; + int cnt_num = cnt; + if (cnt_num > 0) { +#ifdef __aarch64__ + asm volatile(TRANS_C4 + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [din3_ptr] "+r"(din3_ptr), + [out0_ptr] "+r"(out0_ptr), + [out1_ptr] "+r"(out1_ptr), + [out2_ptr] "+r"(out2_ptr), + [out3_ptr] "+r"(out3_ptr), + [cnt] "+r"(cnt_num), + [stride] "+r"(stride) + : + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12"); +#else + asm volatile(TRANS_C4 + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [din3_ptr] "+r"(din3_ptr), + [out0_ptr] "+r"(out0_ptr), + [out1_ptr] "+r"(out1_ptr), + [out2_ptr] "+r"(out2_ptr), + [out3_ptr] "+r"(out3_ptr), + [cnt] "+r"(cnt_num), + [stride] "+r"(stride) + : + : "cc", "memory", "q0", "q1", "q2", "q3"); +#endif + } + for (int i = 0; i < remain; i++) { + const float* ptr = din0_ptr; + *out0_ptr++ = *ptr++; + *out1_ptr++ = *ptr++; + *out2_ptr++ = *ptr++; + *out3_ptr++ = *ptr++; + din0_ptr += size; + } + } + // remain size + for (; s < size; s++) { + const float* din0_ptr = din + s; + const float* din1_ptr = din0_ptr + size; + const float* din2_ptr = din1_ptr + size; + const float* din3_ptr = din2_ptr + size; + float* out0_ptr = dout + s * C; + for (int i = 0; i < cnt; i++) { + *out0_ptr++ = *din0_ptr; + *out0_ptr++ = *din1_ptr; + *out0_ptr++ = *din2_ptr; + *out0_ptr++ = *din3_ptr; + din0_ptr += stride_w; + din1_ptr += stride_w; + din2_ptr += stride_w; + din3_ptr += stride_w; + } + for (int i = 0; i < remain; i++) { + *out0_ptr++ = *din0_ptr; + din0_ptr += size; + } + } + } +} +template <> +void NCHW2NHWC(int N, int C, int size, const int8_t* X, int8_t* Y) { + int cnt = C >> 3; + int remain = C % 8; + int sum = C * size; + int stride = size << 3; // 8 * size + int stride_w = size << 4; // 4 * size * 4 + for (int n = 0; n < N; n++) { + const int8_t* din = X + n * sum; + int8_t* dout = Y + n * sum; + int s = 0; +#pragma omp parallel for + for (s = 0; s < size - 7; s += 8) { + const int8_t* din0_ptr = din + s; + const int8_t* din1_ptr = din0_ptr + size; + const int8_t* din2_ptr = din1_ptr + size; + const int8_t* din3_ptr = din2_ptr + size; + int8_t* out0_ptr = dout + s * C; + int8_t* out1_ptr = out0_ptr + C; + int8_t* out2_ptr = out1_ptr + C; + int8_t* out3_ptr = out2_ptr + C; + int8_t* out4_ptr = out3_ptr + C; + int8_t* out5_ptr = out4_ptr + C; + int8_t* out6_ptr = out5_ptr + C; + int8_t* out7_ptr = out6_ptr + C; + int cnt_num = cnt; + if (cnt_num > 0) { +#ifdef __aarch64__ + asm volatile(TRANS_C8 + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [din3_ptr] "+r"(din3_ptr), + [out0_ptr] "+r"(out0_ptr), + [out1_ptr] "+r"(out1_ptr), + [out2_ptr] "+r"(out2_ptr), + [out3_ptr] "+r"(out3_ptr), + [out4_ptr] 
"+r"(out4_ptr), + [out5_ptr] "+r"(out5_ptr), + [out6_ptr] "+r"(out6_ptr), + [out7_ptr] "+r"(out7_ptr), + [cnt] "+r"(cnt_num), + [stride_w] "+r"(stride_w) + : + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15"); +#else + asm volatile(TRANS_C8 + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [din3_ptr] "+r"(din3_ptr), + [out0_ptr] "+r"(out0_ptr), + [out1_ptr] "+r"(out1_ptr), + [out2_ptr] "+r"(out2_ptr), + [out3_ptr] "+r"(out3_ptr), + [out4_ptr] "+r"(out4_ptr), + [out5_ptr] "+r"(out5_ptr), + [out6_ptr] "+r"(out6_ptr), + [out7_ptr] "+r"(out7_ptr), + [cnt] "+r"(cnt_num), + [stride_w] "+r"(stride_w) + : + : "cc", "memory", "q0", "q1", "q2", "q3"); +#endif + } + // const int8_t* din_ptr = din + 8 * cnt * size + s; // remain channel + for (int i = 0; i < remain; i++) { + const int8_t* ptr = din0_ptr; + *out0_ptr = *ptr++; + *out1_ptr = *ptr++; + *out2_ptr = *ptr++; + *out3_ptr = *ptr++; + din0_ptr += size; + *out4_ptr = *ptr++; + *out5_ptr = *ptr++; + *out6_ptr = *ptr++; + *out7_ptr = *ptr++; + } + } + // remain size + for (; s < size; s++) { + const int8_t* din0_ptr = din + s; + const int8_t* din1_ptr = din0_ptr + size; + const int8_t* din2_ptr = din1_ptr + size; + const int8_t* din3_ptr = din2_ptr + size; + const int8_t* din4_ptr = din3_ptr + size; + const int8_t* din5_ptr = din4_ptr + size; + const int8_t* din6_ptr = din5_ptr + size; + const int8_t* din7_ptr = din6_ptr + size; + int8_t* out0_ptr = dout + s * C; + for (int i = 0; i < cnt; i++) { + *out0_ptr++ = *din0_ptr; + *out0_ptr++ = *din1_ptr; + *out0_ptr++ = *din2_ptr; + *out0_ptr++ = *din3_ptr; + *out0_ptr++ = *din4_ptr; + *out0_ptr++ = *din5_ptr; + *out0_ptr++ = *din6_ptr; + *out0_ptr++ = *din7_ptr; + din0_ptr += stride; + din1_ptr += stride; + din2_ptr += stride; + din3_ptr += stride; + din4_ptr += stride; + din5_ptr += stride; + din6_ptr += stride; + din7_ptr += stride; + } + for (int i = 0; i < remain; i++) { + *out0_ptr++ = *din0_ptr; + din0_ptr += size; + } + } + } +} +template <> +void NHWC2NCHW(int N, int C, int size, const float* X, float* Y) { + int cnt = size >> 2; + int remain = size % 4; + int sum = C * size; + int stride = C << 4; // 4 * size + int stride_w = C << 2; + for (int n = 0; n < N; n++) { + const float* din = X + n * sum; + float* dout = Y + n * sum; + int s = 0; +#pragma omp parallel for + for (s = 0; s < C - 3; s += 4) { + const float* din0_ptr = din + s; + const float* din1_ptr = din0_ptr + C; + const float* din2_ptr = din1_ptr + C; + const float* din3_ptr = din2_ptr + C; + float* out0_ptr = dout + s * size; + float* out1_ptr = out0_ptr + size; + float* out2_ptr = out1_ptr + size; + float* out3_ptr = out2_ptr + size; + int cnt_num = cnt; + if (cnt_num > 0) { +#ifdef __aarch64__ + asm volatile(TRANS_C4 + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [din3_ptr] "+r"(din3_ptr), + [out0_ptr] "+r"(out0_ptr), + [out1_ptr] "+r"(out1_ptr), + [out2_ptr] "+r"(out2_ptr), + [out3_ptr] "+r"(out3_ptr), + [cnt] "+r"(cnt_num), + [stride] "+r"(stride) + : + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11"); +#else + asm volatile(TRANS_C4 + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [din3_ptr] "+r"(din3_ptr), + [out0_ptr] "+r"(out0_ptr), + [out1_ptr] "+r"(out1_ptr), + [out2_ptr] "+r"(out2_ptr), + [out3_ptr] "+r"(out3_ptr), + [cnt] 
"+r"(cnt_num), + [stride] "+r"(stride) + : + : "cc", "memory", "q0", "q1", "q2", "q3"); +#endif + } + for (int i = 0; i < remain; i++) { + const float* ptr = din0_ptr; + *out0_ptr++ = *ptr++; + *out1_ptr++ = *ptr++; + *out2_ptr++ = *ptr++; + *out3_ptr++ = *ptr++; + din0_ptr += C; + } + } + // remain size + for (; s < C; s++) { + const float* din0_ptr = din + s; + const float* din1_ptr = din0_ptr + C; + const float* din2_ptr = din1_ptr + C; + const float* din3_ptr = din2_ptr + C; + float* out0_ptr = dout + s * size; + for (int i = 0; i < cnt; i++) { + *out0_ptr++ = *din0_ptr; + *out0_ptr++ = *din1_ptr; + *out0_ptr++ = *din2_ptr; + *out0_ptr++ = *din3_ptr; + din0_ptr += stride_w; + din1_ptr += stride_w; + din2_ptr += stride_w; + din3_ptr += stride_w; + } + for (int i = 0; i < remain; i++) { + *out0_ptr++ = *din0_ptr; + din0_ptr += C; + } + } + } +} +template <> +void NHWC2NCHW(int N, int C, int size, const int8_t* X, int8_t* Y) { + int cnt = size >> 3; + int remain = size % 8; + int sum = C * size; + int stride = C << 3; // 8 * size + int stride_w = C << 4; // 4 * size + for (int n = 0; n < N; n++) { + const int8_t* din = X + n * sum; + int8_t* dout = Y + n * sum; + int s = 0; +#pragma omp parallel for + for (s = 0; s < C - 7; s += 8) { + const int8_t* din0_ptr = din + s; + const int8_t* din1_ptr = din0_ptr + C; + const int8_t* din2_ptr = din1_ptr + C; + const int8_t* din3_ptr = din2_ptr + C; + const int8_t* din4_ptr = din3_ptr + C; + const int8_t* din5_ptr = din4_ptr + C; + const int8_t* din6_ptr = din5_ptr + C; + const int8_t* din7_ptr = din6_ptr + C; + int8_t* out0_ptr = dout + s * size; + int8_t* out1_ptr = out0_ptr + size; + int8_t* out2_ptr = out1_ptr + size; + int8_t* out3_ptr = out2_ptr + size; + int8_t* out4_ptr = out3_ptr + size; + int8_t* out5_ptr = out4_ptr + size; + int8_t* out6_ptr = out5_ptr + size; + int8_t* out7_ptr = out6_ptr + size; + int cnt_num = cnt; + if (cnt_num > 0) { +#ifdef __aarch64__ + asm volatile(TRANS_C8 + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [din3_ptr] "+r"(din3_ptr), + [out0_ptr] "+r"(out0_ptr), + [out1_ptr] "+r"(out1_ptr), + [out2_ptr] "+r"(out2_ptr), + [out3_ptr] "+r"(out3_ptr), + [out4_ptr] "+r"(out4_ptr), + [out5_ptr] "+r"(out5_ptr), + [out6_ptr] "+r"(out6_ptr), + [out7_ptr] "+r"(out7_ptr), + [cnt] "+r"(cnt_num), + [stride_w] "+r"(stride_w) + : + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15"); +#else + asm volatile(TRANS_C8 + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [din3_ptr] "+r"(din3_ptr), + [out0_ptr] "+r"(out0_ptr), + [out1_ptr] "+r"(out1_ptr), + [out2_ptr] "+r"(out2_ptr), + [out3_ptr] "+r"(out3_ptr), + [out4_ptr] "+r"(out4_ptr), + [out5_ptr] "+r"(out5_ptr), + [out6_ptr] "+r"(out6_ptr), + [out7_ptr] "+r"(out7_ptr), + [cnt] "+r"(cnt_num), + [stride_w] "+r"(stride_w) + : + : "cc", "memory", "q0", "q1", "q2", "q3"); +#endif + } + for (int i = 0; i < remain; i++) { + const int8_t* ptr = din0_ptr; + *out0_ptr++ = *ptr++; + *out1_ptr++ = *ptr++; + *out2_ptr++ = *ptr++; + *out3_ptr++ = *ptr++; + *out4_ptr++ = *ptr++; + *out5_ptr++ = *ptr++; + *out6_ptr++ = *ptr++; + *out7_ptr++ = *ptr++; + din0_ptr += C; + } + } + // remain size + for (; s < C; s++) { + const int8_t* din0_ptr = din + s; + const int8_t* din1_ptr = din0_ptr + C; + const int8_t* din2_ptr = din1_ptr + C; + const int8_t* din3_ptr = din2_ptr + C; + const int8_t* din4_ptr = din3_ptr + C; + 
const int8_t* din5_ptr = din4_ptr + C; + const int8_t* din6_ptr = din5_ptr + C; + const int8_t* din7_ptr = din6_ptr + C; + int8_t* out0_ptr = dout + s * size; + for (int i = 0; i < cnt; i++) { + *out0_ptr++ = *din0_ptr; + *out0_ptr++ = *din1_ptr; + *out0_ptr++ = *din2_ptr; + *out0_ptr++ = *din3_ptr; + *out0_ptr++ = *din4_ptr; + *out0_ptr++ = *din5_ptr; + *out0_ptr++ = *din6_ptr; + *out0_ptr++ = *din7_ptr; + din0_ptr += stride; + din1_ptr += stride; + din2_ptr += stride; + din3_ptr += stride; + din4_ptr += stride; + din5_ptr += stride; + din6_ptr += stride; + din7_ptr += stride; + } + for (int i = 0; i < remain; i++) { + *out0_ptr++ = *din0_ptr; + din0_ptr += C; + } + } + } +} + +} // namespace math +} // namespace arm +} // namespace lite +} // namespace paddle diff --git a/lite/backends/arm/math/layout.h b/lite/backends/arm/math/layout.h new file mode 100644 index 0000000000000000000000000000000000000000..ed0e2f8b78a280c513161a02bb3b3b479008145a --- /dev/null +++ b/lite/backends/arm/math/layout.h @@ -0,0 +1,30 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +namespace paddle { +namespace lite { +namespace arm { +namespace math { +template +void NCHW2NHWC(int N, int C, int HxW, const T* X, T* Y); + +template +void NHWC2NCHW(int N, int C, int HxW, const T* X, T* Y); + +} // namespace math +} // namespace arm +} // namespace lite +} // namespace paddle diff --git a/lite/backends/arm/math/packed_sgemm.cc b/lite/backends/arm/math/packed_sgemm.cc index 0d6eed9904902aa9539caf95172b0e4109e11f7d..092e6937c4fd4237410ff29565f418423494507f 100644 --- a/lite/backends/arm/math/packed_sgemm.cc +++ b/lite/backends/arm/math/packed_sgemm.cc @@ -53,6 +53,38 @@ void sgemm_prepacked_8x12(bool is_transB, bool has_bias, bool has_relu, ARMContext *ctx); + +void pack_m4(float *out, + const float *in, + float alpha, + int ldin, + int m0, + int mmax, + int k0, + int kmax); + +void pack_trans_m4(float *out, + const float *in, + float alpha, + int ldin, + int m0, + int mmax, + int k0, + int kmax); +void sgemm_prepacked_4x4(bool is_transB, + int M, + int N, + int K, + const float *A_packed, + const float *B, + int ldb, + float beta, + float *C, + int ldc, + const float *bias, + bool has_bias, + bool has_relu, + ARMContext *ctx); #else // for kA72 void prepackA_6x8(float *out, @@ -139,13 +171,21 @@ void prepackA(float *out, bool is_trans, ARMContext *ctx) { #ifdef __aarch64__ - if (is_trans) { - prepackA_trans_8x12(out, in, alpha, ldin, m0, mmax, k0, kmax); + if (mmax <= 4) { + if (is_trans) { + pack_trans_m4(out, in, alpha, ldin, m0, mmax, k0, kmax); + } else { + pack_m4(out, in, alpha, ldin, m0, mmax, k0, kmax); + } } else { - prepackA_8x12(out, in, alpha, ldin, m0, mmax, k0, kmax); + if (is_trans) { + prepackA_trans_8x12(out, in, alpha, ldin, m0, mmax, k0, kmax); + } else { + prepackA_8x12(out, in, alpha, ldin, m0, mmax, k0, kmax); + } } #else - if (ctx->arch() == kA73) { + if (ctx->arch() == kA73 || mmax <= 4) { if 
(is_trans) { prepackA_trans_4x8(out, in, alpha, ldin, m0, mmax, k0, kmax); } else { @@ -212,22 +252,39 @@ void sgemm_prepack(bool is_transB, bool has_relu, ARMContext *ctx) { #ifdef __aarch64__ - sgemm_prepacked_8x12(is_transB, - M, - N, - K, - A_packed, - B, - ldb, - beta, - C, - ldc, - bias, - has_bias, - has_relu, - ctx); + if (M <= 4) { + sgemm_prepacked_4x4(is_transB, + M, + N, + K, + A_packed, + B, + ldb, + beta, + C, + ldc, + bias, + has_bias, + has_relu, + ctx); + } else { + sgemm_prepacked_8x12(is_transB, + M, + N, + K, + A_packed, + B, + ldb, + beta, + C, + ldc, + bias, + has_bias, + has_relu, + ctx); + } #else // armv7 - if (ctx->arch() == kA73) { + if (ctx->arch() == kA73 || M <= 4) { sgemm_prepacked_4x8(is_transB, M, N, @@ -522,6 +579,147 @@ void prepackA_8x12(float *dout, } } } +void pack_m4(float *dout, + const float *inptr, + float alpha, + int ldin, + int m0, + int mmax, + int k0, + int kmax) { + int x_len = kmax - k0; + int stride = x_len * 4; + float zerobuff[x_len]; // NOLINT + memset(zerobuff, 0, sizeof(float) * x_len); + bool has_alpha = fabsf(alpha - 1.f) > 1e-8f; + +#pragma omp parallel for + for (int y = m0; y < mmax; y += 4) { + float *outptr = dout + stride * (y - m0) / 4; + + const float *inptr0 = inptr + y * ldin + k0; + const float *inptr1 = inptr0 + ldin; + const float *inptr2 = inptr1 + ldin; + const float *inptr3 = inptr2 + ldin; + + asm volatile( + "prfm pldl1keep, [%[ptr0]] \n" + "prfm pldl1keep, [%[ptr0], #64] \n" + "prfm pldl1keep, [%[ptr1]] \n" + "prfm pldl1keep, [%[ptr1], #64] \n" + "prfm pldl1keep, [%[ptr2]] \n" + "prfm pldl1keep, [%[ptr2], #64] \n" + "prfm pldl1keep, [%[ptr3]] \n" + "prfm pldl1keep, [%[ptr3], #64] \n" + : + : [ptr0] "r"(inptr0), + [ptr1] "r"(inptr1), + [ptr2] "r"(inptr2), + [ptr3] "r"(inptr3) + : "memory"); + + int x = x_len; + //! cope with row index exceed real size, set to zero buffer + if ((y + 3) >= mmax) { + switch ((y + 3) - mmax) { + case 2: + inptr1 = zerobuff; + case 1: + inptr2 = zerobuff; + case 0: + inptr3 = zerobuff; + default: + break; + } + } + for (; x > 7; x -= 8) { + asm volatile( + "cbz %w[has_alpha], 0f\n" /* check alpha == 1.f? 
*/ + "dup v31.4s, %w[alpha]\n" /* alpha to vector */ + "ldp q0, q1, [%[inptr0]], #32\n" /* load r0, a0~a7 */ + "ldp q2, q3, [%[inptr1]], #32\n" /* load r1, b0~b7 */ + "fmul v0.4s, v31.4s, v0.4s\n" /* mul alpha */ + "fmul v1.4s, v31.4s, v1.4s\n" /* mul alpha */ + "ldp q4, q5, [%[inptr2]], #32\n" /* load r2, c0~c7 */ + "fmul v2.4s, v31.4s, v2.4s\n" /* mul alpha */ + "fmul v3.4s, v31.4s, v3.4s\n" /* mul alpha */ + "ldp q6, q7, [%[inptr3]], #32\n" /* load r3, d0~d7 */ + "fmul v4.4s, v31.4s, v4.4s\n" /* mul alpha */ + "fmul v5.4s, v31.4s, v5.4s\n" /* mul alpha */ + "fmul v6.4s, v31.4s, v6.4s\n" /* mul alpha */ + "fmul v7.4s, v31.4s, v7.4s\n" /* mul alpha */ + "b 1f\n" /* to main process */ + "0: \n" /* alpha == 1 */ + "ldp q0, q1, [%[inptr0]], #32\n" /* load r0, a0~a7 */ + "ldp q2, q3, [%[inptr1]], #32\n" /* load r1, b0~b7 */ + "ldp q4, q5, [%[inptr2]], #32\n" /* load r2, c0~c7 */ + "ldp q6, q7, [%[inptr3]], #32\n" /* load r3, d0~d7 */ + "1: \n" /* main process */ + "trn1 v8.4s, v0.4s, v2.4s\n" /* a0b0a2b2*/ + "trn2 v9.4s, v0.4s, v2.4s\n" /* a1b1a3b3*/ + "trn1 v10.4s, v1.4s, v3.4s\n" /* a4b4a6b6*/ + "trn2 v11.4s, v1.4s, v3.4s\n" /* a5b5a7b7*/ + + "trn1 v12.4s, v4.4s, v6.4s\n" /* c0d0c2d2*/ + "trn2 v13.4s, v4.4s, v6.4s\n" /* c1d1c3d3*/ + "trn1 v14.4s, v5.4s, v7.4s\n" /* c4d4c6d6*/ + "trn2 v15.4s, v5.4s, v7.4s\n" /* c5d5c7d7*/ + + "trn1 v0.2d, v8.2d, v12.2d\n" /* a0b0c0d0 */ + "trn1 v1.2d, v9.2d, v13.2d\n" /* a1b1c1d1 */ + "trn1 v2.2d, v10.2d, v14.2d\n" /* a4b4c4d4 */ + "trn1 v3.2d, v11.2d, v15.2d\n" /* a5b5c5d5 */ + + "trn2 v4.2d, v8.2d, v12.2d\n" /* a2b2c2d2 */ + "trn2 v5.2d, v9.2d, v13.2d\n" /* a3b3c3d3 */ + "stp q0, q1, [%[outptr]], #32\n" /* save q0, q1, a0~h0*/ + "trn2 v6.2d, v10.2d, v14.2d\n" /* a6b6c6d6 */ + "trn2 v7.2d, v11.2d, v15.2d\n" /* a7b7c7d7 */ + "stp q4, q5, [%[outptr]], #32\n" /* save q2, q3, a1~h1*/ + "stp q2, q3, [%[outptr]], #32\n" /* save q4, q5, a2~h2*/ + "stp q6, q7, [%[outptr]], #32\n" /* save q6, q7, a3~h3*/ + + : [inptr0] "+r"(inptr0), + [inptr1] "+r"(inptr1), + [inptr2] "+r"(inptr2), + [inptr3] "+r"(inptr3), + [outptr] "+r"(outptr) + : [alpha] "r"(alpha), [has_alpha] "r"(has_alpha) + : "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "cc", + "memory"); + } + + for (; x > 0; x--) { + if (has_alpha) { + *outptr++ = *inptr0++ * alpha; + *outptr++ = *inptr1++ * alpha; + *outptr++ = *inptr2++ * alpha; + *outptr++ = *inptr3++ * alpha; + } else { + *outptr++ = *inptr0++; + *outptr++ = *inptr1++; + *outptr++ = *inptr2++; + *outptr++ = *inptr3++; + } + } + } +} void prepackA_trans_8x12(float *outptr, const float *in, @@ -682,6 +880,128 @@ void prepackA_trans_8x12(float *outptr, } } } +void pack_trans_m4(float *outptr, + const float *in, + float alpha, + int ldin, + int m0, + int mmax, + int k0, + int kmax) { + auto inptr = in + k0 * ldin + m0; + uint32_t mask_buffer[4] = {0, 1, 2, 3}; + int x_len = mmax - m0; + int y_len = kmax - k0; + int right_remain = x_len - 4 * (x_len / 4); + int stride_out = 4 * y_len; + + float32x4_t vzero = vdupq_n_f32(0.f); + uint32x4_t vmask1 = + vcltq_u32(vld1q_u32(mask_buffer), vdupq_n_u32(right_remain)); + + bool has_alpha = fabsf(alpha - 1.f) > 1e-8f; + float32x4_t valpha = vdupq_n_f32(alpha); + +#pragma omp parallel for + for (int y = 0; y < y_len - 3; y += 4) { + const float *ptr0 = inptr + y * ldin; + const float *ptr1 = ptr0 + ldin; + const float *ptr2 = ptr1 + ldin; + const float *ptr3 = ptr2 + ldin; + + asm volatile( + "prfm pldl1keep, [%[ptr0]] \n" + "prfm 
pldl1keep, [%[ptr0], #64] \n" + "prfm pldl1keep, [%[ptr1]] \n" + "prfm pldl1keep, [%[ptr1], #64] \n" + "prfm pldl1keep, [%[ptr2]] \n" + "prfm pldl1keep, [%[ptr2], #64] \n" + "prfm pldl1keep, [%[ptr3]] \n" + "prfm pldl1keep, [%[ptr3], #64] \n" + : + : [ptr0] "r"(ptr0), [ptr1] "r"(ptr1), [ptr2] "r"(ptr2), [ptr3] "r"(ptr3) + : "memory"); + + float *outptr_row_col = outptr + y * 4; + int i = 0; + for (; i < x_len - 3; i += 4) { + float32x4_t vr00 = vld1q_f32(ptr0); + float32x4_t vr10 = vld1q_f32(ptr1); + float32x4_t vr20 = vld1q_f32(ptr2); + float32x4_t vr30 = vld1q_f32(ptr3); + if (has_alpha) { + vr00 = vmulq_f32(vr00, valpha); + vr10 = vmulq_f32(vr10, valpha); + vr20 = vmulq_f32(vr20, valpha); + vr30 = vmulq_f32(vr30, valpha); + } + + vst1q_f32(outptr_row_col, vr00); + vst1q_f32(outptr_row_col + 4, vr10); + vst1q_f32(outptr_row_col + 8, vr20); + vst1q_f32(outptr_row_col + 12, vr30); + + ptr0 += 4; + ptr1 += 4; + ptr2 += 4; + ptr3 += 4; + + outptr_row_col += stride_out; + } + if (right_remain > 0) { + float32x4_t vr00 = vld1q_f32(ptr0); + float32x4_t vr10 = vld1q_f32(ptr1); + float32x4_t vr20 = vld1q_f32(ptr2); + float32x4_t vr30 = vld1q_f32(ptr3); + + if (has_alpha) { + vr00 = vmulq_f32(vr00, valpha); + vr10 = vmulq_f32(vr10, valpha); + vr20 = vmulq_f32(vr20, valpha); + vr30 = vmulq_f32(vr30, valpha); + } + + float32x4_t vr00_1 = vbslq_f32(vmask1, vr00, vzero); + float32x4_t vr10_1 = vbslq_f32(vmask1, vr10, vzero); + float32x4_t vr20_1 = vbslq_f32(vmask1, vr20, vzero); + float32x4_t vr30_1 = vbslq_f32(vmask1, vr30, vzero); + + vst1q_f32(outptr_row_col, vr00_1); + vst1q_f32(outptr_row_col + 4, vr10_1); + vst1q_f32(outptr_row_col + 8, vr20_1); + vst1q_f32(outptr_row_col + 12, vr30_1); + } + } + +#pragma omp parallel for + for (int y = 4 * (y_len / 4); y < y_len; ++y) { + const float *ptr0 = inptr + y * ldin; + float *outptr_row_col = outptr + y * 4; + int i = 0; + for (; i < x_len - 3; i += 4) { + float32x4_t vr0 = vld1q_f32(ptr0); + if (has_alpha) { + vr0 = vmulq_f32(vr0, valpha); + } + vst1q_f32(outptr_row_col, vr0); + + ptr0 += 4; + + outptr_row_col += stride_out; + } + if (right_remain > 0) { + float32x4_t vr0 = vld1q_f32(ptr0); + + if (has_alpha) { + vr0 = vmulq_f32(vr0, valpha); + } + + float32x4_t vr0_1 = vbslq_f32(vmask1, vr0, vzero); + + vst1q_f32(outptr_row_col, vr0_1); + } + } +} #else // __aarch64__ void prepackA_6x8(float* outptr, @@ -2592,6 +2912,292 @@ void sgemm_prepacked_8x12(bool is_transB, } } } + +void sgemm_prepacked_4x4(bool is_transB, + int M, + int N, + int K, + const float *A_packed, + const float *B, + int ldb, + float beta, + float *C, + int ldc, + const float *bias, + bool has_bias, + bool has_relu, + ARMContext *ctx) { + size_t l2_cache = ctx->llc_size() > 0 ? ctx->llc_size() : 512 * 1024; + auto workspace = ctx->workspace_data(); + int threads = ctx->threads(); + + const int n_block = 4; + const int m_block = 4; + //! MBLOCK * x (result) + MBLOCK * k (A) + x * k (B) = l2 + int x_block = (l2_cache - (m_block * K)) / (sizeof(float) * (K + m_block)); + x_block /= n_block; + x_block *= n_block; + int x_num = (N + (x_block - 1)) / x_block; + x_block = (N + x_num - 1) / x_num; + x_block = (x_block + n_block - 1) / n_block; + x_block *= n_block; + x_block = x_block < n_block ? n_block : x_block; + + // unroll 2 loop + int tail_pre = (K & (KBLOCK - 1)); + int k_pre = ((K + KBLOCK - 1) / KBLOCK) - 1; + if (tail_pre == 0) { + tail_pre = KBLOCK; + } + + bool flag_p_remain = false; + int remain = 0; + + int has_beta = fabsf(beta) > 1e-8f ? 1 : 0; + //! 
apanel is pre_compute outside gemm + for (unsigned int x0 = 0; x0 < N; x0 += x_block) { + unsigned int xmax = x0 + x_block; + if (xmax > N) { + xmax = N; + } + int bblocks = (xmax - x0 + n_block - 1) / n_block; + remain = xmax - x0 - (bblocks - 1) * n_block; + if (remain > 0) { + flag_p_remain = true; + } + //! load bpanel + float *b_pannel = workspace; + if (is_transB) { + pack_m4(b_pannel, B, 1.0f, ldb, x0, xmax, 0, K); + } else { + pack_trans_m4(b_pannel, B, 1.0f, ldb, x0, xmax, 0, K); + } +#pragma omp parallel for num_threads(threads) + for (unsigned int y = 0; y < M; y += m_block) { + unsigned int ymax = y + m_block; + if (ymax > M) { + ymax = M; + } + + float bias_local[4] = {0}; + if (has_bias) { + bias_local[0] = bias[y]; + bias_local[1] = bias[y + 1]; + bias_local[2] = bias[y + 2]; + bias_local[3] = bias[y + 3]; + } + + float cout0[n_block]; // NOLINT + float cout1[n_block]; // NOLINT + float cout2[n_block]; // NOLINT + float cout3[n_block]; // NOLINT + + float *c_ptr0 = C + y * ldc + x0; + float *c_ptr1 = c_ptr0 + ldc; + float *c_ptr2 = c_ptr1 + ldc; + float *c_ptr3 = c_ptr2 + ldc; + + float *pout0 = c_ptr0; + float *pout1 = c_ptr1; + float *pout2 = c_ptr2; + float *pout3 = c_ptr3; + + const float *a_ptr_l = A_packed + y * K; + const float *b_ptr_l = b_pannel; + for (int xb = 0; xb < bblocks; xb++) { + if ((y + 3) >= ymax) { + switch ((y + 3) - ymax) { + case 2: + c_ptr1 = cout1; + case 1: + c_ptr2 = cout2; + case 0: + c_ptr3 = cout3; + default: + break; + } + } + if (flag_p_remain && (xb == bblocks - 1)) { + pout0 = c_ptr0; + pout1 = c_ptr1; + pout2 = c_ptr2; + pout3 = c_ptr3; + + c_ptr0 = cout0; + c_ptr1 = cout1; + c_ptr2 = cout2; + c_ptr3 = cout3; + if (has_beta) { + for (int i = 0; i < remain; ++i) { + cout0[i] = pout0[i]; + cout1[i] = pout1[i]; + cout2[i] = pout2[i]; + cout3[i] = pout3[i]; + } + } + } + const float *a_ptr = a_ptr_l; + const float *b_ptr = b_ptr_l + xb * K * 4; + int tail = tail_pre; + int k = k_pre; + // clang-format off + asm volatile( + "prfm pldl1keep, [%[a_ptr]]\n" /* preload a*/ + "ld1 {v2.4s}, [%[bias_ptr]]\n" /* load bias to q2, q3*/ + "dup v8.4s, v2.s[0]\n" /* out0 = 0 */ + "prfm pldl1keep, [%[b_ptr]]\n" /* preload b*/ + "dup v9.4s, v2.s[1]\n" /* out1 = 0*/ + "prfm pldl1keep, [%[a_ptr], #64]\n" /* preload a*/ + "dup v10.4s, v2.s[2]\n" /* out2 = 0*/ + "prfm pldl1keep, [%[b_ptr], #64]\n" /* preload b*/ + "dup v11.4s, v2.s[3]\n" /* out3 = 0*/ + "cbz %w[has_beta], 0f\n" /* check beta == 0? 
*/ + /* process beta */ + "dup v7.4s, %w[beta]\n" /* beta to vector */ + "ld1 {v0.4s}, [%[c_ptr0]]\n" /* load output r0 */ + "ld1 {v1.4s}, [%[c_ptr1]]\n" /* load output r1 */ + "fmla v8.4s, v0.4s, v7.4s\n" /* cr00 += beta * c_r00*/ + "fmla v9.4s, v1.4s, v7.4s\n" /* cr10 += beta * c_r10*/ + "ld1 {v2.4s}, [%[c_ptr2]]\n" + "ld1 {v3.4s}, [%[c_ptr3]]\n" + "fmla v10.4s, v2.4s, v7.4s\n" /* cr20 += beta * c_r20*/ + "fmla v11.4s, v3.4s, v7.4s\n" /* cr30 += beta * c_r30*/ + + "0: \n" /* check loop count */ + "ldp q0, q1, [%[a_ptr]], #32\n" /* load a00,a10 to q0, q1*/ + "ldp q4, q5, [%[b_ptr]], #32\n" /* load b0, b1 to q4, q5*/ + "cbz %w[k], 2f\n" /* check loop count > 0 */ + /* main loop */ + /* unrool 0*/ + "1:\n" /* main loop */ + "fmla v8.4s, v4.4s, v0.s[0]\n" /* out0 = b0 * a00[0], b0 =q4 */ + "fmla v9.4s, v4.4s, v0.s[1]\n" /* out1 = b0 * a00[1], b0 =q4 */ + "ldp q6, q7, [%[b_ptr]], #32\n" /* load b2, b3 to q6, q7 */ + "fmla v10.4s, v4.4s, v0.s[2]\n" /* out2 = b0 * a00[2], b0 =q4 */ + "fmla v11.4s, v4.4s, v0.s[3]\n" /* out3 = b0 * a00[3], b0 =q4 */ + + "ldp q2, q3, [%[a_ptr]], #32\n" /* load a20, a30 to q2, q3 */ + "fmla v8.4s, v5.4s, v1.s[0]\n" /* out0 = b1 * a10[0], b1 =q5 */ + "fmla v9.4s, v5.4s, v1.s[1]\n" /* out1 = b1 * a10[1], b1 =q5 */ + "fmla v10.4s, v5.4s, v1.s[2]\n" /* out2 = b1 * a10[2], b1 =q5 */ + "fmla v11.4s, v5.4s, v1.s[3]\n" /* out3 = b1 * a10[3], b1 =q5 */ + "ldp q4, q5, [%[b_ptr]], #32\n" /* load b0, b1 to q4, q5*/ + + "fmla v8.4s, v6.4s, v2.s[0]\n" /* out0 = b2 * a20[0], b2 =q6 */ + "fmla v9.4s, v6.4s, v2.s[1]\n" /* out1 = b2 * a20[1], b2 =q6 */ + "fmla v10.4s, v6.4s, v2.s[2]\n" /* out2 = b2 * a20[2], b2 =q6*/ + "fmla v11.4s, v6.4s, v2.s[3]\n" /* out3 = b2 * a20[3], b2 =q6*/ + "ldp q0, q1, [%[a_ptr]], #32\n" /* load a00, a10 to q0, q1 */ + + "fmla v8.4s, v7.4s, v3.s[0]\n" /* out0 = b3 * a30[0], b3 =q7*/ + "fmla v9.4s, v7.4s, v3.s[1]\n" /* out1 = b3 * a30[1], b3 =q7*/ + "subs %w[k], %w[k], #1\n" /* loop count - 1*/ + "fmla v10.4s, v7.4s, v3.s[2]\n" /* out2 = b3 * a30[2], b3 =q7*/ + "fmla v11.4s, v7.4s, v3.s[3]\n" /* out3 = b3 * a30[3], b3 =q7*/ + + "bne 1b\n" + "2:\n" /* process tail*/ + "subs %w[tail], %w[tail], #1\n" /* tail--*/ + "beq 3f\n" /*jump to tail = 1*/ + /* final unrool 0*/ + /* unrool 0, tail > 1*/ + "fmla v8.4s, v4.4s, v0.s[0]\n" /* out0 = b0 * a00[0], b0 =q4 */ + "fmla v9.4s, v4.4s, v0.s[1]\n" /* out1 = b0 * a00[1], b0 =q4 */ + "subs %w[tail], %w[tail], #1\n" /* tail--*/ + "fmla v10.4s, v4.4s, v0.s[2]\n" /* out2 = b0 * a00[2], b0 =q4 */ + "fmla v11.4s, v4.4s, v0.s[3]\n" /* out3 = b0 * a00[3], b0 =q4 */ + + "beq 4f\n" /*jump to tail = 2*/ + /* unrool 1, tail > 2*/ + "ldp q6, q7, [%[b_ptr]], #32\n" /* load b2, b3 to q6, q7 */ + + "fmla v8.4s, v5.4s, v1.s[0]\n" /* out0 = b1 * a10[0], b1 =q5 */ + "fmla v9.4s, v5.4s, v1.s[1]\n" /* out1 = b1 * a10[1], b1 =q5*/ + "subs %w[tail], %w[tail], #1\n" /* tail--*/ + "fmla v10.4s, v5.4s, v1.s[2]\n" /* out2 = b1 * a10[2], b1 =q5 */ + "fmla v11.4s, v5.4s, v1.s[3]\n" /* out3 = b1 * a10[3], b1 =q5 */ + "ldp q2, q3, [%[a_ptr]], #32\n" /* load a20, a30 to q2, q3 */ + + "beq 5f\n" /*jump to tail = 3*/ + /* unrool 2, tail = 4*/ + "fmla v8.4s, v6.4s, v2.s[0]\n" /* out0 = b2 * a20[0], b1 =q6 */ + "fmla v9.4s, v6.4s, v2.s[1]\n" /* out1 = b2 * a20[1], b1 =q6 */ + "fmla v10.4s, v6.4s, v2.s[2]\n" /* out2 = b2 * a20[2], b1 =q6*/ + "fmla v11.4s, v6.4s, v2.s[3]\n" /* out3 = b2 * a20[3], b1 =q6*/ + + /* unrool 3, tail = 4*/ + + "fmla v8.4s, v7.4s, v3.s[0]\n" /* out0 = b3 * a30[0], b3 =q7*/ + "fmla v9.4s, v7.4s, v3.s[1]\n" /* out1 = b3 * a30[1], 
b3 =q7*/ + "fmla v10.4s, v7.4s, v3.s[2]\n" /* out2 = b3 * a30[2], b3 =q7*/ + "fmla v11.4s, v7.4s, v3.s[3]\n" /* out3 = b3 * a30[3], b3 =q7*/ + + "b 11f\n" + /* tails==1 final tail*/ + "3: \n" /* tail=1*/ + "fmla v8.4s, v4.4s, v0.s[0]\n" /* out0 = b0 * a00[0], b0 =q4 */ + "fmla v9.4s, v4.4s, v0.s[1]\n" /* out1 = b0 * a00[1], b0 =q4 */ + "fmla v10.4s, v4.4s, v0.s[2]\n" /* out2 = b0 * a00[2], b0 =q4 */ + "fmla v11.4s, v4.4s, v0.s[3]\n" /* out3 = b0 * a00[3], b0 =q4 */ + + "b 11f\n" + /* tails==2 final tail*/ + "4:\n" /* tail = 2*/ + + "fmla v8.4s, v5.4s, v1.s[0]\n" /* out0 = b1 * a10[0], b1 =q5 */ + "fmla v9.4s, v5.4s, v1.s[1]\n" /* out1 = b1 * a10[1], b1 =q5*/ + "fmla v10.4s, v5.4s, v1.s[2]\n" /* out2 = b1 * a10[2], b1 =q5 */ + "fmla v11.4s, v5.4s, v1.s[3]\n" /* out3 = b1 * a10[3], b1 =q5 */ + + "b 11f\n" + /* tails==3 final tail*/ + "5:\n" /* tail = 3*/ + "fmla v8.4s, v6.4s, v2.s[0]\n" /* out0 = b2 * a20[0], b1 =q6 */ + "fmla v9.4s, v6.4s, v2.s[1]\n" /* out1 = b2 * a20[1], b1 =q6 */ + "fmla v10.4s, v6.4s, v2.s[2]\n" /* out2 = b2 * a20[2], b1 =q6*/ + "fmla v11.4s, v6.4s, v2.s[3]\n" /* out3 = b2 * a20[3], b1 =q6*/ + + "11: \n" /* check if relu */ + "cbz %w[relu], 12f\n" /* skip relu */ + "movi v2.4s, #0\n" /* for relu*/ + "fmax v8.4s, v8.4s, v2.4s\n" /* relu*/ + "fmax v9.4s, v9.4s, v2.4s\n" /* relu*/ + "fmax v10.4s, v10.4s, v2.4s\n" /* relu*/ + "fmax v11.4s, v11.4s, v2.4s\n" /* relu*/ + "12: \n" + "st1 {v8.4s}, [%[c_ptr0]], #16\n" /* store r0 */ + "st1 {v9.4s}, [%[c_ptr1]], #16\n" /* store r1 */ + "st1 {v10.4s}, [%[c_ptr2]], #16\n" /* store r2 */ + "st1 {v11.4s}, [%[c_ptr3]], #16\n" /* store r3 */ + + : [a_ptr] "+r"(a_ptr), + [b_ptr] "+r"(b_ptr), + [k] "+r"(k), + [tail] "+r"(tail), + [c_ptr0] "+r"(c_ptr0), + [c_ptr1] "+r"(c_ptr1), + [c_ptr2] "+r"(c_ptr2), + [c_ptr3] "+r"(c_ptr3) + : [bias_ptr] "r"(bias_local), + [relu] "r"(has_relu), + [has_beta] "r"(has_beta), + [beta] "r"(beta) + : "cc","memory", + "v0","v1","v2","v3","v4","v5","v6","v7", + "v8","v9","v10","v11"); + // clang-format on + if (flag_p_remain && (xb == bblocks - 1)) { + for (int i = 0; i < remain; ++i) { + *pout0++ = cout0[i]; + *pout1++ = cout1[i]; + *pout2++ = cout2[i]; + *pout3++ = cout3[i]; + } + } + } + } + } +} #else // __aarch64__ /** * \brief gemm with ablock = 6, bblock = 8, output 6x8 diff --git a/lite/backends/arm/math/packed_sgemm_c4.cc b/lite/backends/arm/math/packed_sgemm_c4.cc new file mode 100644 index 0000000000000000000000000000000000000000..8087e0337bda0866f5d399a07ecb674f0fa55a3e --- /dev/null +++ b/lite/backends/arm/math/packed_sgemm_c4.cc @@ -0,0 +1,1171 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
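For orientation, the 4x4 tile produced by the sgemm_prepacked_4x4 kernel above is equivalent to the scalar loop below. This is an illustrative sketch only and not part of the patch; tile_4x4_reference is an invented name, and the packed-operand index formulas are inferred from the fmla indexing in the assembly (a_tile[k*4 + r] == A[y + r][k], b_tile[k*4 + c] == B[k][x + c]).

// Minimal scalar sketch (assumption, not part of the patch) of one 4x4 tile
// computed by sgemm_prepacked_4x4. bias[] is the zero vector when has_bias
// is false, and beta is treated as 0 when has_beta is false.
void tile_4x4_reference(const float* a_tile,  // packed A: a_tile[k*4 + r] == A[y + r][k]
                        const float* b_tile,  // packed B: b_tile[k*4 + c] == B[k][x + c]
                        int K,
                        const float bias[4],
                        float beta,
                        bool has_relu,
                        float* C,
                        int ldc) {
  for (int r = 0; r < 4; ++r) {
    for (int c = 0; c < 4; ++c) {
      float acc = bias[r] + beta * C[r * ldc + c];
      for (int k = 0; k < K; ++k) {
        acc += a_tile[k * 4 + r] * b_tile[k * 4 + c];
      }
      C[r * ldc + c] = (has_relu && acc < 0.f) ? 0.f : acc;
    }
  }
}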
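The new lite/backends/arm/math/packed_sgemm_c4.cc that begins below operates on channel-blocked ("c4") operands: groups of four M rows (for A and C) or four K values (for B) are stored contiguously per column. As a readable counterpart to the NEON kernels, here is a scalar sketch of the arithmetic they implement; it is not part of the patch, sgemm_c4_reference is an invented name, and the index formulas are inferred from the assembly and loadb_c4.

// Illustrative scalar equivalent (assumption, not in the patch) of
// sgemm_prepack_c4: C = A * B + bias, optional ReLU, with M and K rounded
// up to multiples of 4 and all operands in the c4-blocked layout:
//   A_packed[mb*4*k_round + k*4 + r] == A[mb*4 + r][k]
//   B       [kb*4*N       + n*4 + s] == B[kb*4 + s][n]
//   C       [mb*4*N       + n*4 + r] == C[mb*4 + r][n]
// bias is assumed zero-padded to m_round, as bias_buf is in the kernels.
void sgemm_c4_reference(int m_round, int N, int k_round,
                        const float* A_packed, const float* B, float* C,
                        const float* bias, bool has_bias, bool has_relu) {
  for (int mb = 0; mb < m_round / 4; ++mb) {
    for (int n = 0; n < N; ++n) {
      for (int r = 0; r < 4; ++r) {
        float acc = has_bias ? bias[mb * 4 + r] : 0.f;
        for (int k = 0; k < k_round; ++k) {
          acc += A_packed[mb * 4 * k_round + k * 4 + r] *
                 B[(k / 4) * 4 * N + n * 4 + (k % 4)];
        }
        if (has_relu && acc < 0.f) acc = 0.f;
        C[mb * 4 * N + n * 4 + r] = acc;
      }
    }
  }
}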
+ +#include "lite/backends/arm/math/packed_sgemm_c4.h" +#include + +namespace paddle { +namespace lite { +namespace arm { +namespace math { + +void loadb_c4(float* out, + const float* in, + const int xstart, + const int xend, + const int k_round, + const int n) { + const int xlen = (xend - xstart + NBLOCK_C4 - 1) / NBLOCK_C4 * NBLOCK_C4; + int xloop = xlen / NBLOCK_C4; + const int flag_remain = n < xstart + xlen; + int remain = 0; + int remain4 = 0; + int remain1 = 0; + if (flag_remain) { + remain = (n - xstart) - (xloop - 1) * NBLOCK_C4; + remain4 = remain >> 2; + remain1 = remain & 3; + xloop -= 1; + } + const int ldo = NBLOCK_C4 * k_round; + const int kloop = k_round >> 2; + in += xstart * 4; + if (xloop > 0) { +#pragma omp parallel for + for (int i = 0; i < kloop; ++i) { + float* out_ptr = out + 4 * NBLOCK_C4 * i; + const float* in_ptr = in + i * 4 * n; + for (int j = 0; j < xloop; ++j) { + float* out_p = out_ptr + j * ldo; +#ifdef __aarch64__ + asm volatile( + "ld1 {v0.4s, v1.4s}, [%[in]], #32 \n" + "ld1 {v2.4s, v3.4s}, [%[in]], #32 \n" + "st1 {v0.4s, v1.4s}, [%[out]], #32 \n" + "ld1 {v4.4s, v5.4s}, [%[in]], #32 \n" + "st1 {v2.4s, v3.4s}, [%[out]], #32 \n" + "ld1 {v6.4s, v7.4s}, [%[in]], #32 \n" + "st1 {v4.4s, v5.4s}, [%[out]], #32 \n" + "st1 {v6.4s, v7.4s}, [%[out]], #32 \n" + : [in] "+r"(in_ptr), [out] "+r"(out_p) + : + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); +#else + asm volatile( + "vld1.32 {d0-d3}, [%[in]]! \n" + "vld1.32 {d4-d7}, [%[in]]! \n" + "vst1.32 {d0-d3}, [%[out]]! \n" + "vld1.32 {d8-d11}, [%[in]]! \n" + "vst1.32 {d4-d7}, [%[out]]! \n" + "vld1.32 {d12-d15}, [%[in]]! \n" + "vst1.32 {d8-d11}, [%[out]]! \n" + "vst1.32 {d12-d15}, [%[out]]! \n" + : [in] "+r"(in_ptr), [out] "+r"(out_p) + : + : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7"); +#endif // __aarch674__ + } + } + } + float* out_remain4 = out + xloop * k_round * NBLOCK_C4; + const float* in_remain4 = in + xloop * NBLOCK_C4 * 4; + if (remain4) { +#pragma omp parallel for + for (int i = 0; i < kloop; ++i) { + float* out_ptr = out_remain4 + 4 * 4 * i; + const float* in_ptr = in_remain4 + i * 4 * n; +#ifdef __aarch64__ + asm volatile( + "ld1 {v0.4s, v1.4s}, [%[in]], #32 \n" + "ld1 {v2.4s, v3.4s}, [%[in]], #32 \n" + "st1 {v0.4s, v1.4s}, [%[out]], #32 \n" + "st1 {v2.4s, v3.4s}, [%[out]], #32 \n" + : [in] "+r"(in_ptr), [out] "+r"(out_ptr) + : + : "v0", "v1", "v2", "v3"); +#else + asm volatile( + "vld1.32 {d0-d3}, [%[in]]! \n" + "vld1.32 {d4-d7}, [%[in]]! \n" + "vst1.32 {d0-d3}, [%[out]]! \n" + "vst1.32 {d4-d7}, [%[out]]! \n" + : [in] "+r"(in_ptr), [out] "+r"(out_ptr) + : + : "q0", "q1", "q2", "q3"); +#endif // __aarch64__ + } + } + float* out_remain1 = out_remain4 + remain4 * k_round * 4; + const float* in_remain1 = in_remain4 + remain4 * 4 * 4; + if (remain1) { +#pragma omp parallel for + for (int i = 0; i < kloop; ++i) { + float* out_ptr = out_remain1 + 4 * remain1 * i; + const float* in_ptr = in_remain1 + i * 4 * n; + for (int j = 0; j < remain1; ++j) { + float32x4_t vin = vld1q_f32(in_ptr); + in_ptr += 4; + vst1q_f32(out_ptr, vin); + out_ptr += 4; + } + } + } +} + +void sgemm_prepack_c4_common(int M, + int N, + int K, + const float* A_packed, + const float* B, + float* C, + const float* bias, + bool has_bias, + bool has_relu, + ARMContext* ctx) { + const int m_round = (M + 3) / 4 * 4; + const int k_round = (K + 3) / 4 * 4; + size_t l2_cache = ctx->llc_size() > 0 ? 
ctx->llc_size() : 512 * 1024; + int threads = ctx->threads(); + auto workspace = ctx->workspace_data(); + // l2 = ablock * K * threads + K * bchunk_w + threads * ablock * bchunk_w; + int bchunk_w = (l2_cache - threads * k_round * sizeof(float)) / + ((k_round + threads * MBLOCK_C4) * sizeof(float)); + bchunk_w = bchunk_w > N ? N : bchunk_w; + bchunk_w = bchunk_w / NBLOCK_C4 * NBLOCK_C4; + bchunk_w = bchunk_w > NBLOCK_C4 ? bchunk_w : NBLOCK_C4; + int bchunk_loop = (N + bchunk_w - 1) / bchunk_w; + + const int h_loop = m_round >> 2; // MBLOCK_C4 == 4; + const int kcnt = (k_round + KBLOCK_C4 - 1) / KBLOCK_C4; + const int ldc = N * 4; + const int lda = k_round * 4; + float bias_buf[m_round]; // NOLINT + if (has_bias) { + memcpy(bias_buf, bias, M * sizeof(float)); + memset(bias_buf + M, 0, (m_round - M) * sizeof(float)); + } else { + memset(bias_buf, 0, m_round * sizeof(float)); + } + // bchunk_loop + float* c = C; + for (int n = 0; n < bchunk_loop; ++n) { + int x_start = n * bchunk_w; + int x_end = x_start + bchunk_w; + int w_loop = bchunk_w / NBLOCK_C4; + int flag_remain = 0; + int w_loop4 = 0; + int remain = 0; + if (x_end > N) { + w_loop = (N - x_start) / NBLOCK_C4; + int w_loop_rem = (N - x_start) - w_loop * NBLOCK_C4; + w_loop4 = w_loop_rem >> 2; + remain = w_loop_rem & 3; + x_end = N; + flag_remain = 1; + } + float* bchunk = workspace; + loadb_c4(bchunk, B, x_start, x_end, k_round, N); + float* cchunk = c + n * bchunk_w * 4; + int has_remain = (n == bchunk_loop - 1) && flag_remain; +#pragma omp parallel for num_threads(threads) + for (int h = 0; h < h_loop; ++h) { + float* bias_h = bias_buf + h * 4; +#ifdef __aarch64__ + float32x4_t vzero = vdupq_n_f32(0.f); + float32x4_t vbias = vld1q_f32(bias_h); +#endif + const float* ablock = A_packed + h * lda; + const float* bblock = bchunk; + float* cblock = cchunk + h * ldc; + for (int w = 0; w < w_loop; ++w) { + int cnt = kcnt; + const float* ablock_ptr = ablock; +// clang-format off +#ifdef __aarch64__ + asm volatile( + "prfm pldl1keep, [%[a]] \n" + "prfm pldl1keep, [%[b]] \n" + "prfm pldl1keep, [%[b], #64] \n" + "mov v9.16b, %[vbias].16b \n" /* mov bias to c0*/ + "mov v10.16b, %[vbias].16b \n" /* mov bias to c1*/ + "mov v11.16b, %[vbias].16b \n" /* mov bias to c2*/ + "mov v12.16b, %[vbias].16b \n" /* mov bias to c3*/ + /* load a0a1 to v1-v2 */ + "ld1 {v1.4s, v2.4s}, [%[a]], #32 \n" + "mov v13.16b, %[vbias].16b \n" /* mov bias to c4*/ + "mov v14.16b, %[vbias].16b \n" /* mov bias to c5*/ + "mov v15.16b, %[vbias].16b \n" /* mov bias to c6*/ + "mov v16.16b, %[vbias].16b \n" /* mov bias to c7*/ + "1:\n" + /* load b0b1b2b3 to v5-v8 */ + "ld1 {v5.4s, v6.4s}, [%[b]], #32 \n" + "ld1 {v7.4s, v8.4s}, [%[b]], #32 \n" + "prfm pldl1keep, [%[b]] \n" + "fmla v9.4s, v1.4s, v5.s[0] \n" + "fmla v10.4s, v1.4s, v6.s[0] \n" + "fmla v11.4s, v1.4s, v7.s[0] \n" + "fmla v12.4s, v1.4s, v8.s[0] \n" + /* load b4b5b6b7 to v25-v28 */ + "ld1 {v25.4s, v26.4s}, [%[b]], #32 \n" + "ld1 {v27.4s, v28.4s}, [%[b]], #32 \n" + "prfm pldl1keep, [%[a], #32] \n" + "fmla v9.4s, v2.4s, v5.s[1] \n" + "fmla v10.4s, v2.4s, v6.s[1] \n" + "fmla v11.4s, v2.4s, v7.s[1] \n" + "fmla v12.4s, v2.4s, v8.s[1] \n" + "prfm pldl1keep, [%[b], #64] \n" + "fmla v13.4s, v1.4s, v25.s[0] \n" + "fmla v14.4s, v1.4s, v26.s[0] \n" + "fmla v15.4s, v1.4s, v27.s[0] \n" + "fmla v16.4s, v1.4s, v28.s[0] \n" + /* load a2a3 to v3-v4 */ + "ld1 {v3.4s, v4.4s}, [%[a]], #32 \n" + "prfm pldl1keep, [%[b], #128] \n" + "fmla v13.4s, v2.4s, v25.s[1] \n" + "fmla v14.4s, v2.4s, v26.s[1] \n" + "fmla v15.4s, v2.4s, v27.s[1] \n" + "fmla 
v16.4s, v2.4s, v28.s[1] \n" + "subs %w[cnt], %w[cnt], #1 \n" + "fmla v9.4s, v3.4s, v5.s[2] \n" + "fmla v10.4s, v3.4s, v6.s[2] \n" + "fmla v11.4s, v3.4s, v7.s[2] \n" + "fmla v12.4s, v3.4s, v8.s[2] \n" + "fmla v13.4s, v3.4s, v25.s[2] \n" + "fmla v14.4s, v3.4s, v26.s[2] \n" + "fmla v15.4s, v3.4s, v27.s[2] \n" + "fmla v16.4s, v3.4s, v28.s[2] \n" + /* load a0a1 to v1-v2 */ + "ld1 {v1.4s, v2.4s}, [%[a]], #32 \n" + "fmla v9.4s, v4.4s, v5.s[3] \n" + "fmla v10.4s, v4.4s, v6.s[3] \n" + "fmla v11.4s, v4.4s, v7.s[3] \n" + "fmla v12.4s, v4.4s, v8.s[3] \n" + + "fmla v13.4s, v4.4s, v25.s[3] \n" + "fmla v14.4s, v4.4s, v26.s[3] \n" + "fmla v15.4s, v4.4s, v27.s[3] \n" + "fmla v16.4s, v4.4s, v28.s[3] \n" + "bne 1b\n" + "cbz %w[relu], 2f \n" + "fmax v9.4s, v9.4s, %[vzero].4s \n" + "fmax v10.4s, v10.4s, %[vzero].4s \n" + "fmax v11.4s, v11.4s, %[vzero].4s \n" + "fmax v12.4s, v12.4s, %[vzero].4s \n" + "fmax v13.4s, v13.4s, %[vzero].4s \n" + "fmax v14.4s, v14.4s, %[vzero].4s \n" + "fmax v15.4s, v15.4s, %[vzero].4s \n" + "fmax v16.4s, v16.4s, %[vzero].4s \n" + "2:\n" + "st1 {v9.4s, v10.4s, v11.4s, v12.4s}, [%[c]], #64 \n" + "st1 {v13.4s, v14.4s, v15.4s, v16.4s}, [%[c]], #64 \n" + : [a] "+r"(ablock_ptr), + [b] "+r"(bblock), + [c] "+r"(cblock), + [cnt] "+r"(cnt) + : [bias] "r"(bias_h), [relu] "r"(has_relu), + [vbias] "w"(vbias), [vzero] "w" (vzero) + : "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", + "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", + "v25", "v26", "v27", "v28", "cc", "memory"); +#else + asm volatile( + "vld1.32 {d6-d7}, [%[bias]] \n" + "pld [%[a]] \n" + "pld [%[b]] \n" + "pld [%[b], #64] \n" + "vmov.32 q8, q3 \n" /* mov bias to c0*/ + "vmov.32 q9, q3 \n" /* mov bias to c1*/ + "vmov.32 q10, q3 \n" /* mov bias to c2*/ + "vmov.32 q11, q3 \n" /* mov bias to c3*/ + "vld1.32 {d0-d3}, [%[a]]! \n" + "vmov.32 q12, q3 \n" /* mov bias to c4*/ + "vmov.32 q13, q3 \n" /* mov bias to c5*/ + "vmov.32 q14, q3 \n" /* mov bias to c6*/ + "vmov.32 q15, q3 \n" /* mov bias to c7*/ + "1:\n" + /* c0c1c2c3 */ + "vld1.32 {d8-d11}, [%[b]]! \n" + "vld1.32 {d12-d15}, [%[b]]! \n" + "pld [%[b]] \n" + "vmla.f32 q8, q0, d8[0] \n" + "vmla.f32 q9, q0, d10[0] \n" + "vmla.f32 q10, q0, d12[0] \n" + "vmla.f32 q11, q0, d14[0] \n" + "vld1.32 {d4-d7}, [%[a]]! \n" + "vmla.f32 q8, q1, d8[1] \n" + "vmla.f32 q9, q1, d10[1] \n" + "vmla.f32 q10, q1, d12[1] \n" + "vmla.f32 q11, q1, d14[1] \n" + "pld [%[b], #64] \n" + "vmla.f32 q8, q2, d9[0] \n" + "vmla.f32 q9, q2, d11[0] \n" + "vmla.f32 q10, q2, d13[0] \n" + "vmla.f32 q11, q2, d15[0] \n" + "subs %[cnt], %[cnt], #1 \n" + "vmla.f32 q8, q3, d9[1] \n" + "vmla.f32 q9, q3, d11[1] \n" + "vld1.f32 {d8-d11}, [%[b]]! \n" + "vmla.f32 q10, q3, d13[1] \n" + "vmla.f32 q11, q3, d15[1] \n" + "vld1.32 {d12-d15}, [%[b]]! \n" + /* c4c5c6c7 */ + "vmla.f32 q12, q0, d8[0] \n" + "vmla.f32 q13, q0, d10[0] \n" + "vmla.f32 q14, q0, d12[0] \n" + "vmla.f32 q15, q0, d14[0] \n" + "pld [%[a], #32] \n" + "vmla.f32 q12, q1, d8[1] \n" + "vmla.f32 q13, q1, d10[1] \n" + "vmla.f32 q14, q1, d12[1] \n" + "vmla.f32 q15, q1, d14[1] \n" + "vld1.32 {d0-d3}, [%[a]]! 
\n" + "vmla.f32 q12, q2, d9[0] \n" + "vmla.f32 q13, q2, d11[0] \n" + "vmla.f32 q14, q2, d13[0] \n" + "vmla.f32 q15, q2, d15[0] \n" + "pld [%[b], #64] \n" + "vmla.f32 q12, q3, d9[1] \n" + "vmla.f32 q13, q3, d11[1] \n" + "vmla.f32 q14, q3, d13[1] \n" + "vmla.f32 q15, q3, d15[1] \n" + "bne 1b\n" + "cmp %[relu], #0 \n" + "beq 2f \n" + "vmov.u32 q0, #0 \n" + "vmax.f32 q8, q8, q0 \n" + "vmax.f32 q9, q9, q0 \n" + "vmax.f32 q10, q10, q0 \n" + "vmax.f32 q11, q11, q0 \n" + "vmax.f32 q12, q12, q0 \n" + "vmax.f32 q13, q13, q0 \n" + "vmax.f32 q14, q14, q0 \n" + "vmax.f32 q15, q15, q0 \n" + "2:\n" + "vst1.32 {d16-d19}, [%[c]]! \n" + "vst1.32 {d20-d23}, [%[c]]! \n" + "vst1.32 {d24-d27}, [%[c]]! \n" + "vst1.32 {d28-d31}, [%[c]]! \n" + : [a] "+r"(ablock_ptr), + [b] "+r"(bblock), + [c] "+r"(cblock), + [cnt] "+r"(cnt) + : [bias] "r"(bias_h), + [relu] "r"(has_relu) + : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", + "q9", "q10", "q11", "q12", "q13", "q14", "q15", "cc", "memory"); +#endif + // clang-format on + } + if (has_remain) { + if (w_loop4 > 0) { + int cnt = kcnt; + const float* ablock_ptr = ablock; +// clang-format off +#ifdef __aarch64__ + asm volatile( + "prfm pldl1keep, [%[a]] \n" + "prfm pldl1keep, [%[b]] \n" + "mov v9.16b, %[vbias].16b \n" /* mov bias to c0*/ + "mov v10.16b, %[vbias].16b \n" /* mov bias to c1*/ + "mov v11.16b, %[vbias].16b \n" /* mov bias to c2*/ + "mov v12.16b, %[vbias].16b \n" /* mov bias to c3*/ + /* load a0a1 to v1-v2 */ + "ld1 {v1.4s, v2.4s}, [%[a]], #32 \n" + "1:\n" + /* load b0b1b2b3 to v5-v8 */ + "ld1 {v5.4s, v6.4s}, [%[b]], #32 \n" + "ld1 {v7.4s, v8.4s}, [%[b]], #32 \n" + "fmla v9.4s, v1.4s, v5.s[0] \n" + "fmla v10.4s, v1.4s, v6.s[0] \n" + "fmla v11.4s, v1.4s, v7.s[0] \n" + "fmla v12.4s, v1.4s, v8.s[0] \n" + /* load a2a3 to v3-v4 */ + "ld1 {v3.4s, v4.4s}, [%[a]], #32 \n" + "prfm pldl1keep, [%[a]] \n" + "fmla v9.4s, v2.4s, v5.s[1] \n" + "fmla v10.4s, v2.4s, v6.s[1] \n" + "fmla v11.4s, v2.4s, v7.s[1] \n" + "fmla v12.4s, v2.4s, v8.s[1] \n" + "prfm pldl1keep, [%[b]] \n" + "subs %w[cnt], %w[cnt], #1 \n" + "fmla v9.4s, v3.4s, v5.s[2] \n" + "fmla v10.4s, v3.4s, v6.s[2] \n" + "fmla v11.4s, v3.4s, v7.s[2] \n" + "fmla v12.4s, v3.4s, v8.s[2] \n" + /* load a0a1 to v1-v2 */ + "ld1 {v1.4s, v2.4s}, [%[a]], #32 \n" + "fmla v9.4s, v4.4s, v5.s[3] \n" + "fmla v10.4s, v4.4s, v6.s[3] \n" + "fmla v11.4s, v4.4s, v7.s[3] \n" + "fmla v12.4s, v4.4s, v8.s[3] \n" + "bne 1b\n" + "cbz %w[relu], 2f \n" + "fmax v9.4s, v9.4s, %[vzero].4s \n" + "fmax v10.4s, v10.4s, %[vzero].4s \n" + "fmax v11.4s, v11.4s, %[vzero].4s \n" + "fmax v12.4s, v12.4s, %[vzero].4s \n" + "2:\n" + "st1 {v9.4s, v10.4s, v11.4s, v12.4s}, [%[c]], #64 \n" + : [a] "+r"(ablock_ptr), + [b] "+r"(bblock), + [c] "+r"(cblock), + [cnt] "+r"(cnt) + : [bias] "r"(bias_h), + [relu] "r"(has_relu), + [vbias] "w"(vbias), + [vzero] "w" (vzero) + : "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v8", "v9", "v10", "v11", "v12", "cc", "memory"); +#else + asm volatile( + "pld [%[a]] \n" + "pld [%[b]] \n" + "vld1.32 {d6-d7}, [%[bias]] \n" + "vld1.32 {d0-d3}, [%[a]]! \n" /* load a0 a1 */ + "vmov.32 q8, q3 \n" /* mov bias to c0 */ + "vmov.32 q9, q3 \n" /* mov bias to c1 */ + "vmov.32 q10, q3 \n" /* mov bias to c2 */ + "vmov.32 q11, q3 \n" /* mov bias to c3 */ + "1:\n" + /* c0c1c2c3 */ + "vld1.32 {d8-d11}, [%[b]]! \n" + "vld1.32 {d12-d15}, [%[b]]! \n" + "pld [%[b]] \n" + "vmla.f32 q8, q0, d8[0] \n" + "vmla.f32 q9, q0, d10[0] \n" + "vmla.f32 q10, q0, d12[0] \n" + "vmla.f32 q11, q0, d14[0] \n" + "vld1.32 {d4-d7}, [%[a]]! 
\n" + "pld [%[a]] \n" + "vmla.f32 q8, q1, d8[1] \n" + "vmla.f32 q9, q1, d10[1] \n" + "vmla.f32 q10, q1, d12[1] \n" + "vmla.f32 q11, q1, d14[1] \n" + "subs %[cnt], %[cnt], #1 \n" + "vmla.f32 q8, q2, d9[0] \n" + "vmla.f32 q9, q2, d11[0] \n" + "vmla.f32 q10, q2, d13[0] \n" + "vmla.f32 q11, q2, d15[0] \n" + "vld1.32 {d0-d3}, [%[a]]! \n" + "vmla.f32 q8, q3, d9[1] \n" + "vmla.f32 q9, q3, d11[1] \n" + "vmla.f32 q10, q3, d13[1] \n" + "vmla.f32 q11, q3, d15[1] \n" + "bne 1b\n" + "cmp %[relu], #0 \n" + "beq 2f \n" + "vmov.u32 q0, #0 \n" + "vmax.f32 q8, q8, q0 \n" + "vmax.f32 q9, q9, q0 \n" + "vmax.f32 q10, q10, q0 \n" + "vmax.f32 q11, q11, q0 \n" + "2:\n" + "vst1.32 {d16-d19}, [%[c]]! \n" + "vst1.32 {d20-d23}, [%[c]]! \n" + : [a] "+r"(ablock_ptr), + [b] "+r"(bblock), + [c] "+r"(cblock), + [cnt] "+r"(cnt) + : [bias] "r"(bias_h), [relu] "r"(has_relu) + : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", + "q9", "q10", "q11", "cc", "memory"); +#endif + // clang-format on + } + if (remain > 0) { + int cnt = kcnt; + const float* ablock_ptr = ablock; +// clang-format off +#ifdef __aarch64__ + asm volatile( + "prfm pldl1keep, [%[a]] \n" + "prfm pldl1keep, [%[b]] \n" + "ld1 {v1.4s, v2.4s}, [%[a]], #32 \n" + "cmp %w[remain], #3 \n" + "beq 1f \n" + "cmp %w[remain], #2 \n" + "beq 2f \n" + /* remain 1 */ + "mov v9.16b, %[vbias].16b \n" /* mov bias to c0*/ + "mov v10.16b, %[vzero].16b \n" /* mov zero to c1*/ + "3: \n" + "ld1 {v5.4s}, [%[b]], #16 \n" + "ld1 {v3.4s, v4.4s}, [%[a]], #32 \n" + "fmla v9.4s, v1.4s, v5.s[0] \n" + "fmla v10.4s, v2.4s, v5.s[1] \n" + "subs %w[cnt], %w[cnt], #1 \n" + "ld1 {v1.4s, v2.4s}, [%[a]], #32 \n" + "fmla v9.4s, v3.4s, v5.s[2] \n" + "fmla v10.4s, v4.4s, v5.s[3] \n" + "bne 3b \n" + "fadd v9.4s, v9.4s, v10.4s \n" + "cbz %w[relu], 6f \n" + "fmax v9.4s, v9.4s, %[vzero].4s \n" + "6: \n" + "st1 {v9.4s}, [%[c]], #16 \n" + "b 9f \n" + /* remain 2 */ + "2: \n" + "mov v9.16b, %[vbias].16b \n" /* mov bias to c0*/ + "mov v10.16b, %[vbias].16b \n" /* mov bias to c1*/ + "mov v11.16b, %[vzero].16b \n" /* mov zero to c2*/ + "mov v12.16b, %[vzero].16b \n" /* mov zero to c3*/ + "4: \n" + "ld1 {v5.4s, v6.4s}, [%[b]], #32 \n" + "ld1 {v3.4s, v4.4s}, [%[a]], #32 \n" + "fmla v9.4s, v1.4s, v5.s[0] \n" + "fmla v10.4s, v1.4s, v6.s[0] \n" + "fmla v11.4s, v2.4s, v5.s[1] \n" + "fmla v12.4s, v2.4s, v6.s[1] \n" + "subs %w[cnt], %w[cnt], #1 \n" + "fmla v9.4s, v3.4s, v5.s[2] \n" + "fmla v10.4s, v3.4s, v6.s[2] \n" + "fmla v11.4s, v4.4s, v5.s[3] \n" + "fmla v12.4s, v4.4s, v6.s[3] \n" + "ld1 {v1.4s, v2.4s}, [%[a]], #32 \n" + "bne 4b \n" + "fadd v9.4s, v9.4s, v11.4s \n" + "fadd v10.4s, v10.4s, v12.4s \n" + "cbz %w[relu], 7f \n" + "fmax v9.4s, v9.4s, %[vzero].4s \n" + "fmax v10.4s, v10.4s, %[vzero].4s \n" + "7: \n" + "st1 {v9.4s, v10.4s}, [%[c]], #32 \n" + "b 9f \n" + /* remain 3 */ + "1: \n" + "mov v9.16b, %[vbias].16b \n" /* mov bias to c0*/ + "mov v10.16b, %[vbias].16b \n" /* mov bias to c1*/ + "mov v11.16b, %[vbias].16b \n" /* mov bias to c2*/ + "5: \n" + "ld1 {v5.4s, v6.4s}, [%[b]], #32 \n" + "ld1 {v7.4s}, [%[b]], #16 \n" + "fmla v9.4s, v1.4s, v5.s[0] \n" + "fmla v10.4s, v1.4s, v6.s[0] \n" + "fmla v11.4s, v1.4s, v7.s[0] \n" + "ld1 {v3.4s, v4.4s}, [%[a]], #32 \n" + "fmla v9.4s, v2.4s, v5.s[1] \n" + "fmla v10.4s, v2.4s, v6.s[1] \n" + "fmla v11.4s, v2.4s, v7.s[1] \n" + "subs %w[cnt], %w[cnt], #1 \n" + "fmla v9.4s, v3.4s, v5.s[2] \n" + "fmla v10.4s, v3.4s, v6.s[2] \n" + "fmla v11.4s, v3.4s, v7.s[2] \n" + "prfm pldl1keep, [%[a]] \n" + "fmla v9.4s, v4.4s, v5.s[3] \n" + "fmla v10.4s, v4.4s, v6.s[3] \n" + "fmla v11.4s, 
v4.4s, v7.s[3] \n" + "ld1 {v1.4s, v2.4s}, [%[a]], #32 \n" + "bne 5b \n" + "cbz %w[relu], 8f \n" + "fmax v9.4s, v9.4s, %[vzero].4s \n" + "fmax v10.4s, v10.4s, %[vzero].4s \n" + "fmax v11.4s, v11.4s, %[vzero].4s \n" + "8: \n" + "st1 {v9.4s, v10.4s}, [%[c]], #32 \n" + "st1 {v11.4s}, [%[c]], #16 \n" + "9:\n" + : [a] "+r"(ablock_ptr), + [b] "+r"(bblock), + [c] "+r"(cblock), + [cnt] "+r"(cnt) + : [bias] "r"(bias_h), [relu] "r"(has_relu), + [remain] "r"(remain), [vbias] "w"(vbias), + [vzero] "w" (vzero) + : "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v9", + "v10", "v11", "v12", "cc","memory"); +#else + asm volatile( + "pld [%[a]] \n" + "pld [%[b]] \n" + "vld1.32 {d0-d1}, [%[bias]] \n" + "vld1.32 {d2-d5}, [%[a]]! \n" + "vmov.u32 q15, #0 \n" + "cmp %[remain], #3 \n" + "beq 1f \n" + "cmp %[remain], #2 \n" + "beq 2f \n" + /* remain 1 */ + "vmov.32 q9, q0 \n" /* mov bias to c0*/ + "vmov.32 q10, q15 \n" /* mov zero to c1*/ + "3: \n" + "vld1.32 {d10-d11}, [%[b]]! \n" + "vld1.32 {d6-d9}, [%[a]]! \n" + "vmla.f32 q9, q1, d10[0] \n" + "vmla.f32 q10, q2, d10[1] \n" + "subs %[cnt], %[cnt], #1 \n" + "vld1.32 {d2-d5}, [%[a]]! \n" + "vmla.f32 q9, q3, d11[0] \n" + "vmla.f32 q10, q4, d11[1] \n" + "bne 3b \n" + "vadd.f32 q9, q9, q10 \n" + "cmp %[relu], #0 \n" + "beq 6f \n" + "vmax.f32 q9, q9, q15 \n" + "6: \n" + "vst1.32 {d18-d19}, [%[c]]! \n" + "b 9f \n" + /* remain 2 */ + "2: \n" + "vmov.u32 q9, q0 \n" /* mov bias to c0*/ + "vmov.u32 q10, q0 \n" /* mov bias to c1*/ + "vmov.u32 q11, q15 \n" /* mov zero to c2*/ + "vmov.u32 q12, q15 \n" /* mov zero to c3*/ + "4: \n" + "vld1.32 {d10-d13}, [%[b]]! \n" + "vld1.32 {d6-d9}, [%[a]]! \n" + "vmla.f32 q9, q1, d10[0] \n" + "vmla.f32 q10, q1, d12[0] \n" + "vmla.f32 q11, q2, d10[1] \n" + "vmla.f32 q12, q2, d12[1] \n" + "subs %[cnt], %[cnt], #1 \n" + "vmla.f32 q9, q3, d11[0] \n" + "vmla.f32 q10, q3, d13[0] \n" + "vmla.f32 q11, q4, d11[1] \n" + "vmla.f32 q12, q4, d13[1] \n" + "vld1.32 {d2-d5}, [%[a]]! \n" + "bne 4b \n" + "vadd.f32 q9, q9, q11 \n" + "vadd.f32 q10, q10, q12 \n" + "cmp %[relu], #0 \n" + "beq 7f \n" + "vmax.f32 q9, q9, q15 \n" + "vmax.f32 q10, q10, q15 \n" + "7: \n" + "vst1.32 {d18-d21}, [%[c]]! \n" + "b 9f \n" + /* remain 3 */ + "1: \n" + "vmov.u32 q9, q0 \n" /* mov bias to c0*/ + "vmov.u32 q10, q0 \n" /* mov bias to c1*/ + "vmov.u32 q11, q0 \n" /* mov bias to c2*/ + "5: \n" + "vld1.32 {d10-d13}, [%[b]]! \n" + "vld1.32 {d14-d15}, [%[b]]! \n" + "vmla.f32 q9, q1, d10[0] \n" + "vmla.f32 q10, q1, d12[0] \n" + "vmla.f32 q11, q1, d14[0] \n" + "vld1.32 {d6-d9}, [%[a]]! \n" + "vmla.f32 q9, q2, d10[1] \n" + "vmla.f32 q10, q2, d12[1] \n" + "vmla.f32 q11, q2, d14[1] \n" + "subs %[cnt], %[cnt], #1 \n" + "vmla.f32 q9, q3, d11[0] \n" + "vmla.f32 q10, q3, d13[0] \n" + "vmla.f32 q11, q3, d15[0] \n" + "pld [%[a]] \n" + "vmla.f32 q9, q4, d11[1] \n" + "vmla.f32 q10, q4, d13[1] \n" + "vmla.f32 q11, q4, d15[1] \n" + "vld1.32 {d2-d5}, [%[a]]! \n" + "bne 5b \n" + "cmp %[relu], #0 \n" + "beq 8f \n" + "vmax.f32 q9, q9, q15 \n" + "vmax.f32 q10, q10, q15 \n" + "vmax.f32 q11, q11, q15 \n" + "8: \n" + "vst1.32 {d18-d21}, [%[c]]! \n" + "vst1.32 {d22-d23}, [%[c]]! 
\n" + "9:\n" + : [a] "+r"(ablock_ptr), + [b] "+r"(bblock), + [c] "+r"(cblock), + [cnt] "+r"(cnt) + : [bias] "r"(bias_h), + [relu] "r"(has_relu), + [remain] "r"(remain) + : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q9", + "q10", "q11", "q12", "q15", "cc","memory"); +#endif + // clang-format on + } + } + } + } +} + +void sgemm_prepack_c4_small(int M, + int N, + int K, + const float* A_packed, + const float* B, + float* C, + const float* bias, + bool has_bias, + bool has_relu, + ARMContext* ctx) { + const int m_round = (M + 3) / 4 * 4; + const int k_round = (K + 3) / 4 * 4; + const int mloop = m_round >> 2; + const int lda = 4 * k_round; + const int ldb_byte = 4 * N * sizeof(float); + const int kcnt = k_round >> 2; + float bias_buf[m_round]; // NOLINT + if (has_bias) { + memcpy(bias_buf, bias, M * sizeof(float)); + memset(bias_buf + M, 0, (m_round - M) * sizeof(float)); + } else { + memset(bias_buf, 0, m_round * sizeof(float)); + } +#ifdef __aarch64__ + float32x4_t vzero = vdupq_n_f32(0.f); +#endif + const float* bias_ptr = bias_buf; + for (int m = 0; m < mloop; ++m) { +#ifdef __aarch64__ + float32x4_t vbias = vld1q_f32(bias_ptr); +#endif + const float* b = B; + int n = N; +#ifdef __aarch64__ + for (; n > 7; n -= 8) { + int cnt = kcnt; + const float* a_ptr = A_packed; + const float* b_ptr = b; + // clang-format off + asm volatile( + /* load a0, a1 */ + "ld1 {v16.4s, v17.4s}, [%[a]], #32 \n" + /* mov bias to c0-c7*/ + "mov v8.16b, %[vbias].16b \n" + "mov v9.16b, %[vbias].16b \n" + "mov v10.16b, %[vbias].16b \n" + "mov v11.16b, %[vbias].16b \n" + /* load b0, b1 */ + "ld1 {v0.4s, v1.4s}, [%[b]], #32 \n" + "mov v12.16b, %[vbias].16b \n" + "mov v13.16b, %[vbias].16b \n" + "mov v14.16b, %[vbias].16b \n" + "mov v15.16b, %[vbias].16b \n" + "1:\n" + /* load b2, b3 */ + "ld1 {v2.4s, v3.4s}, [%[b]], #32 \n" + /* load a2, a3 */ + "ld1 {v18.4s, v19.4s}, [%[a]], #32 \n" + "fmla v8.4s, v16.4s, v0.s[0] \n" + "fmla v9.4s, v16.4s, v1.s[0] \n" + "fmla v10.4s, v16.4s, v2.s[0] \n" + "fmla v11.4s, v16.4s, v3.s[0] \n" + "prfm pldl1keep, [%[b]] \n" + "fmla v8.4s, v17.4s, v0.s[1] \n" + "fmla v9.4s, v17.4s, v1.s[1] \n" + "fmla v10.4s, v17.4s, v2.s[1] \n" + "fmla v11.4s, v17.4s, v3.s[1] \n" + /* load b4, b5 */ + "ld1 {v4.4s, v5.4s}, [%[b]], #32 \n" + "fmla v8.4s, v18.4s, v0.s[2] \n" + "fmla v9.4s, v18.4s, v1.s[2] \n" + "fmla v10.4s, v18.4s, v2.s[2] \n" + "fmla v11.4s, v18.4s, v3.s[2] \n" + /* load b6, b7 */ + "ld1 {v6.4s, v7.4s}, [%[b]], #32 \n" + "fmla v8.4s, v19.4s, v0.s[3] \n" + "fmla v9.4s, v19.4s, v1.s[3] \n" + "fmla v10.4s, v19.4s, v2.s[3] \n" + "fmla v11.4s, v19.4s, v3.s[3] \n" + "sub %[b], %[b], #128 \n" + "fmla v12.4s, v16.4s, v4.s[0] \n" + "fmla v13.4s, v16.4s, v5.s[0] \n" + "fmla v14.4s, v16.4s, v6.s[0] \n" + "fmla v15.4s, v16.4s, v7.s[0] \n" + "add %[b], %[b], %[ldb] \n" + "fmla v12.4s, v17.4s, v4.s[1] \n" + "fmla v13.4s, v17.4s, v5.s[1] \n" + "fmla v14.4s, v17.4s, v6.s[1] \n" + "fmla v15.4s, v17.4s, v7.s[1] \n" + /* load a0, a1 */ + "ld1 {v16.4s, v17.4s}, [%[a]], #32 \n" + "fmla v12.4s, v18.4s, v4.s[2] \n" + "fmla v13.4s, v18.4s, v5.s[2] \n" + "fmla v14.4s, v18.4s, v6.s[2] \n" + "fmla v15.4s, v18.4s, v7.s[2] \n" + /* load b0, b1 */ + "ld1 {v0.4s, v1.4s}, [%[b]], #32 \n" + "fmla v12.4s, v19.4s, v4.s[3] \n" + "fmla v13.4s, v19.4s, v5.s[3] \n" + "fmla v14.4s, v19.4s, v6.s[3] \n" + "fmla v15.4s, v19.4s, v7.s[3] \n" + "subs %w[cnt], %w[cnt], #1 \n" + "bne 1b \n" + "cbz %w[relu], 2f \n" + "fmax v8.4s, v8.4s, %[vzero].4s \n" + "fmax v9.4s, v9.4s, %[vzero].4s \n" + "fmax v10.4s, v10.4s, %[vzero].4s \n" + 
"fmax v11.4s, v11.4s, %[vzero].4s \n" + "fmax v12.4s, v12.4s, %[vzero].4s \n" + "fmax v13.4s, v13.4s, %[vzero].4s \n" + "fmax v14.4s, v14.4s, %[vzero].4s \n" + "fmax v15.4s, v15.4s, %[vzero].4s \n" + "2:\n" + "st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [%[c]], #64 \n" + "st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [%[c]], #64 \n" + : [a] "+r" (a_ptr), + [b] "+r" (b_ptr), + [c] "+r" (C), + [cnt] "+r" (cnt) + : [relu] "r" (has_relu), + [ldb] "r" (ldb_byte), + [vbias] "w" (vbias), + [vzero] "w" (vzero) + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", + "v19", "cc", "memory" + ); + b += 4 * 8; + } + for (; n > 3; n -= 4) { + int cnt = kcnt; + const float* a_ptr = A_packed; + const float* b_ptr = b; + asm volatile( + /* load a0, a1 */ + "ld1 {v16.4s, v17.4s}, [%[a]], #32 \n" + /* mov bias to c0-c3*/ + "mov v8.16b, %[vbias].16b \n" + "mov v9.16b, %[vbias].16b \n" + "mov v10.16b, %[vbias].16b \n" + "mov v11.16b, %[vbias].16b \n" + "1:\n" + /* load b0-b3 */ + "ld1 {v0.4s, v1.4s}, [%[b]], #32 \n" + "ld1 {v2.4s, v3.4s}, [%[b]], #32 \n" + /* load a2, a3 */ + "ld1 {v18.4s, v19.4s}, [%[a]], #32 \n" + "fmla v8.4s, v16.4s, v0.s[0] \n" + "fmla v9.4s, v16.4s, v1.s[0] \n" + "fmla v10.4s, v16.4s, v2.s[0] \n" + "fmla v11.4s, v16.4s, v3.s[0] \n" + "sub %[b], %[b], #64 \n" + "fmla v8.4s, v17.4s, v0.s[1] \n" + "fmla v9.4s, v17.4s, v1.s[1] \n" + "fmla v10.4s, v17.4s, v2.s[1] \n" + "fmla v11.4s, v17.4s, v3.s[1] \n" + "add %[b], %[b], %[ldb] \n" + "fmla v8.4s, v18.4s, v0.s[2] \n" + "fmla v9.4s, v18.4s, v1.s[2] \n" + "fmla v10.4s, v18.4s, v2.s[2] \n" + "fmla v11.4s, v18.4s, v3.s[2] \n" + /* load a0, a1 */ + "ld1 {v16.4s, v17.4s}, [%[a]], #32 \n" + "fmla v8.4s, v19.4s, v0.s[3] \n" + "fmla v9.4s, v19.4s, v1.s[3] \n" + "fmla v10.4s, v19.4s, v2.s[3] \n" + "fmla v11.4s, v19.4s, v3.s[3] \n" + "subs %w[cnt], %w[cnt], #1 \n" + "bne 1b \n" + "cbz %w[relu], 2f \n" + "fmax v8.4s, v8.4s, %[vzero].4s \n" + "fmax v9.4s, v9.4s, %[vzero].4s \n" + "fmax v10.4s, v10.4s, %[vzero].4s \n" + "fmax v11.4s, v11.4s, %[vzero].4s \n" + "2:\n" + "st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [%[c]], #64 \n" + : [a] "+r" (a_ptr), + [b] "+r" (b_ptr), + [c] "+r" (C), + [cnt] "+r" (cnt) + : [relu] "r" (has_relu), + [ldb] "r" (ldb_byte), + [vbias] "w" (vbias), + [vzero] "w" (vzero) + : "v0", "v1", "v2", "v3", "v8", "v9", + "v10", "v11", "v16", "v17", "v18", + "v19", "cc", "memory" + ); + b += 4 * 4; + } + for (; n > 0; n--) { + int cnt = kcnt; + const float* a_ptr = A_packed; + const float* b_ptr = b; + asm volatile( + /* load a0, a1 */ + "ld1 {v16.4s, v17.4s}, [%[a]], #32 \n" + /* mov bias to c0 */ + "mov v8.16b, %[vbias].16b \n" + "mov v9.16b, %[vzero].16b \n" + "1:\n" + /* load b0 */ + "ld1 {v0.4s}, [%[b]], #16 \n" + /* load a2, a3 */ + "ld1 {v18.4s, v19.4s}, [%[a]], #32 \n" + "fmla v8.4s, v16.4s, v0.s[0] \n" + "fmla v9.4s, v17.4s, v0.s[1] \n" + "sub %[b], %[b], #16 \n" + "subs %w[cnt], %w[cnt], #1 \n" + "add %[b], %[b], %[ldb] \n" + "fmla v8.4s, v18.4s, v0.s[2] \n" + "fmla v9.4s, v19.4s, v0.s[3] \n" + /* load a0, a1 */ + "ld1 {v16.4s, v17.4s}, [%[a]], #32 \n" + "bne 1b \n" + "fadd v8.4s, v8.4s, v9.4s \n" + "cbz %w[relu], 2f \n" + "fmax v8.4s, v8.4s, %[vzero].4s \n" + "2:\n" + "st1 {v8.4s}, [%[c]], #16 \n" + : [a] "+r" (a_ptr), + [b] "+r" (b_ptr), + [c] "+r" (C), + [cnt] "+r" (cnt) + : [relu] "r" (has_relu), + [ldb] "r" (ldb_byte), + [vbias] "w" (vbias), + [vzero] "w" (vzero) + : "v0", "v8", "v9", "v16", "v17", + "v18", "v19", "cc", "memory" + ); + b += 4; + } +#else + for (; n > 7; n -= 
8) { + int cnt = kcnt; + const float* a_ptr = A_packed; + const float* b_ptr = b; + // clang-format off + asm volatile( + "vld1.32 {d6-d7}, [%[bias]] \n" + /* load a0, a1 */ + "vld1.32 {d8-d11}, [%[a]]! \n" + /* mov bias to c0-c7*/ + "vmov.u32 q8, q3 \n" + "vmov.u32 q9, q3 \n" + "vmov.u32 q10, q3 \n" + "vmov.u32 q11, q3 \n" + /* load b0, b1 */ + "vld1.32 {d0-d3}, [%[b]]! \n" + "vmov.u32 q12, q3 \n" + "vmov.u32 q13, q3 \n" + "vmov.u32 q14, q3 \n" + "vmov.u32 q15, q3 \n" + "1:\n" + /* load b2, b3 */ + "vld1.32 {d4-d7}, [%[b]]! \n" + /* load a2, a3 */ + "vld1.32 {d12-d15}, [%[a]]! \n" + "vmla.f32 q8, q4, d0[0] \n" + "vmla.f32 q9, q4, d2[0] \n" + "vmla.f32 q10, q4, d4[0] \n" + "vmla.f32 q11, q4, d6[0] \n" + "pld [%[b]] \n" + "vmla.f32 q8, q5, d0[1] \n" + "vmla.f32 q9, q5, d2[1] \n" + "vmla.f32 q10, q5, d4[1] \n" + "vmla.f32 q11, q5, d6[1] \n" + "subs %[cnt], %[cnt], #1 \n" + "vmla.f32 q8, q6, d1[0] \n" + "vmla.f32 q9, q6, d3[0] \n" + "vmla.f32 q10, q6, d5[0] \n" + "vmla.f32 q11, q6, d7[0] \n" + "pld [%[b], #64] \n" + "vmla.f32 q8, q7, d1[1] \n" + "vmla.f32 q9, q7, d3[1] \n" + /* load b4, b5 */ + "vld1.32 {d0-d3}, [%[b]]! \n" + "vmla.f32 q10, q7, d5[1] \n" + "vmla.f32 q11, q7, d7[1] \n" + /* load b6, b7 */ + "vld1.32 {d4-d7}, [%[b]]! \n" + "vmla.f32 q12, q4, d0[0] \n" + "vmla.f32 q13, q4, d2[0] \n" + "vmla.f32 q14, q4, d4[0] \n" + "vmla.f32 q15, q4, d6[0] \n" + "sub %[b], %[b], #128 \n" + "vmla.f32 q12, q5, d0[1] \n" + "vmla.f32 q13, q5, d2[1] \n" + "vmla.f32 q14, q5, d4[1] \n" + "vmla.f32 q15, q5, d6[1] \n" + "add %[b], %[b], %[ldb] \n" + "vmla.f32 q12, q6, d1[0] \n" + "vmla.f32 q13, q6, d3[0] \n" + "vmla.f32 q14, q6, d5[0] \n" + "vmla.f32 q15, q6, d7[0] \n" + /* load a0, a1 */ + "vld1.32 {d8-d11}, [%[a]]! \n" + "vmla.f32 q12, q7, d1[1] \n" + "vmla.f32 q13, q7, d3[1] \n" + /* load b0, b1 */ + "vld1.32 {d0-d3}, [%[b]]! \n" + "vmla.f32 q14, q7, d5[1] \n" + "vmla.f32 q15, q7, d7[1] \n" + "bne 1b \n" + "cmp %[relu], #0 \n" + "beq 2f \n" + "vmov.u32 q0, #0 \n" + "vmax.f32 q8, q8, q0 \n" + "vmax.f32 q9, q9, q0 \n" + "vmax.f32 q10, q10, q0 \n" + "vmax.f32 q11, q11, q0 \n" + "vmax.f32 q12, q12, q0 \n" + "vmax.f32 q13, q13, q0 \n" + "vmax.f32 q14, q14, q0 \n" + "vmax.f32 q15, q15, q0 \n" + "2:\n" + "vst1.32 {d16-d19}, [%[c]]! \n" + "vst1.32 {d20-d23}, [%[c]]! \n" + "vst1.32 {d24-d27}, [%[c]]! \n" + "vst1.32 {d28-d31}, [%[c]]! \n" + : [a] "+r" (a_ptr), + [b] "+r" (b_ptr), + [c] "+r" (C), + [cnt] "+r" (cnt) + : [relu] "r" (has_relu), + [ldb] "r" (ldb_byte), + [bias] "r" (bias_ptr) + : "q0", "q1", "q2", "q3", "q4", "q5", + "q6", "q7", "q8", "q9", "q10", "q11", + "q12", "q13", "q14", "q15", "cc", "memory" + ); + b += 4 * 8; + } + for (; n > 3; n -= 4) { + int cnt = kcnt; + const float* a_ptr = A_packed; + const float* b_ptr = b; + asm volatile( + "vld1.32 {d24-d25}, [%[bias]] \n" + /* load a0, a1 */ + "vld1.32 {d8-d11}, [%[a]]! \n" + /* mov bias to c0-c3*/ + "vmov.u32 q8, q12 \n" + "vmov.u32 q9, q12 \n" + "vmov.u32 q10, q12 \n" + "vmov.u32 q11, q12 \n" + "vmov.u32 q13, #0 \n" + "1:\n" + /* load b0-b3 */ + "vld1.32 {d0-d3}, [%[b]]! \n" + "vld1.32 {d4-d7}, [%[b]]! 
\n" + /* load a2, a3 */ + "vld1.32 {d12-d15}, [%[a]]!\n" + "vmla.f32 q8, q4, d0[0] \n" + "vmla.f32 q9, q4, d2[0] \n" + "vmla.f32 q10, q4, d4[0] \n" + "vmla.f32 q11, q4, d6[0] \n" + "sub %[b], %[b], #64 \n" + "vmla.f32 q8, q5, d0[1] \n" + "vmla.f32 q9, q5, d2[1] \n" + "vmla.f32 q10, q5, d4[1] \n" + "vmla.f32 q11, q5, d6[1] \n" + "add %[b], %[b], %[ldb] \n" + "vmla.f32 q8, q6, d1[0] \n" + "vmla.f32 q9, q6, d3[0] \n" + "vmla.f32 q10, q6, d5[0] \n" + "vmla.f32 q11, q6, d7[0] \n" + /* load a0, a1 */ + "vld1.32 {d8-d11}, [%[a]]! \n" + "vmla.f32 q8, q7, d1[1] \n" + "vmla.f32 q9, q7, d3[1] \n" + "vmla.f32 q10, q7, d5[1] \n" + "vmla.f32 q11, q7, d7[1] \n" + "subs %[cnt], %[cnt], #1 \n" + "bne 1b \n" + "cmp %[relu], #0 \n" + "beq 2f \n" + "vmax.f32 q8, q8, q13 \n" + "vmax.f32 q9, q9, q13 \n" + "vmax.f32 q10, q10, q13 \n" + "vmax.f32 q11, q11, q13 \n" + "2:\n" + "vst1.32 {d16-d19}, [%[c]]!\n" + "vst1.32 {d20-d23}, [%[c]]!\n" + : [a] "+r" (a_ptr), + [b] "+r" (b_ptr), + [c] "+r" (C), + [cnt] "+r" (cnt) + : [relu] "r" (has_relu), + [ldb] "r" (ldb_byte), + [bias] "r" (bias_ptr) + : "q0", "q1", "q2", "q3", "q4", "q5", + "q6", "q7", "q8", "q9", "q10", "q11", + "q12", "q13", "cc", "memory" + ); + b += 4 * 4; + } + for (; n > 0; n--) { + int cnt = kcnt; + const float* a_ptr = A_packed; + const float* b_ptr = b; + asm volatile( + "vld1.32 {d14-d15}, [%[bias]] \n" + "vmov.u32 q8, #0 \n" + /* load a0, a1 */ + "vld1.32 {d2-d5}, [%[a]]! \n" + /* mov bias to c0 */ + "vmov.u32 q5, q7 \n" + "vmov.u32 q6, q8 \n" + "1:\n" + /* load b0 */ + "vld1.32 {d0-d1}, [%[b]]! \n" + /* load a2, a3 */ + "vld1.32 {d6-d9}, [%[a]]! \n" + "vmla.f32 q5, q1, d0[0] \n" + "vmla.f32 q6, q2, d0[1] \n" + "sub %[b], %[b], #16 \n" + "subs %[cnt], %[cnt], #1 \n" + "add %[b], %[b], %[ldb] \n" + "vmla.f32 q5, q3, d1[0] \n" + "vmla.f32 q6, q4, d1[1] \n" + /* load a0, a1 */ + "vld1.32 {d2-d5}, [%[a]]! \n" + "bne 1b \n" + "vadd.f32 q5, q5, q6 \n" + "cmp %[relu], #0 \n" + "beq 2f \n" + "vmax.f32 q5, q5, q8 \n" + "2:\n" + "vst1.32 {d10-d11}, [%[c]]!\n" + : [a] "+r" (a_ptr), + [b] "+r" (b_ptr), + [c] "+r" (C), + [cnt] "+r" (cnt) + : [relu] "r" (has_relu), + [ldb] "r" (ldb_byte), + [bias] "r" (bias_ptr) + : "q0", "q1", "q2", "q3", "q4", + "q5", "q6", "q7", "q8", "cc", "memory" + ); + // clang-format on + b += 4; + } +#endif + bias_ptr += 4; + A_packed += lda; + } +} + +void sgemm_prepack_c4(int M, + int N, + int K, + const float* A_packed, + const float* B, + float* C, + const float* bias, + bool has_bias, + bool has_relu, + ARMContext* ctx) { + if (N > 16) { + sgemm_prepack_c4_common( + M, N, K, A_packed, B, C, bias, has_bias, has_relu, ctx); + } else { + sgemm_prepack_c4_small( + M, N, K, A_packed, B, C, bias, has_bias, has_relu, ctx); + } +} + +} // namespace math +} // namespace arm +} // namespace lite +} // namespace paddle diff --git a/lite/backends/arm/math/packed_sgemm_c4.h b/lite/backends/arm/math/packed_sgemm_c4.h new file mode 100644 index 0000000000000000000000000000000000000000..21e5af634315a7da66914bb04775088fec55550c --- /dev/null +++ b/lite/backends/arm/math/packed_sgemm_c4.h @@ -0,0 +1,53 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "lite/core/context.h" +#include "lite/core/tensor.h" + +namespace paddle { +namespace lite { +namespace arm { +namespace math { + +constexpr int MBLOCK_C4 = 4; +constexpr int NBLOCK_C4 = 8; +constexpr int KBLOCK_C4 = 4; + +void sgemm_prepack_c4(int M, + int N, + int K, + const float* A_packed, + const float* B, + float* C, + const float* bias, + bool has_bias, + bool has_relu, + ARMContext* ctx); +void sgemm_prepack_c4_small(int M, + int N, + int K, + const float* A_packed, + const float* B, + float* C, + const float* bias, + bool has_bias, + bool has_relu, + ARMContext* ctx); +} // namespace math +} // namespace arm +} // namespace lite +} // namespace paddle diff --git a/lite/backends/arm/math/pooling.cc b/lite/backends/arm/math/pooling.cc index a857e9830c54b568c93afa4c1aa119ed2baffa1e..8524d7376f2bb7e337dfc11b890c00e281d2e880 100644 --- a/lite/backends/arm/math/pooling.cc +++ b/lite/backends/arm/math/pooling.cc @@ -46,7 +46,7 @@ void pooling_basic(const float* din, int stride_h = strides[0]; int stride_w = strides[1]; int pad_h = paddings[0]; - int pad_w = paddings[1]; + int pad_w = paddings[2]; int size_channel_in = win * hin; int size_channel_out = wout * hout; if (global_pooling) { @@ -125,18 +125,22 @@ void pooling_basic(const float* din, int bh = kernel_h; int bw = kernel_w; if (ew == win) { - bw = sw + kernel_w >= win + pad_w ? win + pad_w - : sw + kernel_w; + bw = (sw + kernel_w) >= (win + paddings[3]) + ? (win + paddings[3]) + : (sw + kernel_w); bw -= sw; - if (sw - pad_w < 0 && sw + kernel_w > win + pad_w) { + if ((sw - pad_w) < 0 && + (sw + kernel_w) > (win + paddings[3])) { bw += pad_w; } } if (eh == hin) { - bh = sh + kernel_h >= hin + pad_h ? hin + pad_h - : sh + kernel_h; + bh = (sh + kernel_h) >= (hin + paddings[1]) + ? 
(hin + paddings[1]) + : (sh + kernel_h); bh -= sh; - if (sh - pad_h < 0 && sh + kernel_h > hin + pad_h) { + if ((sh - pad_h) < 0 && + (sh + kernel_h) > (hin + paddings[1])) { bh += pad_h; } } diff --git a/lite/backends/arm/math/sgemv.cc b/lite/backends/arm/math/sgemv.cc index 506451932dcccd98368a050484a38bc8a922eb22..1830423136cc883d30d4eecad0eb9fcfc9ded6ba 100644 --- a/lite/backends/arm/math/sgemv.cc +++ b/lite/backends/arm/math/sgemv.cc @@ -14,6 +14,7 @@ #include "lite/backends/arm/math/sgemv.h" #include +#include #include "lite/utils/cp_logging.h" namespace paddle { @@ -50,6 +51,495 @@ void sgemv_bias_relu(const bool transA, const float *x, float *y, const float *bias); +#ifdef __aarch64__ +void sgemv_trans(const int M, + const int N, + const float *A, + const float *x, + float *y, + bool flag_bias, + const float *bias, + bool flag_relu, + const ARMContext *ctx) { + int m_cnt16 = M >> 4; + int m_cnt8 = (M & 15) >> 3; + int m_cnt4 = (M & 15 & 7) >> 2; + int m_remain = M & 15 & 7 & 3; + int ths = ctx->threads(); + int valid_ths = std::min((N + 3) / 4, ths); + int valid_block = std::max(4, (N / valid_ths + 3) / 4 * 4); + valid_ths = (N + valid_block - 1) / valid_block; + int block_cnt = valid_block / 4; + float zero_buf[M]; // NOLINT + float y_buf[valid_ths * M]; // NOLINT + memset(zero_buf, 0, M * sizeof(float)); + if (flag_bias) { + memcpy(y_buf, bias, M * sizeof(float)); + memset(y_buf + M, 0, (valid_ths - 1) * M * sizeof(float)); + } else { + memset(y_buf, 0, valid_ths * M * sizeof(float)); + } +#pragma omp parallel for + for (int t = 0; t < valid_ths; ++t) { + float *block_y = y_buf + t * M; + const float *block_x = x + t * valid_block; + const float *block_A = A + t * valid_block * M; + for (int i = 0; i < block_cnt; ++i) { + float *y_ptr = block_y; + const float *x_ptr = block_x + i * 4; + const float *in0_ptr = block_A + i * 4 * M; + const float *in1_ptr = in0_ptr + M; + const float *in2_ptr = in1_ptr + M; + const float *in3_ptr = in2_ptr + M; + int offset = t * valid_block + (i + 1) * 4 - N; + if (offset > 0) { + if (offset > 3) { + in0_ptr = zero_buf; + in1_ptr = zero_buf; + in2_ptr = zero_buf; + in3_ptr = zero_buf; + } else { + switch (offset) { + case 3: + in1_ptr = zero_buf; + case 2: + in2_ptr = zero_buf; + case 1: + in3_ptr = zero_buf; + default: + break; + } + } + } + // clang-format off + if (m_cnt16 > 0) { + int cnt16 = m_cnt16; + asm volatile( + "ld1 {v4.4s}, [%[x]] \n" /* load x to v4 */ + "ld1 {v5.4s, v6.4s, v7.4s, v8.4s}, [%[in0]], #64 \n" /* load in0 to v5, v6, v7, v8 */ + "ld1 {v9.4s, v10.4s, v11.4s, v12.4s}, [%[in1]], #64 \n" /* load in1 to v9, v10, v11, v12 */ + "ld1 {v13.4s, v14.4s, v15.4s, v16.4s}, [%[in2]], #64 \n" /* load in2 to v13, v14, v15, v16 */ + "ld1 {v17.4s, v18.4s, v19.4s, v20.4s}, [%[in3]], #64 \n" /* load in3 to v17, v18, v19, v20 */ + "1:\n" + "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%[y]] \n" /*load y to v0, v1, v2, v3 */ + "fmla v0.4s, v5.4s, v4.s[0] \n" /* v0 += v5 * v4[0] */ + "fmla v1.4s, v6.4s, v4.s[0] \n" /* v1 += v6 * v4[0] */ + "fmla v2.4s, v7.4s, v4.s[0] \n" /* v2 += v7 * v4[0] */ + "fmla v3.4s, v8.4s, v4.s[0] \n" /* v3 += v8 * v4[0] */ + "ld1 {v5.4s, v6.4s, v7.4s, v8.4s}, [%[in0]], #64 \n" /* load in0 to v5, v6, v7, v8 */ + "fmla v0.4s, v9.4s, v4.s[1] \n" /* v0 += v9 * v4[1] */ + "fmla v1.4s, v10.4s, v4.s[1] \n" /* v1 += v10 * v4[1] */ + "fmla v2.4s, v11.4s, v4.s[1] \n" /* v2 += v11 * v4[1] */ + "fmla v3.4s, v12.4s, v4.s[1] \n" /* v3 += v12 * v4[1] */ + "ld1 {v9.4s, v10.4s, v11.4s, v12.4s}, [%[in1]], #64 \n" /* load in1 to v9, v10, v11, v12 
*/ + "fmla v0.4s, v13.4s, v4.s[2] \n" /* v0 += v13 * v4[2] */ + "fmla v1.4s, v14.4s, v4.s[2] \n" /* v1 += v14 * v4[2] */ + "fmla v2.4s, v15.4s, v4.s[2] \n" /* v2 += v15 * v4[2] */ + "fmla v3.4s, v16.4s, v4.s[2] \n" /* v3 += v16 * v4[2] */ + "ld1 {v13.4s, v14.4s, v15.4s, v16.4s}, [%[in2]], #64 \n" /* load in2 to v13, v14, v15, v16 */ + "fmla v0.4s, v17.4s, v4.s[3] \n" /* v0 += v17 * v4[3] */ + "fmla v1.4s, v18.4s, v4.s[3] \n" /* v1 += v18 * v4[3] */ + "fmla v2.4s, v19.4s, v4.s[3] \n" /* v2 += v19 * v4[3] */ + "fmla v3.4s, v20.4s, v4.s[3] \n" /* v3 += v20 * v4[3] */ + "ld1 {v17.4s, v18.4s, v19.4s, v20.4s}, [%[in3]], #64 \n" /* load in3 to v17, v18, v19, v20 */ + "subs %w[cnt], %w[cnt], #1 \n" /* sub cnt */ + "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%[y]], #64 \n" /* store v0, v1, v2, v3 to y */ + "bne 1b \n" /* branch to label 1 */ + "sub %[in0], %[in0], #64 \n" /* restore in0 address */ + "sub %[in1], %[in1], #64 \n" /* restore in1 address */ + "sub %[in2], %[in2], #64 \n" /* restore in2 address */ + "sub %[in3], %[in3], #64 \n" /* restore in3 address */ + : [cnt] "+r"(cnt16), + [in0] "+r"(in0_ptr), + [in1] "+r"(in1_ptr), + [in2] "+r"(in2_ptr), + [in3] "+r"(in3_ptr), + [y] "+r"(y_ptr) + : [x] "r"(x_ptr) + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", + "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", + "v17", "v18", "v19", "v20", "cc", "memory" + ); + } + if (m_cnt8 > 0) { + int cnt8 = m_cnt8; + asm volatile( + "ld1 {v2.4s}, [%[x]] \n" /* load x to v2 */ + "ld1 {v3.4s, v4.4s}, [%[in0]], #32 \n" /* load in0 to v3, v4 */ + "ld1 {v5.4s, v6.4s}, [%[in1]], #32 \n" /* load in1 to v5, v6 */ + "ld1 {v7.4s, v8.4s}, [%[in2]], #32 \n" /* load in2 to v7, v8 */ + "ld1 {v9.4s, v10.4s}, [%[in3]], #32 \n" /* load in3 to v9, v10*/ + "1:\n" + "ld1 {v0.4s, v1.4s}, [%[y]] \n" /* load y to v0, v1 */ + "fmla v0.4s, v3.4s, v2.s[0] \n" /* v0 += v3 * v2[0] */ + "fmla v1.4s, v4.4s, v2.s[0] \n" /* v1 += v4 * v2[0] */ + "prfm pldl1keep, [%[in0]] \n" /* preload in0 */ + "ld1 {v3.4s, v4.4s}, [%[in0]], #32 \n" /* load in0 to v3, v4 */ + "fmla v0.4s, v5.4s, v2.s[1] \n" /* v0 += v5 * v2[1] */ + "fmla v1.4s, v6.4s, v2.s[1] \n" /* v1 += v6 * v2[1] */ + "prfm pldl1keep, [%[in1]] \n" /* preload in1 */ + "ld1 {v5.4s, v6.4s}, [%[in1]], #32 \n" /* load in0 to v5, v6 */ + "fmla v0.4s, v7.4s, v2.s[2] \n" /* v0 += v7 * v2[2] */ + "fmla v1.4s, v8.4s, v2.s[2] \n" /* v1 += v8 * v2[2] */ + "prfm pldl1keep, [%[in2]] \n" /* preload in2 */ + "ld1 {v7.4s, v8.4s}, [%[in2]], #32 \n" /* load in0 to v7, v8 */ + "fmla v0.4s, v9.4s, v2.s[3] \n" /* v0 += v9 * v2[3] */ + "fmla v1.4s, v10.4s, v2.s[3] \n" /* v1 += v10 * v2[3] */ + "subs %w[cnt], %w[cnt], #1 \n" /* sub cnt */ + "prfm pldl1keep, [%[in3]] \n" /* preload in3 */ + "st1 {v0.4s, v1.4s}, [%[y]], #32 \n" /* store v0, v1 to y */ + "ld1 {v9.4s, v10.4s},[%[in3]], #32 \n" /* load in0 to v9, v10*/ + "bne 1b \n" /* branch to label 1 */ + "sub %[in0], %[in0], #32 \n" /* restore in0 address */ + "sub %[in1], %[in1], #32 \n" /* restore in1 address */ + "sub %[in2], %[in2], #32 \n" /* restore in2 address */ + "sub %[in3], %[in3], #32 \n" /* restore in3 address */ + : [cnt] "+r"(cnt8), + [in0] "+r"(in0_ptr), + [in1] "+r"(in1_ptr), + [in2] "+r"(in2_ptr), + [in3] "+r"(in3_ptr), + [y] "+r"(y_ptr) + : [x] "r"(x_ptr) + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", + "v7", "v8", "v9", "v10", "cc", "memory" + ); + } + if (m_cnt4 > 0) { + int cnt4 = m_cnt4; + asm volatile( + "ld1 {v1.4s}, [%[in0]], #16 \n" /* load in0 to v1 */ + "ld1 {v2.4s}, [%[in1]], #16 \n" /* load in1 to v2 */ + "ld1 {v3.4s}, 
[%[in2]], #16 \n" /* load in2 to v3 */ + "ld1 {v4.4s}, [%[in3]], #16 \n" /* load in3 to v4 */ + "ld1 {v5.4s}, [%[x]] \n" /* load x to v5 */ + "1:\n" + "ld1 {v0.4s}, [%[y]] \n" /* load y to v0 */ + "fmla v0.4s, v1.4s, v5.s[0] \n" /* v0 += v1 * v5[0] */ + "prfm pldl1keep, [%[in0]] \n" /* preload in0 */ + "ld1 {v1.4s}, [%[in0]], #16 \n" /* load in0 to v1 */ + "fmla v0.4s, v2.4s, v5.s[1] \n" /* v0 += v2 * v5[1] */ + "prfm pldl1keep, [%[in1]] \n" /* preload in1 */ + "ld1 {v2.4s}, [%[in1]], #16 \n" /* load in1 to v2 */ + "fmla v0.4s, v3.4s, v5.s[2] \n" /* v0 += v3 * v5[2] */ + "prfm pldl1keep, [%[in2]] \n" /* preload in2 */ + "ld1 {v3.4s}, [%[in2]], #16 \n" /* load in2 to v3 */ + "fmla v0.4s, v4.4s, v5.s[3] \n" /* v0 += v4 * v5[3] */ + "subs %w[cnt], %w[cnt], #1 \n" /* sub cnt */ + "prfm pldl1keep, [%[in3]] \n" /* preload in3 */ + "st1 {v0.4s}, [%[y]], #16 \n" /* store v0 to y */ + "ld1 {v4.4s}, [%[in3]], #16 \n" /* load in3 to v4 */ + "bne 1b \n" /* branch to label 1 */ + "sub %[in0], %[in0], #16 \n" /* restore in0 address*/ + "sub %[in1], %[in1], #16 \n" /* restore in1 address*/ + "sub %[in2], %[in2], #16 \n" /* restore in2 address*/ + "sub %[in3], %[in3], #16 \n" /* restore in3 address*/ + : [cnt] "+r"(cnt4), + [in0] "+r"(in0_ptr), + [in1] "+r"(in1_ptr), + [in2] "+r"(in2_ptr), + [in3] "+r"(in3_ptr), + [y] "+r"(y_ptr) + : [x] "r"(x_ptr) + : "v0", "v1", "v2", "v3", "v4", "v5", "cc", "memory" + ); + } + // clang-format on + for (int r = 0; r < m_remain; ++r) { + float val0 = x_ptr[0] * in0_ptr[r]; + float val1 = x_ptr[1] * in1_ptr[r]; + float val2 = x_ptr[2] * in2_ptr[r]; + float val3 = x_ptr[3] * in3_ptr[r]; + y_ptr[r] += val0 + val1 + val2 + val3; + } + } + } + int cnt4 = M >> 2; + int remain = M & 3; + //! do reduction + int rdc_ths = valid_ths >> 1; + while (rdc_ths > 0) { +#pragma omp parallel for + for (int t = 0; t < rdc_ths; ++t) { + float *y0 = y_buf + t * M; + for (int i = t + rdc_ths; i < valid_ths; i += rdc_ths) { + float *y0_ptr = y0; + float *y_ptr = y_buf + i * M; + for (int j = 0; j < cnt4; ++j) { + float32x4_t val0 = vld1q_f32(y0_ptr + j * 4); + float32x4_t val1 = vld1q_f32(y_ptr + j * 4); + float32x4_t val = vaddq_f32(val0, val1); + vst1q_f32(y0_ptr + j * 4, val); + } + y0_ptr += cnt4 * 4; + y_ptr += cnt4 * 4; + for (int j = 0; j < remain; ++j) { + y0_ptr[j] += y_ptr[j]; + } + } + } + valid_ths = rdc_ths; + rdc_ths = rdc_ths >> 1; + } + if (flag_relu) { + float *in_y = y_buf; + float32x4_t vzero = vdupq_n_f32(0.f); + if (cnt4 > 0) { + int cnt = cnt4; + asm volatile( + "ld1 {v0.4s}, [%[in_y]], #16 \n" /* load y to v0 */ + "1:\n" + "fmax v1.4s, v0.4s, %[vzero].4s \n" /* v0 relu */ + "ld1 {v0.4s}, [%[in_y]], #16 \n" /* load y to v0 */ + "subs %w[cnt], %w[cnt], #1 \n" /* sub cnt */ + "st1 {v1.4s}, [%[out_y]], #16 \n" /* store v1 to y */ + "bne 1b \n" /* branch to label 1*/ + "sub %[in_y], %[in_y], #16 \n" /* restore in_y */ + : [cnt] "+r"(cnt), [in_y] "+r"(in_y), [out_y] "+r"(y) + : [vzero] "w"(vzero) + : "v0", "v1", "cc", "memory"); + } + for (int r = 0; r < remain; ++r) { + y[r] = in_y[r] > 0.f ? 
in_y[r] : 0.f; + } + } else { + memcpy(y, y_buf, M * sizeof(float)); + } +} +#else +void sgemv_trans(const int M, + const int N, + const float *A, + const float *x, + float *y, + bool flag_bias, + const float *bias, + bool flag_relu, + const ARMContext *ctx) { + int m_cnt8 = M >> 3; + int m_cnt4 = (M & 7) >> 2; + int m_remain = M & 7 & 3; + int ths = ctx->threads(); + int valid_ths = std::min((N + 3) / 4, ths); + int valid_block = std::max(4, (N / valid_ths + 3) / 4 * 4); + valid_ths = (N + valid_block - 1) / valid_block; + int block_cnt = valid_block / 4; + float zero_buf[M]; // NOLINT + float y_buf[valid_ths * M]; // NOLINT + memset(zero_buf, 0, M * sizeof(float)); + if (flag_bias) { + memcpy(y_buf, bias, M * sizeof(float)); + memset(y_buf + M, 0, (valid_ths - 1) * M * sizeof(float)); + } else { + memset(y_buf, 0, valid_ths * M * sizeof(float)); + } +#pragma omp parallel for + for (int t = 0; t < valid_ths; ++t) { + float *block_y = y_buf + t * M; + const float *block_x = x + t * valid_block; + const float *block_A = A + t * valid_block * M; + for (int i = 0; i < block_cnt; ++i) { + float *y_ptr = block_y; + const float *x_ptr = block_x + i * 4; + const float *in0_ptr = block_A + i * 4 * M; + const float *in1_ptr = in0_ptr + M; + const float *in2_ptr = in1_ptr + M; + const float *in3_ptr = in2_ptr + M; + int offset = t * valid_block + (i + 1) * 4 - N; + if (offset > 0) { + if (offset > 3) { + in0_ptr = zero_buf; + in1_ptr = zero_buf; + in2_ptr = zero_buf; + in3_ptr = zero_buf; + } else { + switch (offset) { + case 3: + in1_ptr = zero_buf; + case 2: + in2_ptr = zero_buf; + case 1: + in3_ptr = zero_buf; + default: + break; + } + } + } + // clang-format off + if (m_cnt8 > 0) { + int cnt8 = m_cnt8; + asm volatile( + "vld1.32 {d4-d5}, [%[x]] \n" /* load x to q2 */ + "vld1.32 {d6-d9}, [%[in0]]! \n" /* load in0 to q3, q4 */ + "vld1.32 {d10-d13},[%[in1]]! \n" /* load in1 to q5, q6 */ + "vld1.32 {d14-d17},[%[in2]]! \n" /* load in2 to q7, q8 */ + "vld1.32 {d18-d21},[%[in3]]! \n" /* load in3 to q9, q10*/ + "1:\n" + "vld1.32 {d0-d3}, [%[y]] \n" /* load y to q0, q1 */ + "vmla.f32 q0, q3, d4[0] \n" /* q0 += q3 * q2[0] */ + "vmla.f32 q1, q4, d4[0] \n" /* q1 += q4 * q2[0] */ + "pld [%[in0]] \n" /* preload in0 */ + "vld1.32 {d6-d9}, [%[in0]]! \n" /* load in0 to q3, q4 */ + "vmla.f32 q0, q5, d4[1] \n" /* q0 += q5 * q2[1] */ + "vmla.f32 q1, q6, d4[1] \n" /* q1 += q6 * q2[1] */ + "pld [%[in1]] \n" /* preload in1 */ + "vld1.32 {d10-d13},[%[in1]]! \n" /* load in0 to q5, q6 */ + "vmla.f32 q0, q7, d5[0] \n" /* q0 += q7 * q2[2] */ + "vmla.f32 q1, q8, d5[0] \n" /* q1 += q8 * q2[2] */ + "pld [%[in2]] \n" /* preload in2 */ + "vld1.32 {d14-d17},[%[in2]]! \n" /* load in0 to q7, q8 */ + "vmla.f32 q0, q9, d5[1] \n" /* q0 += q9 * q2[3] */ + "vmla.f32 q1, q10, d5[1] \n" /* q1 += q10 * q2[3] */ + "subs %[cnt], %[cnt], #1 \n" /* sub cnt */ + "pld [%[in3]] \n" /* preload in3 */ + "vst1.32 {d0-d3}, [%[y]]! \n" /* store q0, q1 to y */ + "vld1.32 {d18-d21},[%[in3]]! 
\n" /* load in0 to q9, q10*/ + "pld [%[y], #32] \n" /* preload y */ + "bne 1b \n" /* branch to label 1 */ + "sub %[in0], %[in0], #32 \n" /* restore in0 address */ + "sub %[in1], %[in1], #32 \n" /* restore in1 address */ + "sub %[in2], %[in2], #32 \n" /* restore in2 address */ + "sub %[in3], %[in3], #32 \n" /* restore in3 address */ + : [cnt] "+r"(cnt8), + [in0] "+r"(in0_ptr), + [in1] "+r"(in1_ptr), + [in2] "+r"(in2_ptr), + [in3] "+r"(in3_ptr), + [y] "+r"(y_ptr) + : [x] "r"(x_ptr) + : "q0", "q1", "q2", "q3", "q4", "q5", "q6", + "q7", "q8", "q9", "q10", "cc", "memory" + ); + } + if (m_cnt4 > 0) { + int cnt4 = m_cnt4; + asm volatile( + "vld1.32 {d2-d3}, [%[in0]]! \n" /* load in0 to q1 */ + "vld1.32 {d4-d5}, [%[in1]]! \n" /* load in1 to q2 */ + "vld1.32 {d6-d7}, [%[in2]]! \n" /* load in2 to q3 */ + "vld1.32 {d8-d9}, [%[in3]]! \n" /* load in3 to q4 */ + "vld1.32 {d10-d11},[%[x]] \n" /* load x to q5 */ + "1:\n" + "vld1.32 {d0-d1}, [%[y]] \n" /* load y to q0 */ + "vmla.f32 q0, q1, d10[0] \n" /* q0 += q1 * q5[0] */ + "pld [%[in0]] \n" /* preload in0 */ + "vld1.32 {d2-d3}, [%[in0]]! \n" /* load in0 to q1 */ + "vmla.f32 q0, q2, d10[1] \n" /* q0 += q2 * q5[1] */ + "pld [%[in1]] \n" /* preload in1 */ + "vld1.32 {d4-d5}, [%[in1]]! \n" /* load in0 to q2 */ + "vmla.f32 q0, q3, d11[0] \n" /* q0 += q3 * q5[2] */ + "pld [%[in2]] \n" /* preload in2 */ + "vld1.32 {d6-d7}, [%[in2]]! \n" /* load in0 to q3 */ + "vmla.f32 q0, q4, d11[1] \n" /* q0 += q4 * q5[3] */ + "subs %[cnt], %[cnt], #1 \n" /* sub cnt */ + "pld [%[in3]] \n" /* preload in3 */ + "vst1.32 {d0-d1}, [%[y]]! \n" /* store q0 to y */ + "vld1.32 {d8-d9}, [%[in3]]! \n" /* load in0 to q4 */ + "bne 1b \n" /* branch to label 1 */ + "sub %[in0], %[in0], #16 \n" /* restore in0 address*/ + "sub %[in1], %[in1], #16 \n" /* restore in1 address*/ + "sub %[in2], %[in2], #16 \n" /* restore in2 address*/ + "sub %[in3], %[in3], #16 \n" /* restore in3 address*/ + : [cnt] "+r"(cnt4), + [in0] "+r"(in0_ptr), + [in1] "+r"(in1_ptr), + [in2] "+r"(in2_ptr), + [in3] "+r"(in3_ptr), + [y] "+r"(y_ptr) + : [x] "r"(x_ptr) + : "q0", "q1", "q2", "q3", "q4", "q5", "cc", "memory" + ); + } + // clang-format on + for (int r = 0; r < m_remain; ++r) { + float val0 = x_ptr[0] * in0_ptr[r]; + float val1 = x_ptr[1] * in1_ptr[r]; + float val2 = x_ptr[2] * in2_ptr[r]; + float val3 = x_ptr[3] * in3_ptr[r]; + y_ptr[r] += val0 + val1 + val2 + val3; + } + } + } + //! 
do reduction + int rdc_ths = valid_ths >> 1; + while (rdc_ths > 0) { +#pragma omp parallel for + for (int t = 0; t < rdc_ths; ++t) { + float *y0 = y_buf + t * M; + for (int i = t + rdc_ths; i < valid_ths; i += rdc_ths) { + float *y0_ptr = y0; + float *y_ptr = y_buf + i * M; + for (int j = 0; j < m_cnt8; ++j) { + float32x4_t val00 = vld1q_f32(y0_ptr + j * 8); + float32x4_t val01 = vld1q_f32(y0_ptr + j * 8 + 4); + float32x4_t val10 = vld1q_f32(y_ptr + j * 8); + float32x4_t val11 = vld1q_f32(y_ptr + j * 8 + 4); + float32x4_t val0 = vaddq_f32(val00, val10); + float32x4_t val1 = vaddq_f32(val01, val11); + vst1q_f32(y0_ptr + j * 8, val0); + vst1q_f32(y0_ptr + j * 8 + 4, val1); + } + y0_ptr += m_cnt8 * 8; + y_ptr += m_cnt8 * 8; + for (int j = 0; j < m_cnt4; ++j) { + float32x4_t val0 = vld1q_f32(y0_ptr + j * 4); + float32x4_t val1 = vld1q_f32(y_ptr + j * 4); + float32x4_t val = vaddq_f32(val0, val1); + vst1q_f32(y0_ptr + j * 4, val); + } + y0_ptr += m_cnt4 * 4; + y_ptr += m_cnt4 * 4; + for (int j = 0; j < m_remain; ++j) { + y0_ptr[j] += y_ptr[j]; + } + } + } + valid_ths = rdc_ths; + rdc_ths = rdc_ths >> 1; + } + if (flag_relu) { + float *in_y = y_buf; + float32x4_t vzero = vdupq_n_f32(0.f); + if (m_cnt8 > 0) { + int cnt8 = m_cnt8; + asm volatile( + "vld1.32 {d0-d3}, [%[in_y]]! \n" /* load y to q0, q1 */ + "1:\n" + "vmax.f32 q2, q0, %q[vzero] \n" /* q0 relu */ + "vld1.32 {d0-d1}, [%[in_y]]! \n" /* load y to q0 */ + "vmax.f32 q3, q1, %q[vzero] \n" /* q1 relu */ + "subs %[cnt], %[cnt], #1 \n" /* sub cnt */ + "vst1.32 {d4-d7}, [%[out_y]]! \n" /* store q0, q1 to y*/ + "vld1.32 {d2-d3}, [%[in_y]]! \n" /* load y to q0 */ + "bne 1b \n" /* branch to label 1*/ + "sub %[in_y], %[in_y], #32 \n" /* restore in_y */ + : [cnt] "+r"(cnt8), [in_y] "+r"(in_y), [out_y] "+r"(y) + : [vzero] "w"(vzero) + : "q0", "q1", "q2", "q3", "cc", "memory"); + } + if (m_cnt4 > 0) { + int cnt4 = m_cnt4; + asm volatile( + "vld1.32 {d0-d1}, [%[in_y]]! \n" /* load y to q0 */ + "1:\n" + "vmax.f32 q1, q0, %q[vzero] \n" /* q0 relu */ + "vld1.32 {d0-d1}, [%[in_y]]! \n" /* load y to q0 */ + "subs %[cnt], %[cnt], #1 \n" /* sub cnt */ + "vst1.32 {d2-d3}, [%[out_y]]! \n" /* store q1 to y */ + "bne 1b \n" /* branch to label 1*/ + "sub %[in_y], %[in_y], #16 \n" /* restore in_y */ + : [cnt] "+r"(cnt4), [in_y] "+r"(in_y), [out_y] "+r"(y) + : [vzero] "w"(vzero) + : "q0", "q1", "cc", "memory"); + } + for (int r = 0; r < m_remain; ++r) { + y[r] = in_y[r] > 0.f ? in_y[r] : 0.f; + } + } else { + memcpy(y, y_buf, M * sizeof(float)); + } +} +#endif // __aarch64__ bool sgemv(const float *A, const float *x, @@ -59,33 +549,34 @@ bool sgemv(const float *A, int N, bool is_bias, const float *bias, - bool is_relu) { + bool is_relu, + const ARMContext *ctx) { if (transA) { - LOG(ERROR) << " sgemv, transA is not supported now"; - return false; - } - if (is_bias) { - //! with bias - if (is_relu) { - //! with relu - sgemv_bias_relu(transA, M, N, A, x, y, bias); - } else { - //! without relu - sgemv_bias(transA, M, N, A, x, y, bias); - } + sgemv_trans(M, N, A, x, y, is_bias, bias, is_relu, ctx); } else { - //! without bias - if (is_relu) { - //! with relu - sgemv_relu(transA, M, N, A, x, y); + if (is_bias) { + //! with bias + if (is_relu) { + //! with relu + sgemv_bias_relu(transA, M, N, A, x, y, bias); + } else { + //! without relu + sgemv_bias(transA, M, N, A, x, y, bias); + } } else { - //! without relu - sgemv(transA, M, N, A, x, y); + //! without bias + if (is_relu) { + //! with relu + sgemv_relu(transA, M, N, A, x, y); + } else { + //! 
without relu + sgemv(transA, M, N, A, x, y); + } } } return true; } - +// clang-format off //! define compute kernel #ifdef __aarch64__ #define SGEMV_IN_8 \ @@ -179,8 +670,8 @@ bool sgemv(const float *A, "fmla v5.4s, v9.4s, v21.4s \n" /* mul + add*/ \ "fmla v6.4s, v9.4s, v23.4s \n" /* mul + add*/ \ "fmla v7.4s, v9.4s, v25.4s \n" /* mul + add*/ \ - "bne 1b \n" /* jump to main loop */ /* pair add to final \ - result */ \ + "bne 1b \n" /* jump to main loop */ \ + /* pair add to final result */ \ "2: \n" /* reduce to scale */ \ "faddp v16.4s, v0.4s, v0.4s\n" /* pair add to vector */ \ "faddp s8, v16.2s \n" /* pair add to scale */ \ @@ -231,8 +722,8 @@ bool sgemv(const float *A, "fmla v0.4s, v8.4s, v10.4s \n" /* mul + add*/ \ "subs %w[cnt], %w[cnt], #1 \n" /* sub main loop count */ \ "fmla v1.4s, v9.4s, v11.4s \n" /* mul + add*/ \ - "bne 1b \n" /* jump to main loop */ /* pair add to final \ - result */ \ + "bne 1b \n" /* jump to main loop */ \ + /* pair add to final result */ \ "2: \n" /* reduce to scale */ \ "fadd v9.4s, v0.4s, v1.4s \n" /* add 2 vector */ \ "faddp v10.4s, v9.4s, v9.4s\n" /* pair add to vector */ \ @@ -283,7 +774,7 @@ bool sgemv(const float *A, "fmax s8, s8, s0 \n" /* relu */ \ "str s8, [%[out]] \n" /* save result */ -#else //__aarch64__ +#else // __aarch64__ #define SGEMV_IN_4 \ "pld [%[in]] @ preload cache line, input\n" \ @@ -349,8 +840,8 @@ bool sgemv(const float *A, "vmla.f32 q1, q5, q9 @ mul add\n" \ "vmla.f32 q2, q5, q11 @ mul add\n" \ "vmla.f32 q3, q5, q13 @ mul add\n" \ - "bne 1b @ jump to main loop\n" /* pair add to final \ - result */ \ + "bne 1b @ jump to main loop\n" \ + /* pair add to final result */ \ "2: @ pair add \n" \ "vpadd.f32 d8, d0, d1 @ pair add, first step\n" \ "vpadd.f32 d9, d2, d3 @ pair add, first step\n" \ @@ -382,13 +873,10 @@ bool sgemv(const float *A, "vmla.f32 q0, q12, q14 @ mul add\n" \ "vmla.f32 q0, q13, q15 @ mul add\n" \ "subs %[cnt] , #1 @ sub loop count \n" \ - "bne 1b @ jump to main loop\n" /* pair add to \ - final result \ - */ \ + "bne 1b @ jump to main loop\n" \ "2: @ end processing\n" \ "vpadd.f32 d2, d0, d1 @ pair add, first step\n" \ - "vpadd.f32 d0, d2, d2 @ pair add, final step\n" /* check tails \ - */ \ + "vpadd.f32 d0, d2, d2 @ pair add, final step\n"/*check tails*/ \ "cmp %[tail], #1 @ check whether has mid cols\n" \ "blt 4f @ jump to end\n" \ "3: @ tail loop\n" \ @@ -422,7 +910,7 @@ bool sgemv(const float *A, "vmax.f32 d0, d0, d1 @ relu\n" \ "vst1.32 {d0[0]}, [%[out]] @ save result\n" #endif - +// clang-format on void sgemv(const bool transA, const int M, const int N, @@ -523,7 +1011,7 @@ void sgemv(const bool transA, [tmp4] "r"(tmp4) : "v0", "v1", "v8", "v9", "v10", "v11", "v16", "v17", "cc", "memory"); } -#else //__aarch64__ +#else // __aarch64__ int out_cnt = M >> 2; #pragma omp parallel for for (int j = 0; j < out_cnt; j++) { @@ -579,7 +1067,7 @@ void sgemv(const bool transA, : [out] "r"(ptr_out) : "q0", "q1", "q12", "q13", "q14", "q15", "cc", "memory"); } -#endif //__aarch64__ +#endif // __aarch64__ } void sgemv_relu(const bool transA, @@ -671,7 +1159,7 @@ void sgemv_relu(const bool transA, : [out] "r"(ptr_out) : "v0", "v1", "v8", "v9", "v10", "v11", "v16", "v17", "cc", "memory"); } -#else //__aarch64__ +#else // __aarch64__ int out_cnt = M >> 2; #pragma omp parallel for for (int j = 0; j < out_cnt; j++) { @@ -727,7 +1215,7 @@ void sgemv_relu(const bool transA, : [out] "r"(ptr_out) : "q0", "q1", "q12", "q13", "q14", "q15", "cc", "memory"); } -#endif //__aarch64__ +#endif // __aarch64__ } void sgemv_bias(const bool 
transA, @@ -822,7 +1310,7 @@ void sgemv_bias(const bool transA, : [out] "r"(ptr_out), [bias0] "r"(bias0) : "v0", "v1", "v8", "v9", "v10", "v11", "v16", "v17", "cc", "memory"); } -#else //__aarch64__ +#else // __aarch64__ int out_cnt = M >> 2; #pragma omp parallel for for (int j = 0; j < out_cnt; j++) { @@ -887,7 +1375,7 @@ void sgemv_bias(const bool transA, : [out] "r"(ptr_out), [bias0] "r"(bias0) : "q0", "q1", "q12", "q13", "q14", "q15", "cc", "memory"); } -#endif //__aarch64__ +#endif // __aarch64__ } void sgemv_bias_relu(const bool transA, @@ -980,7 +1468,7 @@ void sgemv_bias_relu(const bool transA, : [out] "r"(ptr_out), [bias0] "r"(bias0) : "v0", "v1", "v8", "v9", "v10", "v11", "v16", "v17", "cc", "memory"); } -#else //__aarch64__ +#else // __aarch64__ int out_cnt = M >> 2; #pragma omp parallel for for (int j = 0; j < out_cnt; j++) { @@ -1045,7 +1533,7 @@ void sgemv_bias_relu(const bool transA, : [out] "r"(ptr_out), [bias0] "r"(bias0) : "q0", "q1", "q12", "q13", "q14", "q15", "cc", "memory"); } -#endif //__aarch64__ +#endif // __aarch64__ } } // namespace math diff --git a/lite/backends/arm/math/sgemv.h b/lite/backends/arm/math/sgemv.h index 4d74006f9320ee770bc4f57a52a58df3bce4db9e..aa17349c99e61f7135090318be829149ecd6bb57 100644 --- a/lite/backends/arm/math/sgemv.h +++ b/lite/backends/arm/math/sgemv.h @@ -15,6 +15,8 @@ #pragma once #include +#include "lite/core/context.h" +#include "lite/core/device_info.h" namespace paddle { namespace lite { @@ -28,9 +30,10 @@ bool sgemv(const float* A, bool transA, int M, int N, - bool is_bias = false, - const float* bias = nullptr, - bool is_relu = false); + bool is_bias, + const float* bias, + bool is_relu, + const ARMContext* ctx); } // namespace math } // namespace arm diff --git a/lite/backends/cuda/CMakeLists.txt b/lite/backends/cuda/CMakeLists.txt index a6c3fcc66a789f159cd3a756ed893627b393e1fe..f73b4120e6a48bfdec04d0706a47bcc4a54fcf5e 100644 --- a/lite/backends/cuda/CMakeLists.txt +++ b/lite/backends/cuda/CMakeLists.txt @@ -1,8 +1,7 @@ if(NOT LITE_WITH_CUDA) return() endif() -set(cuda_static_deps cudnn_static cublas_static curand_static - culibos_static cudart_static) +get_property(cuda_static_deps GLOBAL PROPERTY CUDA_STATIC_MODULES) nv_library(target_wrapper_cuda SRCS target_wrapper.cc DEPS ${cuda_static_deps}) nv_library(cuda_blas SRCS blas.cc DEPS ${cuda_static_deps}) diff --git a/lite/backends/cuda/cuda_utils.h b/lite/backends/cuda/cuda_utils.h index 13bf8190efe1592e7509039a569d31f6bddc5b66..9da70262f5b2e32ae8509d9370142b2499886bfb 100644 --- a/lite/backends/cuda/cuda_utils.h +++ b/lite/backends/cuda/cuda_utils.h @@ -56,6 +56,15 @@ CHECK_EQ(status, CUDNN_STATUS_SUCCESS) << CudnnGetErrorInfo(status); \ } +const int CUDA_NUM_THREADS = 512; +// CUDA: number of blocks for threads. 
+inline int CUDA_GET_BLOCKS(const int N) { + return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; +} +inline int CUDA_GET_BLOCKS(const int N, const int base) { + return (N + base - 1) / base; +} + namespace paddle { namespace lite { namespace cuda { diff --git a/lite/backends/cuda/math/CMakeLists.txt b/lite/backends/cuda/math/CMakeLists.txt index a5ee25643b4c87c9488df5b2acaead26773855a9..fafd74ae7a43d1a769456edfe408c71593d21201 100644 --- a/lite/backends/cuda/math/CMakeLists.txt +++ b/lite/backends/cuda/math/CMakeLists.txt @@ -2,8 +2,7 @@ if(NOT LITE_WITH_CUDA) return() endif() -set(cuda_static_deps cudnn_static cublas_static curand_static - culibos_static cudart_static) +get_property(cuda_static_deps GLOBAL PROPERTY CUDA_STATIC_MODULES) nv_library(cuda_activation SRCS activation.cu DEPS ${cuda_static_deps}) nv_library(cuda_scale SRCS scale.cu DEPS ${cuda_static_deps}) @@ -12,6 +11,9 @@ nv_library(cuda_transpose SRCS transpose.cu DEPS ${cuda_static_deps}) nv_library(cudnn_conv SRCS cudnn_conv.cc DEPS cuda_activation cuda_scale cuda_type_trans ${cuda_static_deps}) nv_library(cuda_elementwise SRCS elementwise.cu DEPS ${cuda_static_deps}) +nv_library(cudnn_pool SRCS cudnn_pool.cc DEPS ${cuda_static_deps}) +nv_library(cuda_gemm SRCS gemm.cc DEPS ${cuda_static_deps}) +nv_library(cuda_batched_gemm SRCS batched_gemm.cc DEPS ${cuda_static_deps}) set ( math_cuda @@ -21,6 +23,9 @@ set ( cuda_type_trans cuda_transpose cuda_elementwise + cudnn_pool + cuda_gemm + cuda_batched_gemm ) set(math_cuda "${math_cuda}" CACHE GLOBAL "math cuda") diff --git a/lite/backends/cuda/math/batched_gemm.cc b/lite/backends/cuda/math/batched_gemm.cc new file mode 100644 index 0000000000000000000000000000000000000000..e81510927615daa88e7f5bef3ce7b8421d8f6539 --- /dev/null +++ b/lite/backends/cuda/math/batched_gemm.cc @@ -0,0 +1,134 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/backends/cuda/math/batched_gemm.h" +#include +#include "lite/core/device_info.h" + +namespace paddle { +namespace lite { +namespace cuda { +namespace math { + +template <> +bool BatchedGemm::init(const bool trans_a, + const bool trans_b, + const int max_batch_size, + Context *ctx) { + if (cu_handle_ == nullptr) { + this->exe_stream_ = ctx->exec_stream(); + CUBLAS_CALL(cublasCreate(&cu_handle_)); + CUBLAS_CALL(cublasSetStream(cu_handle_, this->exe_stream_)); + } + cu_trans_a_ = trans_a ? CUBLAS_OP_T : CUBLAS_OP_N; + cu_trans_b_ = trans_b ? CUBLAS_OP_T : CUBLAS_OP_N; + cudaMalloc(reinterpret_cast(&A_), + 3 * max_batch_size * sizeof(float *)); + return true; +} + +template <> +bool BatchedGemm::run(const float alpha, + const float beta, + const float *a[], + const float *b[], + float *c[], + const int m, + const int n, + const int k, + const int batch_size) { + CHECK(a != nullptr); + CHECK(b != nullptr); + CHECK(c != nullptr); + lda_ = (cu_trans_a_ == CUBLAS_OP_N) ? k : m; + ldb_ = (cu_trans_b_ == CUBLAS_OP_N) ? 
n : k; + ldc_ = n; + m_ = m; + n_ = n; + k_ = k; + cudaMemcpyAsync(A_, + a, + batch_size * sizeof(const float *), + cudaMemcpyHostToDevice, + exe_stream_); + cudaMemcpyAsync(A_ + batch_size, + b, + batch_size * sizeof(const float *), + cudaMemcpyHostToDevice, + exe_stream_); + cudaMemcpyAsync(A_ + batch_size * 2, + c, + batch_size * sizeof(float *), + cudaMemcpyHostToDevice, + exe_stream_); + CUBLAS_CALL(cublasSgemmBatched(cu_handle_, + cu_trans_b_, + cu_trans_a_, + n_, + m_, + k_, + &alpha, + const_cast(A_ + batch_size), + ldb_, + const_cast(A_), + lda_, + &beta, + A_ + batch_size * 2, + ldc_, + batch_size)); + return true; +} + +template <> +bool BatchedGemm::run(const float alpha, + const float beta, + const float *a[], + const int m, + const int n, + const int k, + const int batch_size) { + CHECK(a != nullptr); + lda_ = (cu_trans_a_ == CUBLAS_OP_N) ? k : m; + ldb_ = (cu_trans_b_ == CUBLAS_OP_N) ? n : k; + ldc_ = n; + m_ = m; + n_ = n; + k_ = k; + cudaMemcpyAsync(A_, + a, + 3 * batch_size * sizeof(const float *), + cudaMemcpyDefault, + exe_stream_); + CUBLAS_CALL(cublasSgemmBatched(cu_handle_, + cu_trans_b_, + cu_trans_a_, + n_, + m_, + k_, + &alpha, + const_cast(A_ + batch_size), + ldb_, + const_cast(A_), + lda_, + &beta, + A_ + batch_size * 2, + ldc_, + batch_size)); + return true; +} + +} // namespace math +} // namespace cuda +} // namespace lite +} // namespace paddle diff --git a/lite/backends/cuda/math/batched_gemm.h b/lite/backends/cuda/math/batched_gemm.h new file mode 100644 index 0000000000000000000000000000000000000000..2b91d3a524596bf03b4a26a81c14eddcfe64452f --- /dev/null +++ b/lite/backends/cuda/math/batched_gemm.h @@ -0,0 +1,80 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
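The wrapper implemented above maps batched row-major GEMMs onto column-major cuBLAS by packing the host-side A/B/C pointer arrays into one device buffer and calling cublasSgemmBatched with the operands swapped and m/n exchanged. A minimal usage sketch, assuming the float/float specialization shown above; `CtxT` stands in for the CUDA context type whose template arguments are elided in this diff, and `d_a`/`d_b`/`d_c` are hypothetical caller-owned arrays of device pointers:

#include "lite/backends/cuda/math/batched_gemm.h"

// Computes C[i] = A[i] * B[i] for `batch` independent row-major float GEMMs.
template <typename CtxT>
void batched_gemm_example(CtxT* ctx, const float* d_a[], const float* d_b[],
                          float* d_c[], int m, int n, int k, int batch) {
  paddle::lite::cuda::math::BatchedGemm<float, float> bgemm;
  // One-off: create the cuBLAS handle on the context's exec stream and
  // allocate a device buffer holding 3 * max_batch_size pointer slots.
  bgemm.init(/*trans_a=*/false, /*trans_b=*/false, /*max_batch_size=*/batch, ctx);
  // Per call: async-copy the pointer arrays into that buffer, then issue
  // cublasSgemmBatched on the swapped operands.
  bgemm.run(1.f, 0.f, d_a, d_b, d_c, m, n, k, batch);
}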
+ +#pragma once +#include +#include +#include +#include "lite/api/paddle_place.h" +#include "lite/backends/cuda/cuda_utils.h" +#include "lite/core/context.h" +#include "lite/core/target_wrapper.h" +#include "lite/operators/op_params.h" + +namespace paddle { +namespace lite { +namespace cuda { +namespace math { + +template +class BatchedGemm { + public: + BatchedGemm() : cu_handle_(nullptr) {} + ~BatchedGemm() { + if (A_ != nullptr) { + cudaFree(A_); + } + } + + bool init(const bool trans_a, + const bool trans_b, + const int max_batch_size, + Context* ctx); + + bool run(const PtypeOut alpha, + const PtypeOut beta, + const PtypeIn* a[], + const PtypeIn* b[], + PtypeOut* c[], + const int m, + const int n, + const int k, + const int batch_size); + + bool run(const PtypeOut alpha, + const PtypeOut beta, + const PtypeIn* a[], + const int m, + const int n, + const int k, + const int batch_size); + + private: + cudaStream_t exe_stream_; + cublasHandle_t cu_handle_; + cublasOperation_t cu_trans_a_; + cublasOperation_t cu_trans_b_; + int m_{-1}; + int n_{-1}; + int k_{-1}; + int lda_{-1}; + int ldb_{-1}; + int ldc_{-1}; + PtypeIn** A_{nullptr}; +}; + +} // namespace math +} // namespace cuda +} // namespace lite +} // namespace paddle diff --git a/lite/backends/cuda/math/cudnn_conv.cc b/lite/backends/cuda/math/cudnn_conv.cc index 72ed3951f6b9b22a5ae1ee6caef8c69708102885..a4f33f467feb8626696595e95a29fde7b636919d 100644 --- a/lite/backends/cuda/math/cudnn_conv.cc +++ b/lite/backends/cuda/math/cudnn_conv.cc @@ -31,6 +31,9 @@ bool CudnnConv2D::create(const operators::ConvParam& param, auto o_dims = param.output->dims(); int batch = x_dims[0]; + auto paddings = *param.paddings; + auto dilations = *param.dilations; + int iw = x_dims[3]; // nchw int ih = x_dims[2]; int ic = x_dims[1]; @@ -41,10 +44,10 @@ bool CudnnConv2D::create(const operators::ConvParam& param, int kh = w_dims[2]; int sw = param.strides[1]; int sh = param.strides[0]; - int pw = param.paddings[1]; - int ph = param.paddings[0]; - int dw = param.dilations[1]; - int dh = param.dilations[0]; + int pw = paddings[2]; + int ph = paddings[0]; + int dw = dilations[1]; + int dh = dilations[0]; CHECK(ic % param.groups == 0) << "The conv input channel shoud be divide group number."; @@ -133,8 +136,8 @@ bool CudnnConv2D::create(const operators::ConvParam& param, this->fwd_algo_ = algo_cache.GetAlgorithm(x_dims.Vectorize(), w_dims.Vectorize(), param.strides, - param.paddings, - param.dilations, + *param.paddings, + *param.dilations, 0, search_func); @@ -311,12 +314,15 @@ bool CudnnConv2DInt8::create(const operators::ConvParam& param, int kw = w_dims[2]; int kh = w_dims[1]; + auto paddings = *param.paddings; + auto dilations = *param.dilations; + int sw = param.strides[1]; int sh = param.strides[0]; - int pw = param.paddings[1]; - int ph = param.paddings[0]; - int dw = param.dilations[1]; - int dh = param.dilations[0]; + int pw = paddings[2]; + int ph = paddings[0]; + int dw = dilations[1]; + int dh = dilations[0]; std::vector weight_scale = param.weight_scale; float input_scale = param.input_scale; diff --git a/lite/backends/cuda/math/cudnn_pool.cc b/lite/backends/cuda/math/cudnn_pool.cc new file mode 100644 index 0000000000000000000000000000000000000000..f970fc326b29c4c226e7dc9643e416a3cf24f0eb --- /dev/null +++ b/lite/backends/cuda/math/cudnn_pool.cc @@ -0,0 +1,159 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/backends/cuda/math/cudnn_pool.h" +#include "lite/backends/cuda/math/activation.h" +#include "lite/backends/cuda/math/scale.h" +#include "lite/backends/cuda/math/type_trans.h" + +namespace paddle { +namespace lite { +namespace cuda { +namespace math { + +inline void UpdatePadding(std::vector* paddings, + const bool global_pooling, + const bool adaptive, + const std::vector& data_dims, + const std::vector& strides, + const std::vector& ksize) { + if (paddings->size() == data_dims.size()) { + for (size_t i = 0; i < data_dims.size(); ++i) { + int copy_pad = *(paddings->begin() + 2 * i); + paddings->insert(paddings->begin() + 2 * i + 1, copy_pad); + } + } else { + CHECK(data_dims.size() * 2 == paddings->size()) + << "Paddings size should be the same or twice as the pooling size."; + } + if (global_pooling || adaptive) { + for (auto it = paddings->begin(); it != paddings->end(); it++) { + *it = 0; + } + } +} + +inline void UpdateKsize(std::vector* ksize, + const std::vector& data_dims) { + ksize->resize(static_cast(data_dims.size())); + for (size_t i = 0; i < ksize->size(); ++i) { + *(ksize->begin() + i) = static_cast(data_dims[i]); + } +} + +template <> +bool CudnnPool2DNHWC::create( + const operators::PoolParam& param, Context* ctx) { + return true; +} + +template <> +bool CudnnPool2DNHWC::init(const operators::PoolParam& param, + Context* ctx) { + this->stream_ = ctx->exec_stream(); + CUDNN_CHECK(cudnnCreate(&this->handle_)); + CUDNN_CHECK(cudnnSetStream(this->handle_, this->stream_)); + + cudnnCreateTensorDescriptor(&this->input_desc_); + cudnnCreateTensorDescriptor(&this->output_desc_); + cudnnCreatePoolingDescriptor(&this->pooling_desc_); + + return create(param, ctx); +} + +template <> +bool CudnnPool2DNHWC::run( + const operators::PoolParam& param) { + auto x_dims = param.x->dims(); + auto o_dims = param.output->dims(); + int batch = x_dims[0]; + const float* in_data = param.x->data(); + float* out_data = param.output->mutable_data(TARGET(kCUDA)); + + int ih = x_dims[1]; + int iw = x_dims[2]; // nchw + int ic = x_dims[3]; + + int oh = o_dims[1]; + int ow = o_dims[2]; + int oc = o_dims[3]; + + std::vector ksize = param.ksize; + std::vector strides = param.strides; + std::vector paddings = *(param.paddings.get()); + + std::string pooling_type = param.pooling_type; + bool global_pooling = param.global_pooling; + bool exclusive = param.exclusive; + bool adaptive = param.adaptive; + + std::vector data_dims = {ih, iw}; + UpdatePadding(&paddings, global_pooling, adaptive, data_dims, strides, ksize); + + if (data_dims.size() * 2 == paddings.size()) { + for (size_t i = 0; i < data_dims.size(); ++i) { + paddings.erase(paddings.begin() + i + 1); + } + } + + if (global_pooling) { + UpdateKsize(&ksize, data_dims); + } + CUDNN_CHECK(cudnnSetTensor4dDescriptor(this->input_desc_, + CUDNN_TENSOR_NHWC, + CUDNN_DATA_FLOAT, + batch, + ic, + ih, + iw)); + + CUDNN_CHECK(cudnnSetTensor4dDescriptor(this->output_desc_, + CUDNN_TENSOR_NHWC, 
+ CUDNN_DATA_FLOAT, + batch, + oc, + oh, + ow)); + cudnnPoolingMode_t mode; + if (pooling_type == "max") { + mode = CUDNN_POOLING_MAX; + } else { + mode = exclusive ? CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING + : CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING; + } + CUDNN_CHECK(cudnnSetPoolingNdDescriptor(this->pooling_desc_, + mode, + CUDNN_NOT_PROPAGATE_NAN, + ksize.size(), + ksize.data(), + paddings.data(), + strides.data())); + float alpha = 1.0f; + float beta = 0.0f; + CUDNN_CHECK(cudnnPoolingForward(this->handle_, + this->pooling_desc_, + &alpha, + this->input_desc_, + in_data, + &beta, + this->output_desc_, + out_data)); + + return true; +} + +} // namespace math +} // namespace cuda +} // namespace lite +} // namespace paddle diff --git a/lite/backends/cuda/math/cudnn_pool.h b/lite/backends/cuda/math/cudnn_pool.h new file mode 100644 index 0000000000000000000000000000000000000000..acdc695b500ab41d615cb98c9501efd729c2fe6a --- /dev/null +++ b/lite/backends/cuda/math/cudnn_pool.h @@ -0,0 +1,79 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include "lite/api/paddle_place.h" +#include "lite/backends/cuda/cuda_utils.h" +#include "lite/core/context.h" +#include "lite/core/target_wrapper.h" +#include "lite/operators/op_params.h" + +namespace paddle { +namespace lite { +namespace cuda { +namespace math { + +template +class CudnnPool2DBase { + public: + CudnnPool2DBase() + : handle_(NULL), + input_desc_(NULL), + output_desc_(NULL), + pooling_desc_(NULL) {} + + ~CudnnPool2DBase() { + if (handle_ != NULL) { + CUDNN_CHECK(cudnnDestroy(handle_)); + } + if (input_desc_) { + CUDNN_CHECK(cudnnDestroyTensorDescriptor(input_desc_)); + } + if (output_desc_) { + CUDNN_CHECK(cudnnDestroyTensorDescriptor(output_desc_)); + } + if (pooling_desc_) { + cudnnDestroyPoolingDescriptor(pooling_desc_); + } + } + + protected: + cudaStream_t stream_; + cudnnHandle_t handle_; + cudnnTensorDescriptor_t input_desc_; + cudnnTensorDescriptor_t output_desc_; + cudnnPoolingDescriptor_t pooling_desc_; +}; + +template +class CudnnPool2DNHWC : public CudnnPool2DBase { + public: + CudnnPool2DNHWC() : CudnnPool2DBase() {} + virtual ~CudnnPool2DNHWC() = default; + virtual bool init(const operators::PoolParam& param, + Context* ctx); + + virtual bool create(const operators::PoolParam& param, + Context* ctx); + + virtual bool run(const operators::PoolParam& param); +}; + +} // namespace math +} // namespace cuda +} // namespace lite +} // namespace paddle diff --git a/lite/backends/cuda/math/elementwise.cu b/lite/backends/cuda/math/elementwise.cu index 57c9ec022a6e49551fd2d56a9b2036de13bf5a2c..8f0ebd1f97a03f03b568de694b986e9540f07c55 100644 --- a/lite/backends/cuda/math/elementwise.cu +++ b/lite/backends/cuda/math/elementwise.cu @@ -13,13 +13,55 @@ // limitations under the License. 
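The pooling wrapper above splits its work between init(), which creates the cuDNN handle and the tensor/pooling descriptors once, and run(), which normalizes paddings and kernel size (UpdatePadding/UpdateKsize), rebuilds the NHWC descriptors from the current PoolParam, and calls cudnnPoolingForward. A minimal sketch of how a pool kernel might drive it; `PoolT` and `CtxT` stand in for the CudnnPool2DNHWC specialization and context types whose template arguments are elided in this diff, and all other names are illustrative:

#include "lite/backends/cuda/math/cudnn_pool.h"

template <typename PoolT, typename CtxT>
void pool_forward_example(PoolT* pool,
                          const paddle::lite::operators::PoolParam& param,
                          CtxT* ctx) {
  // One-off setup: cuDNN handle bound to the exec stream plus descriptors.
  pool->init(param, ctx);
  // Per invocation: max or average mode is chosen from param.pooling_type
  // and param.exclusive, then cudnnPoolingForward writes param.output.
  pool->run(param);
}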
#include "lite/backends/cuda/math/elementwise.h" -#include "lite/backends/cuda/math/utils.h" namespace paddle { namespace lite { namespace cuda { namespace math { +template +__global__ void elementwise_kernel(const size_t total, + const Dtype* x_data, + const Dtype* y_data, + Dtype* out_data, + int pre, + int n, + int post, + BinaryOperation type) { + int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid < total) { + int idx = tid / post % n; +#if __CUDA_ARCH__ >= 350 + out_data[tid] = binary_calc(__ldg(x_data + tid), __ldg(y_data + idx), type); +#else + out_data[tid] = binary_calc(x_data[tid], y_data[idx], type); +#endif + } +} + +template +__global__ void elementwise_relu_kernel(const size_t total, + const Dtype* x_data, + const Dtype* y_data, + Dtype* out_data, + int pre, + int n, + int post, + BinaryOperation type) { + int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid < total) { + int idx = tid / post % n; + Dtype temp; +#if __CUDA_ARCH__ >= 350 + temp = binary_calc(__ldg(x_data + tid), __ldg(y_data + idx), type); + +#else + temp = binary_calc(x_data[tid], y_data[idx], type); +#endif + out_data[tid] = temp > 0 ? temp : 0; + } +} + template __global__ void elementwise_add_kernel(const size_t total, const Dtype* x_data, @@ -76,6 +118,56 @@ __global__ void elementwise_add_nhwc4_int8_kernel(const size_t total, } } +template +void elementwise(const Dtype* x_data, + const Dtype* y_data, + Dtype* out_data, + int pre, + int n, + int post, + BinaryOperation type, + cudaStream_t stream) { + int num = pre * n * post; + int thread = 256; + int block = (num + thread - 1) / thread; + elementwise_kernel<<>>( + num, x_data, y_data, out_data, pre, n, post, type); +} + +template +void elementwise_relu(const Dtype* x_data, + const Dtype* y_data, + Dtype* out_data, + int pre, + int n, + int post, + BinaryOperation type, + cudaStream_t stream) { + int num = pre * n * post; + int thread = 256; + int block = (num + thread - 1) / thread; + elementwise_relu_kernel<<>>( + num, x_data, y_data, out_data, pre, n, post, type); +} + +template void elementwise(const float*, + const float*, + float*, + int, + int, + int, + BinaryOperation, + cudaStream_t); + +template void elementwise_relu(const float*, + const float*, + float*, + int, + int, + int, + BinaryOperation, + cudaStream_t); + template void elementwise_add(int num, const Dtype* x_data, diff --git a/lite/backends/cuda/math/elementwise.h b/lite/backends/cuda/math/elementwise.h index 7fcdf95021ff21379bf94298ed06328dd6d2db09..ce45d0544e5a55a9cdc34bdfacc2b48157f5a198 100644 --- a/lite/backends/cuda/math/elementwise.h +++ b/lite/backends/cuda/math/elementwise.h @@ -15,12 +15,33 @@ #pragma once #include #include +#include "lite/backends/cuda/math/utils.h" namespace paddle { namespace lite { namespace cuda { namespace math { +template +void elementwise(const Dtype* x_data, + const Dtype* y_data, + Dtype* out_data, + int pre, + int n, + int post, + BinaryOperation type, + cudaStream_t stream); + +template +void elementwise_relu(const Dtype* x_data, + const Dtype* y_data, + Dtype* out_data, + int pre, + int n, + int post, + BinaryOperation type, + cudaStream_t stream); + template void elementwise_add(int num, const Dtype* x_data, diff --git a/lite/backends/cuda/math/gemm.cc b/lite/backends/cuda/math/gemm.cc new file mode 100644 index 0000000000000000000000000000000000000000..a9f12984aa5cddfc0acb24de1ebd66735c5d498e --- /dev/null +++ b/lite/backends/cuda/math/gemm.cc @@ -0,0 +1,100 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/backends/cuda/math/gemm.h" +#include +#include "lite/core/device_info.h" + +namespace paddle { +namespace lite { +namespace cuda { +namespace math { + +template <> +bool Gemm::init(const bool trans_a, + bool trans_b, + const int m, + const int n, + const int k, + Context *ctx) { + if (cu_handle_ == nullptr) { + this->exe_stream_ = ctx->exec_stream(); + CUBLAS_CALL(cublasCreate(&cu_handle_)); + CUBLAS_CALL(cublasSetStream(cu_handle_, this->exe_stream_)); + } + lda_ = (!trans_a) ? k : m; + ldb_ = (!trans_b) ? n : k; + ldc_ = n; + m_ = m; + n_ = n; + k_ = k; + cu_trans_a_ = trans_a ? CUBLAS_OP_T : CUBLAS_OP_N; + cu_trans_b_ = trans_b ? CUBLAS_OP_T : CUBLAS_OP_N; + return true; +} + +template <> +bool Gemm::init(const bool trans_a, + bool trans_b, + const int m, + const int n, + const int k, + const int lda, + const int ldb, + const int ldc, + Context *ctx) { + if (cu_handle_ == nullptr) { + this->exe_stream_ = ctx->exec_stream(); + CUBLAS_CALL(cublasCreate(&cu_handle_)); + CUBLAS_CALL(cublasSetStream(cu_handle_, this->exe_stream_)); + } + m_ = m; + n_ = n; + k_ = k; + lda_ = lda; + ldb_ = ldb; + ldc_ = ldc; + cu_trans_a_ = trans_a ? CUBLAS_OP_T : CUBLAS_OP_N; + cu_trans_b_ = trans_b ? CUBLAS_OP_T : CUBLAS_OP_N; + return true; +} + +template <> +bool Gemm::run(const float alpha, + const float beta, + const float *a, + const float *b, + float *c, + Context *ctx) { + CUBLAS_CALL(cublasSgemm(cu_handle_, + cu_trans_b_, + cu_trans_a_, + n_, + m_, + k_, + &alpha, + b, + ldb_, + a, + lda_, + &beta, + c, + ldc_)); + return true; +} + +} // namespace math +} // namespace cuda +} // namespace lite +} // namespace paddle diff --git a/lite/backends/cuda/math/gemm.h b/lite/backends/cuda/math/gemm.h new file mode 100644 index 0000000000000000000000000000000000000000..12194d54b08a533a3812e10b5d2f78134c19da24 --- /dev/null +++ b/lite/backends/cuda/math/gemm.h @@ -0,0 +1,74 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
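Like the batched variant, the Gemm wrapper implemented above handles a single row-major C = alpha * A * B + beta * C by swapping the operands and exchanging m and n in the cublasSgemm call. A minimal usage sketch, assuming the float/float specialization shown above; `CtxT` stands in for the CUDA context type whose template arguments are elided in this diff, and `d_a`/`d_b`/`d_c` are hypothetical device buffers:

#include "lite/backends/cuda/math/gemm.h"

template <typename CtxT>
void gemm_example(CtxT* ctx, const float* d_a, const float* d_b, float* d_c,
                  int m, int n, int k) {
  paddle::lite::cuda::math::Gemm<float, float> gemm;
  // This init() overload derives lda/ldb/ldc from m/n/k and the transpose
  // flags; the second overload accepts explicit leading dimensions.
  gemm.init(/*trans_a=*/false, /*trans_b=*/false, m, n, k, ctx);
  gemm.run(1.f, 0.f, d_a, d_b, d_c, ctx);
}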
+ +#pragma once +#include +#include +#include +#include "lite/api/paddle_place.h" +#include "lite/backends/cuda/cuda_utils.h" +#include "lite/core/context.h" +#include "lite/core/target_wrapper.h" +#include "lite/operators/op_params.h" + +namespace paddle { +namespace lite { +namespace cuda { +namespace math { + +template +class Gemm { + public: + Gemm() : cu_handle_(nullptr) {} + ~Gemm() {} + bool init(const bool trans_a, + const bool trans_b, + const int m, + const int n, + const int k, + Context* ctx); + bool init(const bool trans_a, + const bool trans_b, + const int m, + const int n, + const int k, + const int lda, + const int ldb, + const int ldc, + Context* ctx); + + bool run(const PtypeOut alpha, + const PtypeOut beta, + const PtypeIn* a, + const PtypeIn* b, + PtypeOut* c, + Context* ctx); + + private: + cudaStream_t exe_stream_; + cublasHandle_t cu_handle_; + cublasOperation_t cu_trans_a_; + cublasOperation_t cu_trans_b_; + int m_{-1}; + int n_{-1}; + int k_{-1}; + int lda_{-1}; + int ldb_{-1}; + int ldc_{-1}; +}; + +} // namespace math +} // namespace cuda +} // namespace lite +} // namespace paddle diff --git a/lite/backends/cuda/math/utils.h b/lite/backends/cuda/math/utils.h index b4cd82fd8df6df063d92df709311f3c90e7cf4b6..b6aa9c7d160ad6c8b60b132e4a2bbd7ae1e0b9ff 100644 --- a/lite/backends/cuda/math/utils.h +++ b/lite/backends/cuda/math/utils.h @@ -25,6 +25,24 @@ namespace lite { namespace cuda { namespace math { +enum class BinaryOperation { + kADD = 0, + kMUL = 1, + kDIV = 2, +}; + +template +__device__ T binary_calc(T x, T y, BinaryOperation type); + +template <> +__device__ __forceinline__ float binary_calc(float x, + float y, + BinaryOperation type) { + if (type == BinaryOperation::kADD) return x + y; + if (type == BinaryOperation::kMUL) return x * y; + if (type == BinaryOperation::kDIV) return x / y; +} + template __device__ T from_float(float x); diff --git a/lite/backends/fpga/KD/pes/conv_process.hpp b/lite/backends/fpga/KD/pes/conv_process.hpp index fd17218d06f050df3dc935bdde0a320e52b56a40..23332b422df65250f8cadf07f5e0d95e970d316a 100644 --- a/lite/backends/fpga/KD/pes/conv_process.hpp +++ b/lite/backends/fpga/KD/pes/conv_process.hpp @@ -294,10 +294,17 @@ inline void split_filter_num(const ConvParam& c_param) { args.image.channels = input->shape().channel(); args.image.width = input->shape().width(); args.image.height = input->shape().height(); - args.image.pad_width = param.paddings[1]; + auto paddings = *param.paddings; + args.image.pad_width = paddings[2]; + args.image.pad_height = paddings[0]; args.output.address = out_address; args.output.scale_address = out_scale_address; + bool pad_equal = + ((paddings[0] == paddings[1]) && (paddings[2] == paddings[3])); + if (!pad_equal) { + LOG(FATAL) << "This pad is not supported: 
" << paddings[0] << ", " + << paddings[1] << ", " << paddings[2] << ", " << paddings[3]; + } param.splitParams().push_back(conv_param); } } @@ -372,10 +379,18 @@ inline void split_channel(const ConvParam& c_param) { args.image.channels = conv_param->input.shape().channel(); args.image.width = conv_param->input.shape().width(); args.image.height = conv_param->input.shape().height(); - args.image.pad_width = param.paddings[1]; - args.image.pad_height = param.paddings[0]; + auto paddings = *param.paddings; + args.image.pad_width = paddings[2]; + args.image.pad_height = paddings[0]; + args.output.address = conv_param->output.mutableData(); args.output.scale_address = conv_param->output.scale(); + bool pad_equal = + ((paddings[0] == paddings[1]) && (paddings[2] == paddings[3])); + if (!pad_equal) { + LOG(FATA) << "This pad not support ! " << paddings[0] << ", " + << paddings[1] << ", " << paddings[2] << ", " << paddings[3]; + } param.splitParams().push_back(conv_param); } } diff --git a/lite/backends/fpga/KD/pes/depthwise_conv_pe.hpp b/lite/backends/fpga/KD/pes/depthwise_conv_pe.hpp old mode 100755 new mode 100644 index 9d7b9b544bff953662bab86f095823c5c7b3075b..f86806102d4a217ae4bb7355b36ca10d96ca4a05 --- a/lite/backends/fpga/KD/pes/depthwise_conv_pe.hpp +++ b/lite/backends/fpga/KD/pes/depthwise_conv_pe.hpp @@ -61,14 +61,21 @@ class DepthwiseConvPE : public PE { args.image.channels = input->shape().channel(); args.image.height = input->shape().height(); args.image.width = input->shape().width(); - args.image.pad_width = param.paddings[0]; - args.image.pad_height = param.paddings[1]; + auto paddings = *param.paddings; + args.image.pad_width = param.paddings[2]; + args.image.pad_height = param.paddings[0]; args.image.scale_address = input->scale(); args.output.address = output->data(); args.output.scale_address = output->scale(); args.out_width = param.output->shape().width(); args.out_height = param.output->shape().height(); args.sub_conv_num = 1; + bool pad_equal = + ((paddings[0] == paddings[1]) && (paddings[2] == paddings[3])); + if (!pad_equal) { + LOG(FATA) << "This pad not support ! 
" << paddings[0] << ", " + << paddings[1] << ", " << paddings[2] << ", " << paddings[3]; + } param.args = args; inplace_.relu_enable = param_.relu.enabled; diff --git a/lite/backends/fpga/KD/pes/pooling_pe.hpp b/lite/backends/fpga/KD/pes/pooling_pe.hpp index fd3be1f463d3bfce925cc4ce5444d119c33e5692..5bb4f5285a48c7696b1f0f78a9b1c4fe6a9d76c5 100644 --- a/lite/backends/fpga/KD/pes/pooling_pe.hpp +++ b/lite/backends/fpga/KD/pes/pooling_pe.hpp @@ -45,13 +45,14 @@ class PoolingPE : public PE { PoolingArgs args = {0}; args.mode = param_.type; + auto paddings = *param_.paddings; args.kernel_reciprocal = fp32_2_fp16(1.0f / (k_width * k_height)); args.image.address = input->data(); args.image.channels = input->shape().channel(); args.image.height = input->shape().height(); args.image.width = input->shape().width(); - args.image.pad_height = param_.paddings[0]; - args.image.pad_width = param_.paddings[1]; + args.image.pad_height = paddings[0]; + args.image.pad_width = paddings[2]; args.image.scale_address = input->scale(); args.output.address = output->mutableData(); args.output.scale_address = output->scale(); @@ -76,12 +77,13 @@ class PoolingPE : public PE { float* image_addr = float_input.mutableData(FP32, input->shape()); float_input.copyFrom(input); float16* data_out = output->data(); + auto paddings = *param_.paddings; int image_height = input->shape().height(); int image_width = input->shape().width(); int image_channels = input->shape().channel(); - int image_pad_h = param_.paddings[0]; - int image_pad_w = param_.paddings[1]; + int image_pad_h = paddings[0]; + int image_pad_w = paddings[2]; int kernel_height = param_.kernelSize[1]; int kernel_width = param_.kernelSize[0]; int kernel_step_h = param_.strides[0]; diff --git a/lite/backends/npu/builder.cc b/lite/backends/npu/builder.cc index ad5bed5be91298744abc0675bf12adb117afb60b..954fad8c916e152c5de06ce285b4ac17ecf22a01 100644 --- a/lite/backends/npu/builder.cc +++ b/lite/backends/npu/builder.cc @@ -142,21 +142,25 @@ ge::TensorPtr CvtTensor(lite::Tensor* in_tensor, int CvtActMode(std::string act_type) { int act_mode = 1; - if (act_type == "sigmod") { + if (act_type == "sigmoid") { act_mode = 0; } else if (act_type == "relu") { act_mode = 1; } else if (act_type == "tanh") { act_mode = 2; + } else if (act_type == "relu_clipped") { + act_mode = 3; } else if (act_type == "elu") { act_mode = 4; + } else if (act_type == "leaky_relu") { + act_mode = 5; } else if (act_type == "abs") { act_mode = 6; } else if (act_type == "softsign") { act_mode = 8; } else if (act_type == "softplus") { act_mode = 9; - } else if (act_type == "hardsigmoid") { + } else if (act_type == "hard_sigmoid") { act_mode = 10; } else { // TODO(hong19860320) support more activation mode diff --git a/lite/backends/npu/builder.h b/lite/backends/npu/builder.h index 02f7071a4e1c5436cce4b4956aa5529fd74be282..70200354fbab15f043a537300e92e2a26a3d739e 100644 --- a/lite/backends/npu/builder.h +++ b/lite/backends/npu/builder.h @@ -31,117 +31,6 @@ // Extended Ops of HIAI DDK namespace ge { -/** - * Multiply the matrix x1 by the matrix x2 to generate x1 * x2. - * The inputs must be two-dimensional matrices and the inner dimension of "x1" - * (after being transposed if transpose_x1 is true) must match the outer - * dimension of "x2" (after being transposed if transposed_x2 is true). - * x : the first input tensor, must be non const op. - * w : the second input tensor, must be const op. - * bias: the optional bias tensor, must be const op. - * - * y : the output tensor. 
- * - * has_bias: If true, enable input bias. - */ -REG_OP(MatMul) - .INPUT(x, TensorType({DT_FLOAT})) - .INPUT(w, TensorType({DT_FLOAT})) - .OPTIONAL_INPUT(bias, TensorType({DT_FLOAT})) // bias must be const input - .OUTPUT(y, TensorType({DT_FLOAT})) - .ATTR(has_bias, AttrValue::BOOL{false}) // when has input::bias,set true - .OP_END(); - -/** - * Computes the gradients of convolution with respect to the input. - * - * input_sizes : An integer vector representing the shape of input, - * where input is a 4-D [batch, height, width, channels] tensor. - * filter : the filter tensor, with shape [H , W, filter_channel, - * filter_number], filter_channel must be same as x channel. - * x : The input tensor. - * - * y : The output tensor. - * - * format: 0: NCHW. 1: NHWC - * group : 1: default - * num_output : 0: default, num_output must be equal to - * (filter_channel * group) - * pad : Padding for the beginning and ending along each axis - * stride : Stride along each axis. - * dilation : dilation value along each axis of the filter. - * pad_mode : 0:NOTSET, 5:VALID 6:SAME. defaul value is 0:NOTSET - * bias_term : 0: default - * kernel : The shape of the convolution kernel - */ -REG_OP(Deconvolution) - .INPUT(input_sizes, TensorType({DT_UINT8})) - .INPUT(filter, TensorType({DT_FLOAT})) - .INPUT(x, TensorType({DT_FLOAT})) - .OPTIONAL_INPUT(b, TensorType({DT_FLOAT})) - .OUTPUT(y, TensorType({DT_FLOAT})) - .ATTR(mode, AttrValue::INT{1}) - .ATTR(format, AttrValue::INT{1}) - .ATTR(group, AttrValue::INT{1}) - .ATTR(num_output, AttrValue::INT{0}) - .ATTR(pad, AttrValue::LIST_INT({0, 0, 0, 0})) - .ATTR(stride, AttrValue::LIST_INT({1, 1})) - .ATTR(dilation, AttrValue::LIST_INT({1, 1})) - .ATTR(pad_mode, AttrValue::INT{0}) - .ATTR(bias_term, AttrValue::INT{0}) - .ATTR(kernel, AttrValue::LIST_INT({0, 0})) - .OP_END(); - -/** - * Resize images to size using bilinear interpolation. - * - * x : The tensor of 4-D - * w : A int32 Tensor of 2 elements: [height, width]. - * - * y : the output tensor - * - * align_corners : If true, the centers of the 4 corner pixels of the - * input and output tensors are aligned, preserving the values at the corner - * pixels. - * output_dim_mode : Defaults 2, including 0: zoom_factor , 1: - * shrink_factor, 2: height/width. when output_dim_mode=2, the output-dim is - * controled by the [height, width] of w. - * shrink_factor : shrink factor. - * zoom_factor : zoom factor. - * pad_begin : begin of pad. - * pad_end : end of pad. - */ -REG_OP(ResizeBilinear) - .INPUT(x, TensorType({DT_FLOAT, DT_INT32})) - .INPUT(w, TensorType({DT_FLOAT, DT_INT32})) - .OUTPUT(y, TensorType({DT_FLOAT, DT_INT32})) - .ATTR(align_corners, AttrValue::BOOL{false}) - .ATTR(output_dim_mode, AttrValue::INT{2}) - .ATTR(shrink_factor, AttrValue::INT{1}) - .ATTR(zoom_factor, AttrValue::INT{1}) - .ATTR(pad_begin, AttrValue::INT{0}) - .ATTR(pad_end, AttrValue::INT{0}) - .OP_END(); - -/** - * Resize images to size using nearest neighbor interpolation. - * - * image : Resize images to size using nearest neighbor interpolation. - * size : Must be one dimension and two elements - * - * output : the output tensor - * - * align_corners : If true, the centers of the 4 corner pixels of the - * input and output tensors are aligned, preserving the values at the corner - * pixels. 
Defaults to false - */ -REG_OP(ResizeNearestNeighbor) - .INPUT(image, TensorType({DT_FLOAT, DT_INT32, DT_UINT8, DT_BOOL})) - .INPUT(size, TensorType({DT_INT32})) - .OUTPUT(output, TensorType({DT_FLOAT, DT_INT32, DT_UINT8, DT_BOOL})) - .ATTR(align_corners, AttrValue::BOOL{false}) - .OP_END(); - /** * Pads a tensor. * diff --git a/lite/backends/opencl/cl_wrapper.cc b/lite/backends/opencl/cl_wrapper.cc index 357ac8c2d6ae340743fa713641e3e89449f1489f..93e176f9ed102f0675c987e57ddde6088158ec97 100644 --- a/lite/backends/opencl/cl_wrapper.cc +++ b/lite/backends/opencl/cl_wrapper.cc @@ -75,7 +75,7 @@ void CLWrapper::InitFunctions() { do { \ cl_func##_ = (cl_func##Type)dlsym(handle_, #cl_func); \ if (cl_func##_ == nullptr) { \ - LOG(ERROR) << "Cannot find the " << #cl_func \ + LOG(FATAL) << "Cannot find the " << #cl_func \ << " symbol in libOpenCL.so!"; \ break; \ } \ diff --git a/lite/backends/x86/math/CMakeLists.txt b/lite/backends/x86/math/CMakeLists.txt index 2dea4364d5ee2d11d6d266935fad2a1180954369..a89107632341cf063ac3166aa9890ff383e3383f 100644 --- a/lite/backends/x86/math/CMakeLists.txt +++ b/lite/backends/x86/math/CMakeLists.txt @@ -50,7 +50,8 @@ math_library(unpooling) math_library(vol2col) ## math_library(prelu) math_library(tree2col DEPS math_function) - +math_library(sequence_topk_avg_pooling) +math_library(search_fc DEPS blas dynload_mklml) # cc_test(math_function_test SRCS math_function_test.cc DEPS math_function) # cc_test(selected_rows_functor_test SRCS selected_rows_functor_test.cc DEPS selected_rows_functor) # cc_test(im2col_test SRCS im2col_test.cc DEPS im2col) diff --git a/lite/backends/x86/math/beam_search.cc b/lite/backends/x86/math/beam_search.cc index bbe35b4de5508c70496e5c8566c8d1b982a7155c..8d61fb3bbb97705c697fba934e6cab9424f85bad 100644 --- a/lite/backends/x86/math/beam_search.cc +++ b/lite/backends/x86/math/beam_search.cc @@ -14,6 +14,7 @@ limitations under the License. 
*/ #include "lite/backends/x86/math/beam_search.h" #include +#include #include #include "lite/fluid/lod.h" diff --git a/lite/backends/x86/math/pooling.cc b/lite/backends/x86/math/pooling.cc index 9da239f9c63371350403cc0bd0eecc94eab87590..ab6c1edb481f914d5751149aca2595fee550ca51 100644 --- a/lite/backends/x86/math/pooling.cc +++ b/lite/backends/x86/math/pooling.cc @@ -49,7 +49,7 @@ class Pool2dFunctor { const int stride_height = strides[0]; const int stride_width = strides[1]; const int padding_height = paddings[0]; - const int padding_width = paddings[1]; + const int padding_width = paddings[2]; const int input_stride = input_height * input_width; const int output_stride = output_height * output_width; @@ -130,7 +130,7 @@ class Pool2dGradFunctor { const int stride_height = strides[0]; const int stride_width = strides[1]; const int padding_height = paddings[0]; - const int padding_width = paddings[1]; + const int padding_width = paddings[2]; const int input_stride = input_height * input_width; const int output_stride = output_height * output_width; @@ -213,7 +213,7 @@ class MaxPool2dGradFunctor { const int stride_height = strides[0]; const int stride_width = strides[1]; const int padding_height = paddings[0]; - const int padding_width = paddings[1]; + const int padding_width = paddings[2]; const int input_stride = input_height * input_width; const int output_stride = output_height * output_width; @@ -629,7 +629,7 @@ class MaxPool2dWithIndexFunctor { const int stride_height = strides[0]; const int stride_width = strides[1]; const int padding_height = paddings[0]; - const int padding_width = paddings[1]; + const int padding_width = paddings[2]; const int input_stride = input_height * input_width; const int output_stride = output_height * output_width; diff --git a/lite/backends/x86/math/search_fc.cc b/lite/backends/x86/math/search_fc.cc new file mode 100644 index 0000000000000000000000000000000000000000..56fc363cb48ec5c58f4a7ee3e62a2e6bd7355021 --- /dev/null +++ b/lite/backends/x86/math/search_fc.cc @@ -0,0 +1,79 @@ +/* Copyright (c) 2018 paddlepaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "lite/backends/x86/math/search_fc.h" +#include +#include + +namespace paddle { +namespace lite { +namespace x86 { +namespace math { + +/* + * All tensors' dimension should be the same and the values of + * each dimension must be the same, except the axis dimension. 
+ */ +template +class SearchFcFunctor { + public: + void operator()(const lite::X86Context& context, + const lite::Tensor& bottom, + const lite::Tensor& w, + const lite::Tensor& b, + lite::Tensor* top, + int out_size) { + int batch = bottom.dims()[0]; + + int _out = w.dims()[0]; // 100 + int _in = w.dims()[1]; // 228 + + lite::DDim dims(std::vector({bottom.dims()[0], out_size})); + + const auto bottom_data = bottom.data(); + auto top_data = top->mutable_data(lite::TargetType::kX86); + const auto weights = w.data(); + auto blas = math::GetBlas(context); + call_gemm(blas, + CblasNoTrans, + CblasTrans, + batch, + _out, + _in, + 1.0f, + bottom_data, + weights, + 0.0f, + top_data); + if (true) { + const auto* bias_data = b.data(); + for (int i = 0; i < batch; ++i) { + // add bias here + sse_eltadd(top_data + i * _out, bias_data, top_data + i * _out, _out); + } + } + } + + // private: +}; + +#define DEFINE_FUNCTOR(type) \ + template class SearchFcFunctor; + +FOR_ALL_TYPES(DEFINE_FUNCTOR); + +} // namespace math +} // namespace x86 +} // namespace lite +} // namespace paddle diff --git a/lite/backends/x86/math/search_fc.h b/lite/backends/x86/math/search_fc.h new file mode 100644 index 0000000000000000000000000000000000000000..e415c396023dbc10358992012197f4cfebac554f --- /dev/null +++ b/lite/backends/x86/math/search_fc.h @@ -0,0 +1,184 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include "lite/backends/x86/math/blas.h" +#include "lite/backends/x86/mklml.h" +#include "lite/core/context.h" +#include "lite/core/tensor.h" +#include "lite/fluid/data_type.h" + +namespace paddle { +namespace lite { +namespace x86 { +namespace math { + +template +void call_gemm(const BlasT blas, + const CBLAS_TRANSPOSE TransA, + const CBLAS_TRANSPOSE TransB, + const int M, + const int N, + const int K, + const T alpha, + const T* A, + const T* B, + const T beta, + T* C) { +#ifndef __NAIVE_GEMM__ + int lda = (TransA == CblasNoTrans) ? K : M; + int ldb = (TransB == CblasNoTrans) ? 
N : K; + blas.GEMM(TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, N); +#else + naive::gemm((TransA == CblasTrans), + (TransB == CblasTrans), + M, + N, + K, + alpha, + A, + B, + beta, + C); +#endif // !__NAIVE_GEMM__ +} + +// To align with Lego +#ifndef LEGO_USE_FLOAT +#define LEGO_USE_FLOAT +#endif +#ifndef LEGO_SSE +#define LEGO_SSE +#endif + +#if defined(LEGO_USE_FLOAT) + +#define __m256x __m256 +#define __m128x __m128 + +static const unsigned int AVX_STEP_SIZE = 8; +static const unsigned int SSE_STEP_SIZE = 4; +static const unsigned int AVX_CUT_LEN_MASK = 7U; +static const unsigned int SSE_CUT_LEN_MASK = 3U; + +#define _mm256_setzero_px _mm256_setzero_ps +#define _mm256_mul_px _mm256_mul_ps +#define _mm256_add_px _mm256_add_ps +#define _mm256_load_px _mm256_loadu_ps +#define _mm256_hadd_px _mm256_hadd_ps +#define _mm256_permute2f128_px _mm256_permute2f128_ps +#define _mm256_store_px _mm256_storeu_ps +#define _mm256_broadcast_sx _mm256_broadcast_ss +#define _mm256_castpx256_px128 _mm256_castps256_ps128 +#define _mm256_max_px _mm256_max_ps +#define _mm256_sub_px _mm256_sub_ps +#define _mm256_set1_px _mm256_set1_ps +#define _mm256_sqrt_px _mm256_sqrt_ps +#define _mm256_div_px _mm256_div_ps +#define _mm_setzero_px _mm_setzero_ps +#define _mm_add_px _mm_add_ps +#define _mm_mul_px _mm_mul_ps +#define _mm_load_px _mm_loadu_ps +#define _mm_hadd_px _mm_hadd_ps +#define _mm_store_sx _mm_store_ss +#define _mm_store_px _mm_storeu_ps +#define _mm_load1_px _mm_load1_ps +#define _mm_max_px _mm_max_ps +#define _mm_sub_px _mm_sub_ps +#define _mm_set1_px _mm_set1_ps +#define _mm_sqrt_px _mm_sqrt_ps +#define _mm_div_px _mm_div_ps + +#elif defined(LEGO_USE_DOUBLE) + +#define __m256x __m256d +#define __m128x __m128d + +static const unsigned int AVX_STEP_SIZE = 4; +static const unsigned int SSE_STEP_SIZE = 2; +static const unsigned int AVX_CUT_LEN_MASK = 3U; +static const unsigned int SSE_CUT_LEN_MASK = 1U; + +#define _mm256_setzero_px _mm256_setzero_pd +#define _mm256_mul_px _mm256_mul_pd +#define _mm256_add_px _mm256_add_pd +#define _mm256_load_px _mm256_loadu_pd +#define _mm256_hadd_px _mm256_hadd_pd +#define _mm256_permute2f128_px _mm256_permute2f128_pd +#define _mm256_store_px _mm256_storeu_pd +#define _mm256_broadcast_sx _mm256_broadcast_sd +#define _mm256_castpx256_px128 _mm256_castpd256_pd128 +#define _mm256_max_px _mm256_max_pd +#define _mm256_sub_px _mm256_sub_pd +#define _mm256_set1_px _mm256_set1_pd +#define _mm256_sqrt_px _mm256_sqrt_pd +#define _mm256_div_px _mm256_div_pd +#define _mm_setzero_px _mm_setzero_pd +#define _mm_add_px _mm_add_pd +#define _mm_mul_px _mm_mul_pd +#define _mm_load_px _mm_loadu_pd +#define _mm_hadd_px _mm_hadd_pd +#define _mm_store_sx _mm_store_sd +#define _mm_store_px _mm_storeu_pd +#define _mm_load1_px _mm_load1_pd +#define _mm_max_px _mm_max_pd +#define _mm_sub_px _mm_sub_pd +#define _mm_set1_px _mm_set1_pd +#define _mm_sqrt_px _mm_sqrt_pd +#define _mm_div_px _mm_div_pd +#endif + +template +inline void sse_eltadd(const T* x, const T* y, T* z, size_t len) { + unsigned int jjj, lll; + jjj = lll = 0; + +#if defined(LEGO_AVX) + lll = len & ~AVX_CUT_LEN_MASK; + for (jjj = 0; jjj < lll; jjj += AVX_STEP_SIZE) { + _mm256_store_px( + z + jjj, + _mm256_add_px(_mm256_load_px(x + jjj), _mm256_load_px(y + jjj))); + } +#elif defined(LEGO_SSE) + lll = len & ~SSE_CUT_LEN_MASK; + + for (jjj = 0; jjj < lll; jjj += SSE_STEP_SIZE) { + _mm_store_px(z + jjj, + _mm_add_px(_mm_load_px(x + jjj), _mm_load_px(y + jjj))); + } +#endif + for (; jjj < len; jjj++) { + z[jjj] = x[jjj] + 
y[jjj]; + } +} + +template +class SearchFcFunctor { + public: + void operator()(const lite::Context& context, + const lite::Tensor& X, + const lite::Tensor& W, + const lite::Tensor& b, + lite::Tensor* Out, + int out_size); +}; + +} // namespace math +} // namespace x86 +} // namespace lite +} // namespace paddle + +#define FOR_ALL_TYPES(macro) macro(float); diff --git a/lite/backends/x86/math/sequence_topk_avg_pooling.cc b/lite/backends/x86/math/sequence_topk_avg_pooling.cc new file mode 100644 index 0000000000000000000000000000000000000000..035a7923c70f91cf27f1d845f68110f8f33cb73d --- /dev/null +++ b/lite/backends/x86/math/sequence_topk_avg_pooling.cc @@ -0,0 +1,151 @@ +/* Copyright (c) 2018 paddlepaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "lite/backends/x86/math/sequence_topk_avg_pooling.h" +#include +#include + +namespace paddle { +namespace lite { +namespace x86 { +namespace math { + +template +void get_topk_pos(const T* data, int length, int k, int* pos, bool debug) { + size_t real_k = k < length ? k : length; + + std::vector v(data, data + length); + + std::vector topk_pos; + T min_val = -10000000.0; + while (topk_pos.size() < real_k) { + T max_val = min_val; + int max_pos = -1; + for (int i = 0; i < length; ++i) { + if (v[i] > max_val) { + max_pos = i; + max_val = v[i]; + } + } + + assert(max_pos >= 0); + + topk_pos.push_back(max_pos); + v[max_pos] = min_val; + } + + assert(topk_pos.size() > 0); + while (topk_pos.size() < (size_t)k) { + topk_pos.push_back(-1); + } + + for (size_t i = 0; i < topk_pos.size(); ++i) { + pos[i] = topk_pos[i]; + } +} + +/* + * All tensors' dimension should be the same and the values of + * each dimension must be the same, except the axis dimension. 
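get_topk_pos above collects the indices of the k largest values by repeated linear scans and pads with -1 when the row has fewer than k elements. A standalone equivalent for illustration (using std::partial_sort over indices instead of the scan; tie-breaking may differ):

#include <algorithm>
#include <iostream>
#include <numeric>
#include <vector>

// Indices of the k largest values in descending order, padded with -1.
std::vector<int> topk_pos(const std::vector<float>& v, int k) {
  std::vector<int> idx(v.size());
  std::iota(idx.begin(), idx.end(), 0);
  const int real_k = std::min<int>(k, static_cast<int>(idx.size()));
  std::partial_sort(idx.begin(), idx.begin() + real_k, idx.end(),
                    [&](int a, int b) { return v[a] > v[b]; });
  idx.resize(real_k);
  idx.resize(k, -1);  // mirror the -1 padding in get_topk_pos
  return idx;
}

int main() {
  const std::vector<float> row = {0.1f, 0.9f, 0.4f};
  for (int p : topk_pos(row, 5)) std::cout << p << " ";  // prints "1 2 0 -1 -1"
  std::cout << std::endl;
  return 0;
}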
+ */ +template +class SequenceTopkAvgPoolingFunctor { + public: + void operator()(const lite::Tensor& in, + const lite::Tensor& row, + const lite::Tensor& col, + lite::Tensor* out, + lite::Tensor* pos, + int channel_num, + std::vector topks) { + auto k_num = topks.size(); + auto max_k = topks[topks.size() - 1]; + std::vector vec_pos_shape; + auto in_lod = in.lod()[0]; + auto row_lod = row.lod()[0]; + auto col_lod = col.lod()[0]; + int batch_size = row_lod.size() - 1; + int pos_total_size = row_lod[batch_size] * channel_num * max_k; + vec_pos_shape.push_back(pos_total_size); + lite::DDim dims(vec_pos_shape); + pos->Resize(dims); + auto pos_data = pos->mutable_data(lite::TargetType::kX86); + + int offset = 0; + std::vector vec_out_lod; + vec_out_lod.reserve(batch_size + 1); + for (int i = 0; i <= batch_size; ++i) { + offset = row_lod[i]; + vec_out_lod.push_back(offset); + } + + lite::LoD lod_temp; + lod_temp.push_back(vec_out_lod); + out->set_lod(lod_temp); + + auto in_data = in.data(); + auto out_data = out->mutable_data(lite::TargetType::kX86); + + T* sum_data = new T[max_k]; + for (int i = 0; i < batch_size; ++i) { + int total_size = in_lod[i + 1] - in_lod[i]; + int row_size = row_lod[i + 1] - row_lod[i]; + int col_size = col_lod[i + 1] - col_lod[i]; + + CHECK_EQ(total_size, channel_num * row_size * col_size) + << "size wrong in sequence_topk_avg_pooling_op!"; + + int feature_num = row_size * col_size; + for (int j = 0; j < channel_num; ++j) { + auto input_offset_feature_data = in_data + in_lod[i] + j * feature_num; + + for (int r = 0; r < row_size; ++r) { + auto row_data = input_offset_feature_data + r * col_size; + auto pos_slice_data = pos_data + row_lod[i] * channel_num * max_k + + r * channel_num * max_k + j * max_k; + auto out_slice_data = out_data + row_lod[i] * channel_num * k_num + + r * channel_num * k_num + j * k_num; + + get_topk_pos(row_data, col_size, max_k, pos_slice_data); + if (pos_slice_data[0] == -1) { + sum_data[0] = 0.0; + } else { + sum_data[0] = row_data[pos_slice_data[0]]; + } + for (int k = 1; k < max_k; ++k) { + if (pos_slice_data[k] == -1) { + sum_data[k] = sum_data[k - 1]; + } else { + sum_data[k] = sum_data[k - 1] + row_data[pos_slice_data[k]]; + } + } + for (size_t k = 0; k < k_num; ++k) { + out_slice_data[k] = sum_data[topks[k] - 1] / topks[k]; + } + } + } + } + delete[] sum_data; + } +}; + +#define DEFINE_FUNCTOR(type) \ + template class SequenceTopkAvgPoolingFunctor; + +FOR_ALL_TYPES(DEFINE_FUNCTOR); + +} // namespace math +} // namespace x86 +} // namespace lite +} // namespace paddle diff --git a/lite/backends/x86/math/sequence_topk_avg_pooling.h b/lite/backends/x86/math/sequence_topk_avg_pooling.h new file mode 100644 index 0000000000000000000000000000000000000000..78d458c4d8fe0bf5a117cb5ad23d44bf0b7f3471 --- /dev/null +++ b/lite/backends/x86/math/sequence_topk_avg_pooling.h @@ -0,0 +1,46 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
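The functor above builds a running sum over the top-k values of each row (a missing position, marked -1, repeats the previous sum) and divides by each requested k. A scalar sketch of that accumulation, independent of Lite's tensor and LoD types:

#include <iostream>
#include <vector>

// row: feature values; pos: top-k positions, -1 meaning "missing";
// topks: requested k values in ascending order; returns one average per k.
std::vector<float> topk_avg(const std::vector<float>& row,
                            const std::vector<int>& pos,
                            const std::vector<int>& topks) {
  const int max_k = static_cast<int>(pos.size());
  std::vector<float> sum(max_k, 0.f);
  sum[0] = (pos[0] == -1) ? 0.f : row[pos[0]];
  for (int k = 1; k < max_k; ++k) {
    sum[k] = sum[k - 1] + ((pos[k] == -1) ? 0.f : row[pos[k]]);
  }
  std::vector<float> out;
  for (int k : topks) out.push_back(sum[k - 1] / k);
  return out;
}

int main() {
  // Top-2 values of the row are 0.9 and 0.4, so the averages are 0.9 and 0.65.
  for (float v : topk_avg({0.1f, 0.9f, 0.4f}, {1, 2, 0}, {1, 2})) {
    std::cout << v << " ";
  }
  std::cout << std::endl;
  return 0;
}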
*/ + +#pragma once +#include +#include "lite/core/context.h" +#include "lite/core/tensor.h" +#include "lite/fluid/data_type.h" + +namespace paddle { +namespace lite { +namespace x86 { +namespace math { +template +void get_topk_pos( + const T* data, int length, int k, int* pos, bool debug = false); + +template +class SequenceTopkAvgPoolingFunctor { + public: + void operator()(const lite::Tensor& X, + const lite::Tensor& ROW, + const lite::Tensor& COLUMN, + lite::Tensor* Out, + lite::Tensor* pos, + int channel_num, + std::vector topks); +}; + +} // namespace math +} // namespace x86 +} // namespace lite +} // namespace paddle + +#define FOR_ALL_TYPES(macro) macro(float); diff --git a/lite/core/CMakeLists.txt b/lite/core/CMakeLists.txt index b02ef8fed6ebd62282352f1e3cb6819a0f66885e..641302cd2d3739b08c18ca010ee72b7ffde9198c 100644 --- a/lite/core/CMakeLists.txt +++ b/lite/core/CMakeLists.txt @@ -100,7 +100,7 @@ add_custom_target(all_kernel_faked_cc DEPENDS all_kernel_faked.cc) #----------------------------------------------- NOT CHANGE ----------------------------------------------- lite_cc_library(kernel SRCS kernel.cc DEPS context type_system target_wrapper any op_params tensor - PROFILE_DEPS basic_profiler + PROFILE_DEPS lite_profiler ) lite_cc_library(op SRCS op_lite.cc DEPS scope op_registry target_wrapper kernel cpp_op_desc tensor @@ -114,7 +114,7 @@ lite_cc_library(type_system SRCS type_system.cc DEPS tensor target_wrapper) lite_cc_library(program SRCS program.cc DEPS op kernel model_parser ${ops} ${cpp_wrapper} - PROFILE_DEPS basic_profiler) + PROFILE_DEPS lite_profiler) if (NOT LITE_ON_TINY_PUBLISH) lite_cc_library(optimizer SRCS optimizer.cc DEPS mir_pass_manager model_parser program) diff --git a/lite/core/arena/framework.cc b/lite/core/arena/framework.cc index c59c078787b9a6778227ba6ba51230d1fc2104cb..561a508d20f1db9283a410b8ee35dd851149429c 100644 --- a/lite/core/arena/framework.cc +++ b/lite/core/arena/framework.cc @@ -37,6 +37,9 @@ void TestCase::CreateInstruction() { // prepare context (*it)->SetContext(std::move(ctx_)); instruction_.reset(new Instruction(op, std::move(*it))); +#ifdef LITE_WITH_PROFILE + instruction_->set_profiler(new profile::Profiler()); +#endif } void TestCase::PrepareInputsForInstruction() { diff --git a/lite/core/context.h b/lite/core/context.h index 19238f1a9b609c794a3dfe9763a8becdcca8ad16..5063600d3621f28dee8cfa91f79ae3287853f7ab 100644 --- a/lite/core/context.h +++ b/lite/core/context.h @@ -253,6 +253,13 @@ class Context { std::string name() const { return "CUDAContext"; } + CUDAContext& operator=(const CUDAContext& context) { + this->Init( + context.device_id_, context.exec_stream_id_, context.io_stream_id_); + cublas_fp32_ = const_cast(context).cublas_fp32(); + return *this; + } + private: int device_id_; // overall information @@ -345,7 +352,6 @@ class ContextScheduler { std::unique_ptr NewContext(TargetType target) { std::unique_ptr ctx(new KernelContext); - switch (target) { case TARGET(kHost): kernel_contexts_[TargetType::kHost].As().CopySharedTo( @@ -416,6 +422,7 @@ class ContextScheduler { void InitContext() { kernel_contexts_[Type].As().InitOnce(); } + ContextScheduler() { InitContext(); #ifdef LITE_WITH_X86 diff --git a/lite/core/device_info.cc b/lite/core/device_info.cc index 166c04c000d345eb39822d1d67321a1c6a05e9a5..f5b757ac3ccd6310f6a6fd9fe6483d28ff7adbc6 100644 --- a/lite/core/device_info.cc +++ b/lite/core/device_info.cc @@ -1039,7 +1039,7 @@ int DeviceInfo::Setup() { << ", max freq: " << max_freqs_[i] << ", min freq: " << min_freqs_[i] 
<< ", cluster ID: " << cluster_ids_[core_ids_[i]] - << ", CPU ARCH: A" << archs_[i]; + << ", CPU ARCH: A" << static_cast(archs_[i]); } LOG(INFO) << "L1 DataCache size is: "; for (int i = 0; i < core_num_; ++i) { @@ -1093,7 +1093,7 @@ void DeviceInfo::SetRunMode(lite_api::PowerMode mode, int thread_num) { RequestPowerRandLowMode(shift_num, thread_num); break; default: - LOG(FATAL) << "Unsupported power mode: " << mode; + LOG(FATAL) << "Unsupported power mode: " << static_cast(mode); break; } if (active_ids_.empty()) { diff --git a/lite/core/kernel.h b/lite/core/kernel.h index 05d7a6b333810a8dc988d84a281f096babe8929f..86193235a2984b15a33c2eeaff15865d9f126eeb 100644 --- a/lite/core/kernel.h +++ b/lite/core/kernel.h @@ -31,7 +31,7 @@ #include "lite/utils/replace_stl/stream.h" #ifdef LITE_WITH_PROFILE -#include "lite/core/profile/basic_profiler.h" +#include "lite/core/profile/profiler.h" #endif // LITE_WITH_PROFILE namespace paddle { @@ -58,7 +58,10 @@ class KernelBase { virtual void Run() = 0; #ifdef LITE_WITH_PROFILE - void SetProfileID(uint32_t id) { profile_id_ = id; } + void SetProfiler(profile::Profiler* profiler, int id) { + profiler_ = profiler; + profile_id_ = id; + } #endif void Launch() { @@ -82,10 +85,12 @@ class KernelBase { #endif #ifdef LITE_WITH_PROFILE - if (profile_id_ >= 0) { - profile::ProfileBlock x(profile_id_, "kernel"); - Run(); - } + CHECK(profiler_) << "Profiler pointer of kernel can not be nullptr. " + "When LITE_WITH_PROFILE is defined, please set a " + "Profiler for Instruction."; + profiler_->StartTiming(profile_id_, ctx_.get()); + Run(); + profiler_->StopTiming(profile_id_, ctx_.get()); #else Run(); #endif @@ -175,6 +180,7 @@ class KernelBase { bool is_first_epoch_{true}; #ifdef LITE_WITH_PROFILE + profile::Profiler* profiler_{nullptr}; int profile_id_{-1}; #endif }; diff --git a/lite/core/memory.cc b/lite/core/memory.cc index ec94f69be1e5c107cc61af80cdea7d006436021b..eefada3f998d5ad533c832fcd2a2c0b6c90d23d0 100644 --- a/lite/core/memory.cc +++ b/lite/core/memory.cc @@ -110,7 +110,7 @@ void TargetCopy(TargetType target, void* dst, const void* src, size_t size) { TargetWrapper::MemcpySync( dst, src, size, IoDirection::DtoD); break; -#endif +#endif #ifdef LITE_WITH_OPENCL case TargetType::kOpenCL: TargetWrapperCL::MemcpySync(dst, src, size, IoDirection::DtoD); diff --git a/lite/core/mir/fusion/conv_activation_fuse_pass.cc b/lite/core/mir/fusion/conv_activation_fuse_pass.cc index ff064fb2ee93fc540e932da36fb07bb78eef989a..0d11b47db6a7f767f8cd032877d8647b0872b8d4 100644 --- a/lite/core/mir/fusion/conv_activation_fuse_pass.cc +++ b/lite/core/mir/fusion/conv_activation_fuse_pass.cc @@ -47,4 +47,5 @@ void ConvActivationFusePass::Apply(const std::unique_ptr& graph) { REGISTER_MIR_PASS(lite_conv_activation_fuse_pass, paddle::lite::mir::ConvActivationFusePass) .BindTargets({TARGET(kAny)}) + .ExcludeTargets({TARGET(kXPU)}) .BindKernel("conv2d"); diff --git a/lite/core/mir/fusion/conv_bn_fuse_pass.cc b/lite/core/mir/fusion/conv_bn_fuse_pass.cc index d9d9c1bbf55bd33c31aa9a22de934d4eae8657c6..5ab5f8c0a4797e51cce656de43883a68d4931e9b 100644 --- a/lite/core/mir/fusion/conv_bn_fuse_pass.cc +++ b/lite/core/mir/fusion/conv_bn_fuse_pass.cc @@ -45,4 +45,4 @@ void ConvBNFusePass::Apply(const std::unique_ptr& graph) { REGISTER_MIR_PASS(lite_conv_bn_fuse_pass, paddle::lite::mir::ConvBNFusePass) .BindTargets({TARGET(kAny)}) - .ExcludeTargets({TARGET(kX86)}); + .ExcludeTargets({TARGET(kX86), TARGET(kXPU)}); diff --git a/lite/core/mir/fusion/conv_elementwise_fuse_pass.cc 
b/lite/core/mir/fusion/conv_elementwise_fuse_pass.cc index fd9aadc5d01c2cb3b6c7a3e888503072a0798725..b1b492ce030c7a46d8b23936c1661f3d743eb9cb 100644 --- a/lite/core/mir/fusion/conv_elementwise_fuse_pass.cc +++ b/lite/core/mir/fusion/conv_elementwise_fuse_pass.cc @@ -46,4 +46,5 @@ void ConvElementwiseFusePass::Apply(const std::unique_ptr& graph) { REGISTER_MIR_PASS(lite_conv_elementwise_fuse_pass, paddle::lite::mir::ConvElementwiseFusePass) - .BindTargets({TARGET(kAny)}); + .BindTargets({TARGET(kAny)}) + .ExcludeTargets({TARGET(kXPU)}); diff --git a/lite/core/mir/fusion/elementwise_add_activation_fuse_pass.cc b/lite/core/mir/fusion/elementwise_add_activation_fuse_pass.cc index af66f5ab66bd09907cb9d28f00f17d983e54c252..e4391cd24287cafe457074733ba73208288c3375 100644 --- a/lite/core/mir/fusion/elementwise_add_activation_fuse_pass.cc +++ b/lite/core/mir/fusion/elementwise_add_activation_fuse_pass.cc @@ -35,4 +35,5 @@ void ElementwiseAddActivationFusePass::Apply( REGISTER_MIR_PASS(lite_elementwise_add_activation_fuse_pass, paddle::lite::mir::ElementwiseAddActivationFusePass) .BindTargets({TARGET(kAny)}) + .ExcludeTargets({TARGET(kXPU)}) .BindKernel("fusion_elementwise_add_activation"); diff --git a/lite/core/mir/fusion/fc_fuse_pass.cc b/lite/core/mir/fusion/fc_fuse_pass.cc index ed10f06f5651f4000485279d682689101d80aa5a..7fc449219251bbd7e639e8092099f43fe8eca626 100644 --- a/lite/core/mir/fusion/fc_fuse_pass.cc +++ b/lite/core/mir/fusion/fc_fuse_pass.cc @@ -33,4 +33,5 @@ void FcFusePass::Apply(const std::unique_ptr& graph) { REGISTER_MIR_PASS(lite_fc_fuse_pass, paddle::lite::mir::FcFusePass) .BindTargets({TARGET(kAny)}) + .ExcludeTargets({TARGET(kXPU)}) .BindKernel("fc"); diff --git a/lite/core/mir/fusion/quant_dequant_op_fuser.cc b/lite/core/mir/fusion/quant_dequant_op_fuser.cc index f823f45dc66f8ef6cc67cbb9b0d9860c86ec9340..da611e4490f4ba7268d9011b3dbb391a63a88305 100644 --- a/lite/core/mir/fusion/quant_dequant_op_fuser.cc +++ b/lite/core/mir/fusion/quant_dequant_op_fuser.cc @@ -396,6 +396,8 @@ void DeleteQuantDequantOpFuser::InsertNewNode(SSAGraph* graph, op_desc->SetAttr("input_scale", scale_value); op_desc->SetInput("X", {input_act_node->arg()->name}); IR_NODE_LINK_TO(input_act_node, quantized_node) + auto update_op_desc = *quantized_node->stmt()->mutable_op_info(); + quantized_node->stmt()->ResetOp(update_op_desc, graph->valid_places()); // delete nodes and edges std::unordered_set nodes2rm = {input_scale_node, @@ -440,6 +442,8 @@ void DeleteQuantDequantOpFuser::InsertNewNode(SSAGraph* graph, op_desc->SetInput("Y", {input_act_right_node->arg()->name}); IR_NODE_LINK_TO(input_act_left_node, quantized_node) IR_NODE_LINK_TO(input_act_right_node, quantized_node) + auto update_op_desc = *quantized_node->stmt()->mutable_op_info(); + quantized_node->stmt()->ResetOp(update_op_desc, graph->valid_places()); // delete nodes and edges std::unordered_set nodes2rm = {input_scale_left_node, diff --git a/lite/core/mir/memory_optimize_pass.cc b/lite/core/mir/memory_optimize_pass.cc index 1f2355e8a3205cce3410bd2cb6ac4a17d8fde602..4f41ba4a601ae763e6fa48c0a98de238252ea7c2 100644 --- a/lite/core/mir/memory_optimize_pass.cc +++ b/lite/core/mir/memory_optimize_pass.cc @@ -255,4 +255,5 @@ void MemoryOptimizePass::Apply(const std::unique_ptr& graph) { } // namespace paddle REGISTER_MIR_PASS(memory_optimize_pass, paddle::lite::mir::MemoryOptimizePass) - .BindTargets({TARGET(kARM)}); + .BindTargets({TARGET(kARM)}) + .ExcludeTargets({TARGET(kOpenCL), TARGET(kNPU), TARGET(kXPU)}); diff --git a/lite/core/mir/pass.h 
b/lite/core/mir/pass.h index 4de0fdbf357160348a403d3c8527fe62891237f0..4e8c8be292bbd5e7f46664378634d4f1aeed2965 100644 --- a/lite/core/mir/pass.h +++ b/lite/core/mir/pass.h @@ -52,34 +52,44 @@ class Pass { // Bind targets. At runtime, there must be one device in the bound targets. void BindTargets(const std::set& targets) { - std::set res; for (const auto& target : targets) { const std::set& universe = ExpandValidTargets(target); std::set_union(bound_targets_.begin(), bound_targets_.end(), universe.begin(), universe.end(), - std::inserter(res, res.begin())); + std::inserter(bound_targets_, bound_targets_.begin())); } - bound_targets_ = res; } // Exclude targets. At runtime, there must be one device in the bound targets. + // Disable the pass if one of the valid devices is in the excluded targets. void ExcludeTargets(const std::set& targets) { - std::set res; for (const auto& target : targets) { const std::set& universe = ExpandValidTargets(target); - std::set_difference(bound_targets_.begin(), - bound_targets_.end(), - universe.begin(), - universe.end(), - std::inserter(res, res.begin())); + std::set updated_bound_targets; + std::set_difference( + bound_targets_.begin(), + bound_targets_.end(), + universe.begin(), + universe.end(), + std::inserter(updated_bound_targets, updated_bound_targets.begin())); + bound_targets_ = updated_bound_targets; + std::set_union( + excluded_targets_.begin(), + excluded_targets_.end(), + universe.begin(), + universe.end(), + std::inserter(excluded_targets_, excluded_targets_.begin())); } - bound_targets_ = res; } // Get all bound targets. - const std::set& Targets() const { return bound_targets_; } + const std::set& BoundTargets() const { return bound_targets_; } + // Get all excluded targets. + const std::set& ExcludedTargets() const { + return excluded_targets_; + } // Some passes are only available on qualified kernels and need to be // explicitly declared. @@ -116,6 +126,7 @@ class Pass { std::string name_; std::string doc_; std::set bound_targets_; + std::set excluded_targets_; std::unordered_map> bound_kernels_; }; diff --git a/lite/core/mir/pass_utils.cc b/lite/core/mir/pass_utils.cc index 4f6be2c186d2d940a799201812cce397a9e94eb4..5bddfcbd3c17288546dc6e0a0b4ebf984d26c504 100644 --- a/lite/core/mir/pass_utils.cc +++ b/lite/core/mir/pass_utils.cc @@ -47,10 +47,34 @@ bool KernelRegistered(const std::string name, const Place& place) { return false; } -bool PassMatchesTarget(const mir::Pass& pass, TargetType target) { - const auto& targets = pass.Targets(); - if (targets.find(TARGET(kAny)) != targets.end()) return true; - return (targets.find(target) != targets.end()); +bool PassMatchesTarget(const mir::Pass& pass, + const std::set& targets) { + // Whether the pass is suitable for targets ? The condition is the + // intersection of targets and pass's bound targets is not empty, besides the + // intersection of targets and pass's excluded targets is empty. The formula + // is as follows: matched = !empty(targets ^ pass.bound_targets) && + // empty(targets ^ pass.excluded_targets), where ^ is intersection operation. 
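The comment above gives the new matching rule: a pass is applied only when the valid targets intersect its bound targets and do not intersect its excluded targets. A worked example of that predicate over plain std::set (target names as strings stand in for TargetType, and the kAny wildcard handling is omitted):

#include <algorithm>
#include <iostream>
#include <iterator>
#include <set>
#include <string>

// matched = !empty(targets ^ bound) && empty(targets ^ excluded), ^ = intersection
bool Matches(const std::set<std::string>& targets,
             const std::set<std::string>& bound,
             const std::set<std::string>& excluded) {
  std::set<std::string> hit, blocked;
  std::set_intersection(targets.begin(), targets.end(),
                        bound.begin(), bound.end(),
                        std::inserter(hit, hit.begin()));
  std::set_intersection(targets.begin(), targets.end(),
                        excluded.begin(), excluded.end(),
                        std::inserter(blocked, blocked.begin()));
  return !hit.empty() && blocked.empty();
}

int main() {
  // e.g. a fuse pass bound to every concrete target but excluded on kXPU.
  const std::set<std::string> bound = {"kARM", "kX86", "kNPU", "kXPU"};
  const std::set<std::string> excluded = {"kXPU"};
  std::cout << Matches({"kARM"}, bound, excluded) << std::endl;          // 1: pass runs
  std::cout << Matches({"kARM", "kXPU"}, bound, excluded) << std::endl;  // 0: pass skipped
  return 0;
}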
+ const auto& bound_targets = pass.BoundTargets(); + bool matched = bound_targets.find(TARGET(kAny)) != bound_targets.end(); + std::set inter_bound_targets; + std::set_intersection( + bound_targets.begin(), + bound_targets.end(), + targets.begin(), + targets.end(), + std::inserter(inter_bound_targets, inter_bound_targets.begin())); + matched |= !inter_bound_targets.empty(); + const auto& excluded_targets = pass.ExcludedTargets(); + matched &= excluded_targets.find(TARGET(kAny)) == excluded_targets.end(); + std::set inter_excluded_targets; + std::set_intersection( + excluded_targets.begin(), + excluded_targets.end(), + targets.begin(), + targets.end(), + std::inserter(inter_excluded_targets, inter_excluded_targets.begin())); + matched &= inter_excluded_targets.empty(); + return matched; } bool PassMatchesKernels(const mir::Pass& pass) { diff --git a/lite/core/mir/pass_utils.h b/lite/core/mir/pass_utils.h index 942f64bf3190be1f399ac6f014be0881b1450d9b..57e8da5e461f40bd79ece8139c3290e17e762996 100644 --- a/lite/core/mir/pass_utils.h +++ b/lite/core/mir/pass_utils.h @@ -14,6 +14,7 @@ #pragma once +#include #include #include "lite/core/mir/pass.h" @@ -24,7 +25,8 @@ namespace lite { bool KernelRegistered(const std::string name, const Place& place); // Check if the pass hits the hardware target. -bool PassMatchesTarget(const mir::Pass& pass, TargetType target); +bool PassMatchesTarget(const mir::Pass& pass, + const std::set& targets); // Check if the pass hits all necessary operators. bool PassMatchesKernels(const mir::Pass& pass); diff --git a/lite/core/mir/static_kernel_pick_pass.h b/lite/core/mir/static_kernel_pick_pass.h index 7187ddcef6626888eaaf372f7b027aa5d9bd2a3a..cd54e2654c22b98cbacc9a73bef7770a029c0b30 100644 --- a/lite/core/mir/static_kernel_pick_pass.h +++ b/lite/core/mir/static_kernel_pick_pass.h @@ -48,7 +48,8 @@ class StaticKernelPickPass : public mir::StmtPass { private: // Score the kernel. - size_t KernelGrade(const lite::KernelBase& kernel, + size_t KernelGrade(const lite::mir::Node::Stmt& instruct, + const lite::KernelBase& kernel, const std::vector& places) { CHECK_GT(places.size(), 0) << "valid_places is empty."; float final_score{-1.}; @@ -66,10 +67,11 @@ class StaticKernelPickPass : public mir::StmtPass { // valid_places.size() as default. // where i is the place's index in valid_places array. 
// score: score is the weighted sum of target、percision and layout - for (int i = 0; i < place_size; ++i) { + for (size_t i = 0; i < place_size; ++i) { const auto& place = places[i]; float weight = static_cast(place_size - i) / place_size; size_t score{}; + // The more important factor comes first if (kernel_pick_factors_.IsTargetConsidered() && (place.target == kernel.target() || kernel.target() == TARGET(kAny) || @@ -82,8 +84,12 @@ class StaticKernelPickPass : public mir::StmtPass { (place.precision == kernel.precision() || kernel.precision() == PRECISION(kAny) || place.precision == PRECISION(kAny))) { - score += kMax / static_cast( - core::KernelPickFactor::Factor::PrecisionFirst); + // score skipped, if kernel is int8, but op is not int8 + if (!(kernel.precision() == PRECISION(kInt8) && + !instruct.op_info()->HasAttr("enable_int8"))) { + score += kMax / static_cast( + core::KernelPickFactor::Factor::PrecisionFirst); + } } VLOG(4) << "[score s2]:" << score; if (kernel_pick_factors_.IsDataLayoutConsidered() && @@ -102,17 +108,17 @@ class StaticKernelPickPass : public mir::StmtPass { VLOG(4) << "[score(final)]:" << final_score; VLOG(4) << "-------- pick summary --------"; - VLOG(4) << " ===> place():" << PrecisionToStr(winner_place.precision) << " " - << DataLayoutToStr(winner_place.layout) << " " + VLOG(4) << " ===> winner_place():" << PrecisionToStr(winner_place.precision) + << " " << DataLayoutToStr(winner_place.layout) << " " << TargetToStr(winner_place.target); VLOG(4) << " ===> kernel.place():" << PrecisionToStr(kernel.place().precision) << " " << DataLayoutToStr(kernel.place().layout) << " " << TargetToStr(kernel.place().target); VLOG(4) << "kernel.op_type():" << kernel.op_type(); - VLOG(4) << "picker tactic " << kernel_pick_factors_; - VLOG(4) << "kernel place " << kernel.place().DebugString(); - VLOG(4) << "picker place " << winner_place.DebugString(); + VLOG(4) << "kernel picker factors:" << kernel_pick_factors_; + VLOG(4) << "kernel place:" << kernel.place().DebugString(); + VLOG(4) << "winner_picker place:" << winner_place.DebugString(); VLOG(4) << "------------------------------"; // The data layout is not considered, for the input and output arguments diff --git a/lite/core/mir/subgraph/generate_npu_program_pass.cc b/lite/core/mir/subgraph/generate_npu_program_pass.cc index c83cd70d8225a0b33a50ebdad331283f377e0059..65c29aa68f1c8c5f5702ca97d27f9579edc7a951 100644 --- a/lite/core/mir/subgraph/generate_npu_program_pass.cc +++ b/lite/core/mir/subgraph/generate_npu_program_pass.cc @@ -128,10 +128,10 @@ std::string GenerateNPUProgramPass::BuildNPUGraph( // persistable=true, Sothat the model parser can recognize it and save it to // param files if (!lite::npu::BuildModel(inputs, outputs, weight)) { - LOG(WARNING) << "[NPU] Build NPU graph failed (subgraph=" << sub_id << ")"; - throw std::runtime_error("Build NPU graph failed."); + LOG(FATAL) << "[NPU] Build NPU graph failed (subgraph=" << sub_id << ")"; + } else { + LOG(INFO) << "[NPU] Build NPU graph success (subgraph=" << sub_id << ")"; } - LOG(INFO) << "[NPU] Build NPU graph success (subgraph=" << sub_id << ")"; return weight_var_name; } @@ -175,40 +175,19 @@ void GenerateNPUProgramPass::Apply(const std::unique_ptr& graph) { supported_op_types.push_back(i.first); } - try { - int num_subgraph = FuseSubgraph(graph, supported_op_types); - InferOnce(graph); - auto op_nodes_all = ClassifySubgraph(graph); - CHECK_EQ(op_nodes_all.size(), num_subgraph); - int id = 1; - for (auto& op_nodes : op_nodes_all) { - LOG(INFO) << "[NPU] 
Converting Subgraph " << id; - GenNPUSubgraph(graph, op_nodes.second, id); - LOG(INFO) << "[NPU] After NPU Pass Subgraph " << id << "\n" - << Visualize(graph.get()); - id++; - } - } catch (...) { - LOG(WARNING) << "[NPU] Build NPU graph failed."; - throw std::runtime_error("[NPU] Build NPU graph failed."); - } - - for (auto& item : graph->StmtTopologicalOrder()) { - if (item->IsStmt()) { - auto& stmt = item->AsStmt(); - LOG(INFO) << stmt; - insts_.emplace_back(stmt.op(), std::move(stmt.kernels().front())); - } + int num_subgraph = FuseSubgraph(graph, supported_op_types); + InferOnce(graph); + auto op_nodes_all = ClassifySubgraph(graph); + CHECK_EQ(op_nodes_all.size(), num_subgraph); + int id = 1; + for (auto& op_nodes : op_nodes_all) { + LOG(INFO) << "[NPU] Converting Subgraph " << id; + GenNPUSubgraph(graph, op_nodes.second, id); + LOG(INFO) << "[NPU] After NPU Pass Subgraph " << id << "\n" + << Visualize(graph.get()); + id++; } } - -std::unique_ptr GenerateNPUProgramPass::GenProgram() { - LOG(INFO) << "[NPU] program insts.size " << insts_.size(); - std::unique_ptr program( - new RuntimeProgram(std::move(insts_))); - return program; -} - } // namespace subgraph } // namespace mir } // namespace lite diff --git a/lite/core/mir/subgraph/generate_npu_program_pass.h b/lite/core/mir/subgraph/generate_npu_program_pass.h index 823ca5f1f624a9e920a5f395a9d5098c5ea52929..5b1a98c6ed0e10f4fae8832b9ba3c5f98f3d9ed9 100644 --- a/lite/core/mir/subgraph/generate_npu_program_pass.h +++ b/lite/core/mir/subgraph/generate_npu_program_pass.h @@ -35,7 +35,6 @@ class GenerateNPUProgramPass : public SubgraphProgramPass { using key2nodes_t = std::map; void Apply(const std::unique_ptr& graph) override; - std::unique_ptr GenProgram(); protected: // nodes2cvt: op nodes to convert @@ -54,9 +53,6 @@ class GenerateNPUProgramPass : public SubgraphProgramPass { void GenNPUSubgraph(const std::unique_ptr& graph, const std::unordered_set& op_nodes, int sub_id); - - private: - std::vector insts_; }; } // namespace subgraph diff --git a/lite/core/mir/subgraph/generate_npu_program_pass_test.cc b/lite/core/mir/subgraph/generate_npu_program_pass_test.cc index 95339d6175c98f22d542db24f02d6d714ccbe2a8..1afb54c692592ca42d8b120dcf1a91922e19149c 100644 --- a/lite/core/mir/subgraph/generate_npu_program_pass_test.cc +++ b/lite/core/mir/subgraph/generate_npu_program_pass_test.cc @@ -160,8 +160,8 @@ TEST(NPUSubgraph, compare) { TestModel(FLAGS_model_dir, FLAGS_model_file, FLAGS_params_file, - {lite_api::Place{TARGET(kARM), PRECISION(kFloat)}, - lite_api::Place{TARGET(kNPU), PRECISION(kFloat)}}, + {lite_api::Place{TARGET(kNPU), PRECISION(kFloat)}, + lite_api::Place{TARGET(kARM), PRECISION(kFloat)}}, input_tensor_shape, FLAGS_optimized_model_dir + "/NPU"); // verify results diff --git a/lite/core/mir/subgraph/generate_xpu_program_pass.cc b/lite/core/mir/subgraph/generate_xpu_program_pass.cc index 319e1e51feb917b803753807ddbb1f72c2cb7084..4340cb4ee3cccad32db9bc333b5856386812c62a 100644 --- a/lite/core/mir/subgraph/generate_xpu_program_pass.cc +++ b/lite/core/mir/subgraph/generate_xpu_program_pass.cc @@ -115,10 +115,10 @@ std::string GenerateXPUProgramPass::BuildXPUGraph( graph_ctx.params, &ordered_cvted_var_nodes, weight)) { - LOG(WARNING) << "[XPU] Build XPU graph failed (subgraph=" << sub_id << ")"; - throw std::runtime_error("[XPU] Build XPU graph failed."); + LOG(FATAL) << "[XPU] Build XPU graph failed (subgraph=" << sub_id << ")"; + } else { + LOG(INFO) << "[XPU] Build XPU graph success (subgraph=" << sub_id << ")"; } - LOG(INFO) << 
"[XPU] Build XPU graph success (subgraph=" << sub_id << ")"; return weight_var_name; } @@ -162,40 +162,19 @@ void GenerateXPUProgramPass::Apply(const std::unique_ptr& graph) { supported_op_types.push_back(i.first); } - try { - int num_subgraph = FuseSubgraph(graph, supported_op_types); - InferOnce(graph); - auto op_nodes_all = ClassifySubgraph(graph); - CHECK_EQ(op_nodes_all.size(), num_subgraph); - int id = 1; - for (auto& op_nodes : op_nodes_all) { - LOG(INFO) << "[XPU] Converting Subgraph " << id; - GenXPUSubgraph(graph, op_nodes.second, id); - LOG(INFO) << "[XPU] After XPU Pass Subgraph " << id << "\n" - << Visualize(graph.get()); - id++; - } - } catch (...) { - LOG(WARNING) << "[XPU] Build XPU graph failed."; - throw std::runtime_error("[XPU] Build XPU graph failed."); - } - - for (auto& item : graph->StmtTopologicalOrder()) { - if (item->IsStmt()) { - auto& stmt = item->AsStmt(); - LOG(INFO) << stmt; - insts_.emplace_back(stmt.op(), std::move(stmt.kernels().front())); - } + int num_subgraph = FuseSubgraph(graph, supported_op_types); + InferOnce(graph); + auto op_nodes_all = ClassifySubgraph(graph); + CHECK_EQ(op_nodes_all.size(), num_subgraph); + int id = 1; + for (auto& op_nodes : op_nodes_all) { + LOG(INFO) << "[XPU] Converting Subgraph " << id; + GenXPUSubgraph(graph, op_nodes.second, id); + LOG(INFO) << "[XPU] After XPU Pass Subgraph " << id << "\n" + << Visualize(graph.get()); + id++; } } - -std::unique_ptr GenerateXPUProgramPass::GenProgram() { - LOG(INFO) << "[XPU] program insts.size=" << insts_.size(); - std::unique_ptr program( - new RuntimeProgram(std::move(insts_))); - return program; -} - } // namespace subgraph } // namespace mir } // namespace lite diff --git a/lite/core/mir/subgraph/generate_xpu_program_pass.h b/lite/core/mir/subgraph/generate_xpu_program_pass.h index cf121ae9503201e8cf6be40fe9054ccaf6e4b172..777642cfb6c61671a8aeb119c70664297573d9a7 100644 --- a/lite/core/mir/subgraph/generate_xpu_program_pass.h +++ b/lite/core/mir/subgraph/generate_xpu_program_pass.h @@ -35,7 +35,6 @@ class GenerateXPUProgramPass : public SubgraphProgramPass { using key2nodes_t = std::map; void Apply(const std::unique_ptr& graph) override; - std::unique_ptr GenProgram(); protected: // nodes2cvt: op nodes to convert @@ -58,9 +57,6 @@ class GenerateXPUProgramPass : public SubgraphProgramPass { void GenXPUSubgraph(const std::unique_ptr& graph, const std::unordered_set& op_nodes, int sub_id); - - private: - std::vector insts_; }; } // namespace subgraph diff --git a/lite/core/mir/type_layout_cast_pass.cc b/lite/core/mir/type_layout_cast_pass.cc index 9d63dcbb38b2354c567ca1e0d434ac1a4be424c1..b3b7a858f68367ac789f390c6bd3bd94873f77d5 100644 --- a/lite/core/mir/type_layout_cast_pass.cc +++ b/lite/core/mir/type_layout_cast_pass.cc @@ -127,24 +127,30 @@ void TypeLayoutTransformPass::AddLayoutInst( for (auto& kernel : kernels) { const Type* in_arg_ty = kernel->GetInputDeclType("Input"); const Type* out_arg_ty = kernel->GetOutputDeclType("Out"); -#ifdef LITE_WITH_OPENCL + // layout kernel choose // must ignore [layout check] for layout of kernels's input and output - if (TargetCompatibleTo(*in_arg_ty, from) && - PrecisionCompatibleTo(*in_arg_ty, from) && - DeviceCompatibleTo(*in_arg_ty, from) && - out_arg_ty->layout() == to.layout()) { -#else - if (TypeCompatible(*in_arg_ty, from) && - out_arg_ty->layout() == to.layout()) { -#endif + // note: replace LITE_WITH_OPENCL macro with judge input and output target + // of layout_trans + if ((in_arg_ty->target() == TARGET(kOpenCL) || + 
out_arg_ty->target() == TARGET(kOpenCL)) && // judge OpenCL first + (TargetCompatibleTo(*in_arg_ty, from) && + PrecisionCompatibleTo(*in_arg_ty, from) && + DeviceCompatibleTo(*in_arg_ty, from) && + out_arg_ty->layout() == to.layout())) { + is_found = true; + } else if (TypeCompatible(*in_arg_ty, from) && + out_arg_ty->layout() == to.layout()) { is_found = true; + } + if (is_found) { selected_kernels.emplace_back(std::move(kernel)); // we pick the kernel layout_inst->AsStmt(layout_type, std::move(selected_kernels), layout_op); break; } } + CHECK(is_found) << "Can't find a layout kernel for layout op: " << from << ":" << in->AsArg().name << "->" << to << ":" << inst_node->AsStmt().op_info()->Type(); diff --git a/lite/core/mir/type_target_cast_pass.cc b/lite/core/mir/type_target_cast_pass.cc index 7a3277786553d8a256c48e9e5c99530b8d5681b5..b008faa687474a88988adb9da81c594306298b26 100644 --- a/lite/core/mir/type_target_cast_pass.cc +++ b/lite/core/mir/type_target_cast_pass.cc @@ -128,10 +128,9 @@ void TypeTargetTransformPass::AddIoCopyInst( VLOG(4) << "out_arg_ty(io_copy kernel output):" << *out_arg_ty; VLOG(4) << "to:" << to << "\n"; -// kernel choose branch for opencl backend -// judge inst's target whether is kOpenCL -// Note: to == *decl_arg_type == in of inst, not output of last inst -#ifdef LITE_WITH_OPENCL + // kernel choose branch for opencl backend + // judge inst's target whether is kOpenCL + // Note: to == *decl_arg_type == in of inst, not output of last inst // ignore [layout check] for layout between [to] and [from] // Because all of origin opencl insts in model, are not default layout // NCHW, @@ -141,25 +140,34 @@ void TypeTargetTransformPass::AddIoCopyInst( // [*decl_arg_type] -> [to]: input of inst, not output of last // [in_arg_ty]: in of io_copy // [out_arg_ty]: out of io_copy - if (TargetCompatibleTo(*in_arg_ty, from) && - PrecisionCompatibleTo(*in_arg_ty, from) && - DeviceCompatibleTo(*in_arg_ty, from) && - TargetCompatibleTo(*out_arg_ty, to)) { - VLOG(4) << "do nothing. 
opencl found"; -#else - if (TypeCompatible(*in_arg_ty, from) && - out_arg_ty->target() == to.target()) { -#endif + // + // noto: replace LITE_WITH_OPENCL macro with judge input and output target + // of io_copy + if ((in_arg_ty->target() == TARGET(kOpenCL) || + out_arg_ty->target() == TARGET(kOpenCL)) && // judge OpenCL first + (TargetCompatibleTo(*in_arg_ty, from) && + PrecisionCompatibleTo(*in_arg_ty, from) && + DeviceCompatibleTo(*in_arg_ty, from) && + TargetCompatibleTo(*out_arg_ty, to))) { + VLOG(4) << "picked, opencl found"; + is_found = true; + } else if (TypeCompatible(*in_arg_ty, from) && + out_arg_ty->target() == to.target()) { VLOG(4) << "picked"; is_found = true; + } + + if (is_found) { selected_kernels.emplace_back(std::move(kernel)); // we pick the kernel io_copy_inst->AsStmt( io_copy_type, std::move(selected_kernels), io_copy_op); break; } + VLOG(4) << "not picked"; } + CHECK(is_found) << "Can't find a io_copy kernel for io_copy op: " << from << ":" << in->AsArg().name << " -> " << to << ":" << inst_node->AsStmt().op_info()->Type(); diff --git a/lite/core/mir/variable_place_inference_pass.h b/lite/core/mir/variable_place_inference_pass.h index fe6ecfd66df23bb704fafcbf94106f7ca973c4f1..3f5d161a56aafa7fd9d058fd404e65cb04572116 100644 --- a/lite/core/mir/variable_place_inference_pass.h +++ b/lite/core/mir/variable_place_inference_pass.h @@ -54,40 +54,50 @@ class VariablePlaceInferencePass : public DebugPass { } } - // Set the tye of the weight - void SetWeightType(Node* w, const LiteType& type) { -// TODO(xg) to optimize this -#ifdef LITE_WITH_FPGA - w->AsArg().type = LiteType::GetTensorTy( - TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW)); -#endif - -#ifdef LITE_WITH_OPENCL - w->AsArg().type = LiteType::GetTensorTy( - TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW)); -#endif - -#ifndef LITE_WITH_FPGA -#ifndef LITE_WITH_OPENCL - w->AsArg().type = LiteType::GetTensorTy( - TARGET(kHost), type.precision(), DATALAYOUT(kNCHW)); -#endif -#endif + // Set the type of the weight + void SetWeightType(Node* w, + const LiteType& type, + const std::map& lite_with_targets) { + VLOG(4) << "type.precision():" << PrecisionRepr(type.precision()); + if (lite_with_targets.at("kFPGA")) { + w->AsArg().type = LiteType::GetTensorTy( + TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW)); + } else if (lite_with_targets.at("kOpenCL")) { + w->AsArg().type = LiteType::GetTensorTy( + TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW)); + } else { + w->AsArg().type = LiteType::GetTensorTy( + TARGET(kHost), type.precision(), DATALAYOUT(kNCHW)); + } } void InferenceArgumentPlace(SSAGraph* graph) { + auto& valid_places = graph->valid_places(); + auto valid_places_has_target = [&](TargetType t) -> bool { + for (auto& p : valid_places) { + if (p.target == t) { + return true; + } + } + return false; + }; + std::map lite_with_targets{ + {"kOpenCL", valid_places_has_target(TARGET(kOpenCL))}, + {"kFPGA", valid_places_has_target(TARGET(kFPGA))}}; + VLOG(4) << "lite_with_targets['kOpenCL']:" << lite_with_targets["kOpenCL"]; + VLOG(4) << "lite_with_targets['kFPGA']:" << lite_with_targets["kFPGA"]; + VLOG(3) << "param-type-registry:\n" << ParamTypeRegistry::Global(); for (auto& x : graph->StmtTopologicalOrder()) { auto& inst = x->AsStmt(); -// The IoCopyOp is a tool operator, it won't support the type inference. 
-// in fpga, we has io_copy+cali+layout tool ops, so we need type inference for -// tool operator -#ifndef LITE_WITH_FPGA -#ifndef LITE_WITH_OPENCL - VLOG(3) << "inst.op_type() == 'io_copy', continue"; - if (inst.op_type() == "io_copy") continue; -#endif -#endif + // The IoCopyOp is a tool operator, it won't support the type inference. + // in fpga, we has io_copy+cali+layout tool ops, so we need type inference + // for + // tool operator + if ((!lite_with_targets["kFPGA"]) && (!lite_with_targets["kOpenCL"])) { + VLOG(3) << "inst.op_type() == 'io_copy', continue"; + if (inst.op_type() == "io_copy") continue; + } // deal with inputs VLOG(4) << "Infering op " << inst.op_info()->Repr(); // TODO(zhaolong): Add check if the node's name in op's arguments. @@ -115,7 +125,7 @@ class VariablePlaceInferencePass : public DebugPass { if (!x_in->AsArg().type) { VLOG(4) << "set type " << *type << " " << x_in->AsArg().name; if (x_in->AsArg().is_weight) { - SetWeightType(x_in, *type); + SetWeightType(x_in, *type, lite_with_targets); } else { x_in->AsArg().type = type; } @@ -135,7 +145,7 @@ class VariablePlaceInferencePass : public DebugPass { if (!x_out->AsArg().type) { VLOG(4) << "set type " << *type << " " << x_out->AsArg().name; if (x_out->AsArg().is_weight) { - SetWeightType(x_out, *type); + SetWeightType(x_out, *type, lite_with_targets); } else { x_out->AsArg().type = type; } diff --git a/lite/core/op_registry.cc b/lite/core/op_registry.cc index 1400b254090b31c731a6633d5a3171d2f0c54d03..887ac3c9507b4fb36594c156b7b1b207cd7bb750 100644 --- a/lite/core/op_registry.cc +++ b/lite/core/op_registry.cc @@ -118,6 +118,8 @@ KernelRegistry::KernelRegistry() INIT_FOR(kCUDA, kAny, kNCHW); INIT_FOR(kCUDA, kAny, kAny); INIT_FOR(kCUDA, kInt8, kNHWC); + INIT_FOR(kCUDA, kInt64, kNCHW); + INIT_FOR(kCUDA, kInt64, kNHWC); INIT_FOR(kHost, kFloat, kNCHW); INIT_FOR(kHost, kAny, kNCHW); diff --git a/lite/core/op_registry.h b/lite/core/op_registry.h index 7ed632d864d0c7ee1e028787fa20717390f29b55..d78ae690f9b019dff7728bd3e95c0b1406bea463 100644 --- a/lite/core/op_registry.h +++ b/lite/core/op_registry.h @@ -145,6 +145,12 @@ class KernelRegistry final { KernelRegistryForTarget *, // + KernelRegistryForTarget *, // + KernelRegistryForTarget *, // KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // diff --git a/lite/core/optimizer.h b/lite/core/optimizer.h index 22c5f193308c92d43cef45b663de97a3ba5958c7..38c9d0e29d5766dec21de76b740c1032ad44da7e 100644 --- a/lite/core/optimizer.h +++ b/lite/core/optimizer.h @@ -13,7 +13,9 @@ // limitations under the License. 
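The variable-place-inference change above replaces compile-time LITE_WITH_FPGA / LITE_WITH_OPENCL branches with a runtime check over the graph's valid places. A minimal sketch of that feature-detection idiom (Place is a placeholder struct here, not Lite's):

#include <algorithm>
#include <iostream>
#include <string>
#include <vector>

// Placeholder for lite::Place holding only the field the check needs.
struct Place {
  std::string target;
};

// Runtime equivalent of the removed "#ifdef LITE_WITH_OPENCL" branches:
// consult the configured valid places instead of the build flags.
bool HasTarget(const std::vector<Place>& valid_places, const std::string& target) {
  return std::any_of(valid_places.begin(), valid_places.end(),
                     [&](const Place& p) { return p.target == target; });
}

int main() {
  const std::vector<Place> valid_places = {{"kARM"}, {"kOpenCL"}};
  if (HasTarget(valid_places, "kOpenCL") || HasTarget(valid_places, "kFPGA")) {
    std::cout << "keep io_copy ops during type inference" << std::endl;
  } else {
    std::cout << "skip io_copy ops during type inference" << std::endl;
  }
  return 0;
}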
#pragma once +#include #include +#include #include #include #include "lite/core/mir/generate_program_pass.h" @@ -49,23 +51,20 @@ class Optimizer { valid_places_ = valid_places; CHECK(!valid_places.empty()) << "At least one valid_place should be set"; CHECK(!graph_) << "duplicate optimize found"; + graph_.reset(new mir::SSAGraph); graph_->Build(program, valid_places); graph_->SetValidPlaces(valid_places); SpecifyKernelPickTactic(kernel_pick_factor); InitTargetTypeTransformPass(); + if (passes.empty()) { - RunPasses(std::vector{ - { - #if 0 - "lite_quant_dequant_fuse_pass", // + std::vector passes_local{ + {"lite_quant_dequant_fuse_pass", // "lite_conv_elementwise_fuse_pass", // conv-elemwise-bn "lite_conv_bn_fuse_pass", // "lite_conv_elementwise_fuse_pass", // conv-bn-elemwise - // This pass is disabled to force some opencl kernels selected for - // final running, otherwise, they will be fused to ARM fusion - // kernels, and the OpenCL devices will be discarded. // TODO(Superjomn) Refine the fusion related design to select fusion // kernels for devices automatically. "lite_conv_activation_fuse_pass", // @@ -74,11 +73,10 @@ class Optimizer { "lite_transpose_softmax_transpose_fuse_pass", // "lite_interpolate_fuse_pass", // "identity_scale_eliminate_pass", // -#ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK +#if (defined LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) || (defined LITE_WITH_CUDA) "lite_elementwise_add_activation_fuse_pass", // -#endif -#endif - "static_kernel_pick_pass", // pick original kernel from graph +#endif + "static_kernel_pick_pass", // pick original kernel from graph "variable_place_inference_pass", // inference arg/var's // info(target/precision/layout/device) // using kernel info @@ -107,17 +105,12 @@ class Optimizer { "argument_type_display_pass", // "variable_place_inference_pass", // - "argument_type_display_pass", // + "argument_type_display_pass", "runtime_context_assign_pass", - "argument_type_display_pass", // -#if !defined(LITE_WITH_OPENCL) && !defined(LITE_WITH_NPU) && \ - !defined(LITE_WITH_XPU) - // TODO(ysh329): cause CL_INVALID_MEM_OBJECT when setArg in kernel - "memory_optimize_pass", -#endif - "argument_type_display_pass" - }}); + "argument_type_display_pass", + "memory_optimize_pass"}}; + RunPasses(passes_local); } else { RunPasses(passes); } @@ -128,39 +121,13 @@ class Optimizer { // Generate a new program based on the mir graph. std::unique_ptr GenRuntimeProgram() { -#if defined(LITE_WITH_NPU) || defined(LITE_WITH_XPU) - auto target_place = Place{ -#ifdef LITE_WITH_NPU - TARGET(kNPU), -#endif -#ifdef LITE_WITH_XPU - TARGET(kXPU), -#endif - PRECISION(kFloat)}; - if (std::find(valid_places_.begin(), valid_places_.end(), target_place) != - valid_places_.end()) { -#ifdef LITE_WITH_NPU - auto pass = mir::PassManager::Global() - .LookUp( - "generate_npu_program_pass"); -#endif -#ifdef LITE_WITH_XPU - auto pass = mir::PassManager::Global() - .LookUp( - "generate_xpu_program_pass"); -#endif - try { - pass->Apply(graph_); - auto program = pass->GenProgram(); - CHECK(exec_scope_); - program->set_exec_scope(exec_scope_); - return program; - } catch (...) { - LOG(WARNING) << "Build " << TargetToStr(target_place.target) - << " program failed!"; - } - } -#endif + // Extra passes are applied for NPU and XPU, they depends on the shapes + // of input tensors. so GenRuntimeProgram() must be called after the shapes + // of input tensors are determined. 
+ std::vector subgraph_passes{"generate_npu_program_pass", + "generate_xpu_program_pass"}; + RunPasses(subgraph_passes); + auto pass = mir::PassManager::Global().LookUp( "generate_program_pass"); pass->Apply(graph_); @@ -202,14 +169,16 @@ class Optimizer { for (auto& x : passes) { LOG(INFO) << "== Running pass: " << x; mir::Pass* pass = mir::PassManager::Global().LookUp(x); - CHECK(pass) << "Can not find pass: " << x; - bool matched = false; + if (!pass) { + LOG(INFO) << " - Skip " << x << " because the pass isn't found."; + continue; + } + std::set targets; for (const auto& place : valid_places_) { - if (PassMatchesTarget(*pass, place.target)) { - matched = true; - } + targets.insert(place.target); } - matched = matched && PassMatchesKernels(*pass); + bool matched = + PassMatchesTarget(*pass, targets) && PassMatchesKernels(*pass); if (!matched) { LOG(INFO) << " - Skip " << x << " because the target or kernel does not match."; diff --git a/lite/core/profile/CMakeLists.txt b/lite/core/profile/CMakeLists.txt index 54a239024413834cb30c6e135c378d10480863e7..b7ddd810af46a25e2c331c2f0364a72f466dc636 100644 --- a/lite/core/profile/CMakeLists.txt +++ b/lite/core/profile/CMakeLists.txt @@ -5,4 +5,5 @@ endif() lite_cc_library(basic_profiler SRCS basic_profiler.cc DEPS gflags) lite_cc_test(test_basic_profiler SRCS basic_profiler_test.cc DEPS basic_profiler) - +lite_cc_library(lite_profiler SRCS profiler.cc DEPS context) +lite_cc_test(test_lite_timer SRCS test_timer.cc DEPS lite_profiler) diff --git a/lite/core/profile/profiler.cc b/lite/core/profile/profiler.cc new file mode 100644 index 0000000000000000000000000000000000000000..a51b769c8f46a5ca8cb9ed74740b93844882cb16 --- /dev/null +++ b/lite/core/profile/profiler.cc @@ -0,0 +1,117 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/core/profile/profiler.h" +#include +#include +#include + +namespace paddle { +namespace lite { +namespace profile { + +int Profiler::NewTimer(const OpCharacter& ch) { + StatisUnit unit; + unit.character = ch; + if (ch.target == TargetType::kCUDA) { +#ifdef LITE_WITH_CUDA + unit.timer.reset(new DeviceTimer()); +#else + LOG(ERROR) << "The timer type specified as cuda is uninitialized, so the " + "default x86 timer is used instead."; +#endif + } else { + unit.timer.reset(new DeviceTimer()); + } + units_.push_back(std::move(unit)); + return units_.size() - 1; +} + +void Profiler::StartTiming(const int index, KernelContext* ctx) { + CHECK_LT(index, units_.size()) + << "The timer index in the profiler is out of range."; + units_[index].timer->Start(ctx); +} + +float Profiler::StopTiming(const int index, KernelContext* ctx) { + CHECK_LT(index, units_.size()) + << "The timer index in the profiler is out of range."; + return units_[index].timer->Stop(ctx); +} + +std::string Profiler::Summary(bool concise) { + STL::stringstream ss; + auto cout_title = [&ss](const std::string& title, const std::string& name) { + // clang-format off + ss << "===== " << title << ": " << name << " =====" << std::endl; + ss << std::setw(25) << std::left << "Operator Type" \ + << std::setw(40) << std::left << "Kernel Name" \ + << std::setw(10) << std::left << "Remark" \ + << std::setw(10) << std::left << "Avg (ms)" \ + << std::setw(10) << std::left << "Min (ms)" \ + << std::setw(10) << std::left << "Max (ms)" \ + << std::endl; + // clang-format on + }; + if (concise) { + auto op_comp = [](const OpCharacter& c1, const OpCharacter& c2) { + return (c1.target < c2.target) || (c1.op_type < c2.op_type) || + (c1.kernel_name < c2.kernel_name) || (c1.remark < c2.remark); + }; + std::map summary(op_comp); + for (auto& unit : units_) { + auto ch = summary.find(unit.character); + if (ch != summary.end()) { + ch->second.avg += unit.timer->LapTimes().Avg(); + ch->second.min += unit.timer->LapTimes().Min(); + ch->second.max += unit.timer->LapTimes().Max(); + } else { + TimeInfo info({unit.timer->LapTimes().Avg(), + unit.timer->LapTimes().Min(), + unit.timer->LapTimes().Max()}); + summary.insert({unit.character, info}); + } + } + cout_title("Concise Profiler Summary", name_); + for (const auto& item : summary) { + // clang-format off + ss << std::setw(25) << std::left << item.first.op_type \ + << std::setw(40) << std::left << item.first.kernel_name \ + << std::setw(10) << std::left << item.first.remark \ + << std::setw(10) << std::left << item.second.avg \ + << std::setw(10) << std::left << item.second.min \ + << std::setw(10) << std::left << item.second.max \ + << std::endl; + // clang-format on + } + } else { + cout_title("Detailed Profiler Summary", name_); + for (auto& unit : units_) { + // clang-format off + ss << std::setw(25) << std::left << unit.character.op_type \ + << std::setw(40) << std::left << unit.character.kernel_name \ + << std::setw(10) << std::left << unit.character.remark \ + << std::setw(10) << std::left << unit.timer->LapTimes().Avg() \ + << std::setw(10) << std::left << unit.timer->LapTimes().Min() \ + << std::setw(10) << std::left << unit.timer->LapTimes().Max() \ + << std::endl; + // clang-format on + } + } + return ss.str(); +} + +} // namespace profile +} // namespace lite +} // namespace paddle diff --git a/lite/core/profile/profiler.h b/lite/core/profile/profiler.h new file mode 100644 index 0000000000000000000000000000000000000000..0fce8167cdd5383c2cc4ae5d641433582f0ee6a7 --- /dev/null +++ 
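One caution on the concise-summary map above: std::map requires a strict weak ordering, and chaining independent '<' comparisons with '||' does not give one (two keys can each compare "less" than the other through different fields), so grouping may misbehave. A lexicographic comparator, e.g. via std::tie, is the usual form; the sketch below uses stand-in string fields rather than Lite's OpCharacter:

#include <iostream>
#include <map>
#include <string>
#include <tuple>

struct Key {
  std::string target, op_type, kernel_name, remark;
};

// Lexicographic: later fields are consulted only on ties, which yields
// a valid strict weak ordering for std::map.
bool KeyLess(const Key& a, const Key& b) {
  return std::tie(a.target, a.op_type, a.kernel_name, a.remark) <
         std::tie(b.target, b.op_type, b.kernel_name, b.remark);
}

int main() {
  std::map<Key, int, bool (*)(const Key&, const Key&)> summary(KeyLess);
  summary[{"kARM", "conv2d", "conv_compute", "3x3"}] += 1;
  summary[{"kARM", "conv2d", "conv_compute", "3x3"}] += 1;  // merges with the first entry
  summary[{"kARM", "pool2d", "pool_compute", "max"}] += 1;
  std::cout << summary.size() << std::endl;  // prints "2"
  return 0;
}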
b/lite/core/profile/profiler.h @@ -0,0 +1,59 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include "lite/core/profile/timer.h" + +namespace paddle { +namespace lite { +namespace profile { + +struct TimeInfo { + float avg; + float min; + float max; +}; + +struct OpCharacter { + TargetType target; + std::string op_type{std::string("N/A")}; + std::string kernel_name{std::string("N/A")}; + std::string remark{std::string("N/A")}; +}; + +struct StatisUnit { + std::unique_ptr timer; + OpCharacter character; +}; + +class Profiler final { + public: + Profiler() = default; + explicit Profiler(const std::string& name) : name_(name) {} + int NewTimer(const OpCharacter& ch); + void StartTiming(const int index, KernelContext* ctx); + float StopTiming(const int index, KernelContext* ctx); + std::string Summary(bool concise = true); + + private: + std::string name_{std::string("N/A")}; + std::vector units_; +}; + +} // namespace profile +} // namespace lite +} // namespace paddle diff --git a/lite/core/profile/test_timer.cc b/lite/core/profile/test_timer.cc new file mode 100644 index 0000000000000000000000000000000000000000..6f49698ef4a8f83e4192a16801566fdcbd7baf9a --- /dev/null +++ b/lite/core/profile/test_timer.cc @@ -0,0 +1,81 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
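The tests that follow exercise the Profiler only through the CUDA timer; on the host path no stream is required, since the default chrono-based Timer is selected whenever OpCharacter::target is not kCUDA. A rough host-side usage sketch mirroring the CUDA test (illustrative, not part of the patch):

#include <chrono>  // NOLINT
#include <thread>  // NOLINT
#include "lite/core/context.h"
#include "lite/core/profile/profiler.h"
#include "lite/utils/cp_logging.h"

namespace paddle {
namespace lite {
namespace profile {

// Times a dummy 5 ms "kernel" on the host and prints the detailed summary.
void HostProfilerExample() {
  KernelContext ctx;
  Profiler profiler("host_example");
  OpCharacter ch;
  ch.target = TargetType::kHost;
  ch.op_type = "operator/host";
  ch.kernel_name = "kernel/host";
  const int idx = profiler.NewTimer(ch);
  profiler.StartTiming(idx, &ctx);
  std::this_thread::sleep_for(std::chrono::milliseconds(5));
  profiler.StopTiming(idx, &ctx);
  LOG(INFO) << "\n" << profiler.Summary(/*concise=*/false);
}

}  // namespace profile
}  // namespace lite
}  // namespace paddle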
+ +#include +#include // NOLINT +#include // NOLINT +#include "lite/core/context.h" +#include "lite/core/profile/profiler.h" +#include "lite/core/profile/timer.h" +#include "lite/utils/cp_logging.h" + +namespace paddle { +namespace lite { +namespace profile { + +TEST(timer, real_latency) { + Timer timer; + + timer.Start(); + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + timer.Stop(); + + timer.Start(); + std::this_thread::sleep_for(std::chrono::milliseconds(50)); + timer.Stop(); + + LOG(INFO) << "LapTimes().Avg() = " << timer.LapTimes().Avg(); +} + +#ifdef LITE_WITH_CUDA +TEST(gpu_timer, real_latency) { + DeviceTimer timer; + KernelContext ctx; + cudaStream_t exec_stream; + cudaStreamCreate(&exec_stream); + (&ctx.As())->SetExecStream(exec_stream); + + timer.Start(&ctx); + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + timer.Stop(&ctx); + + (&timer)->Start(&ctx); + std::this_thread::sleep_for(std::chrono::milliseconds(50)); + timer.Stop(&ctx); + + LOG(INFO) << "LapTimes().Avg() = " << timer.LapTimes().Avg(); +} + +TEST(profiler, real_latency) { + KernelContext ctx; + cudaStream_t exec_stream; + cudaStreamCreate(&exec_stream); + (&ctx.As())->SetExecStream(exec_stream); + + Profiler profiler("name"); + profile::OpCharacter ch; + ch.target = TargetType::kCUDA; + ch.op_type = "operator/1"; + ch.kernel_name = "kernel/1"; + int idx = profiler.NewTimer(ch); + profiler.StartTiming(idx, &ctx); + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + profiler.StopTiming(idx, &ctx); + std::cout << profiler.Summary(); +} +#endif + +} // namespace profile +} // namespace lite +} // namespace paddle diff --git a/lite/core/profile/timer.h b/lite/core/profile/timer.h new file mode 100644 index 0000000000000000000000000000000000000000..1e86f0d7b9be4914bdf1a6874195276d3c1b61ee --- /dev/null +++ b/lite/core/profile/timer.h @@ -0,0 +1,114 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
diff --git a/lite/core/profile/timer.h b/lite/core/profile/timer.h
new file mode 100644
index 0000000000000000000000000000000000000000..1e86f0d7b9be4914bdf1a6874195276d3c1b61ee
--- /dev/null
+++ b/lite/core/profile/timer.h
@@ -0,0 +1,114 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <algorithm>
+#include <chrono>  // NOLINT
+#include <list>
+#ifdef LITE_WITH_CUDA
+#include "lite/backends/cuda/cuda_utils.h"
+#endif
+#include "lite/core/context.h"
+
+namespace paddle {
+namespace lite {
+namespace profile {
+
+template <class T>
+class TimeList {
+ public:
+  void Clear() { laps_t_.clear(); }
+  void Add(T t) { laps_t_.push_back(t); }
+  T Max() const { return *std::max_element(laps_t_.begin(), laps_t_.end()); }
+  T Min() const { return *std::min_element(laps_t_.begin(), laps_t_.end()); }
+  T Sum() const { return std::accumulate(laps_t_.begin(), laps_t_.end(), 0.0); }
+  size_t Size() const { return laps_t_.size(); }
+  T Avg() const {
+    if (!Size()) {
+      return 0;
+    }
+    return Sum() / Size();
+  }
+  const std::list<T>& Raw() const { return laps_t_; }
+
+ private:
+  std::list<T> laps_t_;
+};
+
+class Timer {
+ public:
+  Timer() = default;
+  virtual ~Timer() = default;
+
+  void Reset() { laps_t_.Clear(); }
+  void Start() { t_start_ = std::chrono::system_clock::now(); }
+  float Stop() {
+    t_stop_ = std::chrono::system_clock::now();
+    auto ts = std::chrono::duration_cast<std::chrono::microseconds>(t_stop_ -
+                                                                    t_start_);
+    float elapse_ms = 1000.f * static_cast<float>(ts.count()) *
+                      std::chrono::microseconds::period::num /
+                      std::chrono::microseconds::period::den;
+    this->laps_t_.Add(elapse_ms);
+    return elapse_ms;
+  }
+  virtual void Start(KernelContext* ctx) { return Start(); }
+  virtual float Stop(KernelContext* ctx) { return Stop(); }
+  float AvgLapTimeMs() const { return laps_t_.Avg(); }
+  const TimeList<float>& LapTimes() const { return laps_t_; }
+
+ protected:
+  std::chrono::time_point<std::chrono::system_clock> t_start_, t_stop_;
+  TimeList<float> laps_t_;
+};
+
+template <TargetType Target>
+class DeviceTimer final : public Timer {};
+
+#ifdef LITE_WITH_CUDA
+template <>
+class DeviceTimer<TargetType::kCUDA> final : public Timer {
+ public:
+  DeviceTimer() {
+    CUDA_CALL(cudaEventCreate(&e_start_));
+    CUDA_CALL(cudaEventCreate(&e_stop_));
+  }
+  ~DeviceTimer() {
+    CUDA_CALL(cudaEventDestroy(e_start_));
+    CUDA_CALL(cudaEventDestroy(e_stop_));
+  }
+  void Start(KernelContext* ctx) {
+    cudaStream_t stream;
+    stream = ctx->As<CUDAContext>().exec_stream();
+    CUDA_CALL(cudaEventRecord(e_start_, stream));
+  }
+  float Stop(KernelContext* ctx) {
+    cudaStream_t stream;
+    stream = ctx->As<CUDAContext>().exec_stream();
+    CUDA_CALL(cudaEventRecord(e_stop_, stream));
+    CUDA_CALL(cudaEventSynchronize(e_stop_));
+    float elapse_ms = 1.f;
+    CUDA_CALL(cudaEventElapsedTime(&elapse_ms, e_start_, e_stop_));
+    this->laps_t_.Add(elapse_ms);
+    return elapse_ms;
+  }
+
+ private:
+  cudaEvent_t e_start_, e_stop_;
+};
+#endif
+
+}  // namespace profile
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/core/program.cc b/lite/core/program.cc
index b60f279c0fc74904477a080579a799f601e359b0..45796a478b3f2309912e6382b3380bf0734bd6ae 100644
--- a/lite/core/program.cc
+++ b/lite/core/program.cc
@@ -122,6 +122,9 @@ void RuntimeProgram::Run() {
 #endif  // LITE_WITH_PRECISION_PROFILE
 #endif  // LITE_WITH_PROFILE
   }
+#ifdef LITE_WITH_PROFILE
+  LOG(INFO) << "\n" << profiler_.Summary();
+#endif  // LITE_WITH_PROFILE
 }
 
 void Program::Build(const cpp::ProgramDesc& prog) {
@@ -183,11 +186,6 @@ void Program::PrepareWorkspace(const cpp::ProgramDesc& prog) {
 void Instruction::Run() {
   CHECK(op_) << "op null";
   CHECK(kernel_) << "kernel null";
-#ifdef LITE_WITH_PROFILE
-  if (profile_id_ >= 0) {
-    profile::ProfileBlock x(profile_id_, "instruction");
-  }
-#endif  // LITE_WITH_PROFILE
   if (first_epoch_) {
     first_epoch_ = false;
     CHECK(op_->CheckShape());
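With the old `ProfileBlock` removed from `Instruction::Run()`, per-kernel timing now flows through the `Profiler` handle that each kernel receives via `SetProfiler` (see the `program.h` change below). The sketch here shows one plausible shape for that kernel-side hook; the member names and the idea that `Launch` wraps `Run` with `StartTiming`/`StopTiming` are assumptions for illustration, not the literal contents of `lite/core/kernel.h` in this patch.

```cpp
// Hypothetical sketch of the kernel-side profiling hook.
class KernelBase {
 public:
#ifdef LITE_WITH_PROFILE
  void SetProfiler(profile::Profiler* profiler, int id) {
    profiler_ = profiler;
    profile_id_ = id;
  }
#endif
  void Launch(KernelContext* ctx) {
#ifdef LITE_WITH_PROFILE
    if (profiler_ && profile_id_ >= 0) profiler_->StartTiming(profile_id_, ctx);
#endif
    Run();  // the kernel's actual computation
#ifdef LITE_WITH_PROFILE
    if (profiler_ && profile_id_ >= 0) profiler_->StopTiming(profile_id_, ctx);
#endif
  }
  virtual void Run() = 0;

 private:
#ifdef LITE_WITH_PROFILE
  profile::Profiler* profiler_{nullptr};
  int profile_id_{-1};
#endif
};
```

On CUDA targets the stop call records and synchronizes an event on the kernel's exec stream, so the reported latency reflects device time rather than host enqueue time.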
diff --git a/lite/core/program.h b/lite/core/program.h
index 7a6700da61f7ba9f35491613d7733b4b637b8ff0..1c1e4975c3a13bcfa9a22999a705f3a78b0fc68e 100644
--- a/lite/core/program.h
+++ b/lite/core/program.h
@@ -22,9 +22,6 @@
 #include "lite/core/op_lite.h"
 #include "lite/core/op_registry.h"
 #include "lite/model_parser/cpp/program_desc.h"
-#ifdef LITE_WITH_PROFILE
-#include "lite/core/profile/basic_profiler.h"
-#endif  // LITE_WITH_PROFILE
 
 namespace paddle {
 namespace lite {
@@ -87,22 +84,7 @@ struct Program {
 struct Instruction {
   Instruction(const std::shared_ptr<OpLite>& op,
               std::unique_ptr<KernelBase>&& kernel)
-      : op_(op), kernel_(std::move(kernel)) {
-#ifdef LITE_WITH_PROFILE
-    if (op_->Type() != "feed" && op_->Type() != "fetch") {
-      profile_id_ = profile::BasicProfiler<profile::BasicTimer>::Global()
-                        .NewRcd(kernel_->SerializedKernelType())
-                        .id();
-      kernel_->SetProfileID(profile_id_);
-      // Set profile custom info
-      auto& profiler =
-          *profile::BasicProfiler<profile::BasicTimer>::Global().mutable_record(
-              profile_id_);
-      profiler.SetCustomInfo("op_type", op_->Type());
-      profiler.SetCustomInfo("op_info", op_->SerializedOpInfo());
-    }
-#endif  // LITE_WITH_PROFILE
-  }
+      : op_(op), kernel_(std::move(kernel)) {}
 
   // Run the instruction.
   void Run();
@@ -113,6 +95,20 @@ struct Instruction {
   const KernelBase* kernel() const { return kernel_.get(); }
   KernelBase* mutable_kernel() { return kernel_.get(); }
 
+#ifdef LITE_WITH_PROFILE
+  void set_profiler(profile::Profiler* profiler) {
+    profiler_ = profiler;
+    if (op_->Type() != "feed" && op_->Type() != "fetch") {
+      profile::OpCharacter ch;
+      ch.target = kernel()->target();
+      ch.op_type = op_->Type();
+      ch.kernel_name = kernel()->name();
+      profile_id_ = profiler->NewTimer(ch);
+      kernel_->SetProfiler(profiler_, profile_id_);
+    }
+  }
+#endif
+
  private:
   std::shared_ptr<OpLite> op_;
   std::unique_ptr<KernelBase> kernel_;
@@ -120,7 +116,7 @@ struct Instruction {
   bool has_run_{false};
 
 #ifdef LITE_WITH_PROFILE
-  // for profiler
+  profile::Profiler* profiler_;
   int profile_id_{-1};
 #endif  // LITE_WITH_PROFILE
 };
@@ -135,6 +131,9 @@ class LITE_API RuntimeProgram {
     if (instructions_.empty()) {
       LOG(FATAL) << "no instructions";
     }
+#ifdef LITE_WITH_PROFILE
+    set_profiler();
+#endif
   }
 
   void Run();
@@ -159,6 +158,15 @@ class LITE_API RuntimeProgram {
   RuntimeProgram(const RuntimeProgram&) = delete;
   std::vector<Instruction> instructions_;
   lite::Scope* exec_scope_{};
+
+#ifdef LITE_WITH_PROFILE
+  profile::Profiler profiler_;
+  void set_profiler() {
+    for (auto i = instructions_.begin(); i != instructions_.end(); ++i) {
+      i->set_profiler(&profiler_);
+    }
+  }
+#endif
 };
 
 }  // namespace lite
diff --git a/lite/demo/cxx/Makefile.def b/lite/demo/cxx/Makefile.def
index 1b5da970e8fa9b2793f7a4982d5ed22ed21e79fd..cc2e593000a414a915ae8f4242b5ea34d6688438 100644
--- a/lite/demo/cxx/Makefile.def
+++ b/lite/demo/cxx/Makefile.def
@@ -1,26 +1,22 @@
 CXX_DEFINES = -DARM_WITH_OMP -DHPPL_STUB_FUNC -DLITE_WITH_ARM -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK \
               -DLITE_WITH_LINUX -DPADDLE_DISABLE_PROFILER -DPADDLE_NO_PYTHON -DPADDLE_WITH_TESTING
-LDFLAGS = -latomic -pthread -ldl
+LDFLAGS = -latomic -pthread -ldl -llog -lz
 
 SYSROOT_COMPLILE = --sysroot=/opt/android-ndk-r17c/sysroot
-
-THIRD_PARTY_LIBS = ../../../third_party/gflags/lib/libgflags.a
-
+ 
 SYSTEM_INCLUDES = -I/opt/android-ndk-r17c/sources/cxx-stl/llvm-libc++/include \
                   -I/opt/android-ndk-r17c/sources/cxx-stl/llvm-libc++abi/include \
                   -I/opt/android-ndk-r17c/sources/android/support/include \
                   -I/opt/android-ndk-r17c/sysroot/usr/include \
 
-THIRD_PARTY_INCLUDES = -I../../../third_party/gflags/include
-
 ifeq ($(ARM_ABI), arm8)
     CC = 
/opt/android-ndk-r17c/toolchains/aarch64-linux-android-4.9/prebuilt/linux-x86_64/bin/aarch64-linux-android-g++ - CXX_FLAGS = -funwind-tables -no-canonical-prefixes -D__ANDROID_API__=23 -fexceptions -frtti -std=c++11 -fopenmp -O3 -DNDEBUG -fPIE + CXX_FLAGS = -funwind-tables -no-canonical-prefixes -D__ANDROID_API__=23 -fexceptions -frtti -std=c++11 -fopenmp -O3 -DNDEBUG -fPIE CXXFLAGS_LINK = $(CXX_FLAGS) -pie -Wl,--gc-sections SYSROOT_LINK = --sysroot=/opt/android-ndk-r17c/platforms/android-24/arch-arm64 SYSTEM_LIBS = /opt/android-ndk-r17c/sources/cxx-stl/llvm-libc++/libs/arm64-v8a/libc++_static.a \ /opt/android-ndk-r17c/sources/cxx-stl/llvm-libc++/libs/arm64-v8a/libc++abi.a - INCLUDES = $(SYSTEM_INCLUDES) -I/opt/android-ndk-r17c/sysroot/usr/include/aarch64-linux-android $(THIRD_PARTY_INCLUDES) + INCLUDES = $(SYSTEM_INCLUDES) -I/opt/android-ndk-r17c/sysroot/usr/include/aarch64-linux-android else CC = /opt/android-ndk-r17c/toolchains/arm-linux-androideabi-4.9/prebuilt/linux-x86_64/bin/arm-linux-androideabi-g++ CXX_FLAGS = -march=armv7-a -mthumb -mfpu=neon -mfloat-abi=softfp -funwind-tables -no-canonical-prefixes \ @@ -31,5 +27,5 @@ else /opt/android-ndk-r17c/sources/cxx-stl/llvm-libc++/libs/armeabi-v7a/libc++abi.a \ /opt/android-ndk-r17c/sources/cxx-stl/llvm-libc++/libs/armeabi-v7a/libandroid_support.a \ /opt/android-ndk-r17c/sources/cxx-stl/llvm-libc++/libs/armeabi-v7a/libunwind.a - INCLUDES = $(SYSTEM_INCLUDES) -I/opt/android-ndk-r17c/sysroot/usr/include/arm-linux-androideabi $(THIRD_PARTY_INCLUDES) + INCLUDES = $(SYSTEM_INCLUDES) -I/opt/android-ndk-r17c/sysroot/usr/include/arm-linux-androideabi endif diff --git a/lite/demo/cxx/README.md b/lite/demo/cxx/README.md index ec72c044e3fd08bd775b23c373945c5bb5743d1d..b7768d763eb4f6d2255119f805753f96d4bef9e6 100644 --- a/lite/demo/cxx/README.md +++ b/lite/demo/cxx/README.md @@ -1,6 +1,6 @@ # C++ Demo 1. 使用`lite/tools/Dockerfile.mobile`生成docker镜像 -2. 运行并进入docker镜像环境,执行`wget http://paddle-inference-dist.bj.bcebos.com/lite_release/r0.1/inference_lite_lib.android.armv8.tar.gz `下载所需demo环境。(armv7 demo可使用命令`wget http://paddle-inference-dist.bj.bcebos.com/lite_release/r0.1/inference_lite_lib.android.armv7.tar.gz` 进行下载)。 +2. 运行并进入docker镜像环境,执行`wget http://paddle-inference-dist.bj.bcebos.com/lite_release/v2.1.0/inference_lite_lib.android.armv8.tar.gz `下载所需demo环境。(armv7 demo可使用命令`wget http://paddle-inference-dist.bj.bcebos.com/lite_release/v2.1.0/inference_lite_lib.android.armv7.tar.gz` 进行下载)。 3. 解压下载文件`tar zxvf inference_lite_lib.android.armv8.tar.gz ` 4. 
执行以下命令准备模拟器环境 ```shell @@ -27,8 +27,10 @@ tar zxvf mobilenet_v1.tar.gz make adb -s emulator-5554 push mobilenet_v1 /data/local/tmp/ adb -s emulator-5554 push mobilenetv1_full_api /data/local/tmp/ +adb -s emulator-5554 push ../../../cxx/lib/libpaddle_full_api_shared.so /data/local/tmp/ adb -s emulator-5554 shell chmod +x /data/local/tmp/mobilenetv1_full_api -adb -s emulator-5554 shell "/data/local/tmp/mobilenetv1_full_api --model_dir=/data/local/tmp/mobilenet_v1 --optimized_model_dir=/data/local/tmp/mobilenet_v1.opt" +adb -s emulator-5554 shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH && +/data/local/tmp/mobilenetv1_full_api --model_dir=/data/local/tmp/mobilenet_v1 --optimized_model_dir=/data/local/tmp/mobilenet_v1.opt" ``` 运行成功将在控制台输出预测结果的前10个类别的预测概率 @@ -37,6 +39,24 @@ adb -s emulator-5554 shell "/data/local/tmp/mobilenetv1_full_api --model_dir=/da cd ../mobile_light make adb -s emulator-5554 push mobilenetv1_light_api /data/local/tmp/ +adb -s emulator-5554 push ../../../cxx/lib/libpaddle_light_api_shared.so /data/local/tmp/ adb -s emulator-5554 shell chmod +x /data/local/tmp/mobilenetv1_light_api -adb -s emulator-5554 shell "/data/local/tmp/mobilenetv1_light_api --model_dir=/data/local/tmp/mobilenet_v1.opt" +adb -s emulator-5554 shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH && +/data/local/tmp/mobilenetv1_light_api /data/local/tmp/mobilenet_v1.opt" ``` + +7. 编译并运行目标检测的demo +```shell +cd ../mobile_detection +wget https://paddle-inference-dist.bj.bcebos.com/mobilenetv1-ssd.tar.gz +tar zxvf mobilenetv1-ssd.tar.gz +make +adb -s emulator-5554 push mobile_detection /data/local/tmp/ +adb -s emulator-5554 push test.jpg /data/local/tmp/ +adb -s emulator-5554 push ../../../cxx/lib/libpaddle_light_api_shared.so /data/local/tmp/ +adb -s emulator-5554 shell chmod +x /data/local/tmp/mobile_detection +adb -s emulator-5554 shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH && +/data/local/tmp/mobile_detection /data/local/tmp/mobilenetv1-ssd /data/local/tmp/test.jpg" +adb -s emulator-5554 pull /data/local/tmp/test_detection_result.jpg ./ +``` +运行成功将在mobile_detection目录下看到生成的目标检测结果图像: test_detection_result.jpg diff --git a/lite/demo/cxx/makefiles/mobile_detection/Makefile.android.armv7 b/lite/demo/cxx/makefiles/mobile_detection/Makefile.android.armv7 new file mode 100644 index 0000000000000000000000000000000000000000..784ad73da4bf1d37ee23c17ac7c4dfc5c08f2627 --- /dev/null +++ b/lite/demo/cxx/makefiles/mobile_detection/Makefile.android.armv7 @@ -0,0 +1,61 @@ +ARM_ABI = arm7 +export ARM_ABI + +include ../Makefile.def + +LITE_ROOT=../../../ + +THIRD_PARTY_DIR=${LITE_ROOT}/third_party + +OPENCV_VERSION=opencv4.1.0 + +OPENCV_LIBS = ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/libs/libopencv_imgcodecs.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/libs/libopencv_imgproc.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/libs/libopencv_core.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libtegra_hal.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibjpeg-turbo.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibwebp.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibpng.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibjasper.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibtiff.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libIlmImf.a \ + 
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libtbb.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libcpufeatures.a + +OPENCV_INCLUDE = -I../../../third_party/${OPENCV_VERSION}/armeabi-v7a/include + +CXX_INCLUDES = $(INCLUDES) ${OPENCV_INCLUDE} -I$(LITE_ROOT)/cxx/include + +CXX_LIBS = ${OPENCV_LIBS} -L$(LITE_ROOT)/cxx/lib/ -lpaddle_light_api_shared $(SYSTEM_LIBS) + +############################################################### +# How to use one of static libaray: # +# `libpaddle_api_full_bundled.a` # +# `libpaddle_api_light_bundled.a` # +############################################################### +# Note: default use lite's shared library. # +############################################################### +# 1. Comment above line using `libpaddle_light_api_shared.so` +# 2. Undo comment below line using `libpaddle_api_light_bundled.a` + +#CXX_LIBS = $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS) + +mobile_detection: fetch_opencv mobile_detection.o + $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) mobile_detection.o -o mobile_detection $(CXX_LIBS) $(LDFLAGS) + +mobile_detection.o: mobile_detection.cc + $(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o mobile_detection.o -c mobile_detection.cc + +fetch_opencv: + @ test -d ${THIRD_PARTY_DIR} || mkdir ${THIRD_PARTY_DIR} + @ test -e ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz || \ + (echo "fetch opencv libs" && \ + wget -P ${THIRD_PARTY_DIR} https://paddle-inference-dist.bj.bcebos.com/${OPENCV_VERSION}.tar.gz) + @ test -d ${THIRD_PARTY_DIR}/${OPENCV_VERSION} || \ + tar -zxvf ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz -C ${THIRD_PARTY_DIR} + + +.PHONY: clean +clean: + rm -f mobile_detection.o + rm -f mobile_detection diff --git a/lite/demo/cxx/makefiles/mobile_detection/Makefile.android.armv8 b/lite/demo/cxx/makefiles/mobile_detection/Makefile.android.armv8 new file mode 100644 index 0000000000000000000000000000000000000000..2304b38efffdd96e7e13073020df4954b5e53034 --- /dev/null +++ b/lite/demo/cxx/makefiles/mobile_detection/Makefile.android.armv8 @@ -0,0 +1,61 @@ +ARM_ABI = arm8 +export ARM_ABI + +include ../Makefile.def + +LITE_ROOT=../../../ + +THIRD_PARTY_DIR=${LITE_ROOT}/third_party + +OPENCV_VERSION=opencv4.1.0 + +OPENCV_LIBS = ../../../third_party/${OPENCV_VERSION}/arm64-v8a/libs/libopencv_imgcodecs.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/libs/libopencv_imgproc.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/libs/libopencv_core.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libtegra_hal.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibjpeg-turbo.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibwebp.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibpng.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibjasper.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibtiff.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libIlmImf.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libtbb.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libcpufeatures.a + +OPENCV_INCLUDE = -I../../../third_party/${OPENCV_VERSION}/arm64-v8a/include + +CXX_INCLUDES = $(INCLUDES) ${OPENCV_INCLUDE} -I$(LITE_ROOT)/cxx/include + +CXX_LIBS = ${OPENCV_LIBS} -L$(LITE_ROOT)/cxx/lib/ -lpaddle_light_api_shared $(SYSTEM_LIBS) + 
+############################################################### +# How to use one of static libaray: # +# `libpaddle_api_full_bundled.a` # +# `libpaddle_api_light_bundled.a` # +############################################################### +# Note: default use lite's shared library. # +############################################################### +# 1. Comment above line using `libpaddle_light_api_shared.so` +# 2. Undo comment below line using `libpaddle_api_light_bundled.a` + +#CXX_LIBS = $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS) + +mobile_detection: fetch_opencv mobile_detection.o + $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) mobile_detection.o -o mobile_detection $(CXX_LIBS) $(LDFLAGS) + +mobile_detection.o: mobile_detection.cc + $(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o mobile_detection.o -c mobile_detection.cc + +fetch_opencv: + @ test -d ${THIRD_PARTY_DIR} || mkdir ${THIRD_PARTY_DIR} + @ test -e ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz || \ + (echo "fetch opencv libs" && \ + wget -P ${THIRD_PARTY_DIR} https://paddle-inference-dist.bj.bcebos.com/${OPENCV_VERSION}.tar.gz) + @ test -d ${THIRD_PARTY_DIR}/${OPENCV_VERSION} || \ + tar -zxvf ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz -C ${THIRD_PARTY_DIR} + + +.PHONY: clean +clean: + rm -f mobile_detection.o + rm -f mobile_detection diff --git a/lite/demo/cxx/makefiles/mobile_full/Makefile.android.armv7 b/lite/demo/cxx/makefiles/mobile_full/Makefile.android.armv7 index f795b41d46acc3be67ff6c1a0bba0de1c1d8c82d..8ab8a3b7436c836f681510e28461628ed1038709 100644 --- a/lite/demo/cxx/makefiles/mobile_full/Makefile.android.armv7 +++ b/lite/demo/cxx/makefiles/mobile_full/Makefile.android.armv7 @@ -5,9 +5,25 @@ include ../Makefile.def LITE_ROOT=../../../ -CXX_INCLUDES = $(INCLUDES) -I$(LITE_ROOT)/cxx/include +THIRD_PARTY_INCLUDES = -I../../../third_party/gflags/include -CXX_LIBS = $(THIRD_PARTY_LIBS) $(LITE_ROOT)/cxx/lib/libpaddle_api_full_bundled.a $(SYSTEM_LIBS) +THIRD_PARTY_LIBS = ../../../third_party/gflags/lib/libgflags.a + +CXX_INCLUDES = $(INCLUDES) ${THIRD_PARTY_INCLUDES} -I$(LITE_ROOT)/cxx/include + +CXX_LIBS = $(THIRD_PARTY_LIBS) -L$(LITE_ROOT)/cxx/lib/ -lpaddle_full_api_shared $(SYSTEM_LIBS) + +############################################################### +# How to use one of static libaray: # +# `libpaddle_api_full_bundled.a` # +# `libpaddle_api_light_bundled.a` # +############################################################### +# Note: default use lite's shared library. # +############################################################### +# 1. Comment above line using `libpaddle_full_api_shared.so` +# 2. 
Undo comment below line using `libpaddle_api_full_bundled.a` + +#CXX_LIBS = $(THIRD_PARTY_LIBS) $(LITE_ROOT)/cxx/lib/libpaddle_api_full_bundled.a $(SYSTEM_LIBS) mobilenetv1_full_api: mobilenetv1_full_api.o $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) mobilenetv1_full_api.o -o mobilenetv1_full_api $(CXX_LIBS) $(LDFLAGS) diff --git a/lite/demo/cxx/makefiles/mobile_full/Makefile.android.armv8 b/lite/demo/cxx/makefiles/mobile_full/Makefile.android.armv8 index d0767145b00bd40a3fbeff2aef4f7a0fc6f542d6..c13320603bcce91ebe1fca9014e36b07540abca1 100644 --- a/lite/demo/cxx/makefiles/mobile_full/Makefile.android.armv8 +++ b/lite/demo/cxx/makefiles/mobile_full/Makefile.android.armv8 @@ -5,9 +5,25 @@ include ../Makefile.def LITE_ROOT=../../../ -CXX_INCLUDES = $(INCLUDES) -I$(LITE_ROOT)/cxx/include +THIRD_PARTY_INCLUDES = -I../../../third_party/gflags/include -CXX_LIBS = $(THIRD_PARTY_LIBS) $(LITE_ROOT)/cxx/lib/libpaddle_api_full_bundled.a $(SYSTEM_LIBS) +THIRD_PARTY_LIBS = ../../../third_party/gflags/lib/libgflags.a + +CXX_INCLUDES = $(INCLUDES) ${THIRD_PARTY_INCLUDES} -I$(LITE_ROOT)/cxx/include + +CXX_LIBS = $(THIRD_PARTY_LIBS) -L$(LITE_ROOT)/cxx/lib/ -lpaddle_full_api_shared $(SYSTEM_LIBS) + +############################################################### +# How to use one of static libaray: # +# `libpaddle_api_full_bundled.a` # +# `libpaddle_api_light_bundled.a` # +############################################################### +# Note: default use lite's shared library. # +############################################################### +# 1. Comment above line using `libpaddle_full_api_shared.so` +# 2. Undo comment below line using `libpaddle_api_full_bundled.a` + +#CXX_LIBS = $(THIRD_PARTY_LIBS) $(LITE_ROOT)/cxx/lib/libpaddle_api_full_bundled.a $(SYSTEM_LIBS) mobilenetv1_full_api: mobilenetv1_full_api.o $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) mobilenetv1_full_api.o -o mobilenetv1_full_api $(CXX_LIBS) $(LDFLAGS) diff --git a/lite/demo/cxx/makefiles/mobile_light/Makefile.android.armv7 b/lite/demo/cxx/makefiles/mobile_light/Makefile.android.armv7 index d235d6e25fa9abe47ba50d8336cafcdd6580e30d..9150ae6e44e2314a482f7fcb3d139a20cf9f0304 100644 --- a/lite/demo/cxx/makefiles/mobile_light/Makefile.android.armv7 +++ b/lite/demo/cxx/makefiles/mobile_light/Makefile.android.armv7 @@ -7,7 +7,19 @@ LITE_ROOT=../../../ CXX_INCLUDES = $(INCLUDES) -I$(LITE_ROOT)/cxx/include -CXX_LIBS = $(THIRD_PARTY_LIBS) $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS) +CXX_LIBS = -L$(LITE_ROOT)/cxx/lib/ -lpaddle_light_api_shared $(SYSTEM_LIBS) + +############################################################### +# How to use one of static libaray: # +# `libpaddle_api_full_bundled.a` # +# `libpaddle_api_light_bundled.a` # +############################################################### +# Note: default use lite's shared library. # +############################################################### +# 1. Comment above line using `libpaddle_light_api_shared.so` +# 2. 
Undo comment below line using `libpaddle_api_light_bundled.a` + +#CXX_LIBS = $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS) mobilenetv1_light_api: mobilenetv1_light_api.o $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) mobilenetv1_light_api.o -o mobilenetv1_light_api $(CXX_LIBS) $(LDFLAGS) diff --git a/lite/demo/cxx/makefiles/mobile_light/Makefile.android.armv8 b/lite/demo/cxx/makefiles/mobile_light/Makefile.android.armv8 index b91aadcef813de2a6f3371fe2cc4989bd87cf1ab..7a2dbdd0fcc9611fe79fb2660ad215ac4ba0d769 100644 --- a/lite/demo/cxx/makefiles/mobile_light/Makefile.android.armv8 +++ b/lite/demo/cxx/makefiles/mobile_light/Makefile.android.armv8 @@ -7,7 +7,19 @@ LITE_ROOT=../../../ CXX_INCLUDES = $(INCLUDES) -I$(LITE_ROOT)/cxx/include -CXX_LIBS = $(THIRD_PARTY_LIBS) $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS) +CXX_LIBS = -L$(LITE_ROOT)/cxx/lib/ -lpaddle_light_api_shared $(SYSTEM_LIBS) + +############################################################### +# How to use one of static libaray: # +# `libpaddle_api_full_bundled.a` # +# `libpaddle_api_light_bundled.a` # +############################################################### +# Note: default use lite's shared library. # +############################################################### +# 1. Comment above line using `libpaddle_light_api_shared.so` +# 2. Undo comment below line using `libpaddle_api_light_bundled.a` + +#CXX_LIBS = $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS) mobilenetv1_light_api: mobilenetv1_light_api.o $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) mobilenetv1_light_api.o -o mobilenetv1_light_api $(CXX_LIBS) $(LDFLAGS) diff --git a/lite/demo/cxx/mobile_detection/mobile_detection.cc b/lite/demo/cxx/mobile_detection/mobile_detection.cc new file mode 100644 index 0000000000000000000000000000000000000000..9b8f02aeedef991496541400e7db67c3e3ff0e51 --- /dev/null +++ b/lite/demo/cxx/mobile_detection/mobile_detection.cc @@ -0,0 +1,210 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include "opencv2/core.hpp" +#include "opencv2/imgcodecs.hpp" +#include "opencv2/imgproc.hpp" +#include "paddle_api.h" // NOLINT + +using namespace paddle::lite_api; // NOLINT + +struct Object { + int batch_id; + cv::Rect rec; + int class_id; + float prob; +}; + +int64_t ShapeProduction(const shape_t& shape) { + int64_t res = 1; + for (auto i : shape) res *= i; + return res; +} + +const char* class_names[] = { + "background", "aeroplane", "bicycle", "bird", "boat", + "bottle", "bus", "car", "cat", "chair", + "cow", "diningtable", "dog", "horse", "motorbike", + "person", "pottedplant", "sheep", "sofa", "train", + "tvmonitor"}; + +// fill tensor with mean and scale and trans layout: nhwc -> nchw, neon speed up +void neon_mean_scale(const float* din, + float* dout, + int size, + const std::vector mean, + const std::vector scale) { + if (mean.size() != 3 || scale.size() != 3) { + std::cerr << "[ERROR] mean or scale size must equal to 3\n"; + exit(1); + } + float32x4_t vmean0 = vdupq_n_f32(mean[0]); + float32x4_t vmean1 = vdupq_n_f32(mean[1]); + float32x4_t vmean2 = vdupq_n_f32(mean[2]); + float32x4_t vscale0 = vdupq_n_f32(1.f / scale[0]); + float32x4_t vscale1 = vdupq_n_f32(1.f / scale[1]); + float32x4_t vscale2 = vdupq_n_f32(1.f / scale[2]); + + float* dout_c0 = dout; + float* dout_c1 = dout + size; + float* dout_c2 = dout + size * 2; + + int i = 0; + for (; i < size - 3; i += 4) { + float32x4x3_t vin3 = vld3q_f32(din); + float32x4_t vsub0 = vsubq_f32(vin3.val[0], vmean0); + float32x4_t vsub1 = vsubq_f32(vin3.val[1], vmean1); + float32x4_t vsub2 = vsubq_f32(vin3.val[2], vmean2); + float32x4_t vs0 = vmulq_f32(vsub0, vscale0); + float32x4_t vs1 = vmulq_f32(vsub1, vscale1); + float32x4_t vs2 = vmulq_f32(vsub2, vscale2); + vst1q_f32(dout_c0, vs0); + vst1q_f32(dout_c1, vs1); + vst1q_f32(dout_c2, vs2); + + din += 12; + dout_c0 += 4; + dout_c1 += 4; + dout_c2 += 4; + } + for (; i < size; i++) { + *(dout_c0++) = (*(din++) - mean[0]) * scale[0]; + *(dout_c0++) = (*(din++) - mean[1]) * scale[1]; + *(dout_c0++) = (*(din++) - mean[2]) * scale[2]; + } +} + +void pre_process(const cv::Mat& img, int width, int height, float* data) { + cv::Mat rgb_img; + cv::cvtColor(img, rgb_img, cv::COLOR_BGR2RGB); + cv::resize(rgb_img, rgb_img, cv::Size(width, height), 0.f, 0.f); + cv::Mat imgf; + rgb_img.convertTo(imgf, CV_32FC3, 1 / 255.f); + std::vector mean = {0.5f, 0.5f, 0.5f}; + std::vector scale = {0.5f, 0.5f, 0.5f}; + const float* dimg = reinterpret_cast(imgf.data); + neon_mean_scale(dimg, data, width * height, mean, scale); +} + +std::vector detect_object(const float* data, + int count, + float thresh, + cv::Mat& image) { // NOLINT + if (data == nullptr) { + std::cerr << "[ERROR] data can not be nullptr\n"; + exit(1); + } + std::vector rect_out; + for (int iw = 0; iw < count; iw++) { + int oriw = image.cols; + int orih = image.rows; + if (data[1] > thresh && static_cast(data[0]) > 0) { + Object obj; + int x = static_cast(data[2] * oriw); + int y = static_cast(data[3] * orih); + int w = static_cast(data[4] * oriw) - x; + int h = static_cast(data[5] * orih) - y; + cv::Rect rec_clip = + cv::Rect(x, y, w, h) & cv::Rect(0, 0, image.cols, image.rows); + obj.batch_id = 0; + obj.class_id = static_cast(data[0]); + obj.prob = data[1]; + obj.rec = rec_clip; + if (w > 0 && h > 0 && obj.prob <= 1) { + rect_out.push_back(obj); + cv::rectangle(image, rec_clip, cv::Scalar(0, 0, 255), 2, cv::LINE_AA); + std::string str_prob = std::to_string(obj.prob); + std::string text = 
std::string(class_names[obj.class_id]) + ": " + + str_prob.substr(0, str_prob.find(".") + 4); + int font_face = cv::FONT_HERSHEY_COMPLEX_SMALL; + double font_scale = 1.f; + int thickness = 2; + cv::Size text_size = + cv::getTextSize(text, font_face, font_scale, thickness, nullptr); + float new_font_scale = w * 0.35 * font_scale / text_size.width; + text_size = cv::getTextSize( + text, font_face, new_font_scale, thickness, nullptr); + cv::Point origin; + origin.x = x + 10; + origin.y = y + text_size.height + 10; + cv::putText(image, + text, + origin, + font_face, + new_font_scale, + cv::Scalar(0, 255, 255), + thickness, + cv::LINE_AA); + + std::cout << "detection, image size: " << image.cols << ", " + << image.rows + << ", detect object: " << class_names[obj.class_id] + << ", score: " << obj.prob << ", location: x=" << x + << ", y=" << y << ", width=" << w << ", height=" << h + << std::endl; + } + } + data += 6; + } + return rect_out; +} + +void RunModel(std::string model_dir, std::string img_path) { + // 1. Set MobileConfig + MobileConfig config; + config.set_model_dir(model_dir); + + // 2. Create PaddlePredictor by MobileConfig + std::shared_ptr predictor = + CreatePaddlePredictor(config); + + // 3. Prepare input data from image + std::unique_ptr input_tensor(std::move(predictor->GetInput(0))); + const int in_width = 300; + const int in_height = 300; + input_tensor->Resize({1, 3, in_height, in_width}); + auto* data = input_tensor->mutable_data(); + cv::Mat img = imread(img_path, cv::IMREAD_COLOR); + pre_process(img, in_width, in_height, data); + + // 4. Run predictor + predictor->Run(); + + // 5. Get output and post process + std::unique_ptr output_tensor( + std::move(predictor->GetOutput(0))); + auto* outptr = output_tensor->data(); + auto shape_out = output_tensor->shape(); + int64_t cnt = 1; + for (auto& i : shape_out) { + cnt *= i; + } + auto rec_out = detect_object(outptr, static_cast(cnt / 6), 0.6f, img); + std::string result_name = + img_path.substr(0, img_path.find(".")) + "_detection_result.jpg"; + cv::imwrite(result_name, img); +} + +int main(int argc, char** argv) { + if (argc < 3) { + std::cerr << "[ERROR] usage: " << argv[0] << " model_dir image_path\n"; + exit(1); + } + std::string model_dir = argv[1]; + std::string img_path = argv[2]; + RunModel(model_dir, img_path); + return 0; +} diff --git a/lite/demo/cxx/mobile_detection/test.jpg b/lite/demo/cxx/mobile_detection/test.jpg new file mode 100644 index 0000000000000000000000000000000000000000..6bb36e136deec6088c7b75215fc35d6231283673 Binary files /dev/null and b/lite/demo/cxx/mobile_detection/test.jpg differ diff --git a/lite/demo/cxx/mobile_full/mobilenetv1_full_api.cc b/lite/demo/cxx/mobile_full/mobilenetv1_full_api.cc index 5ac041b2cc53e8f17ad86a2b71e6b02058b7e249..0c9da1a76422edae45dfeec5d38556a5e2322a85 100644 --- a/lite/demo/cxx/mobile_full/mobilenetv1_full_api.cc +++ b/lite/demo/cxx/mobile_full/mobilenetv1_full_api.cc @@ -13,12 +13,10 @@ // limitations under the License. #include -#include +#include #include -#include "paddle_api.h" // NOLINT -#include "paddle_use_kernels.h" // NOLINT -#include "paddle_use_ops.h" // NOLINT -#include "paddle_use_passes.h" // NOLINT +#include "paddle_api.h" // NOLINT +#include "paddle_use_passes.h" // NOLINT using namespace paddle::lite_api; // NOLINT @@ -32,11 +30,21 @@ int64_t ShapeProduction(const shape_t& shape) { return res; } +// 0. Enable OpenCL, if needed +// Enable `DEMO_WITH_OPENCL` macro below, if user need use gpu(opencl) +// #define DEMO_WITH_OPENCL void RunModel() { // 1. 
Set CxxConfig CxxConfig config; config.set_model_dir(FLAGS_model_dir); +#ifdef DEMO_WITH_OPENCL + std::vector valid_places{ + Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)}, + Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNHWC)}, + Place{TARGET(kARM), PRECISION(kFloat)}}; +#else std::vector valid_places{Place{TARGET(kARM), PRECISION(kFloat)}}; +#endif if (FLAGS_prefer_int8_kernel) { valid_places.insert(valid_places.begin(), Place{TARGET(kARM), PRECISION(kInt8)}); @@ -68,14 +76,22 @@ void RunModel() { // 6. Get output std::unique_ptr output_tensor( std::move(predictor->GetOutput(0))); - printf("Output dim: %d\n", output_tensor->shape()[1]); + std::cout << "Output shape " << output_tensor->shape()[1] << std::endl; for (int i = 0; i < ShapeProduction(output_tensor->shape()); i += 100) { - printf("Output[%d]: %f\n", i, output_tensor->data()[i]); + std::cout << "Output[" << i << "]: " << output_tensor->data()[i] + << std::endl; } } int main(int argc, char** argv) { google::ParseCommandLineFlags(&argc, &argv, true); + if (FLAGS_model_dir == "" || FLAGS_optimized_model_dir == "") { + std::cerr << "[ERROR] usage: " << argv[0] + << " --model_dir=" + << " --optimized_model_dir= " + << " --prefer_int8_kernel=[true|false]\n"; + exit(1); + } RunModel(); return 0; } diff --git a/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc b/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc index e1833814cad17b2af182443874c69f4c91e542fc..c40e3d5e9aa1dfc88ca0fae8d14c11b2a6dcbe1d 100644 --- a/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc +++ b/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc @@ -12,27 +12,22 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include -#include +#include #include -#include "paddle_api.h" // NOLINT -#include "paddle_use_kernels.h" // NOLINT -#include "paddle_use_ops.h" // NOLINT +#include "paddle_api.h" // NOLINT using namespace paddle::lite_api; // NOLINT -DEFINE_string(model_dir, "", "Model dir path."); - int64_t ShapeProduction(const shape_t& shape) { int64_t res = 1; for (auto i : shape) res *= i; return res; } -void RunModel() { +void RunModel(std::string model_dir) { // 1. Set MobileConfig MobileConfig config; - config.set_model_dir(FLAGS_model_dir); + config.set_model_dir(model_dir); // 2. Create PaddlePredictor by MobileConfig std::shared_ptr predictor = @@ -52,14 +47,19 @@ void RunModel() { // 5. 
Get output std::unique_ptr output_tensor( std::move(predictor->GetOutput(0))); - printf("Output dim: %d\n", output_tensor->shape()[1]); + std::cout << "Output shape " << output_tensor->shape()[1] << std::endl; for (int i = 0; i < ShapeProduction(output_tensor->shape()); i += 100) { - printf("Output[%d]: %f\n", i, output_tensor->data()[i]); + std::cout << "Output[" << i << "]: " << output_tensor->data()[i] + << std::endl; } } int main(int argc, char** argv) { - google::ParseCommandLineFlags(&argc, &argv, true); - RunModel(); + if (argc < 2) { + std::cerr << "[ERROR] usage: ./" << argv[0] << " naive_buffer_model_dir\n"; + exit(1); + } + std::string model_dir = argv[1]; + RunModel(model_dir); return 0; } diff --git a/lite/gen_code/CMakeLists.txt b/lite/gen_code/CMakeLists.txt index 56c70cf1e1d28dcc1cd6945130520002c8150a8d..40c95415546d99a66abf2d6f3595ae8695c4df86 100644 --- a/lite/gen_code/CMakeLists.txt +++ b/lite/gen_code/CMakeLists.txt @@ -18,7 +18,6 @@ lite_cc_test(test_gen_code SRCS gen_code_test.cc XPU_DEPS ${xpu_kernels} CL_DEPS ${opencl_kernels} FPGA_DEPS ${fpga_kernels} - BM_DEPS ${bm_kernels} EXCLUDE_COMPILE_DEPS "ON" ARGS --optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL) @@ -47,7 +46,6 @@ lite_cc_test(test_generated_code SRCS generated_code_test.cc DEPS __generated_co XPU_DEPS ${xpu_kernels} CL_DEPS ${opencl_kernels} FPGA_DEPS ${fpga_kernels} - BM_DEPS ${bm_kernels} EXCLUDE_COMPILE_DEPS "ON" ) diff --git a/lite/kernels/arm/CMakeLists.txt b/lite/kernels/arm/CMakeLists.txt index 8949602cab00c28d03424ad7cca2387765375b80..0c8866eaf88145d3bb0703b32ffb3eaf80332898 100644 --- a/lite/kernels/arm/CMakeLists.txt +++ b/lite/kernels/arm/CMakeLists.txt @@ -1,3 +1,5 @@ + +# 1. basic kernels for basic models # for conv op add_kernel(conv_depthwise ARM basic SRCS conv_depthwise.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(conv_direct ARM basic SRCS conv_direct.cc DEPS ${lite_kernel_deps} math_arm) @@ -14,51 +16,58 @@ add_kernel(scale_compute_arm ARM basic SRCS scale_compute.cc DEPS ${lite_kernel_ add_kernel(softmax_compute_arm ARM basic SRCS softmax_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(batch_norm_compute_arm ARM basic SRCS batch_norm_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(elementwise_compute_arm ARM basic SRCS elementwise_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(lrn_compute_arm ARM basic SRCS lrn_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(decode_bboxes_compute_arm ARM basic SRCS decode_bboxes_compute.cc DEPS ${lite_kernel_deps} math_arm) + add_kernel(pool_compute_arm ARM basic SRCS pool_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(split_compute_arm ARM basic SRCS split_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(concat_compute_arm ARM basic SRCS concat_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(pad2d_compute_arm ARM basic SRCS pad2d_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(prior_box_compute_arm ARM basic SRCS prior_box_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(density_prior_box_compute_arm ARM basic SRCS density_prior_box_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(negative_compute_arm ARM basic SRCS negative_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(crop_compute_arm ARM basic SRCS crop_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(dropout_compute_arm ARM basic SRCS dropout_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(calib_compute_arm ARM basic SRCS calib_compute.cc DEPS 
${lite_kernel_deps} math_arm) add_kernel(transpose_compute_arm ARM basic SRCS transpose_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(power_compute_arm ARM basic SRCS power_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(yolo_box_compute_arm ARM basic SRCS yolo_box_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(shuffle_channel_compute_arm ARM basic SRCS shuffle_channel_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(argmax_compute_arm ARM basic SRCS argmax_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(axpy_compute_arm ARM basic SRCS axpy_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(conv_transpose_compute_arm ARM basic SRCS conv_transpose_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(norm_compute_arm ARM basic SRCS norm_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(interpolate_compute_arm ARM basic SRCS interpolate_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(box_coder_compute_arm ARM basic SRCS box_coder_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(shape_compute_arm ARM basic SRCS shape_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(slice_compute_arm ARM basic SRCS slice_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(cast_compute_arm ARM basic SRCS cast_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(squeeze_compute_arm ARM basic SRCS squeeze_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(unsqueeze_compute_arm ARM extra SRCS unsqueeze_compute.cc DEPS ${lite_kernel_deps} math_arm) +add_kernel(unsqueeze_compute_arm ARM basic SRCS unsqueeze_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(expand_compute_arm ARM basic SRCS expand_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(reduce_max_compute_arm ARM basic SRCS reduce_max_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(sequence_expand_compute_arm ARM basic SRCS sequence_expand_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(im2sequence_compute_arm ARM basic SRCS im2sequence_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(sequence_pool_compute_arm ARM basic SRCS sequence_pool_compute.cc DEPS ${lite_kernel_deps} math_arm) +add_kernel(reduce_mean_compute_arm ARM basic SRCS reduce_mean_compute.cc DEPS ${lite_kernel_deps} math_arm) +add_kernel(stack_compute_arm ARM basic SRCS stack_compute.cc DEPS ${lite_kernel_deps} math_arm) +add_kernel(affine_channel_compute_arm ARM basic SRCS affine_channel_compute.cc DEPS ${lite_kernel_deps} math_arm) +add_kernel(range_compute_arm ARM basic SRCS range_compute.cc DEPS ${lite_kernel_deps} math_arm) +add_kernel(dropout_compute_arm ARM basic SRCS dropout_compute.cc DEPS ${lite_kernel_deps} math_arm) +add_kernel(layout_compute_arm ARM basic SRCS layout_compute.cc DEPS ${lite_kernel_deps} math_arm) + +## 2.other basic kernels: basic kernels that not used in basic models +add_kernel(negative_compute_arm ARM extra SRCS negative_compute.cc DEPS ${lite_kernel_deps} math_arm) +add_kernel(crop_compute_arm ARM extra SRCS crop_compute.cc DEPS ${lite_kernel_deps} math_arm) +add_kernel(power_compute_arm ARM extra SRCS power_compute.cc DEPS ${lite_kernel_deps} math_arm) +add_kernel(norm_compute_arm ARM extra SRCS norm_compute.cc DEPS ${lite_kernel_deps} math_arm) +add_kernel(assign_compute_arm ARM extra SRCS assign_compute.cc DEPS ${lite_kernel_deps} math_arm) + +## 3. 
extra kernels +add_kernel(lrn_compute_arm ARM extra SRCS lrn_compute.cc DEPS ${lite_kernel_deps} math_arm) +add_kernel(decode_bboxes_compute_arm ARM extra SRCS decode_bboxes_compute.cc DEPS ${lite_kernel_deps} math_arm) +add_kernel(density_prior_box_compute_arm ARM extra SRCS density_prior_box_compute.cc DEPS ${lite_kernel_deps} math_arm) +add_kernel(axpy_compute_arm ARM extra SRCS axpy_compute.cc DEPS ${lite_kernel_deps} math_arm) +add_kernel(shape_compute_arm ARM extra SRCS shape_compute.cc DEPS ${lite_kernel_deps} math_arm) +add_kernel(reduce_max_compute_arm ARM extra SRCS reduce_max_compute.cc DEPS ${lite_kernel_deps} math_arm) +add_kernel(sequence_expand_compute_arm ARM extra SRCS sequence_expand_compute.cc DEPS ${lite_kernel_deps} math_arm) +add_kernel(im2sequence_compute_arm ARM extra SRCS im2sequence_compute.cc DEPS ${lite_kernel_deps} math_arm) +add_kernel(sequence_pool_compute_arm ARM extra SRCS sequence_pool_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(layer_norm_compute_arm ARM extra SRCS layer_norm_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(gather_compute_arm ARM extra SRCS gather_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(reduce_mean_compute_arm ARM extra SRCS reduce_mean_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(stack_compute_arm ARM extra SRCS stack_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(assign_compute_arm ARM extra SRCS assign_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(affine_channel_compute_arm ARM extra SRCS affine_channel_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(anchor_generator_compute_arm ARM extra SRCS anchor_generator_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(generate_proposals_compute_arm ARM extra SRCS generate_proposals_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(roi_align_compute_arm ARM extra SRCS roi_align_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(box_clip_compute_arm ARM extra SRCS box_clip_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(range_compute_arm ARM extra SRCS range_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(assign_value_compute_arm ARM extra SRCS assign_value_compute.cc DEPS ${lite_kernel_deps} math_arm) + # for OCR specific add_kernel(gru_unit_compute_arm ARM extra SRCS gru_unit_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(gru_compute_arm ARM extra SRCS gru_compute.cc DEPS ${lite_kernel_deps} math_arm) @@ -74,7 +83,7 @@ add_kernel(increment_compute_arm ARM extra SRCS increment_compute.cc DEPS ${lite add_kernel(write_to_array_compute_arm ARM extra SRCS write_to_array_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(read_from_array_compute_arm ARM extra SRCS read_from_array_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(beam_search_compute_arm ARM extra SRCS beam_search_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(fill_constant_compute_arm ARM extra SRCS fill_constant_compute.cc DEPS ${lite_kernel_deps} math_arm) +add_kernel(fill_constant_compute_arm ARM basic SRCS fill_constant_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(lod_reset_compute_arm ARM extra SRCS lod_reset_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(is_empty_compute_arm ARM extra SRCS is_empty_compute.cc DEPS ${lite_kernel_deps} math_arm) @@ -90,18 +99,17 @@ lite_cc_test(test_scale_compute_arm SRCS scale_compute_test.cc DEPS scale_comput lite_cc_test(test_softmax_compute_arm SRCS softmax_compute_test.cc DEPS softmax_compute_arm) 
lite_cc_test(test_batch_norm_compute_arm SRCS batch_norm_compute_test.cc DEPS batch_norm_compute_arm) lite_cc_test(test_elementwise_compute_arm SRCS elementwise_compute_test.cc DEPS elementwise_compute_arm) -lite_cc_test(test_lrn_compute_arm SRCS lrn_compute_test.cc DEPS lrn_compute_arm) -lite_cc_test(test_decode_bboxes_compute_arm SRCS decode_bboxes_compute_test.cc DEPS decode_bboxes_compute_arm) lite_cc_test(test_pool_compute_arm SRCS pool_compute_test.cc DEPS pool_compute_arm) lite_cc_test(test_mul_compute_arm SRCS mul_compute_test.cc DEPS mul_compute_arm) lite_cc_test(test_split_compute_arm SRCS split_compute_test.cc DEPS split_compute_arm) lite_cc_test(test_concat_compute_arm SRCS concat_compute_test.cc DEPS concat_compute_arm) -lite_cc_test(test_dropout_compute_arm SRCS dropout_compute_test.cc DEPS dropout_compute_arm) lite_cc_test(test_transpose_compute_arm SRCS transpose_compute_test.cc DEPS transpose_compute_arm COMPILE_LEVEL extra) lite_cc_test(test_argmax_compute_arm SRCS argmax_compute_test.cc DEPS argmax_compute_arm) -lite_cc_test(test_axpy_compute_arm SRCS axpy_compute_test.cc DEPS axpy_compute_arm) -lite_cc_test(test_conv_transpose_compute_arm SRCS conv_transpose_compute_test.cc DEPS conv_transpose_compute_arm) - +lite_cc_test(test_dropout_compute_arm SRCS dropout_compute_test.cc DEPS dropout_compute_arm) if(LITE_BUILD_EXTRA) + lite_cc_test(test_lrn_compute_arm SRCS lrn_compute_test.cc DEPS lrn_compute_arm) + lite_cc_test(test_decode_bboxes_compute_arm SRCS decode_bboxes_compute_test.cc DEPS decode_bboxes_compute_arm) + lite_cc_test(test_axpy_compute_arm SRCS axpy_compute_test.cc DEPS axpy_compute_arm) lite_cc_test(test_layer_norm_compute_arm SRCS layer_norm_compute_test.cc DEPS layer_norm_compute_arm) + lite_cc_test(test_lookup_table_compute_arm SRCS lookup_table_compute_test.cc DEPS lookup_table_compute_arm) endif() diff --git a/lite/kernels/arm/conv_compute.cc b/lite/kernels/arm/conv_compute.cc index ebb96e21d5e856325b7abdb8342df2aea3d5b5c3..69e507ba347583b3761fe38d86136a22f2576c15 100644 --- a/lite/kernels/arm/conv_compute.cc +++ b/lite/kernels/arm/conv_compute.cc @@ -32,13 +32,18 @@ void ConvCompute::PrepareForRun() { auto w_dims = param.filter->dims(); auto& ctx = this->ctx_->template As(); + auto paddings = *param.paddings; + auto dilations = *param.dilations; int ic = w_dims[1] * param.groups; int oc = w_dims[0]; int kh = w_dims[2]; // oihw int kw = w_dims[3]; - int pad = param.paddings[0]; + int pad = paddings[0]; int stride = param.strides[0]; + int threads = ctx.threads(); + bool pads_equal = + ((paddings[0] == paddings[1]) && (paddings[2] == paddings[3])); int chin = param.x->dims()[1]; int hin = param.x->dims()[2]; int win = param.x->dims()[3]; @@ -46,22 +51,28 @@ void ConvCompute::PrepareForRun() { int hout = param.output->dims()[2]; int wout = param.output->dims()[3]; - bool kps_equal = (param.paddings[0] == param.paddings[1]) && - (param.strides[0] == param.strides[1]) && (kw == kh); - bool no_dilation = (param.dilations[0] == 1) && (param.dilations[1] == 1); + bool pads_all_equal = (pads_equal && paddings[0] == paddings[2]); + + bool kps_equal = (param.strides[0] == param.strides[1]) && (kw == kh); + bool no_dilation = (dilations[0] == 1) && (dilations[1] == 1); bool flag_dw_3x3 = (kw == 3 && kh == 3 && (stride == 1 || stride == 2)); - bool flag_dw_5x5 = - (kw == 5 && stride == 1) || (kw == 5 && stride == 2 && pad == 2); + bool flag_dw_5x5 = pads_all_equal && ((kw == 5 && stride == 1) || + (kw == 5 && stride == 2 && pad == 2)); bool flag_dw = flag_dw_3x3 || 
flag_dw_5x5; /// select conv impl - if (param.groups == ic && ic == oc && kps_equal && no_dilation && flag_dw) { + if (param.groups == ic && ic == oc && kps_equal && pads_equal && + no_dilation && flag_dw) { /// dw conv impl impl_ = new DepthwiseConv; VLOG(3) << "invoking dw conv"; } else if (param.groups == 1 && kw == 3 && stride == 1 && kps_equal && no_dilation) { - if (ic >= 32 && oc >= 32 && hout > 16 && wout > 16) { + bool use_winograd = + (threads == 1 && oc >= 4 && ic >= 4 && hout >= 6 && wout >= 6 && + pads_equal) || + (oc >= 32 && ic >= 32 && hout >= 16 && wout >= 16 && pads_equal); + if (use_winograd) { /// winograd conv impl impl_ = new WinogradConv; VLOG(3) << "invoking winograd conv"; @@ -92,22 +103,29 @@ void ConvCompute::PrepareForRun() { auto& ctx = this->ctx_->template As(); + auto paddings = *param.paddings; + auto dilations = *param.dilations; + bool pads_equal = + ((paddings[0] == paddings[1]) && (paddings[2] == paddings[3])); int ic = param.groups * w_dims[1]; int oc = w_dims[0]; int kh = w_dims[2]; // oihw int kw = w_dims[3]; - int ph = param.paddings[1]; - int pw = param.paddings[0]; + int ph = paddings[0]; + int pw = paddings[2]; int sh = param.strides[1]; int sw = param.strides[0]; + bool pads_all_equal = (pads_equal && paddings[0] == paddings[2]); bool kps_equal = (pw == ph) && (sh == sw) && (kw == kh); - bool no_dilation = (param.dilations[0] == 1) && (param.dilations[1] == 1); - bool flag_dw_3x3 = (kw == 3 && kh == 3) && (sw == 1 || sw == 2); - bool flag_dw_5x5 = (kw == 5 && sw == 1); + bool no_dilation = (dilations[0] == 1) && (dilations[1] == 1); + bool flag_dw_3x3 = (kw == 3 && kh == 3 && (sw == 1 || sw == 2)); + bool flag_dw_5x5 = pads_all_equal && + ((kw == 5 && sw == 1) || (kw == 5 && sw == 2 && pw == 2)); bool flag_dw = flag_dw_3x3 || flag_dw_5x5; - if (param.groups == ic && ic == oc && kps_equal && no_dilation && flag_dw) { + if (param.groups == ic && ic == oc && kps_equal && pads_equal && + no_dilation && flag_dw) { impl_ = new DepthwiseConv; VLOG(3) << "Run DepthwiseConv Int8"; } else if (param.groups == 1 && kw == 3 && (sw == 1 || sw == 2) && @@ -130,23 +148,30 @@ void ConvCompute::PrepareForRun() { auto w_dims = param.filter->dims(); auto& ctx = this->ctx_->template As(); + auto paddings = *param.paddings; + auto dilations = *param.dilations; + bool pads_equal = + ((paddings[0] == paddings[1]) && (paddings[2] == paddings[3])); int ic = w_dims[1] * param.groups; int oc = w_dims[0]; int kh = w_dims[2]; // oihw int kw = w_dims[3]; - int ph = param.paddings[1]; - int pw = param.paddings[0]; + int ph = paddings[0]; + int pw = paddings[2]; int sh = param.strides[1]; int sw = param.strides[0]; + bool pads_all_equal = (pads_equal && paddings[0] == paddings[2]); bool kps_equal = (pw == ph) && (sh == sw) && (kw == kh); - bool no_dilation = (param.dilations[0] == 1) && (param.dilations[1] == 1); - bool flag_dw_3x3 = (kw == 3 && kh == 3) && (sw == 1 || sw == 2); - bool flag_dw_5x5 = (kw == 5 && sw == 1); + bool no_dilation = (dilations[0] == 1) && (dilations[1] == 1); + bool flag_dw_3x3 = (kw == 3 && kh == 3 && (sw == 1 || sw == 2)); + bool flag_dw_5x5 = pads_all_equal && + ((kw == 5 && sw == 1) || (kw == 5 && sw == 2 && pw == 2)); bool flag_dw = flag_dw_3x3 || flag_dw_5x5; - if (param.groups == ic && ic == oc && kps_equal && no_dilation && flag_dw) { + if (param.groups == ic && ic == oc && kps_equal && pads_equal && + no_dilation && flag_dw) { impl_ = new DepthwiseConv; VLOG(3) << "Run DepthwiseConv Int8"; } else if (param.groups == 1 && kw == 3 && (sw == 1 || 
sw == 2) && @@ -194,7 +219,7 @@ REGISTER_LITE_KERNEL(depthwise_conv2d, kARM, kFloat, kNCHW, ConvFp32, def) REGISTER_LITE_KERNEL(conv2d, kARM, kInt8, kNCHW, ConvInt8_Int8, int8_out) .BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt8))}) - .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kFloat))}) .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt8))}) .BindOutput("Output", @@ -203,7 +228,7 @@ REGISTER_LITE_KERNEL(conv2d, kARM, kInt8, kNCHW, ConvInt8_Int8, int8_out) REGISTER_LITE_KERNEL(conv2d, kARM, kInt8, kNCHW, ConvInt8_Fp32, fp32_out) .BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt8))}) - .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kFloat))}) .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt8))}) .BindOutput("Output", @@ -213,7 +238,7 @@ REGISTER_LITE_KERNEL(conv2d, kARM, kInt8, kNCHW, ConvInt8_Fp32, fp32_out) REGISTER_LITE_KERNEL( depthwise_conv2d, kARM, kInt8, kNCHW, ConvInt8_Int8, int8_out) .BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt8))}) - .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kFloat))}) .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt8))}) .BindOutput("Output", @@ -223,7 +248,7 @@ REGISTER_LITE_KERNEL( REGISTER_LITE_KERNEL( depthwise_conv2d, kARM, kInt8, kNCHW, ConvInt8_Fp32, fp32_out) .BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt8))}) - .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kFloat))}) .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt8))}) .BindOutput("Output", diff --git a/lite/kernels/arm/conv_depthwise.cc b/lite/kernels/arm/conv_depthwise.cc index 6a20d607e3a594c8eff83e1f872433f1c6025fd2..e2eaef51ddcb169313e6675d497ca4d7cab438d3 100644 --- a/lite/kernels/arm/conv_depthwise.cc +++ b/lite/kernels/arm/conv_depthwise.cc @@ -31,19 +31,28 @@ void DepthwiseConv::PrepareForRun() { // select dw conv kernel if (kw == 3) { VLOG(5) << "invoke 3x3 dw conv fp32"; - // trans weights - constexpr int cblock = 4; - auto oc = w_dims[0]; - auto kh = w_dims[2]; - auto cround = ROUNDUP(oc, cblock); - weights_.Resize({cround, 1, kh, kw}); - // auto w_data = weights_.mutable_data(); - // auto w_data_in = param.filter->data(); - // lite::arm::math::conv_trans_weights_numc( - // w_data_in, w_data, oc, 1, cblock, kh * kw); - impl_ = lite::arm::math::conv_depthwise_3x3_fp32; - flag_trans_weights_ = false; - // flag_trans_weights_ = true; + auto paddings = *param.paddings; + bool pads_equal = + ((paddings[0] == paddings[1]) && (paddings[2] == paddings[3])); + + if (pads_equal && paddings[0] == paddings[2] && + (paddings[0] == 0 || paddings[0] == 1)) { + impl_ = lite::arm::math::conv_depthwise_3x3_fp32; + flag_trans_weights_ = false; + } else { + // trans weights + constexpr int cblock = 4; + auto oc = w_dims[0]; + auto kh = w_dims[2]; + auto cround = ROUNDUP(oc, cblock); + weights_.Resize({cround, 1, kh, kw}); + auto w_data = weights_.mutable_data(); + auto w_data_in = param.filter->data(); + lite::arm::math::conv_trans_weights_numc( + w_data_in, w_data, oc, 1, cblock, kh * kw); + impl_ = 
lite::arm::math::conv_depthwise_3x3_fp32; + flag_trans_weights_ = true; + } } else if (kw == 5) { VLOG(5) << "invoke 5x5 dw conv fp32"; impl_ = lite::arm::math::conv_depthwise_5x5_fp32; diff --git a/lite/kernels/arm/conv_gemmlike.h b/lite/kernels/arm/conv_gemmlike.h index e00b8de6f4a66dfea91e8806821ba7cf3a9aa62b..5e59eb8d1790ab8845df3093ce7d86356b031034 100644 --- a/lite/kernels/arm/conv_gemmlike.h +++ b/lite/kernels/arm/conv_gemmlike.h @@ -52,12 +52,19 @@ class GemmLikeConv : public KernelLite { int oc = o_dims[1]; int kw = w_dims[3]; int kh = w_dims[2]; + + auto paddings = *param.paddings; + auto dilations = *param.dilations; + int sw = param.strides[1]; int sh = param.strides[0]; - int pw = param.paddings[1]; - int ph = param.paddings[0]; - int dw = param.dilations[1]; - int dh = param.dilations[0]; + int pw = paddings[2]; + int ph = paddings[0]; + int dw = dilations[1]; + int dh = dilations[0]; + + bool pads_equal = + ((paddings[0] == paddings[1]) && (paddings[2] == paddings[3])); int m = oc / param.groups; int k = ic * kh * kw / param.groups; @@ -66,7 +73,7 @@ class GemmLikeConv : public KernelLite { bool kps_equal = (pw == ph) && (sw == sh) && (kw == kh); bool ks_equal = (sw == sh) && (kw == kh); //! select conv gemmlike kernel - if (kw == 1 && sw == 1 && pw == 0 && kps_equal) { + if (kw == 1 && sw == 1 && pw == 0 && kps_equal && pads_equal) { //! 1x1s1p0 gemmlike conv flag_1x1gemm_ = true; } else { diff --git a/lite/kernels/arm/conv_transpose_compute.cc b/lite/kernels/arm/conv_transpose_compute.cc index 5a18499c85d682e0983493869e7d54de81641a99..5c58b297138c0c042bc332e59f5ae7b76e83e779 100644 --- a/lite/kernels/arm/conv_transpose_compute.cc +++ b/lite/kernels/arm/conv_transpose_compute.cc @@ -76,19 +76,28 @@ void Conv2DTransposeCompute::Run() { bool fuse_relu = param.fuse_relu; bool flag_bias = (param.bias != nullptr); + auto paddings = *param.paddings; + auto dilations = *param.dilations; + int m = chout * kw * kh / group; int n = hin * win; int k = chin / group; + + bool pads_equal = + (paddings[0] == paddings[1]) && (paddings[2] == paddings[3]); + int group_size_in = win * hin * chin / group; int group_size_out = wout * hout * chout / group; int group_size_coldata = m * n; + + bool pads_all_qual = pads_equal && (paddings[0] == paddings[2]); int hblock = lite::arm::math::get_hblock(&ctx); int m_roundup = hblock * ((m + hblock - 1) / hblock); int group_size_weights = ((m_roundup * k + 15) / 16) * 16; bool flag_1x1s1p1 = (kw == 1) && (kh == 1) && (param.strides[0] == 1) && - (param.strides[1] == 1) && (param.paddings[0] == 0) && - (param.paddings[1] == 0) && (param.dilations[0] == 1) && - (param.dilations[1] == 1); + (param.strides[1] == 1) && pads_all_qual && + (paddings[0] == 0) && (dilations[0] == 1) && + (dilations[1] == 1); ctx.ExtendWorkspace(sizeof(float) * group * m * n); auto din = param.x->data(); @@ -129,12 +138,14 @@ void Conv2DTransposeCompute::Run() { wout, kh, kw, - param.paddings[0], - param.paddings[1], + paddings[0], + paddings[1], + paddings[2], + paddings[3], param.strides[0], param.strides[1], - param.dilations[0], - param.dilations[1], + dilations[0], + dilations[1], dout_batch); } if (flag_bias) { diff --git a/lite/kernels/arm/conv_winograd.cc b/lite/kernels/arm/conv_winograd.cc index d1b8d8a48ecd7d564947486ee2938d6b630c41e5..d02cabf277a5e25e2dc731b5bcf0eabe601c9aae 100644 --- a/lite/kernels/arm/conv_winograd.cc +++ b/lite/kernels/arm/conv_winograd.cc @@ -26,6 +26,7 @@ template <> void WinogradConv::ReInitWhenNeeded() { auto& param = this->Param(); auto& ctx = 
this->ctx_->template As(); + int threads = ctx.threads(); auto x_dims = param.x->dims(); auto w_dims = param.filter->dims(); @@ -36,77 +37,97 @@ void WinogradConv::ReInitWhenNeeded() { } int ic = x_dims[1]; - int ow = o_dims[3]; - int oh = o_dims[2]; + int ih = x_dims[2]; + int iw = x_dims[3]; int oc = o_dims[1]; - int tile_w = (ow + 5) / 6; - int tile_h = (oh + 5) / 6; - int size_tile = tile_h * tile_w; - int size_trans_channel = 8 * 8 * size_tile; - int max_ch = ic > oc ? ic : oc; - - const int n_wino = size_tile; - workspace_size_ = (size_trans_channel * max_ch * 2 + n_wino) * sizeof(float); + int oh = o_dims[2]; + int ow = o_dims[3]; + int tile_block = 8; +#ifdef __aarch64__ + tile_block = 16; +#endif + int parallel_threads = + (((ow + 5) / 6) * ((oh + 5) / 6) + tile_block - 1) / tile_block; + if (threads <= 2 && parallel_threads >= threads) { + if (last_kernel_is_c4_ == 1) { + return; + } + last_kernel_is_c4_ = 1; + auto pad = *(param.paddings); + int pad_h = pad[0]; + int pad_w = pad[2]; + int oc_pad = (oc + 3) / 4 * 4; + int ic_pad = (ic + 3) / 4 * 4; + const int new_input_size = + (ic + 3) / 4 * 4 * (ih + pad_h * 2) * (iw + pad_w * 2); + const int temp_size = + (tile_block * ((ic + 3) / 4 + (oc + 3) / 4) * 256 + 512) * threads; + ctx.ExtendWorkspace((temp_size + new_input_size) * sizeof(float)); + + weights_.Resize({1, 1, 1, 64 * oc_pad * ic_pad}); + ctx.ExtendWorkspace((temp_size + new_input_size) * sizeof(float)); + void* trans_tmp_ptr = malloc(sizeof(float) * 8 * 8 * oc * ic); + auto weights_data_ = weights_.mutable_data(); + lite::arm::math::weight_trans_c4( + weights_data_, param.filter->data(), ic, oc, trans_tmp_ptr); + free(trans_tmp_ptr); + } else { + if (last_kernel_is_c4_ == 0) { + return; + } + last_kernel_is_c4_ = 0; + int tile_w = (ow + 5) / 6; + int tile_h = (oh + 5) / 6; + + int size_tile = tile_h * tile_w; + int size_trans_channel = 8 * 8 * size_tile; + int max_ch = ic > oc ? ic : oc; + + const int n_wino = size_tile; + ctx.ExtendWorkspace((size_trans_channel * max_ch * 2 + n_wino) * + sizeof(float)); + + const int m_wino = oc; + int hblock = lite::arm::math::get_hblock(&ctx); + int m_round = hblock * ((m_wino + hblock - 1) / hblock); + weights_.Resize({1, 1, 1, 8 * 8 * m_round * ic}); + ctx.ExtendWorkspace((size_trans_channel * max_ch * 2 + n_wino) * + sizeof(float)); + auto weights_wino = + static_cast(malloc(sizeof(float) * 8 * 8 * oc * ic)); + void* trans_tmp_ptr = malloc(sizeof(float) * 8 * 8 * oc * ic); + lite::arm::math::winograd_transform_weights( + weights_wino, param.filter->data(), oc, ic, trans_tmp_ptr); + auto weights_trans = weights_.mutable_data(); + for (int i = 0; i < 64; ++i) { + float* packed_weights = weights_trans + i * m_round * ic; + const float* weights_wino_ptr = weights_wino + i * oc * ic; + lite::arm::math::prepackA(packed_weights, + weights_wino_ptr, + 1.f, + ic, + 0, + m_wino, + 0, + ic, + false, + &ctx); + } + free(trans_tmp_ptr); + free(weights_wino); + } last_shape_ = x_dims; } template <> void WinogradConv::PrepareForRun() { - auto& param = this->Param(); - auto& ctx = this->ctx_->template As(); - - auto x_dims = param.x->dims(); - auto w_dims = param.filter->dims(); - auto o_dims = param.output->dims(); - last_shape_ = x_dims; - - int ic = x_dims[1]; - int ow = o_dims[3]; - int oh = o_dims[2]; - int oc = o_dims[1]; - int tile_w = (ow + 5) / 6; - int tile_h = (oh + 5) / 6; - int size_tile = tile_h * tile_w; - int size_trans_channel = 8 * 8 * size_tile; - int max_ch = ic > oc ? 
ic : oc; - - const int m_wino = oc; - const int n_wino = size_tile; - int hblock = lite::arm::math::get_hblock(&ctx); - int m_round = hblock * ((m_wino + hblock - 1) / hblock); - weights_.Resize({1, 1, 1, 8 * 8 * m_round * ic}); - workspace_size_ = (size_trans_channel * max_ch * 2 + n_wino) * sizeof(float); - auto weights_wino = - static_cast(malloc(sizeof(float) * 8 * 8 * oc * ic)); - void* trans_tmp_ptr = malloc(sizeof(float) * 8 * 8 * oc * ic); - lite::arm::math::winograd_transform_weights( - weights_wino, param.filter->data(), oc, ic, trans_tmp_ptr); - auto weights_trans = weights_.mutable_data(); - for (int i = 0; i < 64; ++i) { - float* packed_weights = weights_trans + i * m_round * ic; - const float* weights_wino_ptr = weights_wino + i * oc * ic; - lite::arm::math::prepackA(packed_weights, - weights_wino_ptr, - 1.f, - ic, - 0, - m_wino, - 0, - ic, - false, - &ctx); - } - free(trans_tmp_ptr); - free(weights_wino); + ReInitWhenNeeded(); } template <> void WinogradConv::Run() { auto& param = this->Param(); auto& ctx = this->ctx_->template As(); - // extend workspace - ctx.ExtendWorkspace(workspace_size_); - const auto* i_data = param.x->data(); const auto* w_data = weights_.data(); const auto* b_data = param.bias ? param.bias->data() : nullptr; @@ -124,8 +145,42 @@ void WinogradConv::Run() { int ow = o_dims[3]; int oc = o_dims[1]; - lite::arm::math::conv_winograd3x3( - i_data, o_data, bs, oc, oh, ow, ic, ih, iw, w_data, b_data, param, &ctx); + int tile_block = 8; +#ifdef __aarch64__ + tile_block = 16; +#endif + int threads = ctx.threads(); + int parallel_threads = + (((ow + 5) / 6) * ((oh + 5) / 6) + tile_block - 1) / tile_block; + if (threads <= 2 && parallel_threads >= threads) { + lite::arm::math::conv_compute_6x6_3x3(i_data, + o_data, + bs, + oc, + oh, + ow, + ic, + ih, + iw, + w_data, + b_data, + param, + &ctx); + } else { + lite::arm::math::conv_winograd3x3(i_data, + o_data, + bs, + oc, + oh, + ow, + ic, + ih, + iw, + w_data, + b_data, + param, + &ctx); + } } } // namespace arm diff --git a/lite/kernels/arm/conv_winograd.h b/lite/kernels/arm/conv_winograd.h index 33f0edc017adca477b2e71964efdcaddb0ca3a08..40ea54b2918ad6c1b18d36a6df287c7e3eb312a6 100644 --- a/lite/kernels/arm/conv_winograd.h +++ b/lite/kernels/arm/conv_winograd.h @@ -40,6 +40,7 @@ class WinogradConv : public KernelLite { Tensor weights_; DDim last_shape_; int workspace_size_{0}; + int last_kernel_is_c4_{-1}; }; } // namespace arm diff --git a/lite/kernels/arm/fc_compute.cc b/lite/kernels/arm/fc_compute.cc index 1983c733180143dc0c715d6c8e3c4fddac6f8418..525eca269bae22d27d078f6696efcfb8566270c5 100644 --- a/lite/kernels/arm/fc_compute.cc +++ b/lite/kernels/arm/fc_compute.cc @@ -127,7 +127,8 @@ void FcCompute::Run() { k_, param.bias != nullptr, b_data, - false); + false, + &ctx); } } } diff --git a/lite/kernels/arm/fill_constant_compute.cc b/lite/kernels/arm/fill_constant_compute.cc index 0b1911abf4fe553b670cf21dbb519c24dc08f184..05d43dddec47a303a89a2d48b3fb91ff45e6e2c0 100644 --- a/lite/kernels/arm/fill_constant_compute.cc +++ b/lite/kernels/arm/fill_constant_compute.cc @@ -25,6 +25,38 @@ class FillConstantCompute : public KernelLite { public: using param_t = operators::FillConstantParam; + inline DDimLite GetShape(const param_t& param) { + // 1. shape is a Tensor + if (param.shape_tensor != nullptr) { + auto* shape_tensor = param.shape_tensor; + auto* shape_data = shape_tensor->data(); + auto vec_shape = + std::vector(shape_data, shape_data + shape_tensor->numel()); + return DDimLite(vec_shape); + } + + // 2. 
shape is a list/tuple containing Tensor + auto shape_tensor_list = param.shape_tensor_list; + if (shape_tensor_list.size() > 0) { + std::vector vec_shape; + for (size_t i = 0; i < shape_tensor_list.size(); ++i) { + auto tensor = shape_tensor_list[i]; + vec_shape.push_back(*tensor->data()); + } + return DDimLite(vec_shape); + } + + // 3. shape is a list/tuple without containing Tensor + auto vec_shape = param.shape; + return DDimLite(vec_shape); + } + + void PrepareForRun() override { + auto& param = *param_.get_mutable(); + auto outdims = GetShape(param); + param.Out->Resize(outdims); + } + void Run() override { auto& param = *param_.get_mutable(); auto& context = ctx_->As(); @@ -107,6 +139,11 @@ REGISTER_LITE_KERNEL(fill_constant, kNCHW, paddle::lite::kernels::arm::FillConstantCompute, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("ShapeTensor", + {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))}) + .BindInput("ShapeTensorList", + {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) .Finalize(); REGISTER_LITE_KERNEL( diff --git a/lite/kernels/arm/interpolate_compute.cc b/lite/kernels/arm/interpolate_compute.cc index a26777826db6976c755fac7798880871f407c12d..0398dabeaee4c042b33ac5572b783b126bc8ddb4 100644 --- a/lite/kernels/arm/interpolate_compute.cc +++ b/lite/kernels/arm/interpolate_compute.cc @@ -28,6 +28,8 @@ void BilinearInterpCompute::Run() { auto& param = Param(); lite::Tensor* X = param.X; lite::Tensor* OutSize = param.OutSize; + auto SizeTensor = param.SizeTensor; + auto Scale = param.Scale; lite::Tensor* Out = param.Out; float scale = param.scale; int out_w = param.out_w; @@ -36,11 +38,12 @@ void BilinearInterpCompute::Run() { std::string interp_method = "Bilinear"; lite::arm::math::interpolate(X, OutSize, + SizeTensor, + Scale, Out, out_h, out_w, scale, - scale, align_corners, interp_method); } @@ -49,6 +52,8 @@ void NearestInterpCompute::Run() { auto& param = Param(); lite::Tensor* X = param.X; lite::Tensor* OutSize = param.OutSize; + auto SizeTensor = param.SizeTensor; + auto Scale = param.Scale; lite::Tensor* Out = param.Out; float scale = param.scale; int out_w = param.out_w; @@ -57,11 +62,12 @@ void NearestInterpCompute::Run() { std::string interp_method = "Nearest"; lite::arm::math::interpolate(X, OutSize, + SizeTensor, + Scale, Out, out_h, out_w, scale, - scale, align_corners, interp_method); } @@ -79,6 +85,8 @@ REGISTER_LITE_KERNEL(bilinear_interp, def) .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) .BindInput("OutSize", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("SizeTensor", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("Scale", {LiteType::GetTensorTy(TARGET(kARM))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) .Finalize(); @@ -90,5 +98,7 @@ REGISTER_LITE_KERNEL(nearest_interp, def) .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) .BindInput("OutSize", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("SizeTensor", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("Scale", {LiteType::GetTensorTy(TARGET(kARM))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) .Finalize(); diff --git a/lite/kernels/arm/layout_compute.cc b/lite/kernels/arm/layout_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..bc52c5ea3ee452033cfd3c7d559cb88b21ca48f6 --- /dev/null +++ b/lite/kernels/arm/layout_compute.cc @@ -0,0 +1,179 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
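The GetShape helper added here resolves fill_constant's output shape from three possible sources in a fixed order: a ShapeTensor input, then a list of scalar shape tensors, then the static shape attribute; PrepareForRun uses the result to resize Out before Run. A minimal standalone sketch of that precedence, using plain std containers in place of the Lite tensor types (all names below are illustrative, not the framework API):

#include <cstdint>
#include <iostream>
#include <vector>

// Hypothetical stand-ins for the three shape sources of fill_constant.
struct FillConstantShapeSources {
  const std::vector<int32_t>* shape_tensor = nullptr;  // ShapeTensor input
  std::vector<const int32_t*> shape_tensor_list;       // ShapeTensorList input (scalar tensors)
  std::vector<int64_t> shape_attr;                     // static shape attribute
};

// Mirrors the precedence in GetShape(): shape tensor > tensor list > attribute.
std::vector<int64_t> ResolveShape(const FillConstantShapeSources& src) {
  if (src.shape_tensor != nullptr) {
    return std::vector<int64_t>(src.shape_tensor->begin(), src.shape_tensor->end());
  }
  if (!src.shape_tensor_list.empty()) {
    std::vector<int64_t> shape;
    for (const int32_t* dim : src.shape_tensor_list) shape.push_back(*dim);
    return shape;
  }
  return src.shape_attr;
}

int main() {
  FillConstantShapeSources src;
  src.shape_attr = {2, 3};
  std::vector<int32_t> runtime_shape = {4, 5, 6};
  src.shape_tensor = &runtime_shape;  // the runtime tensor wins over the attribute
  for (int64_t d : ResolveShape(src)) std::cout << d << " ";  // prints: 4 5 6
  std::cout << "\n";
  return 0;
}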
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/arm/layout_compute.h" +#include "lite/backends/arm/math/funcs.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace arm { + +#define NCHWTONHWC(type) \ + auto& param = this->template Param(); \ + auto input = param.x->template data(); \ + auto input_dim = param.x->dims(); \ + CHECK(input_dim.size() == 4) \ + << "NCHW to NHWC should guarantee that the input dims should be 4"; \ + int n = input_dim[0]; \ + int c = input_dim[1]; \ + int h = input_dim[2]; \ + int w = input_dim[3]; \ + param.y->Resize({n, h, w, c}); \ + auto output = param.y->template mutable_data(TARGET(kARM)); \ + if (c == 1) { \ + memcpy(output, input, sizeof(type) * n * h * w); \ + return; \ + } \ + lite::arm::math::NCHW2NHWC(n, c, h * w, input, output); + +#define NHWCTONCHW(type) \ + auto& param = this->template Param(); \ + auto input = param.x->template data(); \ + auto input_dim = param.x->dims(); \ + CHECK(input_dim.size() == 4) \ + << "NHWC to NCHW should guarantee that the input dims should be 4"; \ + int n = input_dim[0]; \ + int h = input_dim[1]; \ + int w = input_dim[2]; \ + int c = input_dim[3]; \ + param.y->Resize({n, c, h, w}); \ + auto output = param.y->template mutable_data(TARGET(kARM)); \ + if (c == 1) { \ + memcpy(output, input, sizeof(type) * n * h * w); \ + return; \ + } \ + lite::arm::math::NHWC2NCHW(n, c, h * w, input, output); + +template <> +void NCHWToNHWCCompute::Run() { + NCHWTONHWC(float); +} + +template <> +void NCHWToNHWCCompute::Run() { + NCHWTONHWC(int8_t); +} + +template <> +void NHWCToNCHWCompute::Run() { + NHWCTONCHW(float); +} + +template <> +void NHWCToNCHWCompute::Run() { + NHWCTONCHW(int8_t); +} + +} // namespace arm +} // namespace kernels +} // namespace lite +} // namespace paddle + +typedef paddle::lite::kernels::arm::NCHWToNHWCCompute + NCHW_fp32; +typedef paddle::lite::kernels::arm::NCHWToNHWCCompute + NCHW_int8; +typedef paddle::lite::kernels::arm::NHWCToNCHWCompute + NHWC_fp32; +typedef paddle::lite::kernels::arm::NHWCToNCHWCompute + NHWC_int8; + +REGISTER_LITE_KERNEL(layout, kARM, kFloat, kNCHW, NCHW_fp32, nchw2nhwc) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kFloat), + DATALAYOUT(kNHWC))}) + .Finalize(); + +REGISTER_LITE_KERNEL(layout, kARM, kFloat, kNCHW, NHWC_fp32, nhwc2nchw) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kFloat), + DATALAYOUT(kNHWC))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .Finalize(); + +REGISTER_LITE_KERNEL(layout, kARM, kInt8, kNCHW, NCHW_int8, int8_nchw2nhwc) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kInt8), + DATALAYOUT(kNCHW))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kInt8), + DATALAYOUT(kNHWC))}) + .Finalize(); + +REGISTER_LITE_KERNEL(layout, kARM, kInt8, 
kNCHW, NHWC_int8, int8_nhwc2nchw) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kInt8), + DATALAYOUT(kNHWC))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kInt8), + DATALAYOUT(kNCHW))}) + .Finalize(); + +REGISTER_LITE_KERNEL(layout_once, kARM, kFloat, kNCHW, NCHW_fp32, nchw2nhwc) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kFloat), + DATALAYOUT(kNHWC))}) + .Finalize(); + +REGISTER_LITE_KERNEL(layout_once, kARM, kFloat, kNCHW, NHWC_fp32, nhwc2nchw) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kFloat), + DATALAYOUT(kNHWC))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .Finalize(); + +REGISTER_LITE_KERNEL(layout_once, kARM, kInt8, kNCHW, NCHW_int8, int8_nchw2nhwc) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kInt8), + DATALAYOUT(kNCHW))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kInt8), + DATALAYOUT(kNHWC))}) + .Finalize(); + +REGISTER_LITE_KERNEL(layout_once, kARM, kInt8, kNCHW, NHWC_int8, int8_nhwc2nchw) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kInt8), + DATALAYOUT(kNHWC))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kInt8), + DATALAYOUT(kNCHW))}) + .Finalize(); diff --git a/lite/kernels/arm/layout_compute.h b/lite/kernels/arm/layout_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..13b8621029437ea18d960e9c22d53b7062983b8f --- /dev/null +++ b/lite/kernels/arm/layout_compute.h @@ -0,0 +1,43 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
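The layout kernels registered above all reduce to a single index permutation; lite::arm::math::NCHW2NHWC and NHWC2NCHW are optimized implementations of it, and the c == 1 shortcut in the macros is valid because the two layouts coincide for a single channel. A plain reference version of the NCHW-to-NHWC mapping (standalone sketch, not the library routine):

#include <cassert>
#include <vector>

// Reference NCHW -> NHWC transform: dst[n][h][w][c] = src[n][c][h][w].
template <typename T>
std::vector<T> nchw_to_nhwc_ref(int n, int c, int h, int w, const std::vector<T>& src) {
  std::vector<T> dst(src.size());
  for (int in = 0; in < n; ++in)
    for (int ic = 0; ic < c; ++ic)
      for (int ih = 0; ih < h; ++ih)
        for (int iw = 0; iw < w; ++iw)
          dst[((in * h + ih) * w + iw) * c + ic] = src[((in * c + ic) * h + ih) * w + iw];
  return dst;
}

int main() {
  // 1x2x2x2 NCHW tensor: channel 0 = {0,1,2,3}, channel 1 = {4,5,6,7}.
  std::vector<int> src = {0, 1, 2, 3, 4, 5, 6, 7};
  std::vector<int> dst = nchw_to_nhwc_ref(1, 2, 2, 2, src);
  assert((dst == std::vector<int>{0, 4, 1, 5, 2, 6, 3, 7}));
  return 0;
}

Note that the transform only needs n, c, and the spatial size, which is why the macros pass h * w to the library routine as a single argument.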
+ +#pragma once + +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace arm { +template +class NCHWToNHWCCompute : public KernelLite { + public: + using param_t = operators::LayoutParam; + void Run() override; + virtual ~NCHWToNHWCCompute() = default; +}; + +template +class NHWCToNCHWCompute : public KernelLite { + public: + using param_t = operators::LayoutParam; + void Run() override; + virtual ~NHWCToNCHWCompute() = default; +}; + +} // namespace arm +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/arm/lookup_table_compute.cc b/lite/kernels/arm/lookup_table_compute.cc index fa7e2c0c3ae4580f5d19e82f7c48c74db3058847..ba58b378f4dda22fd78ce76b80bdbca8d8f284a3 100644 --- a/lite/kernels/arm/lookup_table_compute.cc +++ b/lite/kernels/arm/lookup_table_compute.cc @@ -28,7 +28,6 @@ namespace arm { void LookupTableCompute::Run() { auto& param = this->Param(); - auto& ctx = this->ctx_->template As(); // inputs auto w = param.W; auto ids = param.Ids; @@ -37,7 +36,7 @@ void LookupTableCompute::Run() { auto table_dim = w->dims(); int64_t ids_numel = ids->numel(); - auto ids_data = ids->data(); + auto ids_data = ids->data(); int64_t row_number = table_dim[0]; int64_t row_width = table_dim[1]; @@ -76,3 +75,14 @@ REGISTER_LITE_KERNEL(lookup_table, .BindInput("Ids", {LiteType::GetTensorTy(TARGET(kARM))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) .Finalize(); + +REGISTER_LITE_KERNEL(lookup_table_v2, + kARM, + kFloat, + kNCHW, + paddle::lite::kernels::arm::LookupTableCompute, + def) + .BindInput("W", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("Ids", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) + .Finalize(); diff --git a/lite/kernels/arm/lookup_table_compute_test.cc b/lite/kernels/arm/lookup_table_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..78748edf39c43c5451f8fa3c4d63bde7405c7078 --- /dev/null +++ b/lite/kernels/arm/lookup_table_compute_test.cc @@ -0,0 +1,115 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/kernels/arm/lookup_table_compute.h" +#include +#include +#include +#include +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace arm { + +void lookup_table_compute_ref(const operators::LookupTableParam ¶m) { + auto *ids_t = param.Ids; + auto *output_t = param.Out; + int64_t padding_idx = param.padding_idx; + auto *ids = ids_t->data(); + int64_t ids_numel = ids_t->dims().production(); + + auto *table_t = param.W; + int64_t row_number = table_t->dims()[0]; + int64_t row_width = table_t->dims()[1]; + + auto *table = table_t->data(); + auto *output = output_t->mutable_data(); + memset(output, 0, output_t->dims().production() * sizeof(float)); + for (int64_t i = 0; i < ids_numel; ++i) { + if (padding_idx != -1 && ids[i] == padding_idx) { + memset(output + i * row_width, 0, row_width * sizeof(float)); + } else { + CHECK_LT(ids[i], row_number); + CHECK_GE(ids[i], 0); + memcpy(output + i * row_width, + table + ids[i] * row_width, + row_width * sizeof(float)); + } + } +} + +TEST(lookup_table_arm, retrieve_op) { + auto lookup_table = + KernelRegistry::Global().Create( + "lookup_table"); + ASSERT_FALSE(lookup_table.empty()); + ASSERT_TRUE(lookup_table.front()); +} + +TEST(lookup_table_arm, init) { + LookupTableCompute lookup_table; + ASSERT_EQ(lookup_table.precision(), PRECISION(kFloat)); + ASSERT_EQ(lookup_table.target(), TARGET(kARM)); +} + +TEST(lookup_table_arm, compute) { + LookupTableCompute lookup_table; + operators::LookupTableParam param; + lite::Tensor w, ids, out, out_ref; + int64_t padding_idx = -1; + + auto w_dim = DDim(std::vector({4, 5})); + auto ids_dim = DDim(std::vector({3, 2})); + auto out_dim = DDim(std::vector({3, 2, 5})); + + w.Resize(w_dim); + ids.Resize(ids_dim); + out.Resize(out_dim); + out_ref.Resize(out_dim); + + auto *w_data = w.mutable_data(); + auto *ids_data = ids.mutable_data(); + auto *out_data = out.mutable_data(); + auto *out_ref_data = out_ref.mutable_data(); + + int w_num = w_dim.production(); + for (int i = 0; i < w_num; i++) { + w_data[i] = static_cast(i + 1) / (w_num + 1); + } + int ids_num = ids_dim.production(); + for (int i = 0; i < ids_num; i++) { + ids_data[i] = i % 4; + } + int out_num = out_dim.production(); + + param.W = &w; + param.Ids = &ids; + param.Out = &out; + lookup_table.SetParam(param); + lookup_table.Run(); + param.Out = &out_ref; + lookup_table_compute_ref(param); + for (int i = 0; i < out_num; i++) { + EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-5); + } +} + +} // namespace arm +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(lookup_table, kARM, kFloat, kNCHW, def); diff --git a/lite/kernels/arm/lrn_compute.cc b/lite/kernels/arm/lrn_compute.cc index 18e6654282c8810a8310e540c2851fecb116f2d8..0476b1e6bde99e7993d1b0feb53ab10ba1b8f9b5 100644 --- a/lite/kernels/arm/lrn_compute.cc +++ b/lite/kernels/arm/lrn_compute.cc @@ -31,16 +31,16 @@ void LrnCompute::Run() { int channel = x_dims[1]; int h = x_dims[2]; int w = x_dims[3]; - const int local_size = param.local_size; + const int n = param.n; const float alpha = param.alpha; const float beta = param.beta; const float k = param.k; if (param.norm_region == "AcrossChannels") { lite::arm::math::compute_across_channels( - x_data, out_data, num, channel, h, w, local_size, alpha, beta, k); + x_data, out_data, num, channel, h, w, n, alpha, beta, k); } else { lite::arm::math::compute_within_channels( - x_data, out_data, num, channel, h, w, local_size, alpha, beta, k); + x_data, out_data, num, 
channel, h, w, n, alpha, beta, k); } } @@ -53,4 +53,5 @@ REGISTER_LITE_KERNEL( lrn, kARM, kFloat, kNCHW, paddle::lite::kernels::arm::LrnCompute, def) .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("MidOut", {LiteType::GetTensorTy(TARGET(kARM))}) .Finalize(); diff --git a/lite/kernels/arm/lrn_compute_test.cc b/lite/kernels/arm/lrn_compute_test.cc index 8e030006151c5834a68037800192ec7d9bc5d94d..e7030d00427e55c7faf333997cd90cba46260cd4 100644 --- a/lite/kernels/arm/lrn_compute_test.cc +++ b/lite/kernels/arm/lrn_compute_test.cc @@ -91,7 +91,7 @@ void lrn_compute_ref(const operators::LrnParam& param) { const dtype* x_data = param.X->data(); dtype* out_data = param.Out->mutable_data(); auto x_dims = param.X->dims(); - int local_size = param.local_size; + int local_size = param.n; float alpha = param.alpha; float beta = param.beta; float k = param.k; @@ -171,7 +171,7 @@ TEST(lrn_arm, compute) { } param.X = &x; param.Out = &output; - param.local_size = local_size; + param.n = local_size; param.alpha = alpha; param.beta = beta; param.k = k; diff --git a/lite/kernels/arm/matmul_compute.cc b/lite/kernels/arm/matmul_compute.cc index 29be34d0c273abe9fafb7d187cc6d443eefc2d55..d00a5bdc060431509b73b18336f41b3c688cbcf2 100644 --- a/lite/kernels/arm/matmul_compute.cc +++ b/lite/kernels/arm/matmul_compute.cc @@ -232,7 +232,7 @@ void MatMulCompute::Run() { int ldc = n_; if (n_ == 1) { lite::arm::math::sgemv( - x_data, y_data, o_data, false, m_, k_, false, nullptr, false); + x_data, y_data, o_data, false, m_, k_, false, nullptr, false, &ctx); if (fabsf(alpha - 1.f) > 1e-8f) { for (size_t i = 0; i < param.Out->dims().production(); ++i) { o_data[i] *= alpha; diff --git a/lite/kernels/arm/mul_compute.cc b/lite/kernels/arm/mul_compute.cc index fa43b6cf8e5d7418583d44d2ed9b6e49d128d2d6..debe9e907cadafd67e6be40f7e49ff12cb4d527e 100644 --- a/lite/kernels/arm/mul_compute.cc +++ b/lite/kernels/arm/mul_compute.cc @@ -48,14 +48,13 @@ void MulCompute::Run() { CHECK_EQ(x_w, y_h) << "x_w must be equal with y_h"; k_ = x_w; - + auto& ctx = this->ctx_->template As(); if (n_ == 1) { lite::arm::math::sgemv( - x_data, y_data, o_data, false, m_, k_, false, nullptr, false); + x_data, y_data, o_data, false, m_, k_, false, nullptr, false, &ctx); } else { constexpr bool is_tranposed_y = false; - auto& ctx = this->ctx_->template As(); int hblock = lite::arm::math::get_hblock(&ctx); int m_round = hblock * ((m_ + hblock - 1) / hblock); ctx.ExtendWorkspace(m_round * k_ * sizeof(float)); diff --git a/lite/kernels/arm/pool_compute.cc b/lite/kernels/arm/pool_compute.cc index 9f02a462a517077f662dcc952780b6e34bfb95a4..c9f0fed47854226327be86a02a9429a003fe4762 100644 --- a/lite/kernels/arm/pool_compute.cc +++ b/lite/kernels/arm/pool_compute.cc @@ -38,7 +38,7 @@ void PoolCompute::Run() { std::vector& ksize = param.ksize; std::vector& strides = param.strides; - std::vector& paddings = param.paddings; + std::vector& paddings = *param.paddings; std::string& pooling_type = param.pooling_type; bool global_pooling = param.global_pooling; @@ -48,12 +48,15 @@ void PoolCompute::Run() { bool use_quantizer = param.use_quantizer; std::string& data_format = param.data_format; - bool kps_equal = (ksize[0] == ksize[1]) && (strides[0] == strides[1]) && - (paddings[0] == paddings[1]); + bool pads_equal = + (paddings[0] == paddings[1]) && (paddings[2] == paddings[3]); + bool kps_equal = (ksize[0] == ksize[1]) && (strides[0] == strides[1]) && + (paddings[0] == paddings[2]); if 
(global_pooling) { for (size_t i = 0; i < ksize.size(); ++i) { - paddings[i] = 0; + paddings[2 * i] = 0; + paddings[2 * i + 1] = 0; ksize[i] = static_cast(in_dims[i + 2]); } if (pooling_type == "max") { @@ -80,7 +83,8 @@ void PoolCompute::Run() { return; } } else { - if (ksize[0] == 2 && strides[0] == 2 && paddings[0] == 0 && kps_equal) { + if (ksize[0] == 2 && strides[0] == 2 && paddings[0] == 0 && pads_equal && + kps_equal) { if (pooling_type == "max") { lite::arm::math::pooling2x2s2_max(din, dout, @@ -106,7 +110,7 @@ void PoolCompute::Run() { return; } } else if (ksize[0] == 3 && strides[0] == 1 && paddings[0] == 1 && - kps_equal) { + pads_equal && kps_equal) { if (pooling_type == "max") { lite::arm::math::pooling3x3s1p1_max(din, dout, @@ -132,7 +136,7 @@ void PoolCompute::Run() { return; } } else if (ksize[0] == 3 && strides[0] == 1 && paddings[0] == 0 && - kps_equal) { + pads_equal && kps_equal) { if (pooling_type == "max") { lite::arm::math::pooling3x3s1p0_max(din, dout, @@ -158,7 +162,7 @@ void PoolCompute::Run() { return; } } else if (ksize[0] == 3 && strides[0] == 2 && paddings[0] == 0 && - kps_equal) { + pads_equal && kps_equal) { if (pooling_type == "max") { lite::arm::math::pooling3x3s2p0_max(din, dout, @@ -184,7 +188,7 @@ void PoolCompute::Run() { return; } } else if (ksize[0] == 3 && strides[0] == 2 && paddings[0] == 1 && - kps_equal) { + pads_equal && kps_equal) { if (pooling_type == "max") { lite::arm::math::pooling3x3s2p1_max(din, dout, diff --git a/lite/kernels/arm/pool_compute_test.cc b/lite/kernels/arm/pool_compute_test.cc index 79e5332172c9a488c83dd485f094250d71a1d5dc..7ed8a142dda06e2d1b8f9d8afdade0194d87d1e6 100644 --- a/lite/kernels/arm/pool_compute_test.cc +++ b/lite/kernels/arm/pool_compute_test.cc @@ -15,6 +15,7 @@ #include "lite/kernels/arm/pool_compute.h" #include #include +#include #include #include #include "lite/backends/arm/math/funcs.h" @@ -25,14 +26,21 @@ namespace lite { namespace kernels { namespace arm { -int PoolOutputSize( - int input_size, int filter_size, int padding, int stride, bool ceil_mode) { +int PoolOutputSize(int input_size, + int filter_size, + int pad_left, + int pad_right, + int stride, + bool ceil_mode) { int output_size; if (!ceil_mode) { - output_size = (input_size - filter_size + 2 * padding) / stride + 1; + output_size = + (input_size - filter_size + pad_left + pad_right) / stride + 1; } else { output_size = - (input_size - filter_size + 2 * padding + stride - 1) / stride + 1; + (input_size - filter_size + pad_left + pad_right + stride - 1) / + stride + + 1; } return output_size; } @@ -40,10 +48,12 @@ int PoolOutputSize( std::vector compute_output_shape(operators::PoolParam* param_) { const auto x_dims = param_->x->dims(); std::vector& ksize = param_->ksize; + auto paddings = *param_->paddings; if (param_->global_pooling) { ksize.resize(static_cast(x_dims.size()) - 2); for (size_t i = 0; i < ksize.size(); ++i) { - param_->paddings[i] = 0; + paddings[2 * i] = 0; + paddings[2 * i + 1] = 0; ksize[i] = static_cast(x_dims[i + 2]); } } @@ -56,7 +66,8 @@ std::vector compute_output_shape(operators::PoolParam* param_) { for (size_t i = 0; i < param_->ksize.size(); ++i) { output_shape.push_back(PoolOutputSize(x_dims[i + 2], param_->ksize[i], - param_->paddings[i], + paddings[2 * i], + paddings[2 * i + 1], param_->strides[i], param_->ceil_mode)); } @@ -73,7 +84,7 @@ void pool_compute_ref(const operators::PoolParam& param) { std::vector ksize = param.ksize; std::vector strides = param.strides; - std::vector paddings = param.paddings; + 
std::vector paddings = *param.paddings; std::string pooling_type = param.pooling_type; bool global_pooling = param.global_pooling; @@ -99,7 +110,7 @@ void pool_compute_ref(const operators::PoolParam& param) { int stride_h = strides[0]; int stride_w = strides[1]; int pad_h = paddings[0]; - int pad_w = paddings[1]; + int pad_w = paddings[2]; int size_channel_in = win * hin; int size_channel_out = wout * hout; if (global_pooling) { @@ -178,18 +189,22 @@ void pool_compute_ref(const operators::PoolParam& param) { int bh = kernel_h; int bw = kernel_w; if (ew == win) { - bw = sw + kernel_w >= win + pad_w ? win + pad_w - : sw + kernel_w; + bw = (sw + kernel_w) >= (win + paddings[3]) + ? (win + paddings[3]) + : (sw + kernel_w); bw -= sw; - if (sw - pad_w < 0 && sw + kernel_w > win + pad_w) { + if ((sw - pad_w) < 0 && + (sw + kernel_w) > (win + paddings[3])) { bw += pad_w; } } if (eh == hin) { - bh = sh + kernel_h >= hin + pad_h ? hin + pad_h - : sh + kernel_h; + bh = (sh + kernel_h) >= (hin + paddings[1]) + ? (hin + paddings[1]) + : (sh + kernel_h); bh -= sh; - if (sh - pad_h < 0 && sh + kernel_h > hin + pad_h) { + if ((sh - pad_h) < 0 && + (sh + kernel_h) > (hin + paddings[1])) { bh += pad_h; } } @@ -225,75 +240,92 @@ TEST(pool_arm, compute) { for (auto exclusive : {true, false}) { for (auto ksize : {2, 3}) { for (auto stride : {1, 2}) { - for (auto pad : {0, 1}) { - for (auto n : {1, 2}) { - for (auto c : {1, 3}) { + for (auto pad_left : {0, 1}) { + for (auto pad_right : {0, 1}) { + for (auto pad_top : {0, 1}) { + for (auto pad_bottom : {0, 1}) { + for (auto n : {1, 2}) { + for (auto c : {1, 3}) { #if 1 - for (auto h : {2, 3, 4, 11}) { - for (auto w : {2, 3, 4, 11}) { + for (auto h : {2, 3, 4, 11}) { + for (auto w : {2, 3, 4, 11}) { #else - for (int h = 2; h < 25; h++) { - for (int w = 2; w < 25; w++) { + for (int h = 2; h < 25; h++) { + for (int w = 2; w < 25; w++) { #endif - VLOG(3) << "n:" << n << " c:" << c << " h:" << h - << " w:" << w << " ksize:" << ksize - << " stride:" << stride << " pad:" << pad - << " exclusive:" << exclusive - << " global_pooling:" << global_pooling - << " ceil_mode: " << ceil_mode - << " pooling_type:" << pooling_type; + VLOG(3) << "n:" << n << " c:" << c << " h:" << h + << " w:" << w << " ksize:" << ksize + << " stride:" << stride + << " pad_left:" << pad_left + << " pad_right:" << pad_right + << " pad_top:" << pad_top + << " pad_bottom:" << pad_bottom + << " exclusive:" << exclusive + << " global_pooling:" << global_pooling + << " ceil_mode: " << ceil_mode + << " pooling_type:" << pooling_type; - // init x, output - x.Resize(DDim(std::vector({n, c, h, w}))); - auto* x_data = x.mutable_data(); - for (int i = 0; i < x.dims().production(); ++i) { - float sign = i % 3 == 0 ? -0.03 : 0.05f; - x_data[i] = sign * (i % 128); - } + // init x, output + x.Resize( + DDim(std::vector({n, c, h, w}))); + auto* x_data = x.mutable_data(); + for (int i = 0; i < x.dims().production(); ++i) { + float sign = i % 3 == 0 ? 
-0.03 : 0.05f; + x_data[i] = sign * (i % 128); + } - // fill param - param.x = &x; - param.output = &output; - param.pooling_type = pooling_type; - if (global_pooling) { - param.ksize = {h, w}; - } else { - param.ksize = {ksize, ksize}; - } - param.global_pooling = global_pooling; - param.strides = {stride, stride}; - param.paddings = {pad, pad}; - param.exclusive = exclusive; - param.ceil_mode = ceil_mode; - param.adaptive = false; - param.use_quantizer = false; + // fill param + param.x = &x; + param.output = &output; + param.pooling_type = pooling_type; + if (global_pooling) { + param.ksize = {h, w}; + } else { + param.ksize = {ksize, ksize}; + } + param.global_pooling = global_pooling; + param.strides = {stride, stride}; + std::vector paddings = { + pad_top, pad_bottom, pad_left, pad_right}; + param.exclusive = exclusive; + param.paddings = + std::make_shared>(paddings); + param.ceil_mode = ceil_mode; + param.adaptive = false; + param.use_quantizer = false; - const std::vector& output_shape = - compute_output_shape(¶m); - output.Resize(DDim(output_shape)); - output_ref.Resize(DDim(output_shape)); + const std::vector& output_shape = + compute_output_shape(¶m); + output.Resize(DDim(output_shape)); + output_ref.Resize(DDim(output_shape)); - auto* output_data = output.mutable_data(); - auto* output_ref_data = - output_ref.mutable_data(); - for (int i = 0; i < output.dims().production(); ++i) { - output_data[i] = -2; - output_ref_data[i] = -2; - } + auto* output_data = output.mutable_data(); + auto* output_ref_data = + output_ref.mutable_data(); + for (int i = 0; i < output.dims().production(); + ++i) { + output_data[i] = -2; + output_ref_data[i] = -2; + } - // compute - pool.SetParam(param); - pool.Run(); + // compute + pool.SetParam(param); + pool.Run(); - // compute ref - param.output = &output_ref; - pool_compute_ref(param); + // compute ref + param.output = &output_ref; + pool_compute_ref(param); - // compare - for (int i = 0; i < output.dims().production(); i++) { - EXPECT_NEAR(output_data[i], output_ref_data[i], 1e-4); + // compare + for (int i = 0; i < output.dims().production(); + i++) { + EXPECT_NEAR( + output_data[i], output_ref_data[i], 1e-4); + } + VLOG(3) << "compare pass"; + } + } } - VLOG(3) << "compare pass"; } } } diff --git a/lite/kernels/arm/split_compute.cc b/lite/kernels/arm/split_compute.cc index 27606e2d76dfd13161fffc3f53d614155f62254e..2a0c52e7fc44cdd7c36ac3e8f93b33731f03bd77 100644 --- a/lite/kernels/arm/split_compute.cc +++ b/lite/kernels/arm/split_compute.cc @@ -42,5 +42,9 @@ void SplitCompute::Run() { REGISTER_LITE_KERNEL( split, kARM, kFloat, kNCHW, paddle::lite::kernels::arm::SplitCompute, def) .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("AxisTensor", + {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))}) + .BindInput("SectionsTensorList", + {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) .Finalize(); diff --git a/lite/kernels/cuda/CMakeLists.txt b/lite/kernels/cuda/CMakeLists.txt index b33fc8f6bb0a5616ab87c01d55f9d81a9fe7032b..4bf1cbf5210214befb3620f8b7d70923f41f98f2 100644 --- a/lite/kernels/cuda/CMakeLists.txt +++ b/lite/kernels/cuda/CMakeLists.txt @@ -5,24 +5,39 @@ endif() message(STATUS "compile with lite CUDA kernels") add_kernel(mul_compute_cuda CUDA basic SRCS mul_compute.cc DEPS ${lite_kernel_deps} context) +add_kernel(search_group_padding_compute_cuda CUDA basic SRCS search_group_padding_compute.cu DEPS ${lite_kernel_deps}) add_kernel(io_copy_compute_cuda 
CUDA basic SRCS io_copy_compute.cc DEPS ${lite_kernel_deps}) add_kernel(leaky_relu_compute_cuda CUDA basic SRCS leaky_relu_compute.cu DEPS ${lite_kernel_deps}) add_kernel(relu_compute_cuda CUDA basic SRCS relu_compute.cu DEPS ${lite_kernel_deps}) add_kernel(yolo_box_compute_cuda CUDA basic SRCS yolo_box_compute.cu DEPS ${lite_kernel_deps}) +add_kernel(sequence_pool_compute_cuda CUDA extra SRCS sequence_pool_compute.cu DEPS ${lite_kernel_deps}) add_kernel(transpose_compute_cuda CUDA basic SRCS transpose_compute.cu DEPS ${lite_kernel_deps} ${math_cuda} cuda_transpose) add_kernel(nearest_interp_compute_cuda CUDA basic SRCS nearest_interp_compute.cu DEPS ${lite_kernel_deps}) add_kernel(conv2d_cuda CUDA basic SRCS conv_compute.cc DEPS ${lite_kernel_deps} ${math_cuda}) add_kernel(concat_compute_cuda CUDA basic SRCS concat_compute.cu DEPS ${lite_kernel_deps}) -add_kernel(elementwise_add_compute_cuda CUDA basic SRCS elementwise_add_compute.cu DEPS ${lite_kernel_deps} cuda_elementwise) +add_kernel(elementwise_compute_cuda CUDA basic SRCS elementwise_compute.cu DEPS ${lite_kernel_deps} cuda_elementwise) add_kernel(calib_compute_cuda CUDA basic SRCS calib_compute.cu DEPS ${lite_kernel_deps}) add_kernel(layout_compute_cuda CUDA basic SRCS layout_compute.cc DEPS ${lite_kernel_deps} cuda_transpose) add_kernel(feed_compute_cuda CUDA basic SRCS feed_compute.cc DEPS ${lite_kernel_deps}) add_kernel(scale_compute_cuda CUDA basic SRCS scale_compute.cc DEPS ${lite_kernel_deps} cuda_scale) add_kernel(dropout_compute_cuda CUDA basic SRCS dropout_compute.cc DEPS ${lite_kernel_deps} cuda_scale) add_kernel(softmax_compute_cuda CUDA basic SRCS softmax_compute.cu DEPS ${lite_kernel_deps}) -add_kernel(pool_compute_cuda CUDA basic SRCS pool_compute.cu DEPS ${lite_kernel_deps}) +add_kernel(pool_compute_cuda CUDA basic SRCS pool_compute.cu DEPS +${lite_kernel_deps} cudnn_pool) add_kernel(bilinear_interp_compute_cuda CUDA basic SRCS bilinear_interp_compute.cu DEPS ${lite_kernel_deps}) +add_kernel(search_seq_depadding_compute_cuda CUDA extra SRCS search_seq_depadding_compute.cu DEPS ${lite_kernel_deps}) +add_kernel(search_grnn_compute_cuda CUDA extra SRCS search_grnn_compute.cu DEPS ${lite_kernel_deps} cuda_gemm) +add_kernel(sequence_reverse_compute_cuda CUDA basic SRCS sequence_reverse_compute.cu DEPS ${lite_kernel_deps}) +add_kernel(sequence_concat_compute_cuda CUDA basic SRCS sequence_concat_compute.cu DEPS ${lite_kernel_deps}) +add_kernel(sequence_arithmetic_compute_cuda CUDA basic SRCS sequence_arithmetic_compute.cu DEPS ${lite_kernel_deps}) add_kernel(lookup_table_compute_cuda CUDA extra SRCS lookup_table_compute.cu DEPS ${lite_kernel_deps}) +add_kernel(attention_padding_mask_compute_cuda CUDA extra SRCS attention_padding_mask_compute.cu DEPS ${lite_kernel_deps}) +add_kernel(search_fc_compute_cuda CUDA basic SRCS search_fc_compute.cu DEPS ${lite_kernel_deps} ${math_cuda}) +add_kernel(sequence_topk_avg_pooling_compute_cuda CUDA basic SRCS sequence_topk_avg_pooling_compute.cu DEPS ${lite_kernel_deps}) +add_kernel(match_matrix_tensor_compute_cuda CUDA extra SRCS match_matrix_tensor_compute.cu DEPS ${lite_kernel_deps} cuda_gemm) +add_kernel(search_aligned_mat_mul_compute_cuda CUDA extra SRCS search_aligned_mat_mul_compute.cc DEPS ${lite_kernel_deps} cuda_batched_gemm) +add_kernel(search_seq_fc_compute_cuda CUDA extra SRCS search_seq_fc_compute.cu DEPS ${lite_kernel_deps} cuda_gemm) +add_kernel(var_conv_2d_compute_cuda CUDA basic SRCS var_conv_2d_compute.cu DEPS ${lite_kernel_deps} ${math_cuda}) 
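Each add_kernel entry above is paired with a kernel class and a REGISTER_LITE_KERNEL block in the corresponding .h/.cu, following the same pattern as the CUDA kernels added in this patch. A minimal skeleton for a hypothetical my_op kernel is sketched below; the op name, class name, and bound argument names are placeholders, and the KernelLite template arguments are a best-guess reconstruction of the usual TARGET/PRECISION pair rather than text taken from this patch:

// my_op_compute.h (hypothetical example)
#pragma once
#include "lite/core/kernel.h"

namespace paddle {
namespace lite {
namespace kernels {
namespace cuda {

class MyOpCompute : public KernelLite<TARGET(kCUDA), PRECISION(kFloat)> {
 public:
  // Defined in my_op_compute.cu; launches device work on the CUDA
  // context's exec_stream(), like the kernels in this patch.
  void Run() override;
  virtual ~MyOpCompute() = default;
};

}  // namespace cuda
}  // namespace kernels
}  // namespace lite
}  // namespace paddle

// In my_op_compute.cu, after defining Run():
// REGISTER_LITE_KERNEL(my_op, kCUDA, kFloat, kNCHW,
//                      paddle::lite::kernels::cuda::MyOpCompute, def)
//     .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))})
//     .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))})
//     .Finalize();

The matching build entries would follow the same form as the lines above, e.g. add_kernel(my_op_compute_cuda CUDA basic SRCS my_op_compute.cu DEPS ${lite_kernel_deps}) plus an nv_test() entry for its unit test.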
lite_cc_test(calib_compute_cuda_test SRCS calib_compute_cuda_test.cc DEPS calib_compute_cuda) nv_test(conv2d_cuda_test SRCS conv_compute_test.cc DEPS conv2d_cuda) @@ -31,13 +46,28 @@ nv_test(leaky_relu_compute_cuda_test SRCS leaky_relu_compute_test.cc DEPS leaky_ nv_test(relu_compute_cuda_test SRCS relu_compute_test.cc DEPS relu_compute_cuda) nv_test(yolo_box_compute_cuda_test SRCS yolo_box_compute_test.cc DEPS yolo_box_compute_cuda) nv_test(transpose_compute_cuda_test SRCS transpose_compute_test.cc DEPS transpose_compute_cuda) +nv_test(search_group_padding_compute_cuda_test SRCS search_group_padding_compute_test.cc DEPS search_group_padding_compute_cuda) nv_test(concat_compute_cuda_test SRCS concat_compute_test.cc DEPS concat_compute_cuda) -nv_test(elementwise_add_compute_cuda_test SRCS elementwise_add_compute_test.cc DEPS elementwise_add_compute_cuda) +nv_test(elementwise_compute_cuda_test SRCS elementwise_compute_test.cc DEPS elementwise_compute_cuda) nv_test(softmax_compute_cuda_test SRCS softmax_compute_test.cc DEPS softmax_compute_cuda) #nv_test(layout_cuda_test SRCS layout_compute_test.cc DEPS layout_compute_cuda) -nv_test(mul_compute_cuda_test SRCS mul_compute_test.cc DEPS mul_compute_cuda) +nv_test(mul_compute_cuda_test SRCS mul_compute_test.cc DEPS mul_compute_cuda) nv_test(dropout_compute_cuda_test SRCS dropout_compute_test.cc DEPS dropout_compute_cuda ) nv_test(bilinear_interp_compute_cuda_test SRCS bilinear_interp_compute_test.cc DEPS bilinear_interp_compute_cuda) +nv_test(pool_compute_cuda_test SRCS pool_compute_test.cc DEPS pool_compute_cuda) +nv_test(sequence_reverse_compute_cuda_test SRCS sequence_reverse_compute_test.cc DEPS sequence_reverse_compute_cuda) +nv_test(sequence_concat_compute_cuda_test SRCS sequence_concat_compute_test.cc DEPS sequence_concat_compute_cuda) +nv_test(attention_padding_mask_compute_cuda_test SRCS attention_padding_mask_compute_test.cc DEPS attention_padding_mask_compute_cuda) +nv_test(sequence_arithmetic_compute_cuda_test SRCS sequence_arithmetic_compute_test.cc DEPS sequence_arithmetic_compute_cuda) +nv_test(search_fc_test SRCS search_fc_compute_test.cc DEPS search_fc_compute_cuda sequence_topk_avg_pooling_compute_cuda) +nv_test(var_conv_2d_compute_cuda_test SRCS var_conv_2d_compute_test.cc DEPS var_conv_2d_compute_cuda) + if(LITE_BUILD_EXTRA) + nv_test(search_seq_depadding_compute_cuda_test SRCS search_seq_depadding_compute_test.cc DEPS search_seq_depadding_compute_cuda) + nv_test(match_matrix_tensor_compute_cuda_test SRCS match_matrix_tensor_compute_test.cc DEPS match_matrix_tensor_compute_cuda) + nv_test(search_grnn_compute_cuda_test SRCS search_grnn_compute_test.cc DEPS search_grnn_compute_cuda) + nv_test(sequence_pool_compute_cuda_test SRCS sequence_pool_compute_test.cc DEPS sequence_pool_compute_cuda) nv_test(lookup_table_compute_cuda_test SRCS lookup_table_compute_test.cc DEPS lookup_table_compute_cuda) + nv_test(search_aligned_mat_mul_compute_cuda_test SRCS search_aligned_mat_mul_compute_test.cc DEPS search_aligned_mat_mul_compute_cuda) + nv_test(search_seq_fc_compute_cuda_test SRCS search_seq_fc_compute_test.cc DEPS search_seq_fc_compute_cuda) endif() diff --git a/lite/kernels/cuda/attention_padding_mask_compute.cu b/lite/kernels/cuda/attention_padding_mask_compute.cu new file mode 100644 index 0000000000000000000000000000000000000000..fac73b1adc49fd90fbda33669aee53e4126a6649 --- /dev/null +++ b/lite/kernels/cuda/attention_padding_mask_compute.cu @@ -0,0 +1,162 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include "lite/core/op_registry.h" +#include "lite/core/target_wrapper.h" +#include "lite/kernels/cuda/attention_padding_mask_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +#define CUDA_NUM_THREADS 256 + +inline int CUDA_GET_BLOCKS(const int N) { + return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; +} + +#define CUDA_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ + i += blockDim.x * gridDim.x) + +template +__global__ void ker_attention_padding_mask(T* out_data, + const T* attn_data, + const int* src_offset, + const int attn_seq_num, + const int attn_seq_len, + const int src_seq_num, + const int src_seq_len, + const T* pad_begin_data, + const T mask, + const int count) { + CUDA_KERNEL_LOOP(tid, count) { + int src_word_id = tid % src_seq_len; + int tmp_tid = tid / src_seq_len; + int attn_seq_id = tmp_tid / attn_seq_len; + int attn_word_id = tmp_tid % attn_seq_len; + int src_seq_id = attn_seq_id % src_seq_num; + int cur_len = src_offset[src_seq_id + 1] - src_offset[src_seq_id]; + + int k = static_cast(pad_begin_data[src_seq_id]); + if (k < cur_len && + tid >= src_seq_len * (attn_seq_len * attn_seq_id + attn_word_id) + k && + tid < src_seq_len * (attn_seq_len * attn_seq_id + attn_word_id) + + cur_len) { + out_data[tid] = mask; + } else { + out_data[tid] = attn_data[tid]; + } + } +} + +void AttentionPaddingMaskCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->template As(); + auto stream = ctx.exec_stream(); + + auto attn = param.X; + auto src = param.Y; + const int count = attn->numel(); + auto attn_offset = attn->lod()[0]; + auto src_offset = src->lod()[0]; + const int attn_seq_num = attn_offset.size() - 1; + const int attn_seq_len = attn_offset[1]; + const int src_seq_num = src_offset.size() - 1; + const int src_seq_len = count / attn->dims()[0]; + + auto out = param.Out; + out->Resize(attn->dims()); + out->set_lod(attn->lod()); + + auto attn_data = attn->data(); + auto out_data = out->mutable_data(TARGET(kCUDA)); + + std::vector src_cpu(src->numel(), 0); + TargetWrapperCuda::MemcpyAsync(src_cpu.data(), + src->data(), + sizeof(float) * src->numel(), + IoDirection::DtoH, + stream); + cudaStreamSynchronize(stream); + + std::vector pad_begin(src_seq_num, 0); + auto src_len = static_cast(src->lod()[0][1]); + int _pad_id = param.pad_id; + for (int i = 0; i < src_seq_num; ++i) { + const auto* src_data = src_cpu.data() + src_len * i; + int index = src_len - 1; + for (; index >= 0 && _pad_id == static_cast(src_data[index]); + --index) { + } + pad_begin[i] = static_cast(index + 1); + } + + param.pad_begin->Resize({static_cast(src_seq_num)}); + auto pad_begin_cuda_data = + param.pad_begin->mutable_data(TARGET(kCUDA)); + TargetWrapperCuda::MemcpyAsync(pad_begin_cuda_data, + pad_begin.data(), + sizeof(float) * src_seq_num, + IoDirection::HtoD, + stream); + + std::vector src_offset_cpu(src_offset.size(), 0); + for (int i = 0; i 
< src_offset.size(); i++) { + src_offset_cpu[i] = src_offset[i]; + } + + src_offset_cuda.Resize({static_cast(src_offset.size())}); + auto src_offset_cuda_data = src_offset_cuda.mutable_data(TARGET(kCUDA)); + TargetWrapperCuda::MemcpyAsync(src_offset_cuda_data, + src_offset_cpu.data(), + sizeof(int) * src_offset.size(), + IoDirection::HtoD, + stream); + + ker_attention_padding_mask< + float><<>>( + out_data, + attn_data, + src_offset_cuda_data, + attn_seq_num, + attn_seq_len, + src_seq_num, + src_seq_len, + pad_begin_cuda_data, + param.mask, + count); + + cudaError_t error = cudaGetLastError(); + if (error != cudaSuccess) LOG(ERROR) << cudaGetErrorString(error); +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(search_attention_padding_mask, + kCUDA, + kFloat, + kNCHW, + paddle::lite::kernels::cuda::AttentionPaddingMaskCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("pad_begin", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .Finalize(); diff --git a/lite/kernels/cuda/attention_padding_mask_compute.h b/lite/kernels/cuda/attention_padding_mask_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..57d8c269a1cdc1f6dcd59bf3399a835b64b6784c --- /dev/null +++ b/lite/kernels/cuda/attention_padding_mask_compute.h @@ -0,0 +1,38 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +class AttentionPaddingMaskCompute + : public KernelLite { + public: + using param_t = operators::AttentionPaddingMaskParam; + + void Run() override; + virtual ~AttentionPaddingMaskCompute() = default; + + private: + lite::Tensor src_offset_cuda; +}; + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/attention_padding_mask_compute_test.cc b/lite/kernels/cuda/attention_padding_mask_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..d11858350d6fd49ef12cfe5d7ebb7ed865ee51d7 --- /dev/null +++ b/lite/kernels/cuda/attention_padding_mask_compute_test.cc @@ -0,0 +1,134 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
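ker_attention_padding_mask above uses a grid-stride loop: CUDA_KERNEL_LOOP makes each thread handle elements tid, tid + blockDim.x * gridDim.x, and so on, so a launch of CUDA_GET_BLOCKS(count) blocks of CUDA_NUM_THREADS threads covers any element count. A standalone illustration of the same launch pattern applied to a trivial fill kernel (not code from this patch):

#include <cuda_runtime.h>

#define CUDA_NUM_THREADS 256
#define CUDA_KERNEL_LOOP(i, n)                                 \
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
       i += blockDim.x * gridDim.x)

inline int CUDA_GET_BLOCKS(const int N) {
  return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS;
}

// Each thread strides across the buffer, so `count` need not be a
// multiple of the block size or the grid size.
__global__ void fill_value(float* out, float value, int count) {
  CUDA_KERNEL_LOOP(tid, count) { out[tid] = value; }
}

int main() {
  const int count = 1000;  // deliberately not a multiple of 256
  float* d_out = nullptr;
  cudaMalloc(&d_out, count * sizeof(float));
  fill_value<<<CUDA_GET_BLOCKS(count), CUDA_NUM_THREADS>>>(d_out, 0.f, count);
  cudaDeviceSynchronize();
  cudaFree(d_out);
  return 0;
}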
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/cuda/attention_padding_mask_compute.h" +#include +#include +#include +#include +#include +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +void attention_padding_mask_ref( + const Tensor& x, + const Tensor& y, + Tensor* out, + Tensor* pad_begin, + const operators::AttentionPaddingMaskParam& param) { + auto attn_offset = x.lod()[0]; + auto src_offset = y.lod()[0]; + int attn_seq_num = attn_offset.size() - 1; + int src_seq_num = src_offset.size() - 1; + int attn_seq_len = attn_offset[1]; + int src_seq_len = x.dims()[1]; + CHECK_EQ(attn_seq_num % src_seq_num, 0); + + auto count = x.numel(); + auto attn_data = x.data(); + out->Resize(x.dims()); + out->set_lod(x.lod()); + auto out_data = out->mutable_data(); + memcpy(out_data, attn_data, count * sizeof(float)); + + for (int i = 0; i < attn_seq_num; ++i) { + for (int j = 0; j < attn_seq_len; ++j) { + auto tmp_out_data = out_data + src_seq_len * (attn_seq_len * i + j); + int src_seq_idx = i % src_seq_num; + int cur_len = src_offset[src_seq_idx + 1] - src_offset[src_seq_idx]; + for (int k = cur_len; k < src_seq_len; k++) { + tmp_out_data[k] = param.mask; + } + } + } +} + +void prepare_input(Tensor* x, const LoD& lod, int64_t dim2rd) { + std::vector x_dims{static_cast(lod[0].back()), dim2rd}; + x->Resize(x_dims); + x->set_lod(lod); + auto x_data = x->mutable_data(); + auto x_num = x->numel(); + for (int i = 0; i < x_num; i++) { + x_data[i] = (i - x_num) * 1.1; + } +} + +int get_max_len(const LoD& lod) { + int max_len = 0; + auto offset = lod[0]; + for (int i = 0; i < offset.size() - 1; i++) { + int cur_len = offset[i + 1] - offset[i]; + max_len = max_len < cur_len ? 
cur_len : max_len; + } + return max_len; +} + +TEST(attention_padding_mask_cuda, run_test) { + lite::Tensor x, y, x_cpu, y_cpu; + lite::Tensor out, pad_begin, out_cpu, out_ref, pad_begin_ref; + + LoD x_lod{{0, 3, 6, 9, 12}}, y_lod{{0, 4, 6}}; + prepare_input(&x_cpu, x_lod, get_max_len(y_lod)); + prepare_input(&y_cpu, y_lod, 1); + + x.Resize(x_cpu.dims()); + x.set_lod(x_cpu.lod()); + auto x_cpu_data = x_cpu.mutable_data(); + x.Assign(x_cpu_data, x_cpu.dims()); + + y.Resize(y_cpu.dims()); + y.set_lod(y_cpu.lod()); + + operators::AttentionPaddingMaskParam param; + param.X = &x; + param.Y = &y; + param.pad_id = 12800001; + param.mask = -90000000.f; + param.Out = &out; + param.pad_begin = &pad_begin; + + std::unique_ptr ctx(new KernelContext); + auto context = ctx->As(); + cudaStream_t stream; + cudaStreamCreate(&stream); + context.SetExecStream(stream); + + AttentionPaddingMaskCompute attention_padding_mask_kernel; + attention_padding_mask_kernel.SetParam(param); + attention_padding_mask_kernel.SetContext(std::move(ctx)); + attention_padding_mask_kernel.Run(); + cudaDeviceSynchronize(); + + auto out_data = out.mutable_data(TARGET(kCUDA)); + out_cpu.Resize(out.dims()); + auto out_cpu_data = out_cpu.mutable_data(); + CopySync( + out_cpu_data, out_data, sizeof(float) * out.numel(), IoDirection::DtoH); + + attention_padding_mask_ref(x_cpu, y_cpu, &out_ref, &pad_begin_ref, param); + auto out_ref_data = out_ref.data(); + for (int i = 0; i < out.numel(); i++) { + EXPECT_NEAR(out_cpu_data[i], out_ref_data[i], 1e-5); + } +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/bilinear_interp_compute.cu b/lite/kernels/cuda/bilinear_interp_compute.cu index 7e1dbaf228c31d8123e48832e93e0180c4920359..00b14579383b67eccd65600869a156ba68d8cb09 100644 --- a/lite/kernels/cuda/bilinear_interp_compute.cu +++ b/lite/kernels/cuda/bilinear_interp_compute.cu @@ -11,6 +11,7 @@ limitations under the License. 
*/ #pragma once #include +#include "lite/backends/cuda/target_wrapper.h" #include "lite/core/op_registry.h" #include "lite/kernels/cuda/bilinear_interp_compute.h" @@ -20,6 +21,43 @@ namespace kernels { namespace cuda { using Tensor = lite::Tensor; +inline std::vector get_new_shape( + std::vector list_new_shape_tensor) { + // get tensor from + std::vector vec_new_shape; + for (size_t i = 0; i < list_new_shape_tensor.size(); ++i) { + auto tensor = list_new_shape_tensor[i]; + lite::Tensor temp; + auto temp_data = temp.mutable_data(); + auto tensor_data = tensor->data(); + cudaMemcpy(temp_data, + tensor_data, + tensor->dims().production() * sizeof(float), + cudaMemcpyDeviceToHost); + + vec_new_shape.push_back(static_cast(*temp_data)); + } + + return vec_new_shape; +} + +template +inline std::vector get_new_data_from_tensor(const Tensor* new_data_tensor) { + std::vector vec_new_data; + auto* new_data = new_data_tensor->data(); + lite::Tensor cpu_starts_tensor; + auto cpu_starts_tensor_data = cpu_starts_tensor.mutable_data(); + cudaMemcpy(cpu_starts_tensor_data, + new_data, + new_data_tensor->dims().production() * sizeof(T), + cudaMemcpyDeviceToHost); + + auto new_data_ = cpu_starts_tensor.data(); + vec_new_data = std::vector( + new_data_, new_data_ + new_data_tensor->dims().production()); + return vec_new_data; +} + template __global__ void BilinearInterp(const T* in, const size_t in_img_h, @@ -103,23 +141,35 @@ void BilinearInterpCompute::Run() { int out_w = param.out_w; float scale = param.scale; bool align_corners = param.align_corners; - if (scale > 0) { - out_h = static_cast(in_h * scale); - out_w = static_cast(in_w * scale); - } - if (out_size != nullptr) { - Tensor sizes; - float* size_data = sizes.mutable_data(); - float* outsize_data = out_size->mutable_data(TARGET(kCUDA)); - cudaMemcpy( - size_data, outsize_data, sizeof(float) * 2, cudaMemcpyDeviceToHost); - out_h = static_cast(size_data[0]); - out_w = static_cast(size_data[1]); + auto list_new_shape_tensor = param.SizeTensor; + if (list_new_shape_tensor.size() > 0) { + // have size tensor + auto new_size = get_new_shape(list_new_shape_tensor); + out_h = new_size[0]; + out_w = new_size[1]; + } else { + auto scale_tensor = param.Scale; + if (scale_tensor != nullptr) { + auto scale_data = get_new_data_from_tensor(scale_tensor); + scale = scale_data[0]; + } + if (scale > 0) { + out_h = static_cast(in_h * scale); + out_w = static_cast(in_w * scale); + } + if (out_size != nullptr) { + lite::Tensor sizes; + float* size_data = sizes.mutable_data(); + float* outsize_data = out_size->mutable_data(TARGET(kCUDA)); + cudaMemcpy( + size_data, outsize_data, sizeof(float) * 2, cudaMemcpyDeviceToHost); + out_h = static_cast(size_data[0]); + out_w = static_cast(size_data[1]); + } } auto output_data = output->mutable_data(TARGET(kCUDA)); - if (in_h == out_h && in_w == out_w) { cudaMemcpy(output_data, input_data, @@ -188,6 +238,14 @@ REGISTER_LITE_KERNEL(bilinear_interp, {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFloat), DATALAYOUT(kNCHW))}) + .BindInput("SizeTensor", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindInput("Scale", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFloat), diff --git a/lite/kernels/cuda/bilinear_interp_compute_test.cc b/lite/kernels/cuda/bilinear_interp_compute_test.cc index e7e8143150d2963fb4cb74c3530cfd6e125a454c..e93f5b1f3e8d6f5d93af2571a10d9fc531605f9c 100644 --- 
a/lite/kernels/cuda/bilinear_interp_compute_test.cc +++ b/lite/kernels/cuda/bilinear_interp_compute_test.cc @@ -16,6 +16,7 @@ #include #include #include +#include namespace paddle { namespace lite { @@ -98,6 +99,116 @@ TEST(bilinear_interp, normal) { } } +TEST(bilinear_interp, update) { + BilinearInterpCompute bilinear_interp_kernel; + std::unique_ptr ctx(new KernelContext); + auto& context = ctx->As(); + + operators::InterpolateParam param; + + std::vector size_tensor(2); + std::vector size_tensor_cpu(2), size_tensor_ref(2); + Tensor x, input_scale, osz, out; + Tensor x_cpu, input_scale_cpu, osz_cpu, out_cpu; + Tensor x_ref, input_scale_ref, osz_ref, out_ref; + + int n = 1, c = 1, in_h = 3, in_w = 3; + int out_h = 6, out_w = 6; + float scale = 2.0; + + param.out_h = out_h; + param.out_w = out_w; + param.scale = scale; + param.align_corners = false; + param.align_mode = 0; + + x.Resize({n, c, in_h, in_w}); + size_tensor[0].Resize({1}); + size_tensor[1].Resize({1}); + input_scale.Resize({1}); + osz.Resize({2}); + out.Resize({n, c, out_h, out_w}); + + x_cpu.Resize({n, c, in_h, in_w}); + size_tensor_cpu[0].Resize({1}); + size_tensor_cpu[1].Resize({1}); + input_scale_cpu.Resize({1}); + osz_cpu.Resize({2}); + out_cpu.Resize({n, c, out_h, out_w}); + + x_ref.Resize({n, c, in_h, in_w}); + size_tensor_ref[0].Resize({1}); + size_tensor_ref[1].Resize({1}); + input_scale_ref.Resize({1}); + osz_ref.Resize({2}); + out_ref.Resize({n, c, out_h, out_w}); + + auto* out_data = out.mutable_data(TARGET(kCUDA)); + + float* x_cpu_data = x_cpu.mutable_data(); + float* size_tensor0_cpu_data = size_tensor_cpu[0].mutable_data(); + float* size_tensor1_cpu_data = size_tensor_cpu[1].mutable_data(); + float* input_scale_cpu_data = input_scale_cpu.mutable_data(); + float* osz_cpu_data = osz_cpu.mutable_data(); + float* out_cpu_data = out_cpu.mutable_data(); + + float* x_ref_data = x_ref.mutable_data(); + float* size_tensor0_ref_data = size_tensor_ref[0].mutable_data(); + float* size_tensor1_ref_data = size_tensor_ref[1].mutable_data(); + float* input_scale_ref_data = input_scale_ref.mutable_data(); + float* osz_ref_data = osz_ref.mutable_data(); + + for (int i = 0; i < x_cpu.numel(); ++i) { + x_cpu_data[i] = i + 5.0; + x_ref_data[i] = i + 5.0; + } + + osz_cpu_data[0] = out_h; + osz_cpu_data[1] = out_w; + size_tensor0_cpu_data[0] = out_h; + size_tensor1_cpu_data[0] = out_w; + input_scale_cpu_data[0] = scale; + osz_ref_data[0] = out_h; + osz_ref_data[1] = out_w; + size_tensor0_ref_data[0] = out_h; + size_tensor1_ref_data[0] = out_w; + input_scale_ref_data[0] = scale; + + x.Assign(x_cpu_data, x_cpu.dims()); + size_tensor[0].Assign( + size_tensor0_cpu_data, size_tensor[0].dims()); + size_tensor[1].Assign( + size_tensor1_cpu_data, size_tensor[1].dims()); + input_scale.Assign(input_scale_cpu_data, + input_scale.dims()); + osz.Assign(osz_cpu_data, osz_cpu.dims()); + + param.X = &x; + param.SizeTensor.emplace_back( + reinterpret_cast(&size_tensor[0])); + param.SizeTensor.emplace_back( + reinterpret_cast(&size_tensor[1])); + param.Scale = &input_scale; + param.OutSize = &osz; + param.Out = &out; + + bilinear_interp_kernel.SetParam(param); + + cudaStream_t stream; + cudaStreamCreate(&stream); + context.SetExecStream(stream); + + bilinear_interp_kernel.SetContext(std::move(ctx)); + bilinear_interp_kernel.Launch(); + cudaDeviceSynchronize(); + + CopySync( + out_cpu_data, out_data, sizeof(float) * out.numel(), IoDirection::DtoH); + for (int i = 0; i < out.numel(); i++) { + LOG(INFO) << out_cpu_data[i]; + } +} + } // namespace cuda } 
// namespace kernels } // namespace lite diff --git a/lite/kernels/cuda/calib_compute_cuda_test.cc b/lite/kernels/cuda/calib_compute_cuda_test.cc index 8703d8730a1880b5b93502e5095b1a17d03bee6c..fdb47f7dd3c2e6d8f82e0281b81b24ebe444909a 100644 --- a/lite/kernels/cuda/calib_compute_cuda_test.cc +++ b/lite/kernels/cuda/calib_compute_cuda_test.cc @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "lite/kernels/cuda/calib_compute.h" #include #include #include @@ -58,12 +59,7 @@ void calib_ref(const operators::CalibParam& param, bool to_float = true) { } TEST(calib_cuda, int8_to_fp32) { - LOG(INFO) << "to get kernel ..."; - auto kernels = KernelRegistry::Global().Create( - "calib", TARGET(kCUDA), PRECISION(kInt8), DATALAYOUT(kNCHW)); - ASSERT_FALSE(kernels.empty()); - auto calib = std::move(*std::next(kernels.begin(), 1)); - LOG(INFO) << "get kernel: " << calib->doc(); + CalibComputeInt8ToFp32 calib; const int n = 64, c = 32, h = 18, w = 18; Tensor x; Tensor x_cpu; @@ -87,14 +83,14 @@ TEST(calib_cuda, int8_to_fp32) { cudaStream_t stream; cudaStreamCreate(&stream); context.SetExecStream(stream); - calib->SetContext(std::move(ctx)); + calib.SetContext(std::move(ctx)); operators::CalibParam param; param.scale = 0.013f; param.input = &x; param.output = &output; - calib->SetParam(param); - calib->Launch(); + calib.SetParam(param); + calib.Launch(); cudaDeviceSynchronize(); // invoking ref implementation and compare results param.input = &x_cpu; @@ -113,12 +109,7 @@ TEST(calib_cuda, int8_to_fp32) { } TEST(calib_cuda, fp32_to_int8) { - LOG(INFO) << "to get kernel ..."; - auto kernels = KernelRegistry::Global().Create( - "calib", TARGET(kCUDA), PRECISION(kInt8), DATALAYOUT(kNCHW)); - ASSERT_FALSE(kernels.empty()); - auto calib = std::move(kernels.front()); - LOG(INFO) << "get kernel: " << calib->doc(); + CalibComputeFp32ToInt8 calib; const int n = 64, c = 32, h = 18, w = 18; Tensor x; Tensor x_cpu; @@ -142,14 +133,14 @@ TEST(calib_cuda, fp32_to_int8) { cudaStream_t stream; cudaStreamCreate(&stream); context.SetExecStream(stream); - calib->SetContext(std::move(ctx)); + calib.SetContext(std::move(ctx)); operators::CalibParam param; param.scale = 0.013f; param.input = &x; param.output = &output; - calib->SetParam(param); - calib->Launch(); + calib.SetParam(param); + calib.Launch(); cudaDeviceSynchronize(); // invoking ref implementation and compare results param.input = &x_cpu; diff --git a/lite/kernels/cuda/concat_compute.cu b/lite/kernels/cuda/concat_compute.cu index 9ec693667252e76a99f305bc3ec9d6062cc2e840..72d0af459b26b6864dd308fece7131224b212a05 100644 --- a/lite/kernels/cuda/concat_compute.cu +++ b/lite/kernels/cuda/concat_compute.cu @@ -51,9 +51,9 @@ void ConcatCompute::Run() { Tensor* output = param.output; auto* output_data = output->mutable_data(TARGET(kCUDA)); int axis = param.axis; - auto* axis_tensor = param.axis_tensor; + Tensor* axis_tensor = param.axis_tensor; if (axis_tensor != nullptr) { - auto* axis_tensor_data = axis_tensor->data(); + const int* axis_tensor_data = axis_tensor->data(); axis = axis_tensor_data[0]; } int inner_size = 1; diff --git a/lite/kernels/cuda/conv_compute.cc b/lite/kernels/cuda/conv_compute.cc index eea81602ddf94158250aecf01fe5e95193bf58c1..468ed0cbd06a1b20596cef9ba8a7f0998de7fe73 100644 --- a/lite/kernels/cuda/conv_compute.cc +++ b/lite/kernels/cuda/conv_compute.cc @@ -21,10 +21,14 @@ namespace lite { namespace kernels { namespace cuda { -inline int ConvOutputSize( - int input_size, 
int filter_size, int dilation, int padding, int stride) { +inline int ConvOutputSize(int input_size, + int filter_size, + int dilation, + int pad_left, + int pad_right, + int stride) { const int dkernel = dilation * (filter_size - 1) + 1; - int output_size = (input_size + 2 * padding - dkernel) / stride + 1; + int output_size = (input_size + pad_left + pad_right - dkernel) / stride + 1; CHECK_GT_OR_FALSE(output_size, 0); return output_size; @@ -50,11 +54,15 @@ void ConvComputeInt8::PrepareForRun() { const auto filter_dims = param.filter->dims(); std::vector output_shape({in_dims[0]}); + auto paddings = *param.paddings; + auto dilations = *param.dilations; + for (size_t i = 0; i < param.strides.size(); ++i) { output_shape.push_back(ConvOutputSize(in_dims[i + 1], filter_dims[i + 1], - param.dilations[i], - param.paddings[i], + dilations[i], + paddings[2 * i], + paddings[2 * i + 1], param.strides[i])); } output_shape.push_back(filter_dims[0]); @@ -71,12 +79,15 @@ void ConvComputeInt8::Run() { const auto in_dims = param.x->dims(); const auto filter_dims = param.filter->dims(); std::vector output_shape({in_dims[0]}); + auto paddings = *param.paddings; + auto dilations = *param.dilations; for (size_t i = 0; i < param.strides.size(); ++i) { output_shape.push_back(ConvOutputSize(in_dims[i + 1], filter_dims[i + 1], - param.dilations[i], - param.paddings[i], + dilations[i], + paddings[2 * i], + paddings[2 * i + 1], param.strides[i])); } output_shape.push_back(filter_dims[0]); diff --git a/lite/kernels/cuda/conv_compute_test.cc b/lite/kernels/cuda/conv_compute_test.cc index 05175a0debcd687a2e5e06fa799839ad52c50adb..2ebd7e33baf8e12cfce24661f186382152b6bb89 100644 --- a/lite/kernels/cuda/conv_compute_test.cc +++ b/lite/kernels/cuda/conv_compute_test.cc @@ -41,7 +41,10 @@ TEST(conv_compute, fp32) { act_param.Leaky_relu_alpha = 0.1; operators::ConvParam param; param.activation_param = act_param; - param.paddings = {1, 1}; + std::vector pads = {1, 1, 1, 1}; + std::vector dilations = {1, 1, 1, 1}; + param.paddings = std::make_shared>(pads); + param.dilations = std::make_shared>(dilations); param.groups = 1; Tensor x, filter, bias, y, x_cpu, filter_cpu, bias_cpu, y_cpu; @@ -148,6 +151,10 @@ TEST(conv_compute, int8) { bias.Assign(bias_cpu_data, filter_cpu.dims()); + std::vector pads = {0, 0, 0, 0}; + std::vector dilations = {1, 1, 1, 1}; + param.paddings = std::make_shared>(pads); + param.dilations = std::make_shared>(dilations); param.x = &x; param.filter = &filter; param.output = &y; @@ -202,12 +209,10 @@ TEST(conv_compute, int8_int8_out) { std::cout << "input" << std::endl; for (int i = 0; i < x_cpu.numel(); i++) { x_cpu_data[i] = static_cast(random(-36, 36)); - std::cout << float(x_cpu_data[i]) << std::endl; } std::cout << "filter" << std::endl; for (int i = 0; i < filter_cpu.numel(); i++) { filter_cpu_data[i] = static_cast(random(-10, 10)); - std::cout << float(filter_cpu_data[i]) << std::endl; } for (int i = 0; i < bias_cpu.numel(); i++) { bias_cpu_data[i] = i + 1.0; @@ -220,6 +225,10 @@ TEST(conv_compute, int8_int8_out) { bias.Assign(bias_cpu_data, filter_cpu.dims()); + std::vector pads = {0, 0, 0, 0}; + std::vector dilations = {1, 1, 1, 1}; + param.paddings = std::make_shared>(pads); + param.dilations = std::make_shared>(dilations); param.x = &x; param.filter = &filter; param.output = &y; diff --git a/lite/kernels/cuda/elementwise_compute.cu b/lite/kernels/cuda/elementwise_compute.cu new file mode 100644 index 0000000000000000000000000000000000000000..64759f86f5df85f9855b9c1f186bbc9c039a044c --- 
/dev/null +++ b/lite/kernels/cuda/elementwise_compute.cu @@ -0,0 +1,318 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include "lite/backends/cuda/math/elementwise.h" +#include "lite/core/op_registry.h" +#include "lite/kernels/cuda/elementwise_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +inline DDim trim_trailing_singular_dims(const DDim& dims) { + // Remove trailing dimensions of size 1 for y + auto actual_dims_size = dims.size(); + for (; actual_dims_size != 0; --actual_dims_size) { + if (dims[actual_dims_size - 1] != 1) break; + } + + std::vector trim_dims; + trim_dims.resize(actual_dims_size); + for (int i = 0; i < actual_dims_size; ++i) { + trim_dims[i] = dims[i]; + } + if (trim_dims.size() == 0) { + return DDim(); + } + return DDim(trim_dims); +} + +inline bool is_broadcast(const DDim& x_dims, + const DDim& y_dims, + int axis, + int* pre, + int* n, + int* post) { + if (axis < 0) { + axis = x_dims.size() - y_dims.size(); + } + DDim y_dim_trim = trim_trailing_singular_dims(y_dims); + axis = (y_dim_trim.size() == 0) ? x_dims.size() : axis; + if (x_dims.size() == y_dim_trim.size()) { + return false; + } + *pre = 1; + *n = 1; + *post = 1; + for (int i = 0; i < axis; ++i) { + (*pre) *= x_dims[i]; + } + for (int i = 0; i < y_dim_trim.size(); ++i) { + CHECK_EQ(x_dims[i + axis], y_dim_trim[i]) + << "Broadcast dimension mismatch."; + (*n) *= y_dim_trim[i]; + } + for (int i = axis + y_dim_trim.size(); i < x_dims.size(); ++i) { + (*post) *= x_dims[i]; + } + return true; +} + +#define ELEMENTWISE_COMPUTE(OP, WITH_RELU) \ + auto& param = this->Param(); \ + auto& ctx = this->ctx_->template As(); \ + auto stream = ctx.exec_stream(); \ + const lite::Tensor* x = param.X; \ + const lite::Tensor* y = param.Y; \ + lite::Tensor* out = param.Out; \ + int axis = param.axis; \ + auto* x_data = x->data(); \ + auto* y_data = y->data(); \ + auto out_data = out->mutable_data(TARGET(kCUDA)); \ + int pixel_num = x->numel(); \ + int pre = 1; \ + int n = pixel_num; \ + int post = 1; \ + if (WITH_RELU) { \ + if (is_broadcast(x->dims(), y->dims(), axis, &pre, &n, &post)) { \ + lite::cuda::math::elementwise_relu( \ + x_data, y_data, out_data, pre, n, post, OP, stream); \ + } else { \ + lite::cuda::math::elementwise_relu( \ + x_data, y_data, out_data, 1, pixel_num, 1, OP, stream); \ + } \ + } else { \ + if (is_broadcast(x->dims(), y->dims(), axis, &pre, &n, &post)) { \ + lite::cuda::math::elementwise( \ + x_data, y_data, out_data, pre, n, post, OP, stream); \ + } else { \ + lite::cuda::math::elementwise( \ + x_data, y_data, out_data, 1, pixel_num, 1, OP, stream); \ + } \ + } + +#define ELEMENTWISE_COMPUTE_NHWC(OP, WITH_RELU) \ + std::map pos_map = {{0, 0}, {1, 3}, {2, 1}, {3, 2}}; \ + auto& param = this->Param(); \ + auto& ctx = this->ctx_->template As(); \ + auto stream = ctx.exec_stream(); \ + const lite::Tensor* x = param.X; \ + const lite::Tensor* y = param.Y; \ + lite::Tensor* out = 
param.Out; \ + int axis = param.axis; \ + if (axis < 0) axis = x->dims().size() - y->dims().size(); \ + CHECK(axis >= 0) << "invalid axis of elementwise op"; \ + axis = pos_map[axis]; \ + auto* x_data = x->data(); \ + auto* y_data = y->data(); \ + auto out_data = out->mutable_data(TARGET(kCUDA)); \ + int pixel_num = x->numel(); \ + int pre = 1; \ + int n = pixel_num; \ + int post = 1; \ + if (WITH_RELU) { \ + if (is_broadcast(x->dims(), y->dims(), axis, &pre, &n, &post)) { \ + lite::cuda::math::elementwise_relu( \ + x_data, y_data, out_data, pre, n, post, OP, stream); \ + } else { \ + lite::cuda::math::elementwise_relu( \ + x_data, y_data, out_data, 1, pixel_num, 1, OP, stream); \ + } \ + } else { \ + if (is_broadcast(x->dims(), y->dims(), axis, &pre, &n, &post)) { \ + lite::cuda::math::elementwise( \ + x_data, y_data, out_data, pre, n, post, OP, stream); \ + } else { \ + lite::cuda::math::elementwise( \ + x_data, y_data, out_data, 1, pixel_num, 1, OP, stream); \ + } \ + } + +void ElementwiseAddCompute::Run() { + ELEMENTWISE_COMPUTE(lite::cuda::math::BinaryOperation::kADD, false) + cudaError_t error = cudaGetLastError(); + if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error); +} + +void ElementwiseAddComputeNHWC::Run() { + ELEMENTWISE_COMPUTE_NHWC(lite::cuda::math::BinaryOperation::kADD, false) + cudaError_t error = cudaGetLastError(); + if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error); +} + +void ElementwiseMulCompute::Run() { + ELEMENTWISE_COMPUTE(lite::cuda::math::BinaryOperation::kMUL, false) + cudaError_t error = cudaGetLastError(); + if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error); +} + +void ElementwiseMulComputeNHWC::Run() { + ELEMENTWISE_COMPUTE_NHWC(lite::cuda::math::BinaryOperation::kMUL, false) + cudaError_t error = cudaGetLastError(); + if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error); +} + +void ElementwiseAddReluCompute::Run() { + ELEMENTWISE_COMPUTE(lite::cuda::math::BinaryOperation::kADD, true) + cudaError_t error = cudaGetLastError(); + if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error); +} + +void ElementwiseAddReluComputeNHWC::Run() { + ELEMENTWISE_COMPUTE_NHWC(lite::cuda::math::BinaryOperation::kADD, true) + cudaError_t error = cudaGetLastError(); + if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error); +} + +void ElementwiseMulReluCompute::Run() { + ELEMENTWISE_COMPUTE(lite::cuda::math::BinaryOperation::kMUL, true) + cudaError_t error = cudaGetLastError(); + if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error); +} + +void ElementwiseMulReluComputeNHWC::Run() { + ELEMENTWISE_COMPUTE_NHWC(lite::cuda::math::BinaryOperation::kMUL, true) + cudaError_t error = cudaGetLastError(); + if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error); +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(elementwise_add, + kCUDA, + kFloat, + kNCHW, + paddle::lite::kernels::cuda::ElementwiseAddCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .Finalize(); + +REGISTER_LITE_KERNEL(elementwise_add, + kCUDA, + kFloat, + kNHWC, + paddle::lite::kernels::cuda::ElementwiseAddComputeNHWC, + nhwc_format) + .BindInput("X", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNHWC))}) + .BindInput("Y", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + 
DATALAYOUT(kNHWC))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNHWC))}) + .Finalize(); + +REGISTER_LITE_KERNEL(elementwise_mul, + kCUDA, + kFloat, + kNCHW, + paddle::lite::kernels::cuda::ElementwiseMulCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .Finalize(); + +REGISTER_LITE_KERNEL(elementwise_mul, + kCUDA, + kFloat, + kNHWC, + paddle::lite::kernels::cuda::ElementwiseMulComputeNHWC, + nhwc_format) + .BindInput("X", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNHWC))}) + .BindInput("Y", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNHWC))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNHWC))}) + .Finalize(); + +REGISTER_LITE_KERNEL(fusion_elementwise_add_activation, + kCUDA, + kFloat, + kNCHW, + paddle::lite::kernels::cuda::ElementwiseAddReluCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .Finalize(); + +REGISTER_LITE_KERNEL(fusion_elementwise_add_activation, + kCUDA, + kFloat, + kNHWC, + paddle::lite::kernels::cuda::ElementwiseAddReluComputeNHWC, + nhwc_format) + .BindInput("X", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNHWC))}) + .BindInput("Y", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNHWC))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNHWC))}) + .Finalize(); + +REGISTER_LITE_KERNEL(fusion_elementwise_mul_activation, + kCUDA, + kFloat, + kNCHW, + paddle::lite::kernels::cuda::ElementwiseMulReluCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .Finalize(); + +REGISTER_LITE_KERNEL(fusion_elementwise_mul_activation, + kCUDA, + kFloat, + kNHWC, + paddle::lite::kernels::cuda::ElementwiseMulReluComputeNHWC, + nhwc_format) + .BindInput("X", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNHWC))}) + .BindInput("Y", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNHWC))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNHWC))}) + .Finalize(); diff --git a/lite/kernels/cuda/elementwise_compute.h b/lite/kernels/cuda/elementwise_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..986a4db2272d9a6607090babd937747f861f49c7 --- /dev/null +++ b/lite/kernels/cuda/elementwise_compute.h @@ -0,0 +1,98 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
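As an aside on the broadcast path added above: is_broadcast collapses X into a [pre, n, post] view so that Y (after trimming trailing 1s) is applied along the middle extent. A minimal CPU sketch of the same indexing, using a hypothetical helper name and plain std::vector buffers (not part of the patch), mirroring ElementwiseBroadcastRef in the test file further below:

#include <vector>
// Broadcast-add y (length n) over x viewed as [pre, n, post].
static void broadcast_add_ref(const std::vector<float>& x,
                              const std::vector<float>& y,
                              std::vector<float>* out,
                              int pre, int n, int post) {
  for (int i = 0; i < pre * n * post; ++i) {
    int idx = (i / post) % n;  // which element of y this x element pairs with
    (*out)[i] = x[i] + y[idx];
  }
}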
+ +#pragma once +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +class ElementwiseAddCompute + : public KernelLite<TARGET(kCUDA), PRECISION(kFloat)> { + public: + using param_t = operators::ElementwiseParam; + + void Run() override; + virtual ~ElementwiseAddCompute() = default; +}; + +class ElementwiseAddComputeNHWC + : public KernelLite<TARGET(kCUDA), PRECISION(kFloat), DATALAYOUT(kNHWC)> { + public: + using param_t = operators::ElementwiseParam; + + void Run() override; + virtual ~ElementwiseAddComputeNHWC() = default; +}; + +class ElementwiseMulCompute + : public KernelLite<TARGET(kCUDA), PRECISION(kFloat)> { + public: + using param_t = operators::ElementwiseParam; + + void Run() override; + virtual ~ElementwiseMulCompute() = default; +}; + +class ElementwiseMulComputeNHWC + : public KernelLite<TARGET(kCUDA), PRECISION(kFloat), DATALAYOUT(kNHWC)> { + public: + using param_t = operators::ElementwiseParam; + + void Run() override; + virtual ~ElementwiseMulComputeNHWC() = default; +}; + +class ElementwiseAddReluCompute + : public KernelLite<TARGET(kCUDA), PRECISION(kFloat)> { + public: + using param_t = operators::FusionElementwiseActivationParam; + + void Run() override; + virtual ~ElementwiseAddReluCompute() = default; +}; + +class ElementwiseAddReluComputeNHWC + : public KernelLite<TARGET(kCUDA), PRECISION(kFloat), DATALAYOUT(kNHWC)> { + public: + using param_t = operators::FusionElementwiseActivationParam; + + void Run() override; + virtual ~ElementwiseAddReluComputeNHWC() = default; +}; + +class ElementwiseMulReluCompute + : public KernelLite<TARGET(kCUDA), PRECISION(kFloat)> { + public: + using param_t = operators::FusionElementwiseActivationParam; + + void Run() override; + virtual ~ElementwiseMulReluCompute() = default; +}; + +class ElementwiseMulReluComputeNHWC + : public KernelLite<TARGET(kCUDA), PRECISION(kFloat), DATALAYOUT(kNHWC)> { + public: + using param_t = operators::FusionElementwiseActivationParam; + + void Run() override; + virtual ~ElementwiseMulReluComputeNHWC() = default; +}; + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/elementwise_compute_test.cc b/lite/kernels/cuda/elementwise_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..9fd0b7754f2d3209137b5f4862dfe1e90279f3be --- /dev/null +++ b/lite/kernels/cuda/elementwise_compute_test.cc @@ -0,0 +1,252 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
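The NHWC kernels declared above remap the broadcast axis from NCHW order through the pos_map table in ELEMENTWISE_COMPUTE_NHWC before computing the (pre, n, post) extents. A small sketch of that remapping, with a hypothetical helper name (not part of the patch):

#include <map>
// NCHW axis index -> NHWC axis index, as in the pos_map used by the macro.
static int remap_axis_nchw_to_nhwc(int axis, int x_rank, int y_rank) {
  std::map<int, int> pos_map = {{0, 0}, {1, 3}, {2, 1}, {3, 2}};
  if (axis < 0) axis = x_rank - y_rank;  // same default as the kernel
  return pos_map[axis];
}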
+ +#include "lite/kernels/cuda/elementwise_compute.h" +#include +#include +#include +#include "lite/api/test_helper.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +using Tensor = lite::Tensor; + +static void ElementwiseAddRef(float* x, float* y, float* out, int num) { + for (int i = 0; i < num; ++i) { + out[i] = x[i] + y[i]; + } +} + +static void ElementwiseBroadcastRef( + float* x, float* y, float* out, int pre, int n, int post) { + for (int i = 0; i < pre * n * post; ++i) { + int idx = (i / post) % n; + out[i] = x[i] + y[idx]; + } +} + +TEST(elementwise_add, normal) { + ElementwiseAddCompute elementwise_add_kernel; + std::unique_ptr ctx(new KernelContext); + auto& context = ctx->As(); + + operators::ElementwiseParam param; + Tensor x, y, out; + Tensor x_cpu, y_cpu, out_cpu; + Tensor x_ref, y_ref, out_ref; + + const int n = 1; + const int c = 3; + const int h = 2000; + const int w = 2000; + + x.Resize({n, c, h, w}); + y.Resize({n, c, h, w}); + out.Resize({n, c, h, w}); + x_cpu.Resize({n, c, h, w}); + y_cpu.Resize({n, c, h, w}); + out_cpu.Resize({n, c, h, w}); + x_ref.Resize({n, c, h, w}); + y_ref.Resize({n, c, h, w}); + out_ref.Resize({n, c, h, w}); + + auto* out_data = out.mutable_data(TARGET(kCUDA)); + + auto* x_cpu_data = x_cpu.mutable_data(); + auto* y_cpu_data = y_cpu.mutable_data(); + auto* out_cpu_data = out_cpu.mutable_data(); + + auto* x_ref_data = x_ref.mutable_data(); + auto* y_ref_data = y_ref.mutable_data(); + auto* out_ref_data = out_ref.mutable_data(); + + for (int i = 0; i < x_cpu.numel(); ++i) { + x_cpu_data[i] = i + 5.0; + x_ref_data[i] = i + 5.0; + } + for (int i = 0; i < y_cpu.numel(); ++i) { + y_cpu_data[i] = i - 5.0; + y_ref_data[i] = i - 5.0; + } + + x.Assign(x_cpu_data, x_cpu.dims()); + y.Assign(y_cpu_data, y_cpu.dims()); + + param.X = &x; + param.Y = &y; + param.Out = &out; + elementwise_add_kernel.SetParam(param); + + cudaStream_t stream; + cudaStreamCreate(&stream); + context.SetExecStream(stream); + + elementwise_add_kernel.SetContext(std::move(ctx)); + elementwise_add_kernel.Launch(); + cudaDeviceSynchronize(); + + CopySync( + out_cpu_data, out_data, sizeof(float) * out.numel(), IoDirection::DtoH); + ElementwiseAddRef(x_ref_data, y_ref_data, out_ref_data, out.numel()); + for (int i = 0; i < out.numel(); i++) { + EXPECT_NEAR(out_cpu_data[i], out_ref_data[i], 1e-5); + } +} + +TEST(elementwise_add, bias) { + ElementwiseAddCompute elementwise_add_kernel; + std::unique_ptr ctx(new KernelContext); + auto& context = ctx->As(); + + operators::ElementwiseParam param; + Tensor x, y, out; + Tensor x_cpu, y_cpu, out_cpu; + Tensor x_ref, y_ref, out_ref; + + const int n = 1; + const int c = 3; + const int h = 2000; + const int w = 2000; + + x.Resize({n, c, h, w}); + y.Resize({c, 1, 1}); + out.Resize({n, c, h, w}); + x_cpu.Resize({n, c, h, w}); + y_cpu.Resize({c, 1, 1}); + out_cpu.Resize({n, c, h, w}); + x_ref.Resize({n, c, h, w}); + y_ref.Resize({c, 1, 1}); + out_ref.Resize({n, c, h, w}); + + auto* out_data = out.mutable_data(TARGET(kCUDA)); + + auto* x_cpu_data = x_cpu.mutable_data(); + auto* y_cpu_data = y_cpu.mutable_data(); + auto* out_cpu_data = out_cpu.mutable_data(); + + auto* x_ref_data = x_ref.mutable_data(); + auto* y_ref_data = y_ref.mutable_data(); + auto* out_ref_data = out_ref.mutable_data(); + + for (int i = 0; i < x_cpu.numel(); ++i) { + x_cpu_data[i] = i + 5.0; + x_ref_data[i] = i + 5.0; + } + for (int i = 0; i < y_cpu.numel(); ++i) { + y_cpu_data[i] = i - 5.0; + y_ref_data[i] = i - 5.0; + } + + x.Assign(x_cpu_data, 
x_cpu.dims()); + y.Assign(y_cpu_data, y_cpu.dims()); + + param.X = &x; + param.Y = &y; + param.Out = &out; + param.axis = -1; + elementwise_add_kernel.SetParam(param); + + cudaStream_t stream; + cudaStreamCreate(&stream); + context.SetExecStream(stream); + + elementwise_add_kernel.SetContext(std::move(ctx)); + elementwise_add_kernel.Launch(); + cudaDeviceSynchronize(); + + CopySync( + out_cpu_data, out_data, sizeof(float) * out.numel(), IoDirection::DtoH); + ElementwiseBroadcastRef(x_ref_data, y_ref_data, out_ref_data, n, c, h * w); + for (int i = 0; i < out.numel(); i++) { + EXPECT_NEAR(out_cpu_data[i], out_ref_data[i], 1e-5); + } +} + +TEST(elementwise_add_nhwc, bias) { + ElementwiseAddComputeNHWC elementwise_add_kernel; + std::unique_ptr ctx(new KernelContext); + auto& context = ctx->As(); + + operators::ElementwiseParam param; + Tensor x, y, out; + Tensor x_cpu, y_cpu, out_cpu; + Tensor x_ref, y_ref, out_ref; + + const int n = 1; + const int c = 3; + const int h = 2000; + const int w = 2000; + + x.Resize({n, h, w, c}); + y.Resize({c, 1, 1}); + out.Resize({n, h, w, c}); + x_cpu.Resize({n, h, w, c}); + y_cpu.Resize({c, 1, 1}); + out_cpu.Resize({n, h, w, c}); + x_ref.Resize({n, h, w, c}); + y_ref.Resize({c, 1, 1}); + out_ref.Resize({n, h, w, c}); + + auto* out_data = out.mutable_data(TARGET(kCUDA)); + + auto* x_cpu_data = x_cpu.mutable_data(); + auto* y_cpu_data = y_cpu.mutable_data(); + auto* out_cpu_data = out_cpu.mutable_data(); + + auto* x_ref_data = x_ref.mutable_data(); + auto* y_ref_data = y_ref.mutable_data(); + auto* out_ref_data = out_ref.mutable_data(); + + for (int i = 0; i < x_cpu.numel(); ++i) { + x_cpu_data[i] = i + 5.0; + x_ref_data[i] = i + 5.0; + } + for (int i = 0; i < y_cpu.numel(); ++i) { + y_cpu_data[i] = i - 5.0; + y_ref_data[i] = i - 5.0; + } + + x.Assign(x_cpu_data, x_cpu.dims()); + y.Assign(y_cpu_data, y_cpu.dims()); + + param.X = &x; + param.Y = &y; + param.Out = &out; + param.axis = -1; + elementwise_add_kernel.SetParam(param); + + cudaStream_t stream; + cudaStreamCreate(&stream); + context.SetExecStream(stream); + + elementwise_add_kernel.SetContext(std::move(ctx)); + elementwise_add_kernel.Launch(); + cudaDeviceSynchronize(); + + CopySync( + out_cpu_data, out_data, sizeof(float) * out.numel(), IoDirection::DtoH); + ElementwiseBroadcastRef( + x_ref_data, y_ref_data, out_ref_data, n * h * w, c, 1); + for (int i = 0; i < out.numel(); i++) { + EXPECT_NEAR(out_cpu_data[i], out_ref_data[i], 1e-5); + } +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/feed_compute.cc b/lite/kernels/cuda/feed_compute.cc index cffa8a573d9b12b52ae1448632a56e40cea35b95..e54c5b9b035ab63c1356343ec671f5e968fd479b 100644 --- a/lite/kernels/cuda/feed_compute.cc +++ b/lite/kernels/cuda/feed_compute.cc @@ -20,21 +20,22 @@ namespace lite { namespace kernels { namespace cuda { -void FeedCompute::Run() { - auto& param = this->Param(); +template +void FeedCompute::Run() { + auto& param = this->template Param(); auto& ctx = this->ctx_->template As(); auto stream = ctx.exec_stream(); VLOG(4) << "feed_list.size: " << param.feed_list->size(); const lite::Tensor& feed_item = (*param.feed_list)[param.col]; int num = static_cast(feed_item.numel()); - auto input = feed_item.data(); + auto input = feed_item.data(); param.out->Resize(feed_item.dims()); - auto output = param.out->mutable_data(TARGET(kCUDA)); + auto output = param.out->template mutable_data(TARGET(kCUDA)); VLOG(4) << "col: " << param.col << " num:" << num; 
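// Note (editor sketch, not part of the patch): the feed tensor lives in host
// memory, so the copy below is an asynchronous host-to-device transfer issued
// on the context's exec stream; with the kernel now templated on T, the byte
// count scales as num * sizeof(T) instead of being hard-coded to float.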
TargetW::MemcpyAsync( - output, input, num * sizeof(float), IoDirection::HtoD, stream); + output, input, num * sizeof(T), IoDirection::HtoD, stream); } } // namespace cuda @@ -42,8 +43,13 @@ void FeedCompute::Run() { } // namespace lite } // namespace paddle -REGISTER_LITE_KERNEL( - feed, kCUDA, kFloat, kNCHW, paddle::lite::kernels::cuda::FeedCompute, nchw) +typedef paddle::lite::kernels::cuda::FeedCompute + FeedFp32; + +typedef paddle::lite::kernels::cuda::FeedCompute + FeedInt64; + +REGISTER_LITE_KERNEL(feed, kCUDA, kFloat, kNCHW, FeedFp32, nchw) .BindInput("X", {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kFloat), @@ -54,8 +60,7 @@ REGISTER_LITE_KERNEL( DATALAYOUT(kNCHW))}) .Finalize(); -REGISTER_LITE_KERNEL( - feed, kCUDA, kFloat, kNHWC, paddle::lite::kernels::cuda::FeedCompute, nhwc) +REGISTER_LITE_KERNEL(feed, kCUDA, kFloat, kNHWC, FeedFp32, nhwc) .BindInput("X", {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kFloat), @@ -65,3 +70,25 @@ REGISTER_LITE_KERNEL( PRECISION(kFloat), DATALAYOUT(kNHWC))}) .Finalize(); + +REGISTER_LITE_KERNEL(feed, kCUDA, kInt64, kNCHW, FeedInt64, nchw) + .BindInput("X", + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kInt64), + DATALAYOUT(kNCHW))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kInt64), + DATALAYOUT(kNCHW))}) + .Finalize(); + +REGISTER_LITE_KERNEL(feed, kCUDA, kInt64, kNHWC, FeedInt64, nhwc) + .BindInput("X", + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kInt64), + DATALAYOUT(kNHWC))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kInt64), + DATALAYOUT(kNHWC))}) + .Finalize(); diff --git a/lite/kernels/cuda/feed_compute.h b/lite/kernels/cuda/feed_compute.h index 0510404b2b6ad6c50f69c847bf833afbcfe59b99..9c42dcc1ca847ccbd58c0a578a969c4d77ec1bf1 100644 --- a/lite/kernels/cuda/feed_compute.h +++ b/lite/kernels/cuda/feed_compute.h @@ -20,7 +20,8 @@ namespace lite { namespace kernels { namespace cuda { -class FeedCompute : public KernelLite { +template +class FeedCompute : public KernelLite { public: using param_t = operators::FeedParam; using TargetW = TargetWrapper; diff --git a/lite/kernels/cuda/layout_compute.cc b/lite/kernels/cuda/layout_compute.cc index e2d0ae4f2ef10b29247a2f823988e8098aa33795..6b56d9e1de28cbec57b4b45aff1d1b237b1784b9 100644 --- a/lite/kernels/cuda/layout_compute.cc +++ b/lite/kernels/cuda/layout_compute.cc @@ -13,6 +13,7 @@ // limitations under the License. 
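The layout_compute.cc changes below add a fast path: when the input shape collapses to a single dimension after dropping trailing 1s, the NCHW/NHWC transpose is an identity and the tensor is copied through unchanged. A minimal sketch of that check, with a hypothetical helper name operating on a plain dims vector (not part of the patch):

#include <cstdint>
#include <vector>
// True when a tensor with these dims has the same memory layout in NCHW and
// NHWC, i.e. the rank collapses to 1 after dropping trailing singleton dims.
static bool layout_transform_is_identity(std::vector<int64_t> dims) {
  while (!dims.empty() && dims.back() == 1) dims.pop_back();
  return dims.size() == 1;
}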
#include "lite/kernels/cuda/layout_compute.h" +#include #include "lite/backends/cuda/math/transpose.h" #include "lite/core/op_registry.h" @@ -21,11 +22,32 @@ namespace lite { namespace kernels { namespace cuda { +inline DDim trim_singular_dims(const DDim& dims) { + auto actual_dims_size = dims.size(); + for (; actual_dims_size != 0; --actual_dims_size) { + if (dims[actual_dims_size - 1] != 1) break; + } + std::vector trim_dims; + trim_dims.resize(actual_dims_size); + for (int i = 0; i < actual_dims_size; ++i) { + trim_dims[i] = dims[i]; + } + if (trim_dims.size() == 0) { + return DDim(); + } + return DDim(trim_dims); +} + #define NCHWTONHWC(type) \ auto& param = this->template Param(); \ auto& ctx = this->ctx_->template As(); \ auto input = param.x->template data(); \ auto input_dim = param.x->dims(); \ + DDim input_trim_dim = trim_singular_dims(input_dim); \ + if (input_trim_dim.size() == 1) { \ + param.y->CopyDataFrom(*param.x); \ + return; \ + } \ CHECK(input_dim.size() == 4) \ << "NCHW to NHWC should guarantee that the input dims should be 4"; \ int n = input_dim[0]; \ @@ -41,6 +63,11 @@ namespace cuda { auto& ctx = this->ctx_->template As(); \ auto input = param.x->template data(); \ auto input_dim = param.x->dims(); \ + DDim input_trim_dim = trim_singular_dims(input_dim); \ + if (input_trim_dim.size() == 1) { \ + param.y->CopyDataFrom(*param.x); \ + return; \ + } \ CHECK(input_dim.size() == 4) \ << "NHWC to NCHW should guarantee that the input dims should be 4"; \ int n = input_dim[0]; \ diff --git a/lite/kernels/cuda/lookup_table_compute.cu b/lite/kernels/cuda/lookup_table_compute.cu index 34b6de0e105f8f6dbf070b4ad41a9e6c7d2a06c8..3c3bb952cac01a6d1e296085dc357b9b3a03773a 100644 --- a/lite/kernels/cuda/lookup_table_compute.cu +++ b/lite/kernels/cuda/lookup_table_compute.cu @@ -98,3 +98,14 @@ REGISTER_LITE_KERNEL(lookup_table, .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFloat))}) .Finalize(); +REGISTER_LITE_KERNEL(lookup_table_v2, + kCUDA, + kFloat, + kNCHW, + paddle::lite::kernels::cuda::LookupTableCompute, + def) + .BindInput("W", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFloat))}) + .BindInput("Ids", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kInt64))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFloat))}) + .Finalize(); diff --git a/lite/kernels/cuda/match_matrix_tensor_compute.cu b/lite/kernels/cuda/match_matrix_tensor_compute.cu new file mode 100644 index 0000000000000000000000000000000000000000..f89b9c9578e54ec8e7de93541eaa51a9b1d17a97 --- /dev/null +++ b/lite/kernels/cuda/match_matrix_tensor_compute.cu @@ -0,0 +1,145 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include +#include "lite/core/op_registry.h" +#include "lite/kernels/cuda/match_matrix_tensor_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { +using Tensor = lite::Tensor; + +void MatchMatrixTensorCompute::PrepareForRun() { + gemm_impl_.reset(new lite::cuda::math::Gemm); +} + +void MatchMatrixTensorCompute::Run() { + CHECK(ctx_) << "running context should be set first"; + auto& param = this->Param(); + auto& context = this->ctx_->template As(); + + auto* x = param.x; + auto* w = param.w; + auto* y = param.y; + auto* out = param.out; + auto* tmp = param.tmp; + int dim_t = param.dim_t; + int dim_in = x->dims()[1]; + + const auto& offset_l = x->lod()[0]; + const auto& offset_r = y->lod()[0]; + + std::vector top_offset; + int top_size = 0; + top_offset.push_back(top_size); + for (size_t b = 0; b < x->lod()[0].size() - 1; b++) { + int len_l = offset_l[b + 1] - offset_l[b]; + int len_r = offset_r[b + 1] - offset_r[b]; + top_size += dim_t * len_l * len_r; + top_offset.push_back(top_size); + } + + auto* bottom_l_data = x->data(); + auto* bottom_r_data = y->data(); + auto* t_data = w->data(); + auto* out_data = out->mutable_data(TARGET(kCUDA)); + auto* bottom_l_trans_data = tmp->mutable_data(TARGET(kCUDA)); + + gemm_impl_->init( + false, false, x->dims()[0], dim_t * dim_in, dim_in, &context); + gemm_impl_->run( + 1.0f, 0.0f, bottom_l_data, t_data, bottom_l_trans_data, &context); + + for (size_t b = 0; b < x->lod()[0].size() - 1; b++) { + for (int t = 0; t < dim_t; t++) { + int len_l = offset_l[b + 1] - offset_l[b]; + int len_r = offset_r[b + 1] - offset_r[b]; + auto* top_data = out_data + top_offset[b] + t * len_l * len_r; + const auto* l_t_data = + bottom_l_trans_data + offset_l[b] * dim_t * dim_in + t * dim_in; + const auto* r_data = bottom_r_data + offset_r[b] * dim_in; + + gemm_impl_->init(false, + true, + len_l, + len_r, + dim_in, + dim_t * dim_in, + dim_in, + len_r, + &context); + gemm_impl_->run(1.0f, 0.0f, l_t_data, r_data, top_data, &context); + } + } + + int batch_size = x->lod()[0].size() - 1; + int lod_lv1_size = batch_size * dim_t; + int lod_lv2_size = x->lod()[0].back() * dim_t; + std::vector out_lod0(batch_size + 1, 0); + std::vector out_lod1(lod_lv1_size + 1, 0); + std::vector out_lod2(lod_lv2_size + 1, 0); + for (int i = 0; i < batch_size; i++) { + out_lod0[i + 1] = out_lod0[i] + dim_t; + int len_l = offset_l[i + 1] - offset_l[i]; + + for (int j = 0; j < dim_t; j++) { + out_lod1[i * dim_t + j + 1] = out_lod1[i * dim_t + j] + len_l; + int len_r = offset_r[i + 1] - offset_r[i]; + + for (int k = 0; k < len_l; k++) { + out_lod2[offset_l[i] * dim_t + j * len_l + k + 1] = + out_lod2[offset_l[i] * dim_t + j * len_l + k] + len_r; + } + } + } + + LoD out_lod; + out_lod.push_back(top_offset); + out_lod.push_back(offset_l); + out_lod.push_back(offset_r); + out->set_lod(out_lod); +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(match_matrix_tensor, + kCUDA, + kFloat, + kNCHW, + paddle::lite::kernels::cuda::MatchMatrixTensorCompute, + def) + .BindInput("X", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindInput("W", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindInput("Y", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindOutput("Tmp", + 
{LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .Finalize(); diff --git a/lite/kernels/cuda/match_matrix_tensor_compute.h b/lite/kernels/cuda/match_matrix_tensor_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..09db326ff3e992363e9b572ca91444499caed20f --- /dev/null +++ b/lite/kernels/cuda/match_matrix_tensor_compute.h @@ -0,0 +1,42 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include <memory> +#include "lite/backends/cuda/blas.h" +#include "lite/backends/cuda/math/gemm.h" +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +class MatchMatrixTensorCompute + : public KernelLite<TARGET(kCUDA), PRECISION(kFloat)> { + public: + using param_t = operators::MatchMatrixTensorParam; + + void PrepareForRun() override; + void Run() override; + virtual ~MatchMatrixTensorCompute() = default; + + private: + std::unique_ptr<lite::cuda::math::Gemm<float, float>> gemm_impl_; +}; + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/match_matrix_tensor_compute_test.cc b/lite/kernels/cuda/match_matrix_tensor_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..ce0ae2a7a8b4e41a16da3b8ed7fce2eef30f4f76 --- /dev/null +++ b/lite/kernels/cuda/match_matrix_tensor_compute_test.cc @@ -0,0 +1,122 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
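For the match_matrix_tensor kernel above, each batch b and each channel t in dim_t produces a len_l x len_r similarity block, so the output length is the sum of dim_t * len_l * len_r over batches; this is what top_offset accumulates, and why the test below sizes Out as {18, 1} for dim_t = 2 with x LoD {0, 2, 5} and y LoD {0, 3, 4}. A small sketch of that bookkeeping, with a hypothetical helper name (not part of the patch):

#include <vector>
// Cumulative output offsets per batch, mirroring top_offset in Run().
static std::vector<int> match_matrix_top_offsets(const std::vector<int>& lod_l,
                                                 const std::vector<int>& lod_r,
                                                 int dim_t) {
  std::vector<int> top_offset{0};
  for (size_t b = 0; b + 1 < lod_l.size(); ++b) {
    int len_l = lod_l[b + 1] - lod_l[b];
    int len_r = lod_r[b + 1] - lod_r[b];
    top_offset.push_back(top_offset.back() + dim_t * len_l * len_r);
  }
  return top_offset;
}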
+ +#include "lite/kernels/cuda/match_matrix_tensor_compute.h" +#include +#include +#include +#include +#include "lite/api/test_helper.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +using Tensor = lite::Tensor; + +TEST(match_matrix_tensor, normal) { + std::unique_ptr ctx(new KernelContext); + auto& context = ctx->As(); + + MatchMatrixTensorCompute kernel; + operators::MatchMatrixTensorParam param; + + // prepare ins and outs tensor in gpu, including size and lod + int ix = 5, iy = 4, h = 2, dim_t = 2; + Tensor x, w, y, out, tmp; + x.Resize({ix, h}); + w.Resize({h, dim_t, h}); + y.Resize({iy, h}); + out.Resize({18, 1}); + tmp.Resize({20, 1}); + LoD x_lod{}; + x_lod.push_back({0, 2, 5}); + x.set_lod(x_lod); + LoD y_lod{}; + y_lod.push_back({0, 3, 4}); + y.set_lod(y_lod); + + // init ins tensor in cpu + Tensor x_cpu, w_cpu, y_cpu, out_cpu, tmp_cpu; + x_cpu.Resize({ix, h}); + w_cpu.Resize({h, dim_t, h}); + y_cpu.Resize({iy, h}); + out_cpu.Resize({18, 1}); + tmp_cpu.Resize({20, 1}); + + auto* x_cpu_data = x_cpu.mutable_data(); + auto* w_cpu_data = w_cpu.mutable_data(); + auto* y_cpu_data = y_cpu.mutable_data(); + for (int i = 0; i < x_cpu.numel(); ++i) { + x_cpu_data[i] = static_cast(i); + } + for (int i = 0; i < w_cpu.numel(); ++i) { + w_cpu_data[i] = static_cast(i); + } + for (int i = 0; i < y_cpu.numel(); ++i) { + y_cpu_data[i] = static_cast(i); + } + + // cpu tensor data assigin to gpu tensor + x.Assign(x_cpu_data, x_cpu.dims()); + w.Assign(w_cpu_data, w_cpu.dims()); + y.Assign(y_cpu_data, y_cpu.dims()); + + param.x = &x; + param.w = &w; + param.y = &y; + param.dim_t = dim_t; + param.out = &out; + param.tmp = &tmp; + kernel.SetParam(param); + + cudaStream_t stream; + cudaStreamCreate(&stream); + context.SetExecStream(stream); + kernel.SetContext(std::move(ctx)); + kernel.Launch(); + cudaDeviceSynchronize(); + + auto* out_cpu_data = out_cpu.mutable_data(); + auto* out_data = out.mutable_data(TARGET(kCUDA)); + CopySync( + out_cpu_data, out_data, sizeof(float) * out.numel(), IoDirection::DtoH); + std::vector ref_results = {5, + 23, + 41, + 17, + 75, + 133, + 7, + 33, + 59, + 27, + 125, + 223, + 323, + 455, + 587, + 557, + 793, + 1029}; + for (int i = 0; i < out.numel(); i++) { + EXPECT_NEAR(out_cpu_data[i], ref_results[i], 1e-5); + } +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/mul_compute_test.cc b/lite/kernels/cuda/mul_compute_test.cc index d1c1d63e7dcd46f84cd128fc5b855da2098e179d..f521a12e2dddcf854b3982ae37f4da7631f6acf3 100644 --- a/lite/kernels/cuda/mul_compute_test.cc +++ b/lite/kernels/cuda/mul_compute_test.cc @@ -16,6 +16,7 @@ #include #include #include +#include "lite/backends/cuda/blas.h" namespace paddle { namespace lite { @@ -26,6 +27,7 @@ TEST(mul_compute, normal) { MulCompute mul_kernel; std::unique_ptr ctx(new KernelContext); auto& context = ctx->As(); + context.InitOnce(); Tensor x, y, out, x_cpu, y_cpu, out_cpu; int x_h = 2, x_w_y_h = 3, y_w = 4; diff --git a/lite/kernels/cuda/nearest_interp_compute.cu b/lite/kernels/cuda/nearest_interp_compute.cu index 1a614e0656b417786deff8df6b7a827433b33f7b..adae034a1d68d723440c55ff3cc21430e1bc33b4 100644 --- a/lite/kernels/cuda/nearest_interp_compute.cu +++ b/lite/kernels/cuda/nearest_interp_compute.cu @@ -11,6 +11,7 @@ limitations under the License. 
*/ #pragma once #include +#include "lite/backends/cuda/target_wrapper.h" #include "lite/core/op_registry.h" #include "lite/kernels/cuda/nearest_interp_compute.h" @@ -20,6 +21,43 @@ namespace kernels { namespace cuda { using Tensor = lite::Tensor; +inline std::vector get_new_shape( + std::vector list_new_shape_tensor) { + // get tensor from + std::vector vec_new_shape; + for (size_t i = 0; i < list_new_shape_tensor.size(); ++i) { + auto tensor = list_new_shape_tensor[i]; + lite::Tensor temp; + auto temp_data = temp.mutable_data(); + auto tensor_data = tensor->data(); + cudaMemcpy(temp_data, + tensor_data, + tensor->dims().production() * sizeof(float), + cudaMemcpyDeviceToHost); + + vec_new_shape.push_back(static_cast(*temp_data)); + } + + return vec_new_shape; +} + +template +inline std::vector get_new_data_from_tensor(const Tensor* new_data_tensor) { + std::vector vec_new_data; + auto* new_data = new_data_tensor->data(); + lite::Tensor cpu_starts_tensor; + auto cpu_starts_tensor_data = cpu_starts_tensor.mutable_data(); + cudaMemcpy(cpu_starts_tensor_data, + new_data, + new_data_tensor->dims().production() * sizeof(T), + cudaMemcpyDeviceToHost); + + auto new_data_ = cpu_starts_tensor.data(); + vec_new_data = std::vector( + new_data_, new_data_ + new_data_tensor->dims().production()); + return vec_new_data; +} + __global__ void KeNearestNeighborInterp(const float* in, const size_t in_img_h, const size_t in_img_w, @@ -79,19 +117,34 @@ void NearestInterpCompute::Run() { int out_w = param.out_w; float scale = param.scale; bool align_corners = param.align_corners; - if (scale > 0) { - out_h = static_cast(in_h * scale); - out_w = static_cast(in_w * scale); - } - - if (out_size != nullptr) { - Tensor sizes; - float* size_data = sizes.mutable_data(); - float* outsize_data = out_size->mutable_data(TARGET(kCUDA)); - cudaMemcpy( - size_data, outsize_data, sizeof(float) * 2, cudaMemcpyDeviceToHost); - out_h = static_cast(size_data[0]); - out_w = static_cast(size_data[1]); + auto align_mode = param.align_mode; + + auto list_new_shape_tensor = param.SizeTensor; + if (list_new_shape_tensor.size() > 0) { + // have size tensor + auto new_size = get_new_shape(list_new_shape_tensor); + out_h = new_size[0]; + out_w = new_size[1]; + } else { + auto scale_tensor = param.Scale; + if (scale_tensor != nullptr) { + auto scale_data = get_new_data_from_tensor(scale_tensor); + scale = scale_data[0]; + } + if (scale > 0) { + out_h = static_cast(in_h * scale); + out_w = static_cast(in_w * scale); + } + + if (out_size != nullptr) { + lite::Tensor sizes; + float* size_data = sizes.mutable_data(); + float* outsize_data = out_size->mutable_data(TARGET(kCUDA)); + cudaMemcpy( + size_data, outsize_data, sizeof(float) * 2, cudaMemcpyDeviceToHost); + out_h = static_cast(size_data[0]); + out_w = static_cast(size_data[1]); + } } auto output_data = output->mutable_data(TARGET(kCUDA)); @@ -162,6 +215,14 @@ REGISTER_LITE_KERNEL(nearest_interp, {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFloat), DATALAYOUT(kNCHW))}) + .BindInput("SizeTensor", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindInput("Scale", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFloat), diff --git a/lite/kernels/cuda/nearest_interp_compute_test.cc b/lite/kernels/cuda/nearest_interp_compute_test.cc index 85032016d630f11bbfe150f750470e89e241c61b..ad2ef9294e0de06a9dfdd141b8001bb34c6d1fb9 100644 --- 
a/lite/kernels/cuda/nearest_interp_compute_test.cc +++ b/lite/kernels/cuda/nearest_interp_compute_test.cc @@ -16,6 +16,7 @@ #include #include #include +#include namespace paddle { namespace lite { @@ -143,6 +144,116 @@ TEST(nearest_interp, normal) { } } +TEST(nearest_interp, update) { + NearestInterpCompute nearest_interp_kernel; + std::unique_ptr ctx(new KernelContext); + auto& context = ctx->As(); + + operators::InterpolateParam param; + + std::vector size_tensor(2); + std::vector size_tensor_cpu(2), size_tensor_ref(2); + Tensor x, input_scale, osz, out; + Tensor x_cpu, input_scale_cpu, osz_cpu, out_cpu; + Tensor x_ref, input_scale_ref, osz_ref, out_ref; + + int n = 1, c = 3, in_h = 40, in_w = 40; + int out_h = 80, out_w = 80; + float scale = 2.0; + + param.out_h = out_h; + param.out_w = out_w; + param.scale = scale; + param.align_corners = false; + param.align_mode = 0; + + x.Resize({n, c, in_h, in_w}); + size_tensor[0].Resize({1}); + size_tensor[1].Resize({1}); + input_scale.Resize({1}); + osz.Resize({2}); + out.Resize({n, c, out_h, out_w}); + + x_cpu.Resize({n, c, in_h, in_w}); + size_tensor_cpu[0].Resize({1}); + size_tensor_cpu[1].Resize({1}); + input_scale_cpu.Resize({1}); + osz_cpu.Resize({2}); + out_cpu.Resize({n, c, out_h, out_w}); + + x_ref.Resize({n, c, in_h, in_w}); + size_tensor_ref[0].Resize({1}); + size_tensor_ref[1].Resize({1}); + input_scale_ref.Resize({1}); + osz_ref.Resize({2}); + out_ref.Resize({n, c, out_h, out_w}); + + auto* out_data = out.mutable_data(TARGET(kCUDA)); + + float* x_cpu_data = x_cpu.mutable_data(); + float* size_tensor0_cpu_data = size_tensor_cpu[0].mutable_data(); + float* size_tensor1_cpu_data = size_tensor_cpu[1].mutable_data(); + float* input_scale_cpu_data = input_scale_cpu.mutable_data(); + float* osz_cpu_data = osz_cpu.mutable_data(); + float* out_cpu_data = out_cpu.mutable_data(); + + float* x_ref_data = x_ref.mutable_data(); + float* size_tensor0_ref_data = size_tensor_ref[0].mutable_data(); + float* size_tensor1_ref_data = size_tensor_ref[1].mutable_data(); + float* input_scale_ref_data = input_scale_ref.mutable_data(); + float* osz_ref_data = osz_ref.mutable_data(); + float* out_ref_data = out_ref.mutable_data(); + + for (int i = 0; i < x_cpu.numel(); ++i) { + x_cpu_data[i] = i + 5.0; + x_ref_data[i] = i + 5.0; + } + osz_cpu_data[0] = out_h; + osz_cpu_data[1] = out_w; + size_tensor0_cpu_data[0] = out_h; + size_tensor1_cpu_data[0] = out_w; + input_scale_cpu_data[0] = scale; + osz_ref_data[0] = out_h; + osz_ref_data[1] = out_w; + size_tensor0_ref_data[0] = out_h; + size_tensor1_ref_data[0] = out_w; + input_scale_ref_data[0] = scale; + + x.Assign(x_cpu_data, x_cpu.dims()); + size_tensor[0].Assign( + size_tensor0_cpu_data, size_tensor[0].dims()); + size_tensor[1].Assign( + size_tensor1_cpu_data, size_tensor[1].dims()); + input_scale.Assign(input_scale_cpu_data, + input_scale.dims()); + osz.Assign(osz_cpu_data, osz_cpu.dims()); + + param.X = &x; + param.SizeTensor.emplace_back( + reinterpret_cast(&size_tensor[0])); + param.SizeTensor.emplace_back( + reinterpret_cast(&size_tensor[1])); + param.Scale = &input_scale; + param.OutSize = &osz; + param.Out = &out; + nearest_interp_kernel.SetParam(param); + + cudaStream_t stream; + cudaStreamCreate(&stream); + context.SetExecStream(stream); + + nearest_interp_kernel.SetContext(std::move(ctx)); + nearest_interp_kernel.Launch(); + cudaDeviceSynchronize(); + + CopySync( + out_cpu_data, out_data, sizeof(float) * out.numel(), IoDirection::DtoH); + NearestInterpRef(&x_ref, &out_ref, false); + for (int i = 0; i 
< out.numel(); i++) { + EXPECT_NEAR(out_cpu_data[i], out_ref_data[i], 1e-5); + } +} + } // namespace cuda } // namespace kernels } // namespace lite diff --git a/lite/kernels/cuda/pool_compute.cu b/lite/kernels/cuda/pool_compute.cu index a2483a2c759e8acc5f5944fd316c83bb49530d36..d7e3739ddbb59a624e1911b8178e96053dacc0d1 100644 --- a/lite/kernels/cuda/pool_compute.cu +++ b/lite/kernels/cuda/pool_compute.cu @@ -256,6 +256,7 @@ void PoolCompute::Run() { bool adaptive = param.adaptive; auto x_dims = param.x->dims(); auto out_dims = param.output->dims(); + auto paddings = *param.paddings; const int in_h = x_dims[2]; const int in_w = x_dims[3]; const int out_h = out_dims[2]; @@ -266,8 +267,8 @@ void PoolCompute::Run() { const int win_w = param.ksize[1]; const int stride_h = param.strides[0]; const int stride_w = param.strides[1]; - const int pad_h = param.paddings[0]; - const int pad_w = param.paddings[1]; + const int pad_h = paddings[0]; + const int pad_w = paddings[2]; const int total_threads = out_dims.production(); const int threads = 512; const int blocks = (total_threads + threads - 1) / threads; @@ -357,6 +358,61 @@ void PoolCompute::Run() { if (error != cudaSuccess) LOG(FATAL) << cudaGetErrorString(error); } +inline int PoolOutputSize( + int input_size, int filter_size, int padding, int stride, bool ceil_mode) { + int output_size; + if (!ceil_mode) { + output_size = (input_size - filter_size + 2 * padding) / stride + 1; + } else { + output_size = + (input_size - filter_size + 2 * padding + stride - 1) / stride + 1; + } + return output_size; +} + +void PoolComputeNHWC::PrepareForRun() { + auto& param = this->Param(); + auto& ctx = this->ctx_->template As(); + pool_impl_.reset(new lite::cuda::math::CudnnPool2DNHWC); + pool_impl_->init(param, &ctx); +} + +void PoolComputeNHWC::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->template As(); + auto stream = ctx.exec_stream(); + const auto x_dims = param.x->dims(); + std::vector& ksize = param.ksize; + if (param.global_pooling) { + ksize.resize(static_cast(x_dims.size()) - 2); + for (size_t i = 0; i < ksize.size(); ++i) { + (*param.paddings)[i] = 0; + ksize[i] = static_cast(x_dims[i + 1]); + } + } + + std::vector output_shape({x_dims[0]}); + if (param.adaptive) { + output_shape.insert( + output_shape.end(), param.ksize.begin(), param.ksize.end()); + } else { + for (size_t i = 0; i < param.ksize.size(); ++i) { + output_shape.push_back(PoolOutputSize(x_dims[i + 1], + param.ksize[i], + (*param.paddings)[i], + param.strides[i], + param.ceil_mode)); + } + } + output_shape.push_back(x_dims[3]); + param.output->Resize(lite::DDim(output_shape)); + + pool_impl_->run(param); + + cudaError_t error = cudaGetLastError(); + if (error != cudaSuccess) LOG(FATAL) << cudaGetErrorString(error); +} + } // namespace cuda } // namespace kernels } // namespace lite @@ -373,3 +429,19 @@ REGISTER_LITE_KERNEL( PRECISION(kFloat), DATALAYOUT(kNCHW))}) .Finalize(); + +REGISTER_LITE_KERNEL(pool2d, + kCUDA, + kFloat, + kNHWC, + paddle::lite::kernels::cuda::PoolComputeNHWC, + def) + .BindInput("X", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNHWC))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNHWC))}) + .Finalize(); diff --git a/lite/kernels/cuda/pool_compute.h b/lite/kernels/cuda/pool_compute.h index 55b346bfaf4ac139c8d22bff2ac64f0e78bc6023..5c3a1bc2b93d3a03a40515fff6f14e604a11c0a1 100644 --- a/lite/kernels/cuda/pool_compute.h +++ b/lite/kernels/cuda/pool_compute.h @@ -13,6 
+13,9 @@ // limitations under the License. #pragma once +#include +#include +#include "lite/backends/cuda/math/cudnn_pool.h" #include "lite/core/kernel.h" namespace paddle { @@ -29,6 +32,20 @@ class PoolCompute virtual ~PoolCompute() = default; }; +class PoolComputeNHWC + : public KernelLite { + public: + using param_t = operators::PoolParam; + + void PrepareForRun() override; + void Run() override; + virtual ~PoolComputeNHWC() = default; + + private: + std::unique_ptr> + pool_impl_; +}; + } // namespace cuda } // namespace kernels } // namespace lite diff --git a/lite/kernels/cuda/pool_compute_test.cc b/lite/kernels/cuda/pool_compute_test.cc index fe6ff92c0ce943cad36fbdd4f1408e344d9fd5fd..0e5aeec8c0133f1f61b469437e3e9a602096133f 100644 --- a/lite/kernels/cuda/pool_compute_test.cc +++ b/lite/kernels/cuda/pool_compute_test.cc @@ -27,42 +27,123 @@ namespace cuda { using Tensor = lite::Tensor; using DDim = lite::DDim; -static int PoolOutputSize( - int input_size, int filter_size, int padding, int stride, bool ceil_mode) { +#define IN(n, c, h, w) \ + input_data[w + h * input_w + c * input_h * input_w + \ + n * input_c * input_h * input_w] +#define OUT(n, c, h, w) \ + output_data[w + h * output_w + c * output_h * output_w + \ + n * output_c * output_h * output_w] + +template +void nchw2nhwc_ref(lite::Tensor* input, lite::Tensor* output) { + auto* input_data = input->data(); + auto* output_data = output->mutable_data(); + + int input_n = input->dims()[0]; + int input_c = input->dims()[1]; + int input_h = input->dims()[2]; + int input_w = input->dims()[3]; + int output_c = output->dims()[1]; + int output_h = output->dims()[2]; + int output_w = output->dims()[3]; + + for (int n = 0; n < input_n; ++n) { + for (int c = 0; c < input_c; ++c) { + for (int h = 0; h < input_h; ++h) { + for (int w = 0; w < input_w; ++w) { + OUT(n, h, w, c) = IN(n, c, h, w); + } + } + } + } +} + +#undef IN +#undef OUT + +#define IN(n, h, w, c) \ + input_data[c + w * input_c + h * input_w * input_c + \ + n * input_h * input_w * input_c] +#define OUT(n, h, w, c) \ + output_data[c + w * output_c + h * output_w * output_c + \ + n * output_h * output_w * output_c] + +template +void nhwc2nchw_ref(lite::Tensor* input, lite::Tensor* output) { + auto* input_data = input->data(); + auto* output_data = output->mutable_data(); + + int input_n = input->dims()[0]; + int input_h = input->dims()[1]; + int input_w = input->dims()[2]; + int input_c = input->dims()[3]; + int output_h = output->dims()[1]; + int output_w = output->dims()[2]; + int output_c = output->dims()[3]; + + for (int n = 0; n < input_n; ++n) { + for (int c = 0; c < input_c; ++c) { + for (int h = 0; h < input_h; ++h) { + for (int w = 0; w < input_w; ++w) { + OUT(n, c, h, w) = IN(n, h, w, c); + } + } + } + } +} + +static int PoolOutputSize(int input_size, + int filter_size, + int pad_left, + int pad_right, + int stride, + bool ceil_mode) { int output_size; if (!ceil_mode) { - output_size = (input_size - filter_size + 2 * padding) / stride + 1; + output_size = + (input_size - filter_size + pad_left + pad_right) / stride + 1; } else { output_size = - (input_size - filter_size + 2 * padding + stride - 1) / stride + 1; + (input_size - filter_size + pad_left + pad_right + stride - 1) / + stride + + 1; } return output_size; } -static std::vector compute_output_shape(operators::PoolParam* param_) { +static std::vector compute_output_shape(operators::PoolParam* param_, + bool is_nchw) { + int axis = 2; + if (!is_nchw) axis = 1; const auto x_dims = param_->x->dims(); std::vector& 
ksize = param_->ksize; if (param_->global_pooling) { ksize.resize(static_cast(x_dims.size()) - 2); + auto paddings = *param_->paddings; for (size_t i = 0; i < ksize.size(); ++i) { - param_->paddings[i] = 0; + paddings[2 * i] = 0; + paddings[2 * i + 1] = 0; ksize[i] = static_cast(x_dims[i + 2]); } } - std::vector output_shape({x_dims[0], x_dims[1]}); + std::vector output_shape({x_dims[0]}); + if (is_nchw) output_shape.push_back(x_dims[1]); if (param_->adaptive) { output_shape.insert( output_shape.end(), param_->ksize.begin(), param_->ksize.end()); } else { + auto paddings = *param_->paddings; for (size_t i = 0; i < param_->ksize.size(); ++i) { - output_shape.push_back(PoolOutputSize(x_dims[i + 2], + output_shape.push_back(PoolOutputSize(x_dims[i + axis], param_->ksize[i], - param_->paddings[i], + paddings[2 * i], + paddings[2 * i + 1], param_->strides[i], param_->ceil_mode)); } } + if (!is_nchw) output_shape.push_back(x_dims[3]); return output_shape; } @@ -75,7 +156,7 @@ static void pool_compute_ref(const operators::PoolParam& param) { std::vector ksize = param.ksize; std::vector strides = param.strides; - std::vector paddings = param.paddings; + std::vector paddings = *param.paddings; std::string pooling_type = param.pooling_type; bool global_pooling = param.global_pooling; @@ -99,7 +180,7 @@ static void pool_compute_ref(const operators::PoolParam& param) { int stride_h = strides[0]; int stride_w = strides[1]; int pad_h = paddings[0]; - int pad_w = paddings[1]; + int pad_w = paddings[2]; if (global_pooling == true) { for (int n = 0; n < in_n; ++n) { @@ -195,15 +276,15 @@ TEST(pool_cuda, compute) { for (auto pad : {0, 1}) { for (auto n : {1, 2}) { for (auto c : {1, 3}) { - for (auto h : {2, 3, 4, 11}) { - for (auto w : {2, 3, 4, 11}) { - VLOG(3) << "n:" << n << " c:" << c << " h:" << h - << " w:" << w << " ksize:" << ksize - << " stride:" << stride << " pad:" << pad - << " exclusive:" << exclusive - << " global_pooling:" << global_pooling - << " ceil_mode: " << ceil_mode - << " pooling_type:" << pooling_type; + for (auto h : {3}) { + for (auto w : {3}) { + LOG(INFO) << "n:" << n << " c:" << c << " h:" << h + << " w:" << w << " ksize:" << ksize + << " stride:" << stride << " pad:" << pad + << " exclusive:" << exclusive + << " global_pooling:" << global_pooling + << " ceil_mode: " << ceil_mode + << " pooling_type:" << pooling_type; // init x, output x.Resize(DDim(std::vector({n, c, h, w}))); @@ -226,14 +307,16 @@ TEST(pool_cuda, compute) { } param.global_pooling = global_pooling; param.strides = {stride, stride}; - param.paddings = {pad, pad}; + std::vector paddings = {pad, pad, pad, pad}; + param.paddings = + std::make_shared>(paddings); param.exclusive = exclusive; param.ceil_mode = ceil_mode; param.adaptive = false; param.use_quantizer = false; const std::vector& output_shape = - compute_output_shape(¶m); + compute_output_shape(¶m, true); if (output_shape[2] * output_shape[3] == 0) continue; output.Resize(DDim(output_shape)); output_ref.Resize(DDim(output_shape)); @@ -277,6 +360,131 @@ TEST(pool_cuda, compute) { } } } + +TEST(pool_cuda, nhwc) { + std::unique_ptr ctx(new KernelContext); + auto& context = ctx->As(); + cudaStream_t stream; + cudaStreamCreate(&stream); + context.SetExecStream(stream); + + PoolComputeNHWC pool; + operators::PoolParam param; + pool.SetContext(std::move(ctx)); + + lite::Tensor x, temp; + lite::Tensor x_cpu; + lite::Tensor output; + lite::Tensor output_cpu, output_temp; + lite::Tensor output_ref; + for (auto pooling_type : {"max", "avg"}) { + for (auto ceil_mode : 
{false}) { + for (auto global_pooling : {true, false}) { + for (auto exclusive : {false, true}) { + for (auto ksize : {3}) { + for (auto stride : {3}) { + for (auto pad : {1}) { + for (auto n : {1}) { + for (auto c : {3}) { + for (auto h : {8}) { + for (auto w : {8}) { + LOG(INFO) << "n:" << n << " c:" << c << " h:" << h + << " w:" << w << " ksize:" << ksize + << " stride:" << stride << " pad:" << pad + << " exclusive:" << exclusive + << " global_pooling:" << global_pooling + << " ceil_mode: " << ceil_mode + << " pooling_type:" << pooling_type; + + // init x, output + x.Resize(DDim(std::vector({n, h, w, c}))); + temp.Resize(DDim(std::vector({n, h, w, c}))); + x_cpu.Resize(DDim(std::vector({n, c, h, w}))); + + auto* x_cpu_data = x_cpu.mutable_data(); + for (int i = 0; i < x_cpu.dims().production(); ++i) { + float sign = i % 3 == 0 ? -0.03 : 0.05f; + x_cpu_data[i] = sign * (i % 128); + } + + nchw2nhwc_ref(&x_cpu, &temp); + auto* temp_cpu_data = temp.mutable_data(); + + x.Assign(temp_cpu_data, + temp.dims()); + // fill param + param.x = &x; + param.output = &output; + param.pooling_type = pooling_type; + if (global_pooling) { + param.ksize = {h, w}; + } else { + param.ksize = {ksize, ksize}; + } + param.global_pooling = global_pooling; + param.strides = {stride, stride}; + std::vector paddings = {pad, pad, pad, pad}; + param.paddings = + std::make_shared>(paddings); + param.exclusive = exclusive; + param.ceil_mode = ceil_mode; + param.adaptive = false; + param.use_quantizer = false; + + const std::vector& output_shape = + compute_output_shape(¶m, false); + if (output_shape[2] * output_shape[3] == 0) continue; + output.Resize(DDim(output_shape)); + output_temp.Resize(DDim(output_shape)); + output_cpu.Resize(DDim(output_shape)); + + auto* output_data = + output.mutable_data(TARGET(kCUDA)); + auto* output_cpu_data = + output_cpu.mutable_data(); + + // compute + pool.SetParam(param); + pool.Launch(); + + // compute ref + param.x = &x_cpu; + // nchw + const std::vector& output_shape_ref = + compute_output_shape(¶m, true); + + output_ref.Resize(DDim(output_shape_ref)); + // auto* output_ref_data = + // output_ref.mutable_data(); + param.output = &output_ref; + pool_compute_ref(param); + nchw2nhwc_ref(&output_ref, &output_temp); + auto* output_temp_data = + output_temp.mutable_data(); + + cudaDeviceSynchronize(); + CopySync(output_cpu_data, + output_data, + sizeof(float) * output.numel(), + IoDirection::DtoH); + // compare + for (int i = 0; i < output.dims().production(); i++) { + EXPECT_NEAR( + output_cpu_data[i], output_temp_data[i], 1e-4); + } + VLOG(3) << "compare pass"; + } + } + } + } + } + } + } + } + } + } + } +} } // namespace cuda } // namespace kernels } // namespace lite diff --git a/lite/kernels/cuda/search_aligned_mat_mul_compute.cc b/lite/kernels/cuda/search_aligned_mat_mul_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..ddefb608dd233279b4a8127b100151acf8ffc8e6 --- /dev/null +++ b/lite/kernels/cuda/search_aligned_mat_mul_compute.cc @@ -0,0 +1,38 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/cuda/search_aligned_mat_mul_compute.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda {} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(search_aligned_mat_mul, + kCUDA, + kFloat, + kNCHW, + paddle::lite::kernels::cuda::SearchAlignedMatMulCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("_a_addr", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("_b_addr", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("_c_addr", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .Finalize(); diff --git a/lite/kernels/cuda/search_aligned_mat_mul_compute.h b/lite/kernels/cuda/search_aligned_mat_mul_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..b1c4552d9c43e2dcbc3bf0211f7028811410cb6c --- /dev/null +++ b/lite/kernels/cuda/search_aligned_mat_mul_compute.h @@ -0,0 +1,103 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
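The header that follows implements search_aligned_mat_mul on the GPU. As a reading aid, here is a minimal host-side sketch of the math the kernel performs, assuming (as the op requires) that every sequence in X and Y has the same length, so the batch reduces to seq_num independent GEMMs. All names are illustrative; the real kernel drives the same loop through lite::cuda::math::BatchedGemm rather than computing on the host.

// Reference only: per-sequence matmul with M/N/K derived from the transpose
// flags, matching the reference implementation used by the unit test.
static void aligned_matmul_ref(const float* x, const float* y, float* out,
                               int seq_num, int x_batch, int x_inner,
                               int y_batch, int y_inner,
                               bool trans_x, bool trans_y, float alpha) {
  const int M = trans_x ? x_inner : x_batch;
  const int N = trans_y ? y_batch : y_inner;
  const int K = trans_x ? x_batch : x_inner;  // must equal trans_y ? y_inner : y_batch
  for (int s = 0; s < seq_num; ++s) {
    const float* a = x + s * x_batch * x_inner;   // one sequence of X
    const float* b = y + s * y_batch * y_inner;   // one sequence of Y
    float* c = out + s * M * N;                   // one sequence of Out
    for (int i = 0; i < M; ++i) {
      for (int j = 0; j < N; ++j) {
        float sum = 0.f;
        for (int k = 0; k < K; ++k) {
          float av = trans_x ? a[k * x_inner + i] : a[i * x_inner + k];
          float bv = trans_y ? b[j * y_inner + k] : b[k * y_inner + j];
          sum += av * bv;
        }
        c[i * N + j] = alpha * sum;
      }
    }
  }
}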
+ +#pragma once +#include +#include "lite/backends/cuda/math/batched_gemm.h" +#include "lite/core/context.h" +#include "lite/core/kernel.h" +#include "lite/core/types.h" +#include "lite/operators/op_params.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +class SearchAlignedMatMulCompute + : public KernelLite { + public: + using param_t = operators::MatMulParam; + + void PrepareForRun() override { + auto& param = this->Param(); + CHECK(ctx_) << "running context should be set first"; + auto& cuda_ctx = ctx_->template As(); + bool x_transpose = param.transpose_X; + bool y_transpose = param.transpose_Y; + int seq_num = param.X->lod()[0].size() - 1; + batched_gemm_impl_.reset(new lite::cuda::math::BatchedGemm); + CHECK( + batched_gemm_impl_->init(x_transpose, y_transpose, seq_num, &cuda_ctx)); + A_ = static_cast(malloc(3 * seq_num * sizeof(float*))); + CHECK(A_); + } + + void Run() override { + auto& param = this->Param(); + auto x = param.X; + auto y = param.Y; + auto out = param.Out; + bool x_transpose = param.transpose_X; + bool y_transpose = param.transpose_Y; + float alpha = param.alpha; + const auto& x_dims = x->dims(); + const auto& y_dims = y->dims(); + const auto& x_lod = x->lod(); + const auto& y_lod = y->lod(); + const auto& x_lod_0 = x_lod[0]; + const auto& y_lod_0 = y_lod[0]; + int seq_num = x_lod_0.size() - 1; + int x_inner_size = x_dims[1]; + int y_inner_size = y_dims[1]; + int x_batch_size = x_lod_0[1]; + int y_batch_size = y_lod_0[1]; + int M = x_transpose ? x_inner_size : x_batch_size; + int N = y_transpose ? y_batch_size : y_inner_size; + int X_K = x_transpose ? x_batch_size : x_inner_size; + int Y_K = y_transpose ? y_inner_size : y_batch_size; + CHECK_EQ(X_K, Y_K) << "K of Input(X) and Input(Y) is not equal"; + int K = X_K; + + auto x_data = x->data(); + auto y_data = y->data(); + auto out_data = out->mutable_data(TARGET(kCUDA)); + auto x_stride = x_batch_size * x_inner_size; + auto y_stride = y_batch_size * y_inner_size; + auto out_stride = M * N; + for (int seq = 0; seq < seq_num; seq++) { + A_[seq] = const_cast(x_data) + seq * x_stride; + A_[seq + seq_num] = const_cast(y_data) + seq * y_stride; + A_[seq + seq_num * 2] = out_data + seq * out_stride; + } + batched_gemm_impl_->run( + alpha, 0.0f, const_cast(A_), M, N, K, seq_num); + } + + ~SearchAlignedMatMulCompute() { + if (A_ != nullptr) { + free(A_); + } + } + + private: + std::unique_ptr> + batched_gemm_impl_; + float** A_{nullptr}; +}; + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/search_aligned_mat_mul_compute_test.cc b/lite/kernels/cuda/search_aligned_mat_mul_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..f08333b3103973f99d37e39e7e7babeb52b335f1 --- /dev/null +++ b/lite/kernels/cuda/search_aligned_mat_mul_compute_test.cc @@ -0,0 +1,221 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
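The kernel above packs the per-sequence input, weight, and output pointers into a single A_ buffer before handing them to lite::cuda::math::BatchedGemm. The wrapper's internals are not part of this patch, so the sketch below only shows one plausible backing call, cublasSgemmBatched, with the usual row-major-to-column-major operand swap; treat it as an assumption, not a description of the actual implementation.

// Sketch under the assumption that BatchedGemm wraps cublasSgemmBatched.
// Row-major C = A * B is expressed as column-major C^T = B^T * A^T.
#include <cublas_v2.h>

static cublasStatus_t batched_gemm_rowmajor(cublasHandle_t handle,
                                            const float** a_ptrs,  // seq_num matrices, M x K
                                            const float** b_ptrs,  // seq_num matrices, K x N
                                            float** c_ptrs,        // seq_num matrices, M x N
                                            int M, int N, int K,
                                            float alpha, float beta,
                                            int seq_num) {
  // The pointer arrays themselves must reside in device memory
  // (e.g. copied there with cudaMemcpyAsync before the call).
  return cublasSgemmBatched(handle,
                            CUBLAS_OP_N, CUBLAS_OP_N,
                            N, M, K,            // dimensions swapped for row-major data
                            &alpha,
                            b_ptrs, N,          // B first, ldb = N
                            a_ptrs, K,          // then A, lda = K
                            &beta,
                            c_ptrs, N,          // ldc = N
                            seq_num);
}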
+ +#include "lite/kernels/cuda/search_aligned_mat_mul_compute.h" +#include +#include +#include +#include + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +template +void search_aligned_mat_mul_compute_ref(const operators::MatMulParam& param) { + auto x = param.X; + auto y = param.Y; + auto out = param.Out; + bool x_transpose = param.transpose_X; + bool y_transpose = param.transpose_Y; + T alpha = static_cast(param.alpha); + const auto x_dims = x->dims(); + const auto y_dims = y->dims(); + const auto& x_lod = x->lod(); + const auto& y_lod = y->lod(); + const auto& x_lod_0 = x_lod[0]; + const auto& y_lod_0 = y_lod[0]; + int seq_num = x_lod_0.size() - 1; + int x_inner_size = x_dims[1]; + int y_inner_size = y_dims[1]; + int x_batch_size = x_lod_0[1]; + int y_batch_size = y_lod_0[1]; + int M = x_transpose ? x_inner_size : x_batch_size; + int N = y_transpose ? y_batch_size : y_inner_size; + int X_K = x_transpose ? x_batch_size : x_inner_size; + int Y_K = y_transpose ? y_inner_size : y_batch_size; + CHECK_EQ(X_K, Y_K) << "K of Input(X) and Input(Y) is not equal"; + int K = X_K; + int lda = x_transpose ? M : K; + int ldb = y_transpose ? K : N; + int ldc = N; + int x_stride = x_batch_size * x_inner_size; + int y_stride = y_batch_size * y_inner_size; + int out_stride = M * N; + auto x_data = x->data(); + auto y_data = y->data(); + auto out_data = out->mutable_data(); + + for (int seq = 0; seq < seq_num; seq++) { + auto a = x_data + seq * x_stride; + auto b = y_data + seq * y_stride; + auto c = out_data + seq * out_stride; + for (int i = 0; i < M; i++) { + for (int j = 0; j < N; j++) { + auto sum = static_cast(0); + for (int l = 0; l < K; l++) { + T av; + T bv; + if (x_transpose) { + av = a[l * lda + i]; + } else { + av = a[i * lda + l]; + } + if (y_transpose) { + bv = b[j * ldb + l]; + } else { + bv = b[l * ldb + j]; + } + sum += av * bv; + } + c[i * ldc + j] = alpha * sum; + } + } + } +} + +TEST(search_aligned_mat_mul_compute, normal) { + Env::Init(); + for (int seq_num : {1, 2}) { + for (int x_batch_size : {1, 3}) { + for (int x_inner_size : {1, 5}) { + for (int out_inner_size : {1, 4}) { + for (bool x_transpose : {true, false}) { + for (bool y_transpose : {true, false}) { + for (float alpha : {1., 2.}) { + // infer x_dims and y_dims + int y_batch_size; + int y_inner_size; + int out_batch_size; + if (x_transpose) { + if (y_transpose) { + y_batch_size = out_inner_size; + y_inner_size = x_batch_size; + out_batch_size = x_inner_size; + } else { + y_batch_size = x_batch_size; + y_inner_size = out_inner_size; + out_batch_size = x_inner_size; + } + } else { + if (y_transpose) { + y_batch_size = out_inner_size; + y_inner_size = x_inner_size; + out_batch_size = x_batch_size; + } else { + y_batch_size = x_inner_size; + y_inner_size = out_inner_size; + out_batch_size = x_batch_size; + } + } + std::vector x_lod_0(seq_num + 1); + std::vector y_lod_0(seq_num + 1); + std::vector out_lod_0(seq_num + 1); + x_lod_0[0] = 0; + y_lod_0[0] = 0; + out_lod_0[0] = 0; + for (int i = 0; i < seq_num; i++) { + x_lod_0[i + 1] = x_lod_0[i] + x_batch_size; + y_lod_0[i + 1] = y_lod_0[i] + y_batch_size; + out_lod_0[i + 1] = out_lod_0[i] + out_batch_size; + } + LoD x_lod; + LoD y_lod; + LoD out_lod; + x_lod.push_back(x_lod_0); + y_lod.push_back(y_lod_0); + out_lod.push_back(out_lod_0); + DDim x_dims({static_cast(x_lod_0.back()), + static_cast(x_inner_size)}); + DDim y_dims({static_cast(y_lod_0.back()), + static_cast(y_inner_size)}); + DDim out_dims({static_cast(out_lod_0.back()), + 
static_cast(out_inner_size)}); + // prepare input&output tensors + Tensor x_dev, x_host, y_dev, y_host, out_dev, out_host, out_ref; + x_host.Resize(x_dims); + y_host.Resize(y_dims); + out_host.Resize(out_dims); + x_dev.Resize(x_dims); + y_dev.Resize(y_dims); + out_dev.Resize(out_dims); + out_ref.Resize(out_dims); + x_host.set_lod(x_lod); + y_host.set_lod(y_lod); + out_host.set_lod(out_lod); + x_dev.set_lod(x_lod); + y_dev.set_lod(y_lod); + out_dev.set_lod(out_lod); + out_ref.set_lod(out_lod); + auto out_dev_data = out_dev.mutable_data(TARGET(kCUDA)); + auto x_host_data = x_host.mutable_data(); + auto y_host_data = y_host.mutable_data(); + auto out_host_data = out_host.mutable_data(); + auto out_ref_data = out_ref.mutable_data(); + for (int i = 0; i < x_host.dims().production(); i++) { + x_host_data[i] = i * 0.125f; + } + for (int i = 0; i < y_host.dims().production(); i++) { + y_host_data[i] = i * 0.5f; + } + x_dev.Assign(x_host_data, + x_host.dims()); + y_dev.Assign(y_host_data, + y_host.dims()); + // prepare cuda context, initialize param, and run kernel + operators::MatMulParam param; + param.X = &x_dev; + param.Y = &y_dev; + param.Out = &out_dev; + param.alpha = alpha; + param.transpose_X = x_transpose; + param.transpose_Y = y_transpose; + std::unique_ptr ctx(new KernelContext); + auto& cuda_ctx = ctx->As(); + cuda_ctx.InitOnce(); + int dev_id = TargetWrapper::GetCurDevice(); + cuda_ctx.Init(dev_id); + SearchAlignedMatMulCompute search_aligned_mat_mul; + search_aligned_mat_mul.SetParam(param); + search_aligned_mat_mul.SetContext(std::move(ctx)); + search_aligned_mat_mul.Launch(); + cudaDeviceSynchronize(); + CopySync( + out_host_data, + out_dev_data, + sizeof(float) * out_dev.dims().production(), + IoDirection::DtoH); + // run reference + param.X = &x_host; + param.Y = &y_host; + param.Out = &out_ref; + search_aligned_mat_mul_compute_ref(param); + // verify result + for (int i = 0; i < out_ref.dims().production(); i++) { + EXPECT_NEAR(out_host_data[i], out_ref_data[i], 1e-5); + } + } + } + } + } + } + } + } +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/search_fc_compute.cu b/lite/kernels/cuda/search_fc_compute.cu new file mode 100644 index 0000000000000000000000000000000000000000..591e2474a475590e8c7d3882b4dfa8f5a55a3ab0 --- /dev/null +++ b/lite/kernels/cuda/search_fc_compute.cu @@ -0,0 +1,170 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "lite/core/op_registry.h" +#include "lite/kernels/cuda/search_fc_compute.h" +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { +template +static void anakin_NV_gemv(cublasHandle_t handle, + const bool TransA, + const int M, + const int N, + const T alpha, + const T* A, + const T* x, + const T beta, + T* y); +template <> +void anakin_NV_gemv(cublasHandle_t handle, + const bool TransA, + const int M, + const int N, + const float alpha, + const float* A, + const float* x, + const float beta, + float* y) { + cublasOperation_t cuTransA = (TransA == false) ? CUBLAS_OP_T : CUBLAS_OP_N; + CUBLAS_CHECK( + cublasSgemv(handle, cuTransA, N, M, &alpha, A, N, x, 1, &beta, y, 1)); +} +template +static void anakin_NV_gemm(cublasHandle_t handle, + const bool TransA, + const bool TransB, + const int M, + const int N, + const int K, + const T alpha, + const T* A, + const T* B, + const T beta, + T* C); + +template <> +void anakin_NV_gemm(cublasHandle_t handle, + const bool TransA, + const bool TransB, + const int M, + const int N, + const int K, + const float alpha, + const float* A, + const float* B, + const float beta, + float* C) { + // Note that cublas follows fortran order. + int lda = (!TransA /* == CblasNoTrans*/) ? K : M; + int ldb = (!TransB /* == CblasNoTrans*/) ? N : K; + cublasOperation_t cuTransA = + (!TransA /* == CblasNoTrans*/) ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cuTransB = + (!TransB /* == CblasNoTrans*/) ? CUBLAS_OP_N : CUBLAS_OP_T; + CUBLAS_CHECK(cublasSgemm(handle, + cuTransB, + cuTransA, + N, + M, + K, + &alpha, + B, + ldb, + A, + lda, + &beta, + C, + N)); +} + +template <> +void anakin_NV_gemm(cublasHandle_t handle, + const bool TransA, + const bool TransB, + const int M, + const int N, + const int K, + const char alpha, + const char* A, + const char* B, + const char beta, + char* C) { + LOG(FATAL) << "int8 gemm is not implemented"; +} + +template +static __global__ void add_bias(int n, + int output_size, + const T* bias, + T* dout) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + int bias_index = index % output_size; + if (index < n) { + dout[index] = dout[index] + bias[bias_index]; + } +} + +template +void SearchFcCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->template As(); + auto stream = ctx.exec_stream(); + const Tensor* x_tensor = param.X; + param.Out->Resize({x_tensor->dims()[0], param.out_size}); + _M = x_tensor->dims().count(0, 1); + _K = x_tensor->dims().count(1, x_tensor->numel()); + _N = param.out_size; + const T* din = x_tensor->data(); + Tensor* out_tensor = param.Out; + T* dout = out_tensor->mutable_data(TARGET(kCUDA)); + const Tensor* w_tensor = param.W; + const T* weight = w_tensor->data(); + const Tensor* b_tensor = param.b; + const T* bias = b_tensor->data(); + cublasCreate(&_handle); + if (_M == 1 && _K > 50000) { + anakin_NV_gemv(_handle, false, _N, _K, (T)1, weight, din, (T)0, dout); + } else { + anakin_NV_gemm(_handle, + false, + !_flag_trans_weights, + _M, + _N, + _K, + (T)1, + din, + weight, + (T)0, + dout); + } + int total_size = _M * _N; + add_bias<<>>( + total_size, _N, bias, dout); +} +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(search_fc, + kCUDA, + kFloat, + kNCHW, + paddle::lite::kernels::cuda::SearchFcCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindInput("W", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindInput("b", 
{LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .Finalize(); diff --git a/lite/kernels/cuda/search_fc_compute.h b/lite/kernels/cuda/search_fc_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..db09362734ecdb05663a5a6d4297ab869cb1b55d --- /dev/null +++ b/lite/kernels/cuda/search_fc_compute.h @@ -0,0 +1,52 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include "lite/backends/cuda/cuda_utils.h" +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +const int CUDA_NUM_THREADS = 512; +inline int CUDA_GET_BLOCKS(const int N) { + return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; +} +inline int CUDA_GET_BLOCKS(const int N, const int base) { + return (N + base - 1) / base; +} + +template +class SearchFcCompute : public KernelLite { + public: + using param_t = operators::SearchFcParam; + void Run() override; + virtual ~SearchFcCompute() = default; + + private: + bool _flag_trans_weights{false}; + int _M; + int _K; + int _N; + cublasHandle_t _handle; + bool _is_continue_buf{true}; +}; + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/search_fc_compute_test.cc b/lite/kernels/cuda/search_fc_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..f06028fbe15557c652c442ac436fa09700a56e28 --- /dev/null +++ b/lite/kernels/cuda/search_fc_compute_test.cc @@ -0,0 +1,110 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/cuda/search_fc_compute.h" +#include +#include +#include +#include + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +void fc_cpu_base(const lite::Tensor* X, + const lite::Tensor* W, + const lite::Tensor* b, + int out_size, + lite::Tensor* Out) { + const float* data_in = X->data(); + const float* bias = b->data(); + const float* weights = W->data(); + float* data_out = Out->mutable_data(); + int out_rows = X->dims()[0]; + int in_cols = X->numel() / out_rows; + int out_cols = W->numel() / in_cols; + int index_out; + + for (int i = 0; i < out_rows; i++) { + for (int j = 0; j < out_cols; j++) { + index_out = i * out_cols + j; + data_out[index_out] = bias ? 
bias[j] : 0; + + for (int k = 0; k < in_cols; k++) { + data_out[index_out] += + data_in[i * in_cols + k] * weights[j * in_cols + k]; + } + } + } +} + +TEST(search_fc, normal) { + SearchFcCompute search_fc_kernel; + std::unique_ptr ctx(new KernelContext); + auto& context = ctx->As(); + operators::SearchFcParam param; + lite::Tensor X, X_gpu, W, W_gpu, b, b_gpu; + lite::Tensor Out, Out_cpu, out_ref; + std::vector x_shape{1, 4}; + X.Resize(lite::DDim(x_shape)); + std::vector w_shape{3, 4}; + W.Resize(lite::DDim(w_shape)); + std::vector b_shape{3}; + b.Resize(lite::DDim(b_shape)); + std::vector out_shape{1, 4}; + Out.Resize(lite::DDim(out_shape)); + out_ref.Resize(lite::DDim(out_shape)); + auto x_data = X.mutable_data(); + auto w_data = W.mutable_data(); + auto b_data = b.mutable_data(); + auto out_data_ref = out_ref.mutable_data(); + for (int64_t i = 0; i < X.dims().production(); i++) { + x_data[i] = static_cast(i); + } + for (int64_t i = 0; i < W.dims().production(); i++) { + w_data[i] = static_cast(i); + } + for (int64_t i = 0; i < b.dims().production(); i++) { + b_data[i] = static_cast(i); + } + X_gpu.Assign(x_data, X.dims()); + W_gpu.Assign(w_data, W.dims()); + b_gpu.Assign(b_data, b.dims()); + param.X = &X_gpu; + param.W = &W_gpu; + param.b = &b_gpu; + param.out_size = 4; + param.Out = &Out; + search_fc_kernel.SetParam(param); + cudaStream_t stream; + cudaStreamCreate(&stream); + context.SetExecStream(stream); + search_fc_kernel.SetContext(std::move(ctx)); + search_fc_kernel.Run(); + fc_cpu_base(&X, &W, &b, 4, &out_ref); + cudaDeviceSynchronize(); + const float* out_data = Out.data(); + float* out_cpu_data = Out_cpu.mutable_data(); + CopySync( + out_cpu_data, out_data, sizeof(float) * Out.numel(), IoDirection::DtoH); + for (int i = 0; i < Out.numel(); ++i) { + EXPECT_NEAR(out_cpu_data[i], out_data_ref[i], 1e-5); + } +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/search_grnn_compute.cu b/lite/kernels/cuda/search_grnn_compute.cu new file mode 100644 index 0000000000000000000000000000000000000000..468b66e5680c7d0e5879def9a888e10faa0bca32 --- /dev/null +++ b/lite/kernels/cuda/search_grnn_compute.cu @@ -0,0 +1,351 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include "lite/core/op_registry.h" +#include "lite/kernels/cuda/search_grnn_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { +using Tensor = lite::Tensor; + +template +T sigmoid(T z) { + return 1 / (1 + std::exp(-z)); +} + +template +__global__ void PreComputeKernel( + const int num, const T* w_x_e, const T* wz_x_e, T* tilde, T* z, T* hidden) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < num) { + tilde[index] = std::tanh(w_x_e[index]); + z[index] = 1 / (1 + std::exp(-wz_x_e[index])); + hidden[index] = (1. 
- z[index]) * tilde[index]; + } +} + +template +__global__ void PostComputeKernel(const int start, + const int end, + const int cap_h, + const int w_tm1, + const T* wr_x_e, + const T* ur_x_h, + const T* wz_x_e, + const T* uz_x_h, + const T* w_x_e, + const T* u_x_h, + T* r, + T* z, + T* tilde, + T* hidden) { + int j = start + blockIdx.x * blockDim.x + threadIdx.x; + if (j < end) { + r[j] = 1 / (1 + std::exp(-(wr_x_e[j] + ur_x_h[j]))); + z[j] = 1 / (1 + std::exp(-(wz_x_e[j] + uz_x_h[j]))); + tilde[j] = std::tanh(w_x_e[j] + r[j] * u_x_h[j]); + hidden[j] = z[j] * hidden[j - cap_h * w_tm1] + (1.0 - z[j]) * tilde[j]; + } +} + +void SearchGrnnCompute::PrepareForRun() { + gemm_impl_.reset(new lite::cuda::math::Gemm); +} + +void SearchGrnnCompute::PrepareLayout(const Tensor* input_blob) { + auto& param = this->Param(); + auto& context = this->ctx_->template As(); + auto cuda_stream = context.exec_stream(); + + auto* _input = input_blob; + int dim0 = _input->dims()[0]; + int dim1 = 1; + if (_input->dims().size() > 1) { + dim1 = _input->dims()[1]; + } + int batch = _input->lod()[0].size() - 1; + auto& offset = _input->lod()[0]; + + idx_sorted_by_width_cpu = std::make_shared(); + idx_sorted_by_width_cpu->Resize({batch}); + int* idx_sorted_by_width_cpu_data = + idx_sorted_by_width_cpu->mutable_data(); + + Tensor _width; + _width.Resize({batch}); + int* width_data = _width.mutable_data(); + // sort sequence by width (descending) and find the largest width in the + // batch + for (int i = 0; i < batch; i++) { + width_data[i] = offset[i + 1] - offset[i]; + idx_sorted_by_width_cpu_data[i] = i; + } + std::sort(idx_sorted_by_width_cpu_data, + idx_sorted_by_width_cpu_data + batch, + [&_width](int a, int b) { + return _width.data()[a] > _width.data()[b]; + }); + int max_width = width_data[idx_sorted_by_width_cpu_data[0]]; + + // start of reorganizing the input + std::vector new_offset; + new_offset.resize(max_width + 1); + new_offset[0] = 0; + int j = batch - 1; + int last_width = 0; + int sub_row = 0; + int sub_col = 0; + + for (int i = 1; i <= max_width;) { + for (int k = j; k >= 0; --k) { + if (width_data[idx_sorted_by_width_cpu_data[k]] > last_width) { + sub_row = width_data[idx_sorted_by_width_cpu_data[k]] - last_width; + sub_col = k + 1; + for (int s = 0; s < sub_row; s++) { + new_offset[i] = new_offset[i - 1] + sub_col; + i++; + } + // move on + last_width = width_data[idx_sorted_by_width_cpu_data[k]]; + j = k - 1; + break; + } + } + } + + // copying to the reorganized buffer + auto* _layout_input = new Tensor(); + auto* _layout_input_gpu = param.layout_input; + if (_input->dims().size() == 1) { + // _layout_input.reshape_batch_sequence({dim0}, new_offset); + LOG(FATAL) << "_input->dims().size() = 1, error."; + } else { + // _layout_input.reshape_batch_sequence({dim0, dim1}, new_offset); + LoD new_lod; + new_lod.push_back(new_offset); + _layout_input->set_lod(new_lod); + _layout_input->Resize({dim0, dim1}); + _layout_input_gpu->set_lod(new_lod); + _layout_input_gpu->Resize({dim0, dim1}); + } + + auto* new_emb = _layout_input->mutable_data(); + auto* input_cpu = new Tensor(); + input_cpu->Resize(_input->dims()); + auto* input_cpu_data = input_cpu->mutable_data(); + TargetW::MemcpyAsync(input_cpu_data, + _input->data(), + _input->numel() * sizeof(float), + IoDirection::DtoH, + cuda_stream); + for (int i = 0; i < max_width; i++) { + int w = new_offset[i + 1] - new_offset[i]; + auto* emb_start = new_emb + dim1 * new_offset[i]; + for (int j = 0; j < w; ++j) { + memcpy(emb_start + dim1 * j, + input_cpu_data + 
dim1 * offset[idx_sorted_by_width_cpu_data[j]] + + dim1 * i, + dim1 * sizeof(float)); + } + } + + auto* _layout_input_gpu_data = + _layout_input_gpu->mutable_data(TARGET(kCUDA)); + TargetW::MemcpyAsync(_layout_input_gpu_data, + new_emb, + _layout_input->numel() * sizeof(float), + IoDirection::HtoD, + cuda_stream); + delete _layout_input; + delete input_cpu; +} + +void SearchGrnnCompute::CopyBack(float* from, float* to, int step) { + auto& param = this->Param(); + auto& context = this->ctx_->template As(); + auto stream = context.exec_stream(); + auto* _input = param.x; + auto* _layout_input = param.layout_input; + + const auto& offset = _input->lod()[0]; + const auto& new_offset = _layout_input->lod()[0]; + const auto* idx_sorted_by_width_cpu_data = + idx_sorted_by_width_cpu->data(); + for (size_t i = 0; i < _layout_input->lod()[0].size() - 1; ++i) { + int w = new_offset[i + 1] - new_offset[i]; + for (int j = 0; j < w; j++) { + TargetW::MemcpyAsync( + to + step * (offset[idx_sorted_by_width_cpu_data[j]] + i), + from + (new_offset[i] + j) * step, + step * sizeof(float), + IoDirection::DtoD, + stream); + } + } +} + +void SearchGrnnCompute::Run() { + CHECK(ctx_) << "running context should be set first"; + auto& param = this->Param(); + auto& context = this->ctx_->template As(); + auto stream = context.exec_stream(); + + auto* bottom = param.x; + auto* wi = param.wi; + auto* wh = param.wh; + auto* top = param.out; + auto* _buffer = param.tmp_buffer; + int _cap_h = param.num_hidden; + int _cap_e = param.num_input; + + int _cap_l = bottom->dims()[0]; + int batch = bottom->lod()[0].size() - 1; + + const auto& offset = bottom->lod()[0]; + LoD top_lod; + top_lod.push_back(offset); + top->set_lod(top_lod); + std::vector top_dims_vec{_cap_l, _cap_h}; + top->Resize(top_dims_vec); + auto* top_hidden = top->mutable_data(TARGET(kCUDA)); + + const auto* dense_e2h = wi->data(); + const auto* dense_h2h = wh->data(); + + const auto* e2h = dense_e2h; + const auto* e2hr = dense_e2h + 1 * _cap_e * _cap_h; + const auto* e2hz = dense_e2h + 2 * _cap_e * _cap_h; + const auto* h2h = dense_h2h; + const auto* h2hr = dense_h2h + 1 * _cap_h * _cap_h; + const auto* h2hz = dense_h2h + 2 * _cap_h * _cap_h; + + PrepareLayout(bottom); + + auto* _layout_input = param.layout_input; + auto* new_emb = _layout_input->data(); + const auto& new_offset = _layout_input->lod()[0]; + int max_width = _layout_input->lod()[0].size() - 1; + + // this buffer is used for book keeping info which will be used in bp + // buffer also needed in bp, so make it larger + _buffer->Resize({20, _cap_l, _cap_h}); + auto* buffer_data = _buffer->mutable_data(TARGET(kCUDA)); + auto* w_x_e = buffer_data + 0 * _cap_l * _cap_h; + auto* wr_x_e = buffer_data + 1 * _cap_l * _cap_h; + auto* wz_x_e = buffer_data + 2 * _cap_l * _cap_h; + auto* u_x_h = buffer_data + 3 * _cap_l * _cap_h; + auto* ur_x_h = buffer_data + 4 * _cap_l * _cap_h; + auto* uz_x_h = buffer_data + 5 * _cap_l * _cap_h; + auto* r = buffer_data + 6 * _cap_l * _cap_h; + auto* z = buffer_data + 7 * _cap_l * _cap_h; + auto* tilde = buffer_data + 8 * _cap_l * _cap_h; + // the internal hidden + auto* hidden = buffer_data + 19 * _cap_l * _cap_h; + + gemm_impl_->init(false, true, _cap_l, _cap_h, _cap_e, &context); + gemm_impl_->run(1.0f, 0.0f, new_emb, e2h, w_x_e, &context); + gemm_impl_->init(false, true, _cap_l, _cap_h, _cap_e, &context); + gemm_impl_->run(1.0f, 0.0f, new_emb, e2hr, wr_x_e, &context); + gemm_impl_->init(false, true, _cap_l, _cap_h, _cap_e, &context); + gemm_impl_->run(1.0f, 0.0f, 
new_emb, e2hz, wz_x_e, &context); + + // precompute hidden0 + int num = batch * _cap_h; + int threads = 512; + int blocks = (num + threads - 1) / threads; + PreComputeKernel<<>>( + num, w_x_e, wz_x_e, tilde, z, hidden); + + // recurrence + for (int i = 1; i < max_width; i++) { + int w_tm1 = new_offset[i] - new_offset[i - 1]; + int w = new_offset[i + 1] - new_offset[i]; + + // precompute hidden i-1 to hidden i + auto* htm1 = hidden + new_offset[i - 1] * _cap_h; + + gemm_impl_->init(false, true, w, _cap_h, _cap_h, &context); + gemm_impl_->run( + 1.0f, 0.0f, htm1, h2h, u_x_h + new_offset[i] * _cap_h, &context); + gemm_impl_->init(false, true, w, _cap_h, _cap_h, &context); + gemm_impl_->run( + 1.0f, 0.0f, htm1, h2hr, ur_x_h + new_offset[i] * _cap_h, &context); + gemm_impl_->init(false, true, w, _cap_h, _cap_h, &context); + gemm_impl_->run( + 1.0f, 0.0f, htm1, h2hz, uz_x_h + new_offset[i] * _cap_h, &context); + + // compute the gate and hidden + int start = new_offset[i] * _cap_h; + int end = (new_offset[i] + w) * _cap_h; + PostComputeKernel<<>>(start, + end, + _cap_h, + w_tm1, + wr_x_e, + ur_x_h, + wz_x_e, + uz_x_h, + w_x_e, + u_x_h, + r, + z, + tilde, + hidden); + } + + CopyBack(hidden, top_hidden, _cap_h); +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(search_grnn, + kCUDA, + kFloat, + kNCHW, + paddle::lite::kernels::cuda::SearchGrnnCompute, + def) + .BindInput("X", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindInput("Wi", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindInput("Wh", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindOutput("tmp_buffer", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindOutput("idx_sorted_by_width", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindOutput("layout_input", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .Finalize(); diff --git a/lite/kernels/cuda/search_grnn_compute.h b/lite/kernels/cuda/search_grnn_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..73d84635d06f578f68bd844fe275d99595e70fc8 --- /dev/null +++ b/lite/kernels/cuda/search_grnn_compute.h @@ -0,0 +1,46 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
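PreComputeKernel and PostComputeKernel above implement a GRU-style cell over the width-sorted batch layout. The scalar sketch below restates the gate equations for a single hidden unit; the names are illustrative, and the projections (W·e, U·h) are assumed to be precomputed, as they are in the kernel via the GEMM calls.

// Reference only: one GRU-style update per hidden unit.
#include <cmath>

struct GrnnGates {
  float w_x, wr_x, wz_x;  // W*e, Wr*e, Wz*e for the current word
  float u_h, ur_h, uz_h;  // U*h_prev, Ur*h_prev, Uz*h_prev
};

static inline float sigmoidf(float v) { return 1.f / (1.f + std::exp(-v)); }

// First step: the previous hidden state is implicitly zero (PreComputeKernel).
static inline float grnn_step0(float w_x, float wz_x) {
  float tilde = std::tanh(w_x);
  float z = sigmoidf(wz_x);
  return (1.f - z) * tilde;
}

// Subsequent steps (PostComputeKernel).
static inline float grnn_step(const GrnnGates& g, float h_prev) {
  float r = sigmoidf(g.wr_x + g.ur_h);
  float z = sigmoidf(g.wz_x + g.uz_h);
  float tilde = std::tanh(g.w_x + r * g.u_h);
  return z * h_prev + (1.f - z) * tilde;
}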
+ +#pragma once +#include +#include "lite/backends/cuda/blas.h" +#include "lite/backends/cuda/math/gemm.h" +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +class SearchGrnnCompute + : public KernelLite { + public: + using param_t = operators::SearchGrnnParam; + using TargetW = TargetWrapper; + + void PrepareForRun() override; + void Run() override; + virtual ~SearchGrnnCompute() = default; + + private: + std::shared_ptr idx_sorted_by_width_cpu; + std::unique_ptr> gemm_impl_; + void PrepareLayout(const Tensor* input); + void CopyBack(float* from, float* to, int step); +}; + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/search_grnn_compute_test.cc b/lite/kernels/cuda/search_grnn_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..08b96e1f1ecd57d10099b9566a5c0cd5e6e885d1 --- /dev/null +++ b/lite/kernels/cuda/search_grnn_compute_test.cc @@ -0,0 +1,103 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/cuda/search_grnn_compute.h" +#include +#include +#include +#include +#include "lite/api/test_helper.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +using Tensor = lite::Tensor; + +TEST(search_grnn, normal) { + std::unique_ptr ctx(new KernelContext); + auto& context = ctx->As(); + + SearchGrnnCompute kernel; + operators::SearchGrnnParam param; + + int num_input = 6; + int num_hidden = 6; + int num_batch = 3; + Tensor x, wi, wh, out, idx_sorted_by_width, layout_input, tmp_buffer; + x.Resize({num_batch, num_input}); + wi.Resize({3, num_hidden, num_input}); + wh.Resize({3, num_hidden, num_hidden}); + LoD x_lod{}; + x_lod.push_back({0, 1, 3}); + x.set_lod(x_lod); + + Tensor x_cpu, wi_cpu, wh_cpu, out_cpu, layout_input_cpu, tmp_buffer_cpu; + x_cpu.Resize({num_batch, num_input}); + wi_cpu.Resize({3, num_hidden, num_input}); + wh_cpu.Resize({3, num_hidden, num_hidden}); + out_cpu.Resize({num_batch, num_hidden}); + layout_input_cpu.Resize({num_batch, num_input}); + tmp_buffer_cpu.Resize({20, num_batch, num_hidden}); + auto* x_cpu_data = x_cpu.mutable_data(); + for (int i = 0; i < x_cpu.numel(); ++i) { + x_cpu_data[i] = static_cast(i); + } + auto* wi_cpu_data = wi_cpu.mutable_data(); + for (int i = 0; i < wi_cpu.numel(); ++i) { + wi_cpu_data[i] = static_cast(i); + } + auto* wh_cpu_data = wh_cpu.mutable_data(); + for (int i = 0; i < wh_cpu.numel(); ++i) { + wh_cpu_data[i] = static_cast(i); + } + + x.Assign(x_cpu_data, x_cpu.dims()); + wi.Assign(wi_cpu_data, wi_cpu.dims()); + wh.Assign(wh_cpu_data, wh_cpu.dims()); + + param.x = &x; + param.wi = &wi; + param.wh = &wh; + param.out = &out; + param.idx_sorted_by_width = &idx_sorted_by_width; + param.layout_input = &layout_input; + param.tmp_buffer = &tmp_buffer; + param.num_input = num_input; + param.num_hidden = num_hidden; + kernel.SetParam(param); + + 
cudaStream_t stream; + cudaStreamCreate(&stream); + context.SetExecStream(stream); + kernel.SetContext(std::move(ctx)); + kernel.Launch(); + cudaDeviceSynchronize(); + + auto* out_cpu_data = out_cpu.mutable_data(); + auto* out_data = out.mutable_data(TARGET(kCUDA)); + CopySync( + out_cpu_data, out_data, sizeof(float) * out.numel(), IoDirection::DtoH); + LOG(INFO) << "out_data:"; + for (int i = 0; i < out.numel(); i++) { + // EXPECT_NEAR(out_cpu_data[i], ref_results[i], 1e-5); + LOG(INFO) << out_cpu_data[i]; + } +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/search_group_padding_compute.cu b/lite/kernels/cuda/search_group_padding_compute.cu new file mode 100644 index 0000000000000000000000000000000000000000..697e53dbb68b09bec6c32ece73723d469a5cd9d6 --- /dev/null +++ b/lite/kernels/cuda/search_group_padding_compute.cu @@ -0,0 +1,164 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include "lite/backends/cuda/cuda_utils.h" +#include "lite/core/op_registry.h" +#include "lite/kernels/cuda/search_group_padding_compute.h" + +#define CUDA_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ + i += blockDim.x * gridDim.x) + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { +using Tensor = lite::Tensor; + +template +__global__ void ker_search_group_padding(Dtype* out_emb_padding_data, + Dtype* out_padding_data, + const Dtype* in_data, + const uint64_t* offset, + const int seq_num, + const int max_len, + const int emb_size, + const Dtype pad_id, + const int count) { + CUDA_KERNEL_LOOP(tid, count) { + int emb_id = tid % emb_size; + int word_id = tid / emb_size; + int seq_id = word_id / max_len; + int word_id_in_seq = word_id % max_len; + int cur_len = offset[seq_id + 1] - offset[seq_id]; + if (word_id_in_seq < cur_len) { + out_emb_padding_data[tid] = + in_data[(offset[seq_id] + word_id_in_seq) * emb_size + emb_id]; + } else { + out_emb_padding_data[tid] = 0.f; + if (emb_id == 0) { + out_padding_data[word_id] = pad_id; + } + } + } +} + +void SearchGroupPaddingCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->template As(); + auto cuda_stream = ctx.exec_stream(); + + const Tensor* x = param.x; + Tensor* out_emb_padding = param.out_emb_padding; + Tensor* out_new = param.out_new; + Tensor* out_padding = param.out_padding; + const float pad_id = static_cast(param.pad_id); + const float* in_data = x->data(); + const auto& in_seq_offset = x->lod()[0]; + int batch = in_seq_offset.size() - 1; + int max_seq = 0; + for (int i = 0; i < batch; ++i) { + if (in_seq_offset[i + 1] - in_seq_offset[i] > max_seq) { + max_seq = in_seq_offset[i + 1] - in_seq_offset[i]; + } + } + std::vector new_offset; + new_offset.resize(batch + 1); + for (int i = 0; i < batch + 1; ++i) { + new_offset[i] = i * max_seq; + } + std::vector x_dims = x->dims().Vectorize(); + LoD out_emb_padding_lod; + 
out_emb_padding_lod.push_back(new_offset); + out_emb_padding->set_lod(out_emb_padding_lod); + out_emb_padding->Resize({batch * max_seq, x_dims[1]}); + float* out_emb_padding_data = + out_emb_padding->mutable_data(TARGET(kCUDA)); + + LoD out_new_lod; + out_new_lod.push_back(in_seq_offset); + out_new->set_lod(out_new_lod); + out_new->Resize({x_dims[0], 1}); + float* out_new_data = out_new->mutable_data(TARGET(kCUDA)); + + LoD out_padding_lod; + out_padding_lod.push_back(new_offset); + out_padding->set_lod(out_padding_lod); + out_padding->Resize({batch * max_seq, 1}); + float* out_padding_data = out_padding->mutable_data(TARGET(kCUDA)); + + const int count = out_emb_padding->numel(); + const auto& out_emb_padding_seq_offset = out_emb_padding->lod()[0]; + int max_len = out_emb_padding_seq_offset[1]; + int seq_num = out_emb_padding_seq_offset.size() - 1; + int emb_size = x->dims()[1]; + _in_seq_offset.Resize({seq_num + 1, 1, 1, 1}); + uint64_t* offset_data = _in_seq_offset.mutable_data(TARGET(kCUDA)); + + TargetWrapperCuda::MemcpyAsync(offset_data, + in_seq_offset.data(), + sizeof(uint64_t) * in_seq_offset.size(), + IoDirection::HtoD, + cuda_stream); + + TargetWrapperCuda::MemsetSync( + out_new_data, 0, out_new->dims()[0] * out_new->dims()[1] * sizeof(float)); + TargetWrapperCuda::MemsetSync( + out_padding_data, + 0, + out_padding->dims()[0] * out_padding->dims()[1] * sizeof(float)); + + ker_search_group_padding< + float><<>>( + out_emb_padding_data, + out_padding_data, + in_data, + offset_data, + seq_num, + max_len, + emb_size, + pad_id, + count); + + cudaError_t error = cudaGetLastError(); + if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error); +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(search_group_padding, + kCUDA, + kFloat, + kNCHW, + paddle::lite::kernels::cuda::SearchGroupPaddingCompute, + def) + .BindInput("X", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindOutput("Out_emb_padding", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindOutput("Out_new", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindOutput("Out_padding", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .Finalize(); diff --git a/lite/kernels/cuda/search_group_padding_compute.h b/lite/kernels/cuda/search_group_padding_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..88391e6d652b92571d11b321f12288155665d9da --- /dev/null +++ b/lite/kernels/cuda/search_group_padding_compute.h @@ -0,0 +1,38 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
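For reference, a host-side sketch of what ker_search_group_padding produces: each sequence is stretched to max_seq rows, real words are copied through, padded rows are zero-filled, and out_padding marks the padded positions with pad_id. Variable names are illustrative; the kernel computes the same mapping element-wise on the GPU.

// Reference only, mirroring the CUDA kernel's indexing.
#include <algorithm>
#include <cstdint>
#include <vector>

static void group_padding_ref(const std::vector<float>& x,          // [total_words * emb]
                              const std::vector<uint64_t>& offset,  // LoD, size = batch + 1
                              int emb, float pad_id,
                              std::vector<float>* out_emb,          // [batch * max_seq * emb]
                              std::vector<float>* out_padding) {    // [batch * max_seq]
  const int batch = static_cast<int>(offset.size()) - 1;
  int max_seq = 0;
  for (int i = 0; i < batch; ++i)
    max_seq = std::max<int>(max_seq, static_cast<int>(offset[i + 1] - offset[i]));
  out_emb->assign(static_cast<size_t>(batch) * max_seq * emb, 0.f);
  out_padding->assign(static_cast<size_t>(batch) * max_seq, 0.f);
  for (int i = 0; i < batch; ++i) {
    const int len = static_cast<int>(offset[i + 1] - offset[i]);
    for (int w = 0; w < max_seq; ++w) {
      float* dst = out_emb->data() + (static_cast<size_t>(i) * max_seq + w) * emb;
      if (w < len) {
        const float* src = x.data() + (offset[i] + w) * emb;
        std::copy(src, src + emb, dst);      // real word: copy embedding through
      } else {
        (*out_padding)[static_cast<size_t>(i) * max_seq + w] = pad_id;  // padded slot
      }
    }
  }
}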
+ +#pragma once +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +class SearchGroupPaddingCompute + : public KernelLite { + public: + using param_t = operators::SearchGroupPaddingParam; + + void Run() override; + virtual ~SearchGroupPaddingCompute() = default; + + private: + lite::Tensor _in_seq_offset; +}; + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/search_group_padding_compute_test.cc b/lite/kernels/cuda/search_group_padding_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..b831780c876dcc9d910cbf48a66bf0d1ec7a5bb2 --- /dev/null +++ b/lite/kernels/cuda/search_group_padding_compute_test.cc @@ -0,0 +1,127 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/cuda/search_group_padding_compute.h" +#include +#include +#include +#include +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +TEST(search_group_padding_cuda, run_test) { + std::unique_ptr ctx(new KernelContext); + auto& context = ctx->As(); + + lite::Tensor x, x_cpu, x_ref; + lite::Tensor out_emb_padding, out_emb_padding_cpu, out_emb_padding_ref; + lite::Tensor out_new, out_new_cpu, out_new_ref; + lite::Tensor out_padding, out_padding_cpu, out_padding_ref; + + int x_dims0 = 2; + int x_dims1 = 3; + + x.Resize({x_dims0, x_dims1}); + x_cpu.Resize({x_dims0, x_dims1}); + x_ref.Resize({x_dims0, x_dims1}); + out_emb_padding.Resize({1, x_dims1}); + out_emb_padding_cpu.Resize({1, x_dims1}); + out_emb_padding_ref.Resize({1, x_dims1}); + out_new.Resize({x_dims0, 1}); + out_new_cpu.Resize({x_dims0, 1}); + out_new_ref.Resize({x_dims0, 1}); + out_padding.Resize({1, 1}); + out_padding_cpu.Resize({1, 1}); + out_padding_ref.Resize({1, 1}); + + LoD x_lod{}; + x_lod.push_back({0, 1}); + x.set_lod(x_lod); + + auto* x_cpu_data = x_cpu.mutable_data(); + auto* x_ref_data = x_ref.mutable_data(); + auto* out_emb_padding_data = + out_emb_padding.mutable_data(TARGET(kCUDA)); + auto* out_emb_padding_cpu_data = out_emb_padding_cpu.mutable_data(); + auto* out_emb_padding_ref_data = out_emb_padding_ref.mutable_data(); + auto* out_new_data = out_new.mutable_data(TARGET(kCUDA)); + auto* out_new_cpu_data = out_new_cpu.mutable_data(); + auto* out_new_ref_data = out_new_ref.mutable_data(); + auto* out_padding_data = out_padding.mutable_data(TARGET(kCUDA)); + auto* out_padding_cpu_data = out_padding_cpu.mutable_data(); + auto* out_padding_ref_data = out_padding_ref.mutable_data(); + + for (int64_t i = 0; i < x_cpu.dims().production(); i++) { + x_cpu_data[i] = static_cast(i); + x_ref_data[i] = static_cast(i); + } + x.Assign(x_cpu_data, x_cpu.dims()); + out_emb_padding_ref_data[0] = 0.f; + out_emb_padding_ref_data[1] = 1.f; + out_emb_padding_ref_data[2] = 2.f; + out_new_ref_data[0] = 0.f; + out_new_ref_data[1] = 0.f; + 
out_padding_ref_data[0] = 0.f; + + SearchGroupPaddingCompute sgp_kernel; + operators::SearchGroupPaddingParam param; + + param.x = &x; + param.out_emb_padding = &out_emb_padding; + param.out_new = &out_new; + param.out_padding = &out_padding; + + sgp_kernel.SetParam(param); + + cudaStream_t stream; + cudaStreamCreate(&stream); + context.SetExecStream(stream); + sgp_kernel.SetContext(std::move(ctx)); + sgp_kernel.Launch(); + cudaDeviceSynchronize(); + + CopySync(out_emb_padding_cpu_data, + out_emb_padding_data, + sizeof(float) * out_emb_padding.numel(), + IoDirection::DtoH); + CopySync(out_new_cpu_data, + out_new_data, + sizeof(float) * out_new.numel(), + IoDirection::DtoH); + CopySync(out_padding_cpu_data, + out_padding_data, + sizeof(float) * out_padding.numel(), + IoDirection::DtoH); + + for (int i = 0; i < out_emb_padding_cpu.dims().production(); i++) { + EXPECT_NEAR(out_emb_padding_cpu_data[i], out_emb_padding_ref_data[i], 1e-5); + } + for (int i = 0; i < out_new_cpu.dims().production(); i++) { + EXPECT_NEAR(out_new_cpu_data[i], out_new_ref_data[i], 1e-5); + } + for (int i = 0; i < out_padding_cpu.dims().production(); i++) { + EXPECT_NEAR(out_padding_cpu_data[i], out_padding_ref_data[i], 1e-5); + } +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(search_group_padding, kCUDA, kFloat, kNCHW, def); diff --git a/lite/kernels/cuda/search_seq_depadding_compute.cu b/lite/kernels/cuda/search_seq_depadding_compute.cu new file mode 100644 index 0000000000000000000000000000000000000000..ecadceab582ccebf765ef43edda49ed414354611 --- /dev/null +++ b/lite/kernels/cuda/search_seq_depadding_compute.cu @@ -0,0 +1,115 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include +#include "lite/core/op_registry.h" +#include "lite/kernels/cuda/search_seq_depadding_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { +using Tensor = lite::Tensor; + +#define CUDA_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ + i += blockDim.x * gridDim.x) + +template +__global__ void ker_sequence_depadding_fwd(Dtype* out_data, + const Dtype* in_data, + const int* seq_id_map, + const int seq_num, + const int max_len, + const int emb_size, + const int count) { + CUDA_KERNEL_LOOP(tid, count) { + int emb_id = tid % emb_size; + int word_id = tid / emb_size; + int seq_id = seq_id_map[word_id]; + out_data[tid] = in_data[seq_id * emb_size + emb_id]; + } +} + +void SearchSeqDepaddingCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->template As(); + auto cuda_stream = ctx.exec_stream(); + + auto* pad = param.pad; + auto* src = param.src; + auto* out = param.out; + + auto* in_data = pad->data(); + out->Resize({src->dims()[0], pad->dims()[1]}); + auto* out_data = out->mutable_data(TARGET(kCUDA)); + const int count = out->numel(); + + const auto& pad_seq_offset = pad->lod()[0]; + const auto& src_seq_offset = src->lod()[0]; + int max_len = pad_seq_offset[1]; + int seq_num = pad_seq_offset.size() - 1; + int emb_size = pad->dims()[1]; + + LoD out_lod; + out_lod.push_back(src_seq_offset); + out->set_lod(out_lod); + std::vector seq_id_map; + for (int i = 0; i < seq_num; i++) { + int cur_len = src_seq_offset[i + 1] - src_seq_offset[i]; + for (int j = 0; j < cur_len; j++) { + seq_id_map.push_back(i * max_len + j); + } + } + + int map_size = seq_id_map.size(); + seq_id_map_tensor.Resize({map_size, 1, 1, 1}); + int* seq_id_map_data = seq_id_map_tensor.mutable_data(TARGET(kCUDA)); + TargetW::MemcpyAsync(seq_id_map_data, + &seq_id_map[0], + seq_id_map.size() * sizeof(int), + IoDirection::HtoD, + cuda_stream); + + int threads = 512; + int blocks = (count + threads - 1) / threads; + ker_sequence_depadding_fwd<<>>( + out_data, in_data, seq_id_map_data, seq_num, max_len, emb_size, count); + + cudaError_t error = cudaGetLastError(); + if (error != cudaSuccess) LOG(ERROR) << cudaGetErrorString(error); +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(search_seq_depadding, + kCUDA, + kFloat, + kNCHW, + paddle::lite::kernels::cuda::SearchSeqDepaddingCompute, + def) + .BindInput("Src", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindInput("Pad", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .Finalize(); diff --git a/lite/kernels/cuda/search_seq_depadding_compute.h b/lite/kernels/cuda/search_seq_depadding_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..a06f39bee2d9078206ab05f7f5377a5598498620 --- /dev/null +++ b/lite/kernels/cuda/search_seq_depadding_compute.h @@ -0,0 +1,39 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +class SearchSeqDepaddingCompute + : public KernelLite { + public: + using param_t = operators::SearchSeqDepaddingParam; + using TargetW = TargetWrapper; + + void Run() override; + virtual ~SearchSeqDepaddingCompute() = default; + + private: + Tensor seq_id_map_tensor; +}; + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/search_seq_depadding_compute_test.cc b/lite/kernels/cuda/search_seq_depadding_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..9c23ff14ab7a27b53177b2d0e48710df55c59ae5 --- /dev/null +++ b/lite/kernels/cuda/search_seq_depadding_compute_test.cc @@ -0,0 +1,88 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
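+
+// Smoke test for the CUDA search_seq_depadding kernel: rows of the padded
+// input ("Pad") are gathered back into the compact layout described by the
+// "Src" LoD and compared against hand-computed expectations.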
+ +#include "lite/kernels/cuda/search_seq_depadding_compute.h" +#include +#include +#include +#include +#include "lite/api/test_helper.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +using Tensor = lite::Tensor; + +TEST(search_seq_depadding, normal) { + std::unique_ptr ctx(new KernelContext); + auto& context = ctx->As(); + + SearchSeqDepaddingCompute kernel; + operators::SearchSeqDepaddingParam param; + + Tensor pad, src, out; + pad.Resize({2 * 3, 4}); + src.Resize({3, 1}); + out.Resize({3, 4}); + LoD pad_lod{}; + pad_lod.push_back({0, 4, 6}); + pad.set_lod(pad_lod); + LoD src_lod{}; + src_lod.push_back({0, 2, 3}); + src.set_lod(src_lod); + + Tensor pad_cpu, src_cpu, out_cpu; + pad_cpu.Resize({2 * 3, 4}); + src_cpu.Resize({3, 1}); + out_cpu.Resize({3, 4}); + + auto* pad_cpu_data = pad_cpu.mutable_data(); + auto* src_cpu_data = src_cpu.mutable_data(); + for (int i = 0; i < pad_cpu.numel(); ++i) { + pad_cpu_data[i] = static_cast(i); + } + + pad.Assign(pad_cpu_data, pad_cpu.dims()); + src.Assign(src_cpu_data, src_cpu.dims()); + + param.pad = &pad; + param.src = &src; + param.out = &out; + kernel.SetParam(param); + + cudaStream_t stream; + cudaStreamCreate(&stream); + context.SetExecStream(stream); + kernel.SetContext(std::move(ctx)); + kernel.Launch(); + cudaDeviceSynchronize(); + + auto* out_cpu_data = out_cpu.mutable_data(); + auto* out_data = out.mutable_data(TARGET(kCUDA)); + CopySync( + out_cpu_data, out_data, sizeof(float) * out.numel(), IoDirection::DtoH); + + std::vector ref_results = {0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19}; + for (int i = 0; i < out.numel(); i++) { + EXPECT_NEAR(out_cpu_data[i], ref_results[i], 1e-5); + // LOG(INFO) << out_cpu_data[i]; + } +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/search_seq_fc_compute.cu b/lite/kernels/cuda/search_seq_fc_compute.cu new file mode 100644 index 0000000000000000000000000000000000000000..e3ac75afeeee772ed7486a47dde14b7a3af4085f --- /dev/null +++ b/lite/kernels/cuda/search_seq_fc_compute.cu @@ -0,0 +1,98 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/core/op_registry.h" +#include "lite/kernels/cuda/search_seq_fc_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +template +__global__ void add_bias(int n, + int output_size, + const dtype* bias, + dtype* dout) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + int bias_index = index % output_size; + if (index < n) { + dout[index] = dout[index] + bias[bias_index]; + } +} + +void SearchSeqFcCompute::PrepareForRun() { + gemm_impl_.reset(new lite::cuda::math::Gemm); +} + +void SearchSeqFcCompute::Run() { + auto& param = this->Param(); + CHECK(ctx_) << "running context should be set first"; + auto& cuda_ctx = ctx_->template As(); + auto cuda_stream = cuda_ctx.exec_stream(); + + auto x = param.x; + auto w = param.w; + auto b = param.b; + auto out = param.out; + auto out_size = param.out_size; + const auto x_dims = x->dims(); + const auto w_dims = w->dims(); + const auto out_dims = out->dims(); + CHECK_EQ(x_dims.size(), 2) << "The Input(X) should be 2-D tensor."; + CHECK_EQ(w_dims.size(), 2) << "W should be 2-D tensor."; + CHECK_EQ(out_dims.size(), 2) << "The Output(Out) should be 2-D tensor."; + CHECK_EQ(x_dims[1], w_dims[1]) << "Wrong shape: x_dims[1] != w_dims[1]"; + CHECK_EQ(w_dims[0], out_size) << "Wrong shape: w_dims[0] != out_size"; + CHECK_EQ(out_dims[0], x_dims[0]) << "Wrong shape: out_dims[0] != x_dims[0]"; + CHECK_EQ(out_dims[1], out_size) << "Wrong shape: out_dims[1] != out_size"; + int M = x_dims[0]; + int K = x_dims[1]; + int N = w_dims[0]; + auto x_data = x->data(); + auto w_data = w->data(); + auto out_data = out->mutable_data(TARGET(kCUDA)); + + CHECK(gemm_impl_->init(false, true, M, N, K, &cuda_ctx)); + gemm_impl_->run(1.0f, 0.0f, x_data, w_data, out_data, &cuda_ctx); + + if (b != nullptr) { + auto b_dims = b->dims(); + CHECK_EQ(b_dims.size(), 1) << "b should be 1-D tensor."; + CHECK_EQ(b_dims[0], w_dims[0]) << "Wrong shape: b_dims[0] != w_dims[0]"; + auto b_data = b->mutable_data(); + int total_size = M * N; + add_bias<<>>(total_size, N, b_data, out_data); + } +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(search_seq_fc, + kCUDA, + kFloat, + kNCHW, + paddle::lite::kernels::cuda::SearchSeqFcCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindInput("W", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindInput("b", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .Finalize(); diff --git a/lite/kernels/cuda/search_seq_fc_compute.h b/lite/kernels/cuda/search_seq_fc_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..dff8ba2acfbe28fc72f095294ad5a140ed66f150 --- /dev/null +++ b/lite/kernels/cuda/search_seq_fc_compute.h @@ -0,0 +1,43 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include +#include "lite/backends/cuda/math/gemm.h" +#include "lite/core/context.h" +#include "lite/core/kernel.h" +#include "lite/core/types.h" +#include "lite/operators/op_params.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +class SearchSeqFcCompute : public KernelLite { + public: + using param_t = operators::SearchSeqFcParam; + + void PrepareForRun() override; + void Run() override; + virtual ~SearchSeqFcCompute() = default; + + private: + std::unique_ptr> gemm_impl_{nullptr}; +}; + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/search_seq_fc_compute_test.cc b/lite/kernels/cuda/search_seq_fc_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..354d1bb5bc3b0f3ee4d102fb2ebce176041ba91b --- /dev/null +++ b/lite/kernels/cuda/search_seq_fc_compute_test.cc @@ -0,0 +1,175 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/cuda/search_seq_fc_compute.h" +#include +#include +#include +#include + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +template +void search_seq_fc_compute_ref(const operators::SearchSeqFcParam& param) { + auto x = param.x; + auto w = param.w; + auto b = param.b; + auto out = param.out; + auto out_size = param.out_size; + const auto x_dims = x->dims(); + const auto w_dims = w->dims(); + const auto& x_lod = x->lod(); + CHECK_EQ(x_dims.size(), 2) << "The Input(X) should be 2-D tensor."; + CHECK(!x_lod.empty()) << "The Input(X) must hold lod info."; + const auto& x_lod_0 = x_lod[0]; + CHECK_GE(x_lod_0.size(), 2) << "The Input(X)'s lod info is corrupted."; + CHECK_EQ(x_dims[0], static_cast(x_lod_0.back())) + << "The Input(X)'s lod info mismatches the actual tensor shape."; + CHECK_EQ(w_dims.size(), 2) << "W should be 2-D tensor."; + CHECK_EQ(x_dims[1], w_dims[1]) << "Wrong shape: x_dims[1] != w_dims[1]"; + CHECK_EQ(w_dims[0], out_size) << "Wrong shape: w_dims[0] != out_size"; + int M = x_dims[0]; + int K = x_dims[1]; + int N = w_dims[0]; + auto x_data = x->data(); + auto w_data = w->data(); + auto out_data = out->mutable_data(); + + for (int i = 0; i < M; i++) { + for (int j = 0; j < N; j++) { + auto sum = static_cast(0); + for (int l = 0; l < K; l++) { + T xv = x_data[i * K + l]; + T wv = w_data[j * K + l]; + sum += xv * wv; + } + out_data[i * N + j] = sum; + } + } + + if (b != nullptr) { + auto b_dims = b->dims(); + CHECK_EQ(b_dims.size(), 1) << "b should be 1-D tensor."; + CHECK_EQ(b_dims[0], w_dims[0]) << "Wrong shape: b_dims[0] != w_dims[0]"; + auto b_data = b->data(); + for (int i = 0; i < M; i++) { + for (int j = 0; j < N; j++) { + out_data[i * N + j] += b_data[j]; + } + } + } +} + +TEST(search_seq_fc_compute, normal) { + Env::Init(); + for (auto x_lod_0 : {std::vector({0, 1, 3}), + std::vector({0, 3, 4, 5})}) { + for (auto feature_size : {2, 9}) { + for (auto out_size : {3, 5}) { + 
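+        // Sweep bias on/off for every (x_lod, feature_size, out_size) combo.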
for (auto has_bias : {true, false}) { + // infer x_dims, w_dims, b_dims and out_dims + DDim x_dims({static_cast(x_lod_0.back()), feature_size}); + DDim w_dims({out_size, feature_size}); + DDim b_dims({has_bias ? out_size : 0}); + DDim out_dims({static_cast(x_lod_0.back()), out_size}); + LoD x_lod; + x_lod.push_back(x_lod_0); + LoD out_lod; + out_lod.push_back(x_lod_0); + // prepare input&output tensors + Tensor x_dev, x_host, w_dev, w_host, b_dev, b_host, out_dev, out_host, + out_ref; + x_host.Resize(x_dims); + w_host.Resize(w_dims); + b_host.Resize(b_dims); + out_host.Resize(out_dims); + x_dev.Resize(x_dims); + w_dev.Resize(w_dims); + b_dev.Resize(b_dims); + out_dev.Resize(out_dims); + out_ref.Resize(out_dims); + x_host.set_lod(x_lod); + out_host.set_lod(out_lod); + x_dev.set_lod(x_lod); + out_dev.set_lod(out_lod); + out_ref.set_lod(out_lod); + auto out_dev_data = out_dev.mutable_data(TARGET(kCUDA)); + auto x_host_data = x_host.mutable_data(); + auto w_host_data = w_host.mutable_data(); + auto out_host_data = out_host.mutable_data(); + auto out_ref_data = out_ref.mutable_data(); + for (int i = 0; i < x_host.dims().production(); i++) { + x_host_data[i] = i * 0.125f; + } + for (int i = 0; i < w_host.dims().production(); i++) { + w_host_data[i] = i * 0.5f; + } + x_dev.Assign(x_host_data, + x_host.dims()); + w_dev.Assign(w_host_data, + w_host.dims()); + // prepare cuda context, initialize param, and run kernel + operators::SearchSeqFcParam param; + param.x = &x_dev; + param.w = &w_dev; + param.out = &out_dev; + param.out_size = out_size; + if (has_bias) { + auto b_host_data = b_host.mutable_data(); + for (int i = 0; i < b_host.dims().production(); i++) { + b_host_data[i] = i * 0.5f; + } + b_dev.Assign(b_host_data, + b_host.dims()); + param.b = &b_dev; + } + std::unique_ptr ctx(new KernelContext); + auto& cuda_ctx = ctx->As(); + cuda_ctx.InitOnce(); + int dev_id = TargetWrapper::GetCurDevice(); + cuda_ctx.Init(dev_id); + SearchSeqFcCompute search_seq_fc; + search_seq_fc.SetParam(param); + search_seq_fc.SetContext(std::move(ctx)); + search_seq_fc.Launch(); + cudaDeviceSynchronize(); + CopySync(out_host_data, + out_dev_data, + sizeof(float) * out_dev.dims().production(), + IoDirection::DtoH); + // run reference + param.x = &x_host; + param.w = &w_host; + param.out = &out_ref; + if (has_bias) { + param.b = &b_host; + } + search_seq_fc_compute_ref(param); + // verify result + for (int i = 0; i < out_ref.dims().production(); i++) { + EXPECT_NEAR(out_host_data[i], out_ref_data[i], 1e-5); + } + } + } + } + } +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/sequence_arithmetic_compute.cu b/lite/kernels/cuda/sequence_arithmetic_compute.cu new file mode 100644 index 0000000000000000000000000000000000000000..7593632a14acd0cbec548dc5b9d3a096c4c7f38d --- /dev/null +++ b/lite/kernels/cuda/sequence_arithmetic_compute.cu @@ -0,0 +1,249 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include "lite/core/op_registry.h" +#include "lite/core/target_wrapper.h" +#include "lite/kernels/cuda/sequence_arithmetic_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +const int CUDA_NUM_THREADS = 512; + +#define CUDA_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ + i += blockDim.x * gridDim.x) + +inline int CUDA_GET_BLOCKS(const int N) { + return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; +} + +template +__global__ void ker_arithmetic_sum(Dtype* out_data, + const Dtype* in_data_0, + const Dtype* in_data_1, + const int* offset_0, + const int* offset_1, + const int* word_id_to_seq_id, + const int seq_num, + const int inner_size, + const int count) { + CUDA_KERNEL_LOOP(tid, count) { + int emb_id = tid % inner_size; + int word_id = tid / inner_size; + int seq_id = word_id_to_seq_id[word_id]; + int word_id_in_cur_seq = word_id - offset_0[seq_id]; + int seq_len_1 = offset_1[seq_id + 1] - offset_1[seq_id]; + if (word_id_in_cur_seq < seq_len_1) { + out_data[tid] = + in_data_0[tid] + + in_data_1[(offset_1[seq_id] + word_id_in_cur_seq) * inner_size + + emb_id]; + } else { + out_data[tid] = in_data_0[tid]; + } + } +} + +template +__global__ void ker_arithmetic_sub(Dtype* out_data, + const Dtype* in_data_0, + const Dtype* in_data_1, + const int* offset_0, + const int* offset_1, + const int* word_id_to_seq_id, + const int seq_num, + const int inner_size, + const int count) { + CUDA_KERNEL_LOOP(tid, count) { + int emb_id = tid % inner_size; + int word_id = tid / inner_size; + int seq_id = word_id_to_seq_id[word_id]; + int word_id_in_cur_seq = word_id - offset_0[seq_id]; + int seq_len_1 = offset_1[seq_id + 1] - offset_1[seq_id]; + if (word_id_in_cur_seq < seq_len_1) { + out_data[tid] = + in_data_0[tid] - + in_data_1[(offset_1[seq_id] + word_id_in_cur_seq) * inner_size + + emb_id]; + } else { + out_data[tid] = in_data_0[tid]; + } + } +} + +template +__global__ void ker_arithmetic_mul(Dtype* out_data, + const Dtype* in_data_0, + const Dtype* in_data_1, + const int* offset_0, + const int* offset_1, + const int* word_id_to_seq_id, + const int seq_num, + const int inner_size, + const int count) { + CUDA_KERNEL_LOOP(tid, count) { + int emb_id = tid % inner_size; + int word_id = tid / inner_size; + int seq_id = word_id_to_seq_id[word_id]; + int word_id_in_cur_seq = word_id - offset_0[seq_id]; + int seq_len_1 = offset_1[seq_id + 1] - offset_1[seq_id]; + if (word_id_in_cur_seq < seq_len_1) { + out_data[tid] = + in_data_0[tid] * + in_data_1[(offset_1[seq_id] + word_id_in_cur_seq) * inner_size + + emb_id]; + } else { + out_data[tid] = in_data_0[tid]; + } + } +} + +void SequenceArithmeticCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->template As(); + auto stream = ctx.exec_stream(); + + auto x_data = param.X->data(); + auto x_lod = param.X->lod()[0]; + auto y_data = param.Y->data(); + auto y_lod = param.Y->lod()[0]; + auto out_data = param.Out->mutable_data(TARGET(kCUDA)); + + offset_x.Resize({static_cast(x_lod.size())}); + auto offset_x_data = offset_x.mutable_data(TARGET(kCUDA)); + + offset_y.Resize({static_cast(y_lod.size())}); + auto offset_y_data = offset_y.mutable_data(TARGET(kCUDA)); + + word_id_to_seq_id.Resize({param.X->numel()}); + auto word_id_to_seq_id_data = + word_id_to_seq_id.mutable_data(TARGET(kCUDA)); + + std::vector word_seq_map; + for (int i = 0; i < x_lod.size() 
- 1; i++) { + for (int j = x_lod[i]; j < x_lod[i + 1]; j++) { + word_seq_map.push_back(i); + } + } + + std::vector offset_x_data_cpu(x_lod.size(), 0); + auto x_lod_data = x_lod.data(); + for (int i = 0; i < offset_x_data_cpu.size(); i++) { + offset_x_data_cpu[i] = x_lod_data[i]; + } + + std::vector offset_y_data_cpu(y_lod.size(), 0); + auto y_lod_data = y_lod.data(); + for (int i = 0; i < offset_y_data_cpu.size(); i++) { + offset_y_data_cpu[i] = y_lod_data[i]; + } + + TargetWrapperCuda::MemcpyAsync(offset_x_data, + offset_x_data_cpu.data(), + sizeof(int) * x_lod.size(), + IoDirection::HtoD, + stream); + + TargetWrapperCuda::MemcpyAsync(offset_y_data, + offset_y_data_cpu.data(), + sizeof(int) * y_lod.size(), + IoDirection::HtoD, + stream); + + TargetWrapperCuda::MemcpyAsync(word_id_to_seq_id_data, + word_seq_map.data(), + sizeof(int) * word_seq_map.size(), + IoDirection::HtoD, + stream); + + int seq_num = x_lod.size() - 1; + int count = param.X->numel(); + int inner_size = param.X->dims()[1]; + switch (param.op_type) { + case 1: // sum + ker_arithmetic_sum< + float><<>>( + out_data, + x_data, + y_data, + offset_x_data, + offset_y_data, + word_id_to_seq_id_data, + seq_num, + inner_size, + count); + break; + case 2: // sub + ker_arithmetic_sub< + float><<>>( + out_data, + x_data, + y_data, + offset_x_data, + offset_y_data, + word_id_to_seq_id_data, + seq_num, + inner_size, + count); + break; + case 3: // mul + ker_arithmetic_mul< + float><<>>( + out_data, + x_data, + y_data, + offset_x_data, + offset_y_data, + word_id_to_seq_id_data, + seq_num, + inner_size, + count); + break; + default: + break; + } + + cudaError_t error = cudaGetLastError(); + if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error); +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(sequence_arithmetic, + kCUDA, + kFloat, + kNCHW, + paddle::lite::kernels::cuda::SequenceArithmeticCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .Finalize(); +REGISTER_LITE_KERNEL(search_seq_arithmetic, + kCUDA, + kFloat, + kNCHW, + paddle::lite::kernels::cuda::SequenceArithmeticCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .Finalize(); diff --git a/lite/kernels/cuda/sequence_arithmetic_compute.h b/lite/kernels/cuda/sequence_arithmetic_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..a180c50eaa810511f8d72902e81bcd9abdaca31e --- /dev/null +++ b/lite/kernels/cuda/sequence_arithmetic_compute.h @@ -0,0 +1,41 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
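+//
+// SequenceArithmeticCompute: element-wise add / sub / mul of two LoD tensors,
+// aligned sequence by sequence (op_type 1 = sum, 2 = sub, 3 = mul). Positions
+// past the end of the shorter Y sequence copy X through unchanged, e.g. with
+// inner_size = 1: X lod {0, 3}, X = [1 2 3], Y lod {0, 2}, Y = [10 20]
+// gives Out = [11 22 3] for op_type = 1.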
+ +#pragma once + +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +class SequenceArithmeticCompute + : public KernelLite { + public: + using param_t = operators::SequenceArithmeticParam; + + void Run() override; + virtual ~SequenceArithmeticCompute() = default; + + private: + lite::Tensor offset_x; + lite::Tensor offset_y; + lite::Tensor word_id_to_seq_id; +}; + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/sequence_arithmetic_compute_test.cc b/lite/kernels/cuda/sequence_arithmetic_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..c0746d375d5c43d68cfad1896e7a3ab6178e2c35 --- /dev/null +++ b/lite/kernels/cuda/sequence_arithmetic_compute_test.cc @@ -0,0 +1,131 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/cuda/sequence_arithmetic_compute.h" +#include +#include +#include +#include +#include +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +void sequence_arithmetic_compute_ref(const Tensor& x, + const Tensor& y, + Tensor* out, + int op_type) { + auto x_data = x.data(); + auto y_data = y.data(); + out->Resize(x.dims()); + out->set_lod(x.lod()); + auto out_data = out->mutable_data(); + auto x_seq_offset = x.lod()[0]; + auto y_seq_offset = y.lod()[0]; + int seq_num = x_seq_offset.size() - 1; + int inner_size = x.numel() / x.dims()[0]; + + for (int i = 0; i < seq_num; i++) { + int len_x = (x_seq_offset[i + 1] - x_seq_offset[i]) * inner_size; + int len_y = (y_seq_offset[i + 1] - y_seq_offset[i]) * inner_size; + auto input_x = x_data + x_seq_offset[i] * inner_size; + auto input_y = y_data + y_seq_offset[i] * inner_size; + auto t_out = out_data + x_seq_offset[i] * inner_size; + int len = std::min(len_x, len_y); + for (int j = 0; j < len; j++) { + switch (op_type) { + case 1: + t_out[j] = input_x[j] + input_y[j]; + break; + case 2: + t_out[j] = input_x[j] - input_y[j]; + break; + case 3: + t_out[j] = input_x[j] * input_y[j]; + break; + default: + break; + } + } + if (len_x > len) { + memcpy(t_out + len, input_x + len, sizeof(float) * (len_x - len)); + } + } +} + +void prepare_input(Tensor* x, const LoD& x_lod) { + x->Resize({static_cast(x_lod[0].back()), 3}); + x->set_lod(x_lod); + auto x_data = x->mutable_data(); + for (int i = 0; i < x->numel(); i++) { + x_data[i] = (i - x->numel() / 2) * 1.1; + } +} + +TEST(sequence_arithmetic_cuda, run_test) { + lite::Tensor x, y, x_cpu, y_cpu; + lite::Tensor out, out_cpu, out_ref; + lite::LoD x_lod{{0, 2, 5, 9}}, y_lod{{0, 2, 5, 9}}; + + prepare_input(&x_cpu, x_lod); + prepare_input(&y_cpu, y_lod); + + x.Resize(x_cpu.dims()); + x.set_lod(x_cpu.lod()); + auto x_cpu_data = x_cpu.mutable_data(); + x.Assign(x_cpu_data, x_cpu.dims()); + + y.Resize(y_cpu.dims()); + y.set_lod(y_cpu.lod()); + auto y_cpu_data = y_cpu.mutable_data(); 
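+  // Upload Y's host buffer to the device tensor, mirroring what was done for X.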
+ y.Assign(y_cpu_data, y_cpu.dims()); + + operators::SequenceArithmeticParam param; + param.X = &x; + param.Y = &y; + param.Out = &out; + param.op_type = 1; + + std::unique_ptr ctx(new KernelContext); + auto& context = ctx->As(); + cudaStream_t stream; + cudaStreamCreate(&stream); + context.SetExecStream(stream); + + SequenceArithmeticCompute sequence_arithmetic; + sequence_arithmetic.SetContext(std::move(ctx)); + sequence_arithmetic.SetParam(param); + sequence_arithmetic.Run(); + cudaDeviceSynchronize(); + + auto out_data = out.mutable_data(TARGET(kCUDA)); + out_cpu.Resize(out.dims()); + auto out_cpu_data = out_cpu.mutable_data(); + CopySync( + out_cpu_data, out_data, sizeof(float) * out.numel(), IoDirection::DtoH); + + sequence_arithmetic_compute_ref(x_cpu, y_cpu, &out_ref, param.op_type); + auto out_ref_data = out_ref.data(); + for (int i = 0; i < out.numel(); i++) { + EXPECT_NEAR(out_cpu_data[i], out_ref_data[i], 1e-3); + } +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/sequence_concat_compute.cu b/lite/kernels/cuda/sequence_concat_compute.cu new file mode 100644 index 0000000000000000000000000000000000000000..d4390046b01d6411bc8528e86083d5059eb4d449 --- /dev/null +++ b/lite/kernels/cuda/sequence_concat_compute.cu @@ -0,0 +1,151 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include "lite/core/op_registry.h" +#include "lite/core/target_wrapper.h" +#include "lite/kernels/cuda/sequence_concat_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +const int CUDA_NUM_THREADS = 512; + +template +inline LoD ConcatLoD(const std::vector& xs) { + std::vector result; + result.resize(xs[0]->lod()[0].size()); + + for (size_t i = 1; i < result.size(); ++i) { + size_t sum = 0; + for (size_t j = 0; j < xs.size(); ++j) { + auto& x_lod = xs[j]->lod()[0]; + sum += x_lod[i]; + } + result[i] = sum; + } + LoD lod; + lod.emplace_back(result); + return lod; +} + +template +__global__ void ker_sequence_concat(Dtype* out_data, + const uint64_t* in_locate_data, + const int* o2i_map, + const int* o2i_w_map, + const int seq_num, + const int emb_size, + const int count) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + for (int tid = idx; tid < count; tid += blockDim.x * gridDim.x) { + int emb_id = tid % emb_size; + int word_id = tid / emb_size; + int input_id = o2i_map[word_id]; + int cur_work_id = o2i_w_map[word_id]; + const Dtype* in_data = reinterpret_cast( + reinterpret_cast(in_locate_data[input_id])); + out_data[tid] = in_data[cur_work_id * emb_size + emb_id]; + } +} + +void SequenceConcatCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->template As(); + auto stream = ctx.exec_stream(); + float* out_data = param.Out->mutable_data(TARGET(kCUDA)); + + int seq_num = param.X[0]->lod()[0].size() - 1; + const int emb_size = param.X[0]->numel() / param.X[0]->dims()[0]; + std::vector in_locate_vec; + for (size_t i = 0; i < param.X.size(); ++i) { + in_locate_vec.push_back( + reinterpret_cast(param.X[i]->data())); + } + in_locate_tensor.Resize({static_cast(in_locate_vec.size())}); + + std::vector out2in_map; + std::vector out2in_word_map; + for (int i = 0; i < seq_num; ++i) { + for (int j = 0; j < param.X.size(); ++j) { + auto offset = param.X[j]->lod()[0]; + int cur_len = offset[i + 1] - offset[i]; + for (int k = 0; k < cur_len; ++k) { + out2in_map.push_back(j); + out2in_word_map.push_back(offset[i] + k); + } + } + } + int word_num = out2in_map.size(); + out2in_map_tensor.Resize({word_num}); + out2in_word_map_tensor.Resize({word_num}); + int* gpu_o2i_map_data = out2in_map_tensor.mutable_data(TARGET(kCUDA)); + int* gpu_o2i_w_map_data = + out2in_word_map_tensor.mutable_data(TARGET(kCUDA)); + uint64_t* gpu_in_locate_data = + in_locate_tensor.mutable_data(TARGET(kCUDA)); + + TargetWrapperCuda::MemcpyAsync(gpu_o2i_map_data, + out2in_map.data(), + sizeof(int) * out2in_map.size(), + IoDirection::HtoD, + stream); + TargetWrapperCuda::MemcpyAsync(gpu_o2i_w_map_data, + out2in_word_map.data(), + sizeof(int) * out2in_word_map.size(), + IoDirection::HtoD, + stream); + TargetWrapperCuda::MemcpyAsync(gpu_in_locate_data, + in_locate_vec.data(), + sizeof(uint64_t) * in_locate_vec.size(), + IoDirection::HtoD, + stream); + + param.Out->set_lod(ConcatLoD(param.X)); + + int count = param.X[0]->numel(); + for (int i = 1; i < param.X.size(); ++i) { + count += param.X[i]->numel(); + } + + int blocks = (count + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; + ker_sequence_concat<<>>( + out_data, + gpu_in_locate_data, + gpu_o2i_map_data, + gpu_o2i_w_map_data, + seq_num, + emb_size, + count); + + cudaError_t error = cudaGetLastError(); + if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error); +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(sequence_concat, + kCUDA, 
+ kFloat, + kNCHW, + paddle::lite::kernels::cuda::SequenceConcatCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .Finalize(); diff --git a/lite/kernels/cuda/sequence_concat_compute.h b/lite/kernels/cuda/sequence_concat_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..1737c18dd35976572efa1b62fadefed906b0ceb5 --- /dev/null +++ b/lite/kernels/cuda/sequence_concat_compute.h @@ -0,0 +1,40 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +class SequenceConcatCompute + : public KernelLite { + public: + using param_t = operators::SequenceConcatParam; + + void Run() override; + virtual ~SequenceConcatCompute() = default; + + private: + lite::Tensor out2in_map_tensor; + lite::Tensor out2in_word_map_tensor; + lite::Tensor in_locate_tensor; +}; + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/sequence_concat_compute_test.cc b/lite/kernels/cuda/sequence_concat_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..477dc48dbbdfe7a1453bbb5c811d6897347fee53 --- /dev/null +++ b/lite/kernels/cuda/sequence_concat_compute_test.cc @@ -0,0 +1,163 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
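+
+// Test for the CUDA sequence_concat kernel: three LoD inputs are interleaved
+// sequence by sequence and the result is checked against the CPU reference
+// implemented in sequence_concat_ref() below.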
+ +#include "lite/kernels/cuda/sequence_concat_compute.h" +#include +#include +#include +#include + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +namespace { +inline LoD ConcatLoD(const std::vector& xs, + std::vector* xs_in_order) { + std::vector result; + result.resize(xs[0]->lod()[0].size()); + + for (size_t i = 1; i < result.size(); ++i) { + size_t sum = 0; + for (size_t j = 0; j < xs.size(); ++j) { + auto& x_lod = xs[j]->lod()[0]; + if (x_lod[i - 1] < x_lod[i]) { + xs_in_order->emplace_back(xs[j]->Slice(x_lod[i - 1], x_lod[i])); + } + sum += x_lod[i]; + } + result[i] = sum; + } + LoD lod; + lod.emplace_back(result); + return lod; +} + +static void sequence_concat_ref(const std::vector& xs, + lite::Tensor* out) { + std::vector out_dims; + int64_t batch_size = 0; + int64_t feature_size = 0; + for (const auto& tensor : xs) { + const auto x_dims = tensor->dims(); + if (out_dims.empty()) { + out_dims = x_dims.Vectorize(); + } + batch_size += x_dims[0]; + if (feature_size == 0) { + feature_size = x_dims.production() / x_dims[0]; + } else { + CHECK_EQ(feature_size, x_dims.production() / x_dims[0]) + << "Inputs of sequence concat must have same feature size"; + } + } + out_dims[0] = batch_size; + out->Resize(out_dims); + std::vector x_in_order; + out->set_lod(ConcatLoD(xs, &x_in_order)); + + int num = x_in_order.size(); + std::vector input_cols(num); + for (int i = 0; i < num; ++i) { + input_cols[i] = x_in_order[i].numel(); + } + float* out_data = out->mutable_data(); + int col_idx = 0; + for (int j = 0; j < num; ++j) { + int col_len = input_cols[j]; + auto input_data = x_in_order[j].data(); + memcpy(out_data + col_idx, input_data, sizeof(float) * col_len); + col_idx += col_len; + } +} + +#define PREPARE_INPUT_DATA(name) \ + name.Resize({name##_lod_len, feature_len}); \ + name##_cpu.Resize({name##_lod_len, feature_len}); \ + name##_ref.Resize({name##_lod_len, feature_len}); \ + name.set_lod(lod_info_##name); \ + name##_cpu.set_lod(lod_info_##name); \ + name##_ref.set_lod(lod_info_##name); \ + float* name##_cpu_data = name##_cpu.mutable_data(); \ + float* name##_ref_data = name##_ref.mutable_data(); \ + for (int i = 0; i < name##_cpu.numel(); ++i) { \ + name##_cpu_data[i] = (i - 2.0) * 1.0; \ + name##_ref_data[i] = (i - 2.0) * 1.0; \ + } \ + name.Assign(name##_cpu_data, \ + name##_cpu.dims()); + +#define PREPARE_OUTPUT_INFO(name) \ + name##_cpu.Resize({y_lod_len, feature_len}); \ + name##_ref.Resize({y_lod_len, feature_len}); \ + name.Resize({y_lod_len, feature_len}); \ + float* name##_cpu_data = name##_cpu.mutable_data(); + +} // namespace + +TEST(sequence_concat_cuda, normal) { + SequenceConcatCompute seq_kernel; + std::unique_ptr ctx(new KernelContext); + auto& context = ctx->As(); + + operators::SequenceConcatParam param; + lite::Tensor x1, x2, x3, x1_cpu, x2_cpu, x3_cpu, x1_ref, x2_ref, x3_ref; + lite::Tensor y, y_cpu, y_ref; + + int32_t x1_lod_len = 10, feature_len = 4; + int32_t x2_lod_len = 4, x3_lod_len = 8; + int32_t y_lod_len = x1_lod_len + x2_lod_len + x3_lod_len; + LoD lod_info_x1{{0, 3, 5, 6, 10}}; + LoD lod_info_x2{{0, 1, 2, 3, 4}}; + LoD lod_info_x3{{0, 2, 4, 6, 8}}; + LoD lod_info_y{{0, 0, 0, 0, 0}}; + for (size_t i = 0; i < lod_info_x1[0].size(); ++i) { + lod_info_y[0][i] = + lod_info_x1[0][i] + lod_info_x2[0][i] + lod_info_x3[0][i]; + } + + PREPARE_INPUT_DATA(x1); + PREPARE_INPUT_DATA(x2); + PREPARE_INPUT_DATA(x3); + PREPARE_OUTPUT_INFO(y); + + param.X = std::vector({&x1, &x2, &x3}); + param.Out = &y; + seq_kernel.SetParam(param); + + 
cudaStream_t stream; + cudaStreamCreate(&stream); + context.SetExecStream(stream); + + seq_kernel.SetContext(std::move(ctx)); + seq_kernel.Run(); + cudaDeviceSynchronize(); + + auto* y_data = y.mutable_data(TARGET(kCUDA)); + CopySync( + y_cpu_data, y_data, sizeof(float) * y.numel(), IoDirection::DtoH); + + std::vector input_ref({&x1_ref, &x2_ref, &x3_ref}); + sequence_concat_ref(input_ref, &y_ref); + float* y_ref_data = y_ref.mutable_data(); + for (int i = 0; i < y.numel(); i++) { + EXPECT_NEAR(y_cpu_data[i], y_ref_data[i], 1e-5); + } +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/sequence_pool_compute.cu b/lite/kernels/cuda/sequence_pool_compute.cu new file mode 100644 index 0000000000000000000000000000000000000000..97876ec32fcc3ffc3d45ff8dbeafca90d6191b23 --- /dev/null +++ b/lite/kernels/cuda/sequence_pool_compute.cu @@ -0,0 +1,258 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include "lite/backends/cuda/cuda_utils.h" +#include "lite/core/op_registry.h" +#include "lite/core/target_wrapper.h" +#include "lite/kernels/cuda/sequence_pool_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +#define CUDA_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ + i += blockDim.x * gridDim.x) + +template +__global__ void seq_pool_average_kernel(Dtype* dst, + const Dtype* src_in, + const int batch_size, + const uint64_t* seq_offset, + const int slice_size) { + int total = slice_size * batch_size; + CUDA_KERNEL_LOOP(tid, total) { + int out_batch_id = tid / slice_size; + int out_id = tid % slice_size; + int in_slice_num = static_cast(seq_offset[out_batch_id + 1] - + seq_offset[out_batch_id]); + int in_offset = static_cast(seq_offset[out_batch_id] * slice_size); + src_in += in_offset + out_id; + Dtype sum = (Dtype)0; + for (int i = 0; i < in_slice_num; ++i) { + sum += src_in[i * slice_size]; + } + dst[out_batch_id * slice_size + out_id] = sum / in_slice_num; + } +} + +template +__global__ void seq_pool_sum_kernel(Dtype* dst, + const Dtype* src_in, + const int batch_size, + const uint64_t* seq_offset, + const int slice_size) { + int total = slice_size * batch_size; + CUDA_KERNEL_LOOP(tid, total) { + int out_batch_id = tid / slice_size; + int out_id = tid % slice_size; + int in_slice_num = static_cast(seq_offset[out_batch_id + 1] - + seq_offset[out_batch_id]); + int in_offset = static_cast(seq_offset[out_batch_id] * slice_size); + src_in += in_offset + out_id; + Dtype sum = (Dtype)0; + for (int i = 0; i < in_slice_num; ++i) { + sum += src_in[i * slice_size]; + } + dst[out_batch_id * slice_size + out_id] = sum; + } +} + +template +__global__ void seq_pool_sqrt_kernel(Dtype* dst, + const Dtype* src_in, + const int batch_size, + const uint64_t* seq_offset, + const int slice_size) { + int total = slice_size * batch_size; + CUDA_KERNEL_LOOP(tid, total) { + int 
out_batch_id = tid / slice_size; + int out_id = tid % slice_size; + int in_slice_num = static_cast(seq_offset[out_batch_id + 1] - + seq_offset[out_batch_id]); + int in_offset = static_cast(seq_offset[out_batch_id] * slice_size); + src_in += in_offset + out_id; + Dtype sum = (Dtype)0; + for (int i = 0; i < in_slice_num; ++i) { + sum += src_in[i * slice_size]; + } + dst[out_batch_id * slice_size + out_id] = sum * rsqrtf(in_slice_num); + } +} + +template +__global__ void seq_pool_max_kernel(Dtype* dst, + const Dtype* src_in, + const int batch_size, + const uint64_t* seq_offset, + const int slice_size) { + int total = slice_size * batch_size; + CUDA_KERNEL_LOOP(tid, total) { + int out_batch_id = tid / slice_size; + int out_id = tid % slice_size; + int in_slice_num = static_cast(seq_offset[out_batch_id + 1] - + seq_offset[out_batch_id]); + int in_offset = static_cast(seq_offset[out_batch_id] * slice_size); + src_in += in_offset + out_id; + Dtype max = src_in[0]; + for (int i = 1; i < in_slice_num; ++i) { + Dtype val = src_in[i * slice_size]; + if (val > max) { + max = val; + } + } + dst[out_batch_id * slice_size + out_id] = max; + } +} + +template +__global__ void seq_pool_last_kernel(Dtype* dst, + const Dtype* src_in, + const int batch_size, + const uint64_t* seq_offset, + const int slice_size) { + int total = slice_size * batch_size; + CUDA_KERNEL_LOOP(tid, total) { + int out_batch_id = tid / slice_size; + int out_id = tid % slice_size; + int in_offset = + (static_cast(seq_offset[out_batch_id + 1]) - 1) * slice_size; + dst[tid] = src_in[in_offset + out_id]; + } +} + +template +__global__ void seq_pool_first_kernel(Dtype* dst, + const Dtype* src_in, + const int batch_size, + const uint64_t* seq_offset, + const int slice_size) { + int total = slice_size * batch_size; + CUDA_KERNEL_LOOP(tid, total) { + int out_batch_id = tid / slice_size; + int out_id = tid % slice_size; + int in_offset = static_cast(seq_offset[out_batch_id] * slice_size); + dst[tid] = src_in[in_offset + out_id]; + } +} + +void SequencePoolCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->template As(); + auto stream = ctx.exec_stream(); + + std::vector seq_offset = param.X->lod()[0]; + int batch_size = param.X->lod()[0].size() - 1; + int slice_size = param.Out->dims().production() / batch_size; + + float* out_data = param.Out->mutable_data(TARGET(kCUDA)); + const float* in_data = param.X->data(); + + lite::Tensor seq_offset_D; + seq_offset_D.Resize({static_cast(seq_offset.size())}); + TargetWrapperCuda::MemcpyAsync( + seq_offset_D.mutable_data(TARGET(kCUDA)), + seq_offset.data(), + sizeof(uint64_t) * seq_offset.size(), + IoDirection::HtoD, + stream); + + if (param.pool_type == "MAX") { + seq_pool_max_kernel<<>>(out_data, + in_data, + batch_size, + seq_offset_D.data(), + slice_size); + } else if (param.pool_type == "AVERAGE") { + seq_pool_average_kernel<<>>(out_data, + in_data, + batch_size, + seq_offset_D.data(), + slice_size); + } else if (param.pool_type == "SUM") { + seq_pool_sum_kernel<<>>(out_data, + in_data, + batch_size, + seq_offset_D.data(), + slice_size); + } else if (param.pool_type == "SQRT") { + seq_pool_sqrt_kernel<<>>(out_data, + in_data, + batch_size, + seq_offset_D.data(), + slice_size); + } else if (param.pool_type == "FIRST") { + seq_pool_first_kernel<<>>(out_data, + in_data, + batch_size, + seq_offset_D.data(), + slice_size); + } else if (param.pool_type == "LAST") { + seq_pool_last_kernel<<>>(out_data, + in_data, + batch_size, + seq_offset_D.data(), + slice_size); + } else { + 
LOG(ERROR) << "pool type " << param.pool_type << " is not supoorted."; + } + + std::vector offset_new(static_cast(batch_size + 1)); + + for (int i = 0; i <= batch_size; ++i) { + offset_new[i] = i; + } + std::vector> voffset_new; + voffset_new.push_back(offset_new); + param.Out->set_lod(voffset_new); + + cudaError_t error = cudaGetLastError(); + if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error); +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(sequence_pool, + kCUDA, + kFloat, + kNCHW, + paddle::lite::kernels::cuda::SequencePoolCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("MaxIndex", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .Finalize(); diff --git a/lite/kernels/cuda/sequence_pool_compute.h b/lite/kernels/cuda/sequence_pool_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..9309454d18d014045ac3bc7f189d2d8430949033 --- /dev/null +++ b/lite/kernels/cuda/sequence_pool_compute.h @@ -0,0 +1,35 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +class SequencePoolCompute + : public KernelLite { + public: + using param_t = operators::SequencePoolParam; + + void Run() override; + virtual ~SequencePoolCompute() = default; +}; + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/sequence_pool_compute_test.cc b/lite/kernels/cuda/sequence_pool_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..0f2656cd1d6c4baa377d8f1d363ae5150113d42f --- /dev/null +++ b/lite/kernels/cuda/sequence_pool_compute_test.cc @@ -0,0 +1,104 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/kernels/cuda/sequence_pool_compute.h" +#include +#include +#include +#include +#include +#include + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +TEST(sequence_pool_cuda, normal) { + SequencePoolCompute seq_kernel; + std::unique_ptr ctx(new KernelContext); + auto& context = ctx->As(); + + lite::Tensor x, x_cpu, out, out_cpu; + lite::LoD lod; + lod.push_back(std::vector{0, 10}); + + x.set_lod(lod); + x_cpu.set_lod(lod); + const size_t second_dim = 8u; + std::vector input_shape{static_cast(lod[0].back()), + static_cast(second_dim)}; + lite::DDim in_dims(input_shape); + x.Resize(in_dims); + x_cpu.Resize(in_dims); + + const size_t out_first_dim = lod[0].size() - 1; + std::vector output_shape{static_cast(out_first_dim), + static_cast(second_dim)}; + lite::DDim out_dims(output_shape); + out.Resize(out_dims); + out_cpu.Resize(out_dims); + + auto x_cpu_data = x_cpu.mutable_data(); + auto out_data = out.mutable_data(TARGET(kCUDA)); + auto out_cpu_data = out_cpu.mutable_data(); + + for (int64_t i = 0; i < x_cpu.dims().production(); i++) { + x_cpu_data[i] = 1.1f * i; + } + x.Assign(x_cpu_data, x_cpu.dims()); + + operators::SequencePoolParam param; + param.X = &x; + param.Out = &out; + std::vector pool_types( + {"MAX", "AVERAGE", "SUM", "SQRT", "FIRST", "LAST"}); + std::map> type_map; + type_map["MAX"] = {79.2, 80.3, 81.4, 82.5, 83.6, 84.7, 85.8, 86.9}; + type_map["AVERAGE"] = {39.6, 40.7, 41.8, 42.9, 44, 45.1, 46.2, 47.3}; + type_map["SUM"] = {396, 407, 418, 429, 440, 451, 462, 473}; + type_map["SQRT"] = { + 125.226, 128.705, 132.183, 135.662, 139.14, 142.619, 146.097, 149.576}; + type_map["FIRST"] = {0, 1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7}; + type_map["LAST"] = {79.2, 80.3, 81.4, 82.5, 83.6, 84.7, 85.8, 86.9}; + + cudaStream_t stream; + cudaStreamCreate(&stream); + context.SetExecStream(stream); + + seq_kernel.SetContext(std::move(ctx)); + for (std::string pool_type : pool_types) { + param.pool_type = pool_type; + seq_kernel.SetParam(param); + + seq_kernel.Run(); + cudaDeviceSynchronize(); + + CopySync(out_cpu_data, + out_data, + sizeof(float) * out_cpu.numel(), + IoDirection::DtoH); + + std::vector ref_results = type_map[pool_type]; + + for (int i = 0; i < out_cpu.numel(); i++) { + EXPECT_NEAR(out_cpu_data[i], ref_results[i], 1e-3); + } + } +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/sequence_reverse_compute.cu b/lite/kernels/cuda/sequence_reverse_compute.cu new file mode 100644 index 0000000000000000000000000000000000000000..68447fcebb1a6189f3a80d47ea29b0fca88267c8 --- /dev/null +++ b/lite/kernels/cuda/sequence_reverse_compute.cu @@ -0,0 +1,130 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/core/op_registry.h" +#include "lite/core/target_wrapper.h" +#include "lite/kernels/cuda/sequence_reverse_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +template +__host__ __device__ inline size_t UpperBound(const T* x, + size_t num, + const T& val) { + // The following code is from + // https://en.cppreference.com/w/cpp/algorithm/upper_bound + auto* first = x; + int64_t count = static_cast(num); + while (count > 0) { + auto step = (count >> 1); + auto* it = first + step; + if (val < *it) { + count = step; + } else { + first = ++it; + count -= (step + 1); + } + } + return static_cast(first - x); +} + +template +__global__ void SequenceReverseKernelGridIsOne( + const T* x, T* y, const int64_t* lod, size_t lod_count, int64_t row_numel) { + int64_t idx = static_cast(threadIdx.x); + auto row_idx_x = idx / row_numel; + auto lod_idx = UpperBound(lod, lod_count, row_idx_x); + auto row_idx_y = lod[lod_idx - 1] + (lod[lod_idx] - 1 - row_idx_x); + auto idx_y = row_idx_y * row_numel + idx % row_numel; + y[idx_y] = x[idx]; +} + +template +__global__ void SequenceReverseKernel(const T* x, + T* y, + const int64_t* lod, + size_t lod_count, + int64_t row_numel, + size_t limit) { + int64_t idx = static_cast(blockIdx.x * blockDim.x + threadIdx.x); + if (idx < limit) { + auto row_idx_x = idx / row_numel; + auto lod_idx = UpperBound(lod, lod_count, row_idx_x); + auto row_idx_y = lod[lod_idx - 1] + (lod[lod_idx] - 1 - row_idx_x); + auto idx_y = row_idx_y * row_numel + idx % row_numel; + y[idx_y] = x[idx]; + } +} + +template +void SequenceReverseCompute::Run() { + auto& param = this->template Param(); + auto& ctx = this->ctx_->template As(); + auto stream = ctx.exec_stream(); + size_t limit = static_cast(param.X->numel()); + int64_t row_numel = static_cast(limit / param.X->dims()[0]); + const auto* x_data = param.X->template data(); + auto y_data = param.Out->template mutable_data(TARGET(kCUDA)); + CHECK_NE(x_data, y_data) + << "SequenceReverse Op does not support in-place operation"; + const auto lod = param.X->lod()[param.X->lod().size() - 1]; + const size_t lod_count = lod.size(); + param.Out->set_lod(param.X->lod()); + + lod_cuda.Resize({static_cast(lod.size())}); + int64_t* lod_data = lod_cuda.mutable_data(TARGET(kCUDA)); + TargetWrapperCuda::MemcpyAsync(lod_data, + lod.data(), + sizeof(int64_t) * lod.size(), + IoDirection::HtoD, + stream); + constexpr int num_threads = 1024; + int block_size = limit <= num_threads ? 
limit : num_threads; + int grid_size = (limit + num_threads - 1) / num_threads; + if (grid_size == 1) { + SequenceReverseKernelGridIsOne<<<1, block_size, 0, stream>>>( + x_data, y_data, lod_data, lod_count, row_numel); + } else { + SequenceReverseKernel<<>>( + x_data, y_data, lod_data, lod_count, row_numel, limit); + } + cudaError_t error = cudaGetLastError(); + if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error); +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle + +typedef paddle::lite::kernels::cuda::SequenceReverseCompute + ReverseFp32; + +typedef paddle::lite::kernels::cuda::SequenceReverseCompute + ReverseInt64; + +REGISTER_LITE_KERNEL(sequence_reverse, kCUDA, kFloat, kNCHW, ReverseFp32, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("Y", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .Finalize(); + +REGISTER_LITE_KERNEL(sequence_reverse, kCUDA, kInt64, kNCHW, ReverseInt64, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kInt64))}) + .BindOutput("Y", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kInt64))}) + .Finalize(); diff --git a/lite/kernels/cuda/sequence_reverse_compute.h b/lite/kernels/cuda/sequence_reverse_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..6b6199e020e64343632d3f7c90d2cbbae4eaa42b --- /dev/null +++ b/lite/kernels/cuda/sequence_reverse_compute.h @@ -0,0 +1,38 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +template +class SequenceReverseCompute : public KernelLite { + public: + using param_t = operators::SequenceReverseParam; + + void Run() override; + virtual ~SequenceReverseCompute() = default; + + private: + lite::Tensor lod_cuda; +}; + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/sequence_reverse_compute_test.cc b/lite/kernels/cuda/sequence_reverse_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..3317b523037d913d6017041fbd357ed1dcf2d20a --- /dev/null +++ b/lite/kernels/cuda/sequence_reverse_compute_test.cc @@ -0,0 +1,105 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
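The two sequence_reverse kernels above differ only in how the flat element index is obtained; both map a source row to its mirrored position inside the same LoD sequence via UpperBound. The same row mapping can be checked on the host with std::upper_bound; this is an illustrative sketch of the indexing only, not code from the patch:

#include <algorithm>
#include <cstdint>
#include <vector>

// Returns the destination row for `row`, given the sequence offsets `lod`
// (e.g. {0, 3, 5, 6, 10}, the last LoD level used in the test below).
int64_t reversed_row(const std::vector<int64_t>& lod, int64_t row) {
  // std::upper_bound plays the role of the device-side UpperBound().
  auto it = std::upper_bound(lod.begin(), lod.end(), row);
  int64_t seq_end = *it;          // lod[lod_idx]
  int64_t seq_begin = *(it - 1);  // lod[lod_idx - 1]
  return seq_begin + (seq_end - 1 - row);
}
// With lod = {0, 3, 5, 6, 10}: rows 0,1,2 map to 2,1,0 and rows 6..9 map to
// 9..6; the column offset within a row (idx % row_numel) is left untouched.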
+ +#include "lite/kernels/cuda/sequence_reverse_compute.h" +#include +#include +#include + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +static void sequence_reverse_ref(const lite::Tensor* x, lite::Tensor* y) { + const auto* x_data = x->data(); + auto seq_offset = x->lod()[x->lod().size() - 1]; + int width = x->numel() / x->dims()[0]; + auto* y_data = y->mutable_data(); + for (int i = 0; i < static_cast(seq_offset.size()) - 1; ++i) { + auto start_pos = seq_offset[i]; + auto end_pos = seq_offset[i + 1]; + for (auto pos = start_pos; pos < end_pos; ++pos) { + auto cur_pos = end_pos - pos - 1 + start_pos; + std::memcpy(y_data + pos * width, + x_data + cur_pos * width, + width * sizeof(float)); + } + } +} + +TEST(sequence_reverse_cuda, normal) { + SequenceReverseCompute seq_kernel; + std::unique_ptr ctx(new KernelContext); + auto& context = ctx->As(); + + operators::SequenceReverseParam param; + lite::Tensor x, x_cpu, x_ref; + lite::Tensor y, y_cpu, y_ref; + + int32_t lod_len = 10, feature_len = 4; + LoD lod_info{{0, 2, 4}, {0, 3, 5, 6, 10}}; + + x.Resize({lod_len, feature_len}); + x_cpu.Resize({lod_len, feature_len}); + x_ref.Resize({lod_len, feature_len}); + y.Resize({lod_len, feature_len}); + y_cpu.Resize({lod_len, feature_len}); + y_ref.Resize({lod_len, feature_len}); + x.set_lod(lod_info); + x_cpu.set_lod(lod_info); + x_ref.set_lod(lod_info); + y.set_lod(lod_info); + y_cpu.set_lod(lod_info); + y_ref.set_lod(lod_info); + + auto* y_data = y.mutable_data(TARGET(kCUDA)); + + float* x_cpu_data = x_cpu.mutable_data(); + float* x_ref_data = x_ref.mutable_data(); + float* y_cpu_data = y_cpu.mutable_data(); + float* y_ref_data = y_ref.mutable_data(); + + for (int i = 0; i < x_cpu.numel(); ++i) { + x_cpu_data[i] = (i - 2.0) * 1.0; + x_ref_data[i] = (i - 2.0) * 1.0; + } + + x.Assign(x_cpu_data, x_cpu.dims()); + + param.X = &x; + param.Out = &y; + seq_kernel.SetParam(param); + + cudaStream_t stream; + cudaStreamCreate(&stream); + context.SetExecStream(stream); + + seq_kernel.SetContext(std::move(ctx)); + seq_kernel.Run(); + cudaDeviceSynchronize(); + + CopySync( + y_cpu_data, y_data, sizeof(float) * y.numel(), IoDirection::DtoH); + + sequence_reverse_ref(&x_ref, &y_ref); + for (int i = 0; i < y.numel(); i++) { + EXPECT_NEAR(y_cpu_data[i], y_ref_data[i], 1e-5); + } +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/sequence_topk_avg_pooling_compute.cu b/lite/kernels/cuda/sequence_topk_avg_pooling_compute.cu new file mode 100644 index 0000000000000000000000000000000000000000..8ea3edb30d86e314a04aab7ceac358e4c57b5b6a --- /dev/null +++ b/lite/kernels/cuda/sequence_topk_avg_pooling_compute.cu @@ -0,0 +1,209 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include +#include +#include "lite/core/op_registry.h" +#include "lite/kernels/cuda/sequence_topk_avg_pooling_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +template +__global__ void topk_avg_pooling_kernel_by_row_improve( + Dtype *output_data, + const Dtype *input, + const int *gpu_input_offset_l, + const int *gpu_input_offset_r, + const int topk_size, + const int *topks, + const int feat_map_num) { + int row = + gpu_input_offset_l[blockIdx.x + 1] - gpu_input_offset_l[blockIdx.x]; // 8 + int col = gpu_input_offset_r[blockIdx.x + 1] - + gpu_input_offset_r[blockIdx.x]; // 30 + int max_k = topks[topk_size - 1]; + max_k = max_k < col ? max_k : col; + + extern __shared__ Dtype smem[]; // H*W + + const Dtype *fm_row_in_data = input; + for (int i = 0; i < blockIdx.x; ++i) { + int tmp_row = gpu_input_offset_l[i + 1] - gpu_input_offset_l[i]; + int tmp_col = gpu_input_offset_r[i + 1] - gpu_input_offset_r[i]; + fm_row_in_data += tmp_row * feat_map_num * tmp_col; + } + fm_row_in_data += blockIdx.y * row * col; + + for (int i = threadIdx.x; i < row * col; i += blockDim.x) { + smem[i] = fm_row_in_data[i]; + } + __syncthreads(); + + for (int idx = threadIdx.x; idx < row; idx += blockDim.x) { + Dtype *fm_row_out_data = + output_data + + (gpu_input_offset_l[blockIdx.x] + idx) * feat_map_num * topk_size + + blockIdx.y * topk_size; + + Dtype *smem_start_col = smem + idx * col; + + int counter = max_k; // topk_size; + Dtype last_max_val = -20000.0; + while (counter) { + Dtype max_val = -10000.0; + int max_pos = 0; + int m = 0; + for (; m < col; m++) { + Dtype cur_data = smem_start_col[m]; + if (cur_data > max_val) { + max_val = cur_data; + max_pos = m; + last_max_val = max_val; + } + } + if (max_val < -9999.0) { // == -10000.0 + max_val = last_max_val; + } + smem_start_col[max_pos] = -10000000.0; + int i = max_k - counter; + for (int c = 0; c < topk_size; c++) { + if (i <= topks[c] - 1) { + fm_row_out_data[c] += max_val; + } + } + counter--; + } + __syncthreads(); + // compute avg + for (int i = 0; i < topk_size; i++) { + fm_row_out_data[i] = fm_row_out_data[i] / topks[i]; + } + } +} + +template +void SequenceTopkAvgPoolingCompute::Run() { + auto ¶m = this->Param(); + auto &ctx = this->ctx_->template As(); + auto cuda_stream = ctx.exec_stream(); + int topk_num = param.topks.size(); + lite::DDim top_ks_shape(std::vector{topk_num, 1, 1, 1}); + _top_ks.Resize(top_ks_shape); + cudaMemcpyAsync(_top_ks.mutable_data(TARGET(kCUDA)), + ¶m.topks[0], + sizeof(int) * topk_num, + cudaMemcpyHostToDevice, + cuda_stream); + + int width_offset_len = param.COLUMN->lod()[0].size(); + lite::DDim width_offset_shape( + std::vector{width_offset_len, 1, 1, 1}); + _width_offset.Resize(width_offset_shape); + std::vector width_lod_0(width_offset_len, 0); + for (size_t i = 0; i < param.COLUMN->lod()[0].size(); ++i) { + width_lod_0[i] = static_cast(param.COLUMN->lod()[0][i]); + } + cudaMemcpyAsync(_width_offset.mutable_data(TARGET(kCUDA)), + &width_lod_0[0], + sizeof(int) * width_offset_len, + cudaMemcpyHostToDevice, + cuda_stream); + + int height_offset_len = param.ROW->lod()[0].size(); + lite::DDim height_offset_shape( + std::vector{height_offset_len, 1, 1, 1}); + _height_offset.Resize(height_offset_shape); + std::vector height_lod_0(height_offset_len, 0); + for (size_t i = 0; i < param.ROW->lod()[0].size(); ++i) { + height_lod_0[i] = static_cast(param.ROW->lod()[0][i]); + } + cudaMemcpyAsync(_height_offset.mutable_data(TARGET(kCUDA)), + &height_lod_0[0], + sizeof(int) 
* height_offset_len, + cudaMemcpyHostToDevice, + cuda_stream); + + const Tensor *x_tensor = param.X; + Tensor *out_tensor = param.Out; + const T *in_data = x_tensor->data(); + T *out_data = out_tensor->mutable_data(TARGET(kCUDA)); + TargetWrapperCuda::MemsetAsync(out_tensor->mutable_data(TARGET(kCUDA)), + 0, + sizeof(T) * out_tensor->numel(), + cuda_stream); + + int num = param.ROW->lod()[0].size() - 1; + int channel = param.channel_num; + + const int *height_offset = _height_offset.data(); + const int *width_offset = _width_offset.data(); + + int feat_map_size = 0; + for (size_t i = 0; i < height_lod_0.size() - 1; ++i) { + int height = height_lod_0[i + 1] - height_lod_0[i]; + int width = width_lod_0[i + 1] - width_lod_0[i]; + if (height * width > feat_map_size) { + feat_map_size = height * width; + } + } + dim3 blocks(num, channel); + dim3 threads(32, 1); + topk_avg_pooling_kernel_by_row_improve< + T><<>>( + out_data, + in_data, + height_offset, + width_offset, + param.topks.size(), + _top_ks.data(), + param.channel_num); + + cudaError_t error = cudaGetLastError(); + if (error != cudaSuccess) LOG(ERROR) << cudaGetErrorString(error); +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL( + sequence_topk_avg_pooling, + kCUDA, + kFloat, + kNCHW, + paddle::lite::kernels::cuda::SequenceTopkAvgPoolingCompute, + def) + .BindInput("X", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindInput("ROW", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindInput("COLUMN", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindOutput("pos", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .Finalize(); diff --git a/lite/kernels/cuda/sequence_topk_avg_pooling_compute.h b/lite/kernels/cuda/sequence_topk_avg_pooling_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..321ec9cfce2b22e7ddfc5dab53060a7eaea01732 --- /dev/null +++ b/lite/kernels/cuda/sequence_topk_avg_pooling_compute.h @@ -0,0 +1,43 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
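Per row of each feature map, the kernel above greedily extracts the max_k largest values from shared memory and accumulates each of them into every top-k slot it belongs to, finally dividing slot c by topks[c]. On the host the same per-row result can be obtained with a sort; the following is an illustrative sketch under that reading of the kernel, not part of the patch:

#include <algorithm>
#include <functional>
#include <vector>

// `values` is one row of one feature map; `topks` is assumed sorted ascending,
// matching the kernel's max_k = min(topks.back(), row length).
std::vector<float> topk_avg_row(std::vector<float> values,
                                const std::vector<int>& topks) {
  std::sort(values.begin(), values.end(), std::greater<float>());
  std::vector<float> out(topks.size(), 0.f);
  for (size_t c = 0; c < topks.size(); ++c) {
    int k = std::min<int>(topks[c], static_cast<int>(values.size()));
    float sum = 0.f;
    for (int i = 0; i < k; ++i) sum += values[i];
    out[c] = sum / topks[c];  // note: divides by topks[c] even when k < topks[c]
  }
  return out;
}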
+ +#pragma once +#include +#include "lite/backends/cuda/cuda_utils.h" +#include "lite/core/kernel.h" +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +template +class SequenceTopkAvgPoolingCompute + : public KernelLite { + public: + using param_t = operators::SequenceTopkAvgPoolingParam; + + void Run() override; + + virtual ~SequenceTopkAvgPoolingCompute() = default; + + protected: + lite::Tensor _height_offset; + lite::Tensor _width_offset; + lite::Tensor _top_ks; +}; + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/softmax_compute.cu b/lite/kernels/cuda/softmax_compute.cu index d8d2987524cd2e8f9c38aba4da3ff61a80bf53ce..6293f7295ec78f44705992182667b30f82728e09 100644 --- a/lite/kernels/cuda/softmax_compute.cu +++ b/lite/kernels/cuda/softmax_compute.cu @@ -173,9 +173,10 @@ void SoftmaxCompute::Run() { cudaGetDeviceProperties(&deviceProp, device_id); size_t sharedmem_size = deviceProp.sharedMemPerBlock; int max_dimsize = sharedmem_size / sizeof(float) / threads; - auto input_data = param.x->data(); auto output_data = param.output->mutable_data(TARGET(kCUDA)); + TargetWrapperCuda::MemsetSync( + output_data, 0, param.output->numel() * sizeof(float)); if (axis_size <= max_dimsize) { int use_sharemem_size = axis_size * threads * sizeof(float); sharemem_softmax_kernel<<>>( @@ -194,7 +195,7 @@ void SoftmaxCompute::Run() { auto max_data = tmax_data.mutable_data(TARGET(kCUDA)); auto sum_data = tsum_data.mutable_data(TARGET(kCUDA)); //! firstly, get maximum data - float min_data = std::numeric_limits::min(); + float min_data = std::numeric_limits::lowest(); softmax_max_kernel<<>>(total_threads, input_data, max_data, @@ -217,7 +218,7 @@ void SoftmaxCompute::Run() { total_threads, output_data, sum_data, inner_num, outer_num, axis_size); } cudaError_t error = cudaGetLastError(); - if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error); + if (error != cudaSuccess) LOG(ERROR) << cudaGetErrorString(error); } } // namespace cuda @@ -244,3 +245,19 @@ REGISTER_LITE_KERNEL(softmax, PRECISION(kFloat), DATALAYOUT(kNCHW))}) .Finalize(); +REGISTER_LITE_KERNEL(search_seq_softmax, + kCUDA, + kFloat, + kNCHW, + paddle::lite::kernels::cuda::SoftmaxCompute, + def) + .BindInput("X", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindOutput("Out_log", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .Finalize(); diff --git a/lite/kernels/cuda/var_conv_2d_compute.cu b/lite/kernels/cuda/var_conv_2d_compute.cu new file mode 100644 index 0000000000000000000000000000000000000000..f2588a8f53b83363300000fca6ba8a11cf5d50b6 --- /dev/null +++ b/lite/kernels/cuda/var_conv_2d_compute.cu @@ -0,0 +1,263 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include "lite/backends/cuda/math/gemm.h" +#include "lite/core/op_registry.h" +#include "lite/core/target_wrapper.h" +#include "lite/core/tensor.h" +#include "lite/kernels/cuda/var_conv_2d_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +const int CUDA_NUM_THREADS = 512; + +template +__global__ void var_im2col_gpu_kernel(const int n, + const Dtype* data_im, + const int height, + const int width, + const int kernel_h, + const int kernel_w, + const int pad_h, + const int pad_w, + const int stride_h, + const int stride_w, + const int height_col, + const int width_col, + Dtype* data_col) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + for (int index = idx; index < n; index += blockDim.x * gridDim.x) { + const int h_index = index / width_col; + const int h_col = h_index % height_col; + const int w_col = index % width_col; + const int c_im = h_index / height_col; + const int c_col = c_im * kernel_h * kernel_w; + const int h_offset = h_col * stride_h - pad_h; + const int w_offset = w_col * stride_w - pad_w; + + Dtype* data_col_ptr = data_col; + data_col_ptr += (c_col * height_col + h_col) * width_col + w_col; + const Dtype* data_im_ptr = data_im; + data_im_ptr += (c_im * height + h_offset) * width + w_offset; + + for (int i = 0; i < kernel_h; ++i) { + for (int j = 0; j < kernel_w; ++j) { + int h_im = h_offset + i; + int w_im = w_offset + j; + *data_col_ptr = + (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) + ? data_im_ptr[i * width + j] + : 0; + data_col_ptr += height_col * width_col; + } + } + } +} + +void VarConv2DCompute::var_im2col(const cudaStream_t& stream) { + auto& param = this->Param(); + int input_channel = param.input_channel; + int kernel_h = param.kernel_h; + int kernel_w = param.kernel_w; + int stride_h = param.stride_h; + int stride_w = param.stride_w; + // auto* in_row = param.ROW; + // auto* in_col = param.COLUMN; + const auto* input = param.X; + auto* col = param.Col; + + int batch = input->lod()[0].size() - 1; + const auto& bottom_offset = input->lod()[0]; + // 2-D lod info. 
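+  // (illustrative note, not in the original patch) X carries three LoD levels
+  // in this kernel: lod()[0] holds the per-sample offsets into the flattened
+  // data, lod()[1] the row (height) offsets and lod()[2] the column (width)
+  // offsets, which is why ROW and COLUMN are only kept as commented-out
+  // references below.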
+ // const auto& offset_x = in_col->lod()[0]; + // const auto& offset_y = in_row->lod()[0]; + const auto& offset_y = param.X->lod()[1]; + const auto& offset_x = param.X->lod()[2]; + // top offset is the whole size of each data sample + std::vector top_offset; + int top_size = 0; + top_offset.push_back(top_size); + for (int b = 0; b < batch; ++b) { + int width = offset_x[b + 1] - offset_x[b]; + int height = offset_y[b + 1] - offset_y[b]; + int top_im_x = 0; + if (width == 0) { + top_im_x = 0; + } else { + top_im_x = (width - 1) / stride_w + 1; + } + int top_im_y = 0; + if (height == 0) { + top_im_y = 0; + } else { + top_im_y = (height - 1) / stride_h + 1; + } + int top_x = top_im_x * top_im_y; + int top_y = input_channel * kernel_h * kernel_w; + top_size += top_y * top_x; + top_offset.push_back(top_size); + } + + LoD col_lod; + col_lod.push_back(top_offset); + col->set_lod(col_lod); + std::vector col_dims_vec{top_size}; + col_dims_vec.push_back(1); + col->Resize(col_dims_vec); + auto* top_data = col->mutable_data(TARGET(kCUDA)); + const auto* bottom_data = input->data(); + + for (int b = 0; b < batch; ++b) { + int t_offset = top_offset[b]; + int b_offset = bottom_offset[b]; + int width = offset_x[b + 1] - offset_x[b]; + int height = offset_y[b + 1] - offset_y[b]; + if (width == 0 || height == 0) { + continue; + } + int width_col = (width - 1) / stride_w + 1; + int height_col = (height - 1) / stride_h + 1; + const float* data_im = bottom_data + b_offset; + float* data_col = top_data + t_offset; + + // We are going to launch channels * height_col * width_col kernels, each + // kernel responsible for copying a single-channel grid. + int num_kernels = height_col * width_col * input_channel; + const int CUDA_NUM_BLOCKS = + (num_kernels + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; + var_im2col_gpu_kernel< + float><<>>( + num_kernels, + data_im, + height, + width, + kernel_h, + kernel_w, + ((stride_h - 1) * height + kernel_h - 1) / 2, + ((stride_w - 1) * width + kernel_w - 1) / 2, + stride_h, + stride_w, + height_col, + width_col, + data_col); + } +} + +void VarConv2DCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->template As(); + auto stream = ctx.exec_stream(); + + auto* bottom = param.X; + // auto* in_row = param.ROW; + // auto* in_col = param.COLUMN; + auto* w = param.W; + auto* top = param.Out; + auto* col = param.Col; + int output_channel = param.output_channel; + int input_channel = param.input_channel; + int kernel_h = param.kernel_h; + int kernel_w = param.kernel_w; + int stride_h = param.stride_h; + int stride_w = param.stride_w; + + var_im2col(stream); + + int batch = bottom->lod()[0].size() - 1; + const auto& col_offset = col->lod()[0]; + // const auto& offset_x = in_col->lod()[0]; + // const auto& offset_y = in_row->lod()[0]; + const auto& offset_y = param.X->lod()[1]; + const auto& offset_x = param.X->lod()[2]; + std::vector top_offset; + std::vector height_vector; + std::vector width_vector; + int top_size = 0; + top_offset.push_back(top_size); + for (int b = 0; b < batch; ++b) { + int width = offset_x[b + 1] - offset_x[b]; + int height = offset_y[b + 1] - offset_y[b]; + int top_im_x = 0; + if (width == 0) { + top_im_x = 0; + } else { + top_im_x = (width - 1) / stride_w + 1; + } + int top_im_y = 0; + if (height == 0) { + top_im_y = 0; + } else { + top_im_y = (height - 1) / stride_h + 1; + } + height_vector.push_back(top_im_y); + width_vector.push_back(top_im_x); + int top_im_size = top_im_y * top_im_x; + top_size += output_channel * top_im_size; + 
top_offset.push_back(top_size); + } + + LoD top_lod; + top_lod.push_back(top_offset); + top->set_lod(top_lod); + std::vector top_dims_vec{top_size}; + top_dims_vec.push_back(1); + top->Resize(top_dims_vec); + + auto* top_data = top->mutable_data(TARGET(kCUDA)); + const auto* w_data = w->data(); + const auto* col_data = col->data(); + + std::unique_ptr> gemm_impl_; + for (int b = 0; b < batch; ++b) { + int top_im_size = (top_offset[b + 1] - top_offset[b]) / output_channel; + if (top_im_size == 0) { + continue; + } + float* out_data = top_data + top_offset[b]; + const float* in_data = col_data + col->lod()[0][b]; + gemm_impl_.reset(new lite::cuda::math::Gemm); + gemm_impl_->init(false, + false, + w->dims()[0], + height_vector[b] * width_vector[b], + input_channel * kernel_h * kernel_w, + &ctx); + gemm_impl_->run(1., 0., w_data, in_data, out_data, &ctx); + } + + cudaError_t error = cudaGetLastError(); + if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error); +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(var_conv_2d, + kCUDA, + kFloat, + kNCHW, + paddle::lite::kernels::cuda::VarConv2DCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindInput("W", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("Col", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .Finalize(); diff --git a/lite/kernels/cuda/var_conv_2d_compute.h b/lite/kernels/cuda/var_conv_2d_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..e0b8e30c509f9095960bee3720567c96a71e7336 --- /dev/null +++ b/lite/kernels/cuda/var_conv_2d_compute.h @@ -0,0 +1,37 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +class VarConv2DCompute : public KernelLite { + public: + using param_t = operators::VarConv2DParam; + + void Run() override; + virtual ~VarConv2DCompute() = default; + + private: + void var_im2col(const cudaStream_t& stream); +}; + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/var_conv_2d_compute_test.cc b/lite/kernels/cuda/var_conv_2d_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..98e9c73cdd680edc03cf18b60444bd5b0f76274c --- /dev/null +++ b/lite/kernels/cuda/var_conv_2d_compute_test.cc @@ -0,0 +1,360 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/cuda/var_conv_2d_compute.h" +#include +#include +#include +#include + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +static void im2col_ref(const lite::Tensor& input, + const lite::Tensor* in_row, + const lite::Tensor* in_col, + const int kernel_h, + const int kernel_w, + const int stride_h, + const int stride_w, + const int input_channel, + lite::Tensor* col) { + int batch = input.lod()[0].size() - 1; + const auto& bottom_offset = input.lod()[0]; + // 2-D lod info. + const auto& offset_x = in_col->lod()[0]; + const auto& offset_y = in_row->lod()[0]; + + // top offset is the whole size of each data sample + std::vector top_offset; + int top_size = 0; + top_offset.push_back(top_size); + for (int b = 0; b < batch; ++b) { + int width = offset_x[b + 1] - offset_x[b]; + int height = offset_y[b + 1] - offset_y[b]; + int top_im_x = 0; + if (width == 0) { + top_im_x = 0; + } else { + top_im_x = (width - 1) / stride_w + 1; + } + int top_im_y = 0; + if (height == 0) { + top_im_y = 0; + } else { + top_im_y = (height - 1) / stride_h + 1; + } + int top_x = top_im_x * top_im_y; + int top_y = input_channel * kernel_h * kernel_w; + top_size += top_y * top_x; + top_offset.push_back(top_size); + } + LoD col_lod; + col_lod.push_back(top_offset); + col->set_lod(col_lod); + std::vector col_dims_vec{top_size}; + col_dims_vec.push_back(1); + col->Resize(col_dims_vec); + auto* top_data = col->mutable_data(); + const auto* bottom_data = input.data(); + + int kernel_win_size = kernel_h * kernel_w; + int half_kernel_h = kernel_h / 2; + int half_kernel_w = kernel_w / 2; + for (int b = 0; b < batch; ++b) { + int t_offset = top_offset[b]; + int b_offset = bottom_offset[b]; + int width = offset_x[b + 1] - offset_x[b]; + int height = offset_y[b + 1] - offset_y[b]; + if (width == 0 || height == 0) { + continue; + } + int top_im_x = (width - 1) / stride_w + 1; + int top_im_y = (height - 1) / stride_h + 1; + int top_x = top_im_y * top_im_x; + for (int z = 0; z < input_channel; ++z) { + int row_offset = kernel_win_size * z; + int im_offset = z * width * height; + for (int y = 0; y < height; y += stride_h) { + for (int x = 0; x < width; x += stride_w) { + int col_offset = x / stride_w + y / stride_h * top_im_x; + for (int ky = 0; ky < kernel_h; ++ky) { + for (int kx = 0; kx < kernel_w; ++kx) { + int im_y = y + ky - half_kernel_h; + int im_x = x + kx - half_kernel_w; + if (im_x >= 0 && im_x < width && im_y >= 0 && im_y < height) { + top_data[t_offset + (row_offset + ky * kernel_w + kx) * top_x + + col_offset] = + bottom_data[b_offset + im_offset + im_y * width + im_x]; + } else { + top_data[t_offset + (row_offset + ky * kernel_w + kx) * top_x + + col_offset] = 0; + } + } + } + } + } + } + } +} + +static void naive_sgemm(const bool transpose_A, + const bool transpose_B, + const int M, + const int N, + const int K, + const float alpha, + const float* A, // m x k (after transpose if TransA) + const int lda, // leading dimension of a + const float* B, // k x n (after transpose if TransB) + const int ldb, // leading dimension of b + const float beta, + 
float* C, // m x n + const int ldc) { + for (int m = 0; m < M; ++m) { + for (int k = 0; k < K; ++k) { + for (int n = 0; n < N; ++n) { + C[m * N + n] += beta * C[m * N + n]; + size_t A_idx = 0, B_idx = 0; + if (transpose_A) { + A_idx = k * M + m; // A is k x m + } else { + A_idx = m * K + k; // A is m x k + } + + if (transpose_B) { + B_idx = n * K + k; // B is n x k + } else { + B_idx = k * N + n; // B is k x n + } + + C[m * N + n] += alpha * A[A_idx] * B[B_idx]; + } + } + } +} + +static void var_conv_2d_ref(const lite::Tensor* bottom, + const lite::Tensor* w, + const lite::Tensor* in_row, + const lite::Tensor* in_col, + const int kernel_h, + const int kernel_w, + const int stride_h, + const int stride_w, + const int input_channel, + const int output_channel, + lite::Tensor* top, + lite::Tensor* col) { + im2col_ref(*bottom, + in_row, + in_col, + kernel_h, + kernel_w, + stride_h, + stride_w, + input_channel, + col); + int batch = bottom->lod()[0].size() - 1; + const auto& col_offset = col->lod()[0]; + const auto& offset_x = in_col->lod()[0]; + const auto& offset_y = in_row->lod()[0]; + std::vector top_offset; + int top_size = 0; + top_offset.push_back(top_size); + for (int b = 0; b < batch; ++b) { + int width = offset_x[b + 1] - offset_x[b]; + int height = offset_y[b + 1] - offset_y[b]; + int top_im_x = 0; + if (width == 0) { + top_im_x = 0; + } else { + top_im_x = (width - 1) / stride_w + 1; + } + int top_im_y = 0; + if (height == 0) { + top_im_y = 0; + } else { + top_im_y = (height - 1) / stride_h + 1; + } + int top_im_size = top_im_y * top_im_x; + top_size += output_channel * top_im_size; + top_offset.push_back(top_size); + } + + LoD top_lod; + top_lod.push_back(top_offset); + top->set_lod(top_lod); + std::vector top_dims_vec{top_size}; + top_dims_vec.push_back(1); + top->Resize(top_dims_vec); + auto* top_data = top->mutable_data(); + const auto* w_data = w->data(); + const auto* col_data = col->data(); + + for (int b = 0; b < batch; ++b) { + int top_im_size = (top_offset[b + 1] - top_offset[b]) / output_channel; + if (top_im_size == 0) { + continue; + } + + naive_sgemm(false, + false, + output_channel, + top_im_size, + input_channel * kernel_h * kernel_w, + 1.0, + w_data, + input_channel * kernel_h * kernel_w, + col_data + col_offset[b], + top_im_size, + 0.0, + top_data + top_offset[b], + top_im_size); + } +} + +TEST(var_conv_2d_cuda, normal) { + VarConv2DCompute var_conv_kernel; + std::unique_ptr ctx(new KernelContext); + auto& context = ctx->As(); + + operators::VarConv2DParam param; + + lite::Tensor X, W, ROW, COLUMN; + lite::Tensor x_cpu, w_cpu; + lite::Tensor Out, Col, out_cpu, col_cpu; + int kernel_h = 5, kernel_w = 5; + int stride_h = 1, stride_w = 1; + int input_channel = 5, output_channel = 5; + + std::vector w_dims_vec; + w_dims_vec.push_back(output_channel); + w_dims_vec.push_back(input_channel * kernel_h * kernel_w); + W.Resize(w_dims_vec); + w_cpu.Resize(w_dims_vec); + auto* w_cpu_data = w_cpu.mutable_data(); + for (int i = 0; i < W.numel(); ++i) { + w_cpu_data[i] = i - 1.f; + } + + std::vector row_lod_vec{0, 10, 20}; + LoD row_lod; + row_lod.push_back(row_lod_vec); + ROW.set_lod(row_lod); + + std::vector column_lod_vec{0, 10, 20}; + LoD column_lod; + column_lod.push_back(column_lod_vec); + COLUMN.set_lod(column_lod); + + int x_size = 0; + std::vector x_lod_vec; + x_lod_vec.push_back(0); + for (size_t i = 0; i < row_lod_vec.size() - 1; ++i) { + int height = row_lod_vec[i + 1] - row_lod_vec[i]; + int width = column_lod_vec[i + 1] - column_lod_vec[i]; + 
x_lod_vec.push_back(x_lod_vec.back() + height * width); + x_size += height * width; + } + for (size_t i = 0; i < x_lod_vec.size(); ++i) { + x_lod_vec[i] *= input_channel; + } + x_size *= input_channel; + std::vector<int64_t> x_dims_vec{x_size, 1}; + LoD x_lod; + x_lod.push_back(x_lod_vec); + x_lod.push_back(row_lod_vec); + x_lod.push_back(column_lod_vec); + X.Resize(x_dims_vec); + x_cpu.Resize(x_dims_vec); + X.set_lod(x_lod); + x_cpu.set_lod(x_lod); + auto* x_cpu_data = x_cpu.mutable_data<float>(); + for (int i = 0; i < X.numel(); ++i) { + x_cpu_data[i] = i % 20 * 1.f; + } + + int sum_num = 0; + int out_sum_num = 0; + for (size_t i = 0; i < row_lod_vec.size() - 1; ++i) { + int height = row_lod_vec[i + 1] - row_lod_vec[i]; + int width = column_lod_vec[i + 1] - column_lod_vec[i]; + sum_num += height * width * input_channel * kernel_h * kernel_w; + out_sum_num += height * width * output_channel; + } + col_cpu.Resize({sum_num, 1}); + out_cpu.Resize({out_sum_num, 1}); + float* out_cpu_data = out_cpu.mutable_data<float>(); + float* col_cpu_data = col_cpu.mutable_data<float>(); + + X.Assign(x_cpu_data, x_cpu.dims()); + W.Assign(w_cpu_data, w_cpu.dims()); + + param.X = &X; + param.W = &W; + // param.ROW = &ROW; + // param.COLUMN = &COLUMN; + param.Out = &Out; + param.Col = &Col; + param.stride_h = stride_h; + param.stride_w = stride_w; + param.kernel_h = kernel_h; + param.kernel_w = kernel_w; + param.input_channel = input_channel; + param.output_channel = output_channel; + var_conv_kernel.SetParam(param); + cudaStream_t stream; + cudaStreamCreate(&stream); + context.SetExecStream(stream); + var_conv_kernel.SetContext(std::move(ctx)); + var_conv_kernel.Run(); + cudaDeviceSynchronize(); + + const float* out_data = Out.data<float>(); + const float* col_data = Col.data<float>(); + + CopySync<TARGET(kCUDA)>( + out_cpu_data, out_data, sizeof(float) * Out.numel(), IoDirection::DtoH); + CopySync<TARGET(kCUDA)>( + col_cpu_data, col_data, sizeof(float) * Col.numel(), IoDirection::DtoH); + + lite::Tensor top_ref, col_ref; + var_conv_2d_ref(&x_cpu, + &w_cpu, + &ROW, + &COLUMN, + kernel_h, + kernel_w, + stride_h, + stride_w, + input_channel, + output_channel, + &top_ref, + &col_ref); + + for (int i = 0; i < Out.numel(); ++i) { + EXPECT_NEAR(out_cpu_data[i], top_ref.data<float>()[i], 1e-5); + } + for (int i = 0; i < Col.numel(); ++i) { + EXPECT_NEAR(col_cpu_data[i], col_ref.data<float>()[i], 1e-5); + } +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/fpga/conv_compute.cc b/lite/kernels/fpga/conv_compute.cc index 3e06e103bba61937e48bb4d14eeedd493ab15bba..8bc171dd67df08c17cdce61c6fa6882afd9ae8ae 100644 --- a/lite/kernels/fpga/conv_compute.cc +++ b/lite/kernels/fpga/conv_compute.cc @@ -36,8 +36,15 @@ void ConvCompute::PrepareForRun() { conv_param.filter = param.filter->ZynqTensor(); conv_param.groups = param.groups; conv_param.strides = param.strides; + auto paddings = *param.paddings; conv_param.paddings = param.paddings; conv_param.dilations = param.dilations; + bool pad_equal = + ((paddings[0] == paddings[1]) && (paddings[2] == paddings[3])); + if (!pad_equal) { + LOG(FATAL) << "This pad not support ! 
" << paddings[0] << ", " << paddings[1] + << ", " << paddings[2] << ", " << paddings[3]; + } fill_scale_bias_const(&conv_param); conv_param.bias()->copyFrom(param.bias->ZynqTensor()); conv_param.relu.enabled = param.fuse_relu; diff --git a/lite/kernels/fpga/conv_compute_test.cc b/lite/kernels/fpga/conv_compute_test.cc index f166974cc9f2fd856defd753e1e9131858d41252..1e05c1fa0c7e0f211b5eaed8f5e0385cbfe20cf2 100644 --- a/lite/kernels/fpga/conv_compute_test.cc +++ b/lite/kernels/fpga/conv_compute_test.cc @@ -141,13 +141,15 @@ void conv_compute_ref(const operators::ConvParam& param) { int group = param.groups; int kernel_w = param.filter->dims()[2]; int kernel_h = param.filter->dims()[3]; + + auto paddings = *param.paddings; + auto dilations = *para.dilations; int stride_w = param.strides[0]; int stride_h = param.strides[1]; - int dila_w = param.dilations[0]; - int dila_h = param.dilations[1]; - - int pad_w = param.paddings[0]; - int pad_h = param.paddings[1]; + int dila_w = dilations[0]; + int dila_h = dilations[1]; + int pad_w = paddings[2]; + int pad_h = paddings[0]; bool flag_bias = (param.bias != nullptr); bool flag_relu = param.fuse_relu; @@ -277,10 +279,14 @@ TEST(conv_fpga, compute) { param.bias = &bias; } param.fuse_relu = flag_relu; - param.paddings = std::vector({padding, padding}); + std::vector paddings = { + padding, padding, padding, padding}; param.strides = std::vector({stride, stride}); + std::vector dilations = {dilation, dilation}; + param.paddings = + std::make_shared>(paddings); param.dilations = - std::vector({dilation, dilation}); + std::make_shared>(dilations); param.groups = group; conv.SetParam(param); conv.Launch(); diff --git a/lite/kernels/npu/bridges/CMakeLists.txt b/lite/kernels/npu/bridges/CMakeLists.txt index 032de819743f4aba02e442dd71c26b950d1435b6..79d1bf2fd5fa694d4888d474c321a43d279bab76 100644 --- a/lite/kernels/npu/bridges/CMakeLists.txt +++ b/lite/kernels/npu/bridges/CMakeLists.txt @@ -19,6 +19,9 @@ lite_cc_library(npu_bridge_split_op SRCS split_op.cc DEPS ${npu_bridge_deps}) lite_cc_library(npu_bridge_concat_op SRCS concat_op.cc DEPS ${npu_bridge_deps}) lite_cc_library(npu_bridge_shuffle_channel_op SRCS shuffle_channel_op.cc DEPS ${npu_bridge_deps}) lite_cc_library(npu_bridge_pad2d_op SRCS pad2d_op.cc DEPS ${npu_bridge_deps}) +lite_cc_library(npu_bridge_square_op SRCS square_op.cc DEPS ${npu_bridge_deps}) +lite_cc_library(npu_bridge_sqrt_op SRCS sqrt_op.cc DEPS ${npu_bridge_deps}) +lite_cc_library(npu_bridge_reduce_mean_op SRCS reduce_mean_op.cc DEPS ${npu_bridge_deps}) set(npu_bridges npu_bridge_registry @@ -39,6 +42,9 @@ set(npu_bridges npu_bridge_concat_op npu_bridge_shuffle_channel_op npu_bridge_pad2d_op + npu_bridge_square_op + npu_bridge_sqrt_op + npu_bridge_reduce_mean_op CACHE INTERNAL "npu_bridges") set(npu_bridge_test_deps ${npu_bridges} ${npu_kernels} ${ops}) @@ -60,5 +66,8 @@ lite_cc_test(test_npu_bridge_split_op SRCS split_op_test.cc test_helper.cc DEPS lite_cc_test(test_npu_bridge_concat_op SRCS concat_op_test.cc test_helper.cc DEPS ${npu_bridge_test_deps}) lite_cc_test(test_npu_bridge_shuffle_channel_op SRCS shuffle_channel_op_test.cc test_helper.cc DEPS ${npu_bridge_test_deps}) lite_cc_test(test_npu_bridge_pad2d_op SRCS pad2d_op_test.cc test_helper.cc DEPS ${npu_bridge_test_deps}) +lite_cc_test(test_npu_bridge_square_op SRCS square_op_test.cc test_helper.cc DEPS ${npu_bridge_test_deps}) +lite_cc_test(test_npu_bridge_sqrt_op SRCS sqrt_op_test.cc test_helper.cc DEPS ${npu_bridge_test_deps}) +lite_cc_test(test_npu_bridge_reduce_mean_op SRCS 
reduce_mean_op_test.cc test_helper.cc DEPS ${npu_bridge_test_deps}) message(STATUS "+++++ npu_bridges: ${npu_bridges}") diff --git a/lite/kernels/npu/bridges/act_op.cc b/lite/kernels/npu/bridges/act_op.cc index 51b49091cd0e6f47fb9367e13aa7b2e43a6cf610..ac62891113b1899036c35ffd3058f1d409b00a36 100644 --- a/lite/kernels/npu/bridges/act_op.cc +++ b/lite/kernels/npu/bridges/act_op.cc @@ -41,6 +41,19 @@ node_map_type ActConverter(const std::shared_ptr act_op, // clipped_relu etc. act_node->set_attr_mode(lite::npu::CvtActMode(op_type)); + if (op_type == "relu_clipped") { + auto Relu_clipped_coef = op_info->GetAttr("Relu_clipped_coef"); + act_node->set_attr_coef(Relu_clipped_coef); + } else if (op_type == "leaky_relu") { + auto alpha = op_info->GetAttr("alpha"); + act_node->set_attr_negative_slope(alpha); + } else if (op_type == "hard_sigmoid") { + auto slope = op_info->GetAttr("slope"); + auto offset = op_info->GetAttr("offset"); + act_node->set_attr_negative_slope(slope); + act_node->set_attr_coef(offset); + } + node_map_type outputs_map; outputs_map[op_info->Output("Out").front()] = act_node; return outputs_map; @@ -52,14 +65,18 @@ node_map_type ActConverter(const std::shared_ptr act_op, } // namespace lite } // namespace paddle -REGISTER_NPU_BRIDGE(sigmod, paddle::lite::kernels::npu::bridges::ActConverter); +REGISTER_NPU_BRIDGE(sigmoid, paddle::lite::kernels::npu::bridges::ActConverter); REGISTER_NPU_BRIDGE(relu, paddle::lite::kernels::npu::bridges::ActConverter); REGISTER_NPU_BRIDGE(tanh, paddle::lite::kernels::npu::bridges::ActConverter); -REGISTER_NPU_BRIDGE(elu, paddle::lite::kernels::npu::bridges::ActConverter); +REGISTER_NPU_BRIDGE(relu_clipped, + paddle::lite::kernels::npu::bridges::ActConverter); +// REGISTER_NPU_BRIDGE(elu, paddle::lite::kernels::npu::bridges::ActConverter); +REGISTER_NPU_BRIDGE(leaky_relu, + paddle::lite::kernels::npu::bridges::ActConverter); REGISTER_NPU_BRIDGE(abs, paddle::lite::kernels::npu::bridges::ActConverter); REGISTER_NPU_BRIDGE(softsign, paddle::lite::kernels::npu::bridges::ActConverter); REGISTER_NPU_BRIDGE(softplus, paddle::lite::kernels::npu::bridges::ActConverter); -REGISTER_NPU_BRIDGE(hardsigmoid, +REGISTER_NPU_BRIDGE(hard_sigmoid, paddle::lite::kernels::npu::bridges::ActConverter); diff --git a/lite/kernels/npu/bridges/act_op_test.cc b/lite/kernels/npu/bridges/act_op_test.cc index 420de655dcdfb2069948399525bc4a8a561d0fd5..d50b1968b14cc33efd7ab9bcd0c4427d8ca2e508 100644 --- a/lite/kernels/npu/bridges/act_op_test.cc +++ b/lite/kernels/npu/bridges/act_op_test.cc @@ -17,7 +17,7 @@ #include "lite/core/op_registry.h" #include "lite/kernels/npu/bridges/registry.h" #include "lite/kernels/npu/bridges/test_helper.h" -#include "lite/operators/relu_op.h" +#include "lite/operators/activation_ops.h" namespace paddle { namespace lite { @@ -25,69 +25,112 @@ namespace kernels { namespace npu { namespace bridges { -void relu_ref(const std::shared_ptr op) { +void act_ref(const std::shared_ptr op) { Scope* scope = op->scope(); const OpInfo* op_info = op->op_info(); - auto x = scope->FindVar(op_info->Input("X").front())->GetMutable(); - auto out = - scope->FindVar(op_info->Output("Out").front())->GetMutable(); + auto op_type = op_info->Type(); + auto x = scope->FindTensor("x"); + auto out = scope->FindMutableTensor("out_ref"); + out->Resize(x->dims()); auto x_data = x->data(); auto out_data = out->mutable_data(); - DDim x_dims = x->dims(); - DDim out_dims = out->dims(); - CHECK_EQ(x_dims.production(), out_dims.production()); - for (int i = 0; i < out_dims.production(); 
i++) { - out_data[i] = std::max(0.f, x_data[i]); + CHECK_EQ(x->numel(), out->numel()); + + // "sigmoid","relu","tanh","relu_clipped","leaky_relu","softsign","hard_sigmoid" + if (op_type == "sigmoid") { + for (size_t i = 0; i < out->numel(); i++) { + out_data[i] = 1.f / (1.f + std::exp(-x_data[i])); + } + } else if (op_type == "relu") { + for (size_t i = 0; i < out->numel(); i++) { + out_data[i] = std::max(0.f, x_data[i]); + } + } else if (op_type == "tanh") { + for (size_t i = 0; i < out->numel(); i++) { + out_data[i] = (std::exp(x_data[i]) - std::exp(-x_data[i])) / + (std::exp(x_data[i]) + std::exp(-x_data[i])); + } + } else if (op_type == "relu_clipped") { + auto relu_clipped_coef = op_info->GetAttr("Relu_clipped_coef"); + for (size_t i = 0; i < out->numel(); i++) { + out_data[i] = std::min(std::max(0.f, x_data[i]), relu_clipped_coef); + } + } else if (op_type == "leaky_relu") { + auto alpha = op_info->GetAttr("alpha"); + for (size_t i = 0; i < out->numel(); i++) { + out_data[i] = std::max(x_data[i], x_data[i] * alpha); + } + } else if (op_type == "softsign") { + for (size_t i = 0; i < out->numel(); i++) { + out_data[i] = x_data[i] / (1 + std::abs(x_data[i])); + } + } else if (op_type == "hard_sigmoid") { + auto slope = op_info->GetAttr("slope"); + auto offset = op_info->GetAttr("offset"); + for (size_t i = 0; i < out->numel(); i++) { + out_data[i] = std::min(1.f, slope * x_data[i] + offset); + out_data[i] = std::max(0.f, out_data[i]); + } + } else { + LOG(FATAL) << "unsupported activation type: " << op_type; } } -void test_relu(int bs, int ic, int ih, int iw) { +void test_act(std::vector x_shape, std::string op_type) { // prepare input&output variables Scope scope; std::string x_var_name("x"); std::string out_var_name("out"); std::string out_ref_var_name("out_ref"); - auto* x = scope.Var(x_var_name)->GetMutable(); - auto* out = scope.Var(out_var_name)->GetMutable(); - auto* out_ref = scope.Var(out_ref_var_name)->GetMutable(); - x->Resize({bs, ic, ih, iw}); + auto* x = scope.NewTensor(x_var_name); + auto* out = scope.NewTensor(out_var_name); + auto* out_ref = scope.NewTensor(out_ref_var_name); + x->Resize(x_shape); // initialize input&output data - FillTensor(x); + FillTensor(x, -8, 8); // initialize op desc cpp::OpDesc opdesc; - opdesc.SetType("relu"); + opdesc.SetType(op_type); opdesc.SetInput("X", {x_var_name}); opdesc.SetOutput("Out", {out_var_name}); + if (op_type == "relu_clipped") { + opdesc.SetAttr("Relu_clipped_coef", 6.f); + } else if (op_type == "leaky_relu") { + opdesc.SetAttr("alpha", 0.02f); + } else if (op_type == "hard_sigmoid") { + opdesc.SetAttr("slope", 0.2f); + opdesc.SetAttr("offset", 0.5f); + } // create and convert op to NPU model, then run it on NPU - auto op = CreateOp(opdesc, &scope); + auto op = CreateOp(opdesc, &scope); LauchOp(op, {x_var_name}, {out_var_name}); - out_ref->CopyDataFrom(*out); // execute reference implementation and save to output tensor - relu_ref(op); + act_ref(op); // compare results auto* out_data = out->mutable_data(); auto* out_ref_data = out_ref->mutable_data(); for (int i = 0; i < out->dims().production(); i++) { - VLOG(5) << i; - EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-5); + EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-2); } } -TEST(NPUBridges, relu) { - for (auto bs : {1, 3}) { - for (auto ic : {3, 4}) { - for (auto ih : {2, 5}) { - for (auto iw : {5, 9}) { - VLOG(3) << "bs: " << bs << " ic: " << ic << " ih: " << ih - << " iw: " << iw; - test_relu(bs, ic, ih, iw); - } - } +TEST(NPUBridges, activation) { + std::vector> shapes{{1}, 
{2, 3}, {1, 2, 3, 4}}; + std::vector types{"sigmoid", + "relu", + "tanh", + "relu_clipped", + "leaky_relu", + "softsign", + "hard_sigmoid"}; + for (auto x_shape : shapes) { + for (auto op_type : types) { + test_act(x_shape, op_type); } } } @@ -98,5 +141,20 @@ TEST(NPUBridges, relu) { } // namespace lite } // namespace paddle +USE_LITE_OP(sigmoid); +USE_NPU_BRIDGE(sigmoid); USE_LITE_OP(relu); USE_NPU_BRIDGE(relu); +USE_LITE_OP(tanh); +USE_NPU_BRIDGE(tanh); +USE_LITE_OP(relu_clipped); +USE_NPU_BRIDGE(relu_clipped); + +USE_LITE_OP(leaky_relu); +USE_NPU_BRIDGE(leaky_relu); + +USE_LITE_OP(softsign); +USE_NPU_BRIDGE(softsign); + +USE_LITE_OP(hard_sigmoid); +USE_NPU_BRIDGE(hard_sigmoid); diff --git a/lite/kernels/npu/bridges/batch_norm_op.cc b/lite/kernels/npu/bridges/batch_norm_op.cc index 6f5f00959bd55faee2a76aa0bfbb9f12fa84c194..8c3153d242330360a2145ae87951dc8ea29168ca 100644 --- a/lite/kernels/npu/bridges/batch_norm_op.cc +++ b/lite/kernels/npu/bridges/batch_norm_op.cc @@ -30,8 +30,8 @@ node_map_type BatchNormConverter( auto unique_op_type = lite::npu::UniqueName(op_type); LOG(INFO) << "[NPU] Converting " + op_type + "..."; - std::shared_ptr batch_norm_node = - std::make_shared(unique_op_type); + std::shared_ptr batch_norm_node = + std::make_shared(unique_op_type); auto x_var_name = op_info->Input("X").front(); auto scale_var_name = op_info->Input("Scale").front(); @@ -66,7 +66,7 @@ node_map_type BatchNormConverter( batch_norm_node->set_input_x(*inputs_map.at(x_var_name)); batch_norm_node->set_input_scale(*npu_scale); - batch_norm_node->set_input_b(*npu_bias); + batch_norm_node->set_input_offset(*npu_bias); batch_norm_node->set_input_mean(*npu_mean); batch_norm_node->set_input_variance(*npu_variance); batch_norm_node->set_attr_momentum(npu_momentum); diff --git a/lite/kernels/npu/bridges/conv_op.cc b/lite/kernels/npu/bridges/conv_op.cc index 32f4d511d5d35a64a5e02a18a2b5ffa6d09d75cd..8dc9ab1f0f8a1e63c52b2406117fc34477e71490 100644 --- a/lite/kernels/npu/bridges/conv_op.cc +++ b/lite/kernels/npu/bridges/conv_op.cc @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include "lite/operators/conv_op.h" #include "lite/backends/npu/builder.h" #include "lite/kernels/npu/bridges/registry.h" @@ -42,9 +43,9 @@ node_map_type ConvConverter(const std::shared_ptr conv_op, auto bs = input_dims[0]; auto ic = input_dims[1]; auto oc = filter_dims[0]; - CHECK_EQ(input_dims.size(), 4); - CHECK_EQ(output_dims.size(), 4); - CHECK_EQ(filter_dims.size(), 4); + CHECK_EQ(input_dims.size(), 4L); + CHECK_EQ(output_dims.size(), 4L); + CHECK_EQ(filter_dims.size(), 4L); CHECK_EQ(output_dims[0], bs); CHECK_EQ(output_dims[1], oc); auto strides = op_info->GetAttr>("strides"); @@ -52,9 +53,28 @@ node_map_type ConvConverter(const std::shared_ptr conv_op, auto groups = op_info->GetAttr("groups"); auto dilations = op_info->GetAttr>("dilations"); auto fuse_relu = op_info->GetAttr("fuse_relu"); - CHECK_EQ(strides.size(), 2); - CHECK_EQ(paddings.size(), 2); - CHECK_EQ(dilations.size(), 2); + CHECK_EQ(strides.size(), 2L); + CHECK_EQ(dilations.size(), 2L); + + if (paddings.size() == 2L) { + for (size_t i = 0; i < strides.size(); ++i) { + int copy_pad = *(paddings.begin() + 2 * i); + paddings.insert(paddings.begin() + 2 * i + 1, copy_pad); + } + } + CHECK_EQ(paddings.size(), 4L) + << "Paddings size should be the same or twice as the input size."; + + std::string padding_algorithm(""); + if (op_info->HasAttr("padding_algorithm")) { + padding_algorithm = op_info->GetAttr("padding_algorithm"); + } + operators::UpdatePaddingAndDilation(&paddings, + &dilations, + strides, + padding_algorithm, + input_dims, + filter_dims); // check depthwise mode, and decide whether use ConvolutionDepthwise Op bool use_depthwise_conv = @@ -134,7 +154,7 @@ node_map_type ConvConverter(const std::shared_ptr conv_op, depthwise_conv_node->set_attr_pad_mode(5); // VALID depthwise_conv_node->set_attr_group(groups); depthwise_conv_node->set_attr_pad(ge::AttrValue::LIST_INT( - {paddings[0], paddings[0], paddings[1], paddings[1]})); + {paddings[0], paddings[1], paddings[2], paddings[3]})); depthwise_conv_node->set_attr_dilation( ge::AttrValue::LIST_INT({dilations[0], dilations[1]})); depthwise_conv_node->set_attr_stride( @@ -161,7 +181,7 @@ node_map_type ConvConverter(const std::shared_ptr conv_op, common_conv_node->set_attr_pad_mode(0); // NOTSET common_conv_node->set_attr_group(groups); common_conv_node->set_attr_pad(ge::AttrValue::LIST_INT( - {paddings[0], paddings[0], paddings[1], paddings[1]})); + {paddings[0], paddings[0], paddings[2], paddings[2]})); common_conv_node->set_attr_dilation( ge::AttrValue::LIST_INT({dilations[0], dilations[1]})); common_conv_node->set_attr_stride( diff --git a/lite/kernels/npu/bridges/conv_op_test.cc b/lite/kernels/npu/bridges/conv_op_test.cc index 26309aa9e27a1f0a5f6093b44242434d9e29a173..909061d2bae5f3330355c58f5dfe707a23c22075 100644 --- a/lite/kernels/npu/bridges/conv_op_test.cc +++ b/lite/kernels/npu/bridges/conv_op_test.cc @@ -54,7 +54,7 @@ void conv_ref(const std::shared_ptr op) { int stride_h = strides[0]; int dila_w = dilations[1]; int dila_h = dilations[0]; - int pad_w = paddings[1]; + int pad_w = paddings[2]; int pad_h = paddings[0]; int batch_size = input_dims[0]; int in_ch_size = input_dims[1]; @@ -175,7 +175,8 @@ void test_conv(int bs, opdesc.SetOutput("Output", {output_var_name}); opdesc.SetAttr("dilations", std::vector({dilation, dilation})); opdesc.SetAttr("strides", std::vector({stride, stride})); - opdesc.SetAttr("paddings", std::vector({padding, padding})); + opdesc.SetAttr("paddings", + std::vector({padding, padding, padding, padding})); opdesc.SetAttr("groups", 
groups); opdesc.SetAttr("fuse_relu", static_cast(fuse_relu)); if (has_bias) { diff --git a/lite/kernels/npu/bridges/conv_transpose_op.cc b/lite/kernels/npu/bridges/conv_transpose_op.cc index 5ae99ef04670214c27f29b7ad30a637d614bea62..6eff4cb2d28d64098186dfb50a457a8828b8eb61 100644 --- a/lite/kernels/npu/bridges/conv_transpose_op.cc +++ b/lite/kernels/npu/bridges/conv_transpose_op.cc @@ -44,9 +44,17 @@ node_map_type ConvTransposeConverter( auto groups = op_info->GetAttr("groups"); auto dilations = op_info->GetAttr>("dilations"); auto fuse_relu = op_info->GetAttr("fuse_relu"); - CHECK_EQ(strides.size(), 2); - CHECK_EQ(paddings.size(), 2); - CHECK_EQ(dilations.size(), 2); + CHECK_EQ(strides.size(), 2L); + CHECK_EQ(dilations.size(), 2L); + + if (paddings.size() == 2L) { + for (size_t i = 0; i < 2L; ++i) { + int copy_pad = *(paddings.begin() + 2 * i); + paddings.insert(paddings.begin() + 2 * i + 1, copy_pad); + } + } + CHECK_EQ(paddings.size(), 4L) + << "Paddings size should be the same or twice as the input size."; // create deconv node auto conv_transpose_node = @@ -82,12 +90,11 @@ node_map_type ConvTransposeConverter( lite::npu::OpList::Global().add(inputs_map.at(input_var_name)); // set attributes - conv_transpose_node->set_attr_mode(1); conv_transpose_node->set_attr_format(0); // NCHW conv_transpose_node->set_attr_pad_mode(0); // NOTSET conv_transpose_node->set_attr_group(groups); conv_transpose_node->set_attr_pad(ge::AttrValue::LIST_INT( - {paddings[0], paddings[0], paddings[1], paddings[1]})); + {paddings[0], paddings[1], paddings[2], paddings[3]})); conv_transpose_node->set_attr_dilation( ge::AttrValue::LIST_INT({dilations[0], dilations[1]})); conv_transpose_node->set_attr_stride( diff --git a/lite/kernels/npu/bridges/conv_transpose_op_test.cc b/lite/kernels/npu/bridges/conv_transpose_op_test.cc index a009ef588e1ddf9561f895e977fbb08a98b2d51b..f96e57c06fc0fe1023edd591990fe4bd7ffc3ba5 100644 --- a/lite/kernels/npu/bridges/conv_transpose_op_test.cc +++ b/lite/kernels/npu/bridges/conv_transpose_op_test.cc @@ -278,7 +278,8 @@ void test_conv_transpose(int bs, opdesc.SetOutput("Output", {output_var_name}); opdesc.SetAttr("dilations", std::vector({dilation, dilation})); opdesc.SetAttr("strides", std::vector({stride, stride})); - opdesc.SetAttr("paddings", std::vector({padding, padding})); + opdesc.SetAttr("paddings", + std::vector({padding, padding, padding, padding})); opdesc.SetAttr("groups", groups); opdesc.SetAttr("fuse_relu", static_cast(fuse_relu)); if (has_bias) { diff --git a/lite/kernels/npu/bridges/elementwise_ops.cc b/lite/kernels/npu/bridges/elementwise_ops.cc index 2ec757ab14bf13eee323fa35df5ff592622ca4cf..5eb5f4e271df71b1fa29084f0787c004f4753ffc 100644 --- a/lite/kernels/npu/bridges/elementwise_ops.cc +++ b/lite/kernels/npu/bridges/elementwise_ops.cc @@ -21,6 +21,30 @@ namespace kernels { namespace npu { namespace bridges { +std::vector CvtYShape(const Tensor& x, Tensor* y, int axis) { + auto x_dims = x.dims(); + CHECK_EQ(x_dims.size(), 4UL) << "[NPU] only support 4-dimension x"; + auto y_dims = y->dims(); + CHECK_GE(x_dims.size(), y_dims.size()); + + if (axis < 0) { + axis += x_dims.size(); + } + + std::vector y_new_shape(y_dims.Vectorize()); + if (y_new_shape.size() == 4UL) { + return y_new_shape; + } + for (int i = 0; i < axis; i++) { + y_new_shape.insert(y_new_shape.begin(), 1); + } + while (y_new_shape.size() < 4) { + y_new_shape.push_back(1); + } + CHECK_EQ(y_new_shape.size(), 4UL); + return y_new_shape; +} + node_map_type ElementwiseConverter( const std::shared_ptr 
elementwise_op, const node_map_type& inputs_map) { @@ -30,34 +54,53 @@ node_map_type ElementwiseConverter( auto unique_op_type = lite::npu::UniqueName(op_type); LOG(INFO) << "[NPU] Converting " + op_type + "..."; - std::shared_ptr elementwise_node = - std::make_shared(unique_op_type); - auto x_var_name = op_info->Input("X").front(); auto y_var_name = op_info->Input("Y").front(); - - CHECK_EQ(op_info->GetAttr("axis"), -1) - << "[NPU] elementwise only support inputs with same size"; - CHECK(inputs_map.find(x_var_name) != inputs_map.end()); - elementwise_node->set_input_x1(*inputs_map.at(x_var_name)); - lite::npu::OpList::Global().add(inputs_map.at(x_var_name)); + auto axis = op_info->GetAttr("axis"); + std::shared_ptr elementwise_node = nullptr; + std::shared_ptr x_node = inputs_map.at(x_var_name); + std::shared_ptr y_node = nullptr; if (inputs_map.find(y_var_name) != inputs_map.end()) { - elementwise_node->set_input_x2(*inputs_map.at(y_var_name)); - lite::npu::OpList::Global().add(inputs_map.at(y_var_name)); + y_node = inputs_map.at(y_var_name); } else { auto y_const_node = std::make_shared(y_var_name); - auto* y = scope->FindVar(y_var_name)->GetMutable(); - y_const_node->set_attr_value(lite::npu::CvtTensor(y)); - elementwise_node->set_input_x2(*y_const_node); - lite::npu::OpList::Global().add(y_const_node); + auto x = scope->FindTensor(x_var_name); + auto y = scope->FindMutableTensor(y_var_name); + auto y_new_shape = CvtYShape(*x, y, axis); + y_const_node->set_attr_value(lite::npu::CvtTensor(y, y_new_shape)); + y_node = y_const_node; } + lite::npu::OpList::Global().add(x_node); + lite::npu::OpList::Global().add(y_node); - lite::npu::OpList::Global().add(elementwise_node); + if (op_type == "elementwise_add" || + op_type == "fusion_elementwise_add_activation") { + auto elt_node = std::make_shared(unique_op_type); + elt_node->set_input_x1(*x_node); + elt_node->set_input_x2(*y_node); + elementwise_node = elt_node; + } else if (op_type == "elementwise_sub") { + auto elt_node = std::make_shared(unique_op_type); + elt_node->set_input_x1(*x_node); + elt_node->set_input_x2(*y_node); + elementwise_node = elt_node; + } else if (op_type == "elementwise_mul") { + auto elt_node = std::make_shared(unique_op_type); + elt_node->set_input_x(*x_node); + elt_node->set_input_y(*y_node); + elementwise_node = elt_node; + } else if (op_type == "elementwise_div") { + auto elt_node = std::make_shared(unique_op_type); + elt_node->set_input_x1(*x_node); + elt_node->set_input_x2(*y_node); + elementwise_node = elt_node; + } else { + LOG(FATAL) << "unsupported op type: " << op_type; + } - // paddlelite has sum only - elementwise_node->set_attr_mode(1); + lite::npu::OpList::Global().add(elementwise_node); node_map_type outputs_map; if (op_type == "fusion_elementwise_add_activation") { @@ -86,3 +129,9 @@ REGISTER_NPU_BRIDGE(elementwise_add, paddle::lite::kernels::npu::bridges::ElementwiseConverter); REGISTER_NPU_BRIDGE(fusion_elementwise_add_activation, paddle::lite::kernels::npu::bridges::ElementwiseConverter); +REGISTER_NPU_BRIDGE(elementwise_sub, + paddle::lite::kernels::npu::bridges::ElementwiseConverter); +REGISTER_NPU_BRIDGE(elementwise_mul, + paddle::lite::kernels::npu::bridges::ElementwiseConverter); +REGISTER_NPU_BRIDGE(elementwise_div, + paddle::lite::kernels::npu::bridges::ElementwiseConverter); diff --git a/lite/kernels/npu/bridges/elementwise_ops_test.cc b/lite/kernels/npu/bridges/elementwise_ops_test.cc index 0e2fc9f2622d839c8eda6f82aab2759053b3e23d..8dd4c851ca89413d3e740bb5bc5d0461938a7f69 100644 --- 
a/lite/kernels/npu/bridges/elementwise_ops_test.cc +++ b/lite/kernels/npu/bridges/elementwise_ops_test.cc @@ -29,37 +29,28 @@ template void elementwise_add_ref(const std::shared_ptr op) { Scope* scope = op->scope(); const OpInfo* op_info = op->op_info(); - auto x = scope->FindVar(op_info->Input("X").front())->GetMutable(); - auto y = scope->FindVar(op_info->Input("Y").front())->GetMutable(); - auto out = - scope->FindVar(op_info->Output("Out").front())->GetMutable(); + auto x = scope->FindTensor("x"); + auto y = scope->FindTensor("y"); + auto out = scope->FindMutableTensor("out_ref"); + out->Resize(x->dims()); auto x_data = x->data(); auto y_data = y->data(); - dtype* out_data = out->mutable_data(); + auto out_data = out->mutable_data(); auto x_dims = x->dims(); auto y_dims = y->dims(); int axis = op_info->GetAttr("axis"); if (axis < 0) { - axis = x_dims.size() - y_dims.size(); + axis += x_dims.size(); } int batch = 1; - int channels = 1; - int num = 1; - for (int i = 0; i < axis; ++i) { - batch *= x_dims[i]; - } - for (int i = 0; i < y_dims.size(); ++i) { - channels *= y_dims[i]; - } - for (int i = y_dims.size() + axis; i < x_dims.size(); ++i) { - num *= x_dims[i]; - } + int channels = y->numel(); + int num = x->numel() / channels / batch; // do elementwise add/sub/max... - std::string elt_type = "add"; - if (elt_type == "add") { + std::string op_type = op_info->Type(); + if (op_type == "elementwise_add") { for (int i = 0; i < batch; ++i) { for (int j = 0; j < channels; ++j) { int offset = (i * channels + j) * num; @@ -73,7 +64,7 @@ void elementwise_add_ref(const std::shared_ptr op) { } } } - } else if (elt_type == "sub") { + } else if (op_type == "elementwise_sub") { for (int i = 0; i < batch; ++i) { for (int j = 0; j < channels; ++j) { int offset = (i * channels + j) * num; @@ -87,7 +78,7 @@ void elementwise_add_ref(const std::shared_ptr op) { } } } - } else if (elt_type == "mul") { + } else if (op_type == "elementwise_mul") { for (int i = 0; i < batch; ++i) { for (int j = 0; j < channels; ++j) { int offset = (i * channels + j) * num; @@ -101,7 +92,21 @@ void elementwise_add_ref(const std::shared_ptr op) { } } } - } else if (elt_type == "max") { + } else if (op_type == "elementwise_div") { + for (int i = 0; i < batch; ++i) { + for (int j = 0; j < channels; ++j) { + int offset = (i * channels + j) * num; + const dtype* din_ptr = x_data + offset; + const dtype diny_data = y_data[j]; + dtype* dout_ptr = out_data + offset; + for (int k = 0; k < num; ++k) { + *dout_ptr = *din_ptr / diny_data; + dout_ptr++; + din_ptr++; + } + } + } + } else if (op_type == "elementwise_max") { for (int i = 0; i < batch; ++i) { for (int j = 0; j < channels; ++j) { int offset = (i * channels + j) * num; @@ -116,11 +121,14 @@ void elementwise_add_ref(const std::shared_ptr op) { } } } else { - LOG(FATAL) << "unsupported Elementwise type: " << elt_type; + LOG(FATAL) << "unsupported Elementwise type: " << op_type; } } -void test_elementwise_add(int bs, int ic, int ih, int iw, int axis) { +void test_elementwise_add(const std::vector& x_shape, + const std::vector& y_shape, + int axis, + std::string elt_type) { // prepare input&output variables Scope scope; std::string x_var_name = "x"; @@ -131,16 +139,16 @@ void test_elementwise_add(int bs, int ic, int ih, int iw, int axis) { auto* y = scope.Var(y_var_name)->GetMutable(); auto* out = scope.Var(out_var_name)->GetMutable(); auto* out_ref = scope.Var(out_ref_var_name)->GetMutable(); - x->Resize({bs, ic, ih, iw}); - y->Resize({bs, ic, ih, iw}); + x->Resize(x_shape); + 
y->Resize(y_shape); // initialize input&output data - FillTensor(x); - FillTensor(y); + FillTensor(x, 1, 3); + FillTensor(y, 1, 3); // initialize op desc cpp::OpDesc opdesc; - opdesc.SetType("elementwise_add"); + opdesc.SetType("elementwise_" + elt_type); opdesc.SetInput("X", {x_var_name}); opdesc.SetInput("Y", {y_var_name}); opdesc.SetOutput("Out", {out_var_name}); @@ -149,7 +157,6 @@ void test_elementwise_add(int bs, int ic, int ih, int iw, int axis) { // create and convert op to NPU model, then run it on NPU auto op = CreateOp(opdesc, &scope); LauchOp(op, {x_var_name}, {out_var_name}); - out_ref->CopyDataFrom(*out); // execute reference implementation and save to output tensor elementwise_add_ref(op); @@ -158,19 +165,15 @@ void test_elementwise_add(int bs, int ic, int ih, int iw, int axis) { auto* out_data = out->mutable_data(); auto* out_ref_data = out_ref->mutable_data(); for (int i = 0; i < out->dims().production(); i++) { - EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-1); + EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-2); } } TEST(NPUBridges, elementwise_add) { - for (auto bs : {1, 4, 7}) { - for (auto ic : {1, 4, 7}) { - for (auto ih : {1, 4, 7}) { - for (auto iw : {1, 4, 7}) { - for (auto axis : {-1}) test_elementwise_add(bs, ic, ih, iw, axis); - } - } - } + for (auto elt_type : {"add", "sub", "mul", "div"}) { + test_elementwise_add({1, 2, 3, 4}, {2}, 1, elt_type); + test_elementwise_add({1, 2, 3, 4}, {1, 2, 1, 1}, 1, elt_type); + test_elementwise_add({1, 2, 3, 4}, {1, 2, 3, 4}, 3, elt_type); } } @@ -182,3 +185,9 @@ TEST(NPUBridges, elementwise_add) { USE_LITE_OP(elementwise_add); USE_NPU_BRIDGE(elementwise_add); +USE_LITE_OP(elementwise_sub); +USE_NPU_BRIDGE(elementwise_sub); +USE_LITE_OP(elementwise_mul); +USE_NPU_BRIDGE(elementwise_mul); +USE_LITE_OP(elementwise_div); +USE_NPU_BRIDGE(elementwise_div); diff --git a/lite/kernels/npu/bridges/interpolate_op.cc b/lite/kernels/npu/bridges/interpolate_op.cc index 71f5eac57aa007b60ad574034b145b89b2e3095d..8e60a39fe4a32e8750cc161d3485314b42e1ab0c 100644 --- a/lite/kernels/npu/bridges/interpolate_op.cc +++ b/lite/kernels/npu/bridges/interpolate_op.cc @@ -45,6 +45,7 @@ node_map_type InterpolateConverter( auto out_h = op_info->GetAttr("out_h"); auto align_corners = op_info->GetAttr("align_corners"); int align_mode = op_info->GetAttr("align_mode"); + auto interp_method = op_info->GetAttr("interp_method"); CHECK(!(align_mode == 0 && !align_corners)) << "[NPU] align_mode = 0 && " "align_corners = false isn't " "supported in HiAI DDK"; @@ -58,11 +59,11 @@ node_map_type InterpolateConverter( } // update out_h and out_w if has OutSize - bool inputs_map_has_w = false; + std::shared_ptr out_size_node = nullptr; if (lite::npu::HasInputArg(op_info, scope, "OutSize")) { auto out_size_var_name = op_info->Input("OutSize").front(); if (inputs_map.count(out_size_var_name)) { - inputs_map_has_w = true; + out_size_node = inputs_map.at(out_size_var_name); } else { auto out_size = scope->FindVar(out_size_var_name)->GetMutable(); @@ -73,58 +74,45 @@ node_map_type InterpolateConverter( out_w = out_size_data[1]; } } - - node_map_type outputs_map; - auto interp_method = op_info->GetAttr("interp_method"); - if (interp_method == "bilinear") { - auto interp_node = std::make_shared(unique_op_type); - lite::npu::OpList::Global().add(interp_node); - interp_node->set_input_x(*inputs_map.at(x_var_name)); - if (inputs_map_has_w) { - auto out_size_var_name = op_info->Input("OutSize").front(); - interp_node->set_input_w(*inputs_map.at(out_size_var_name)); - 
lite::npu::OpList::Global().add(inputs_map.at(out_size_var_name)); - } else { + if (out_size_node == nullptr) { + if (interp_method == "bilinear") { const float largest_multiple = 7.0f; float multiple = static_cast(x_h * x_w) / (out_h * out_w); CHECK_LT(multiple, largest_multiple) << "[NPU] multiple=(ih*iw)/(oh*ow)=" << multiple << " is too large, should not exceed " << largest_multiple << " in HiAI DDK"; - auto w_const_node = - std::make_shared(unique_op_type + "/w"); - w_const_node->set_attr_value( - lite::npu::CreateTensorAndFillData(std::vector({out_h, out_w}))); - interp_node->set_input_w(*w_const_node); - lite::npu::OpList::Global().add(w_const_node); } - interp_node->set_attr_output_dim_mode( - 2); // 0: zoom_factor, 1: shrink_factor, 2: height/width - interp_node->set_attr_align_corners(align_corners); - outputs_map[op_info->Output("Out").front()] = interp_node; + auto out_size_const_node = + std::make_shared(unique_op_type + "/out_size"); + out_size_const_node->set_attr_value( + lite::npu::CreateTensorAndFillData(std::vector({out_h, out_w}))); + out_size_node = out_size_const_node; + } + lite::npu::OpList::Global().add(out_size_node); + + std::shared_ptr interp_node = nullptr; + if (interp_method == "bilinear") { + auto bilinear_interp_node = + std::make_shared(unique_op_type); + bilinear_interp_node->set_input_x(*inputs_map.at(x_var_name)); + bilinear_interp_node->set_input_size(*out_size_node); + bilinear_interp_node->set_attr_align_corners(align_corners); + interp_node = bilinear_interp_node; } else if (interp_method == "nearest") { - auto interp_node = + auto nearest_interp_node = std::make_shared(unique_op_type); - lite::npu::OpList::Global().add(interp_node); - interp_node->set_input_image(*inputs_map.at(x_var_name)); - if (inputs_map_has_w) { - auto out_size_var_name = op_info->Input("OutSize").front(); - interp_node->set_input_size(*inputs_map.at(out_size_var_name)); - lite::npu::OpList::Global().add(inputs_map.at(out_size_var_name)); - } else { - auto w_const_node = - std::make_shared(unique_op_type + "/w"); - w_const_node->set_attr_value( - lite::npu::CreateTensorAndFillData(std::vector({out_h, out_w}))); - interp_node->set_input_size(*w_const_node); - lite::npu::OpList::Global().add(w_const_node); - } - interp_node->set_attr_align_corners(align_corners); - outputs_map[op_info->Output("Out").front()] = interp_node; + nearest_interp_node->set_input_image(*inputs_map.at(x_var_name)); + nearest_interp_node->set_input_size(*out_size_node); + nearest_interp_node->set_attr_align_corners(align_corners); + interp_node = nearest_interp_node; } else { LOG(FATAL) << "[NPU] Unsupported interpolate method: " << interp_method; } + lite::npu::OpList::Global().add(interp_node); + node_map_type outputs_map; + outputs_map[op_info->Output("Out").front()] = interp_node; return outputs_map; } diff --git a/lite/kernels/npu/bridges/mul_op.cc b/lite/kernels/npu/bridges/mul_op.cc index 5f8bdc4ee955a15ca4795e9f2554182696f656f2..2313351f6c49ea08451b06dc347c91aeeed4d755 100644 --- a/lite/kernels/npu/bridges/mul_op.cc +++ b/lite/kernels/npu/bridges/mul_op.cc @@ -31,82 +31,67 @@ node_map_type MulConverter(const std::shared_ptr mul_op, auto unique_op_type = lite::npu::UniqueName(op_type); LOG(INFO) << "[NPU] Converting " + op_type + "..."; - auto output_node = std::make_shared(unique_op_type); - auto x_var_name = op_info->Input("X").front(); auto y_var_name = op_info->Input("Y").front(); + auto x = scope->FindVar(x_var_name)->GetMutable(); + auto y = scope->FindVar(y_var_name)->GetMutable(); + auto 
x_dims = x->dims(); + auto y_dims = y->dims(); int x_num_col_dims = op_info->GetAttr("x_num_col_dims"); int y_num_col_dims = op_info->GetAttr("y_num_col_dims"); - auto* xtensor = scope->FindVar(x_var_name)->GetMutable(); - auto* ytensor = scope->FindVar(y_var_name)->GetMutable(); - - int m = xtensor->dims().Slice(0, x_num_col_dims).production(); - int x_w = xtensor->dims() - .Slice(x_num_col_dims, xtensor->dims().size()) - .production(); - int y_h = ytensor->dims().Slice(0, y_num_col_dims).production(); - int n = ytensor->dims() - .Slice(y_num_col_dims, ytensor->dims().size()) - .production(); - CHECK_EQ(x_w, y_h) << "[NPU] x_w must be equal with y_h"; - int k = x_w; + int m = x_dims.Slice(0, x_num_col_dims).production(); + int k = x_dims.Slice(x_num_col_dims, x_dims.size()).production(); + CHECK_EQ(k, y_dims.Slice(0, y_num_col_dims).production()) + << "[NPU] columns of X must be equal with rows of Y"; + int n = y_dims.Slice(y_num_col_dims, y_dims.size()).production(); LOG(INFO) << "m:" << m << ",n:" << n << ",k:" << k; LOG(INFO) << "x_var_name:" << x_var_name << ", is data: " << inputs_map.count(x_var_name); LOG(INFO) << "y_var_name:" << y_var_name << ", is data: " << inputs_map.count(y_var_name); CHECK(inputs_map.count(x_var_name)) - << "[NPU] MatMul only support X is data, Y is const yet"; + << "[NPU] MatMul in HiAI DDK only support X is data, Y is const yet."; + + auto mul_node = std::make_shared(unique_op_type); + // add input x node which supports persistable and non-persistable tensor, and + // reshape to (m, k) if (inputs_map.count(x_var_name)) { - auto xsrc = inputs_map.at(x_var_name); - auto reshapex = std::make_shared(x_var_name + "_reshape"); - reshapex->set_input_tensor(*xsrc); - reshapex->set_attr_shape({m, k}); - reshapex->set_attr_axis(0); - lite::npu::OpList::Global().add(xsrc); - lite::npu::OpList::Global().add(reshapex); - output_node->set_input_x(*reshapex); + auto reshaped_x_node = + std::make_shared(x_var_name + "_reshape"); + reshaped_x_node->set_input_tensor(*inputs_map.at(x_var_name)); + reshaped_x_node->set_attr_shape({m, k}); + reshaped_x_node->set_attr_axis(0); + mul_node->set_input_x1(*reshaped_x_node); + lite::npu::OpList::Global().add(inputs_map.at(x_var_name)); + lite::npu::OpList::Global().add(reshaped_x_node); } else { - auto constx = std::make_shared(x_var_name); - ge::TensorDesc desc(ge::Shape({m, k}), ge::FORMAT_NCHW, ge::DT_FLOAT); - auto size = desc.GetShape().GetShapeSize(); - CHECK_EQ(size, xtensor->dims().production()); - ge::TensorPtr ptensor = std::make_shared(); - ptensor->SetTensorDesc(desc); - auto* pdata = reinterpret_cast(xtensor->mutable_data()); - ptensor->SetData(pdata, size * sizeof(float)); - constx->set_attr_value(ptensor); - lite::npu::OpList::Global().add(constx); - output_node->set_input_x(*constx); + auto x_const_node = std::make_shared(x_var_name); + x_const_node->set_attr_value(lite::npu::CvtTensor(x, {m, k})); + mul_node->set_input_x1(*x_const_node); + lite::npu::OpList::Global().add(x_const_node); } - + // add input y node which only supports persistable tensor, and reshape to (k, + // n) if (inputs_map.count(y_var_name)) { - auto ysrc = inputs_map.at(y_var_name); - auto reshapey = std::make_shared(y_var_name + "_reshape"); - reshapey->set_input_tensor(*ysrc); - reshapey->set_attr_shape({k, n}); - reshapey->set_attr_axis(0); - lite::npu::OpList::Global().add(ysrc); - lite::npu::OpList::Global().add(reshapey); - output_node->set_input_w(*reshapey); + auto reshaped_y_node = + std::make_shared(y_var_name + "_reshape"); + 
reshaped_y_node->set_input_tensor(*inputs_map.at(y_var_name)); + reshaped_y_node->set_attr_shape({k, n}); + reshaped_y_node->set_attr_axis(0); + mul_node->set_input_x2(*reshaped_y_node); + lite::npu::OpList::Global().add(inputs_map.at(y_var_name)); + lite::npu::OpList::Global().add(reshaped_y_node); } else { - auto consty = std::make_shared(y_var_name); - ge::TensorDesc desc(ge::Shape({k, n}), ge::FORMAT_NCHW, ge::DT_FLOAT); - auto size = desc.GetShape().GetShapeSize(); - CHECK_EQ(size, ytensor->dims().production()); - ge::TensorPtr ptensor = std::make_shared(); - ptensor->SetTensorDesc(desc); - auto* pdata = reinterpret_cast(ytensor->mutable_data()); - ptensor->SetData(pdata, size * sizeof(float)); - consty->set_attr_value(ptensor); - lite::npu::OpList::Global().add(consty); - output_node->set_input_w(*consty); + auto y_const_node = std::make_shared(y_var_name); + y_const_node->set_attr_value(lite::npu::CvtTensor(y, {k, n})); + mul_node->set_input_x2(*y_const_node); + lite::npu::OpList::Global().add(y_const_node); } - lite::npu::OpList::Global().add(output_node); + lite::npu::OpList::Global().add(mul_node); node_map_type outputs_map; - outputs_map[op_info->Output("Out").front()] = output_node; + outputs_map[op_info->Output("Out").front()] = mul_node; return outputs_map; } diff --git a/lite/kernels/npu/bridges/paddle_use_npu_bridges.h b/lite/kernels/npu/bridges/paddle_use_npu_bridges.h index 8b4252de06e8934affe7592fc8ea521ad7d20025..9a432d17e543bece48fb1c1369ee90ff56e8dcbf 100644 --- a/lite/kernels/npu/bridges/paddle_use_npu_bridges.h +++ b/lite/kernels/npu/bridges/paddle_use_npu_bridges.h @@ -16,23 +16,40 @@ #include "lite/kernels/npu/bridges/registry.h" -USE_NPU_BRIDGE(mul); -USE_NPU_BRIDGE(fc); +USE_NPU_BRIDGE(sigmoid); +USE_NPU_BRIDGE(relu); +USE_NPU_BRIDGE(tanh); +USE_NPU_BRIDGE(relu_clipped); +USE_NPU_BRIDGE(leaky_relu); +USE_NPU_BRIDGE(softsign); +USE_NPU_BRIDGE(hard_sigmoid); + +USE_NPU_BRIDGE(batch_norm); +USE_NPU_BRIDGE(concat); USE_NPU_BRIDGE(conv2d); USE_NPU_BRIDGE(depthwise_conv2d); -USE_NPU_BRIDGE(pool2d); -USE_NPU_BRIDGE(relu); +USE_NPU_BRIDGE(conv2d_transpose); + USE_NPU_BRIDGE(elementwise_add); USE_NPU_BRIDGE(fusion_elementwise_add_activation); +USE_NPU_BRIDGE(elementwise_sub); +USE_NPU_BRIDGE(elementwise_mul); +USE_NPU_BRIDGE(elementwise_div); + +USE_NPU_BRIDGE(fc); +USE_NPU_BRIDGE(bilinear_interp); +USE_NPU_BRIDGE(nearest_interp); +USE_NPU_BRIDGE(mul); +USE_NPU_BRIDGE(pad2d); +USE_NPU_BRIDGE(pool2d); +USE_NPU_BRIDGE(reduce_mean); +USE_NPU_BRIDGE(reshape); +USE_NPU_BRIDGE(reshape2); USE_NPU_BRIDGE(scale); +USE_NPU_BRIDGE(shuffle_channel); USE_NPU_BRIDGE(softmax); -USE_NPU_BRIDGE(concat); USE_NPU_BRIDGE(split); +USE_NPU_BRIDGE(sqrt); +USE_NPU_BRIDGE(square); USE_NPU_BRIDGE(transpose); USE_NPU_BRIDGE(transpose2); -USE_NPU_BRIDGE(shuffle_channel); -USE_NPU_BRIDGE(batch_norm); -USE_NPU_BRIDGE(bilinear_interp); -USE_NPU_BRIDGE(conv2d_transpose); -USE_NPU_BRIDGE(reshape); -USE_NPU_BRIDGE(reshape2); diff --git a/lite/kernels/npu/bridges/pool_op.cc b/lite/kernels/npu/bridges/pool_op.cc index 5915b7a8aadfec38c1388177d726d6a33d612349..7bbe94d5db6b0345bb4a3fefe8a75f2a696902e9 100644 --- a/lite/kernels/npu/bridges/pool_op.cc +++ b/lite/kernels/npu/bridges/pool_op.cc @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include "lite/operators/pool_op.h" #include "lite/backends/npu/builder.h" #include "lite/kernels/npu/bridges/registry.h" @@ -32,44 +33,78 @@ node_map_type PoolConverter(const std::shared_ptr pool_op, std::shared_ptr pool_node = std::make_shared(unique_op_type); auto x_var_name = op_info->Input("X").front(); + auto x = scope->FindTensor(x_var_name); + pool_node->set_input_x(*inputs_map.at(x_var_name)); + lite::npu::OpList::Global().add(inputs_map.at(x_var_name)); + lite::npu::OpList::Global().add(pool_node); + + int mode = 0; auto pooling_type = op_info->GetAttr("pooling_type"); - int npu_mode = 0; if (pooling_type == "max") { - npu_mode = 0; + mode = 0; } else if (pooling_type == "avg") { - npu_mode = 1; + mode = 1; CHECK(op_info->GetAttr("exclusive")) << "[NPU] exclusive must be true in HiAI DDK"; } else { LOG(FATAL) << "[NPU] Unsupported pooling type: " << pooling_type; } - bool npu_global_pooling = op_info->GetAttr("global_pooling"); + pool_node->set_attr_mode(mode); + + int pad_mode = 0; + std::string padding_algorithm(""); + if (op_info->HasAttr("padding_algorithm")) { + padding_algorithm = op_info->GetAttr("padding_algorithm"); + } + if (padding_algorithm == "SAME") { + pad_mode = 6; + } else if (padding_algorithm == "VALID") { + pad_mode = 5; + } + pool_node->set_attr_pad_mode(pad_mode); + + bool global_pooling = op_info->GetAttr("global_pooling"); + pool_node->set_attr_global_pooling(global_pooling); + auto ksize = op_info->GetAttr>("ksize"); - auto npu_window = ge::AttrValue::LIST_INT(ksize.begin(), ksize.end()); + auto window = ge::AttrValue::LIST_INT(ksize.begin(), ksize.end()); + pool_node->set_attr_window(window); - auto padding = op_info->GetAttr>("paddings"); - auto npu_pad = - ge::AttrValue::LIST_INT{padding[0], padding[0], padding[1], padding[1]}; + auto paddings = op_info->GetAttr>("paddings"); + if (paddings.size() == 2L) { + for (size_t i = 0; i < 2L; ++i) { + int copy_pad = *(paddings.begin() + 2 * i); + paddings.insert(paddings.begin() + 2 * i + 1, copy_pad); + } + } + CHECK_EQ(paddings.size(), 4L) + << "Paddings size should be the same or twice as the inputs size."; + bool adaptive = false; + if (op_info->HasAttr("adaptive")) { + adaptive = op_info->GetAttr("adaptive"); + } auto strides = op_info->GetAttr>("strides"); + operators::UpdatePadding(&paddings, + global_pooling, + adaptive, + padding_algorithm, + x->dims(), + strides, + ksize); + auto npu_pad = ge::AttrValue::LIST_INT{ + paddings[0], paddings[1], paddings[2], paddings[3]}; + pool_node->set_attr_pad(npu_pad); + auto npu_stride = ge::AttrValue::LIST_INT(strides.begin(), strides.end()); - int npu_ceil_mode = 0; + pool_node->set_attr_stride(npu_stride); + + int ceil_mode = 0; if (op_info->HasAttr("ceil_mode")) { - npu_ceil_mode = op_info->GetAttr("ceil_mode") ? 1 : 0; + ceil_mode = op_info->GetAttr("ceil_mode") ? 
1 : 0; } - - pool_node->set_input_x(*inputs_map.at(x_var_name)); - pool_node->set_attr_mode(npu_mode); - pool_node->set_attr_pad_mode(0); - pool_node->set_attr_global_pooling(npu_global_pooling); - pool_node->set_attr_window(npu_window); - pool_node->set_attr_pad(npu_pad); - pool_node->set_attr_stride(npu_stride); - pool_node->set_attr_ceil_mode(npu_ceil_mode); + pool_node->set_attr_ceil_mode(ceil_mode); // output_node->set_attr_data_mode(npu_data_mode); - lite::npu::OpList::Global().add(inputs_map.at(x_var_name)); - lite::npu::OpList::Global().add(pool_node); - node_map_type outputs_map; outputs_map[op_info->Output("Out").front()] = pool_node; return outputs_map; diff --git a/lite/kernels/npu/bridges/pool_op_test.cc b/lite/kernels/npu/bridges/pool_op_test.cc index d4543a6ae128a0c534b216e42c6f3488a1dbfbf9..298e06554776e0f9efeade540d6498d1f71f8a16 100644 --- a/lite/kernels/npu/bridges/pool_op_test.cc +++ b/lite/kernels/npu/bridges/pool_op_test.cc @@ -61,7 +61,7 @@ void pool_ref(const std::shared_ptr op) { int stride_h = strides[0]; int stride_w = strides[1]; int pad_h = paddings[0]; - int pad_w = paddings[1]; + int pad_w = paddings[2]; if (global_pooling == true) { for (int n = 0; n < in_n; ++n) { @@ -163,7 +163,8 @@ void test_pool(int bs, opdesc.SetAttr("global_pooling", global_pooling); opdesc.SetAttr("exclusive", exclusive); opdesc.SetAttr("strides", std::vector({stride, stride})); - opdesc.SetAttr("paddings", std::vector({padding, padding})); + opdesc.SetAttr("paddings", + std::vector({padding, padding, padding, padding})); // create and convert op to NPU model, then run it on NPU auto op = CreateOp(opdesc, &scope); diff --git a/lite/kernels/npu/bridges/reduce_mean_op.cc b/lite/kernels/npu/bridges/reduce_mean_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..4725bdfb0e17c4f99dfd2359ff34c96f9e5af6e5 --- /dev/null +++ b/lite/kernels/npu/bridges/reduce_mean_op.cc @@ -0,0 +1,111 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/backends/npu/builder.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace npu { +namespace bridges { + +node_map_type ReduceMeanConverter( + const std::shared_ptr reduce_mean_op, + const node_map_type& inputs_map) { + auto scope = reduce_mean_op->scope(); + auto op_info = reduce_mean_op->op_info(); + auto op_type = op_info->Type(); + auto unique_op_type = lite::npu::UniqueName(op_type); + LOG(INFO) << "[NPU] Converting " + op_type + "..."; + + // get input, and op attributes + auto x_var_name = op_info->Input("X").front(); + auto x_dims = scope->FindTensor(x_var_name)->dims(); + auto keep_dim = op_info->GetAttr("keep_dim"); + auto dim = op_info->GetAttr>("dim"); + CHECK(!dim.empty()) << "\"dim\" of reduce_mean should not be empty."; + for (size_t i = 0; i < dim.size(); i++) { + if (dim[i] < 0) { + dim[i] += x_dims.size(); + } + } + std::sort(dim.begin(), dim.end()); + + // create reduce_mean(reduce_sum + scale) node and set input node from + // inputs_map + // creat reduce_sum node + auto unique_reduce_sum = lite::npu::UniqueName("reduce_sum"); + auto reduce_sum_node = std::make_shared(unique_reduce_sum); + CHECK(inputs_map.count(x_var_name)); + reduce_sum_node->set_input_x(*inputs_map.at(x_var_name)); + lite::npu::OpList::Global().add(inputs_map.at(x_var_name)); + lite::npu::OpList::Global().add(reduce_sum_node); + + auto dim_const_node = + std::make_shared(unique_reduce_sum + "/dim"); + dim_const_node->set_attr_value(lite::npu::CreateTensorAndFillData(dim)); + reduce_sum_node->set_input_w(*dim_const_node); + lite::npu::OpList::Global().add(dim_const_node); + + reduce_sum_node->set_attr_keep_dims(keep_dim); + + // create scale node + auto unique_scale = lite::npu::UniqueName("scale"); + auto scale_node = std::make_shared(unique_scale); + scale_node->set_input_x(*reduce_sum_node); + lite::npu::OpList::Global().add(scale_node); + + float scale = 1; + for (size_t i = 0; i < dim.size(); i++) { + scale /= x_dims[dim[i]]; + } + + std::vector scale_bias_shape = x_dims.Vectorize(); + if (keep_dim) { + for (size_t i = 0; i < dim.size(); i++) { + scale_bias_shape[dim[i]] = 1; + } + } else { + const int64_t kDelFlag = -2; + for (size_t i = 0; i < dim.size(); ++i) { + scale_bias_shape[dim[i]] = kDelFlag; + } + scale_bias_shape.erase( + remove(scale_bias_shape.begin(), scale_bias_shape.end(), kDelFlag), + scale_bias_shape.end()); + } + + auto filter_const_node = + std::make_shared(unique_scale + "/filter"); + filter_const_node->set_attr_value( + lite::npu::CreateTensorAndFillData(scale, scale_bias_shape)); + scale_node->set_input_filter(*filter_const_node); + lite::npu::OpList::Global().add(filter_const_node); + + scale_node->set_attr_axis(1); + + node_map_type outputs_map; + outputs_map[op_info->Output("Out").front()] = scale_node; + return outputs_map; +} + +} // namespace bridges +} // namespace npu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_NPU_BRIDGE(reduce_mean, + paddle::lite::kernels::npu::bridges::ReduceMeanConverter); diff --git a/lite/kernels/npu/bridges/reduce_mean_op_test.cc b/lite/kernels/npu/bridges/reduce_mean_op_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..8646ce5c25b367cf3c9055f1ed13a225149a9cc7 --- /dev/null +++ b/lite/kernels/npu/bridges/reduce_mean_op_test.cc @@ -0,0 +1,347 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/operators/reduce_mean_op.h" +#include +#include +#include +#include "lite/core/op_registry.h" +#include "lite/kernels/npu/bridges/registry.h" +#include "lite/kernels/npu/bridges/test_helper.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace npu { +namespace bridges { + +void reduce_mean_n(const float* src, + float* dst, + int num_in, + int channel_in, + int height_in, + int width_in) { + int hw_size = height_in * width_in; + int chw_size = channel_in * hw_size; + int data_index, src_index; + for (int c = 0; c < channel_in; ++c) { + for (int h = 0; h < height_in; ++h) { + for (int w = 0; w < width_in; ++w) { + data_index = c * hw_size + h * width_in + w; + dst[data_index] = 0.0; + for (int n = 0; n < num_in; ++n) { + src_index = n * chw_size + data_index; + dst[data_index] += static_cast(src[src_index]) / num_in; + } + } + } + } +} + +void reduce_mean_c(const float* src, + float* dst, + int num_in, + int channel_in, + int height_in, + int width_in) { + int hw_size = height_in * width_in; + int chw_size = hw_size * channel_in; + int data_index, src_index0, src_index; + for (int n = 0; n < num_in; ++n) { + for (int h = 0; h < height_in; ++h) { + for (int w = 0; w < width_in; ++w) { + data_index = n * hw_size + h * width_in + w; + src_index0 = n * chw_size + h * width_in + w; + dst[data_index] = 0.0; + for (int c = 0; c < channel_in; ++c) { + src_index = src_index0 + c * hw_size; + dst[data_index] += static_cast(src[src_index]) / channel_in; + } + } + } + } +} + +void reduce_mean_h(const float* src, + float* dst, + int num_in, + int channel_in, + int height_in, + int width_in) { + int cw_size = channel_in * width_in; + int chw_size = cw_size * height_in; + int hw_size = height_in * width_in; + int data_index, src_index, src_index0; + for (int n = 0; n < num_in; ++n) { + for (int c = 0; c < channel_in; ++c) { + for (int w = 0; w < width_in; ++w) { + data_index = n * cw_size + c * width_in + w; + src_index0 = n * chw_size + c * hw_size + w; + dst[data_index] = 0.0; + for (int h = 0; h < height_in; ++h) { + src_index = src_index0 + h * width_in; + dst[data_index] += static_cast(src[src_index]) / height_in; + } + } + } + } +} + +void reduce_mean_w(const float* src, + float* dst, + int num_in, + int channel_in, + int height_in, + int width_in) { + int ch_size = channel_in * height_in; + int hw_size = height_in * width_in; + int chw_size = ch_size * width_in; + int data_index = 0; + int src_index0 = 0; + int src_index = 0; + for (int n = 0; n < num_in; ++n) { + for (int c = 0; c < channel_in; ++c) { + for (int h = 0; h < height_in; ++h) { + data_index = n * ch_size + c * height_in + h; + src_index0 = n * chw_size + c * hw_size + h * width_in; + dst[data_index] = 0.0; + for (int w = 0; w < width_in; ++w) { + src_index = src_index0 + w; + dst[data_index] += static_cast(src[src_index]) / width_in; + } + } + } + } +} + +void reduce_mean_all(const float* src, + float* dst, + int num_in, + int channel_in, + 
int height_in, + int width_in) { + float mean = 0.0; + int src_index; + int n_id, c_id; + int all = num_in * channel_in * height_in * width_in; + for (int n = 0; n < num_in; ++n) { + n_id = n * channel_in * height_in * width_in; + for (int c = 0; c < channel_in; ++c) { + c_id = c * height_in * width_in; + for (int h = 0; h < height_in; ++h) { + for (int w = 0; w < width_in; ++w) { + src_index = n_id + c_id + h * width_in + w; + mean = src[src_index] / all; + } + } + } + } + dst[0] = mean; +} + +void reduce_mean_nc(const float* src, + float* dst, + int num_in, + int channel_in, + int height_in, + int width_in) { + // reduce n first. + DDimLite ddimA({1, channel_in, height_in, width_in}); + lite::Tensor tensor_tmp; + tensor_tmp.Resize(ddimA); + float* tmp_out = tensor_tmp.mutable_data(); + reduce_mean_n(src, tmp_out, num_in, channel_in, height_in, width_in); + reduce_mean_c(tmp_out, dst, 1, channel_in, height_in, width_in); +} + +void reduce_mean_ch(const float* src, + float* dst, + int num_in, + int channel_in, + int height_in, + int width_in) { + // reduce c first + DDimLite ddimA({num_in, 1, height_in, width_in}); + lite::Tensor tensor_tmp; + tensor_tmp.Resize(ddimA); + float* tmp_out = tensor_tmp.mutable_data(); + reduce_mean_c(src, tmp_out, num_in, channel_in, height_in, width_in); + reduce_mean_h(tmp_out, dst, num_in, 1, height_in, width_in); +} + +void reduce_mean_hw(const float* src, + float* dst, + int num_in, + int channel_in, + int height_in, + int width_in) { + // reduce h first + DDimLite ddimA({num_in, channel_in, 1, width_in}); + lite::Tensor tensor_tmp; + tensor_tmp.Resize(ddimA); + float* tmp_out = tensor_tmp.mutable_data(); + reduce_mean_h(src, tmp_out, num_in, channel_in, height_in, width_in); + reduce_mean_w(tmp_out, dst, num_in, channel_in, 1, width_in); +} + +void reduce_mean_ref(const std::shared_ptr op) { + Scope* scope = op->scope(); + const OpInfo* op_info = op->op_info(); + + auto x = scope->FindTensor("x"); + auto x_dims = x->dims(); + auto x_data = x->data(); + auto out = scope->FindMutableTensor("out_ref"); + + auto dim = op_info->GetAttr>("dim"); + auto keep_dim = op_info->GetAttr("keep_dim"); + + auto x_rank = x_dims.size(); + if (!dim.empty()) { + for (size_t i = 0; i < dim.size(); i++) { + if (dim[i] < 0) { + dim[i] += x_rank; + } + } + } + + bool reduce_all = false; + sort(dim.begin(), dim.end()); + if (dim.size() == 0) { + reduce_all = true; + } + + std::vector out_dims; + if (reduce_all) { + if (keep_dim) { + for (size_t i = 0; i < x_dims.size(); i++) { + out_dims.push_back(1); + } + } else { + out_dims.push_back(1); + } + } else { + for (int i = 0; i < x_dims.size(); i++) { + out_dims.push_back(x_dims[i]); + } + if (keep_dim) { + for (size_t i = 0; i < dim.size(); ++i) { + out_dims[dim[i]] = 1L; + } + } else { + int64_t kDelFlag = -2; + for (size_t i = 0; i < dim.size(); ++i) { + out_dims[dim[i]] = kDelFlag; + } + out_dims.erase(remove(out_dims.begin(), out_dims.end(), kDelFlag), + out_dims.end()); + } + out->Resize(DDim(out_dims)); + } + + auto out_data = out->mutable_data(); + int in_n = x_dims[0]; + int in_c = x_dims[1]; + int in_h = x_dims[2]; + int in_w = x_dims[3]; + + if (dim.size() == 0) { + reduce_mean_all(x_data, out_data, in_n, in_c, in_h, in_w); + } else if (dim.size() == 1) { + switch (dim[0]) { + case 0: + reduce_mean_n(x_data, out_data, in_n, in_c, in_h, in_w); + break; + case 1: + reduce_mean_c(x_data, out_data, in_n, in_c, in_h, in_w); + break; + case 2: + reduce_mean_h(x_data, out_data, in_n, in_c, in_h, in_w); + break; + case 3: + 
reduce_mean_w(x_data, out_data, in_n, in_c, in_h, in_w); + break; + default: + LOG(FATAL) << "error!!!"; + } + } else if (dim.size() == 2) { + if (dim[0] == 0 && dim[1] == 1) { + reduce_mean_nc(x_data, out_data, in_n, in_c, in_h, in_w); + } else if (dim[0] == 1 && dim[1] == 2) { + reduce_mean_ch(x_data, out_data, in_n, in_c, in_h, in_w); + } else if (dim[0] == 2 && dim[1] == 3) { + reduce_mean_hw(x_data, out_data, in_n, in_c, in_h, in_w); + } else { + LOG(FATAL) << "invalid dim!!"; + } + } +} + +void test_reduce_mean(const std::vector& input_shape, + std::vector dim, + bool keep_dim) { + // prepare input&output variables + Scope scope; + std::string x_var_name("x"); + std::string out_var_name("out"); + std::string out_ref_var_name("out_ref"); + auto* x = scope.Var(x_var_name)->GetMutable(); + auto* out = scope.Var(out_var_name)->GetMutable(); + auto* out_ref = scope.Var(out_ref_var_name)->GetMutable(); + x->Resize(input_shape); + + // initialize input&output data + FillTensor(x); + + // initialize op desc + cpp::OpDesc opdesc; + opdesc.SetType("reduce_mean"); + opdesc.SetInput("X", {x_var_name}); + opdesc.SetOutput("Out", {out_var_name}); + opdesc.SetAttr("dim", dim); + opdesc.SetAttr("keep_dim", keep_dim); + + // create and convert op to NPU model, then run it on NPU + auto op = CreateOp(opdesc, &scope); + LauchOp(op, {x_var_name}, {out_var_name}); + + // execute reference implementation and save to output tensor + reduce_mean_ref(op); + + // compare results + auto* out_data = out->mutable_data(); + auto* out_ref_data = out_ref->mutable_data(); + for (int i = 0; i < out->dims().production(); i++) { + EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-2); + } +} + +TEST(NPUBridges, reduce_mean) { + std::vector> reduce_dim{ + {0}, {1}, {2}, {3}, {0, 1}, {1, 2}, {2, 3}, {-2, -1}}; + for (auto dim : reduce_dim) { + for (auto keep_dim : {true, false}) { + test_reduce_mean({1, 2, 3, 4}, dim, keep_dim); + } + } +} + +} // namespace bridges +} // namespace npu +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_OP(reduce_mean); +USE_NPU_BRIDGE(reduce_mean); diff --git a/lite/kernels/npu/bridges/reshape_op.cc b/lite/kernels/npu/bridges/reshape_op.cc index b2ed556faf543cca138dad1cb773225202fbaca5..a554aac94f270517d26ed76016678989b87b6ea6 100644 --- a/lite/kernels/npu/bridges/reshape_op.cc +++ b/lite/kernels/npu/bridges/reshape_op.cc @@ -41,8 +41,10 @@ node_map_type ReshapeConverter(const std::shared_ptr reshape_op, reshape_node->set_input_tensor(*inputs_map.at(x_var_name)); lite::npu::OpList::Global().add(inputs_map.at(x_var_name)); - // read shape from actual shape tensor as input "w" if 'Shape' is found - if (lite::npu::HasInputArg(op_info, scope, "Shape")) { + // read shape from "ShapeTensor"(input), or "Shape"(input), or "shape"(attr) + if (lite::npu::HasInputArg(op_info, scope, "ShapeTensor")) { + LOG(FATAL) << "[NPU] not support \"Shape\" from more than one Tensor."; + } else if (lite::npu::HasInputArg(op_info, scope, "Shape")) { auto actual_shape_var_name = op_info->Input("Shape").front(); if (!inputs_map.count(actual_shape_var_name)) { auto actual_shape = diff --git a/lite/kernels/npu/bridges/sqrt_op.cc b/lite/kernels/npu/bridges/sqrt_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..84ab3a9eb2db7420a7dd193e1c1cc6c32a362e55 --- /dev/null +++ b/lite/kernels/npu/bridges/sqrt_op.cc @@ -0,0 +1,54 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/backends/npu/builder.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace npu { +namespace bridges { + +node_map_type SqrtConverter(const std::shared_ptr sqrt_op, + const node_map_type& inputs_map) { + auto scope = sqrt_op->scope(); + auto op_info = sqrt_op->op_info(); + auto op_type = op_info->Type(); + auto unique_op_type = lite::npu::UniqueName(op_type); + LOG(INFO) << "[NPU] Converting " + op_type + "..."; + + std::shared_ptr sqrt_node = + std::make_shared(unique_op_type); + + auto x_var_name = op_info->Input("X").front(); + + CHECK(inputs_map.count(x_var_name)); + sqrt_node->set_input_x(*inputs_map.at(x_var_name)); + + lite::npu::OpList::Global().add(inputs_map.at(x_var_name)); + lite::npu::OpList::Global().add(sqrt_node); + + node_map_type outputs_map; + outputs_map[op_info->Output("Out").front()] = sqrt_node; + return outputs_map; +} + +} // namespace bridges +} // namespace npu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_NPU_BRIDGE(sqrt, paddle::lite::kernels::npu::bridges::SqrtConverter); diff --git a/lite/kernels/npu/bridges/sqrt_op_test.cc b/lite/kernels/npu/bridges/sqrt_op_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..015d61685b2d99c3df55269442d61b4a137a2ca3 --- /dev/null +++ b/lite/kernels/npu/bridges/sqrt_op_test.cc @@ -0,0 +1,93 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include "lite/core/op_registry.h" +#include "lite/kernels/npu/bridges/registry.h" +#include "lite/kernels/npu/bridges/test_helper.h" +#include "lite/operators/activation_ops.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace npu { +namespace bridges { + +template +void sqrt_ref(const std::shared_ptr op) { + Scope* scope = op->scope(); + const OpInfo* op_info = op->op_info(); + + auto x = scope->FindTensor("x"); + auto out = scope->FindMutableTensor("out_ref"); + out->Resize(x->dims()); + auto x_data = x->data(); + auto out_data = out->mutable_data(); + + for (size_t i = 0; i < x->numel(); i++) { + out_data[i] = std::sqrtf(x_data[i]); + } +} + +void test_sqrt(const std::vector& input_shape) { + // prepare input&output variables + Scope scope; + std::string x_var_name = "x"; + std::string out_var_name = "out"; + std::string out_ref_var_name = "out_ref"; + auto* x = scope.NewTensor(x_var_name); + auto* out = scope.NewTensor(out_var_name); + auto* out_ref = scope.NewTensor(out_ref_var_name); + x->Resize(input_shape); + + // initialize input&output data + FillTensor(x, 0, 5); + + // initialize op desc + cpp::OpDesc opdesc; + opdesc.SetType("sqrt"); + opdesc.SetInput("X", {x_var_name}); + opdesc.SetOutput("Out", {out_var_name}); + + // create and convert op to NPU model, then run it on NPU + auto op = CreateOp(opdesc, &scope); + LauchOp(op, {x_var_name}, {out_var_name}); + + // execute reference implementation and save to output tensor + sqrt_ref(op); + + // compare results + auto* out_data = out->mutable_data(); + auto* out_ref_data = out_ref->mutable_data(); + for (int i = 0; i < out->dims().production(); i++) { + EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-2); + } +} + +TEST(NPUBridges, sqrt) { + test_sqrt({2}); + test_sqrt({2, 3}); + test_sqrt({1, 2, 3, 4}); + test_sqrt({5, 6, 7, 8}); +} + +} // namespace bridges +} // namespace npu +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_OP(sqrt); +USE_NPU_BRIDGE(sqrt); diff --git a/lite/kernels/npu/bridges/square_op.cc b/lite/kernels/npu/bridges/square_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..2ca91adba0a8b24e6559599cb5952f8b47722ba3 --- /dev/null +++ b/lite/kernels/npu/bridges/square_op.cc @@ -0,0 +1,55 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/backends/npu/builder.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace npu { +namespace bridges { + +node_map_type SquareConverter(const std::shared_ptr square_op, + const node_map_type& inputs_map) { + auto scope = square_op->scope(); + auto op_info = square_op->op_info(); + auto op_type = op_info->Type(); + auto unique_op_type = lite::npu::UniqueName(op_type); + LOG(INFO) << "[NPU] Converting " + op_type + "..."; + + std::shared_ptr square_node = + std::make_shared(unique_op_type); + + auto x_var_name = op_info->Input("X").front(); + + CHECK(inputs_map.count(x_var_name)); + square_node->set_input_x(*inputs_map.at(x_var_name)); + + lite::npu::OpList::Global().add(inputs_map.at(x_var_name)); + lite::npu::OpList::Global().add(square_node); + + node_map_type outputs_map; + outputs_map[op_info->Output("Out").front()] = square_node; + return outputs_map; +} + +} // namespace bridges +} // namespace npu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_NPU_BRIDGE(square, + paddle::lite::kernels::npu::bridges::SquareConverter); diff --git a/lite/kernels/npu/bridges/square_op_test.cc b/lite/kernels/npu/bridges/square_op_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..d715c11430096a0b6503fbe6047a40c3c29ba8f5 --- /dev/null +++ b/lite/kernels/npu/bridges/square_op_test.cc @@ -0,0 +1,92 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include "lite/core/op_registry.h" +#include "lite/kernels/npu/bridges/registry.h" +#include "lite/kernels/npu/bridges/test_helper.h" +#include "lite/operators/activation_ops.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace npu { +namespace bridges { + +template +void square_ref(const std::shared_ptr op) { + Scope* scope = op->scope(); + const OpInfo* op_info = op->op_info(); + + auto x = scope->FindTensor("x"); + auto out = scope->FindMutableTensor("out_ref"); + out->Resize(x->dims()); + auto x_data = x->data(); + auto out_data = out->mutable_data(); + + for (size_t i = 0; i < x->numel(); i++) { + out_data[i] = x_data[i] * x_data[i]; + } +} + +void test_square(const std::vector& input_shape) { + // prepare input&output variables + Scope scope; + std::string x_var_name = "x"; + std::string out_var_name = "out"; + std::string out_ref_var_name = "out_ref"; + auto* x = scope.NewTensor(x_var_name); + auto* out = scope.NewTensor(out_var_name); + auto* out_ref = scope.NewTensor(out_ref_var_name); + x->Resize(input_shape); + + // initialize input&output data + FillTensor(x); + + // initialize op desc + cpp::OpDesc opdesc; + opdesc.SetType("square"); + opdesc.SetInput("X", {x_var_name}); + opdesc.SetOutput("Out", {out_var_name}); + + // create and convert op to NPU model, then run it on NPU + auto op = CreateOp(opdesc, &scope); + LauchOp(op, {x_var_name}, {out_var_name}); + + // execute reference implementation and save to output tensor + square_ref(op); + + // compare results + auto* out_data = out->mutable_data(); + auto* out_ref_data = out_ref->mutable_data(); + for (int i = 0; i < out->dims().production(); i++) { + EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-2); + } +} + +TEST(NPUBridges, square) { + test_square({2}); + test_square({2, 3}); + test_square({1, 2, 3, 4}); + test_square({5, 6, 7, 8}); +} + +} // namespace bridges +} // namespace npu +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_OP(square); +USE_NPU_BRIDGE(square); diff --git a/lite/kernels/opencl/CMakeLists.txt b/lite/kernels/opencl/CMakeLists.txt index d070eb84c5313e7539f28da0a90dcc3662be01a1..99b23c19f0f5870102782f0b4d639f6103257c31 100644 --- a/lite/kernels/opencl/CMakeLists.txt +++ b/lite/kernels/opencl/CMakeLists.txt @@ -1,4 +1,4 @@ -if (NOT LITE_WITH_OPENCL) +if ((NOT LITE_ON_MODEL_OPTIMIZE_TOOL) AND (NOT LITE_WITH_OPENCL)) return () endif() diff --git a/lite/kernels/opencl/conv_compute.cc b/lite/kernels/opencl/conv_compute.cc index 04a78face2b9c07c42aceb53f0f797ded46e59d9..e13d12ec224c4ececf53c55c8acb1f1b0e483801 100644 --- a/lite/kernels/opencl/conv_compute.cc +++ b/lite/kernels/opencl/conv_compute.cc @@ -38,15 +38,20 @@ void ConvCompute::PrepareForRun() { int w_out = output_dims[3]; int kernel_h = filter_dims[2]; // oihw int kernel_w = filter_dims[3]; - int pad_h = param.paddings[0]; - int pad_w = param.paddings[1]; + auto paddings = *param.paddings; + auto dilations = *param.dilations; int stride_h = param.strides[0]; int stride_w = param.strides[1]; + int pad_h = paddings[0]; + int pad_w = paddings[2]; int groups = param.groups; bool relu_fused = param.fuse_relu; - bool no_dilation = (param.dilations[0] == 1) && (param.dilations[1] == 1); + bool no_dilation = (dilations[0] == 1) && (dilations[1] == 1); bool zero_pad = (pad_h == 0) && (pad_w == 0); + bool pad_equal = + ((paddings[0] == paddings[1]) && (paddings[2] == paddings[3])); + VLOG(3) << "Is relu fused? / " << (relu_fused ? 
"Yes" : "No"); VLOG(3) << "groups:" << groups << " stride_h:" << stride_h << " stride_w:" << stride_w << " pad_h:" << pad_h @@ -60,7 +65,7 @@ void ConvCompute::PrepareForRun() { << filter_dims[2] << " " << filter_dims[3]; if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && - zero_pad && no_dilation) { + zero_pad && no_dilation && pad_equal) { // conv2d_1x1 kernel_func_names_.push_back("gemm_batch"); kernel_func_paths_.push_back("buffer/fc_kernel.cl"); @@ -70,7 +75,7 @@ void ConvCompute::PrepareForRun() { build_options_.push_back("-DCL_DTYPE=float"); } impl_ = &ConvCompute::Conv2d1x1; - } else { + } else if (pad_equal) { kernel_func_names_.push_back("im2col"); kernel_func_names_.push_back("gemm_batch"); kernel_func_paths_.push_back("buffer/im2col_kernel.cl"); @@ -85,6 +90,9 @@ void ConvCompute::PrepareForRun() { col_buffer_.reset(new lite::Tensor); col_buffer_->Resize({bs, c_in, kernel_h * kernel_w, h_out * w_out}); col_buffer_->mutable_data(TARGET(kOpenCL)); + } else { + LOG(FATAL) << "This pad not support ! " << paddings[0] << ", " + << paddings[1] << ", " << paddings[2] << ", " << paddings[3]; } for (size_t i = 0; i < kernel_func_names_.size(); i++) { @@ -102,17 +110,19 @@ void ConvCompute::GemmlikeConv2d() { int c_in = x_dims[1]; int h_in = x_dims[2]; int w_in = x_dims[3]; + auto paddings = *param.paddings; + auto dilations = *param.dilations; int c_out = output_dims[1]; int h_out = output_dims[2]; int w_out = output_dims[3]; int kernel_h = filter_dims[2]; int kernel_w = filter_dims[3]; - int pad_h = param.paddings[0]; - int pad_w = param.paddings[1]; + int pad_h = paddings[0]; + int pad_w = paddings[2]; int stride_h = param.strides[0]; int stride_w = param.strides[1]; - int dilation_h = param.dilations[0]; - int dilation_w = param.dilations[1]; + int dilation_h = dilations[0]; + int dilation_w = dilations[1]; auto* x_buf = param.x->data(); auto* filter_buf = param.filter->data(); diff --git a/lite/kernels/opencl/conv_compute_test.cc b/lite/kernels/opencl/conv_compute_test.cc index a7417e3525605e208c8e25cd5d34200e6652053d..3bc7a0734db0314f911981027ceeef02fcbf96c7 100644 --- a/lite/kernels/opencl/conv_compute_test.cc +++ b/lite/kernels/opencl/conv_compute_test.cc @@ -24,7 +24,6 @@ namespace lite { #define A(i, j) a[i * lda + j] #define B(i, j) cur_b[i * ldb + j] #define C(i, j) cur_c[i * ldc + j] - template static void conv_basic(const Dtype1* din, Dtype2* dout, @@ -227,10 +226,12 @@ TEST(conv2d, compute_conv2d_1x1) { param.bias = bias_flag ? &bias : nullptr; param.output = &out; param.strides = {stride, stride}; - param.paddings = {pad, pad}; + std::vector paddings = {pad, pad, pad, pad}; param.groups = group; - param.dilations = {dilation, dilation}; + std::vector dilations = {dilation, dilation}; param.fuse_relu = relu_flag; + param.paddings = std::make_shared>(paddings); + param.dilations = std::make_shared>(dilations); kernel->SetParam(param); std::unique_ptr conv_context(new KernelContext); @@ -454,11 +455,14 @@ TEST(conv2d, compute_conv2d_gemm) { param.bias = bias_flag ? 
&bias : nullptr; param.output = &out; param.strides = {stride, stride}; - param.paddings = {pad, pad}; + std::vector paddings = {pad, pad, pad, pad}; param.groups = group; - param.dilations = {dilation, dilation}; + std::vector dilations = {dilation, dilation}; param.fuse_relu = relu_flag; + param.paddings = std::make_shared>(paddings); + param.dilations = std::make_shared>(dilations); + kernel->SetParam(param); std::unique_ptr conv_context(new KernelContext); context->As().CopySharedTo( diff --git a/lite/kernels/opencl/depthwise_conv2d_compute.cc b/lite/kernels/opencl/depthwise_conv2d_compute.cc index 62734610e280c89f9df2e367fd7251c7d25756e7..ed942d7f0cb7b0bab119f258fb6393b9dbd211a6 100644 --- a/lite/kernels/opencl/depthwise_conv2d_compute.cc +++ b/lite/kernels/opencl/depthwise_conv2d_compute.cc @@ -44,7 +44,7 @@ class DepthwiseConv2dCompute auto x_dims = param.x->dims(); auto filter_dims = param.filter->dims(); auto output_dims = param.output->dims(); - auto paddings = param.paddings; + auto paddings = *param.paddings; auto strides = param.strides; auto& context = ctx_->As(); diff --git a/lite/kernels/opencl/depthwise_conv2d_compute_test.cc b/lite/kernels/opencl/depthwise_conv2d_compute_test.cc index a189acaf919e605b4810770e7136d00baeea4bfa..3556d1abedd5b4548b78b90b75de2ee86572fdb7 100644 --- a/lite/kernels/opencl/depthwise_conv2d_compute_test.cc +++ b/lite/kernels/opencl/depthwise_conv2d_compute_test.cc @@ -105,7 +105,8 @@ TEST(depthwise_conv2d, compute) { param.x = &input; param.filter = &filter; param.output = &output; - param.paddings = std::vector{0, 0}; + std::vector paddings = {0, 0}; + param.paddings = std::make_shared>(paddings); param.strides = std::vector{1, 1}; std::unique_ptr context(new KernelContext); diff --git a/lite/kernels/opencl/io_copy_compute.cc b/lite/kernels/opencl/io_copy_compute.cc index dc4bdfe64c65f21e8f68a26df3e2962087f50bef..3387a0887d3422636e39e742149f84672e8e75d4 100644 --- a/lite/kernels/opencl/io_copy_compute.cc +++ b/lite/kernels/opencl/io_copy_compute.cc @@ -103,8 +103,9 @@ class IoCopykOpenCLToHostCompute auto* wait_list = context.cl_wait_list(); auto* x_ptr = param.x->data(); - /* TODO(ysh329): io_copy(device->host) jammed if emplace to `cl_wait_list` - in kernel and enable wait_list + /* TODO(ysh329): io_copy(device->host) jammed if `it` emplaced to + `cl_wait_list` + in kernel and `wait_list` enabled auto it = wait_list->find(x_ptr); if (it != wait_list->end()) { VLOG(4) << "--- Find the sync event for the target cl tensor. 
---"; diff --git a/lite/kernels/opencl/pool_compute.cc b/lite/kernels/opencl/pool_compute.cc index dc2e851595b08e1ff401499502fab64df4dfa46f..d275b312d67b5aba7050a195949ee4c3792b5da7 100644 --- a/lite/kernels/opencl/pool_compute.cc +++ b/lite/kernels/opencl/pool_compute.cc @@ -44,16 +44,22 @@ class PoolCompute const auto& out_dims = param.output->dims(); const std::string pooling_type = param.pooling_type; const bool global_pooling = param.global_pooling; - std::vector paddings = param.paddings; + std::vector paddings = *param.paddings; std::vector strides = param.strides; std::vector ksize = param.ksize; if (global_pooling) { for (size_t i = 0; i < ksize.size(); ++i) { - paddings[i] = 0; + paddings[2 * i] = 0; + paddings[2 * i + 1] = 0; ksize[i] = static_cast(in_dims[i + 2]); } } - + bool pads_equal = + (paddings[0] == paddings[1]) && (paddings[2] == paddings[3]); + if (!pads_equal) { + LOG(FATAL) + << "padding requires pad_left == pad_right, pad_top == pad_bottom"; + } auto& context = ctx_->As(); CHECK(context.cl_context() != nullptr); auto* input_buf = param.x->data(); @@ -89,7 +95,7 @@ class PoolCompute CL_CHECK_FATAL(status); status = kernel.setArg(++arg_idx, static_cast(paddings[0])); CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(paddings[1])); + status = kernel.setArg(++arg_idx, static_cast(paddings[2])); CL_CHECK_FATAL(status); status = kernel.setArg(++arg_idx, *output_buf); CL_CHECK_FATAL(status); diff --git a/lite/kernels/opencl/pool_compute_test.cc b/lite/kernels/opencl/pool_compute_test.cc index 53f64e950500425655fbd450d5961a2a8dbc412d..25f0e72634775f4c5e82a6bd800f9ca980da2e34 100644 --- a/lite/kernels/opencl/pool_compute_test.cc +++ b/lite/kernels/opencl/pool_compute_test.cc @@ -13,6 +13,7 @@ // limitations under the License. 
 #include <gtest/gtest.h>
+#include <memory>
 #include <random>
 #include "lite/backends/opencl/target_wrapper.h"
 #include "lite/core/op_registry.h"
@@ -88,9 +89,10 @@ TEST(pool2d, compute) {
   param.output = &out;
   param.global_pooling = true;
   param.pooling_type = "avg";
-  param.paddings = std::vector<int>{0, 0};
+  std::vector<int> paddings = {0, 0, 0, 0};
   param.strides = std::vector<int>{1, 1};
   param.ksize = std::vector<int>{7, 7};
+  param.paddings = std::make_shared<std::vector<int>>(paddings);
 
   std::unique_ptr<KernelContext> context(new KernelContext);
   context->As<OpenCLContext>().InitOnce();
diff --git a/lite/kernels/x86/CMakeLists.txt b/lite/kernels/x86/CMakeLists.txt
index da955e4fd5902373cd881f85a8bc715eef7cec94..bf3a1685f028740da1b7f4dfa38f19b73d30df89 100644
--- a/lite/kernels/x86/CMakeLists.txt
+++ b/lite/kernels/x86/CMakeLists.txt
@@ -5,6 +5,7 @@ add_kernel(activation_compute_x86 X86 basic SRCS activation_compute.cc DEPS ${li
 # lite_cc_library(fc_compute_x86 SRCS fc_compute.cc DEPS ${lite_kernel_deps})
 add_kernel(scale_compute_x86 X86 basic SRCS scale_compute.cc DEPS ${lite_kernel_deps})
+add_kernel(cast_compute_x86 X86 basic SRCS cast_compute.cc DEPS ${lite_kernel_deps} fluid_data_type)
 add_kernel(slice_compute_x86 X86 basic SRCS slice_compute.cc DEPS ${lite_kernel_deps})
 add_kernel(squeeze_compute_x86 X86 basic SRCS squeeze_compute.cc DEPS ${lite_kernel_deps})
 add_kernel(fill_constant_batch_size_like_compute_x86 X86 basic SRCS fill_constant_batch_size_like_compute.cc DEPS ${lite_kernel_deps} math_function)
@@ -15,8 +16,10 @@ add_kernel(conv_compute_x86 X86 basic SRCS conv_compute.cc DEPS ${lite_kernel_de
 # lite_cc_library(dropout_compute_x86 SRCS dropout_compute.cc DEPS ${lite_kernel_deps} )
 # lite_cc_library(conv_compute_x86 SRCS conv_compute.cc DEPS ${lite_kernel_deps} blas im2col vol2col)
 add_kernel(pool_compute_x86 X86 basic SRCS pool_compute.cc DEPS ${lite_kernel_deps} pooling)
+add_kernel(stack_compute_x86 X86 basic SRCS stack_compute.cc DEPS ${lite_kernel_deps})
 add_kernel(dropout_compute_x86 X86 basic SRCS dropout_compute.cc DEPS ${lite_kernel_deps})
 add_kernel(transpose_compute_x86 X86 basic SRCS transpose_compute.cc DEPS ${lite_kernel_deps} math_function)
+add_kernel(layer_norm_compute_x86 X86 basic SRCS layer_norm_compute.cc DEPS ${lite_kernel_deps} jit_kernel_helper)
 # add_kernel(fc_compute_x86 X86 basic SRCS fc_compute.cc DEPS ${lite_kernel_deps})
 # lite_cc_library(batch_norm_compute_x86 SRCS batch_norm_compute.cc DEPS ${lite_kernel_deps})
 # lite_cc_library(uniform_random_compute_x86 SRCS uniform_random_compute.cc DEPS ${lite_kernel_deps} )
@@ -26,6 +29,7 @@ add_kernel(sequence_expand_as_compute_x86 X86 basic SRCS sequence_expand_as_comp
 # lite_cc_test(test_fc_compute_x86 SRCS fc_compute_test.cc DEPS fc_compute_x86)
 # lite_cc_test(test_conv2d_compute_x86 SRCS conv_compute_test.cc DEPS conv_compute_x86)
+add_kernel(gather_compute_x86 X86 basic SRCS gather_compute.cc DEPS ${lite_kernel_deps} fluid_data_type)
 # lite_cc_test(test_scale_compute_x86 SRCS scale_compute_test.cc DEPS scale_compute_x86)
 # lite_cc_test(test_dropout_compute_x86 SRCS dropout_compute_test.cc DEPS dropout_compute_x86)
 # lite_cc_test(test_batch_norm_compute_x86 SRCS batch_norm_compute_test.cc DEPS batch_norm_compute_x86)
@@ -33,12 +37,27 @@ add_kernel(mul_compute_x86 X86 basic SRCS mul_compute.cc DEPS ${lite_kernel_deps
 add_kernel(concat_compute_x86 X86 basic SRCS concat_compute.cc DEPS ${lite_kernel_deps})
 add_kernel(shape_compute_x86 X86 basic SRCS shape_compute.cc DEPS ${lite_kernel_deps})
 add_kernel(sequence_pool_compute_x86 X86 basic SRCS sequence_pool_compute.cc DEPS ${lite_kernel_deps} sequence_pooling)
+add_kernel(search_group_padding_compute_x86 X86 basic SRCS search_group_padding_compute.cc DEPS ${lite_kernel_deps})
+add_kernel(sequence_reverse_compute_x86 X86 basic SRCS sequence_reverse_compute.cc DEPS ${lite_kernel_deps})
 add_kernel(softmax_compute_x86 X86 basic SRCS softmax_compute.cc DEPS ${lite_kernel_deps} softmax)
 add_kernel(elementwise_compute_x86 X86 basic SRCS elementwise_compute.cc DEPS ${lite_kernel_deps})
 add_kernel(batch_norm_compute_x86 X86 basic SRCS batch_norm_compute.cc DEPS ${lite_kernel_deps})
 add_kernel(reduce_sum_compute_x86 X86 basic SRCS reduce_compute.cc DEPS ${lite_kernel_deps})
 add_kernel(lookup_table_compute_x86 X86 basic SRCS lookup_table_compute.cc DEPS ${lite_kernel_deps})
 add_kernel(sequence_reshape_compute_x86 X86 basic SRCS sequence_reshape_compute.cc DEPS ${lite_kernel_deps})
+add_kernel(match_matrix_tensor_compute_x86 X86 basic SRCS match_matrix_tensor_compute.cc DEPS ${lite_kernel_deps} blas math_function)
+add_kernel(search_seq_depadding_compute_x86 X86 basic SRCS search_seq_depadding_compute.cc DEPS ${lite_kernel_deps})
+add_kernel(search_grnn_compute_x86 X86 basic SRCS search_grnn_compute.cc DEPS ${lite_kernel_deps} blas math_function)
+add_kernel(sequence_concat_compute_x86 X86 basic SRCS sequence_concat_compute.cc DEPS ${lite_kernel_deps})
+add_kernel(var_conv_2d_compute_x86 X86 basic SRCS var_conv_2d_compute.cc DEPS ${lite_kernel_deps} blas fluid_data_type)
+add_kernel(attention_padding_mask_compute_x86 X86 basic SRCS attention_padding_mask_compute.cc DEPS ${lite_kernel_deps})
+add_kernel(sequence_arithmetic_compute_x86 X86 basic SRCS sequence_arithmetic_compute.cc DEPS ${lite_kernel_deps})
+
+# for content-dnn specific
+add_kernel(search_aligned_mat_mul_compute_x86 X86 extra SRCS search_aligned_mat_mul_compute.cc DEPS ${lite_kernel_deps} blas)
+add_kernel(search_seq_fc_compute_x86 X86 extra SRCS search_seq_fc_compute.cc DEPS ${lite_kernel_deps} blas)
+add_kernel(sequence_topk_avg_pooling_compute_x86 X86 basic SRCS sequence_topk_avg_pooling_compute.cc DEPS ${lite_kernel_deps} sequence_topk_avg_pooling)
+add_kernel(search_fc_compute_x86 X86 basic SRCS search_fc_compute.cc DEPS ${lite_kernel_deps} search_fc)
 
 if(NOT LITE_WITH_X86)
   return()
@@ -47,12 +66,14 @@ add_kernel(matmul_compute_x86 X86 basic SRCS matmul_compute.cc DEPS ${lite_kerne
 lite_cc_test(test_conv2d_compute_x86 SRCS conv_compute_test.cc DEPS conv_compute_x86)
 lite_cc_test(test_mul_compute_x86 SRCS mul_compute_test.cc DEPS mul_compute_x86)
+lite_cc_test(test_gather_compute_x86 SRCS gather_compute_test.cc DEPS gather_compute_x86)
 lite_cc_test(test_slice_compute_x86 SRCS slice_compute_test.cc DEPS slice_compute_x86)
 lite_cc_test(test_squeeze_compute_x86 SRCS squeeze_compute_test.cc DEPS squeeze_compute_x86)
 lite_cc_test(test_fill_constant_batch_size_like_compute_x86 SRCS fill_constant_batch_size_like_compute_test.cc DEPS fill_constant_batch_size_like_compute_x86)
 lite_cc_test(test_reshape_compute_x86 SRCS reshape_compute_test.cc DEPS reshape_compute_x86)
 lite_cc_test(test_concat_compute_x86 SRCS concat_compute_test.cc DEPS concat_compute_x86)
 lite_cc_test(test_sequence_pool_compute_x86 SRCS sequence_pool_compute_test.cc DEPS sequence_pool_compute_x86)
+lite_cc_test(test_sequence_reverse_compute_x86 SRCS sequence_reverse_compute_test.cc DEPS sequence_reverse_compute_x86)
 lite_cc_test(test_shape_compute_x86 SRCS shape_compute_test.cc DEPS shape_compute_x86)
 lite_cc_test(test_batch_norm_compute_x86 SRCS batch_norm_compute_test.cc DEPS batch_norm_compute_x86)
 lite_cc_test(test_softmax_compute_x86 SRCS softmax_compute_test.cc DEPS softmax_compute_x86)
@@ -63,7 +84,19 @@ lite_cc_test(test_gelu_compute_x86 SRCS gelu_compute_test.cc DEPS activation_com
 lite_cc_test(test_sequence_expand_as_compute_x86 SRCS sequence_expand_as_compute_test.cc DEPS sequence_expand_as_compute_x86)
 lite_cc_test(test_gru_compute_x86 SRCS gru_compute_test.cc DEPS gru_compute_x86)
 lite_cc_test(test_matmul_compute_x86 SRCS matmul_compute_test.cc DEPS matmul_compute_x86)
-
+lite_cc_test(test_cast_compute_x86 SRCS cast_compute_test.cc DEPS cast_compute_x86)
 lite_cc_test(test_pool2d_compute_x86 SRCS pool_compute_test.cc DEPS pool_compute_x86)
+lite_cc_test(test_layer_norm_compute_x86 SRCS layer_norm_compute_test.cc DEPS layer_norm_compute_x86)
 lite_cc_test(test_dropout_compute_x86 SRCS dropout_compute_test.cc DEPS dropout_compute_x86)
 lite_cc_test(test_transpose_compute_x86 SRCS transpose_compute_test.cc DEPS transpose_compute_x86)
+lite_cc_test(test_search_fc_compute_x86 SRCS search_fc_compute_test.cc DEPS search_fc_compute_x86)
+lite_cc_test(test_search_seq_depadding_compute_x86 SRCS search_seq_depadding_compute_test.cc DEPS search_seq_depadding_compute_x86)
+lite_cc_test(test_search_grnn_compute_x86 SRCS search_grnn_compute_test.cc DEPS search_grnn_compute_x86)
+lite_cc_test(test_match_matrix_compute_x86 SRCS match_matrix_tensor_compute_test.cc DEPS match_matrix_tensor_compute_x86)
+lite_cc_test(test_lookup_table_compute_x86 SRCS lookup_table_compute_test.cc DEPS lookup_table_compute_x86)
+lite_cc_test(test_stack_compute_x86 SRCS stack_compute_test.cc DEPS stack_compute_x86)
+lite_cc_test(test_search_group_padding_compute_x86 SRCS search_group_padding_compute_test.cc DEPS search_group_padding_compute_x86)
+lite_cc_test(test_sequence_concat_compute_x86 SRCS sequence_concat_compute_test.cc DEPS sequence_concat_compute_x86)
+lite_cc_test(test_var_conv_2d_compute_x86 SRCS var_conv_2d_compute_test.cc DEPS var_conv_2d_compute_x86)
+#lite_cc_test(test_attention_padding_mask_compute_x86 SRCS attention_padding_mask_compute_test.cc DEPS attention_padding_mask_compute_x86)
+lite_cc_test(test_sequence_arithmetic_compute_x86 SRCS sequence_arithmetic_compute_test.cc DEPS sequence_arithmetic_compute_x86)
diff --git a/lite/kernels/x86/attention_padding_mask_compute.cc b/lite/kernels/x86/attention_padding_mask_compute.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0c35c416e7771f7896c5378ec8c0199b91ffd685
--- /dev/null
+++ b/lite/kernels/x86/attention_padding_mask_compute.cc
@@ -0,0 +1,28 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +#include "lite/kernels/x86/attention_padding_mask_compute.h" + +REGISTER_LITE_KERNEL( + search_attention_padding_mask, + kX86, + kFloat, + kNCHW, + paddle::lite::kernels::x86::AttentionPaddingMaskCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("pad_begin", {LiteType::GetTensorTy(TARGET(kX86))}) + .Finalize(); diff --git a/lite/kernels/x86/attention_padding_mask_compute.h b/lite/kernels/x86/attention_padding_mask_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..b9124e5ad49a0d68c41a21fe55d28102f09d14b9 --- /dev/null +++ b/lite/kernels/x86/attention_padding_mask_compute.h @@ -0,0 +1,83 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" +#include "lite/core/types.h" +#include "lite/fluid/eigen.h" +#include "lite/operators/attention_padding_mask_op.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +template +class AttentionPaddingMaskCompute + : public KernelLite { + public: + using param_t = operators::AttentionPaddingMaskParam; + + void Run() override { + auto& param = *param_.get_mutable(); + auto* bottom0 = param.X; + auto* bottom1 = param.Y; + auto* _pad_begin = param.pad_begin; + auto* top = param.Out; + int _pad_id = param.pad_id; + float _mask = param.mask; + auto src_len = static_cast(bottom1->lod()[0][1]); + const int att_batch = bottom0->lod()[0].size() - 1; + const int src_batch = bottom1->lod()[0].size() - 1; + int* pad_begin = _pad_begin->mutable_data(); + for (int i = 0; i < src_batch; ++i) { + const auto* src_data = bottom1->data() + src_len * i; + int index = src_len - 1; + for (; index >= 0 && _pad_id == static_cast(src_data[index]); + --index) { + } + pad_begin[i] = index + 1; + } + + const auto att_len = static_cast(bottom0->lod()[0][1]); + auto* top_data = top->mutable_data(); + memcpy(top_data, + bottom0->data(), + bottom0->dims()[0] * bottom0->dims()[1] * sizeof(T)); + for (int i = 0; i < att_batch; ++i) { + for (int j = 0; j < att_len; ++j) { + top_data = top->mutable_data() + src_len * (att_len * i + j); + int src_idx = i % src_batch; + for (int k = pad_begin[src_idx]; k < src_len; ++k) { + top_data[k] = _mask; + } + } + } + } + + virtual ~AttentionPaddingMaskCompute() = default; + + private: + lite::Tensor src_offset_; +}; + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/x86/attention_padding_mask_compute_test.cc b/lite/kernels/x86/attention_padding_mask_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..35ce822e010fc3ce2dc756b86e3a437789cc8359 --- /dev/null +++ b/lite/kernels/x86/attention_padding_mask_compute_test.cc @@ -0,0 +1,132 @@ +// Copyright (c) 2019 
PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/x86/attention_padding_mask_compute.cc" +#include +#include +#include +#include +#include +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +void attention_padding_mask_ref( + const Tensor& x, + const Tensor& y, + Tensor* out, + Tensor* pad_begin, + const operators::AttentionPaddingMaskParam& param) { + auto attn_offset = x.lod()[0]; + auto src_offset = y.lod()[0]; + int attn_seq_num = attn_offset.size() - 1; + int src_seq_num = src_offset.size() - 1; + int attn_seq_len = attn_offset[1]; + int src_seq_len = x.dims()[1]; + CHECK_EQ(attn_seq_num % src_seq_num, 0); + + auto count = x.numel(); + auto attn_data = x.data(); + out->Resize(x.dims()); + out->set_lod(x.lod()); + auto out_data = out->mutable_data(); + memcpy(out_data, attn_data, count * sizeof(float)); + + for (int i = 0; i < attn_seq_num; ++i) { + for (int j = 0; j < attn_seq_len; ++j) { + auto tmp_out_data = out_data + src_seq_len * (attn_seq_len * i + j); + int src_seq_idx = i % src_seq_num; + int cur_len = src_offset[src_seq_idx + 1] - src_offset[src_seq_idx]; + for (int k = cur_len; k < src_seq_len; k++) { + tmp_out_data[k] = param.mask; + } + } + } +} + +void prepare_input(Tensor* x, const LoD& lod, int64_t dim2rd) { + std::vector x_dims{static_cast(lod[0].back()), dim2rd}; + x->Resize(x_dims); + x->set_lod(lod); + auto x_data = x->mutable_data(); + auto x_num = x->numel(); + for (int i = 0; i < x_num; i++) { + x_data[i] = (i - x_num) * 1.1; + } +} + +int get_max_len(const LoD& lod) { + int max_len = 0; + auto offset = lod[0]; + for (int i = 0; i < offset.size() - 1; i++) { + int cur_len = offset[i + 1] - offset[i]; + max_len = max_len < cur_len ? 
cur_len : max_len; + } + return max_len; +} + +TEST(attention_padding_mask_x86, retrive_op) { + auto attention_padding_mask = + KernelRegistry::Global().Create( + "attention_padding_mask"); + ASSERT_FALSE(attention_padding_mask.empty()); + ASSERT_TRUE(attention_padding_mask.front()); +} + +TEST(attention_padding_mask_x86, init) { + AttentionPaddingMaskCompute attention_padding_mask; + ASSERT_EQ(attention_padding_mask.precision(), PRECISION(kFloat)); + ASSERT_EQ(attention_padding_mask.target(), TARGET(kX86)); +} + +TEST(attention_padding_mask_x86, run_test) { + lite::Tensor x, y; + lite::Tensor out, pad_begin, out_ref, pad_begin_ref; + + LoD x_lod{{0, 3, 6, 9, 12}}, y_lod{{0, 4, 6}}; + prepare_input(&x, x_lod, get_max_len(y_lod)); + prepare_input(&y, y_lod, 1); + + operators::AttentionPaddingMaskParam param; + param.X = &x; + param.Y = &y; + param.pad_id = 12800001; + param.mask = -90000000.f; + param.Out = &out; + param.pad_begin = &pad_begin; + + std::unique_ptr ctx(new KernelContext); + ctx->As(); + AttentionPaddingMaskCompute attention_padding_mask_kernel; + attention_padding_mask_kernel.SetParam(param); + attention_padding_mask_kernel.SetContext(std::move(ctx)); + attention_padding_mask_kernel.Run(); + + attention_padding_mask_ref(x, y, &out_ref, &pad_begin_ref, param); + auto out_data = out.data(); + auto out_ref_data = out_ref.data(); + for (int i = 0; i < out.numel(); i++) { + EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-5); + } +} + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(search_attention_padding_mask, kX86, kFloat, kNCHW, def); diff --git a/lite/kernels/x86/cast_compute.cc b/lite/kernels/x86/cast_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..d342056c7f19e9eba0fe16196d772da6bd5fda3c --- /dev/null +++ b/lite/kernels/x86/cast_compute.cc @@ -0,0 +1,25 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/x86/cast_compute.h" + +REGISTER_LITE_KERNEL(cast, + kX86, + kFloat, + kNCHW, + paddle::lite::kernels::x86::CastCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) + .Finalize(); diff --git a/lite/kernels/x86/cast_compute.h b/lite/kernels/x86/cast_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..06e47e9a5023ea149510e8f10bf719cd6a854349 --- /dev/null +++ b/lite/kernels/x86/cast_compute.h @@ -0,0 +1,80 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" +#include "lite/core/types.h" +#include "lite/fluid/data_type.h" +#include "lite/fluid/hostdevice.h" +#include "lite/fluid/transform.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +template +struct CastOpTransformFunctor { + HOSTDEVICE OutT operator()(InT in) const { return static_cast(in); } +}; + +template +class CastOpFunctor { + public: + CastOpFunctor(const lite::Tensor* in, + lite::Tensor* out, + const lite::Context& context) + : input(in), output(out), ctx(context) {} + + template + void apply() const { + auto* in_begin = input->data(); + auto numel = input->dims().production(); + auto* in_end = in_begin + numel; + auto* out_begin = output->mutable_data(); + paddle::lite::fluid::Transform trans; + trans( + ctx, in_begin, in_end, out_begin, CastOpTransformFunctor()); + } + + private: + const lite::Tensor* input; + lite::Tensor* output; + const lite::Context& ctx; +}; + +template +class CastCompute : public KernelLite { + public: + using param_t = operators::CastParam; + + void Run() override { + auto param = param_.get_mutable(); + auto& context = ctx_->As(); + auto x = param->X; + auto out = param->Out; + auto out_dtype = param->out_dtype; + paddle::lite::fluid::VisitDataType( + static_cast(out_dtype), + CastOpFunctor(x, out, context)); + } + virtual ~CastCompute() = default; +}; + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/x86/cast_compute_test.cc b/lite/kernels/x86/cast_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..f7aa52ca6d0dde603357f009220b4a3a53f56833 --- /dev/null +++ b/lite/kernels/x86/cast_compute_test.cc @@ -0,0 +1,77 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/kernels/x86/cast_compute.h" +#include +#include +#include +#include +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +TEST(cast_x86, retrive_op) { + auto cast = + KernelRegistry::Global().Create("cast"); + ASSERT_FALSE(cast.empty()); + ASSERT_TRUE(cast.front()); +} + +TEST(cast_x86, init) { + CastCompute cast; + ASSERT_EQ(cast.precision(), PRECISION(kFloat)); + ASSERT_EQ(cast.target(), TARGET(kX86)); +} + +TEST(cast_x86, run_test) { + lite::Tensor x, out; + constexpr int batch_size = 1; + std::vector x_shape{batch_size, 1, 3, 3}; + x.Resize(lite::DDim(x_shape)); + + std::vector out_shape{batch_size, 1, 3, 3}; + out.Resize(lite::DDim(out_shape)); + + auto x_data = x.mutable_data(); + auto out_data = out.mutable_data(); + + for (int64_t i = 0; i < x.dims().production(); i++) { + x_data[i] = static_cast(1); + } + + CastCompute cast; + operators::CastParam param; + param.X = &x; + param.Out = &out; + std::unique_ptr ctx(new KernelContext); + ctx->As(); + cast.SetContext(std::move(ctx)); + cast.SetParam(param); + cast.Run(); + + std::vector ref_results = {1, 1, 1, 1, 1, 1, 1, 1, 1}; + for (int i = 0; i < out.dims().production(); i++) { + EXPECT_NEAR(out_data[i], ref_results[i], 1e-5); + } +} + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(cast, kX86, kFloat, kNCHW, def); diff --git a/lite/kernels/x86/conv_compute.h b/lite/kernels/x86/conv_compute.h index 48cb3c74ef3c05675115ab7cec09f16322d1410a..e9f403059f90cf6635bc22db3e6890b86cbe85f6 100644 --- a/lite/kernels/x86/conv_compute.h +++ b/lite/kernels/x86/conv_compute.h @@ -67,7 +67,7 @@ class Conv2dCompute : public KernelLite { lite::DDim col_shape(col_shape_vec); lite::DDim col_matrix_shape = col_shape.Flatten2D(data_dim + 1); bool is_expand = IsExpand( - filter_shape_vec, param.strides, param.paddings, param.dilations); + filter_shape_vec, param.strides, *param.paddings, *param.dilations); lite::Tensor col; lite::Tensor col_matrix; if (is_expand) { @@ -95,20 +95,15 @@ class Conv2dCompute : public KernelLite { auto blas = paddle::lite::x86::math::GetBlas(context); for (int i = 0; i < batch_size; i++) { - lite::Tensor in_batch; - lite::Tensor tmp_in_batch = param.x->Slice(i, i + 1); - tmp_in_batch.Resize(input_shape); - in_batch.ShareDataWith(tmp_in_batch); - lite::Tensor out_batch; - lite::Tensor tmp_out_batch = param.output->Slice(i, i + 1); - tmp_out_batch.Resize(output_matrix_shape); - out_batch.ShareDataWith(tmp_out_batch); + lite::Tensor in_batch = param.x->Slice(i, i + 1); + in_batch.Resize(input_shape); + lite::Tensor out_batch = param.output->Slice(i, i + 1); + out_batch.Resize(output_matrix_shape); for (int g = 0; g < param.groups; g++) { - lite::Tensor in_slice; - in_slice.ShareDataWith( + lite::Tensor in_slice = in_batch.Slice(static_cast(g * in_step), - static_cast((g + 1) * in_step))); - + static_cast((g + 1) * in_step)); + auto paddings = *param.paddings; if (!is_expand) { col.ShareDataWith(in_slice); col_matrix.ShareDataWith(col); @@ -117,32 +112,30 @@ class Conv2dCompute : public KernelLite { // im2col im2col(context, in_slice, - param.dilations, + *param.dilations, param.strides, - std::vector{param.paddings[0], - param.paddings[1], - param.paddings[0], - param.paddings[1]}, + std::vector{ + paddings[0], paddings[2], paddings[0], paddings[2]}, &(col)); } else if (data_dim == 3U) { // vol2col vol2col(context, in_slice, - param.dilations, + *param.dilations, param.strides, - 
param.paddings, + *param.paddings, &(col)); } // gemm lite::Tensor out_slice; - out_slice.ShareDataWith( + out_slice = out_batch.Slice(static_cast(g * out_step), - static_cast((g + 1) * out_step))); + static_cast((g + 1) * out_step)); lite::Tensor filter_slice; - filter_slice.ShareDataWith( + filter_slice = filter.Slice(static_cast(g * out_step), - static_cast((g + 1) * out_step))); + static_cast((g + 1) * out_step)); blas.MatMul(filter_slice, false, col_matrix, diff --git a/lite/kernels/x86/conv_compute_test.cc b/lite/kernels/x86/conv_compute_test.cc index f2dde962b9e77ce26336d17f07f29f5874ef9722..2827c6577e5bf311b4002526d4ac10f636162d96 100644 --- a/lite/kernels/x86/conv_compute_test.cc +++ b/lite/kernels/x86/conv_compute_test.cc @@ -73,9 +73,11 @@ TEST(conv2d_x86, run_test) { param.bias = &b; param.output = &out; param.strides = {1, 1}; - param.paddings = {0, 0}; + std::vector paddings = {0, 0, 0, 0}; param.groups = 1; - param.dilations = {1, 1}; + std::vector dilations = {1, 1}; + param.paddings = std::make_shared>(paddings); + param.dilations = std::make_shared>(dilations); LOG(INFO) << 123; std::unique_ptr ctx(new KernelContext); ctx->As(); diff --git a/lite/kernels/x86/fill_constant_compute.cc b/lite/kernels/x86/fill_constant_compute.cc index 1eb76332ccc21b0c5196d71b9246ed8b144a6593..dace1e90258a93aa5c8e89d1d9369adf39416659 100644 --- a/lite/kernels/x86/fill_constant_compute.cc +++ b/lite/kernels/x86/fill_constant_compute.cc @@ -29,6 +29,38 @@ class FillConstantCompute : public KernelLite { public: using param_t = operators::FillConstantParam; + inline DDimLite GetShape(const param_t& param) { + // 1. shape is a Tensor + if (param.shape_tensor != nullptr) { + auto* shape_tensor = param.shape_tensor; + auto* shape_data = shape_tensor->data(); + auto vec_shape = + std::vector(shape_data, shape_data + shape_tensor->numel()); + return DDimLite(vec_shape); + } + + // 2. shape is a list/tuple containing Tensor + auto shape_tensor_list = param.shape_tensor_list; + if (shape_tensor_list.size() > 0) { + std::vector vec_shape; + for (size_t i = 0; i < shape_tensor_list.size(); ++i) { + auto tensor = shape_tensor_list[i]; + vec_shape.push_back(*tensor->data()); + } + return DDimLite(vec_shape); + } + + // 3. shape is a list/tuple without containing Tensor + auto vec_shape = param.shape; + return DDimLite(vec_shape); + } + + void PrepareForRun() override { + auto& param = *param_.get_mutable(); + auto outdims = GetShape(param); + param.Out->Resize(outdims); + } + void Run() override { auto& param = *param_.get_mutable(); auto& context = ctx_->As(); @@ -55,5 +87,9 @@ REGISTER_LITE_KERNEL(fill_constant, kNCHW, paddle::lite::kernels::x86::FillConstantCompute, def) + .BindInput("ShapeTensor", + {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))}) + .BindInput("ShapeTensorList", + {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) .Finalize(); diff --git a/lite/kernels/x86/gather_compute.cc b/lite/kernels/x86/gather_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..836f336271ef53c338cca89855b48c94c778cc54 --- /dev/null +++ b/lite/kernels/x86/gather_compute.cc @@ -0,0 +1,32 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/x86/gather_compute.h" + +typedef paddle::lite::kernels::x86::GatherCompute GatherInt32; +typedef paddle::lite::kernels::x86::GatherCompute GatherInt64; + +REGISTER_LITE_KERNEL(gather, kX86, kFloat, kNCHW, GatherInt32, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindInput("Index", + {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kInt32))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) + .Finalize(); + +REGISTER_LITE_KERNEL(gather, kX86, kFloat, kNCHW, GatherInt64, int64_in) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindInput("Index", + {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kInt64))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) + .Finalize(); diff --git a/lite/kernels/x86/gather_compute.h b/lite/kernels/x86/gather_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..6ee270647f8fb7d7ec540047cd4d546a7eb89ce8 --- /dev/null +++ b/lite/kernels/x86/gather_compute.h @@ -0,0 +1,99 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include "lite/api/paddle_place.h" +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" +#include "lite/core/types.h" +#include "lite/fluid/data_type.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +/** + * A thin wrapper for gathering on cpu tensor + * Return a new tensor from source tensor, gathered according to index + * input[src]: type-T source Tensor + * input[index]: type-IndexT index Tensor (1-D) + * return: output tensor + */ +template +void CPUGather(const lite::Tensor* src, + const lite::Tensor* index, + lite::Tensor* output) { + // check index of shape 1-D + if (index->dims().size() == 2) { + CHECK(index->dims()[1] == 1) << "Index(Input)'s dimension[1] should be 1 " + "when Index(input)'s dimension's size " + "equal to 2 in Gather(Op)."; + } else { + CHECK(index->dims().size() == 1) + << "Index(Input)'s dimension's size() should be 1 or 2 in Gather(Op)."; + } + int64_t index_size = index->dims()[0]; + + auto src_dims = src->dims(); + + const T* p_src = src->data(); + const IndexT* p_index = index->data(); + T* p_output = output->mutable_data(); + + // slice size + int slice_size = 1; + for (int i = 1; i < src_dims.size(); ++i) slice_size *= src_dims[i]; + + const size_t slice_bytes = slice_size * sizeof(T); + for (int64_t i = 0; i < index_size; ++i) { + int index_ = p_index[i]; + memcpy(p_output + i * slice_size, p_src + index_ * slice_size, slice_bytes); + } +} + +template +class GatherCompute : public KernelLite { + public: + using param_t = operators::GatherParam; + + void Run() override { + auto& param = *param_.get_mutable(); + + auto x = param.X; + auto index = param.Index; + auto out = param.Out; + + out->mutable_data(); + if (x->dims().production() == 0) return; + /* + * Since there's no type defined for lite::Tensor in Paddle-Lite, then + * convert the Index's value to float which must be int32_t or int64_t and + * this supposes to cause no precision difference during inference just for + * now. + * Alternatively, if define the Tensor's type during registering, may cause + * a redefinition error. + */ + CPUGather(x, index, out); + } + + virtual ~GatherCompute() = default; +}; + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/x86/gather_compute_test.cc b/lite/kernels/x86/gather_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..286dfcb08a0c2c7bc038e0ad3b5673bd7c0f8b19 --- /dev/null +++ b/lite/kernels/x86/gather_compute_test.cc @@ -0,0 +1,159 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/kernels/x86/gather_compute.h" +#include +#include +#include +#include +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +TEST(gather_x86, retrive_op) { + auto gather = + KernelRegistry::Global().Create( + "gather"); + ASSERT_FALSE(gather.empty()); + int cnt = 0; + for (auto item = gather.begin(); item != gather.end(); ++item) { + cnt++; + ASSERT_TRUE(*item); + } + ASSERT_EQ(cnt, 2); +} + +TEST(gather_x86, int32_init) { + GatherCompute gather; + ASSERT_EQ(gather.precision(), PRECISION(kFloat)); + ASSERT_EQ(gather.target(), TARGET(kX86)); +} + +TEST(gather_x86, int64_init) { + GatherCompute gather; + ASSERT_EQ(gather.precision(), PRECISION(kFloat)); + ASSERT_EQ(gather.target(), TARGET(kX86)); +} + +template +void test_case_1dims() { + lite::Tensor x, index, out; + std::vector x_shape{10}; + x.Resize(lite::DDim(x_shape)); + std::vector index_shape{3}; + index.Resize(lite::DDim(index_shape)); + std::vector out_shape{3}; + out.Resize(lite::DDim(out_shape)); + + auto x_data = x.mutable_data(); + auto index_data = index.mutable_data(); + auto out_data = out.mutable_data(); + + for (int64_t i = 0; i < x.dims().production(); ++i) { + x_data[i] = static_cast(i); + } + std::vector index_value{1, 3, 5}; + for (int i = 0; i < index.dims().production(); ++i) { + index_data[i] = static_cast(index_value[i]); + } + + GatherCompute gather; + operators::GatherParam param; + + param.X = &x; + param.Index = &index; + param.Out = &out; + + std::unique_ptr ctx(new KernelContext); + ctx->As(); + gather.SetContext(std::move(ctx)); + gather.SetParam(param); + gather.Run(); + + std::vector ref_data{1, 3, 5}; + for (int i = 0; i < out.dims().production(); i++) { + EXPECT_NEAR(out_data[i], ref_data[i], 1e-5); + } +} + +template +void test_case_2dims() { + lite::Tensor x, index, out; + std::vector x_shape{10, 20}; + x.Resize(lite::DDim(x_shape)); + std::vector index_shape{3}; + index.Resize(lite::DDim(index_shape)); + std::vector out_shape{3, 20}; + out.Resize(lite::DDim(out_shape)); + + auto x_data = x.mutable_data(); + auto index_data = index.mutable_data(); + auto out_data = out.mutable_data(); + + for (int64_t i = 0; i < x.dims().production(); ++i) { + x_data[i] = static_cast(i); + } + std::vector index_value{1, 3, 5}; + for (int i = 0; i < index.dims().production(); ++i) { + index_data[i] = static_cast(index_value[i]); + } + + GatherCompute gather; + operators::GatherParam param; + + param.X = &x; + param.Index = &index; + param.Out = &out; + + std::unique_ptr ctx(new KernelContext); + ctx->As(); + gather.SetContext(std::move(ctx)); + gather.SetParam(param); + gather.Run(); + + std::vector ref_data(60); + for (int i = 0; i < 20; ++i) { + ref_data[i] = static_cast(20 + i); + } + for (int i = 20; i < 40; ++i) { + ref_data[i] = static_cast(40 + i); + } + for (int i = 40; i < 60; ++i) { + ref_data[i] = static_cast(60 + i); + } + for (int i = 0; i < out.dims().production(); i++) { + EXPECT_NEAR(out_data[i], ref_data[i], 1e-5); + } +} + +TEST(gather_x86, run_test_1dims) { + test_case_1dims(); + test_case_1dims(); +} + +TEST(gather_x86, run_test_2dims) { + test_case_2dims(); + test_case_2dims(); +} + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(gather, kX86, kFloat, kNCHW, def); +USE_LITE_KERNEL(gather, kX86, kFloat, kNCHW, int64_in); diff --git a/lite/kernels/x86/layer_norm_compute.cc b/lite/kernels/x86/layer_norm_compute.cc new file mode 100644 index 
0000000000000000000000000000000000000000..4854a69a1d5f38bff102d984f990aea4ad723439 --- /dev/null +++ b/lite/kernels/x86/layer_norm_compute.cc @@ -0,0 +1,29 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/x86/layer_norm_compute.h" + +REGISTER_LITE_KERNEL(layer_norm, + kX86, + kFloat, + kNCHW, + paddle::lite::kernels::x86::LayerNormCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindInput("Scale", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Y", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Mean", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Variance", {LiteType::GetTensorTy(TARGET(kX86))}) + .Finalize(); diff --git a/lite/kernels/x86/layer_norm_compute.h b/lite/kernels/x86/layer_norm_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..bbbdb91debfd7d7b046a3eb18a535462c69e358c --- /dev/null +++ b/lite/kernels/x86/layer_norm_compute.h @@ -0,0 +1,91 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "lite/backends/x86/jit/helper.h" +#include "lite/backends/x86/jit/kernel_base.h" +#include "lite/backends/x86/jit/kernels.h" +#include "lite/core/kernel.h" +#include "lite/core/op_lite.h" +#include "lite/core/op_registry.h" +#include "lite/core/type_system.h" +#include "lite/operators/layer_norm_op.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +template +class LayerNormCompute : public KernelLite { + public: + using param_t = operators::LayerNormParam; + + void Run() override { + auto ¶m = *param_.get_mutable(); + float epsilon = param.epsilon; + auto Scale = param.Scale; + auto Bias = param.Bias; + auto x = param.X; + + auto y = param.Y; + auto Mean = param.Mean; + auto Var = param.Variance; + auto begin_norm_axis = param.begin_norm_axis; + + auto x_dims = x->dims(); + + y->mutable_data(); + Mean->mutable_data(); + Var->mutable_data(); + + auto matrix_dim = x_dims.Flatten2D(begin_norm_axis); + int left = static_cast(matrix_dim[0]); + int right = static_cast(matrix_dim[1]); + lite::DDim matrix_shape({left, right}); + + lite::Tensor in; + in.ShareDataWith(*x); + in.Resize(matrix_shape); + lite::Tensor out; + out.ShareDataWith(*y); + out.Resize(matrix_shape); + + PADDLE_ENFORCE_EQ(Mean->numel(), left); + PADDLE_ENFORCE_EQ(Var->numel(), left); + PADDLE_ENFORCE_EQ(Scale->numel(), right); + PADDLE_ENFORCE_EQ(Bias->numel(), right); + + auto ker = paddle::lite::jit::KernelFuncs, + lite::fluid::CPUPlace>::Cache() + .At(right); + ker(in.mutable_data(), + out.mutable_data(), + Mean->mutable_data(), + Var->mutable_data(), + Scale->data(), + Bias->data(), + static_cast(left), + static_cast(epsilon), + right); + } + + virtual ~LayerNormCompute() = default; +}; + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/x86/layer_norm_compute_test.cc b/lite/kernels/x86/layer_norm_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..fbac39505204b3799f6c5274f80690196e83a725 --- /dev/null +++ b/lite/kernels/x86/layer_norm_compute_test.cc @@ -0,0 +1,169 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/kernels/x86/layer_norm_compute.h" +#include +#include +#include +#include +#include "lite/backends/x86/jit/helper.h" +#include "lite/backends/x86/jit/kernel_base.h" +#include "lite/backends/x86/jit/kernels.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +std::vector ref(lite::Tensor* x, + lite::Tensor* Scale, + lite::Tensor* Bias, + lite::Tensor* y, + lite::Tensor* Mean, + lite::Tensor* Var, + int begin_norm_axis, + float epsilon) { + auto x_dims = x->dims(); + + y->mutable_data(); + Mean->mutable_data(); + Var->mutable_data(); + + auto matrix_dim = x_dims.Flatten2D(begin_norm_axis); + int left = static_cast(matrix_dim[0]); + int right = static_cast(matrix_dim[1]); + lite::DDim matrix_shape({left, right}); + + x->Resize(matrix_shape); + Tensor out; + out.ShareDataWith(*y); + out.Resize(matrix_shape); + + auto ker = paddle::lite::jit::KernelFuncs, + lite::fluid::CPUPlace>::Cache() + .At(right); + ker(x->mutable_data(), + out.mutable_data(), + Mean->mutable_data(), + Var->mutable_data(), + Scale->data(), + Bias->data(), + static_cast(left), + static_cast(epsilon), + right); + + std::vector ref_data; + auto result = out.mutable_data(); + for (int i = 0; i < y->dims().production(); ++i) { + ref_data.emplace_back(result[i]); + } + return ref_data; +} + +// layer_norm +TEST(layer_norm_x86, retrive_op) { + auto layer_norm = + KernelRegistry::Global().Create( + "layer_norm"); + ASSERT_FALSE(layer_norm.empty()); + ASSERT_TRUE(layer_norm.front()); +} + +TEST(layer_norm_x86, init) { + lite::kernels::x86::LayerNormCompute layer_norm; + ASSERT_EQ(layer_norm.precision(), PRECISION(kFloat)); + ASSERT_EQ(layer_norm.target(), TARGET(kX86)); +} + +TEST(layer_norm_x86, run_test) { + lite::Tensor x; + lite::Tensor Scale; + lite::Tensor Bias; + + lite::Tensor out; + lite::Tensor Mean; + lite::Tensor Var; + + std::vector x_shape({1, 2, 3, 1}); + x.Resize(lite::DDim(x_shape)); + std::vector out_shape({1, 2, 3, 1}); + out.Resize(lite::DDim(out_shape)); + + int begin_norm_axis = 0; + float epsilon = 1e-5; + int pre = 1; + int post = 1; + for (int i = 0; i < begin_norm_axis; ++i) { + pre *= x_shape[i]; + } + for (int i = begin_norm_axis; i < x_shape.size(); ++i) { + post *= x_shape[i]; + } + std::vector scale_shape({post}); + Scale.Resize(scale_shape); + std::vector bias_shape({post}); + Bias.Resize(bias_shape); + + auto x_data = x.mutable_data(); + auto scale_data = Scale.mutable_data(); + auto bias_data = Bias.mutable_data(); + auto out_data = out.mutable_data(); + auto mean_data = Mean.mutable_data(); + auto var_data = Var.mutable_data(); + + for (int64_t i = 0; i < x.dims().production(); ++i) { + x_data[i] = static_cast(i); + } + for (int64_t i = 0; i < Scale.dims().production(); ++i) { + scale_data[i] = 1.5; + } + for (int64_t i = 0; i < Bias.dims().production(); ++i) { + bias_data[i] = 0.25; + } + + LayerNormCompute layer_norm; + operators::LayerNormParam param; + + param.X = &x; + param.Y = &out; + param.Scale = &Scale; + param.Bias = &Bias; + param.Mean = &Mean; + param.Variance = &Var; + param.begin_norm_axis = begin_norm_axis; + param.epsilon = epsilon; + + std::unique_ptr ctx(new KernelContext); + ctx->As(); + layer_norm.SetContext(std::move(ctx)); + layer_norm.SetParam(param); + layer_norm.Run(); + + std::vector ref_data = + ref(&x, &Scale, &Bias, &out, &Mean, &Var, begin_norm_axis, epsilon); + for (int j = 0; j < out.dims().production(); ++j) { + EXPECT_NEAR(out_data[j], ref_data[j], 1e-5); + // LOG(INFO) << 
out_data[j]; + } + LOG(INFO) << *mean_data; + LOG(INFO) << *var_data; +} + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(layer_norm, kX86, kFloat, kNCHW, def); diff --git a/lite/kernels/x86/lookup_table_compute.cc b/lite/kernels/x86/lookup_table_compute.cc index 364593251e17453011bad5b2c1057fc25d54d7c8..856a07a94cada4702d47820605436cee6523a527 100644 --- a/lite/kernels/x86/lookup_table_compute.cc +++ b/lite/kernels/x86/lookup_table_compute.cc @@ -32,3 +32,13 @@ REGISTER_LITE_KERNEL(lookup_table, .BindInput("Ids", {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kInt64))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) .Finalize(); +REGISTER_LITE_KERNEL(lookup_table_v2, + kX86, + kInt64, + kNCHW, + paddle::lite::kernels::x86::LookupTableCompute, + def) + .BindInput("W", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindInput("Ids", {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kInt64))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) + .Finalize(); diff --git a/lite/kernels/x86/lookup_table_compute.h b/lite/kernels/x86/lookup_table_compute.h index e0d7752ca77c810700f57722c4186b4e02d6411f..d5719f332ce4b0b590b0cab26c5a98e864d2cc5e 100644 --- a/lite/kernels/x86/lookup_table_compute.h +++ b/lite/kernels/x86/lookup_table_compute.h @@ -30,7 +30,6 @@ class LookupTableCompute : public KernelLite { void Run() override { auto ¶m = *param_.get_mutable(); - // auto& context = context_->As(); auto *ids_t = param.Ids; auto *output_t = param.Out; int64_t padding_idx = param.padding_idx; @@ -41,18 +40,18 @@ class LookupTableCompute : public KernelLite { int64_t row_number = table_t->dims()[0]; int64_t row_width = table_t->dims()[1]; - auto *table = table_t->data(); - auto *output = output_t->mutable_data(); - memset(output, 0, output_t->dims().production() * sizeof(float)); + auto *table = table_t->data(); + auto *output = output_t->mutable_data(); + memset(output, 0, output_t->dims().production() * sizeof(T)); for (int64_t i = 0; i < ids_numel; ++i) { if (padding_idx != -1 && ids[i] == padding_idx) { - memset(output + i * row_width, 0, row_width * sizeof(float)); + memset(output + i * row_width, 0, row_width * sizeof(T)); } else { CHECK_LT(ids[i], row_number); CHECK_GE(ids[i], 0); memcpy(output + i * row_width, table + ids[i] * row_width, - row_width * sizeof(float)); + row_width * sizeof(T)); } } } diff --git a/lite/kernels/x86/lookup_table_compute_test.cc b/lite/kernels/x86/lookup_table_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..86b2d39186b10de6def72a217cd6c70773b59420 --- /dev/null +++ b/lite/kernels/x86/lookup_table_compute_test.cc @@ -0,0 +1,82 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/kernels/x86/lookup_table_compute.h" +#include +#include +#include +#include +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +TEST(lookup_table_x86, compute) { + LookupTableCompute lookup_table; + operators::LookupTableParam param; + lite::Tensor w, ids, out, out_ref; + int64_t padding_idx = -1; + + int vocab_size = 40; + int emb_size = 50; + int ids_h = 30; + int ids_w = 20; + + auto w_dim = DDim({vocab_size, emb_size}); + auto ids_dim = DDim({ids_h, ids_w}); + auto out_dim = DDim({ids_h, ids_w, emb_size}); + + w.Resize(w_dim); + ids.Resize(ids_dim); + out.Resize(out_dim); + out_ref.Resize(out_dim); + + auto* w_data = w.mutable_data(); + auto* ids_data = ids.mutable_data(); + auto* out_data = out.mutable_data(); + auto* out_ref_data = out_ref.mutable_data(); + + int w_num = w_dim.production(); + for (int i = 0; i < w_num; i++) { + w_data[i] = static_cast(i + 1) / (w_num + 1); + } + int ids_num = ids_dim.production(); + for (int i = 0; i < ids_num; i++) { + ids_data[i] = i % vocab_size; + } + int out_num = out_dim.production(); + for (int i = 0; i < out_num; i++) { + out_ref_data[i] = + static_cast((i % (vocab_size * emb_size)) + 1) / (w_num + 1); + } + + param.W = &w; + param.Ids = &ids; + param.Out = &out; + param.padding_idx = padding_idx; + lookup_table.SetParam(param); + lookup_table.Run(); + for (int i = 0; i < out_num; i++) { + EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-5); + } +} + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(lookup_table, kX86, kInt64, kNCHW, def); diff --git a/lite/kernels/x86/match_matrix_tensor_compute.cc b/lite/kernels/x86/match_matrix_tensor_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..feda180d22e59b2ca0e8f0f89f3c7a1ddb8acd4a --- /dev/null +++ b/lite/kernels/x86/match_matrix_tensor_compute.cc @@ -0,0 +1,142 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/kernels/x86/match_matrix_tensor_compute.h" +#include + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +template +void MatchMatrixTensorCompute::Run() { + auto& context = ctx_->As(); + auto& param = this->Param(); + auto* x = param.x; + auto* w = param.w; + auto* y = param.y; + auto* out = param.out; + auto* tmp = param.tmp; + int dim_t = param.dim_t; + int dim_in = x->dims()[1]; + + const auto& offset_l = x->lod()[0]; + const auto& offset_r = y->lod()[0]; + + std::vector top_offset; + int top_size = 0; + top_offset.push_back(top_size); + for (size_t b = 0; b < x->lod()[0].size() - 1; b++) { + int len_l = offset_l[b + 1] - offset_l[b]; + int len_r = offset_r[b + 1] - offset_r[b]; + top_size += dim_t * len_l * len_r; + top_offset.push_back(top_size); + } + + auto* bottom_l_data = x->template data(); + auto* bottom_r_data = y->template data(); + auto* t_data = w->template data(); + auto* out_data = out->template mutable_data(); + auto* bottom_l_trans_data = tmp->template mutable_data(); + memset(out_data, 0.0, out->dims()[0] * out->dims()[1] * sizeof(T)); + memset(bottom_l_trans_data, 0.0, tmp->dims()[0] * tmp->dims()[1] * sizeof(T)); + + auto blas = lite::x86::math::GetBlas(context); + blas.GEMM(CblasNoTrans, + CblasNoTrans, + x->dims()[0], + dim_t * dim_in, + dim_in, + 1.0f, + bottom_l_data, + dim_in, + t_data, + dim_t * dim_in, + 0.0f, + bottom_l_trans_data, + dim_t * dim_in); + + for (size_t b = 0; b < x->lod()[0].size() - 1; b++) { + for (int t = 0; t < dim_t; t++) { + int len_l = offset_l[b + 1] - offset_l[b]; + int len_r = offset_r[b + 1] - offset_r[b]; + auto* top_data = out_data + top_offset[b] + t * len_l * len_r; + const auto* l_t_data = + bottom_l_trans_data + offset_l[b] * dim_t * dim_in + t * dim_in; + const auto* r_data = bottom_r_data + offset_r[b] * dim_in; + + auto blas = lite::x86::math::GetBlas(context); + blas.GEMM(CblasNoTrans, + CblasTrans, + len_l, + len_r, + dim_in, + 1.0f, + l_t_data, + dim_t * dim_in, + r_data, + dim_in, + 0.0f, + top_data, + len_r); + } + } + + int batch_size = x->lod()[0].size() - 1; + int lod_lv1_size = batch_size * dim_t; + int lod_lv2_size = x->lod()[0].back() * dim_t; + std::vector out_lod0(batch_size + 1, 0); + std::vector out_lod1(lod_lv1_size + 1, 0); + std::vector out_lod2(lod_lv2_size + 1, 0); + for (int i = 0; i < batch_size; i++) { + out_lod0[i + 1] = out_lod0[i] + dim_t; + int len_l = offset_l[i + 1] - offset_l[i]; + + for (int j = 0; j < dim_t; j++) { + out_lod1[i * dim_t + j + 1] = out_lod1[i * dim_t + j] + len_l; + int len_r = offset_r[i + 1] - offset_r[i]; + + for (int k = 0; k < len_l; k++) { + out_lod2[offset_l[i] * dim_t + j * len_l + k + 1] = + out_lod2[offset_l[i] * dim_t + j * len_l + k] + len_r; + } + } + } + + LoD out_lod; + out_lod.push_back(top_offset); + out_lod.push_back(offset_l); + out_lod.push_back(offset_r); + out->set_lod(out_lod); +} + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL( + match_matrix_tensor, + kX86, + kFloat, + kNCHW, + paddle::lite::kernels::x86::MatchMatrixTensorCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindInput("W", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Tmp", {LiteType::GetTensorTy(TARGET(kX86))}) + .Finalize(); diff --git a/lite/kernels/x86/match_matrix_tensor_compute.h b/lite/kernels/x86/match_matrix_tensor_compute.h 
new file mode 100644
index 0000000000000000000000000000000000000000..6189676fd846e2ac73fb17ffb966cdf815d9a371
--- /dev/null
+++ b/lite/kernels/x86/match_matrix_tensor_compute.h
@@ -0,0 +1,42 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+
+#include
+#include "lite/backends/x86/math/blas.h"
+#include "lite/core/kernel.h"
+#include "lite/core/op_lite.h"
+#include "lite/core/op_registry.h"
+#include "lite/operators/op_params.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace x86 {
+
+template <typename T>
+class MatchMatrixTensorCompute
+    : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
+ public:
+  using param_t = operators::MatchMatrixTensorParam;
+
+  void Run() override;
+
+  virtual ~MatchMatrixTensorCompute() = default;
+};
+
+}  // namespace x86
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/kernels/x86/match_matrix_tensor_compute_test.cc b/lite/kernels/x86/match_matrix_tensor_compute_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0c3f3ad50940ab0059ab04fb507a786f735584b9
--- /dev/null
+++ b/lite/kernels/x86/match_matrix_tensor_compute_test.cc
@@ -0,0 +1,116 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +#include "lite/kernels/x86/match_matrix_tensor_compute.h" +#include +#include +#include +#include +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +TEST(match_matrix_tensor_x86, retrive_op) { + auto kernel = + KernelRegistry::Global().Create( + "match_matrix_tensor"); + ASSERT_FALSE(kernel.empty()); + ASSERT_TRUE(kernel.front()); +} + +TEST(match_matrix_tensor_x86, init) { + MatchMatrixTensorCompute mmtc; + ASSERT_EQ(mmtc.precision(), PRECISION(kFloat)); + ASSERT_EQ(mmtc.target(), TARGET(kX86)); +} + +TEST(match_matrix_tensor_x86, run_test) { + int ix = 5, iy = 4, h = 2, dim_t = 2; + lite::Tensor x, w, y, out, tmp; + x.Resize({ix, h}); + w.Resize({h, dim_t, h}); + y.Resize({iy, h}); + out.Resize({18, 1}); + tmp.Resize({20, 1}); + + LoD x_lod{}; + x_lod.push_back({0, 2, 5}); + x.set_lod(x_lod); + LoD y_lod{}; + y_lod.push_back({0, 3, 4}); + y.set_lod(y_lod); + + auto* x_data = x.mutable_data(); + for (int64_t i = 0; i < x.numel(); i++) { + x_data[i] = static_cast(i); + } + auto* y_data = y.mutable_data(); + for (int64_t i = 0; i < y.numel(); i++) { + y_data[i] = static_cast(i); + } + auto* w_data = w.mutable_data(); + for (int64_t i = 0; i < w.numel(); i++) { + w_data[i] = static_cast(i); + } + + std::unique_ptr ctx(new KernelContext); + ctx->As(); + MatchMatrixTensorCompute mmtc; + mmtc.SetContext(std::move(ctx)); + + operators::MatchMatrixTensorParam param; + param.x = &x; + param.w = &w; + param.y = &y; + param.dim_t = dim_t; + param.out = &out; + param.tmp = &tmp; + + mmtc.SetParam(param); + mmtc.Run(); + + std::vector ref_results = {5, + 23, + 41, + 17, + 75, + 133, + 7, + 33, + 59, + 27, + 125, + 223, + 323, + 455, + 587, + 557, + 793, + 1029}; + auto* out_data = out.mutable_data(); + for (int i = 0; i < out.dims().production(); i++) { + EXPECT_NEAR(out_data[i], ref_results[i], 1e-3); + // LOG(INFO) << out_data[i]; + } +} + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(match_matrix_tensor, kX86, kFloat, kNCHW, def); diff --git a/lite/kernels/x86/mean_compute.cc b/lite/kernels/x86/mean_compute.cc index b618d2d3775e148c4b5f2c864eaa4de2dc40c08a..1216d99ad807c673ee6aa764fd895732540d86c5 100644 --- a/lite/kernels/x86/mean_compute.cc +++ b/lite/kernels/x86/mean_compute.cc @@ -54,29 +54,6 @@ class MeanCompute : public KernelLite { virtual ~MeanCompute() = default; }; -template -class MeanGradCompute : public KernelLite { - public: - using param_t = operators::MeanGradParam; - - void Run() override { - auto& param = *param_.get_mutable(); - auto& context = ctx_->As(); - CHECK_EQ(param.Out_grad->raw_tensor().numel(), 1); - CHECK(context.x86_device_context()); - - param.X_grad->template mutable_data(); - T x_grad_size = static_cast(param.X_grad->raw_tensor().numel()); - Eigen::DSizes bcast(static_cast(x_grad_size)); - EigenVector::Flatten(param.X_grad->raw_tensor()) - .device(*(context.x86_device_context()->eigen_device())) = - (EigenVector::From(param.Out_grad->raw_tensor()) / x_grad_size) - .broadcast(bcast); - } - - virtual ~MeanGradCompute() = default; -}; - } // namespace x86 } // namespace kernels } // namespace lite @@ -93,16 +70,3 @@ REGISTER_LITE_KERNEL(mean, .BindInput("Y", {LiteType::GetTensorTy(TARGET(kX86))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) .Finalize(); - -REGISTER_LITE_KERNEL(mean_grad, - kX86, - kFloat, - kNCHW, - paddle::lite::kernels::x86::MeanGradCompute, - def) - .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) - 
.BindInput(paddle::framework::GradVarName("Out"), - {LiteType::GetTensorTy(TARGET(kX86))}) - .BindOutput(paddle::framework::GradVarName("X"), - {LiteType::GetTensorTy(TARGET(kX86))}) - .Finalize(); diff --git a/lite/kernels/x86/mul_compute.cc b/lite/kernels/x86/mul_compute.cc index 64558f66772381ad402a3eb203bb6efd9fceff60..3de4340543cff6867f7879f0551be7a33c9e6862 100644 --- a/lite/kernels/x86/mul_compute.cc +++ b/lite/kernels/x86/mul_compute.cc @@ -24,21 +24,3 @@ REGISTER_LITE_KERNEL(mul, .BindInput("Y", {LiteType::GetTensorTy(TARGET(kX86))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) .Finalize(); - -// #ifdef LITE_WITH_TRAIN -// REGISTER_LITE_KERNEL(mul_grad, -// kX86, -// kFloat, -// kNCHW, -// paddle::lite::kernels::x86::MulGradCompute, -// def) -// .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) -// .BindInput("Y", {LiteType::GetTensorTy(TARGET(kX86))}) -// .BindInput(paddle::framework::GradVarName("Out"), -// {LiteType::GetTensorTy(TARGET(kX86))}) -// .BindOutput(paddle::framework::GradVarName("X"), -// {LiteType::GetTensorTy(TARGET(kX86))}) -// .BindOutput(paddle::framework::GradVarName("Y"), -// {LiteType::GetTensorTy(TARGET(kX86))}) -// .Finalize(); -// #endif diff --git a/lite/kernels/x86/mul_compute.h b/lite/kernels/x86/mul_compute.h index e204fc81f28de4af43d63e289b01d81188502988..be58f24ba2ed37db6661ecaaceb0d9d70fdd75d4 100644 --- a/lite/kernels/x86/mul_compute.h +++ b/lite/kernels/x86/mul_compute.h @@ -81,78 +81,6 @@ class MulCompute : public KernelLite { virtual ~MulCompute() = default; }; -#ifdef LITE_WITH_TRAIN -template -class MulGradCompute : public KernelLite { - public: - void Run() override { - auto& context = ctx_->As(); - auto& param = *param_.get_mutable(); - CHECK(context.x86_device_context()); - - auto* x = ¶m.x->raw_tensor(); - auto* y = ¶m.y->raw_tensor(); - - Tensor x_matrix, y_matrix; - - if (x->dims().size() > 2) { - x_matrix = framework::ReshapeToMatrix(*x, param.x_num_col_dims); - } else { - x_matrix = *x; - } - - if (y->dims().size() > 2) { - y_matrix = framework::ReshapeToMatrix(*y, param.y_num_col_dims); - - } else { - y_matrix = *y; - } - - auto* dout = ¶m.output_grad->raw_tensor(); - - Tensor dout_mat; - dout_mat.ShareDataWith(*dout); - dout_mat.Resize( - {framework::flatten_to_2d(x->dims(), param.x_num_col_dims)[0], - framework::flatten_to_2d(y->dims(), param.y_num_col_dims)[1]}); - - auto* dx = ¶m.x_grad->raw_tensor(); - auto* dy = ¶m.y_grad->raw_tensor(); - - if (dx != nullptr) { - dx->set_lod(x->lod()); - } - if (dy != nullptr) { - dy->set_lod(y->lod()); - } - - auto blas = paddle::operators::math::GetBlas( - *context.x86_device_context()); - if (dx) { - // dx->mutable_data(context.x86_device_context->GetPlace()); - param.x_grad->template mutable_data(); - Tensor dx_matrix = dx->dims().size() > 2 ? framework::ReshapeToMatrix( - *dx, param.x_num_col_dims) - : *dx; - - // dx = dout * y'. dx: M x K, dout : M x N, y : K x N - blas.MatMul(dout_mat, false, y_matrix, true, &dx_matrix); - } - if (dy) { - // dy->yutable_data(context.x86_device_context->GetPlace()); - param.y_grad->template mutable_data(); - Tensor dy_matrix = dy->dims().size() > 2 ? framework::ReshapeToMatrix( - *dy, param.y_num_col_dims) - : *dy; - // dy = x' * dout. 
dy K x N, dout : M x N, x : M x K - blas.MatMul(x_matrix, true, dout_mat, false, &dy_matrix); - } - } - - virtual ~MulGradCompute() = default; -}; -#endif - } // namespace x86 } // namespace kernels } // namespace lite diff --git a/lite/kernels/x86/pool_compute.h b/lite/kernels/x86/pool_compute.h index 57bcddcec9512d626962465e717b7a202cfe0b17..0dccb245b1267ac7ffa7c75bda9b491ffc3cd191 100644 --- a/lite/kernels/x86/pool_compute.h +++ b/lite/kernels/x86/pool_compute.h @@ -35,7 +35,6 @@ class PoolCompute : public KernelLite { auto& param = *param_.get_mutable(); if (param.global_pooling) { for (size_t i = 0; i < param.ksize.size(); ++i) { - param.paddings[i] = 0; param.ksize[i] = static_cast(param.x->dims()[i + 2]); } } @@ -52,7 +51,7 @@ class PoolCompute : public KernelLite { param.x, param.ksize, param.strides, - param.paddings, + *param.paddings, pool_process, true, false, @@ -68,7 +67,7 @@ class PoolCompute : public KernelLite { param.x, param.ksize, param.strides, - param.paddings, + *param.paddings, pool_process, param.exclusive, param.adaptive, diff --git a/lite/kernels/x86/pool_compute_test.cc b/lite/kernels/x86/pool_compute_test.cc index 87b75a0760bca45057f25b2cb948a66feb22496c..4ea727cedd5206f5f1ac2685297f72c3019bb313 100644 --- a/lite/kernels/x86/pool_compute_test.cc +++ b/lite/kernels/x86/pool_compute_test.cc @@ -60,7 +60,8 @@ TEST(pool2d_x86, run_test) { param.x = &x; param.output = &out; param.strides = {2, 2}; - param.paddings = {0, 0}; + std::vector paddings = {0, 0, 0, 0}; + param.paddings = std::make_shared>(paddings); param.ksize = {2, 2}; param.pooling_type = "max"; std::unique_ptr ctx(new KernelContext); diff --git a/lite/kernels/x86/search_aligned_mat_mul_compute.cc b/lite/kernels/x86/search_aligned_mat_mul_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..956f2a3beb8ae845b71c31600fdf8e6c758cab6a --- /dev/null +++ b/lite/kernels/x86/search_aligned_mat_mul_compute.cc @@ -0,0 +1,30 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/x86/search_aligned_mat_mul_compute.h" + +REGISTER_LITE_KERNEL( + search_aligned_mat_mul, + kX86, + kFloat, + kNCHW, + paddle::lite::kernels::x86::SearchAlignedMatMulCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("_a_addr", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("_b_addr", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("_c_addr", {LiteType::GetTensorTy(TARGET(kX86))}) + .Finalize(); diff --git a/lite/kernels/x86/search_aligned_mat_mul_compute.h b/lite/kernels/x86/search_aligned_mat_mul_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..ea6b546c2ccefbb98269ad563a566cc668e6a441 --- /dev/null +++ b/lite/kernels/x86/search_aligned_mat_mul_compute.h @@ -0,0 +1,83 @@ +// Copyright (c) 2019 PaddlePaddle Authors. 
All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include "lite/backends/x86/math/blas.h" +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" +#include "lite/core/types.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +template +class SearchAlignedMatMulCompute + : public KernelLite { + public: + using param_t = operators::MatMulParam; + + void Run() override { + auto& context = ctx_->As(); + auto& param = *param_.get_mutable(); + + auto x = param.X; + auto y = param.Y; + auto out = param.Out; + bool x_transpose = param.transpose_X; + bool y_transpose = param.transpose_Y; + float alpha = param.alpha; + const auto x_dims = x->dims(); + const auto y_dims = y->dims(); + const auto& x_lod = x->lod(); + const auto& y_lod = y->lod(); + const auto& x_lod_0 = x_lod[0]; + const auto& y_lod_0 = y_lod[0]; + + int seq_num = x_lod_0.size() - 1; + int x_inner_size = x_dims[1]; + int y_inner_size = y_dims[1]; + int x_batch_size = x_lod_0[1]; + int y_batch_size = y_lod_0[1]; + int M = x_transpose ? x_inner_size : x_batch_size; + int N = y_transpose ? y_batch_size : y_inner_size; + int X_K = x_transpose ? x_batch_size : x_inner_size; + int Y_K = y_transpose ? y_inner_size : y_batch_size; + CHECK_EQ(X_K, Y_K) << "K of Input(X) and Input(Y) is not equal"; + int K = X_K; + + lite::x86::math::MatDescriptor mat_dim_a; + mat_dim_a.height_ = M; + mat_dim_a.width_ = K; + mat_dim_a.stride_ = x_batch_size * x_inner_size; + mat_dim_a.batch_size_ = seq_num; + mat_dim_a.trans_ = x_transpose; + lite::x86::math::MatDescriptor mat_dim_b; + mat_dim_b.height_ = K; + mat_dim_b.width_ = N; + mat_dim_b.stride_ = y_batch_size * y_inner_size; + mat_dim_b.batch_size_ = seq_num; + mat_dim_b.trans_ = y_transpose; + auto blas = lite::x86::math::GetBlas(context); + blas.MatMul(*x, mat_dim_a, *y, mat_dim_b, static_cast(alpha), out, T(0)); + } + + virtual ~SearchAlignedMatMulCompute() = default; +}; + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/x86/search_fc_compute.cc b/lite/kernels/x86/search_fc_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..cf76113e01d81e899250a60203680cd984746f19 --- /dev/null +++ b/lite/kernels/x86/search_fc_compute.cc @@ -0,0 +1,27 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
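Reading aid for SearchAlignedMatMulCompute above: the MatDescriptor stride_/batch_size_ fields set up a strided batched GEMM in which the i-th sequence of X and Y fills the i-th batch slot, and every sequence is assumed to have the same length (only x_lod[0][1] and y_lod[0][1] are read). A minimal loop-nest equivalent, a sketch only, with hypothetical raw-pointer arguments:

// Naive stand-in for the strided batched GEMM issued by the kernel above.
// x: seq_num row-major blocks of {x_rows, x_cols}; y: blocks of {y_rows, y_cols};
// out: seq_num blocks of {M, N}; beta is 0 in the kernel, so out is overwritten.
void aligned_matmul_ref(const float* x, const float* y, float* out,
                        int seq_num, int x_rows, int x_cols,
                        int y_rows, int y_cols,
                        bool trans_x, bool trans_y, float alpha) {
  const int M = trans_x ? x_cols : x_rows;
  const int K = trans_x ? x_rows : x_cols;
  const int N = trans_y ? y_rows : y_cols;
  for (int s = 0; s < seq_num; ++s) {
    const float* A = x + s * x_rows * x_cols;  // stride_ = rows * cols
    const float* B = y + s * y_rows * y_cols;
    float* C = out + s * M * N;
    for (int m = 0; m < M; ++m) {
      for (int n = 0; n < N; ++n) {
        float acc = 0.f;
        for (int k = 0; k < K; ++k) {
          const float a = trans_x ? A[k * x_cols + m] : A[m * x_cols + k];
          const float b = trans_y ? B[n * y_cols + k] : B[k * y_cols + n];
          acc += a * b;
        }
        C[m * N + n] = alpha * acc;
      }
    }
  }
}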
+ +#include "lite/kernels/x86/search_fc_compute.h" + +REGISTER_LITE_KERNEL(search_fc, + kX86, + kFloat, + kNCHW, + paddle::lite::kernels::x86::SearchFcCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindInput("W", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindInput("b", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) + .Finalize(); diff --git a/lite/kernels/x86/search_fc_compute.h b/lite/kernels/x86/search_fc_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..e0f44de526be102ac7be4f44517d01e0bc28ff94 --- /dev/null +++ b/lite/kernels/x86/search_fc_compute.h @@ -0,0 +1,44 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include "lite/backends/x86/math/search_fc.h" +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" +#include "lite/core/types.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +template +class SearchFcCompute : public KernelLite { + public: + using param_t = operators::SearchFcParam; + void Run() override { + auto& context = ctx_->As(); + auto& param = *param_.get_mutable(); + + param.Out->Resize({param.X->dims()[0], param.out_size}); + lite::x86::math::SearchFcFunctor search_fc; + search_fc(context, *param.X, *param.W, *param.b, param.Out, param.out_size); + } + virtual ~SearchFcCompute() = default; +}; + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/x86/search_fc_compute_test.cc b/lite/kernels/x86/search_fc_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..425df2a0f0544d7345923cb2efdce96074845311 --- /dev/null +++ b/lite/kernels/x86/search_fc_compute_test.cc @@ -0,0 +1,122 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/kernels/x86/search_fc_compute.h" +#include +#include +#include +#include +#include +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +void fc_cpu_base(const lite::Tensor* X, + const lite::Tensor* W, + const lite::Tensor* b, + int out_size, + lite::Tensor* Out) { + const float* data_in = X->data(); + const float* bias = b->data(); + const float* weights = W->data(); + float* data_out = Out->mutable_data(); + int out_rows = X->dims()[0]; + int in_cols = X->numel() / out_rows; + int out_cols = W->numel() / in_cols; + int index_out; + + for (int i = 0; i < out_rows; i++) { + for (int j = 0; j < out_cols; j++) { + index_out = i * out_cols + j; + data_out[index_out] = bias ? bias[j] : 0; + + for (int k = 0; k < in_cols; k++) { + data_out[index_out] += + data_in[i * in_cols + k] * weights[j * in_cols + k]; + } + } + } +} + +TEST(search_fc_x86, retrive_op) { + auto search_fc = + KernelRegistry::Global().Create( + "search_fc"); + ASSERT_FALSE(search_fc.empty()); + ASSERT_TRUE(search_fc.front()); +} + +TEST(search_fc_x86, init) { + SearchFcCompute search_fc; + ASSERT_EQ(search_fc.precision(), PRECISION(kFloat)); + ASSERT_EQ(search_fc.target(), TARGET(kX86)); +} + +TEST(search_fc_x86, run_test) { + lite::Tensor x, w, b, out; + lite::Tensor out_ref; + std::unique_ptr ctx(new KernelContext); + ctx->As(); + std::vector x_shape{1, 4}; + x.Resize(lite::DDim(x_shape)); + std::vector w_shape{3, 4}; + w.Resize(lite::DDim(w_shape)); + std::vector b_shape{3}; + b.Resize(lite::DDim(b_shape)); + std::vector out_shape{1, 4}; + out.Resize(lite::DDim(out_shape)); + out_ref.Resize(lite::DDim(out_shape)); + auto x_data = x.mutable_data(); + auto w_data = w.mutable_data(); + auto b_data = b.mutable_data(); + auto out_data = out.mutable_data(); + auto out_data_ref = out_ref.mutable_data(); + for (int64_t i = 0; i < x.dims().production(); i++) { + x_data[i] = static_cast(i); + } + for (int64_t i = 0; i < w.dims().production(); i++) { + w_data[i] = static_cast(i); + } + for (int64_t i = 0; i < b.dims().production(); i++) { + b_data[i] = static_cast(i); + } + + fc_cpu_base(&x, &w, &b, 4, &out_ref); + + SearchFcCompute fc; + operators::SearchFcParam param; + param.X = &x; + param.W = &w; + param.b = &b; + param.Out = &out; + param.out_size = 4; + fc.SetParam(param); + fc.SetContext(std::move(ctx)); + fc.Run(); + + VLOG(3) << "output vs ref"; + for (int i = 0; i < out.dims().production(); i++) { + EXPECT_NEAR(out_data[i], out_data_ref[i], 1e-5); + } +} + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(search_fc, kX86, kFloat, kNCHW, def); diff --git a/lite/kernels/x86/search_grnn_compute.cc b/lite/kernels/x86/search_grnn_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..95839ba71b9f63fad9d659fd65c0028005d29799 --- /dev/null +++ b/lite/kernels/x86/search_grnn_compute.cc @@ -0,0 +1,332 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/x86/search_grnn_compute.h" +#include +#include +#include "lite/backends/x86/math/blas.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +template +T sigmoid(T z) { + return 1 / (1 + std::exp(-z)); +} + +template +void CallGemm(const lite::x86::math::BlasT& blas, + const CBLAS_TRANSPOSE TransA, + const CBLAS_TRANSPOSE TransB, + const int M, + const int N, + const int K, + const T alpha, + const T* A, + const T* B, + const T beta, + T* C) { + int lda = (TransA == CblasNoTrans) ? K : M; + int ldb = (TransB == CblasNoTrans) ? N : K; + blas.GEMM(TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, N); +} + +template +void SearchGrnnCompute::PrepareLayout(const Tensor* input_blob) { + auto& param = this->Param(); + auto* _idx_sorted_by_width = param.idx_sorted_by_width; + auto* _layout_input = param.layout_input; + auto* _input = input_blob; + + // usually total length + int dim0 = _input->dims()[0]; + // if it is id only sequence + int dim1 = 1; + // if its a embedding like sequence (dim1 would be embedding_size) + if (_input->dims().size() > 1) { + dim1 = _input->dims()[1]; + } + + int batch = _input->lod()[0].size() - 1; + auto& offset = _input->lod()[0]; + + Tensor _width; + _width.Resize({batch}); + _idx_sorted_by_width->Resize({batch}); + int* width_data = _width.template mutable_data(); + int* idx_sorted_by_width_data = + _idx_sorted_by_width->template mutable_data(); + // sort sequence by width (descending) and find the largest width in the + // batch + for (int i = 0; i < batch; i++) { + width_data[i] = offset[i + 1] - offset[i]; + idx_sorted_by_width_data[i] = i; + } + std::sort(idx_sorted_by_width_data, + idx_sorted_by_width_data + batch, + [&_width](int a, int b) { + return _width.template data()[a] > + _width.template data()[b]; + }); + int max_width = width_data[idx_sorted_by_width_data[0]]; + + // start of reorganizing the input + std::vector new_offset; + new_offset.resize(max_width + 1); + + new_offset[0] = 0; + int j = batch - 1; + int last_width = 0; + int sub_row = 0; + int sub_col = 0; + + for (int i = 1; i <= max_width;) { + for (int k = j; k >= 0; --k) { + if (width_data[idx_sorted_by_width_data[k]] > last_width) { + sub_row = width_data[idx_sorted_by_width_data[k]] - last_width; + sub_col = k + 1; + + for (int s = 0; s < sub_row; s++) { + new_offset[i] = new_offset[i - 1] + sub_col; + i++; + } + // move on + last_width = width_data[idx_sorted_by_width_data[k]]; + j = k - 1; + break; + } + } + } + + // copying to the reorganized buffer + if (_input->dims().size() == 1) { + // _layout_input.reshape_batch_sequence({dim0}, new_offset); + LOG(FATAL) << "_input->dims().size() = 1, error."; + } else { + // _layout_input.reshape_batch_sequence({dim0, dim1}, new_offset); + LoD new_lod; + new_lod.push_back(new_offset); + _layout_input->set_lod(new_lod); + _layout_input->Resize({dim0, dim1}); + } + + auto* new_emb = _layout_input->template mutable_data(); + for (int i = 0; i < max_width; i++) { + int w = new_offset[i + 1] - new_offset[i]; + auto* emb_start = new_emb + dim1 * new_offset[i]; + for (int j = 0; j < w; ++j) { + memcpy(emb_start + dim1 * j, + _input->template data() + + dim1 * offset[idx_sorted_by_width_data[j]] + dim1 * i, + dim1 * sizeof(T)); + } + } +} + +template +void SearchGrnnCompute::CopyBack(T* from, T* to, int step) { + auto& param = this->Param(); + auto* _input = param.x; + auto* _layout_input 
= param.layout_input; + auto* _idx_sorted_by_width = param.idx_sorted_by_width; + + const auto& offset = _input->lod()[0]; + const auto& new_offset = _layout_input->lod()[0]; + const auto* idx_sorted_by_width_data = + _idx_sorted_by_width->template data(); + for (size_t i = 0; i < _layout_input->lod()[0].size() - 1; ++i) { + int w = new_offset[i + 1] - new_offset[i]; + for (int j = 0; j < w; j++) { + memcpy(to + step * (offset[idx_sorted_by_width_data[j]] + i), + from + (new_offset[i] + j) * step, + step * sizeof(T)); + } + } +} + +template +void SearchGrnnCompute::Run() { + auto& context = ctx_->As(); + auto& param = this->Param(); + auto* bottom = param.x; + auto* wi = param.wi; + auto* wh = param.wh; + auto* top = param.out; + auto* _buffer = param.tmp_buffer; + int _cap_h = param.num_hidden; + int _cap_e = param.num_input; + + int _cap_l = bottom->dims()[0]; + int batch = bottom->lod()[0].size() - 1; + + const auto& offset = bottom->lod()[0]; + LoD top_lod; + top_lod.push_back(offset); + top->set_lod(top_lod); + std::vector top_dims_vec{_cap_l, _cap_h}; + top->Resize(top_dims_vec); + auto* top_hidden = top->template mutable_data(); + + const auto* dense_e2h = wi->template data(); + const auto* dense_h2h = wh->template data(); + + const auto* e2h = dense_e2h; + const auto* e2hr = dense_e2h + 1 * _cap_e * _cap_h; + const auto* e2hz = dense_e2h + 2 * _cap_e * _cap_h; + const auto* h2h = dense_h2h; + const auto* h2hr = dense_h2h + 1 * _cap_h * _cap_h; + const auto* h2hz = dense_h2h + 2 * _cap_h * _cap_h; + + PrepareLayout(bottom); + + auto* _layout_input = param.layout_input; + auto* new_emb = _layout_input->template mutable_data(); + const auto& new_offset = _layout_input->lod()[0]; + int max_width = _layout_input->lod()[0].size() - 1; + + // this buffer is used for book keeping info which will be used in bp + // buffer also needed in bp, so make it larger + _buffer->Resize({20, _cap_l, _cap_h}); + auto* buffer_data = _buffer->template mutable_data(); + auto* w_x_e = buffer_data + 0 * _cap_l * _cap_h; + auto* wr_x_e = buffer_data + 1 * _cap_l * _cap_h; + auto* wz_x_e = buffer_data + 2 * _cap_l * _cap_h; + auto* u_x_h = buffer_data + 3 * _cap_l * _cap_h; + auto* ur_x_h = buffer_data + 4 * _cap_l * _cap_h; + auto* uz_x_h = buffer_data + 5 * _cap_l * _cap_h; + auto* r = buffer_data + 6 * _cap_l * _cap_h; + auto* z = buffer_data + 7 * _cap_l * _cap_h; + auto* tilde = buffer_data + 8 * _cap_l * _cap_h; + // the internal hidden + auto* hidden = buffer_data + 19 * _cap_l * _cap_h; + + auto blas = lite::x86::math::GetBlas(context); + CallGemm(blas, + CblasNoTrans, + CblasTrans, + _cap_l, + _cap_h, + _cap_e, + 1.0f, + new_emb, + e2h, + 0.0f, + w_x_e); + CallGemm(blas, + CblasNoTrans, + CblasTrans, + _cap_l, + _cap_h, + _cap_e, + 1.0f, + new_emb, + e2hr, + 0.0f, + wr_x_e); + CallGemm(blas, + CblasNoTrans, + CblasTrans, + _cap_l, + _cap_h, + _cap_e, + 1.0f, + new_emb, + e2hz, + 0.0f, + wz_x_e); + + // precompute hidden0 + for (int i = 0; i < batch * _cap_h; i++) { + tilde[i] = std::tanh(w_x_e[i]); + z[i] = sigmoid(wz_x_e[i]); + hidden[i] = (1. 
- z[i]) * tilde[i]; + } + + // recurrence + for (int i = 1; i < max_width; i++) { + int w_tm1 = new_offset[i] - new_offset[i - 1]; + int w = new_offset[i + 1] - new_offset[i]; + + // precompute hidden i-1 to hidden i + auto* htm1 = hidden + new_offset[i - 1] * _cap_h; + + CallGemm(blas, + CblasNoTrans, + CblasTrans, + w, + _cap_h, + _cap_h, + 1.0f, + htm1, + h2h, + 0.0f, + u_x_h + new_offset[i] * _cap_h); + CallGemm(blas, + CblasNoTrans, + CblasTrans, + w, + _cap_h, + _cap_h, + 1.0f, + htm1, + h2hr, + 0.0f, + ur_x_h + new_offset[i] * _cap_h); + CallGemm(blas, + CblasNoTrans, + CblasTrans, + w, + _cap_h, + _cap_h, + 1.0f, + htm1, + h2hz, + 0.0f, + uz_x_h + new_offset[i] * _cap_h); + + // compute the gate and hidden + for (size_t j = new_offset[i] * _cap_h; j < (new_offset[i] + w) * _cap_h; + j++) { + r[j] = sigmoid(wr_x_e[j] + ur_x_h[j]); + z[j] = sigmoid(wz_x_e[j] + uz_x_h[j]); + tilde[j] = std::tanh(w_x_e[j] + r[j] * u_x_h[j]); + hidden[j] = z[j] * hidden[j - _cap_h * w_tm1] + (1.0 - z[j]) * tilde[j]; + } + } + + CopyBack(hidden, top_hidden, _cap_h); +} + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(search_grnn, + kX86, + kFloat, + kNCHW, + paddle::lite::kernels::x86::SearchGrnnCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindInput("Wi", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindInput("Wh", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("tmp_buffer", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("idx_sorted_by_width", + {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kInt32))}) + .BindOutput("layout_input", {LiteType::GetTensorTy(TARGET(kX86))}) + .Finalize(); diff --git a/lite/kernels/x86/search_grnn_compute.h b/lite/kernels/x86/search_grnn_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..66866761e139479863d98dd757d1a90ae36de9f5 --- /dev/null +++ b/lite/kernels/x86/search_grnn_compute.h @@ -0,0 +1,44 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
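To summarize the recurrence in SearchGrnnCompute::Run above: after the three input projections are precomputed with GEMM, each step applies GRU-style gates per hidden unit. A scalar sketch of one unit at one time step (names are illustrative, and the batch reordering done by PrepareLayout is ignored):

#include <cmath>

// One GRU-style step per hidden unit, mirroring the gate math in the
// recurrence loop above (parameter names are mine, not from the patch).
inline float grnn_step(float w_x_e,   // W  * x_t  (candidate projection)
                       float wr_x_e,  // Wr * x_t  (reset-gate projection)
                       float wz_x_e,  // Wz * x_t  (update-gate projection)
                       float u_x_h,   // U  * h_{t-1}
                       float ur_x_h,  // Ur * h_{t-1}
                       float uz_x_h,  // Uz * h_{t-1}
                       float h_prev) {
  auto sigmoid = [](float v) { return 1.f / (1.f + std::exp(-v)); };
  float r = sigmoid(wr_x_e + ur_x_h);
  float z = sigmoid(wz_x_e + uz_x_h);
  float h_tilde = std::tanh(w_x_e + r * u_x_h);
  return z * h_prev + (1.f - z) * h_tilde;
}

With h_prev and the three hidden projections set to zero this reduces to the hidden0 precomputation above: h = (1 - z) * tanh(w_x_e).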
+#pragma once + +#include "lite/backends/x86/math/blas.h" +#include "lite/core/kernel.h" +#include "lite/core/op_lite.h" +#include "lite/core/op_registry.h" +#include "lite/operators/op_params.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +template +class SearchGrnnCompute : public KernelLite { + public: + using param_t = operators::SearchGrnnParam; + + void Run() override; + + virtual ~SearchGrnnCompute() = default; + + private: + void PrepareLayout(const Tensor* input); + void CopyBack(T* from, T* to, int step); +}; + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/x86/search_grnn_compute_test.cc b/lite/kernels/x86/search_grnn_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..b85d97e3f1be1f2f02837d347e42ce6731c58414 --- /dev/null +++ b/lite/kernels/x86/search_grnn_compute_test.cc @@ -0,0 +1,100 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/x86/search_grnn_compute.h" +#include +#include +#include +#include +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +TEST(search_grnn_x86, retrive_op) { + auto kernel = + KernelRegistry::Global().Create( + "search_grnn"); + ASSERT_FALSE(kernel.empty()); + ASSERT_TRUE(kernel.front()); +} + +TEST(search_grnn_x86, init) { + SearchGrnnCompute ssdc; + ASSERT_EQ(ssdc.precision(), PRECISION(kFloat)); + ASSERT_EQ(ssdc.target(), TARGET(kX86)); +} + +TEST(search_grnn_x86, run_test) { + int num_input = 128; + int num_hidden = 128; + int num_batch = 3; + lite::Tensor x, wi, wh, out, idx_sorted_by_width, layout_input, tmp_buffer; + x.Resize({num_batch, num_input}); + wi.Resize({3, num_hidden, num_input}); + wh.Resize({3, num_hidden, num_hidden}); + // out.Resize({num_batch, num_hidden}); + LoD x_lod{}; + x_lod.push_back({0, 1, 3}); + x.set_lod(x_lod); + + auto* x_data = x.mutable_data(); + for (int64_t i = 0; i < x.numel(); i++) { + x_data[i] = static_cast(i); + } + auto* wi_data = wi.mutable_data(); + for (int64_t i = 0; i < wi.numel(); i++) { + wi_data[i] = static_cast(i); + } + auto* wh_data = wh.mutable_data(); + for (int64_t i = 0; i < wh.numel(); i++) { + wh_data[i] = static_cast(i); + } + + std::unique_ptr ctx(new KernelContext); + ctx->As(); + + operators::SearchGrnnParam param; + param.x = &x; + param.wi = &wi; + param.wh = &wh; + param.out = &out; + param.idx_sorted_by_width = &idx_sorted_by_width; + param.layout_input = &layout_input; + param.tmp_buffer = &tmp_buffer; + param.num_input = num_input; + param.num_hidden = num_hidden; + + SearchGrnnCompute sgc; + sgc.SetContext(std::move(ctx)); + sgc.SetParam(param); + sgc.Run(); + + // std::vector ref_results = {0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19}; + auto* out_data = out.mutable_data(); + LOG(INFO) << out.numel(); + for (int i = 0; i < out.numel(); i++) { + // EXPECT_NEAR(out_data[i], 
ref_results[i], 1e-3); + LOG(INFO) << out_data[i]; + } +} + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(search_grnn, kX86, kFloat, kNCHW, def); diff --git a/lite/kernels/x86/search_group_padding_compute.cc b/lite/kernels/x86/search_group_padding_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..d1847ac9dbafc533b8720ab65e6fa1915d5a136e --- /dev/null +++ b/lite/kernels/x86/search_group_padding_compute.cc @@ -0,0 +1,28 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/x86/search_group_padding_compute.h" + +REGISTER_LITE_KERNEL( + search_group_padding, + kX86, + kFloat, + kNCHW, + paddle::lite::kernels::x86::SearchGroupPaddingCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Out_emb_padding", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Out_new", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Out_padding", {LiteType::GetTensorTy(TARGET(kX86))}) + .Finalize(); diff --git a/lite/kernels/x86/search_group_padding_compute.h b/lite/kernels/x86/search_group_padding_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..17244d15d9124d9d61d1f4fdef4f12590958c0be --- /dev/null +++ b/lite/kernels/x86/search_group_padding_compute.h @@ -0,0 +1,105 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#pragma once + +#include +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +template +class SearchGroupPaddingCompute + : public KernelLite { + public: + using param_t = operators::SearchGroupPaddingParam; + + void Run() override { + auto& param = *param_.get_mutable(); + + auto* bottom0 = param.x; + auto* top0 = param.out_emb_padding; + auto* top1 = param.out_new; + auto* top2 = param.out_padding; + + int _pad_id = param.pad_id; + + int batch = bottom0->lod()[0].size() - 1; + int dim0 = bottom0->dims()[0]; + int dim1 = bottom0->dims()[1]; + + const auto offset = bottom0->lod()[0]; + int max_seq = 0; + for (int i = 0; i < batch; ++i) { + if (offset[i + 1] - offset[i] > max_seq) { + max_seq = offset[i + 1] - offset[i]; + } + } + + std::vector new_offset; + new_offset.resize(batch + 1); + for (int i = 0; i < batch + 1; ++i) { + new_offset[i] = i * max_seq; + } + + // for padding data + lite::LoD top0_lod; + top0_lod.push_back(new_offset); + top0->set_lod(top0_lod); + top0->Resize({batch * max_seq, dim1}); + // for origin input id + // already set by ShareLoD in InferShape + lite::LoD top1_lod; + top1_lod.push_back(offset); + top1->set_lod(top1_lod); + top1->Resize({dim0, 1}); + memset(top1->mutable_data(), + 0, + top1->dims()[0] * top1->dims()[1] * sizeof(T)); + // for padding input id + lite::LoD top2_lod; + top2_lod.push_back(new_offset); + top2->set_lod(top2_lod); + top2->Resize({batch * max_seq, 1}); + // copy data + const auto* bottom_data = bottom0->data(); + auto* top_data = top0->mutable_data(); + auto* top_padding_input_data = top2->mutable_data(); + for (int i = 0; i < batch; i++) { + const int copy_step = offset[i + 1] - offset[i]; + const int start = i * max_seq; + memcpy(top_data + start * dim1, + bottom_data + offset[i] * dim1, + copy_step * dim1 * sizeof(T)); + memset(top_data + (start + copy_step) * dim1, + 0, + (max_seq - copy_step) * dim1 * sizeof(T)); + // for padding input id + memset(top_padding_input_data + start, 0, copy_step * sizeof(T)); + for (int j = start + copy_step; j < start + max_seq; j++) { + top_padding_input_data[j] = static_cast(_pad_id); + } + } + } + + virtual ~SearchGroupPaddingCompute() = default; +}; + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/x86/search_group_padding_compute_test.cc b/lite/kernels/x86/search_group_padding_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..f4c36c2a63488a6bb902a2b8b4ad81fa32b37672 --- /dev/null +++ b/lite/kernels/x86/search_group_padding_compute_test.cc @@ -0,0 +1,92 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
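For review convenience, the padding scheme in SearchGroupPaddingCompute above amounts to: copy each sequence into a slot of max_seq rows, zero-fill the tail of the slot, and mark the padded positions of the id output with pad_id. A condensed, LoD-free sketch (hypothetical helper, float only):

#include <algorithm>
#include <cstring>
#include <vector>

// Pads each sequence of `emb` (rows of width dim1, delimited by LoD-style
// `offset`) to the longest sequence length; padded rows are zeroed and the
// per-row mask gets pad_id at padded positions (illustrative only).
void group_padding_ref(const std::vector<float>& emb,
                       const std::vector<int>& offset, int dim1, float pad_id,
                       std::vector<float>* out_emb_padding,
                       std::vector<float>* out_padding) {
  const int batch = static_cast<int>(offset.size()) - 1;
  int max_seq = 0;
  for (int i = 0; i < batch; ++i)
    max_seq = std::max(max_seq, offset[i + 1] - offset[i]);
  out_emb_padding->assign(batch * max_seq * dim1, 0.f);
  out_padding->assign(batch * max_seq, 0.f);
  for (int i = 0; i < batch; ++i) {
    const int len = offset[i + 1] - offset[i];
    std::memcpy(out_emb_padding->data() + i * max_seq * dim1,
                emb.data() + offset[i] * dim1, len * dim1 * sizeof(float));
    for (int j = len; j < max_seq; ++j) {
      (*out_padding)[i * max_seq + j] = pad_id;  // mark padded rows
    }
  }
}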
+ +#include "lite/kernels/x86/search_group_padding_compute.h" +#include +#include +#include +#include +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +TEST(search_group_padding_x86, retrieve_op) { + auto search_group_padding = + KernelRegistry::Global().Create( + "search_group_padding"); + ASSERT_FALSE(search_group_padding.empty()); + ASSERT_TRUE(search_group_padding.front()); +} + +TEST(search_group_padding_x86, init) { + SearchGroupPaddingCompute search_group_padding; + ASSERT_EQ(search_group_padding.precision(), PRECISION(kFloat)); + ASSERT_EQ(search_group_padding.target(), TARGET(kX86)); +} + +TEST(search_group_padding_x86, run_test) { + lite::Tensor x, out_emb_padding, out_new, out_padding; + x.Resize({2, 3}); + out_emb_padding.Resize({-1, 3}); + out_new.Resize({2, 1}); + out_padding.Resize({-1, 1}); + LoD x_lod{}; + x_lod.push_back({0, 1}); + x.set_lod(x_lod); + + auto* x_data = x.mutable_data(); + for (int64_t i = 0; i < x.dims().production(); i++) { + x_data[i] = static_cast(i); + } + SearchGroupPaddingCompute sgp_kernel; + operators::SearchGroupPaddingParam param; + + std::unique_ptr ctx(new KernelContext); + ctx->As(); + sgp_kernel.SetContext(std::move(ctx)); + + param.x = &x; + param.out_emb_padding = &out_emb_padding; + param.out_new = &out_new; + param.out_padding = &out_padding; + + sgp_kernel.SetParam(param); + sgp_kernel.Run(); + + std::vector out_emb_padding_ref = {0, 1, 2}; + std::vector out_new_ref = {0, 0}; + std::vector out_padding_ref = {0}; + auto* out_emb_padding_data = out_emb_padding.mutable_data(); + auto* out_new_data = out_new.mutable_data(); + auto* out_padding_data = out_padding.mutable_data(); + for (int i = 0; i < out_emb_padding.dims().production(); i++) { + EXPECT_NEAR(out_emb_padding_data[i], out_emb_padding_ref[i], 1e-5); + } + for (int i = 0; i < out_new.dims().production(); i++) { + EXPECT_NEAR(out_new_data[i], out_new_ref[i], 1e-5); + } + for (int i = 0; i < out_padding.dims().production(); i++) { + EXPECT_NEAR(out_padding_data[i], out_padding_ref[i], 1e-5); + } +} + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(search_group_padding, kX86, kFloat, kNCHW, def); diff --git a/lite/kernels/x86/search_seq_depadding_compute.cc b/lite/kernels/x86/search_seq_depadding_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..db1816fb48fb85ade2b4ab0b96e7aa4de5236ced --- /dev/null +++ b/lite/kernels/x86/search_seq_depadding_compute.cc @@ -0,0 +1,76 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/kernels/x86/search_seq_depadding_compute.h" +#include + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +template +void SearchSeqDepaddingCompute::Run() { + auto& param = this->Param(); + auto* pad = param.pad; + auto* src = param.src; + auto* out = param.out; + + const int pad_batch = pad->lod()[0].size() - 1; + const int src_batch = src->lod()[0].size() - 1; + if (pad_batch % src_batch != 0) { + LOG(FATAL) << "Mismatch batch size."; + } + + const auto& pad_offset = pad->lod()[0]; + const int pad_cap_e = pad->dims()[1]; + const auto& src_offset = src->lod()[0]; + const int src_cap_l = src->dims()[0]; + + LoD out_lod; + out_lod.push_back(src_offset); + out->set_lod(out_lod); + out->Resize({src_cap_l, pad_cap_e}); + + const auto* pad_data = pad->template data(); + auto* out_data = out->template mutable_data(); + for (int i = 0; i < src_batch; ++i) { + const int src_i_l = src_offset[i + 1] - src_offset[i]; + const int pad_i_l = pad_offset[i + 1] - pad_offset[i]; + if (pad_i_l < src_i_l) { + LOG(FATAL) + << "the length of padding seq input is less than source seq input."; + } + memcpy(out_data + src_offset[i] * pad_cap_e, + pad_data + pad_offset[i] * pad_cap_e, + src_i_l * pad_cap_e * sizeof(T)); + } +} + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL( + search_seq_depadding, + kX86, + kFloat, + kNCHW, + paddle::lite::kernels::x86::SearchSeqDepaddingCompute, + def) + .BindInput("Pad", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindInput("Src", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) + .Finalize(); diff --git a/lite/kernels/x86/search_seq_depadding_compute.h b/lite/kernels/x86/search_seq_depadding_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..e48fa92723e332424df02cac3d044d4f2af129b8 --- /dev/null +++ b/lite/kernels/x86/search_seq_depadding_compute.h @@ -0,0 +1,40 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include "lite/core/kernel.h" +#include "lite/core/op_lite.h" +#include "lite/core/op_registry.h" +#include "lite/operators/op_params.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +template +class SearchSeqDepaddingCompute + : public KernelLite { + public: + using param_t = operators::SearchSeqDepaddingParam; + + void Run() override; + + virtual ~SearchSeqDepaddingCompute() = default; +}; + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/x86/search_seq_depadding_compute_test.cc b/lite/kernels/x86/search_seq_depadding_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..0d978b35ed040d6b7c44354f37999e6e34e2e3ef --- /dev/null +++ b/lite/kernels/x86/search_seq_depadding_compute_test.cc @@ -0,0 +1,83 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/x86/search_seq_depadding_compute.h" +#include +#include +#include +#include +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +TEST(search_seq_depadding_x86, retrive_op) { + auto kernel = + KernelRegistry::Global().Create( + "search_seq_depadding"); + ASSERT_FALSE(kernel.empty()); + ASSERT_TRUE(kernel.front()); +} + +TEST(search_seq_depadding_x86, init) { + SearchSeqDepaddingCompute ssdc; + ASSERT_EQ(ssdc.precision(), PRECISION(kFloat)); + ASSERT_EQ(ssdc.target(), TARGET(kX86)); +} + +TEST(search_seq_depadding_x86, run_test) { + lite::Tensor pad, src, out; + pad.Resize({2 * 3, 4}); + src.Resize({3, 1}); + out.Resize({3, 4}); + LoD pad_lod{}; + pad_lod.push_back({0, 4, 6}); + pad.set_lod(pad_lod); + LoD src_lod{}; + src_lod.push_back({0, 2, 3}); + src.set_lod(src_lod); + + auto* pad_data = pad.mutable_data(); + for (int64_t i = 0; i < pad.dims().production(); i++) { + pad_data[i] = static_cast(i); + } + SearchSeqDepaddingCompute ssdc; + operators::SearchSeqDepaddingParam param; + + std::unique_ptr ctx(new KernelContext); + ctx->As(); + ssdc.SetContext(std::move(ctx)); + + param.pad = &pad; + param.src = &src; + param.out = &out; + + ssdc.SetParam(param); + ssdc.Run(); + + std::vector ref_results = {0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19}; + auto* out_data = out.mutable_data(); + for (int i = 0; i < out.dims().production(); i++) { + EXPECT_NEAR(out_data[i], ref_results[i], 1e-3); + } +} + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(search_seq_depadding, kX86, kFloat, kNCHW, def); diff --git a/lite/kernels/x86/search_seq_fc_compute.cc b/lite/kernels/x86/search_seq_fc_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..e0845bd74c764b04f0e89353ff5c457965e5f115 --- /dev/null +++ b/lite/kernels/x86/search_seq_fc_compute.cc @@ -0,0 +1,27 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
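SearchSeqDepaddingCompute above is essentially the inverse of that padding step: only the first src_len rows of each padded block are copied back into a compact buffer laid out by the Src LoD. A minimal sketch under the same flat-vector assumptions as before:

#include <cstring>
#include <vector>

// Inverse of the padding sketch: keep src_len rows of each padded block.
void seq_depadding_ref(const std::vector<float>& pad,      // {pad_rows, width}
                       const std::vector<int>& pad_offset, // padded offsets
                       const std::vector<int>& src_offset, // compact offsets
                       int width, std::vector<float>* out) {
  const int batch = static_cast<int>(src_offset.size()) - 1;
  out->assign(src_offset[batch] * width, 0.f);
  for (int i = 0; i < batch; ++i) {
    const int src_len = src_offset[i + 1] - src_offset[i];
    std::memcpy(out->data() + src_offset[i] * width,
                pad.data() + pad_offset[i] * width,
                src_len * width * sizeof(float));
  }
}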
+ +#include "lite/kernels/x86/search_seq_fc_compute.h" + +REGISTER_LITE_KERNEL(search_seq_fc, + kX86, + kFloat, + kNCHW, + paddle::lite::kernels::x86::SearchSeqFcCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindInput("W", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindInput("b", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) + .Finalize(); diff --git a/lite/kernels/x86/search_seq_fc_compute.h b/lite/kernels/x86/search_seq_fc_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..80ef54b30b762848eceb16940c9f60ef8ba96927 --- /dev/null +++ b/lite/kernels/x86/search_seq_fc_compute.h @@ -0,0 +1,73 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include "lite/backends/x86/math/blas.h" +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" +#include "lite/core/types.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +template +class SearchSeqFcCompute : public KernelLite { + public: + using param_t = operators::SearchSeqFcParam; + + void Run() override { + auto& context = ctx_->As(); + auto& param = *param_.get_mutable(); + + auto x = param.x; + auto w = param.w; + auto b = param.b; + auto out = param.out; + auto out_size = param.out_size; + const auto x_dims = x->dims(); + const auto w_dims = w->dims(); + const auto out_dims = out->dims(); + CHECK_EQ(x_dims.size(), 2) << "The Input(X) should be 2-D tensor."; + CHECK_EQ(w_dims.size(), 2) << "W should be 2-D tensor."; + CHECK_EQ(out_dims.size(), 2) << "The Output(Out) should be 2-D tensor."; + CHECK_EQ(x_dims[1], w_dims[1]) << "Wrong shape: x_dims[1] != w_dims[1]"; + CHECK_EQ(w_dims[0], out_size) << "Wrong shape: w_dims[0] != out_size"; + CHECK_EQ(out_dims[0], x_dims[0]) << "Wrong shape: out_dims[0] != x_dims[0]"; + CHECK_EQ(out_dims[1], out_size) << "Wrong shape: out_dims[1] != out_size"; + + auto blas = lite::x86::math::GetBlas(context); + blas.MatMul(*x, false, *w, true, out); + + if (b != nullptr) { + auto b_dims = b->dims(); + CHECK_EQ(b_dims.size(), 1) << "b should be 1-D tensor."; + CHECK_EQ(b_dims[0], w_dims[0]) << "Wrong shape: b_dims[0] != w_dims[0]"; + int M = x_dims[0]; + int N = w_dims[0]; + for (int i = 0; i < M; i++) { + blas.AXPY( + N, static_cast(1), b->data(), out->mutable_data() + i * N); + } + } + } + + virtual ~SearchSeqFcCompute() = default; +}; + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/x86/sequence_arithmetic_compute.cc b/lite/kernels/x86/sequence_arithmetic_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..95fa27e3d4e7fdbf639a5b275568311907f8344d --- /dev/null +++ b/lite/kernels/x86/sequence_arithmetic_compute.cc @@ -0,0 +1,38 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/x86/sequence_arithmetic_compute.h" + +REGISTER_LITE_KERNEL( + sequence_arithmetic, + kX86, + kFloat, + kNCHW, + paddle::lite::kernels::x86::SequenceArithmeticCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) + .Finalize(); +REGISTER_LITE_KERNEL( + search_seq_arithmetic, + kX86, + kFloat, + kNCHW, + paddle::lite::kernels::x86::SequenceArithmeticCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) + .Finalize(); diff --git a/lite/kernels/x86/sequence_arithmetic_compute.h b/lite/kernels/x86/sequence_arithmetic_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..88510b8b1c7a04ab01da9af331f9d1f72765b215 --- /dev/null +++ b/lite/kernels/x86/sequence_arithmetic_compute.h @@ -0,0 +1,111 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
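One note on SearchSeqFcCompute above: after the X * W^T GEMM, the bias is broadcast row-wise with one blas.AXPY call per output row (N elements, alpha = 1). The scalar equivalent of that broadcast, for a row-major {M, N} output and a length-N bias, is simply:

// Row-wise bias broadcast equivalent to the per-row AXPY calls above.
void add_bias_rows(float* out, const float* b, int M, int N) {
  for (int i = 0; i < M; ++i) {
    for (int j = 0; j < N; ++j) {
      out[i * N + j] += b[j];  // i.e. AXPY(N, 1.0f, b, out + i * N)
    }
  }
}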
+ +#pragma once +#include +#include +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +template +class SequenceArithmeticCompute + : public KernelLite { + public: + using param_t = operators::SequenceArithmeticParam; + + void Run() override { + auto& param = *param_.get_mutable(); + auto x = param.X; + auto y = param.Y; + auto out = param.Out; + int op_type = param.op_type; + + out->Resize(x->dims()); + out->set_lod(x->lod()); + + auto x_data = x->data(); + auto y_data = y->data(); + auto out_data = out->mutable_data(); + auto x_seq_offset = x->lod()[0]; + auto y_seq_offset = y->lod()[0]; + int seq_num = x_seq_offset.size() - 1; + int inner_size = (x->numel()) / (x->dims()[0]); + + // sum + if (op_type == 1) { + for (int i = 0; i < seq_num; i++) { + int len_x = (x_seq_offset[i + 1] - x_seq_offset[i]) * inner_size; + int len_y = (y_seq_offset[i + 1] - y_seq_offset[i]) * inner_size; + auto input_x = x_data + x_seq_offset[i] * inner_size; + auto input_y = y_data + y_seq_offset[i] * inner_size; + auto t_out = out_data + x_seq_offset[i] * inner_size; + int len = std::min(len_x, len_y); + for (int j = 0; j < len; j++) { + t_out[j] = input_x[j] + input_y[j]; + } + if (len_x > len) { + memcpy(t_out + len, input_x + len, sizeof(T) * (len_x - len)); + } + } + } + + // sub + if (op_type == 2) { + for (int i = 0; i < seq_num; i++) { + int len_x = (x_seq_offset[i + 1] - x_seq_offset[i]) * inner_size; + int len_y = (y_seq_offset[i + 1] - y_seq_offset[i]) * inner_size; + auto input_x = x_data + x_seq_offset[i] * inner_size; + auto input_y = y_data + y_seq_offset[i] * inner_size; + auto t_out = out_data + x_seq_offset[i] * inner_size; + int len = std::min(len_x, len_y); + for (int j = 0; j < len; j++) { + t_out[j] = input_x[j] - input_y[j]; + } + if (len_x > len) { + memcpy(t_out + len, input_x + len, sizeof(T) * (len_x - len)); + } + } + } + + // mul + if (op_type == 3) { + for (int i = 0; i < seq_num; i++) { + int len_x = (x_seq_offset[i + 1] - x_seq_offset[i]) * inner_size; + int len_y = (y_seq_offset[i + 1] - y_seq_offset[i]) * inner_size; + auto input_x = x_data + x_seq_offset[i] * inner_size; + auto input_y = y_data + y_seq_offset[i] * inner_size; + auto t_out = out_data + x_seq_offset[i] * inner_size; + int len = std::min(len_x, len_y); + for (int j = 0; j < len; j++) { + t_out[j] = input_x[j] * input_y[j]; + } + if (len_x > len) { + memcpy(t_out + len, input_x + len, sizeof(T) * (len_x - len)); + } + } + } + } + + virtual ~SequenceArithmeticCompute() = default; +}; + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/x86/sequence_arithmetic_compute_test.cc b/lite/kernels/x86/sequence_arithmetic_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..3b41e7d7ce37ebaf6a3f8518bc248ff4ec5c1aec --- /dev/null +++ b/lite/kernels/x86/sequence_arithmetic_compute_test.cc @@ -0,0 +1,125 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/x86/sequence_arithmetic_compute.h" +#include +#include +#include +#include +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +void sequence_arithmetic_compute_ref(const Tensor& x, + const Tensor& y, + Tensor* out, + int op_type) { + auto x_data = x.data(); + auto y_data = y.data(); + out->Resize(x.dims()); + out->set_lod(x.lod()); + auto out_data = out->mutable_data(); + auto x_seq_offset = x.lod()[0]; + auto y_seq_offset = y.lod()[0]; + int seq_num = x_seq_offset.size() - 1; + int inner_size = x.numel() / x.dims()[0]; + + for (int i = 0; i < seq_num; i++) { + int len_x = (x_seq_offset[i + 1] - x_seq_offset[i]) * inner_size; + int len_y = (y_seq_offset[i + 1] - y_seq_offset[i]) * inner_size; + auto input_x = x_data + x_seq_offset[i] * inner_size; + auto input_y = y_data + y_seq_offset[i] * inner_size; + auto t_out = out_data + x_seq_offset[i] * inner_size; + int len = std::min(len_x, len_y); + for (int j = 0; j < len; j++) { + switch (op_type) { + case 1: + t_out[j] = input_x[j] + input_y[j]; + break; + case 2: + t_out[j] = input_x[j] - input_y[j]; + break; + case 3: + t_out[j] = input_x[j] * input_y[j]; + break; + default: + break; + } + } + if (len_x > len) { + memcpy(t_out + len, input_x + len, sizeof(float) * (len_x - len)); + } + } +} + +void prepare_input(Tensor* x, const LoD& x_lod) { + x->Resize({static_cast(x_lod[0].back()), 3}); + x->set_lod(x_lod); + auto x_data = x->mutable_data(); + for (int i = 0; i < x->numel(); i++) { + x_data[i] = (i - x->numel() / 2) * 1.1; + } +} + +TEST(sequence_arithmetic_x86, retrive_op) { + auto sequence_arithmetic = + KernelRegistry::Global().Create( + "sequence_arithmetic"); + ASSERT_FALSE(sequence_arithmetic.empty()); + ASSERT_TRUE(sequence_arithmetic.front()); +} + +TEST(sequence_arithmetic_x86, init) { + SequenceArithmeticCompute sequence_arithmetic; + ASSERT_EQ(sequence_arithmetic.precision(), PRECISION(kFloat)); + ASSERT_EQ(sequence_arithmetic.target(), TARGET(kX86)); +} + +TEST(sequence_arithmetic_x86, run_test) { + SequenceArithmeticCompute sequence_arithmetic; + std::unique_ptr ctx(new KernelContext); + ctx->As(); + + lite::Tensor x, y, out, out_ref; + lite::LoD x_lod{{0, 2, 5, 9}}, y_lod{{0, 2, 5, 9}}; + prepare_input(&x, x_lod); + prepare_input(&y, y_lod); + + operators::SequenceArithmeticParam param; + param.X = &x; + param.Y = &y; + param.Out = &out; + param.op_type = 1; + + sequence_arithmetic.SetContext(std::move(ctx)); + sequence_arithmetic.SetParam(param); + sequence_arithmetic.Run(); + + sequence_arithmetic_compute_ref(x, y, &out_ref, param.op_type); + auto out_data = out.data(); + auto out_ref_data = out_ref.data(); + for (int i = 0; i < out.numel(); i++) { + EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-3); + } +} + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(sequence_arithmetic, kX86, kFloat, kNCHW, def); diff --git a/lite/kernels/x86/sequence_concat_compute.cc b/lite/kernels/x86/sequence_concat_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..facdad39d383c3a2134599e1490c89e9d5afa543 --- /dev/null +++ b/lite/kernels/x86/sequence_concat_compute.cc @@ -0,0 +1,25 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/kernels/x86/sequence_concat_compute.h"
+
+REGISTER_LITE_KERNEL(sequence_concat,
+                     kX86,
+                     kFloat,
+                     kNCHW,
+                     paddle::lite::kernels::x86::SequenceConcatCompute<float>,
+                     def)
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))})
+    .Finalize();
diff --git a/lite/kernels/x86/sequence_concat_compute.h b/lite/kernels/x86/sequence_concat_compute.h
new file mode 100644
index 0000000000000000000000000000000000000000..553e2e8b0667106f25685a9ef155d7e61a672f31
--- /dev/null
+++ b/lite/kernels/x86/sequence_concat_compute.h
@@ -0,0 +1,84 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once + +#include +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +template +inline LoD ConcatLoD(const std::vector& xs, + std::vector* xs_in_order) { + std::vector result; + result.resize(xs[0]->lod()[0].size()); + + for (size_t i = 1; i < result.size(); ++i) { + size_t sum = 0; + for (size_t j = 0; j < xs.size(); ++j) { + auto& x_lod = xs[j]->lod()[0]; + if (x_lod[i - 1] < x_lod[i]) { + xs_in_order->emplace_back(xs[j]->Slice(x_lod[i - 1], x_lod[i])); + } + sum += x_lod[i]; + } + result[i] = sum; + } + LoD lod; + lod.emplace_back(result); + return lod; +} + +template +class SequenceConcatCompute + : public KernelLite { + public: + using param_t = operators::SequenceConcatParam; + + void Run() override { + auto& param = *param_.get_mutable(); + // auto& param = Param(); + T* dout = param.Out->mutable_data(); + + std::vector x_in_order; + param.Out->set_lod(ConcatLoD(param.X, &x_in_order)); + + int num = x_in_order.size(); + int out_rows = 1; + + std::vector input_cols(num); + for (int i = 0; i < num; ++i) { + input_cols[i] = x_in_order[i].numel() / out_rows; + } + + int col_idx = 0; + for (int j = 0; j < num; ++j) { + int col_len = input_cols[j]; + auto input_data = x_in_order[j].data(); + memcpy(dout + col_idx, input_data, sizeof(T) * col_len); + col_idx += col_len; + } + } + + virtual ~SequenceConcatCompute() = default; +}; + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/x86/sequence_concat_compute_test.cc b/lite/kernels/x86/sequence_concat_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..be1f86a5c848b5c03634ea2a1aed0d57f2283879 --- /dev/null +++ b/lite/kernels/x86/sequence_concat_compute_test.cc @@ -0,0 +1,163 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
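// A minimal standalone sketch of the LoD merging that ConcatLoD above performs,
// assuming plain std containers; names are illustrative, not the Lite API.
// The merged offset at position i is the sum of every input's offset at i, and
// the output data is laid out sequence-by-sequence, taking sequence i from each
// input in turn (the xs_in_order slices above).
#include <cstddef>
#include <vector>

inline std::vector<size_t> ConcatOffsetsSketch(
    const std::vector<std::vector<size_t>>& input_offsets) {
  std::vector<size_t> result(input_offsets[0].size(), 0);
  for (size_t i = 1; i < result.size(); ++i) {
    size_t sum = 0;
    for (const auto& offsets : input_offsets) sum += offsets[i];
    result[i] = sum;
  }
  return result;
}
// Example: offsets {0, 3, 5} and {0, 1, 4} merge to {0, 4, 9}.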
+ +#include "lite/kernels/x86/sequence_concat_compute.h" +#include +#include +#include +#include +#include "lite/core/op_registry.h" +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +namespace { +inline LoD ConcatLoD(const std::vector& xs, + std::vector* xs_in_order) { + std::vector result; + result.resize(xs[0]->lod()[0].size()); + + for (size_t i = 1; i < result.size(); ++i) { + size_t sum = 0; + for (size_t j = 0; j < xs.size(); ++j) { + auto& x_lod = xs[j]->lod()[0]; + if (x_lod[i - 1] < x_lod[i]) { + xs_in_order->emplace_back(xs[j]->Slice(x_lod[i - 1], x_lod[i])); + } + sum += x_lod[i]; + } + result[i] = sum; + } + LoD lod; + lod.emplace_back(result); + return lod; +} + +static void sequence_concat_ref(const std::vector& xs, + lite::Tensor* out) { + std::vector out_dims; + int64_t batch_size = 0; + int64_t feature_size = 0; + for (const auto& tensor : xs) { + const auto x_dims = tensor->dims(); + if (out_dims.empty()) { + out_dims = x_dims.Vectorize(); + } + batch_size += x_dims[0]; + if (feature_size == 0) { + feature_size = x_dims.production() / x_dims[0]; + } else { + CHECK_EQ(feature_size, x_dims.production() / x_dims[0]) + << "Inputs of sequence concat must have same feature size"; + } + } + out_dims[0] = batch_size; + out->Resize(out_dims); + std::vector x_in_order; + out->set_lod(ConcatLoD(xs, &x_in_order)); + + int num = x_in_order.size(); + std::vector input_cols(num); + for (int i = 0; i < num; ++i) { + input_cols[i] = x_in_order[i].numel(); + } + float* out_data = out->mutable_data(); + int col_idx = 0; + for (int j = 0; j < num; ++j) { + int col_len = input_cols[j]; + auto input_data = x_in_order[j].data(); + memcpy(out_data + col_idx, input_data, sizeof(float) * col_len); + col_idx += col_len; + } +} + +#define PREPARE_INPUT(name) \ + name.Resize({name##_lod_len, feature_len}); \ + name.set_lod(lod_info_##name); \ + float* name##_data = name.mutable_data(); \ + for (int i = 0; i < name.numel(); ++i) { \ + name##_data[i] = (i - 2.0) * 1.0; \ + } + +} // namespace + +TEST(sequence_concat_x86, retrive_op) { + auto sequence_concat = + KernelRegistry::Global().Create( + "sequence_concat"); + ASSERT_FALSE(sequence_concat.empty()); + ASSERT_TRUE(sequence_concat.front()); +} + +TEST(sequence_concat_x86, init) { + SequenceConcatCompute sequence_concat; + ASSERT_EQ(sequence_concat.precision(), PRECISION(kFloat)); + ASSERT_EQ(sequence_concat.target(), TARGET(kX86)); +} + +TEST(sequence_concat_x86, run_test) { + SequenceConcatCompute seq_kernel; + std::unique_ptr ctx(new KernelContext); + ctx->As(); + + operators::SequenceConcatParam param; + lite::Tensor x1, x2, x3; + lite::Tensor y, y_ref; + + int32_t x1_lod_len = 10, feature_len = 4; + int32_t x2_lod_len = 4, x3_lod_len = 8; + int32_t y_lod_len = x1_lod_len + x2_lod_len + x3_lod_len; + LoD lod_info_x1{{0, 3, 5, 6, 10}}; + LoD lod_info_x2{{0, 1, 2, 3, 4}}; + LoD lod_info_x3{{0, 2, 4, 6, 8}}; + LoD lod_info_y{{0, 0, 0, 0, 0}}; + for (size_t i = 0; i < lod_info_x1[0].size(); ++i) { + lod_info_y[0][i] = + lod_info_x1[0][i] + lod_info_x2[0][i] + lod_info_x3[0][i]; + } + + PREPARE_INPUT(x1); + PREPARE_INPUT(x2); + PREPARE_INPUT(x3); + + y_ref.Resize({y_lod_len, feature_len}); + y.Resize({y_lod_len, feature_len}); + y_ref.set_lod(lod_info_y); + y.set_lod(lod_info_y); + + std::vector xs{&x1, &x2, &x3}; + + param.X = xs; + param.Out = &y; + seq_kernel.SetParam(param); + + seq_kernel.SetContext(std::move(ctx)); + seq_kernel.Run(); + + auto* y_data = y.mutable_data(); + sequence_concat_ref(xs, &y_ref); + float* 
y_ref_data = y_ref.mutable_data(); + + for (int i = 0; i < y.numel(); i++) { + EXPECT_NEAR(y_data[i], y_ref_data[i], 1e-5); + } +} + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(sequence_concat, kX86, kFloat, kNCHW, def); diff --git a/lite/kernels/x86/sequence_reverse_compute.cc b/lite/kernels/x86/sequence_reverse_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..6c391e12ad1df671517c182509e415325bb8ce56 --- /dev/null +++ b/lite/kernels/x86/sequence_reverse_compute.cc @@ -0,0 +1,32 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/x86/sequence_reverse_compute.h" + +typedef paddle::lite::kernels::x86::SequenceReverseCompute + ReverseFp32; +typedef paddle::lite::kernels::x86::SequenceReverseCompute + ReverseInt64; + +REGISTER_LITE_KERNEL(sequence_reverse, kX86, kFloat, kNCHW, ReverseFp32, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Y", {LiteType::GetTensorTy(TARGET(kX86))}) + .Finalize(); + +REGISTER_LITE_KERNEL(sequence_reverse, kX86, kInt64, kNCHW, ReverseInt64, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kInt64))}) + .BindOutput("Y", {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kInt64))}) + .Finalize(); diff --git a/lite/kernels/x86/sequence_reverse_compute.h b/lite/kernels/x86/sequence_reverse_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..ab93972276664acc8585bd150a53601c039ccf87 --- /dev/null +++ b/lite/kernels/x86/sequence_reverse_compute.h @@ -0,0 +1,63 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#pragma once + +#include +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +template +class SequenceReverseCompute : public KernelLite { + public: + using param_t = operators::SequenceReverseParam; + + void Run() override { + auto& param = this->template Param(); + auto* output = param.Out; + const auto* din = param.X->template data(); + + T* dout = output->template mutable_data(); + CHECK_NE(din, dout) + << "SequenceReverse Op does not support in-place operation"; + const auto lod = param.X->lod()[param.X->lod().size() - 1]; + const size_t lod_count = lod.size(); + + size_t limit = static_cast(param.X->numel()); + size_t row_numel = static_cast(limit / param.X->dims()[0]); + + for (size_t idx = 0; idx < lod_count - 1; ++idx) { + auto start_pos = lod[idx]; + auto end_pos = lod[idx + 1]; + for (auto pos = start_pos; pos < end_pos; ++pos) { + auto cur_pos = end_pos - pos - 1 + start_pos; + std::memcpy(dout + pos * row_numel, + din + cur_pos * row_numel, + row_numel * sizeof(T)); + } + } + output->set_lod(param.X->lod()); + } + + virtual ~SequenceReverseCompute() = default; +}; + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/x86/sequence_reverse_compute_test.cc b/lite/kernels/x86/sequence_reverse_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..4b84241c8b19e3db57dd7ef6339496191a7486be --- /dev/null +++ b/lite/kernels/x86/sequence_reverse_compute_test.cc @@ -0,0 +1,108 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
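// A minimal standalone sketch of the per-sequence reversal done by the kernel
// above, assuming row-major data and plain std containers; names are
// illustrative, not the Lite API. Within each sequence [start, end), output row
// pos is copied from input row end - 1 - (pos - start), which is the same index
// as cur_pos = end_pos - pos - 1 + start_pos above.
#include <cstring>
#include <vector>

inline void SequenceReverseSketch(const float* din,
                                  float* dout,
                                  const std::vector<size_t>& lod,
                                  size_t row_numel) {
  for (size_t idx = 0; idx + 1 < lod.size(); ++idx) {
    size_t start = lod[idx];
    size_t end = lod[idx + 1];
    for (size_t pos = start; pos < end; ++pos) {
      size_t src = end - 1 - (pos - start);
      std::memcpy(dout + pos * row_numel,
                  din + src * row_numel,
                  row_numel * sizeof(float));
    }
  }
}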
+ +#include "lite/kernels/x86/sequence_reverse_compute.h" +#include +#include +#include +#include +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +namespace { +static void sequence_reverse_ref(const lite::Tensor* x, lite::Tensor* y) { + const auto* x_data = x->data(); + auto seq_offset = x->lod()[x->lod().size() - 1]; + int width = x->numel() / x->dims()[0]; + auto* y_data = y->mutable_data(); + for (int i = 0; i < seq_offset.size() - 1; ++i) { + auto start_pos = seq_offset[i]; + auto end_pos = seq_offset[i + 1]; + for (auto pos = start_pos; pos < end_pos; ++pos) { + auto cur_pos = end_pos - pos - 1 + start_pos; + std::memcpy(y_data + pos * width, + x_data + cur_pos * width, + width * sizeof(float)); + } + } +} +} // namespace + +TEST(sequence_reverse_x86, retrive_op) { + auto sequence_reverse = + KernelRegistry::Global().Create( + "sequence_reverse"); + ASSERT_FALSE(sequence_reverse.empty()); + ASSERT_TRUE(sequence_reverse.front()); +} + +TEST(sequence_reverse_x86, init) { + SequenceReverseCompute sequence_reverse; + ASSERT_EQ(sequence_reverse.precision(), PRECISION(kFloat)); + ASSERT_EQ(sequence_reverse.target(), TARGET(kX86)); +} + +TEST(sequence_reverse_x86, run_test) { + SequenceReverseCompute seq_kernel; + std::unique_ptr ctx(new KernelContext); + + operators::SequenceReverseParam param; + lite::Tensor x, x_ref; + lite::Tensor y, y_ref; + + int32_t lod_len = 10, feature_len = 4; + LoD lod_info{{0, 2, 4}, {0, 3, 5, 6, 10}}; + + x.Resize({lod_len, feature_len}); + x_ref.Resize({lod_len, feature_len}); + y.Resize({lod_len, feature_len}); + y_ref.Resize({lod_len, feature_len}); + x.set_lod(lod_info); + x_ref.set_lod(lod_info); + y.set_lod(lod_info); + y_ref.set_lod(lod_info); + + auto* y_data = y.mutable_data(); + float* x_data = x.mutable_data(); + float* x_ref_data = x_ref.mutable_data(); + float* y_ref_data = y_ref.mutable_data(); + + for (int i = 0; i < x.numel(); ++i) { + x_ref_data[i] = (i - 2.0) * 1.0; + x_data[i] = (i - 2.0) * 1.0; + } + + param.X = &x; + param.Out = &y; + seq_kernel.SetParam(param); + + seq_kernel.SetContext(std::move(ctx)); + seq_kernel.Run(); + + sequence_reverse_ref(&x_ref, &y_ref); + for (int i = 0; i < y.numel(); i++) { + EXPECT_NEAR(y_data[i], y_ref_data[i], 1e-5); + } +} + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(sequence_reverse, kX86, kFloat, kNCHW, def); diff --git a/lite/kernels/x86/sequence_topk_avg_pooling_compute.cc b/lite/kernels/x86/sequence_topk_avg_pooling_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..9bd8b287507426798e0ec24f8854e812016b0054 --- /dev/null +++ b/lite/kernels/x86/sequence_topk_avg_pooling_compute.cc @@ -0,0 +1,29 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/kernels/x86/sequence_topk_avg_pooling_compute.h" + +REGISTER_LITE_KERNEL( + sequence_topk_avg_pooling, + kX86, + kFloat, + kNCHW, + paddle::lite::kernels::x86::SequenceTopkAvgPoolingCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindInput("ROW", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindInput("COLUMN", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("pos", {LiteType::GetTensorTy(TARGET(kX86))}) + .Finalize(); diff --git a/lite/kernels/x86/sequence_topk_avg_pooling_compute.h b/lite/kernels/x86/sequence_topk_avg_pooling_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..724415288a72932392d5726778830095c8810e15 --- /dev/null +++ b/lite/kernels/x86/sequence_topk_avg_pooling_compute.h @@ -0,0 +1,50 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include "lite/backends/x86/math/sequence_topk_avg_pooling.h" +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" +#include "lite/core/types.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +template +class SequenceTopkAvgPoolingCompute + : public KernelLite { + public: + using param_t = operators::SequenceTopkAvgPoolingParam; + + void Run() override { + auto& param = *param_.get_mutable(); + lite::x86::math::SequenceTopkAvgPoolingFunctor + sequence_topk_avg_pooling; + sequence_topk_avg_pooling(*param.X, + *param.ROW, + *param.COLUMN, + param.Out, + param.pos, + param.channel_num, + param.topks); + }; + virtual ~SequenceTopkAvgPoolingCompute() = default; +}; + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/x86/softmax_compute.cc b/lite/kernels/x86/softmax_compute.cc index a00aa6d566b4bd9f6a880ab5255f40c71bb1360c..3a2cdc29ed262740aec0efca9460800f57f43437 100644 --- a/lite/kernels/x86/softmax_compute.cc +++ b/lite/kernels/x86/softmax_compute.cc @@ -23,3 +23,13 @@ REGISTER_LITE_KERNEL(softmax, .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) .Finalize(); +REGISTER_LITE_KERNEL(search_seq_softmax, + kX86, + kFloat, + kNCHW, + paddle::lite::kernels::x86::SoftmaxCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Out_log", {LiteType::GetTensorTy(TARGET(kX86))}) + .Finalize(); diff --git a/lite/kernels/x86/stack_compute.cc b/lite/kernels/x86/stack_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..5f69319a6ca44a7f1a191df16db6b9b6c29553ac --- /dev/null +++ b/lite/kernels/x86/stack_compute.cc @@ -0,0 +1,25 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/x86/stack_compute.h" + +REGISTER_LITE_KERNEL(stack, + kX86, + kFloat, + kNCHW, + paddle::lite::kernels::x86::StackCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) + .Finalize(); diff --git a/lite/kernels/x86/stack_compute.h b/lite/kernels/x86/stack_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..12a6c3490eff9d446de96366c8dd5fe6b2a4bd06 --- /dev/null +++ b/lite/kernels/x86/stack_compute.h @@ -0,0 +1,72 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "lite/core/kernel.h" +#include "lite/core/op_lite.h" +#include "lite/core/op_registry.h" +#include "lite/core/type_system.h" +#include "lite/operators/stack_op.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +template +class StackCompute : public KernelLite { + public: + using param_t = operators::StackParam; + + void Run() override { + auto& param = *param_.get_mutable(); + auto x = param.X; + auto y = param.Out; + + int axis = param.axis; + if (axis < 0) axis += (x[0]->dims().size() + 1); + + int n = static_cast(x.size()); + auto y_data = y->mutable_data(); + std::vector x_datas(n); + for (int i = 0; i < n; ++i) x_datas[i] = x[i]->data(); + + int pre = 1, post = 1; + auto dim = x[0]->dims(); + for (int i = 0; i < axis; ++i) pre *= dim[i]; + for (int i = axis; i < dim.size(); ++i) post *= dim[i]; + + auto x_data_arr = x_datas.data(); + + size_t x_offset = 0; + size_t y_offset = 0; + for (int i = 0; i < pre; i++) { + for (int j = 0; j < n; j++) { + std::memcpy( + y_data + y_offset, x_data_arr[j] + x_offset, post * sizeof(T)); + y_offset += post; + } + x_offset += post; + } + } + + virtual ~StackCompute() = default; +}; + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/x86/stack_compute_test.cc b/lite/kernels/x86/stack_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..d105165a98f936b7a6973e57f5199977a0b8bed3 --- /dev/null +++ b/lite/kernels/x86/stack_compute_test.cc @@ -0,0 +1,89 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
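// A minimal standalone sketch of the stacking loop in StackCompute above,
// assuming plain std containers; names are illustrative, not the Lite API.
// With pre = product of dims before axis and post = product of dims from axis
// on, each of the pre outer slices copies one post-sized block from every input
// in turn, which interleaves the n inputs along the new axis.
#include <cstring>
#include <vector>

inline void StackSketch(const std::vector<const float*>& xs,  // n same-shape inputs
                        float* y,
                        int pre,
                        int post) {
  size_t x_offset = 0;
  size_t y_offset = 0;
  for (int i = 0; i < pre; ++i) {
    for (size_t j = 0; j < xs.size(); ++j) {
      std::memcpy(y + y_offset, xs[j] + x_offset, post * sizeof(float));
      y_offset += post;
    }
    x_offset += post;
  }
}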
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/x86/stack_compute.h" +#include +#include +#include +#include +#include "lite/core/op_registry.h" +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +// stack +TEST(stack_x86, retrive_op) { + auto stack = + KernelRegistry::Global().Create("stack"); + ASSERT_FALSE(stack.empty()); + ASSERT_TRUE(stack.front()); +} + +TEST(stack_x86, init) { + lite::kernels::x86::StackCompute stack; + ASSERT_EQ(stack.precision(), PRECISION(kFloat)); + ASSERT_EQ(stack.target(), TARGET(kX86)); +} + +TEST(stack_x86, run_test) { + lite::Tensor x; + lite::Tensor out; + int num_input = 5; + + std::vector x_shape({10, 20, 10}); + x.Resize(lite::DDim(x_shape)); + + std::vector out_shape({5, 10, 20, 10}); + out.Resize(lite::DDim(out_shape)); + + auto x_data = x.mutable_data(); + auto out_data = out.mutable_data(); + + for (int64_t i = 0; i < x.dims().production(); ++i) { + x_data[i] = static_cast(i); + } + std::vector input; + for (int i = 0; i < num_input; ++i) { + input.emplace_back(&x); + } + + // StackCompute stack; + StackCompute stack; + operators::StackParam param; + + param.X = input; + param.Out = &out; + int axis = 0; + param.axis = axis; + std::unique_ptr ctx(new KernelContext); + ctx->As(); + stack.SetContext(std::move(ctx)); + stack.SetParam(param); + stack.Run(); + + int ref_data = 0; + for (int j = 0; j < out.dims().production(); ++j) { + EXPECT_NEAR(out_data[j], ref_data, 1e-5); + ref_data++; + ref_data = (ref_data >= 2000) ? (ref_data - 2000) : ref_data; + } +} + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(stack, kX86, kFloat, kNCHW, def); diff --git a/lite/kernels/x86/var_conv_2d_compute.cc b/lite/kernels/x86/var_conv_2d_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..48ae1b055efc85e16905e3201d467017fc650a5a --- /dev/null +++ b/lite/kernels/x86/var_conv_2d_compute.cc @@ -0,0 +1,27 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/kernels/x86/var_conv_2d_compute.h" + +REGISTER_LITE_KERNEL(var_conv_2d, + kX86, + kFloat, + kNCHW, + paddle::lite::kernels::x86::VarConv2DCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindInput("W", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Col", {LiteType::GetTensorTy(TARGET(kX86))}) + .Finalize(); diff --git a/lite/kernels/x86/var_conv_2d_compute.h b/lite/kernels/x86/var_conv_2d_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..c94cb2ca2d43a138b5769653d6cad2d52d420563 --- /dev/null +++ b/lite/kernels/x86/var_conv_2d_compute.h @@ -0,0 +1,213 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include +#include "lite/backends/x86/math/blas.h" +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" +#include "lite/core/tensor.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +template +class VarConv2DCompute : public KernelLite { + public: + using param_t = operators::VarConv2DParam; + + void Im2Col(const lite::Tensor& input, lite::Tensor* col) const { + auto& param = *param_.get_mutable(); + int input_channel = param.input_channel; + int kernel_h = param.kernel_h; + int kernel_w = param.kernel_w; + int stride_h = param.stride_h; + int stride_w = param.stride_w; + // auto* in_row = param.ROW; + // auto* in_col = param.COLUMN; + + int batch = input.lod()[0].size() - 1; + const auto& bottom_offset = input.lod()[0]; + // 2-D lod info. 
+ // const auto& offset_x = in_col->lod()[0]; + // const auto& offset_y = in_row->lod()[0]; + const auto& offset_y = param.X->lod()[1]; + const auto& offset_x = param.X->lod()[2]; + + // top offset is the whole size of each data sample + std::vector top_offset; + int top_size = 0; + top_offset.push_back(top_size); + for (int b = 0; b < batch; ++b) { + int width = offset_x[b + 1] - offset_x[b]; + int height = offset_y[b + 1] - offset_y[b]; + int top_im_x = 0; + if (width == 0) { + top_im_x = 0; + } else { + top_im_x = (width - 1) / stride_w + 1; + } + int top_im_y = 0; + if (height == 0) { + top_im_y = 0; + } else { + top_im_y = (height - 1) / stride_h + 1; + } + int top_x = top_im_x * top_im_y; + int top_y = input_channel * kernel_h * kernel_w; + top_size += top_y * top_x; + top_offset.push_back(top_size); + } + // std::vector col_lod_vec; + // col_lod_vec.push_back(top_offset); + LoD col_lod; + col_lod.push_back(top_offset); + col->set_lod(col_lod); + std::vector col_dims_vec{top_size}; + col_dims_vec.push_back(1); + col->Resize(col_dims_vec); + auto* top_data = col->mutable_data(); + const auto* bottom_data = input.data(); + + int kernel_win_size = kernel_h * kernel_w; + int half_kernel_h = kernel_h / 2; + int half_kernel_w = kernel_w / 2; + for (int b = 0; b < batch; ++b) { + int t_offset = top_offset[b]; + int b_offset = bottom_offset[b]; + int width = offset_x[b + 1] - offset_x[b]; + int height = offset_y[b + 1] - offset_y[b]; + if (width == 0 || height == 0) { + continue; + } + int top_im_x = (width - 1) / stride_w + 1; + int top_im_y = (height - 1) / stride_h + 1; + int top_x = top_im_y * top_im_x; + for (int z = 0; z < input_channel; ++z) { + int row_offset = kernel_win_size * z; + int im_offset = z * width * height; + for (int y = 0; y < height; y += stride_h) { + for (int x = 0; x < width; x += stride_w) { + int col_offset = x / stride_w + y / stride_h * top_im_x; + for (int ky = 0; ky < kernel_h; ++ky) { + for (int kx = 0; kx < kernel_w; ++kx) { + int im_y = y + ky - half_kernel_h; + int im_x = x + kx - half_kernel_w; + if (im_x >= 0 && im_x < width && im_y >= 0 && im_y < height) { + top_data[t_offset + + (row_offset + ky * kernel_w + kx) * top_x + + col_offset] = + bottom_data[b_offset + im_offset + im_y * width + im_x]; + } else { + top_data[t_offset + + (row_offset + ky * kernel_w + kx) * top_x + + col_offset] = 0; + } + } + } + } + } + } + } + } + + void Run() override { + auto& param = *param_.get_mutable(); + auto& context = ctx_->As(); + auto* bottom = param.X; + // auto* in_row = param.ROW; + // auto* in_col = param.COLUMN; + auto* w = param.W; + auto* top = param.Out; + auto* col = param.Col; + + int output_channel = param.output_channel; + int input_channel = param.input_channel; + int kernel_h = param.kernel_h; + int kernel_w = param.kernel_w; + int stride_h = param.stride_h; + int stride_w = param.stride_w; + + Im2Col(*bottom, col); + int batch = bottom->lod()[0].size() - 1; + const auto& col_offset = col->lod()[0]; + // const auto& offset_x = in_col->lod()[0]; + // const auto& offset_y = in_row->lod()[0]; + const auto& offset_y = param.X->lod()[1]; + const auto& offset_x = param.X->lod()[2]; + std::vector top_offset; + int top_size = 0; + top_offset.push_back(top_size); + for (int b = 0; b < batch; ++b) { + int width = offset_x[b + 1] - offset_x[b]; + int height = offset_y[b + 1] - offset_y[b]; + int top_im_x = 0; + if (width == 0) { + top_im_x = 0; + } else { + top_im_x = (width - 1) / stride_w + 1; + } + int top_im_y = 0; + if (height == 0) { + top_im_y = 0; + } 
else { + top_im_y = (height - 1) / stride_h + 1; + } + int top_im_size = top_im_y * top_im_x; + top_size += output_channel * top_im_size; + top_offset.push_back(top_size); + } + + LoD top_lod; + top_lod.push_back(top_offset); + top->set_lod(top_lod); + std::vector top_dims_vec{top_size}; + top_dims_vec.push_back(1); + top->Resize(top_dims_vec); + auto* top_data = top->mutable_data(); + const auto* w_data = w->data(); + const auto* col_data = col->data(); + + auto blas = lite::x86::math::GetBlas(context); + for (int b = 0; b < batch; ++b) { + int top_im_size = (top_offset[b + 1] - top_offset[b]) / output_channel; + if (top_im_size == 0) { + continue; + } + + blas.GEMM(false, + false, + output_channel, + top_im_size, + input_channel * kernel_h * kernel_w, + 1.0, + w_data, + input_channel * kernel_h * kernel_w, + col_data + col_offset[b], + top_im_size, + 0.0, + top_data + top_offset[b], + top_im_size); + } + } + + virtual ~VarConv2DCompute() = default; +}; + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/x86/var_conv_2d_compute_test.cc b/lite/kernels/x86/var_conv_2d_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..d6ae5a67bfc9deba1fb097fa5c0c0cf323b65e48 --- /dev/null +++ b/lite/kernels/x86/var_conv_2d_compute_test.cc @@ -0,0 +1,315 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/x86/var_conv_2d_compute.h" +#include +#include +#include +#include +#include "lite/core/op_registry.h" +#include "lite/core/tensor.h" +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +static void im2col_ref(const lite::Tensor& input, + const lite::Tensor* in_row, + const lite::Tensor* in_col, + const int kernel_h, + const int kernel_w, + const int stride_h, + const int stride_w, + const int input_channel, + lite::Tensor* col) { + int batch = input.lod()[0].size() - 1; + const auto& bottom_offset = input.lod()[0]; + // 2-D lod info. 
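// A minimal standalone sketch of the per-sample bookkeeping used by
// VarConv2DCompute::Run above, assuming plain std containers; names are
// illustrative, not the Lite API. Each sample b gets its own spatial output
// size (dim - 1) / stride + 1 derived from the LoD, and the per-sample GEMM is
// [output_channel x (input_channel*kh*kw)] times
// [(input_channel*kh*kw) x top_im_size].
#include <vector>

inline std::vector<int> VarConvTopOffsetsSketch(const std::vector<int>& widths,
                                                const std::vector<int>& heights,
                                                int output_channel,
                                                int stride_w,
                                                int stride_h) {
  std::vector<int> top_offset{0};
  for (size_t b = 0; b < widths.size(); ++b) {
    int top_im_x = (widths[b] == 0) ? 0 : (widths[b] - 1) / stride_w + 1;
    int top_im_y = (heights[b] == 0) ? 0 : (heights[b] - 1) / stride_h + 1;
    top_offset.push_back(top_offset.back() +
                         output_channel * top_im_x * top_im_y);
  }
  return top_offset;
}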
+ const auto& offset_x = in_col->lod()[0]; + const auto& offset_y = in_row->lod()[0]; + + // top offset is the whole size of each data sample + std::vector top_offset; + int top_size = 0; + top_offset.push_back(top_size); + for (int b = 0; b < batch; ++b) { + int width = offset_x[b + 1] - offset_x[b]; + int height = offset_y[b + 1] - offset_y[b]; + int top_im_x = 0; + if (width == 0) { + top_im_x = 0; + } else { + top_im_x = (width - 1) / stride_w + 1; + } + int top_im_y = 0; + if (height == 0) { + top_im_y = 0; + } else { + top_im_y = (height - 1) / stride_h + 1; + } + int top_x = top_im_x * top_im_y; + int top_y = input_channel * kernel_h * kernel_w; + top_size += top_y * top_x; + top_offset.push_back(top_size); + } + LoD col_lod; + col_lod.push_back(top_offset); + col->set_lod(col_lod); + std::vector col_dims_vec{top_size}; + col_dims_vec.push_back(1); + col->Resize(col_dims_vec); + auto* top_data = col->mutable_data(); + const auto* bottom_data = input.data(); + + int kernel_win_size = kernel_h * kernel_w; + int half_kernel_h = kernel_h / 2; + int half_kernel_w = kernel_w / 2; + for (int b = 0; b < batch; ++b) { + int t_offset = top_offset[b]; + int b_offset = bottom_offset[b]; + int width = offset_x[b + 1] - offset_x[b]; + int height = offset_y[b + 1] - offset_y[b]; + if (width == 0 || height == 0) { + continue; + } + int top_im_x = (width - 1) / stride_w + 1; + int top_im_y = (height - 1) / stride_h + 1; + int top_x = top_im_y * top_im_x; + for (int z = 0; z < input_channel; ++z) { + int row_offset = kernel_win_size * z; + int im_offset = z * width * height; + for (int y = 0; y < height; y += stride_h) { + for (int x = 0; x < width; x += stride_w) { + int col_offset = x / stride_w + y / stride_h * top_im_x; + for (int ky = 0; ky < kernel_h; ++ky) { + for (int kx = 0; kx < kernel_w; ++kx) { + int im_y = y + ky - half_kernel_h; + int im_x = x + kx - half_kernel_w; + if (im_x >= 0 && im_x < width && im_y >= 0 && im_y < height) { + top_data[t_offset + (row_offset + ky * kernel_w + kx) * top_x + + col_offset] = + bottom_data[b_offset + im_offset + im_y * width + im_x]; + } else { + top_data[t_offset + (row_offset + ky * kernel_w + kx) * top_x + + col_offset] = 0; + } + } + } + } + } + } + } +} + +static void var_conv_2d_ref(const lite::Tensor* bottom, + const lite::Tensor* w, + const lite::Tensor* in_row, + const lite::Tensor* in_col, + const int kernel_h, + const int kernel_w, + const int stride_h, + const int stride_w, + const int input_channel, + const int output_channel, + lite::Tensor* top, + lite::Tensor* col) { + std::unique_ptr ctx(new KernelContext); + auto& context = ctx->As(); + + im2col_ref(*bottom, + in_row, + in_col, + kernel_h, + kernel_w, + stride_h, + stride_w, + input_channel, + col); + int batch = bottom->lod()[0].size() - 1; + const auto& col_offset = col->lod()[0]; + const auto& offset_x = in_col->lod()[0]; + const auto& offset_y = in_row->lod()[0]; + std::vector top_offset; + int top_size = 0; + top_offset.push_back(top_size); + for (int b = 0; b < batch; ++b) { + int width = offset_x[b + 1] - offset_x[b]; + int height = offset_y[b + 1] - offset_y[b]; + int top_im_x = 0; + if (width == 0) { + top_im_x = 0; + } else { + top_im_x = (width - 1) / stride_w + 1; + } + int top_im_y = 0; + if (height == 0) { + top_im_y = 0; + } else { + top_im_y = (height - 1) / stride_h + 1; + } + int top_im_size = top_im_y * top_im_x; + top_size += output_channel * top_im_size; + top_offset.push_back(top_size); + } + + LoD top_lod; + top_lod.push_back(top_offset); + top->set_lod(top_lod); 
+ std::vector top_dims_vec{top_size}; + top_dims_vec.push_back(1); + top->Resize(top_dims_vec); + auto* top_data = top->mutable_data(); + const auto* w_data = w->data(); + const auto* col_data = col->data(); + + auto blas = lite::x86::math::GetBlas(context); + for (int b = 0; b < batch; ++b) { + int top_im_size = (top_offset[b + 1] - top_offset[b]) / output_channel; + if (top_im_size == 0) { + continue; + } + + blas.GEMM(false, + false, + output_channel, + top_im_size, + input_channel * kernel_h * kernel_w, + 1.0, + w_data, + input_channel * kernel_h * kernel_w, + col_data + col_offset[b], + top_im_size, + 0.0, + top_data + top_offset[b], + top_im_size); + } +} + +TEST(var_conv_2d_x86, retrive_op) { + auto var_conv_2d = + KernelRegistry::Global().Create( + "var_conv_2d"); + ASSERT_FALSE(var_conv_2d.empty()); + ASSERT_TRUE(var_conv_2d.front()); +} + +TEST(var_conv_2d_x86, init) { + VarConv2DCompute var_conv_2d; + ASSERT_EQ(var_conv_2d.precision(), PRECISION(kFloat)); + ASSERT_EQ(var_conv_2d.target(), TARGET(kX86)); +} + +TEST(var_conv_2d_x86, run_test) { + VarConv2DCompute var_conv_2d; + std::unique_ptr ctx(new KernelContext); + ctx->As(); + + operators::VarConv2DParam param; + + lite::Tensor X, W, ROW, COLUMN; + lite::Tensor Out, Col; + int kernel_h, kernel_w; + int stride_h, stride_w; + int input_channel, output_channel; + + output_channel = 5; + input_channel = 5; + kernel_h = 5; + kernel_w = 5; + stride_h = 1; + stride_w = 1; + std::vector w_dims_vec; + w_dims_vec.push_back(output_channel); + w_dims_vec.push_back(input_channel * kernel_h * kernel_w); + W.Resize(w_dims_vec); + auto* w_data = W.mutable_data(); + for (int i = 0; i < W.numel(); ++i) { + w_data[i] = i - 1.f; + } + + std::vector row_lod_vec{0, 10, 20}; + LoD row_lod; + row_lod.push_back(row_lod_vec); + ROW.set_lod(row_lod); + + std::vector column_lod_vec{0, 10, 20}; + LoD column_lod; + column_lod.push_back(column_lod_vec); + COLUMN.set_lod(column_lod); + + int x_size = 0; + std::vector x_lod_vec; + x_lod_vec.push_back(0); + for (size_t i = 0; i < row_lod_vec.size() - 1; ++i) { + int height = row_lod_vec[i + 1] - row_lod_vec[i]; + int width = column_lod_vec[i + 1] - column_lod_vec[i]; + x_lod_vec.push_back(height * width * input_channel); + x_size += height * width * input_channel; + } + std::vector x_dims_vec{x_size, 1}; + LoD x_lod; + x_lod.push_back(x_lod_vec); + x_lod.push_back(row_lod_vec); + x_lod.push_back(column_lod_vec); + X.Resize(x_dims_vec); + X.set_lod(x_lod); + auto* x_data = X.mutable_data(); + for (int i = 0; i < X.numel(); ++i) { + x_data[i] = i % 20 * 1.f; + } + + param.X = &X; + param.W = &W; + // param.ROW = &ROW; + // param.COLUMN = &COLUMN; + param.Out = &Out; + param.Col = &Col; + param.stride_h = stride_h; + param.stride_w = stride_w; + param.kernel_h = kernel_h; + param.kernel_w = kernel_w; + param.input_channel = input_channel; + param.output_channel = output_channel; + var_conv_2d.SetParam(param); + var_conv_2d.SetContext(std::move(ctx)); + var_conv_2d.Run(); + + lite::Tensor top_ref, col_ref; + var_conv_2d_ref(&X, + &W, + &ROW, + &COLUMN, + kernel_h, + kernel_w, + stride_h, + stride_w, + input_channel, + output_channel, + &top_ref, + &col_ref); + + for (int i = 0; i < Out.numel(); ++i) { + EXPECT_NEAR(Out.data()[i], top_ref.data()[i], 1e-5); + } + for (int i = 0; i < Col.numel(); ++i) { + EXPECT_NEAR(Col.data()[i], col_ref.data()[i], 1e-5); + } +} + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(var_conv_2d, kX86, kFloat, kNCHW, def); diff 
--git a/lite/kernels/xpu/bridges/conv_op.cc b/lite/kernels/xpu/bridges/conv_op.cc index 2c758cf9507087fb53d476ff86a64707e0c6249b..d6fc806ad4541a735ea4ef6eff292076836ac5e7 100644 --- a/lite/kernels/xpu/bridges/conv_op.cc +++ b/lite/kernels/xpu/bridges/conv_op.cc @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "lite/operators/conv_op.h" #include "lite/backends/xpu/builder.h" #include "lite/kernels/xpu/bridges/registry.h" @@ -46,14 +47,36 @@ node_map_type ConvConverter(const std::shared_ptr op, auto groups = op_info->GetAttr("groups"); auto dilations = op_info->GetAttr>("dilations"); auto fuse_relu = op_info->GetAttr("fuse_relu"); - CHECK_EQ(strides.size(), 2); - CHECK_EQ(paddings.size(), 2); - CHECK_EQ(dilations.size(), 2); + CHECK_EQ(strides.size(), 2L); + CHECK_EQ(dilations.size(), 2L); + + if (paddings.size() == 2L) { + for (size_t i = 0; i < strides.size(); ++i) { + int copy_pad = *(paddings.begin() + 2 * i); + paddings.insert(paddings.begin() + 2 * i + 1, copy_pad); + } + } + CHECK_EQ(paddings.size(), 4L) + << "Paddings size should be the same or twice as the input size."; + + std::string padding_algorithm(""); + if (op_info->HasAttr("padding_algorithm")) { + padding_algorithm = op_info->GetAttr("padding_algorithm"); + } + operators::UpdatePaddingAndDilation(&paddings, + &dilations, + strides, + padding_algorithm, + input_dims, + filter_dims); + std::vector output_shape({bs, oc}); for (size_t i = 0; i < 2; i++) { const int dkernel = dilations[i] * (filter_dims[2 + i] - 1) + 1; output_shape.push_back( - (input_dims[i + 2] + 2 * paddings[i] - dkernel) / strides[i] + 1); + (input_dims[i + 2] + paddings[2 * i] + paddings[2 * i + 1] - dkernel) / + strides[i] + + 1); } DDim output_dims(output_shape); diff --git a/lite/kernels/xpu/bridges/conv_op_test.cc b/lite/kernels/xpu/bridges/conv_op_test.cc index ebdb67bd0d2801a9036696f52790f7104279b0cb..70929ffcd596c299b6d8975c2bfbb8941fc67525 100644 --- a/lite/kernels/xpu/bridges/conv_op_test.cc +++ b/lite/kernels/xpu/bridges/conv_op_test.cc @@ -54,7 +54,7 @@ void conv_ref(const std::shared_ptr op) { int stride_h = strides[0]; int dila_w = dilations[1]; int dila_h = dilations[0]; - int pad_w = paddings[1]; + int pad_w = paddings[2]; int pad_h = paddings[0]; int batch_size = input_dims[0]; int in_ch_size = input_dims[1]; @@ -175,7 +175,8 @@ void test_conv(int bs, opdesc.SetOutput("Output", {output_var_name}); opdesc.SetAttr("dilations", std::vector({dilation, dilation})); opdesc.SetAttr("strides", std::vector({stride, stride})); - opdesc.SetAttr("paddings", std::vector({padding, padding})); + opdesc.SetAttr("paddings", + std::vector({padding, padding, padding, padding})); opdesc.SetAttr("groups", groups); opdesc.SetAttr("fuse_relu", static_cast(fuse_relu)); if (has_bias) { diff --git a/lite/kernels/xpu/bridges/pool_op_test.cc b/lite/kernels/xpu/bridges/pool_op_test.cc index ed5f922d59b5ca5e387076c9a533c4b4c251cc87..7efc6b464c00c945c71c8c5689e18823cde10f97 100644 --- a/lite/kernels/xpu/bridges/pool_op_test.cc +++ b/lite/kernels/xpu/bridges/pool_op_test.cc @@ -60,7 +60,7 @@ void pool_ref(const std::shared_ptr op) { int stride_h = strides[0]; int stride_w = strides[1]; int pad_h = paddings[0]; - int pad_w = paddings[1]; + int pad_w = paddings[2]; if (global_pooling == true) { for (int n = 0; n < in_n; ++n) { @@ -162,7 +162,8 @@ void test_pool(int bs, opdesc.SetAttr("global_pooling", global_pooling); opdesc.SetAttr("exclusive", exclusive); opdesc.SetAttr("strides", 
std::vector({stride, stride})); - opdesc.SetAttr("paddings", std::vector({padding, padding})); + opdesc.SetAttr("paddings", + std::vector({padding, padding, padding, padding})); opdesc.SetAttr("ceil_mode", ceil_mode); // create and convert op to XPU model, then run it on XPU diff --git a/lite/model_parser/model_parser.cc b/lite/model_parser/model_parser.cc index 13b6cb5b77d00a2a5f733a0015dec4dbebc088b7..ed3f45c598e74a0450454c15ad0cd9ad09266f8e 100644 --- a/lite/model_parser/model_parser.cc +++ b/lite/model_parser/model_parser.cc @@ -568,7 +568,7 @@ void SaveModelNaive(const std::string &model_dir, SaveParamNaive(path, exec_scope, var.Name()); } } - VLOG(4) << "Save naive buffer model in '" << model_dir << "'' successfully"; + LOG(INFO) << "Save naive buffer model in '" << model_dir << "' successfully"; } #endif diff --git a/lite/operators/CMakeLists.txt b/lite/operators/CMakeLists.txt index 49badbb27b00979117f9e75d1c66763a7be99837..7c4048c204b0889f9a9bd72a7e94da3777441d37 100644 --- a/lite/operators/CMakeLists.txt +++ b/lite/operators/CMakeLists.txt @@ -2,11 +2,10 @@ set(op_DEPS tensor op op_params scope memory) lite_cc_library(op_params SRCS op_params.cc DEPS tensor any) +# 1.baisc ops used in basic models add_operator(conv_op basic SRCS conv_op.cc DEPS ${op_DEPS}) add_operator(pool_op basic SRCS pool_op.cc DEPS ${op_DEPS}) add_operator(fc_op basic SRCS fc_op.cc DEPS ${op_DEPS}) -add_operator(assign_op extra SRCS assign_op.cc DEPS ${op_DEPS}) -add_operator(relu_op basic SRCS relu_op.cc DEPS ${op_DEPS}) add_operator(mul_op basic SRCS mul_op.cc DEPS ${op_DEPS}) add_operator(matmul_op basic SRCS matmul_op.cc DEPS ${op_DEPS}) add_operator(scale_op basic SRCS scale_op.cc DEPS ${op_DEPS}) @@ -15,57 +14,64 @@ add_operator(reshape_op basic SRCS reshape_op.cc DEPS ${op_DEPS} ) add_operator(batch_norm_op basic SRCS batch_norm_op.cc DEPS ${op_DEPS}) add_operator(feed_op basic SRCS feed_op.cc DEPS ${op_DEPS}) add_operator(fetch_op basic SRCS fetch_op.cc DEPS ${op_DEPS}) -add_operator(io_copy_op basic SRCS io_copy_op.cc DEPS ${op_DEPS}) -add_operator(io_copy_once_op basic SRCS io_copy_once_op.cc DEPS io_copy_op ${op_DEPS}) add_operator(activation_ops basic SRCS activation_ops.cc DEPS ${op_DEPS}) add_operator(elementwise_ops basic SRCS elementwise_ops.cc DEPS ${op_DEPS}) -add_operator(lrn_op_lite basic SRCS lrn_op.cc DEPS ${op_DEPS}) -add_operator(decode_bboxes_op_lite basic SRCS decode_bboxes_op.cc DEPS ${op_DEPS}) add_operator(box_coder_op_lite basic SRCS box_coder_op.cc DEPS ${op_DEPS}) add_operator(multiclass_nms_op_lite basic SRCS multiclass_nms_op.cc DEPS ${op_DEPS}) -add_operator(fusion_elementwise_activation_ops basic SRCS fusion_elementwise_activation_ops.cc DEPS elementwise_ops ${op_DEPS}) add_operator(mean_op basic SRCS mean_op.cc DEPS ${op_DEPS}) add_operator(fill_constant_op basic SRCS fill_constant_op.cc DEPS ${op_DEPS}) -#add_operator(sgd_op basic SRCS sgd_op.cc DEPS ${op_DEPS}) -add_operator(uniform_random_op basic SRCS uniform_random_op.cc DEPS ${op_DEPS}) -add_operator(power_op basic SRCS power_op.cc DEPS ${op_DEPS}) add_operator(shuffle_channel_op basic SRCS shuffle_channel_op.cc DEPS ${op_DEPS}) add_operator(yolo_box_op basic SRCS yolo_box_op.cc DEPS ${op_DEPS}) add_operator(interpolate_op basic SRCS interpolate_op.cc DEPS ${op_DEPS}) add_operator(argmax_op basic SRCS argmax_op.cc DEPS ${op_DEPS}) -add_operator(axpy_op basic SRCS axpy_op.cc DEPS ${op_DEPS}) -add_operator(gru_unit_op basic SRCS gru_unit_op.cc DEPS ${op_DEPS}) -add_operator(gru_op basic SRCS gru_op.cc DEPS ${op_DEPS}) 
-add_operator(layout_op basic SRCS layout_op.cc DEPS ${op_DEPS}) -add_operator(layout_once_op basic SRCS layout_once_op.cc DEPS ${op_DEPS}) add_operator(prior_box_op basic SRCS prior_box_op.cc DEPS ${op_DEPS}) -add_operator(density_prior_box_op basic SRCS density_prior_box_op.cc DEPS ${op_DEPS}) -add_operator(dropout_op basic SRCS dropout_op.cc DEPS ${op_DEPS}) add_operator(concat_op basic SRCS concat_op.cc DEPS ${op_DEPS}) add_operator(pad2d_op basic SRCS pad2d_op.cc DEPS ${op_DEPS}) -add_operator(negative_op basic SRCS negative_op.cc DEPS ${op_DEPS}) -add_operator(crop_op basic SRCS crop_op.cc DEPS ${op_DEPS}) add_operator(calib_op basic SRCS calib_op.cc DEPS ${op_DEPS}) -add_operator(calib_once_op basic SRCS calib_once_op.cc DEPS ${op_DEPS}) add_operator(split_op basic SRCS split_op.cc DEPS ${op_DEPS}) add_operator(transpose_op basic SRCS transpose_op.cc DEPS ${op_DEPS}) add_operator(fake_quant basic SRCS fake_quantize_moving_avg_max_abs.cc DEPS ${op_DEPS}) add_operator(fake_dequant basic SRCS fake_dequantize_max_abs.cc DEPS ${op_DEPS}) add_operator(conv_transpose_op basic SRCS conv_transpose_op.cc DEPS ${op_DEPS}) -add_operator(graph_op basic SRCS graph_op.cc DEPS ${op_DEPS}) add_operator(expand_op_lite basic SRCS expand_op.cc DEPS ${op_DEPS}) -add_operator(reduce_max_op_lite basic SRCS reduce_max_op.cc DEPS ${op_DEPS}) -add_operator(norm_op basic SRCS norm_op.cc DEPS ${op_DEPS}) -add_operator(shape_op_lite basic SRCS shape_op.cc DEPS ${op_DEPS}) -add_operator(sequence_expand_op_lite basic SRCS sequence_expand_op.cc DEPS ${op_DEPS}) add_operator(squeeze_op_lite basic SRCS squeeze_op.cc DEPS ${op_DEPS}) -add_operator(unsqueeze_op_lite extra SRCS unsqueeze_op.cc DEPS ${op_DEPS}) -add_operator(im2sequence_op basic SRCS im2sequence_op.cc DEPS ${op_DEPS}) +add_operator(unsqueeze_op_lite basic SRCS unsqueeze_op.cc DEPS ${op_DEPS}) +add_operator(stack_op basic SRCS stack_op.cc DEPS ${op_DEPS}) +add_operator(cast_op_lite basic SRCS cast_op.cc DEPS ${op_DEPS}) +add_operator(affine_channel_op basic SRCS affine_channel_op.cc DEPS ${op_DEPS}) +add_operator(range_op basic SRCS range_op.cc DEPS ${op_DEPS}) +add_operator(reduce_mean_op basic SRCS reduce_mean_op.cc DEPS ${op_DEPS}) +add_operator(relu_op basic SRCS relu_op.cc DEPS ${op_DEPS}) +add_operator(io_copy_op basic SRCS io_copy_op.cc DEPS ${op_DEPS}) +add_operator(fusion_elementwise_activation_ops basic SRCS fusion_elementwise_activation_ops.cc DEPS elementwise_ops ${op_DEPS}) +add_operator(io_copy_once_op basic SRCS io_copy_once_op.cc DEPS io_copy_op ${op_DEPS}) +add_operator(dropout_op basic SRCS dropout_op.cc DEPS ${op_DEPS}) +add_operator(layout_op basic SRCS layout_op.cc DEPS ${op_DEPS}) +add_operator(graph_op basic SRCS graph_op.cc DEPS ${op_DEPS}) + +# 2.basic ops not used in basic models +add_operator(negative_op extra SRCS negative_op.cc DEPS ${op_DEPS}) +add_operator(crop_op extra SRCS crop_op.cc DEPS ${op_DEPS}) +add_operator(assign_op extra SRCS assign_op.cc DEPS ${op_DEPS}) +add_operator(power_op extra SRCS power_op.cc DEPS ${op_DEPS}) +add_operator(norm_op extra SRCS norm_op.cc DEPS ${op_DEPS}) + +# 3.extra ops +add_operator(search_group_padding extra SRCS search_group_padding_op.cc DEPS ${op_DEPS}) +add_operator(lrn_op_lite extra SRCS lrn_op.cc DEPS ${op_DEPS}) +add_operator(decode_bboxes_op_lite extra SRCS decode_bboxes_op.cc DEPS ${op_DEPS}) +add_operator(uniform_random_op extra SRCS uniform_random_op.cc DEPS ${op_DEPS}) +add_operator(axpy_op extra SRCS axpy_op.cc DEPS ${op_DEPS}) +add_operator(gru_unit_op extra SRCS 
gru_unit_op.cc DEPS ${op_DEPS}) +add_operator(gru_op extra SRCS gru_op.cc DEPS ${op_DEPS}) +add_operator(layout_once_op extra SRCS layout_once_op.cc DEPS ${op_DEPS}) +add_operator(density_prior_box_op extra SRCS density_prior_box_op.cc DEPS ${op_DEPS}) +add_operator(calib_once_op extra SRCS calib_once_op.cc DEPS ${op_DEPS}) +add_operator(reduce_max_op_lite extra SRCS reduce_max_op.cc DEPS ${op_DEPS}) +add_operator(shape_op_lite extra SRCS shape_op.cc DEPS ${op_DEPS}) +add_operator(sequence_expand_op_lite extra SRCS sequence_expand_op.cc DEPS ${op_DEPS}) +add_operator(im2sequence_op extra SRCS im2sequence_op.cc DEPS ${op_DEPS}) add_operator(gather_op extra SRCS gather_op.cc DEPS ${op_DEPS}) -add_operator(reduce_mean_op extra SRCS reduce_mean_op.cc DEPS ${op_DEPS}) -add_operator(stack_op extra SRCS stack_op.cc DEPS ${op_DEPS}) -add_operator(cast_op_lite extra SRCS cast_op.cc DEPS ${op_DEPS}) -add_operator(affine_channel_op extra SRCS affine_channel_op.cc DEPS ${op_DEPS}) add_operator(anchor_generator_op extra SRCS anchor_generator_op.cc DEPS ${op_DEPS}) add_operator(generate_proposals_op extra SRCS generate_proposals_op.cc DEPS ${op_DEPS}) add_operator(roi_align_op extra SRCS roi_align_op.cc DEPS ${op_DEPS}) @@ -73,16 +79,26 @@ add_operator(box_clip_op extra SRCS box_clip_op.cc DEPS ${op_DEPS}) add_operator(flatten_op extra SRCS flatten_op.cc DEPS ${op_DEPS}) add_operator(fake_quantize_range_abs_max_op extra SRCS fake_quantize_range_abs_max.cc DEPS ${op_DEPS}) add_operator(sequence_expand_as_op_lite extra SRCS sequence_expand_as_op.cc DEPS ${op_DEPS}) -add_operator(range_op extra SRCS range_op.cc DEPS ${op_DEPS}) add_operator(assign_value_op extra SRCS assign_value_op.cc DEPS ${op_DEPS}) + add_operator(fake_quantize_dequantize_moving_avg_abs_max_op extra SRCS fake_quantize_dequantize_moving_avg_max_abs.cc DEPS ${op_DEPS}) add_operator(fake_channel_wise_dequantize_max_abs_op extra SRCS fake_channel_wise_dequantize_max_abs.cc DEPS ${op_DEPS}) add_operator(sequence_reshape_op_lite extra SRCS sequence_reshape_op.cc DEPS ${op_DEPS}) +add_operator(sequence_reverse_op_lite extra SRCS sequence_reverse_op.cc DEPS ${op_DEPS}) add_operator(reduce_sum_op_lite extra SRCS reduce_ops.cc DEPS ${op_DEPS}) +add_operator(match_matrix_tensor_op_lite extra SRCS match_matrix_tensor_op.cc DEPS ${op_DEPS}) +add_operator(search_seq_depadding_op_lite extra SRCS search_seq_depadding_op.cc DEPS ${op_DEPS}) +add_operator(search_grnn_op_lite extra SRCS search_grnn_op.cc DEPS ${op_DEPS}) +add_operator(search_seq_softmax_op_lite extra SRCS search_seq_softmax_op.cc DEPS ${op_DEPS}) +add_operator(sequence_concat_op_lite extra SRCS sequence_concat_op.cc DEPS ${op_DEPS}) +add_operator(var_conv_2d_op_lite extra SRCS var_conv_2d_op.cc DEPS ${op_DEPS}) +add_operator(attention_padding_mask_op_lite extra SRCS attention_padding_mask_op.cc DEPS ${op_DEPS}) +add_operator(sequence_arithmetic_op_lite extra SRCS sequence_arithmetic_op.cc DEPS ${op_DEPS}) # for OCR specific add_operator(while_op extra SRCS while_op.cc DEPS ${op_DEPS}) add_operator(lookup_table_op extra SRCS lookup_table_op.cc DEPS ${op_DEPS}) +add_operator(lookup_table_v2_op extra SRCS lookup_table_v2_op.cc DEPS ${op_DEPS}) add_operator(beam_search_decode_op extra SRCS beam_search_decode_op.cc DEPS ${op_DEPS}) add_operator(graph_op_lite extra SRCS graph_op.cc DEPS ${op_DEPS}) add_operator(logical_xor extra SRCS logical_op.cc DEPS ${op_DEPS}) @@ -106,7 +122,11 @@ add_operator(topk_op extra SRCS topk_op.cc DEPS ${op_DEPS}) add_operator(increment_op extra SRCS 
increment_op.cc DEPS ${op_DEPS}) add_operator(layer_norm_op extra SRCS layer_norm_op.cc DEPS ${op_DEPS}) add_operator(sequence_softmax_op extra SRCS sequence_softmax_op.cc DEPS ${op_DEPS}) - +# for content-dnn specific +add_operator(search_aligned_mat_mul_op extra SRCS search_aligned_mat_mul_op.cc DEPS ${op_DEPS}) +add_operator(search_seq_fc_op extra SRCS search_seq_fc_op.cc DEPS ${op_DEPS}) +add_operator(sequence_topk_avg_pooling_op basic SRCS sequence_topk_avg_pooling_op.cc DEPS ${op_DEPS}) +add_operator(search_fc_op basic SRCS search_fc_op.cc DEPS ${op_DEPS}) if (NOT LITE_WITH_X86) lite_cc_test(test_fc_op SRCS fc_op_test.cc @@ -122,8 +142,8 @@ if (NOT LITE_WITH_X86) lite_cc_test(test_batch_norm_op SRCS batch_norm_op_test.cc DEPS batch_norm_op memory) lite_cc_test(test_concat_op SRCS concat_op_test.cc DEPS concat_op memory scope) lite_cc_test(test_calib_op SRCS calib_op_test.cc DEPS calib_op memory ARM_DEPS calib_compute_arm) - lite_cc_test(test_fusion_elementwise_activation_ops - SRCS fusion_elementwise_activation_ops_test.cc - DEPS fusion_elementwise_activation_ops memory) lite_cc_test(test_transpose_op SRCS transpose_op_test.cc DEPS transpose_op memory) + lite_cc_test(test_fusion_elementwise_activation_ops + SRCS fusion_elementwise_activation_ops_test.cc + DEPS fusion_elementwise_activation_ops memory) endif() diff --git a/lite/operators/activation_ops.cc b/lite/operators/activation_ops.cc index c3c5de311f41f88fbeed4b03f9bfd618cf51c3b3..6ddcee0cb9e7fb0ef6df8a8c03d85fe406590b9d 100644 --- a/lite/operators/activation_ops.cc +++ b/lite/operators/activation_ops.cc @@ -117,6 +117,7 @@ REGISTER_LITE_OP(log, paddle::lite::operators::ActivationOp); REGISTER_LITE_OP(exp, paddle::lite::operators::ActivationOp); REGISTER_LITE_OP(floor, paddle::lite::operators::ActivationOp); REGISTER_LITE_OP(hard_sigmoid, paddle::lite::operators::ActivationOp); +REGISTER_LITE_OP(sqrt, paddle::lite::operators::ActivationOp); REGISTER_LITE_OP(rsqrt, paddle::lite::operators::ActivationOp); REGISTER_LITE_OP(softsign, paddle::lite::operators::ActivationOp); diff --git a/lite/operators/attention_padding_mask_op.cc b/lite/operators/attention_padding_mask_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..a88df0e7a902c6cac63eb77377bb0b49ee30c9b3 --- /dev/null +++ b/lite/operators/attention_padding_mask_op.cc @@ -0,0 +1,70 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/operators/attention_padding_mask_op.h" +#include "lite/core/op_registry.h" +#include "lite/core/scope.h" + +namespace paddle { +namespace lite { +namespace operators { + +bool AttentionPaddingMaskOp::CheckShape() const { + CHECK_OR_FALSE(param_.X); + CHECK_OR_FALSE(param_.Y); + CHECK_OR_FALSE(param_.Out); + CHECK_OR_FALSE(param_.pad_begin); + return true; +} + +bool AttentionPaddingMaskOp::InferShape() const { + auto src_len = param_.X->lod()[0][1]; + CHECK_EQ(src_len, param_.X->dims()[1]) + << "Mismatch source length, expect: " << src_len + << ", get: " << param_.X->lod()[0][1]; + auto att_batch = param_.X->lod()[0].size() - 1; + auto src_batch = param_.Y->lod()[0].size() - 1; + CHECK_EQ(att_batch % src_batch, 0) + << "Mismatch batch size, bottom0: " << att_batch + << ", bottom1: " << src_batch; + + param_.pad_begin->Resize({static_cast(src_batch)}); + param_.Out->Resize(param_.X->dims()); + param_.Out->set_lod(param_.X->lod()); + + return true; +} + +bool AttentionPaddingMaskOp::AttachImpl(const cpp::OpDesc &op_desc, + lite::Scope *scope) { + param_.X = scope->FindTensor(op_desc.Input("X").front()); + param_.Y = scope->FindTensor(op_desc.Input("Y").front()); + param_.Out = scope->FindMutableTensor(op_desc.Output("Out").front()); + param_.pad_begin = + scope->FindMutableTensor(op_desc.Output("pad_begin").front()); + + param_.pad_id = op_desc.GetAttr("pad_id"); + param_.mask = op_desc.GetAttr("mask"); + + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(attention_padding_mask, + paddle::lite::operators::AttentionPaddingMaskOp); +REGISTER_LITE_OP(search_attention_padding_mask, + paddle::lite::operators::AttentionPaddingMaskOp); diff --git a/lite/operators/attention_padding_mask_op.h b/lite/operators/attention_padding_mask_op.h new file mode 100644 index 0000000000000000000000000000000000000000..894d68f6226720139aee07274d4ac5cf660749f1 --- /dev/null +++ b/lite/operators/attention_padding_mask_op.h @@ -0,0 +1,46 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
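The shape contract enforced by AttentionPaddingMaskOp::InferShape above can be checked in isolation: pad_begin gets one entry per source sequence, and the attention batch must divide evenly by the source batch. A minimal standalone sketch follows; the helper and variable names are illustrative and not part of the patch.

// Standalone sketch (not part of the patch) of the batch arithmetic used by
// attention_padding_mask: att_batch = len(X.lod[0]) - 1 must be a multiple of
// src_batch = len(Y.lod[0]) - 1, and pad_begin is resized to src_batch.
#include <cstdint>
#include <iostream>
#include <vector>

int64_t PadBeginLen(const std::vector<uint64_t>& x_lod0,
                    const std::vector<uint64_t>& y_lod0) {
  const int64_t att_batch = static_cast<int64_t>(x_lod0.size()) - 1;
  const int64_t src_batch = static_cast<int64_t>(y_lod0.size()) - 1;
  if (att_batch % src_batch != 0) return -1;  // the op CHECKs on this mismatch
  return src_batch;                           // pad_begin is resized to this
}

int main() {
  std::cout << PadBeginLen({0, 4, 8, 12, 16}, {0, 5, 9}) << "\n";  // prints 2
}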
+ +#pragma once +#include +#include "lite/core/op_lite.h" + +namespace paddle { +namespace lite { +namespace operators { + +class AttentionPaddingMaskOp : public OpLite { + public: + AttentionPaddingMaskOp() {} + + explicit AttentionPaddingMaskOp(const std::string &op_type) + : OpLite(op_type) {} + + bool CheckShape() const override; + + bool InferShape() const override; + + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + + std::string DebugString() const override { return "attention_padding_mask"; } + + private: + mutable AttentionPaddingMaskParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/lite/operators/conv_op.cc b/lite/operators/conv_op.cc index ceca1a61ce3457ed0a2c25541d02bd868c380b3b..6dab55ff3b6c55e7763484d78c6c36bf85017128 100644 --- a/lite/operators/conv_op.cc +++ b/lite/operators/conv_op.cc @@ -39,56 +39,38 @@ bool ConvOpLite::CheckShape() const { return true; } -inline int ConvOutputSize( - int input_size, int filter_size, int dilation, int padding, int stride) { +inline int ConvOutputSize(int input_size, + int filter_size, + int dilation, + int pad_left, + int pad_right, + int stride) { const int dkernel = dilation * (filter_size - 1) + 1; - int output_size = (input_size + 2 * padding - dkernel) / stride + 1; - // CHECK_GT_OR_FALSE(output_size, 0); + int output_size = + (input_size + (pad_left + pad_right) - dkernel) / stride + 1; return output_size; } -inline void UpdatePaddingAndDilation(std::vector* paddings, - std::vector* dilations, - const std::vector& strides, - const std::string padding_algorithm, - const lite::DDim data_dims, - const lite::DDim& ksize) { - // when padding_desc is "VALID" or "SAME" - if (padding_algorithm == "SAME") { - for (size_t i = 0; i < strides.size(); ++i) { - int out_size = (data_dims[i + 2] + strides[i] - 1) / strides[i]; - int pad_sum = - std::max((out_size - 1) * strides[i] + ksize[i] - data_dims[i + 2], - (int64_t)0); - // pad - *(paddings->begin() + i) = pad_sum / 2; - // dilation - *(dilations->begin() + i) = 1; - } - } else if (padding_algorithm == "VALID") { - for (auto& it : *paddings) { - it = 0; - } - } -} - bool ConvOpLite::InferShape() const { const auto in_dims = param_.x->dims(); const auto filter_dims = param_.filter->dims(); - UpdatePaddingAndDilation(¶m_.paddings, - ¶m_.dilations, + UpdatePaddingAndDilation(param_.paddings.get(), + param_.dilations.get(), param_.strides, padding_algorithm_, in_dims, filter_dims); std::vector output_shape({in_dims[0], filter_dims[0]}); + auto paddings = *param_.paddings; + auto dilations = *param_.dilations; for (size_t i = 0; i < param_.strides.size(); ++i) { output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], - param_.dilations[i], - param_.paddings[i], + dilations[i], + paddings[i * 2], + paddings[i * 2 + 1], param_.strides[i])); } diff --git a/lite/operators/conv_op.h b/lite/operators/conv_op.h index e764819f6308e9723f185bc73979000af7f72b5b..3ab34bc1d0bd631b0641cebd3db29cfff9316bb0 100644 --- a/lite/operators/conv_op.h +++ b/lite/operators/conv_op.h @@ -13,6 +13,7 @@ // limitations under the License. 
#pragma once +#include #include #include #include "lite/core/kernel.h" @@ -47,9 +48,10 @@ class ConvOpLite : public OpLite { param_.output = scope->FindVar(Out)->GetMutable(); param_.strides = op_desc.GetAttr>("strides"); - param_.paddings = op_desc.GetAttr>("paddings"); + auto paddings = op_desc.GetAttr>("paddings"); param_.groups = op_desc.GetAttr("groups"); - param_.dilations = op_desc.GetAttr>("dilations"); + auto dilations = op_desc.GetAttr>("dilations"); + param_.dilations = std::make_shared>(dilations); // optional params std::vector input_arg_names = op_desc.InputArgumentNames(); @@ -109,12 +111,24 @@ class ConvOpLite : public OpLite { param_.output_scale = op_desc.GetAttr("output_scale"); } } + + // 2-pad to 4-pad + if (paddings.size() == 2L) { + for (size_t i = 0; i < param_.strides.size(); ++i) { + int copy_pad = *(paddings.begin() + 2 * i); + paddings.insert(paddings.begin() + 2 * i + 1, copy_pad); + } + } else { + if (paddings.size() != 4L) { + LOG(FATAL) + << "Paddings size should be the same or twice as the input size."; + } + } + param_.paddings = std::make_shared>(paddings); return true; } - void AttachKernel(KernelBase* kernel) override { - kernel->SetParam(param_); - } + void AttachKernel(KernelBase* kernel) override { kernel->SetParam(param_); } std::string DebugString() const override { return "conv2d"; } @@ -123,6 +137,34 @@ class ConvOpLite : public OpLite { std::string padding_algorithm_{""}; }; +inline void UpdatePaddingAndDilation(std::vector* paddings, + std::vector* dilations, + const std::vector& strides, + const std::string padding_algorithm, + const lite::DDim data_dims, + const lite::DDim& ksize) { + // when padding_desc is "VALID" or "SAME" + if (padding_algorithm == "SAME") { + for (size_t i = 0; i < strides.size(); ++i) { + int out_size = (data_dims[i + 2] + strides[i] - 1) / strides[i]; + int pad_sum = std::max( + (out_size - 1) * strides[i] + ksize[i + 2] - data_dims[i + 2], + (int64_t)0); + int pad_0 = pad_sum / 2; + int pad_1 = pad_sum - pad_0; + // pad + *(paddings->begin() + i * 2) = pad_0; + *(paddings->begin() + i * 2 + 1) = pad_1; + // dilation + *(dilations->begin() + i) = 1; + } + } else if (padding_algorithm == "VALID") { + for (auto& it : *paddings) { + it = 0; + } + } +} + } // namespace operators } // namespace lite } // namespace paddle diff --git a/lite/operators/conv_transpose_op.cc b/lite/operators/conv_transpose_op.cc index fb6b431fff8ab20dd1a6d1abc8aff7443771ee2f..a472ae07455dd1b10688a4b033358bba70d8f34f 100644 --- a/lite/operators/conv_transpose_op.cc +++ b/lite/operators/conv_transpose_op.cc @@ -11,8 +11,8 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
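The padding handling introduced above (expanding a 2-element paddings attribute into 4 entries, then resolving the "SAME" algorithm into a top/bottom and left/right split) reduces to a few lines of arithmetic. A minimal standalone sketch, assuming NCHW layout; the helper names are illustrative, not from the patch.

// Standalone sketch mirroring the 2-pad -> 4-pad expansion and the "SAME"
// rule used by ConvOpLite above.
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

std::vector<int> ExpandTo4Pad(std::vector<int> pads) {
  // {pad_h, pad_w} becomes {top, bottom, left, right}; 4 entries pass through.
  if (pads.size() == 2) pads = {pads[0], pads[0], pads[1], pads[1]};
  return pads;
}

void ApplySamePadding(std::vector<int>* pads,               // 4 entries
                      const std::vector<int>& strides,      // {stride_h, stride_w}
                      const std::vector<int64_t>& data_hw,  // {H, W}
                      const std::vector<int64_t>& k_hw) {   // {kernel_h, kernel_w}
  for (size_t i = 0; i < strides.size(); ++i) {
    int64_t out = (data_hw[i] + strides[i] - 1) / strides[i];
    int64_t pad_sum =
        std::max<int64_t>((out - 1) * strides[i] + k_hw[i] - data_hw[i], 0);
    (*pads)[2 * i] = static_cast<int>(pad_sum / 2);
    (*pads)[2 * i + 1] = static_cast<int>(pad_sum - pad_sum / 2);
  }
}

int main() {
  auto pads = ExpandTo4Pad({0, 0});
  ApplySamePadding(&pads, {2, 2}, {6, 6}, {3, 3});
  // H = W = 6, stride 2, kernel 3: pad_sum = 1 per axis -> pads = {0, 1, 0, 1}
  for (int p : pads) std::cout << p << " ";
}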
- #include "lite/operators/conv_transpose_op.h" +#include #include "lite/core/op_lite.h" #include "lite/core/op_registry.h" @@ -32,24 +32,75 @@ bool ConvTransposeOpLite::CheckShape() const { CHECK_EQ_OR_FALSE(in_dims.size(), filter_dims.size()); CHECK_OR_FALSE(in_dims.size() - param_.strides.size() == 2U); - CHECK_EQ_OR_FALSE(param_.paddings.size(), param_.strides.size()); CHECK_OR_FALSE(in_dims[1] % param_.groups == 0); + CHECK_EQ_OR_FALSE(filter_dims.size(), 4UL); return true; } +inline int ConvTransposeOutputSize(int input_size, + int filter_size, + int dilation, + int pad_left, + int pad_right, + int stride) { + const int dkernel = dilation * (filter_size - 1) + 1; + int output_size = (input_size - 1) * stride - pad_left - pad_right + dkernel; + + return output_size; +} + +inline void UpdatePaddingAndDilation(std::vector* paddings, + std::vector* dilations, + const std::vector& strides, + const std::string padding_algorithm, + const lite::DDim data_dims, + const lite::DDim& ksize) { + // when padding_desc is "VALID" or "SAME" + if (padding_algorithm == "SAME") { + for (size_t i = 0; i < strides.size(); ++i) { + int out_size = (data_dims[i + 2] + strides[i] - 1) / strides[i]; + int pad_sum = std::max( + (out_size - 1) * strides[i] + ksize[i + 2] - data_dims[i + 2], + (int64_t)0); + int pad_0 = pad_sum / 2; + int pad_1 = pad_sum - pad_0; + // pad + *(paddings->begin() + i * 2) = pad_0; + *(paddings->begin() + i * 2 + 1) = pad_1; + // dilation + *(dilations->begin() + i) = 1; + } + } else if (padding_algorithm == "VALID") { + for (auto& it : *paddings) { + it = 0; + } + } +} + bool ConvTransposeOpLite::InferShape() const { const auto in_dims = param_.x->dims(); const auto filter_dims = param_.filter->dims(); + UpdatePaddingAndDilation(param_.paddings.get(), + param_.dilations.get(), + param_.strides, + padding_algorithm_, + in_dims, + filter_dims); + auto paddings = *param_.paddings; + auto dilations = *param_.dilations; + std::vector output_shape; output_shape.push_back(in_dims[0]); output_shape.push_back(filter_dims[1] * param_.groups); - for (int i = 0; i < param_.strides.size(); i++) { - int kernel_extent = param_.dilations[i] * (filter_dims[i + 2] - 1) + 1; - int output_len = (in_dims[i + 2] - 1) * param_.strides[i] + kernel_extent - - 2 * param_.paddings[i]; - output_shape.push_back(output_len); + for (size_t i = 0; i < param_.strides.size(); ++i) { + output_shape.push_back(ConvTransposeOutputSize(in_dims[i + 2], + filter_dims[i + 2], + dilations[i], + paddings[i * 2], + paddings[i * 2 + 1], + param_.strides[i])); } // Set output dims @@ -58,8 +109,8 @@ bool ConvTransposeOpLite::InferShape() const { } // TODO(Superjomn) replace framework::OpDesc with a lite one. 
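The transposed-convolution output size used above follows out = (in - 1) * stride - (pad_left + pad_right) + dilation * (filter - 1) + 1. A small standalone sketch with a worked value; the function name is illustrative.

// Standalone sketch of the output-size formula used by ConvTransposeOutputSize.
#include <iostream>

int ConvTransposeOut(int in, int k, int dilation, int pl, int pr, int stride) {
  const int dkernel = dilation * (k - 1) + 1;
  return (in - 1) * stride - pl - pr + dkernel;
}

int main() {
  // e.g. in=8, k=3, dilation=1, pads 1/1, stride=2 -> (8-1)*2 - 2 + 3 = 15
  std::cout << ConvTransposeOut(8, 3, 1, 1, 1, 2) << "\n";  // prints 15
}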
-bool ConvTransposeOpLite::AttachImpl(const cpp::OpDesc &op_desc, - lite::Scope *scope) { +bool ConvTransposeOpLite::AttachImpl(const cpp::OpDesc& op_desc, + lite::Scope* scope) { auto X = op_desc.Input("Input").front(); auto Filter = op_desc.Input("Filter").front(); auto Out = op_desc.Output("Output").front(); @@ -68,9 +119,27 @@ bool ConvTransposeOpLite::AttachImpl(const cpp::OpDesc &op_desc, param_.output = scope->FindVar(Out)->GetMutable(); param_.strides = op_desc.GetAttr>("strides"); - param_.paddings = op_desc.GetAttr>("paddings"); + auto paddings = op_desc.GetAttr>("paddings"); param_.groups = op_desc.GetAttr("groups"); - param_.dilations = op_desc.GetAttr>("dilations"); + auto dilations = op_desc.GetAttr>("dilations"); + + if (op_desc.HasAttr("padding_algorithm")) { + padding_algorithm_ = op_desc.GetAttr("padding_algorithm"); + } + // 2-pad to 4-pad + if (paddings.size() == 2L) { + for (size_t i = 0; i < 2L; ++i) { + int copy_pad = *(paddings.begin() + 2 * i); + paddings.insert(paddings.begin() + 2 * i + 1, copy_pad); + } + } else { + if (paddings.size() != 4L) { + LOG(FATAL) + << "Paddings size should be the same or twice as the input size."; + } + } + param_.paddings = std::make_shared>(paddings); + param_.dilations = std::make_shared>(dilations); // optional params std::vector input_arg_names = op_desc.InputArgumentNames(); @@ -81,7 +150,7 @@ bool ConvTransposeOpLite::AttachImpl(const cpp::OpDesc &op_desc, auto bias_var = scope->FindVar(bias_arguments.front()); if (bias_var != nullptr) { param_.bias = - const_cast(&(bias_var->Get())); + const_cast(&(bias_var->Get())); } } } diff --git a/lite/operators/conv_transpose_op.h b/lite/operators/conv_transpose_op.h index d8b64c78efdcc00b5842c90336ce195b55d59370..fb25c022f974ad195bf72b19cb9b459b2d11d5f2 100644 --- a/lite/operators/conv_transpose_op.h +++ b/lite/operators/conv_transpose_op.h @@ -44,6 +44,7 @@ class ConvTransposeOpLite : public OpLite { private: mutable ConvParam param_; + std::string padding_algorithm_{""}; }; } // namespace operators diff --git a/lite/operators/fill_constant_op.cc b/lite/operators/fill_constant_op.cc index 6e4bee4da87095245d90c6af5db98d2e95d7d3d8..acf9701cbd750e83ba51f25c66064c2dd7781db6 100644 --- a/lite/operators/fill_constant_op.cc +++ b/lite/operators/fill_constant_op.cc @@ -29,6 +29,12 @@ class FillConstantOp : public OpLite { } bool InferShape() const override { + lite::Tensor* shape_tensor_ = param_.shape_tensor; + if (param_.shape.empty() && shape_tensor_ != nullptr) { + param_.Out->Resize(shape_tensor_->dims()); + return true; + } + param_.Out->Resize(param_.shape); return true; } @@ -41,6 +47,23 @@ class FillConstantOp : public OpLite { param_.shape = opdesc.GetAttr>("shape"); param_.value = opdesc.GetAttr("value"); param_.force_cpu = opdesc.GetAttr("force_cpu"); + param_.shape_tensor = nullptr; + param_.shape_tensor_list = {}; + + std::vector input_arg_names = opdesc.InputArgumentNames(); + if (std::find(input_arg_names.begin(), + input_arg_names.end(), + "ShapeTensor") != input_arg_names.end()) { + auto args = opdesc.Input("ShapeTensor"); + auto* var = scope->FindVar(args.front()); + param_.shape_tensor = var->GetMutable(); + } + if (opdesc.HasAttr("ShapeTensorList")) { + auto args = opdesc.Input("ShapeTensorList"); + auto* var = scope->FindVar(args.front()); + param_.shape_tensor_list = + *(var->GetMutable>()); + } return true; } diff --git a/lite/operators/interpolate_op.cc b/lite/operators/interpolate_op.cc index 
b98240ba4f255377c0ac661950a45bef0a7d0516..936da73d89007f4f6dd36fa770df537996c40a51 100644 --- a/lite/operators/interpolate_op.cc +++ b/lite/operators/interpolate_op.cc @@ -45,23 +45,42 @@ bool InterpolateOp::InferShape() const { int out_h; int out_w; - if (OutSize != nullptr) { - auto outsize_data = OutSize->data(); - int h_out = outsize_data[0]; // HW - int w_out = outsize_data[1]; // HW - param_.Out->Resize({n, c, h_out, w_out}); + auto SizeTensor = param_.SizeTensor; + if (!SizeTensor.empty()) { + CHECK(SizeTensor.size() == 2) + << "Input(SizeTensor)'size of Op(interpolate) must be 2. " + "Attr(out_shape)'s length must be 2 for 4-D input tensor."; + out_h = param_.out_h; + out_w = param_.out_w; + param_.Out->Resize({n, c, out_h, out_w}); + return true; + } + + auto Scale = param_.Scale; + if (Scale) { + auto scale_dims = Scale->dims(); + CHECK(scale_dims.size() == 1) << "Scale's dimension size must be 1."; + out_h = -1; + out_w = -1; } else { - if (0 >= param_.out_h && 0 >= param_.out_w) { - out_h = h * param_.scale; - out_w = w * param_.scale; + auto scale = param_.scale; + if (scale > 0) { + out_h = static_cast(h * scale); + out_w = static_cast(w * scale); out_h = out_h > 0 ? out_h : -1; out_w = out_w > 0 ? out_w : -1; } else { out_h = param_.out_h; out_w = param_.out_w; } - param_.Out->Resize({n, c, out_h, out_w}); } + + if (OutSize != nullptr) { + auto out_lod = param_.Out->mutable_lod(); + *out_lod = param_.X->lod(); + } + param_.Out->Resize({n, c, out_h, out_w}); + return true; } @@ -76,6 +95,24 @@ bool InterpolateOp::AttachImpl(const cpp::OpDesc& op_desc, lite::Scope* scope) { } else { param_.OutSize = nullptr; } + + if (op_desc.HasInput("SizeTensor")) { + auto size_tensor = op_desc.Input("SizeTensor"); + for (auto var : size_tensor) { + param_.SizeTensor.push_back( + scope->FindVar(var)->GetMutable()); + } + } + + if (op_desc.HasInput("Scale")) { + auto scale_var_names = op_desc.Input("Scale"); + if (scale_var_names.size() > 0) { + param_.Scale = + scope->FindVar(scale_var_names.front())->GetMutable(); + } + } else { + param_.Scale = nullptr; + } auto Out = op_desc.Output("Out").front(); param_.X = scope->FindVar(X)->GetMutable(); param_.Out = scope->FindVar(Out)->GetMutable(); diff --git a/lite/operators/lookup_table_v2_op.cc b/lite/operators/lookup_table_v2_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..c783695163b1d95964ac1a8a9d79d7167811261a --- /dev/null +++ b/lite/operators/lookup_table_v2_op.cc @@ -0,0 +1,68 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
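The scale branch of InterpolateOp::InferShape above reduces to multiplying the spatial dims by the scale attribute when neither SizeTensor nor OutSize supplies an explicit shape, falling back to the out_h/out_w attributes otherwise. A simplified standalone sketch (it omits the clamp of non-positive results to -1); names are illustrative.

// Standalone sketch of the nearest/bilinear output-shape selection above.
#include <iostream>

void InterpOutHW(int h, int w, float scale, int attr_h, int attr_w,
                 int* out_h, int* out_w) {
  if (scale > 0.f) {
    *out_h = static_cast<int>(h * scale);
    *out_w = static_cast<int>(w * scale);
  } else {
    *out_h = attr_h;
    *out_w = attr_w;
  }
}

int main() {
  int oh = 0, ow = 0;
  InterpOutHW(32, 48, 2.0f, -1, -1, &oh, &ow);
  std::cout << oh << "x" << ow << "\n";  // prints 64x96
}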
+
+#include "lite/operators/lookup_table_v2_op.h"
+#include "lite/core/op_lite.h"
+#include "lite/core/op_registry.h"
+
+namespace paddle {
+namespace lite {
+namespace operators {
+
+bool LookupTableV2OpLite::CheckShape() const {
+  CHECK_OR_FALSE(param_.W)
+  CHECK_OR_FALSE(param_.Ids)
+  CHECK_OR_FALSE(param_.Out)
+
+  auto table_dims = param_.W->dims();
+
+  CHECK_EQ_OR_FALSE(table_dims.size(), 2)
+
+  return true;
+}
+
+bool LookupTableV2OpLite::InferShape() const {
+  auto table_dims = param_.W->dims();
+  auto ids_dims = param_.Ids->dims();
+
+  std::vector<int64_t> out_dims;
+  for (int i = 0; i < ids_dims.size(); ++i) {
+    out_dims.push_back(ids_dims[i]);
+  }
+  out_dims.push_back(table_dims[1]);
+  param_.Out->Resize(lite::DDim{out_dims});
+  param_.Out->set_lod(param_.Ids->lod());
+  return true;
+}
+
+bool LookupTableV2OpLite::AttachImpl(const cpp::OpDesc &op_desc,
+                                     lite::Scope *scope) {
+  auto input = op_desc.Input("W").front();
+  auto ids = op_desc.Input("Ids").front();
+  auto out = op_desc.Output("Out").front();
+
+  param_.W = scope->FindVar(input)->GetMutable<lite::Tensor>();
+  param_.Ids = scope->FindVar(ids)->GetMutable<lite::Tensor>();
+  param_.Out = scope->FindVar(out)->GetMutable<lite::Tensor>();
+
+  param_.padding_idx = op_desc.GetAttr<int64_t>("padding_idx");
+
+  return true;
+}
+
+}  // namespace operators
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_LITE_OP(lookup_table_v2, paddle::lite::operators::LookupTableV2OpLite)
diff --git a/lite/operators/lookup_table_v2_op.h b/lite/operators/lookup_table_v2_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..dabff3f0cac75cb70cde6eb6e95df34dc36901fe
--- /dev/null
+++ b/lite/operators/lookup_table_v2_op.h
@@ -0,0 +1,46 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
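The lookup_table_v2 shape rule above is simply Ids' dims with the embedding width appended. A standalone sketch with a worked example; the function name is illustrative, not from the patch.

// Standalone sketch: Out keeps all of Ids' dims and appends W.dims()[1].
#include <cstdint>
#include <iostream>
#include <vector>

std::vector<int64_t> LookupOutDims(const std::vector<int64_t>& ids_dims,
                                   const std::vector<int64_t>& table_dims) {
  std::vector<int64_t> out(ids_dims);
  out.push_back(table_dims[1]);
  return out;
}

int main() {
  auto out = LookupOutDims({4, 16}, {30000, 128});
  for (auto d : out) std::cout << d << " ";  // prints 4 16 128
}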
+ +#pragma once +#include +#include +#include "lite/core/op_lite.h" +#include "lite/core/scope.h" +#include "lite/utils/all.h" + +namespace paddle { +namespace lite { +namespace operators { + +class LookupTableV2OpLite : public OpLite { + public: + LookupTableV2OpLite() {} + explicit LookupTableV2OpLite(const std::string &op_type) : OpLite(op_type) {} + + bool CheckShape() const override; + + bool InferShape() const override; + + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + std::string DebugString() const override { return "LookupTable"; } + + private: + mutable LookupTableParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/lite/operators/lrn_op.cc b/lite/operators/lrn_op.cc index 34b00653f91d03f8e661fac56b5931d928be15b2..aff3e5af5566771411acf20736fdbec703f5def9 100644 --- a/lite/operators/lrn_op.cc +++ b/lite/operators/lrn_op.cc @@ -37,11 +37,13 @@ bool LrnOpLite::AttachImpl(const cpp::OpDesc& opdesc, lite::Scope* scope) { auto Out_name = opdesc.Output("Out").front(); param_.X = GetVar(scope, X_name); param_.Out = GetMutableVar(scope, Out_name); - param_.local_size = opdesc.GetAttr("local_size"); + param_.n = opdesc.GetAttr("n"); param_.alpha = opdesc.GetAttr("alpha"); param_.beta = opdesc.GetAttr("beta"); param_.k = opdesc.GetAttr("k"); - param_.norm_region = opdesc.GetAttr("norm_region"); + if (opdesc.HasAttr("norm_region")) { + param_.norm_region = opdesc.GetAttr("norm_region"); + } return true; } diff --git a/lite/operators/match_matrix_tensor_op.cc b/lite/operators/match_matrix_tensor_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..a8095a94bf75cd5d6d9087509449c159056ebc28 --- /dev/null +++ b/lite/operators/match_matrix_tensor_op.cc @@ -0,0 +1,105 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/operators/match_matrix_tensor_op.h" +#include "lite/core/op_lite.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators { + +bool MatchMatrixTensorOpLite::CheckShape() const { + CHECK_OR_FALSE(param_.x); + CHECK_OR_FALSE(param_.y); + CHECK_OR_FALSE(param_.w); + CHECK_OR_FALSE(param_.out); + CHECK_OR_FALSE(param_.tmp); + + DDim x_dims = param_.x->dims(); + DDim y_dims = param_.y->dims(); + DDim w_dims = param_.w->dims(); + int dim_t = param_.dim_t; + + CHECK_OR_FALSE(x_dims.size() == 2); + CHECK_OR_FALSE(y_dims.size() == 2); + CHECK_OR_FALSE(w_dims.size() == 3); + + CHECK_OR_FALSE(x_dims[1] == w_dims[0] && y_dims[1] == w_dims[2] && + w_dims[1] == dim_t); + + return true; +} + +bool MatchMatrixTensorOpLite::InferShape() const { + const Tensor* x = param_.x; + const Tensor* y = param_.y; + DDim x_dims = param_.x->dims(); + DDim y_dims = param_.y->dims(); + DDim w_dims = param_.w->dims(); + int dim_t = param_.dim_t; + + const auto& x_lod = x->lod(); + CHECK_OR_FALSE(!x_lod.empty()); + const auto& x_lod_0 = x_lod[0]; + CHECK_OR_FALSE(x_lod_0.size() >= 2); + CHECK_OR_FALSE(x_dims[0] == x_lod_0.back()); + + const auto& y_lod = y->lod(); + CHECK_OR_FALSE(!y_lod.empty()); + const auto& y_lod_0 = y_lod[0]; + CHECK_OR_FALSE(y_lod_0.size() >= 2); + CHECK_OR_FALSE(y_dims[0] == y_lod_0.back()); + + CHECK_OR_FALSE(x_lod_0.size() == y_lod_0.size()); + + int out_dim_0 = 0; + for (size_t i = 1; i < x_lod_0.size(); i++) { + int x_len = x_lod_0[i] - x_lod_0[i - 1]; + int y_len = y_lod_0[i] - y_lod_0[i - 1]; + out_dim_0 += (x_len * y_len); + } + out_dim_0 *= dim_t; + int tmp_dim_0 = x_dims[0] * dim_t * x_dims[1]; + + param_.out->Resize({out_dim_0, 1}); + param_.tmp->Resize({tmp_dim_0, 1}); + return true; +} + +bool MatchMatrixTensorOpLite::AttachImpl(const cpp::OpDesc& op_desc, + lite::Scope* scope) { + auto x = op_desc.Input("X").front(); + auto w = op_desc.Input("W").front(); + auto y = op_desc.Input("Y").front(); + auto out = op_desc.Output("Out").front(); + auto tmp = op_desc.Output("Tmp").front(); + + param_.x = scope->FindVar(x)->GetMutable(); + param_.w = scope->FindVar(w)->GetMutable(); + param_.y = scope->FindVar(y)->GetMutable(); + param_.out = scope->FindVar(out)->GetMutable(); + param_.tmp = scope->FindVar(tmp)->GetMutable(); + + param_.dim_t = op_desc.GetAttr("dim_t"); + + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(match_matrix_tensor, + paddle::lite::operators::MatchMatrixTensorOpLite); diff --git a/lite/operators/match_matrix_tensor_op.h b/lite/operators/match_matrix_tensor_op.h new file mode 100644 index 0000000000000000000000000000000000000000..404183ea5bda3c35ba8b833853bc0005d60b9f7d --- /dev/null +++ b/lite/operators/match_matrix_tensor_op.h @@ -0,0 +1,49 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
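The first output dimension computed in MatchMatrixTensorOpLite::InferShape above is dim_t times the sum over LoD segments of x_len * y_len. A standalone sketch with a worked value; the function name is illustrative.

// Standalone sketch of the out_dim_0 computation for match_matrix_tensor.
#include <cstdint>
#include <iostream>
#include <vector>

int64_t MatchMatrixOutDim0(const std::vector<uint64_t>& x_lod0,
                           const std::vector<uint64_t>& y_lod0, int dim_t) {
  int64_t out = 0;
  for (size_t i = 1; i < x_lod0.size(); ++i) {
    out += static_cast<int64_t>(x_lod0[i] - x_lod0[i - 1]) *
           static_cast<int64_t>(y_lod0[i] - y_lod0[i - 1]);
  }
  return out * dim_t;
}

int main() {
  // two sequence pairs of lengths (3,4) and (2,5), dim_t = 2 -> 2*(12+10) = 44
  std::cout << MatchMatrixOutDim0({0, 3, 5}, {0, 4, 9}, 2) << "\n";  // 44
}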
+ +#pragma once +#include +#include "lite/core/op_lite.h" +#include "lite/core/scope.h" +#include "lite/operators/op_params.h" +#include "lite/utils/all.h" + +namespace paddle { +namespace lite { +namespace operators { + +class MatchMatrixTensorOpLite : public OpLite { + public: + MatchMatrixTensorOpLite() {} + + explicit MatchMatrixTensorOpLite(const std::string &op_type) + : OpLite(op_type) {} + + bool CheckShape() const override; + + bool InferShape() const override; + + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + + std::string DebugString() const override { return "match_matrix_tensor"; } + + private: + mutable MatchMatrixTensorParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/lite/operators/op_params.h b/lite/operators/op_params.h index 8609f178886808f5dedf2de86e7cf7941c4a4c5d..4f0c707484f6a66148dabc80968665c1d38de445 100644 --- a/lite/operators/op_params.h +++ b/lite/operators/op_params.h @@ -13,6 +13,7 @@ // limitations under the License. #pragma once +#include #include #include #include @@ -89,11 +90,21 @@ struct FcParam { WITH_INT8_CONFIG }; +struct SearchSeqFcParam { + lite::Tensor* x{nullptr}; + lite::Tensor* w{nullptr}; + lite::Tensor* b{nullptr}; + lite::Tensor* out{nullptr}; + int out_size; +}; + // For Interpolate Op struct InterpolateParam { lite::Tensor* X{}; lite::Tensor* OutSize{}; lite::Tensor* Out{}; + std::vector SizeTensor; + lite::Tensor* Scale{}; float scale{0.f}; int out_h{-1}; @@ -101,6 +112,7 @@ struct InterpolateParam { bool align_corners{true}; int align_mode{1}; std::string interp_method{"Nearest"}; + DataLayoutType data_layout{DATALAYOUT(kNCHW)}; }; // For Mul Op @@ -242,9 +254,19 @@ struct ConvParam { lite::Tensor* residualData{nullptr}; lite::Tensor* output{}; std::vector strides{1, 1}; - std::vector paddings{0, 0}; + /* paddings type change + * from std::vector to std::shared_ptr> + * to support dynamically modify padding + * let kernel param and operator param Synchronous update + */ + std::shared_ptr> paddings; int groups{1}; - std::vector dilations{1, 1}; + /* dilations type change + * from std::vector to std::shared_ptr> + * to support dynamically modify padding + * let kernel param and operator param Synchronous update + */ + std::shared_ptr> dilations; bool fuse_relu_before_depthwise_conv{false}; bool use_mkldnn{false}; bool fuse_relu{false}; // only used in mkldnn kernel @@ -291,7 +313,12 @@ struct PoolParam { bool global_pooling{ false}; // if true, knernel size and paddings will be ignored std::vector strides{1, 1}; - std::vector paddings{0, 0}; + /* paddings type change + * from std::vector to std::shared_ptr> + * to support dynamically modify padding + * let kernel param and operator param Synchronous update + */ + std::shared_ptr> paddings; bool exclusive{true}; bool adaptive{false}; bool ceil_mode{false}; @@ -317,6 +344,9 @@ struct DropoutParam { struct SplitParam { lite::Tensor* x{}; std::vector output{}; + lite::Tensor* axis_tensor; + std::vector sections_tensor_list{}; + int axis{-1}; int num{0}; std::vector sections; @@ -378,6 +408,9 @@ struct MeanGradParam { struct FillConstantParam { int dtype{static_cast(VarDescAPI::VarDataType::FP32)}; std::vector shape{}; + lite::Tensor* shape_tensor; + std::vector shape_tensor_list{}; + float value{0.0f}; // useless for x86, keep it for compatibility bool force_cpu{false}; @@ -511,8 +544,8 @@ struct GRUUnitParam { struct LrnParam { const 
lite::Tensor* X{}; lite::Tensor* Out{}; - int local_size{5}; - float alpha{1.}; + int n{5}; + float alpha{1e-4}; float beta{0.75}; float k{1.}; std::string norm_region{"AcrossChannels"}; @@ -729,6 +762,14 @@ struct SequencePoolParam { #endif }; +struct SearchGroupPaddingParam { + lite::Tensor* x{}; + lite::Tensor* out_emb_padding{}; + lite::Tensor* out_new{}; + lite::Tensor* out_padding{}; + int pad_id; +}; + struct SequenceReshapeParam { lite::Tensor* x{}; lite::Tensor* output{}; @@ -748,6 +789,32 @@ struct SequenceExpandAsParam { lite::Tensor* out{nullptr}; }; +struct SequenceReverseParam { + const lite::Tensor* X{}; + lite::Tensor* Out{}; +}; + +struct SequenceConcatParam { + std::vector X{}; + lite::Tensor* Out{}; +}; + +struct AttentionPaddingMaskParam { + const lite::Tensor* X{}; + const lite::Tensor* Y{}; + int pad_id; + float mask; + lite::Tensor* Out{}; + lite::Tensor* pad_begin{}; +}; + +struct SequenceArithmeticParam { + const lite::Tensor* X{}; + const lite::Tensor* Y{}; + int op_type{1}; + lite::Tensor* Out{}; +}; + struct ReduceMaxParam { const lite::Tensor* X{}; lite::Tensor* Out{}; @@ -776,6 +843,22 @@ struct ReduceParam { bool reduce_all{false}; }; +struct VarConv2DParam { + const lite::Tensor* X{}; + const lite::Tensor* ROW{}; + const lite::Tensor* COLUMN{}; + const lite::Tensor* W{}; + lite::Tensor* Out{}; + lite::Tensor* Col{}; + + int input_channel; + int output_channel; + int stride_h; + int stride_w; + int kernel_h; + int kernel_w; +}; + /// ----------------------- shape operators ---------------------- struct ShapeParam { const lite::Tensor* X{}; @@ -856,7 +939,7 @@ struct UnsqueezeParam { lite::Tensor* XShape{}; std::vector axes{}; const lite::Tensor* axes_tensor{}; - std::vector* axes_tensor_vct{}; + std::vector axes_tensor_vct{}; }; /// ----------------------- expand operators ---------------------- @@ -922,6 +1005,57 @@ struct AssignValueParam { lite::Tensor* Out{}; }; +/// --------------- sequence_topk_avg_pooling operators ------------------ +struct SequenceTopkAvgPoolingParam { + const lite::Tensor* X{}; + const lite::Tensor* ROW{}; + const lite::Tensor* COLUMN{}; + lite::Tensor* Out{}; + lite::Tensor* pos{}; + int channel_num{}; + std::vector topks{}; +}; + +/// --------------- search_fc operators ------------------ +struct SearchFcParam { + const lite::Tensor* X{}; + const lite::Tensor* W{}; + const lite::Tensor* b{}; + lite::Tensor* Out{}; + int out_size{}; +}; +/// --------------------- match_matrix_tensor operators -------------------- +struct MatchMatrixTensorParam { + const lite::Tensor* x{}; + const lite::Tensor* y{}; + const lite::Tensor* w{}; + lite::Tensor* out{}; + lite::Tensor* tmp{}; + + int dim_t; +}; + +/// --------------------- search_seq_depadding operators -------------------- +struct SearchSeqDepaddingParam { + const lite::Tensor* pad{}; + const lite::Tensor* src{}; + lite::Tensor* out{}; +}; + +/// --------------------- search_grnn operators -------------------- +struct SearchGrnnParam { + const lite::Tensor* x{}; + const lite::Tensor* wi{}; + const lite::Tensor* wh{}; + int num_input; + int num_hidden; + + lite::Tensor* out{}; + lite::Tensor* tmp_buffer{}; + lite::Tensor* idx_sorted_by_width{}; + lite::Tensor* layout_input{}; +}; + } // namespace operators } // namespace lite } // namespace paddle diff --git a/lite/operators/pool_op.cc b/lite/operators/pool_op.cc index 1ebbc059b76572886f5ff7c8ce1e32b593070fa0..c6f6eed28f8cdb5f080b6f4367a1b88b1dbc0701 100644 --- a/lite/operators/pool_op.cc +++ b/lite/operators/pool_op.cc @@ -13,6 +13,7 @@ 
// limitations under the License. #include "lite/operators/pool_op.h" +#include #include "lite/core/op_registry.h" namespace paddle { @@ -26,7 +27,7 @@ bool PoolOpLite::CheckShape() const { const auto& x_dims = param_.x->dims(); const auto& ksize = param_.ksize; const auto& strides = param_.strides; - const auto& paddings = param_.paddings; + const auto& paddings = *param_.paddings; // "Pooling intput should be 4-D or 5-D tensor." CHECK_OR_FALSE(x_dims.size() == 4 || x_dims.size() == 5); @@ -34,20 +35,27 @@ bool PoolOpLite::CheckShape() const { CHECK_OR_FALSE(x_dims.size() - ksize.size() == 2U); // Strides size and pooling size should be the same. CHECK_OR_FALSE(ksize.size() == strides.size()); - // Paddings size and pooling size should be the same. - CHECK_OR_FALSE(ksize.size() == paddings.size()); + // Paddings size must be 4. + CHECK_OR_FALSE(paddings.size() == 4L); return true; } -int PoolOutputSize( - int input_size, int filter_size, int padding, int stride, bool ceil_mode) { +int PoolOutputSize(int input_size, + int filter_size, + int pad_left, + int pad_right, + int stride, + bool ceil_mode) { int output_size; if (!ceil_mode) { - output_size = (input_size - filter_size + 2 * padding) / stride + 1; + output_size = + (input_size - filter_size + pad_left + pad_right) / stride + 1; } else { output_size = - (input_size - filter_size + 2 * padding + stride - 1) / stride + 1; + (input_size - filter_size + pad_left + pad_right + stride - 1) / + stride + + 1; } return output_size; } @@ -55,14 +63,21 @@ int PoolOutputSize( bool PoolOpLite::InferShape() const { const auto x_dims = param_.x->dims(); std::vector& ksize = param_.ksize; + // dynamic update 4-pad + UpdatePadding(param_.paddings.get(), + param_.global_pooling, + param_.adaptive, + padding_algorithm_, + x_dims, + param_.strides, + ksize); if (param_.global_pooling) { ksize.resize(static_cast(x_dims.size()) - 2); for (size_t i = 0; i < ksize.size(); ++i) { - param_.paddings[i] = 0; ksize[i] = static_cast(x_dims[i + 2]); } } - + auto paddings = *param_.paddings; std::vector output_shape({x_dims[0], x_dims[1]}); if (param_.adaptive) { output_shape.insert( @@ -71,15 +86,14 @@ bool PoolOpLite::InferShape() const { for (size_t i = 0; i < param_.ksize.size(); ++i) { output_shape.push_back(PoolOutputSize(x_dims[i + 2], param_.ksize[i], - param_.paddings[i], + paddings[2 * i], + paddings[2 * i + 1], param_.strides[i], param_.ceil_mode)); } } param_.output->Resize(lite::DDim(output_shape)); - // ctx->SetOutputDim("Out", framework::make_ddim(output_shape)); - // ctx->ShareLoD("X", "Out"); return true; } diff --git a/lite/operators/pool_op.h b/lite/operators/pool_op.h index aecec4c61955cecd67f485662feb1a937681c165..c44875ff95b554ca92cf5288597a5bdaf2cb1bf8 100644 --- a/lite/operators/pool_op.h +++ b/lite/operators/pool_op.h @@ -14,6 +14,8 @@ #pragma once +#include +#include #include #include #include "lite/core/kernel.h" @@ -51,7 +53,7 @@ class PoolOpLite : public OpLite { param_.ksize = op_desc.GetAttr>("ksize"); param_.global_pooling = op_desc.GetAttr("global_pooling"); param_.strides = op_desc.GetAttr>("strides"); - param_.paddings = op_desc.GetAttr>("paddings"); + auto paddings = op_desc.GetAttr>("paddings"); if (op_desc.HasAttr("exclusive")) { param_.exclusive = op_desc.GetAttr("exclusive"); @@ -65,7 +67,23 @@ class PoolOpLite : public OpLite { if (op_desc.HasAttr("use_quantizer")) { param_.use_quantizer = op_desc.GetAttr("use_quantizer"); } - // param_.data_format = op_desc.GetAttr("data_format"); + if (op_desc.HasAttr("padding_algorithm")) 
{ + padding_algorithm_ = op_desc.GetAttr("padding_algorithm"); + } + // 2-pad to 4-pad + if (paddings.size() == 2L) { + for (size_t i = 0; i < 2L; ++i) { + int copy_pad = *(paddings.begin() + 2 * i); + paddings.insert(paddings.begin() + 2 * i + 1, copy_pad); + } + } else { + if (paddings.size() != 4L) { + LOG(FATAL) + << "Paddings size should be the same or twice as the inputs size."; + } + } + param_.paddings = std::make_shared>(paddings); + return true; } @@ -75,8 +93,42 @@ class PoolOpLite : public OpLite { private: mutable PoolParam param_; + std::string padding_algorithm_{""}; }; +inline void UpdatePadding(std::vector *paddings, + const bool global_pooling, + const bool adaptive, + const std::string padding_algorithm, + const lite::DDim data_dims, + const std::vector &strides, + const std::vector &ksize) { + // when padding_algorithm is "VALID" or "SAME" + if (padding_algorithm == "SAME") { + for (int i = 0; i < strides.size(); ++i) { + int out_size = (data_dims[i + 2] + strides[i] - 1) / strides[i]; + int pad_sum = + std::max((out_size - 1) * strides[i] + ksize[i] - data_dims[i + 2], + (int64_t)0); + int pad_0 = pad_sum / 2; + int pad_1 = pad_sum - pad_0; + *(paddings->begin() + i * 2) = pad_0; + *(paddings->begin() + i * 2 + 1) = pad_1; + } + } else if (padding_algorithm == "VALID") { + for (auto it = paddings->begin(); it != paddings->end(); it++) { + *it = 0; + } + } + + // if global_pooling == true or adaptive == true, padding will be ignore + if (global_pooling || adaptive) { + for (auto it = paddings->begin(); it != paddings->end(); it++) { + *it = 0; + } + } +} + } // namespace operators } // namespace lite } // namespace paddle diff --git a/lite/operators/search_aligned_mat_mul_op.cc b/lite/operators/search_aligned_mat_mul_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..43a276e3c7a2f7481ade2ee18c1446593f7c5f43 --- /dev/null +++ b/lite/operators/search_aligned_mat_mul_op.cc @@ -0,0 +1,101 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/operators/search_aligned_mat_mul_op.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators { + +bool SearchAlignedMatMulOpLite::CheckShape() const { + CHECK_OR_FALSE(param_.X); + CHECK_OR_FALSE(param_.Y); + CHECK_OR_FALSE(param_.Out); + + return true; +} + +bool SearchAlignedMatMulOpLite::InferShape() const { + const auto x_dims = param_.X->dims(); + const auto y_dims = param_.Y->dims(); + const auto& x_lod = param_.X->lod(); + const auto& y_lod = param_.Y->lod(); + bool x_transpose = param_.transpose_X; + bool y_transpose = param_.transpose_Y; + + CHECK_EQ(x_dims.size(), 2) << "X should be 2-D tensor"; + CHECK_EQ(y_dims.size(), 2) << "Y should be 2-D tensor"; + CHECK(!x_lod.empty()) << "The Input(X) must hold lod info."; + CHECK(!y_lod.empty()) << "The Input(Y) must hold lod info."; + + const auto& x_lod_0 = x_lod[0]; + const auto& y_lod_0 = y_lod[0]; + CHECK_GE(x_lod_0.size(), 2) << "The Input(X)'s lod info is corrupted."; + CHECK_GE(y_lod_0.size(), 2) << "The Input(Y)'s lod info is corrupted."; + CHECK_EQ(x_dims[0], static_cast(x_lod_0.back())) + << "The Input(X)'s lod info mismatches the actual tensor shape."; + CHECK_EQ(y_dims[0], static_cast(y_lod_0.back())) + << "The Input(Y)'s lod info mismatches the actual tensor shape."; + CHECK_EQ(x_lod_0.size(), y_lod_0.size()) + << "The Length of X and Y must be equal."; + + int seq_num = x_lod_0.size() - 1; + int x_inner_size = x_dims[1]; + int y_inner_size = y_dims[1]; + int x_batch_size = x_lod_0[1]; + int y_batch_size = y_lod_0[1]; + int M = x_transpose ? x_inner_size : x_batch_size; + int N = y_transpose ? y_batch_size : y_inner_size; + int X_K = x_transpose ? x_batch_size : x_inner_size; + int Y_K = y_transpose ? y_inner_size : y_batch_size; + CHECK_EQ(X_K, Y_K) << "K of Input(X) and Input(Y) is not equal"; + + LoD out_lod; + std::vector out_lod_0(seq_num + 1); + out_lod_0[0] = 0; + for (int i = 0; i < seq_num; i++) { + out_lod_0[i + 1] = out_lod_0[i] + M; + } + out_lod.push_back(out_lod_0); + DDim out_dims( + {static_cast(out_lod_0.back()), static_cast(N)}); + param_.Out->set_lod(out_lod); + param_.Out->Resize(out_dims); + return true; +} + +bool SearchAlignedMatMulOpLite::AttachImpl(const cpp::OpDesc& op_desc, + lite::Scope* scope) { + CHECK(!op_desc.Input("X").empty()); + CHECK(!op_desc.Input("Y").empty()); + CHECK(!op_desc.Output("Out").empty()); + auto X = op_desc.Input("X").front(); + auto Y = op_desc.Input("Y").front(); + auto Out = op_desc.Output("Out").front(); + param_.X = GetVar(scope, X); + param_.Y = GetVar(scope, Y); + param_.Out = GetMutableVar(scope, Out); + param_.transpose_X = op_desc.GetAttr("transpose_X"); + param_.transpose_Y = op_desc.GetAttr("transpose_Y"); + param_.alpha = op_desc.GetAttr("alpha"); + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(search_aligned_mat_mul, + paddle::lite::operators::SearchAlignedMatMulOpLite); diff --git a/lite/operators/search_aligned_mat_mul_op.h b/lite/operators/search_aligned_mat_mul_op.h new file mode 100644 index 0000000000000000000000000000000000000000..7321b7e9d15331e6aad36364436a99d3d4089c8c --- /dev/null +++ b/lite/operators/search_aligned_mat_mul_op.h @@ -0,0 +1,47 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include "lite/core/op_lite.h" +#include "lite/core/scope.h" +#include "lite/utils/all.h" + +namespace paddle { +namespace lite { +namespace operators { + +class SearchAlignedMatMulOpLite : public OpLite { + public: + SearchAlignedMatMulOpLite() {} + + explicit SearchAlignedMatMulOpLite(const std::string &type) : OpLite(type) {} + + bool CheckShape() const override; + + bool InferShape() const override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + + bool AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) override; + std::string DebugString() const override { return "search_aligned_mat_mul"; } + + private: + mutable MatMulParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/lite/operators/search_fc_op.cc b/lite/operators/search_fc_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..2e77e361624e681aa93e36610674df0e1f9a13af --- /dev/null +++ b/lite/operators/search_fc_op.cc @@ -0,0 +1,80 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#include "lite/operators/search_fc_op.h"
+#include "lite/core/op_lite.h"
+#include "lite/core/op_registry.h"
+
+namespace paddle {
+namespace lite {
+namespace operators {
+
+bool SearchFcOpLite::CheckShape() const {
+  CHECK_OR_FALSE(param_.X);
+  CHECK_OR_FALSE(param_.W);
+  CHECK_OR_FALSE(param_.b);
+  CHECK_OR_FALSE(param_.Out);
+
+  auto x_dims = param_.X->dims();
+  CHECK_EQ(x_dims.size(), 2) << "The rank of X(Input) should be 2.";
+  auto w_dims = param_.W->dims();
+  CHECK_EQ(w_dims.size(), 2) << "W should be 2-D tensor.";
+  auto b_dims = param_.b->dims();
+  CHECK_EQ(b_dims.size(), 1) << "b should be 1-D tensor.";
+  CHECK_EQ(w_dims[1], x_dims[1]) << "wrong shape: w_dims[1] != x_dims[1]";
+  return true;
+}
+
+bool SearchFcOpLite::InferShape() const {
+  auto out_size = param_.out_size;
+  lite::DDim dims(std::vector<int64_t>({-1, out_size}));
+  param_.Out->Resize(dims);
+  return true;
+}
+
+bool SearchFcOpLite::AttachImpl(const cpp::OpDesc &op_desc,
+                                lite::Scope *scope) {
+  auto X = op_desc.Input("X").front();
+  auto W = op_desc.Input("W").front();
+  auto b = op_desc.Input("b").front();
+  auto Out = op_desc.Output("Out").front();
+
+  param_.X = scope->FindVar(X)->GetMutable<lite::Tensor>();
+  param_.W = scope->FindVar(W)->GetMutable<lite::Tensor>();
+  param_.b = scope->FindVar(b)->GetMutable<lite::Tensor>();
+  param_.Out = scope->FindVar(Out)->GetMutable<lite::Tensor>();
+  param_.out_size = op_desc.GetAttr<int>("out_size");
+
+  return true;
+}
+
+}  // namespace operators
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_LITE_OP(search_fc, paddle::lite::operators::SearchFcOpLite);
diff --git a/lite/operators/search_fc_op.h b/lite/operators/search_fc_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..a871cadd33b4f7d4b6130a0b8ac2974a738ac0c3
--- /dev/null
+++ b/lite/operators/search_fc_op.h
@@ -0,0 +1,46 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include
+#include
+#include "lite/core/op_lite.h"
+#include "lite/core/scope.h"
+#include "lite/utils/all.h"
+
+namespace paddle {
+namespace lite {
+namespace operators {
+
+class SearchFcOpLite : public OpLite {
+ public:
+  SearchFcOpLite() {}
+  explicit SearchFcOpLite(const std::string &op_type) : OpLite(op_type) {}
+
+  bool CheckShape() const override;
+
+  bool InferShape() const override;
+
+  bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
+
+  void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }
+  std::string DebugString() const override { return "search_fc"; }
+
+ private:
+  mutable SearchFcParam param_;
+};
+
+}  // namespace operators
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/operators/search_grnn_op.cc b/lite/operators/search_grnn_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b56ae820bf9de4ffe6aa3f6db7a8e1385c8cc11f
--- /dev/null
+++ b/lite/operators/search_grnn_op.cc
@@ -0,0 +1,94 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
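The search_aligned_mat_mul shape logic earlier in this patch treats every LoD segment as one GEMM of shape M x K times K x N, where the transpose flags select which side of each operand is contracted. A standalone sketch of that selection; names are illustrative, not from the patch.

// Standalone sketch of the M/N/K selection in SearchAlignedMatMulOpLite::
// InferShape: each sequence is a fixed-size block (x_batch x x_inner), and
// the transpose flags decide which dimension becomes the contraction axis.
#include <iostream>

void AlignedMatMulMNK(int x_batch, int x_inner, int y_batch, int y_inner,
                      bool trans_x, bool trans_y, int* M, int* N, int* K) {
  *M = trans_x ? x_inner : x_batch;
  *N = trans_y ? y_batch : y_inner;
  *K = trans_x ? x_batch : x_inner;  // must equal trans_y ? y_inner : y_batch
}

int main() {
  int M, N, K;
  AlignedMatMulMNK(/*x_batch=*/5, /*x_inner=*/8, /*y_batch=*/5, /*y_inner=*/16,
                   /*trans_x=*/true, /*trans_y=*/false, &M, &N, &K);
  std::cout << M << " " << N << " " << K << "\n";  // prints 8 16 5
}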
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/operators/search_grnn_op.h" +#include "lite/core/op_lite.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators { + +bool SearchGrnnOpLite::CheckShape() const { + CHECK_OR_FALSE(param_.x); + CHECK_OR_FALSE(param_.wi); + CHECK_OR_FALSE(param_.wh); + CHECK_OR_FALSE(param_.out); + CHECK_OR_FALSE(param_.tmp_buffer); + CHECK_OR_FALSE(param_.idx_sorted_by_width); + CHECK_OR_FALSE(param_.layout_input); + + int _cap_h = param_.num_hidden; + int _cap_e = param_.num_input; + + const auto& x_dims = param_.x->dims(); + CHECK_OR_FALSE(x_dims.size() == 2); + CHECK_OR_FALSE(x_dims[1] == _cap_e); + + const auto& wi_dims = param_.wi->dims(); + CHECK_OR_FALSE(wi_dims.size() == 3); + CHECK_OR_FALSE(wi_dims[0] == 3); + CHECK_OR_FALSE(wi_dims[1] == _cap_h); + CHECK_OR_FALSE(wi_dims[2] == _cap_e); + + const auto& wh_dims = param_.wh->dims(); + CHECK_OR_FALSE(wh_dims.size() == 3); + CHECK_OR_FALSE(wh_dims[0] == 3); + CHECK_OR_FALSE(wh_dims[1] == _cap_h); + CHECK_OR_FALSE(wh_dims[2] == _cap_h); + + return true; +} + +bool SearchGrnnOpLite::InferShape() const { + const auto& x_dims = param_.x->dims(); + const auto& x_lod = param_.x->lod(); + CHECK_OR_FALSE(!x_lod.empty()); + CHECK_OR_FALSE(x_dims[0] == x_lod[0].back()); + param_.out->set_lod(x_lod); + + return true; +} + +bool SearchGrnnOpLite::AttachImpl(const cpp::OpDesc& op_desc, + lite::Scope* scope) { + auto x = op_desc.Input("X").front(); + auto wi = op_desc.Input("Wi").front(); + auto wh = op_desc.Input("Wh").front(); + param_.x = scope->FindVar(x)->GetMutable(); + param_.wi = scope->FindVar(wi)->GetMutable(); + param_.wh = scope->FindVar(wh)->GetMutable(); + + param_.num_input = op_desc.GetAttr("num_input"); + param_.num_hidden = op_desc.GetAttr("num_hidden"); + + auto out = op_desc.Output("Out").front(); + auto tmp_buffer = op_desc.Output("tmp_buffer").front(); + auto idx_sorted_by_width = op_desc.Output("idx_sorted_by_width").front(); + auto layout_input = op_desc.Output("layout_input").front(); + param_.out = scope->FindVar(out)->GetMutable(); + param_.tmp_buffer = scope->FindVar(tmp_buffer)->GetMutable(); + param_.idx_sorted_by_width = + scope->FindVar(idx_sorted_by_width)->GetMutable(); + param_.layout_input = + scope->FindVar(layout_input)->GetMutable(); + + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(search_grnn, paddle::lite::operators::SearchGrnnOpLite); diff --git a/lite/operators/search_grnn_op.h b/lite/operators/search_grnn_op.h new file mode 100644 index 0000000000000000000000000000000000000000..670af8a6c9ff9eafa33018a0303ea1a36b0a1e01 --- /dev/null +++ b/lite/operators/search_grnn_op.h @@ -0,0 +1,48 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include "lite/core/op_lite.h" +#include "lite/core/scope.h" +#include "lite/operators/op_params.h" +#include "lite/utils/all.h" + +namespace paddle { +namespace lite { +namespace operators { + +class SearchGrnnOpLite : public OpLite { + public: + SearchGrnnOpLite() {} + + explicit SearchGrnnOpLite(const std::string &op_type) : OpLite(op_type) {} + + bool CheckShape() const override; + + bool InferShape() const override; + + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + + std::string DebugString() const override { return "search_grnn"; } + + private: + mutable SearchGrnnParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/lite/operators/search_group_padding_op.cc b/lite/operators/search_group_padding_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..5ba4dde275f4b9662416bdf5190cacfafc56a40d --- /dev/null +++ b/lite/operators/search_group_padding_op.cc @@ -0,0 +1,67 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#include "lite/operators/search_group_padding_op.h"
+#include "lite/core/op_registry.h"
+
+namespace paddle {
+namespace lite {
+namespace operators {
+
+bool SearchGroupPaddingOp::CheckShape() const {
+  CHECK_EQ(param_.x->dims().size(), 2) << "The rank of X(Input) should be 2.";
+  CHECK_EQ(param_.x->lod().empty(), false)
+      << "Input Tensor of X does not contain LoD information.";
+  CHECK_GE(param_.x->lod()[0].size(), 2)
+      << "The Input(X)'s lod info is corrupted.";
+  CHECK_EQ(param_.x->dims()[0], static_cast<int64_t>(param_.x->lod()[0].back()))
+      << "The Input(X)'s lod info mismatches the actual tensor shape.";
+
+  return true;
+}
+
+bool SearchGroupPaddingOp::InferShape() const {
+  std::vector<int64_t> x_dims = param_.x->dims().Vectorize();
+
+  param_.out_emb_padding->Resize({-1, x_dims[1]});
+  param_.out_new->Resize({x_dims[0], 1});
+  param_.out_padding->Resize({-1, 1});
+  return true;
+}
+
+bool SearchGroupPaddingOp::AttachImpl(const cpp::OpDesc &op_desc,
+                                      lite::Scope *scope) {
+  auto x = op_desc.Input("X").front();
+  auto out_emb_padding = op_desc.Output("Out_emb_padding").front();
+  auto out_new = op_desc.Output("Out_new").front();
+  auto out_padding = op_desc.Output("Out_padding").front();
+
+  param_.x = scope->FindVar(x)->GetMutable<lite::Tensor>();
+  param_.out_emb_padding =
+      scope->FindVar(out_emb_padding)->GetMutable<lite::Tensor>();
+  param_.out_new = scope->FindVar(out_new)->GetMutable<lite::Tensor>();
+  param_.out_padding = scope->FindVar(out_padding)->GetMutable<lite::Tensor>();
+  param_.pad_id = op_desc.GetAttr<int>("pad_id");
+
+  CHECK(param_.out_emb_padding)
+      << "Output(Out_emb_padding) of SearchGroupPadding Op should not be null.";
+  return true;
+}
+
+}  // namespace operators
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_LITE_OP(search_group_padding,
+                 paddle::lite::operators::SearchGroupPaddingOp);
diff --git a/lite/operators/search_group_padding_op.h b/lite/operators/search_group_padding_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..a8e96c9697b5f7de70349efa1f8b378a47c3823c
--- /dev/null
+++ b/lite/operators/search_group_padding_op.h
@@ -0,0 +1,41 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +#pragma once +#include +#include +#include "lite/core/op_lite.h" +#include "lite/core/scope.h" + +namespace paddle { +namespace lite { +namespace operators { + +class SearchGroupPaddingOp : public OpLite { + public: + SearchGroupPaddingOp() {} + explicit SearchGroupPaddingOp(const std::string &op_type) : OpLite(op_type) {} + bool CheckShape() const override; + bool InferShape() const override; + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + std::string DebugString() const override { return "search_group_padding"; } + + private: + mutable SearchGroupPaddingParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/lite/operators/search_seq_depadding_op.cc b/lite/operators/search_seq_depadding_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..12d5123e05b41665550fb7e6b90a636093959263 --- /dev/null +++ b/lite/operators/search_seq_depadding_op.cc @@ -0,0 +1,71 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/operators/search_seq_depadding_op.h" +#include "lite/core/op_lite.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators { + +bool SearchSeqDepaddingOpLite::CheckShape() const { + CHECK_OR_FALSE(param_.pad); + CHECK_OR_FALSE(param_.src); + CHECK_OR_FALSE(param_.out); + + DDim pad_dims = param_.pad->dims(); + DDim src_dims = param_.src->dims(); + CHECK_OR_FALSE(pad_dims.size() == 2); + CHECK_OR_FALSE(src_dims.size() == 2); + + const auto& pad_lod = param_.pad->lod(); + CHECK_OR_FALSE(!pad_lod.empty()); + const auto& pad_lod_0 = pad_lod[0]; + CHECK_OR_FALSE(pad_lod_0.size() >= 2); + CHECK_OR_FALSE(pad_dims[0] == pad_lod_0.back()); + + const auto& src_lod = param_.src->lod(); + CHECK_OR_FALSE(!src_lod.empty()); + const auto& src_lod_0 = src_lod[0]; + CHECK_OR_FALSE(src_lod_0.size() >= 2); + CHECK_OR_FALSE(src_dims[0] == src_lod_0.back()); + return true; +} + +bool SearchSeqDepaddingOpLite::InferShape() const { + DDim pad_dims = param_.pad->dims(); + param_.out->Resize({-1, pad_dims[1]}); + return true; +} + +bool SearchSeqDepaddingOpLite::AttachImpl(const cpp::OpDesc& op_desc, + lite::Scope* scope) { + auto pad = op_desc.Input("Pad").front(); + auto src = op_desc.Input("Src").front(); + auto out = op_desc.Output("Out").front(); + + param_.pad = scope->FindVar(pad)->GetMutable(); + param_.src = scope->FindVar(src)->GetMutable(); + param_.out = scope->FindVar(out)->GetMutable(); + + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(search_seq_depadding, + paddle::lite::operators::SearchSeqDepaddingOpLite); diff --git a/lite/operators/search_seq_depadding_op.h b/lite/operators/search_seq_depadding_op.h new file mode 100644 index 0000000000000000000000000000000000000000..445d9e0f3bcba6204243e80023d826bf53d90c60 --- /dev/null 
+++ b/lite/operators/search_seq_depadding_op.h @@ -0,0 +1,49 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include "lite/core/op_lite.h" +#include "lite/core/scope.h" +#include "lite/operators/op_params.h" +#include "lite/utils/all.h" + +namespace paddle { +namespace lite { +namespace operators { + +class SearchSeqDepaddingOpLite : public OpLite { + public: + SearchSeqDepaddingOpLite() {} + + explicit SearchSeqDepaddingOpLite(const std::string &op_type) + : OpLite(op_type) {} + + bool CheckShape() const override; + + bool InferShape() const override; + + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + + std::string DebugString() const override { return "search_seq_depadding"; } + + private: + mutable SearchSeqDepaddingParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/lite/operators/search_seq_fc_op.cc b/lite/operators/search_seq_fc_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..c5cca5331ab80479656b1212df02c20d463a3707 --- /dev/null +++ b/lite/operators/search_seq_fc_op.cc @@ -0,0 +1,80 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
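SearchSeqDepaddingOpLite::InferShape() above resizes Out to {-1, pad_dims[1]}, leaving the row count as a -1 placeholder because it only becomes known from the source LoD at run time; the kernel that fills it in is not shown in this patch, so the run-time step in the sketch below is an assumption made only for illustration.

// Sketch: "-1" as a compile-time placeholder for a batch dimension that is
// resolved once the source LoD is available.
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  int64_t feature = 128;
  std::vector<int64_t> out_dims = {-1, feature};  // placeholder at InferShape time

  // At run time the source LoD is known, e.g. offsets {0, 3, 5}:
  std::vector<uint64_t> src_lod0 = {0, 3, 5};
  out_dims[0] = static_cast<int64_t>(src_lod0.back());  // 5 depadded rows

  std::cout << out_dims[0] << " x " << out_dims[1] << "\n";
  return 0;
}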
+ +#include "lite/operators/search_seq_fc_op.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators { + +bool SearchSeqFcOpLite::CheckShape() const { + CHECK_OR_FALSE(param_.x); + CHECK_OR_FALSE(param_.w); + CHECK_OR_FALSE(param_.out); + return true; +} + +bool SearchSeqFcOpLite::InferShape() const { + const auto x_dims = param_.x->dims(); + const auto w_dims = param_.w->dims(); + const auto& x_lod = param_.x->lod(); + auto out_size = param_.out_size; + CHECK_EQ(x_dims.size(), 2) << "The Input(X) should be 2-D tensor."; + CHECK(!x_lod.empty()) << "The Input(X) must hold lod info."; + const auto& x_lod_0 = x_lod[0]; + CHECK_GE(x_lod_0.size(), 2) << "The Input(X)'s lod info is corrupted."; + CHECK_EQ(x_dims[0], static_cast(x_lod_0.back())) + << "The Input(X)'s lod info mismatches the actual tensor shape."; + CHECK_EQ(w_dims.size(), 2) << "W should be 2-D tensor."; + CHECK_EQ(x_dims[1], w_dims[1]) << "Wrong shape: x_dims[1] != w_dims[1]"; + CHECK_EQ(w_dims[0], out_size) << "Wrong shape: w_dims[0] != out_size"; + + if (param_.b != nullptr) { + const auto b_dims = param_.b->dims(); + CHECK_EQ(b_dims.size(), 1) << "b should be 1-D tensor."; + CHECK_EQ(b_dims[0], w_dims[0]) << "Wrong shape: b_dims[0] != w_dims[0]"; + } + + param_.out->set_lod(x_lod); + param_.out->Resize({x_dims[0], w_dims[0]}); + return true; +} + +bool SearchSeqFcOpLite::AttachImpl(const cpp::OpDesc& op_desc, + lite::Scope* scope) { + CHECK(!op_desc.Input("X").empty()); + CHECK(!op_desc.Input("W").empty()); + CHECK(!op_desc.Output("Out").empty()); + auto x = op_desc.Input("X").front(); + auto w = op_desc.Input("W").front(); + auto out = op_desc.Output("Out").front(); + param_.x = scope->FindVar(x)->GetMutable(); + param_.w = scope->FindVar(w)->GetMutable(); + param_.out = scope->FindVar(out)->GetMutable(); + param_.out_size = op_desc.GetAttr("out_size"); + bool has_bias = op_desc.GetAttr("has_bias"); + if (has_bias) { + CHECK(!op_desc.Input("b").empty()); + auto b = op_desc.Input("b").front(); + param_.b = scope->FindVar(b)->GetMutable(); + } + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(search_seq_fc, paddle::lite::operators::SearchSeqFcOpLite); diff --git a/lite/operators/search_seq_fc_op.h b/lite/operators/search_seq_fc_op.h new file mode 100644 index 0000000000000000000000000000000000000000..3c4f7d82bfa66c2f323063f0297438c81ce18397 --- /dev/null +++ b/lite/operators/search_seq_fc_op.h @@ -0,0 +1,47 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include +#include +#include "lite/core/op_lite.h" +#include "lite/core/scope.h" +#include "lite/utils/all.h" + +namespace paddle { +namespace lite { +namespace operators { + +class SearchSeqFcOpLite : public OpLite { + public: + SearchSeqFcOpLite() {} + + explicit SearchSeqFcOpLite(const std::string &type) : OpLite(type) {} + + bool CheckShape() const override; + + bool InferShape() const override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + + bool AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) override; + std::string DebugString() const override { return "search_seq_fc"; } + + private: + mutable SearchSeqFcParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/lite/operators/search_seq_softmax_op.cc b/lite/operators/search_seq_softmax_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..973ffa04c4562334af6d379b5446902036de8c5e --- /dev/null +++ b/lite/operators/search_seq_softmax_op.cc @@ -0,0 +1,52 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/operators/search_seq_softmax_op.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators { + +bool SearchSeqSoftmaxOp::CheckShape() const { + CHECK_OR_FALSE(param_.x); + CHECK_OR_FALSE(param_.output); + return true; +} + +bool SearchSeqSoftmaxOp::InferShape() const { + param_.output->Resize(param_.x->dims()); + param_.output->set_lod(param_.x->lod()); + return true; +} + +bool SearchSeqSoftmaxOp::AttachImpl(const cpp::OpDesc &opdesc, + lite::Scope *scope) { + param_.x = const_cast( + &scope->FindVar(opdesc.Input("X").front())->Get()); + param_.output = + scope->FindVar(opdesc.Output("Out").front())->GetMutable(); + param_.axis = 1; + + CHECK(param_.x); + CHECK(param_.output); + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(search_seq_softmax, + paddle::lite::operators::SearchSeqSoftmaxOp); diff --git a/lite/operators/search_seq_softmax_op.h b/lite/operators/search_seq_softmax_op.h new file mode 100644 index 0000000000000000000000000000000000000000..f97e8ddd3a6c446fb5c53d5e603f43bbdf1e2525 --- /dev/null +++ b/lite/operators/search_seq_softmax_op.h @@ -0,0 +1,47 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
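SearchSeqSoftmaxOp above only fixes the output shape and LoD and hard-codes axis = 1; its kernel is not in this patch. For reference, the usual numerically stable row-wise softmax that axis = 1 implies, written as a standalone sketch rather than the actual Lite kernel.

// Row-wise softmax over a [rows, cols] buffer, in the standard stable form.
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

void RowSoftmax(const float* in, float* out, int rows, int cols) {
  for (int r = 0; r < rows; ++r) {
    const float* x = in + r * cols;
    float* y = out + r * cols;
    float max_v = *std::max_element(x, x + cols);  // subtract max for stability
    float sum = 0.f;
    for (int c = 0; c < cols; ++c) {
      y[c] = std::exp(x[c] - max_v);
      sum += y[c];
    }
    for (int c = 0; c < cols; ++c) y[c] /= sum;
  }
}

int main() {
  std::vector<float> x = {1.f, 2.f, 3.f, 0.f, 0.f, 0.f};
  std::vector<float> y(x.size());
  RowSoftmax(x.data(), y.data(), 2, 3);
  std::printf("%.3f %.3f %.3f\n", y[0], y[1], y[2]);
  return 0;
}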
+ +#pragma once + +#include +#include +#include "lite/core/op_lite.h" +#include "lite/core/scope.h" +#include "lite/utils/all.h" + +namespace paddle { +namespace lite { +namespace operators { + +class SearchSeqSoftmaxOp : public OpLite { + public: + SearchSeqSoftmaxOp() {} + explicit SearchSeqSoftmaxOp(const std::string &op_type) : OpLite(op_type) {} + + bool CheckShape() const override; + + bool InferShape() const override; + + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + std::string DebugString() const override { return "search_seq_softmax_op"; } + + private: + mutable SoftmaxParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/lite/operators/sequence_arithmetic_op.cc b/lite/operators/sequence_arithmetic_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..29c39ebc23f54c2c3c052e322575d97570195cfc --- /dev/null +++ b/lite/operators/sequence_arithmetic_op.cc @@ -0,0 +1,58 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/operators/sequence_arithmetic_op.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators { + +bool SequenceArithmeticOp::CheckShape() const { + CHECK_OR_FALSE(param_.X); + CHECK_OR_FALSE(param_.Y); + CHECK_EQ(param_.X->dims().size(), 2) << "Input X should a 2-D Tensor"; + CHECK_EQ(param_.Y->dims().size(), 2) << "Input Y should a 2-D Tensor"; + CHECK_OR_FALSE(param_.Out); + return true; +} + +bool SequenceArithmeticOp::InferShape() const { + param_.Out->Resize(param_.X->dims()); + param_.Out->set_lod(param_.X->lod()); + return true; +} + +bool SequenceArithmeticOp::AttachImpl(const cpp::OpDesc &opdesc, + lite::Scope *scope) { + param_.X = scope->FindTensor(opdesc.Input("X").front()); + param_.Y = scope->FindTensor(opdesc.Input("Y").front()); + param_.Out = scope->FindMutableTensor(opdesc.Output("Out").front()); + + param_.op_type = opdesc.GetAttr("op_type"); + + CHECK(param_.X); + CHECK(param_.Y); + CHECK(param_.Out); + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(sequence_arithmetic, + paddle::lite::operators::SequenceArithmeticOp); +REGISTER_LITE_OP(search_seq_arithmetic, + paddle::lite::operators::SequenceArithmeticOp); diff --git a/lite/operators/sequence_arithmetic_op.h b/lite/operators/sequence_arithmetic_op.h new file mode 100644 index 0000000000000000000000000000000000000000..9f844dfbf429599d829bc786c66ba6d05e40d79d --- /dev/null +++ b/lite/operators/sequence_arithmetic_op.h @@ -0,0 +1,46 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include "lite/core/op_lite.h" +#include "lite/core/scope.h" + +namespace paddle { +namespace lite { +namespace operators { + +class SequenceArithmeticOp : public OpLite { + public: + SequenceArithmeticOp() {} + explicit SequenceArithmeticOp(const std::string &op_type) : OpLite(op_type) {} + + bool CheckShape() const override; + + bool InferShape() const override; + + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + + std::string DebugString() const override { return "sequence_arithmetic"; } + + private: + mutable SequenceArithmeticParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/lite/operators/sequence_concat_op.cc b/lite/operators/sequence_concat_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..2a54df890cc6b90910713ed7d6d44f9218e72e28 --- /dev/null +++ b/lite/operators/sequence_concat_op.cc @@ -0,0 +1,85 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/operators/sequence_concat_op.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators { + +bool SequenceConcatOp::CheckShape() const { + CHECK_GT(param_.X.size(), 1) + << "The number of input sequences is at least two."; + CHECK_OR_FALSE(param_.Out); + size_t lod_size = 0; + for (const auto &t : param_.X) { + CHECK_EQ(t->lod().empty(), false) + << "Input Tensor of X does not contain LoD information."; + // CHECK_EQ(t->lod().size(), 1) << "Only support one level sequence now."; + if (lod_size == 0) { + lod_size = t->lod()[0].size(); + } else { + CHECK_EQ(t->lod()[0].size(), lod_size) + << "The number of sequence must be same between each input"; + } + } + CHECK_NE(lod_size, 0) << "Each input must have sequence information"; + return true; +} + +bool SequenceConcatOp::InferShape() const { + int64_t batch_size = 0; + int64_t feature_size = 0; + std::vector out_dims; + for (const auto &tensor : param_.X) { + const auto x_dims = tensor->dims(); + if (out_dims.empty()) { + out_dims = x_dims.Vectorize(); + } + batch_size += x_dims[0]; + if (feature_size == 0) { + feature_size = x_dims.production() / x_dims[0]; + } else { + CHECK_EQ(feature_size, x_dims.production() / x_dims[0]) + << "Inputs of sequence concat must have same feature size"; + } + } + if (batch_size < 0) { + batch_size = -1; // Normalize batch size for compile time. 
+ } + out_dims[0] = batch_size; + param_.Out->Resize(out_dims); + // LoD info will be computed in Kernel. + return true; +} + +bool SequenceConcatOp::AttachImpl(const cpp::OpDesc &opdesc, + lite::Scope *scope) { + auto input_list = opdesc.Input("X"); + param_.X.clear(); + for (auto var : input_list) { + param_.X.push_back(scope->FindVar(var)->GetMutable()); + } + param_.Out = + scope->FindVar(opdesc.Output("Out").front())->GetMutable(); + CHECK(param_.Out) << "Output(Out) of Sequence Concat Op should not be null."; + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(sequence_concat, paddle::lite::operators::SequenceConcatOp); diff --git a/lite/operators/sequence_concat_op.h b/lite/operators/sequence_concat_op.h new file mode 100644 index 0000000000000000000000000000000000000000..8cdc07ebca83b9c400b00a0f40556a788c5854e6 --- /dev/null +++ b/lite/operators/sequence_concat_op.h @@ -0,0 +1,41 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include "lite/core/op_lite.h" +#include "lite/core/scope.h" + +namespace paddle { +namespace lite { +namespace operators { + +class SequenceConcatOp : public OpLite { + public: + SequenceConcatOp() {} + explicit SequenceConcatOp(const std::string &op_type) : OpLite(op_type) {} + bool CheckShape() const override; + bool InferShape() const override; + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + std::string DebugString() const override { return "sequence_concat"; } + + private: + mutable SequenceConcatParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/lite/operators/sequence_reverse_op.cc b/lite/operators/sequence_reverse_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..dd8fa2e8fd5816cc92355c9c73caf1aa76baf36c --- /dev/null +++ b/lite/operators/sequence_reverse_op.cc @@ -0,0 +1,55 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
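SequenceConcatOp::InferShape() above concatenates along axis 0: the output batch is the sum of the inputs' first dimensions, every input must share the same per-row feature size, and a negative batch is normalized to -1 for compile time. The same bookkeeping as a standalone sketch; not part of the patch.

// Standalone version of the dimension bookkeeping in
// SequenceConcatOp::InferShape(): concat along axis 0, all inputs must have
// the same per-row feature size.
#include <cassert>
#include <cstdint>
#include <vector>

std::vector<int64_t> ConcatAxis0Shape(
    const std::vector<std::vector<int64_t>>& in_dims) {
  int64_t batch = 0;
  int64_t feature = 0;
  std::vector<int64_t> out = in_dims.front();
  for (const auto& d : in_dims) {
    int64_t feat = 1;
    for (size_t i = 1; i < d.size(); ++i) feat *= d[i];  // production() / d[0]
    if (feature == 0) feature = feat;
    assert(feature == feat);  // the "same feature size" check
    batch += d[0];
  }
  if (batch < 0) batch = -1;  // normalize unknown batch for compile time
  out[0] = batch;
  return out;
}

int main() {
  auto out = ConcatAxis0Shape({{3, 8}, {2, 8}});
  assert(out[0] == 5 && out[1] == 8);
  return 0;
}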
+ +#include "lite/operators/sequence_reverse_op.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators { + +bool SequenceReverseOp::CheckShape() const { + CHECK_OR_FALSE(param_.X); + CHECK_OR_FALSE(param_.Out); + CHECK_EQ(param_.X->lod().empty(), false) + << "Input(X) Tensor of SequenceReverseOp does not contain " + "LoD information."; + CHECK_GE(param_.X->dims().size(), 2) + << "Rank of Input(X) must be not less than 2."; + return true; +} + +bool SequenceReverseOp::InferShape() const { + const auto *input = param_.X; + auto out_dims = input->dims(); + param_.Out->Resize(out_dims); + return true; +} + +bool SequenceReverseOp::AttachImpl(const cpp::OpDesc &opdesc, + lite::Scope *scope) { + param_.X = const_cast( + &scope->FindVar(opdesc.Input("X").front())->Get()); + param_.Out = + scope->FindVar(opdesc.Output("Y").front())->GetMutable(); + CHECK(param_.X); + CHECK(param_.Out); + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(sequence_reverse, paddle::lite::operators::SequenceReverseOp); diff --git a/lite/operators/sequence_reverse_op.h b/lite/operators/sequence_reverse_op.h new file mode 100644 index 0000000000000000000000000000000000000000..326d0f68927199e9353a5bbe8c072d342c9e3d69 --- /dev/null +++ b/lite/operators/sequence_reverse_op.h @@ -0,0 +1,41 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include "lite/core/op_lite.h" +#include "lite/core/scope.h" + +namespace paddle { +namespace lite { +namespace operators { + +class SequenceReverseOp : public OpLite { + public: + SequenceReverseOp() {} + explicit SequenceReverseOp(const std::string &op_type) : OpLite(op_type) {} + bool CheckShape() const override; + bool InferShape() const override; + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + std::string DebugString() const override { return "sequence_reverse"; } + + private: + mutable SequenceReverseParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/lite/operators/sequence_topk_avg_pooling_op.cc b/lite/operators/sequence_topk_avg_pooling_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..6f5cbeeeee5816132d2ebcb7094949189931b931 --- /dev/null +++ b/lite/operators/sequence_topk_avg_pooling_op.cc @@ -0,0 +1,85 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/operators/sequence_topk_avg_pooling_op.h" +#include "lite/core/op_lite.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators { + +bool SequenceTopkAvgPoolingOpLite::CheckShape() const { + CHECK_OR_FALSE(param_.X); + CHECK_OR_FALSE(param_.ROW); + CHECK_OR_FALSE(param_.COLUMN); + CHECK_OR_FALSE(param_.Out); + CHECK_OR_FALSE(param_.pos); + return true; +} + +bool SequenceTopkAvgPoolingOpLite::InferShape() const { + int channel_num = param_.channel_num; + std::vector topks = param_.topks; + auto row_dim = param_.ROW->dims(); + auto num_k = topks.size(); + auto row_shape_0 = row_dim[0]; + std::vector vec_out_shape; + vec_out_shape.push_back(row_shape_0); + vec_out_shape.push_back(channel_num * num_k); + + param_.Out->Resize(lite::DDim(vec_out_shape)); + param_.Out->set_lod(param_.ROW->lod()); + return true; +} + +bool SequenceTopkAvgPoolingOpLite::AttachImpl(const cpp::OpDesc &op_desc, + lite::Scope *scope) { + auto X = op_desc.Input("X").front(); + auto ROW = op_desc.Input("ROW").front(); + auto COLUMN = op_desc.Input("COLUMN").front(); + auto Out = op_desc.Output("Out").front(); + auto pos = op_desc.Output("pos").front(); + + param_.X = scope->FindVar(X)->GetMutable(); + param_.ROW = scope->FindVar(ROW)->GetMutable(); + param_.COLUMN = scope->FindVar(COLUMN)->GetMutable(); + param_.Out = scope->FindVar(Out)->GetMutable(); + param_.pos = scope->FindVar(pos)->GetMutable(); + param_.channel_num = op_desc.GetAttr("channel_num"); + param_.topks = op_desc.GetAttr>("topks"); + + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(sequence_topk_avg_pooling, + paddle::lite::operators::SequenceTopkAvgPoolingOpLite); diff --git a/lite/operators/sequence_topk_avg_pooling_op.h b/lite/operators/sequence_topk_avg_pooling_op.h new file mode 100644 index 0000000000000000000000000000000000000000..1c1cfe3a9c7bc82c3e79fc372b98293183509dca --- /dev/null +++ b/lite/operators/sequence_topk_avg_pooling_op.h @@ -0,0 +1,49 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include "lite/core/op_lite.h" +#include "lite/core/scope.h" +#include "lite/utils/all.h" + +namespace paddle { +namespace lite { +namespace operators { + +class SequenceTopkAvgPoolingOpLite : public OpLite { + public: + SequenceTopkAvgPoolingOpLite() {} + explicit SequenceTopkAvgPoolingOpLite(const std::string &op_type) + : OpLite(op_type) {} + + bool CheckShape() const override; + + bool InferShape() const override; + + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + std::string DebugString() const override { + return "sequence_topk_avg_pooling"; + } + + private: + mutable SequenceTopkAvgPoolingParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/lite/operators/split_op.cc b/lite/operators/split_op.cc index 18280616aa00b734596b620727f6dcfd5beb67d7..ec98a0d6c3ba3b1e5cd1c7992b58e96917d21057 100644 --- a/lite/operators/split_op.cc +++ b/lite/operators/split_op.cc @@ -39,8 +39,16 @@ bool SplitOp::InferShape() const { const int outs_number = outs.size(); std::vector outs_dims; outs_dims.reserve(outs_number); - - if (num > 0) { + std::vector sections_tensor_list_ = + param_.sections_tensor_list; + if (sections.size() > 0 && sections_tensor_list_.size() > 0) { + std::vector vec_sections; + for (size_t i = 0; i < sections_tensor_list_.size(); ++i) { + auto dim = in_dims; + dim[axis] = sections_tensor_list_[i]->data()[0]; + outs_dims.push_back(dim); + } + } else if (num > 0) { int out_axis_dim = in_dims[axis] / num; for (int i = 0; i < outs_number; ++i) { auto dim = in_dims; @@ -55,6 +63,10 @@ bool SplitOp::InferShape() const { } } + if (param_.axis_tensor != nullptr) { + axis = param_.axis_tensor->data()[0]; + } + for (int j = 0; j < outs_dims.size(); ++j) { outs[j]->Resize(outs_dims[j]); } @@ -73,6 +85,21 @@ bool SplitOp::AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) { for (auto var : outs) { param_.output.push_back(scope->FindVar(var)->GetMutable()); } + std::vector input_arg_names = opdesc.InputArgumentNames(); + if (std::find(input_arg_names.begin(), input_arg_names.end(), "AxisTensor") != + input_arg_names.end()) { + auto args = opdesc.Input("AxisTensor"); + auto *var = scope->FindVar(args.front()); + param_.axis_tensor = var->GetMutable(); + } + if (std::find(input_arg_names.begin(), + input_arg_names.end(), + "SectionsTensorList") != input_arg_names.end()) { + auto args = opdesc.Input("SectionsTensorList"); + auto *var = scope->FindVar(args.front()); + param_.sections_tensor_list = + *(var->GetMutable>()); + } return true; } diff --git a/lite/operators/unsqueeze_op.cc b/lite/operators/unsqueeze_op.cc index 8db14d0660a7b48b94406e35908f0636a53d57f6..39b275b7b55f79f2c8daf16ab0a6acd2e76e8b48 100644 --- a/lite/operators/unsqueeze_op.cc +++ b/lite/operators/unsqueeze_op.cc @@ -66,10 +66,7 @@ bool UnsqueezeOp::InferShape() const { std::vector final_axes; auto axes = param_.axes; auto *axes_tensor = param_.axes_tensor; - std::vector axes_tensor_vct; - if 
(param_.axes_tensor_vct) { - axes_tensor_vct = *(param_.axes_tensor_vct); - } + auto axes_tensor_vct = param_.axes_tensor_vct; if (!axes.empty()) { final_axes = axes; @@ -79,7 +76,7 @@ bool UnsqueezeOp::InferShape() const { axes_tensor_data + axes_tensor->numel()); } else if (!axes_tensor_vct.empty()) { for (int i = 0; i < axes_tensor_vct.size(); i++) { - final_axes.push_back(axes_tensor_vct[i].data()[0]); + final_axes.push_back(axes_tensor_vct[i]->data()[0]); } } else { LOG(FATAL) << "Input axis error"; @@ -114,16 +111,12 @@ bool UnsqueezeOp::AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) { if (opdesc.HasInput("AxesTensorList") && opdesc.Input("AxesTensorList").size() > 0) { auto args = opdesc.Input("AxesTensorList"); - /* for (auto arg : args) { auto *var = scope->FindVar(arg); if (var != nullptr) { param_.axes_tensor_vct.push_back(var->GetMutable()); } } - */ - auto *var = scope->FindVar(args.front()); - param_.axes_tensor_vct = var->GetMutable>(); } CHECK(param_.X) << "Input(X) of UnsqueezeOp should not be null."; CHECK(param_.Out) << "Output(Out) of UnsqueezeOp should not be null."; diff --git a/lite/operators/var_conv_2d_op.cc b/lite/operators/var_conv_2d_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..5c7fe374fc90b20ee44df3d1619f44109b7387c0 --- /dev/null +++ b/lite/operators/var_conv_2d_op.cc @@ -0,0 +1,79 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
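The unsqueeze_op change above goes back to reading AxesTensorList as a list of scalar tensors, taking axes_tensor_vct[i]->data()[0] from each entry, with the axes attribute and AxesTensor taking precedence. A standalone sketch of that resolution order, with plain vectors standing in for the tensors (an assumption made only for illustration).

// Sketch of the axis-resolution order used by UnsqueezeOp::InferShape():
// 1) the "axes" attribute, 2) a packed AxesTensor, 3) AxesTensorList, where
// each list entry is a scalar tensor and only its first element is read.
// The real op reports "Input axis error" if all three are empty.
#include <cassert>
#include <vector>

std::vector<int> ResolveAxes(const std::vector<int>& attr_axes,
                             const std::vector<int>& axes_tensor,
                             const std::vector<std::vector<int>>& axes_list) {
  if (!attr_axes.empty()) return attr_axes;
  if (!axes_tensor.empty()) return axes_tensor;
  std::vector<int> out;
  for (const auto& t : axes_list) out.push_back(t[0]);  // t->data()[0]
  return out;
}

int main() {
  auto axes = ResolveAxes({}, {}, {{1}, {3}});
  assert(axes.size() == 2 && axes[0] == 1 && axes[1] == 3);
  return 0;
}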
+ +#include "lite/operators/var_conv_2d_op.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators { + +bool VarConv2dOp::CheckShape() const { + auto x_dims = param_.X->dims(); + CHECK_EQ(x_dims.size(), 2) << "The rank of X(Input) can't be less than 2."; + auto w_dims = param_.W->dims(); + CHECK_EQ(w_dims.size(), 2) << "W should be 2-D tensor"; + CHECK_EQ(w_dims[0], param_.output_channel) + << "W dim[0] should be equal to OutputChannel"; + CHECK_EQ(w_dims[1], param_.input_channel * param_.kernel_h * param_.kernel_w) + << "W dim[1] should be equal to InputChannel * KernelH * KernelW"; + LoD x_lod = param_.X->lod(); + CHECK_EQ(x_lod.empty(), false) << "The Input(X) must hold lod info."; + // CHECK_GE(x_lod.size(), 1) << "The Input(X)'s lod info is corrupted."; + CHECK_GE(x_lod.size(), 3) << "The Input(X)'s lod info is corrupted."; + CHECK_EQ(x_dims[0], static_cast(x_lod[0].back())) + << "The Input(X)'s lod info mismatches the actual tensor shape."; + // LoD row_lod = param_.ROW->lod(); + // CHECK_EQ(row_lod.empty(), false) << "The Input(ROW) must hold lod info."; + // LoD col_lod = param_.COLUMN->lod(); + // CHECK_EQ(col_lod.empty(), false) << "The Input(COLUMN) must hold lod + // info."; + return true; +} + +bool VarConv2dOp::InferShape() const { return true; } + +bool VarConv2dOp::AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) { + param_.X = const_cast( + &scope->FindVar(opdesc.Input("X").front())->Get()); + // param_.ROW = const_cast( + // &scope->FindVar(opdesc.Input("ROW").front())->Get()); + // param_.COLUMN = const_cast( + // &scope->FindVar(opdesc.Input("COLUMN").front())->Get()); + param_.W = const_cast( + &scope->FindVar(opdesc.Input("W").front())->Get()); + param_.Out = + scope->FindVar(opdesc.Output("Out").front())->GetMutable(); + param_.Col = + scope->FindVar(opdesc.Output("Col").front())->GetMutable(); + CHECK(param_.X) << "X(Input) of VarConv2dOP should not be null."; + // CHECK(param_.ROW) << "Input(ROW) of VarConv2dOP should not be null."; + // CHECK(param_.COLUMN) << "Input(COLUMN) of VarConv2dOP should not be null."; + CHECK(param_.W) << "W(Input) of VarConv2dOP should not be null."; + CHECK(param_.Out) << "Out(Output) of VarConv2dOP should not be null."; + CHECK(param_.Col) << "Col(Output) of VarConv2dOP should not be null."; + param_.output_channel = opdesc.GetAttr("OutputChannel"); + param_.input_channel = opdesc.GetAttr("InputChannel"); + param_.kernel_h = opdesc.GetAttr("KernelH"); + param_.kernel_w = opdesc.GetAttr("KernelW"); + param_.stride_h = opdesc.GetAttr("StrideH"); + param_.stride_w = opdesc.GetAttr("StrideW"); + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(var_conv_2d, paddle::lite::operators::VarConv2dOp); diff --git a/lite/operators/var_conv_2d_op.h b/lite/operators/var_conv_2d_op.h new file mode 100644 index 0000000000000000000000000000000000000000..ce6309419cc582c2f93250dd6e8e59c04a951f91 --- /dev/null +++ b/lite/operators/var_conv_2d_op.h @@ -0,0 +1,41 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include "lite/core/op_lite.h" +#include "lite/core/scope.h" + +namespace paddle { +namespace lite { +namespace operators { + +class VarConv2dOp : public OpLite { + public: + VarConv2dOp() {} + explicit VarConv2dOp(const std::string &op_type) : OpLite(op_type) {} + bool CheckShape() const override; + bool InferShape() const override; + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + std::string DebugString() const override { return "var_conv_2d"; } + + private: + mutable VarConv2DParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/lite/tests/cv/image_convert_test.cc b/lite/tests/cv/image_convert_test.cc index 7c0f867fae4bca1957ba1610db5f40b8c8dbabdf..eefd30f74f570f64d1b5617c9dddc836086394b1 100644 --- a/lite/tests/cv/image_convert_test.cc +++ b/lite/tests/cv/image_convert_test.cc @@ -17,8 +17,8 @@ #include #include #include "lite/core/context.h" +#include "lite/core/profile/timer.h" #include "lite/tests/cv/cv_basic.h" -#include "lite/tests/utils/timer.h" #include "lite/utils/cv/paddle_image_preprocess.h" DEFINE_int32(cluster, 3, "cluster id"); @@ -46,7 +46,7 @@ typedef paddle::lite::utils::cv::ImagePreprocess ImagePreprocess; typedef paddle::lite_api::Tensor Tensor_api; typedef paddle::lite::Tensor Tensor; -using paddle::lite::Timer; +using paddle::lite::profile::Timer; void fill_tensor_host_rand(uint8_t* dio, int64_t size) { uint seed = 256; @@ -285,8 +285,8 @@ void test_img(const std::vector& cluster_id, ImagePreprocess image_preprocess(srcFormat, dstFormat, tparam); for (int i = 0; i < test_iter; ++i) { - t1.clear(); - t1.start(); + t1.Reset(); + t1.Start(); LOG(INFO) << "image convert saber compute"; // 方法一: image_preprocess.imageCovert(src, lite_dst); @@ -329,8 +329,8 @@ void test_img(const std::vector& cluster_id, means, scales); - t1.end(); - double tdiff = t1.get_average_ms(); + t1.Stop(); + double tdiff = t1.LapTimes().Avg(); to += tdiff; if (tdiff < min_time) { min_time = tdiff; diff --git a/lite/tests/kernels/CMakeLists.txt b/lite/tests/kernels/CMakeLists.txt index 02d40ce6cc4acfd582fb148f10aafc654ee13be0..549fabab5a20b7757585eacdc2fe4db64e0aaadf 100644 --- a/lite/tests/kernels/CMakeLists.txt +++ b/lite/tests/kernels/CMakeLists.txt @@ -39,6 +39,8 @@ if(LITE_BUILD_EXTRA) lite_cc_test(test_kernel_anchor_generator_compute SRCS anchor_generator_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) #lite_cc_test(test_kernel_generate_proposals_compute SRCS generate_proposals_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) #lite_cc_test(test_kernel_roi_align_compute SRCS roi_align_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_search_aligned_mat_mul_compute SRCS search_aligned_mat_mul_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + 
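image_convert_test.cc above switches from the old test timer (clear/start/end/get_average_ms) to lite::profile::Timer (Reset/Start/Stop/LapTimes().Avg()). The measurement pattern itself is unchanged; the standalone std::chrono sketch below mirrors it with a hypothetical LapTimer type, so it can be compiled outside the Lite tree, and is not the actual profile::Timer implementation.

// The same reset / start / stop / average-lap pattern as the profile::Timer
// calls in the test, written with std::chrono so it runs standalone.
#include <chrono>
#include <cstdio>
#include <vector>

struct LapTimer {  // hypothetical stand-in, not lite::profile::Timer
  std::vector<double> laps_ms;
  std::chrono::steady_clock::time_point t0;
  void Reset() { laps_ms.clear(); }
  void Start() { t0 = std::chrono::steady_clock::now(); }
  void Stop() {
    auto dt = std::chrono::steady_clock::now() - t0;
    laps_ms.push_back(std::chrono::duration<double, std::milli>(dt).count());
  }
  double Avg() const {
    double s = 0;
    for (double v : laps_ms) s += v;
    return laps_ms.empty() ? 0 : s / laps_ms.size();
  }
};

int main() {
  LapTimer t;
  t.Reset();
  for (int i = 0; i < 10; ++i) {
    t.Start();
    volatile double acc = 0;
    for (int j = 0; j < 100000; ++j) acc += j * 0.5;  // stand-in workload
    t.Stop();
  }
  std::printf("avg lap: %.3f ms\n", t.Avg());
  return 0;
}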
lite_cc_test(test_kernel_search_seq_fc_compute SRCS search_seq_fc_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) endif() lite_cc_test(test_kernel_pad2d_compute SRCS pad2d_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(test_kernel_prior_box_compute SRCS prior_box_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) diff --git a/lite/tests/kernels/bilinear_interp_compute_test.cc b/lite/tests/kernels/bilinear_interp_compute_test.cc index 0779caf67aac907e6f8ccde8b3e65d413cf65db9..7ea4293f080df31d9bb05b4998b5b2d9ae7d5a47 100644 --- a/lite/tests/kernels/bilinear_interp_compute_test.cc +++ b/lite/tests/kernels/bilinear_interp_compute_test.cc @@ -22,6 +22,27 @@ namespace paddle { namespace lite { +inline std::vector get_new_shape( + std::vector list_new_shape_tensor) { + // get tensor from + std::vector vec_new_shape; + for (size_t i = 0; i < list_new_shape_tensor.size(); ++i) { + auto tensor = list_new_shape_tensor[i]; + vec_new_shape.push_back(static_cast(*(tensor->data()))); + } + return vec_new_shape; +} + +template +inline std::vector get_new_data_from_tensor(const Tensor* new_data_tensor) { + std::vector vec_new_data; + auto* new_data = new_data_tensor->data(); + lite::Tensor cpu_starts_tensor; + vec_new_data = + std::vector(new_data, new_data + new_data_tensor->dims().production()); + return vec_new_data; +} + template void resize_bilinear_align(std::vector inputs, lite::Tensor* output) { @@ -149,6 +170,9 @@ class BilinearInterpComputeTester : public arena::TestCase { protected: // common attributes for this op. std::string input0_ = "X"; + std::string sizetensor0_ = "SizeTensor0"; + std::string sizetensor1_ = "SizeTensor1"; + std::string input_scale_ = "Scale"; std::string input1_ = "OutSize"; std::string output_ = "Out"; @@ -162,6 +186,8 @@ class BilinearInterpComputeTester : public arena::TestCase { std::string interp_method_ = "Bilinear"; DDim _dims0_{{1, 1, 16, 16}}; DDim _dims1_{{2}}; + DDim sizetensor_dims_{{1}}; + DDim scale_dims_{{1}}; public: BilinearInterpComputeTester(const Place& place, @@ -190,33 +216,48 @@ class BilinearInterpComputeTester : public arena::TestCase { if (outsize_height_ > 0 && outsize_width_ > 0) { inputs.emplace_back(scope->FindTensor(input1_)); } + std::vector SizeTensor; + if (outsize_height_ > 0 && outsize_width_ > 0) { + SizeTensor.emplace_back(scope->FindTensor(sizetensor0_)); + SizeTensor.emplace_back(scope->FindTensor(sizetensor1_)); + } + const lite::Tensor* input_scale = scope->FindTensor(input_scale_); + float scale = height_scale_; + int in_h = inputs[0]->dims()[2]; + int in_w = inputs[0]->dims()[3]; + if (SizeTensor.size() > 0) { + auto new_size = get_new_shape(SizeTensor); + out_height_ = new_size[0]; + out_width_ = new_size[1]; + } else { + auto scale_tensor = input_scale; + if (scale_tensor != nullptr) { + auto scale_data = get_new_data_from_tensor(scale_tensor); + scale = scale_data[0]; + } + if (scale > 0) { + out_height_ = static_cast(in_h * scale); + out_width_ = static_cast(in_w * scale); + } + if (inputs.size() > 1) { + auto out_size = inputs[1]; + auto out_size_data = get_new_data_from_tensor(out_size); + out_height_ = out_size_data[0]; + out_width_ = out_size_data[1]; + } + } + height_scale_ = scale; + width_scale_ = scale; + if (out_width_ != -1 && out_height_ != -1) { height_scale_ = static_cast(out_height_ / inputs[0]->dims()[2]); width_scale_ = static_cast(out_width_ / 
inputs[0]->dims()[3]); } auto* outputs = scope->NewTensor(output_); CHECK(outputs); - if (inputs.size() > 1) { - auto outsize_data = inputs[1]->data(); - int h_out = outsize_data[0]; // HW - int w_out = outsize_data[1]; // HW - int num_cout = inputs[0]->dims()[0]; - int c_cout = inputs[0]->dims()[1]; - outputs->Resize({num_cout, c_cout, h_out, w_out}); - } else { - int out_h; - int out_w; - if (-1 == out_height_ && -1 == out_width_) { - out_h = inputs[0]->dims()[2] * height_scale_; - out_w = inputs[0]->dims()[3] * width_scale_; - } else { - out_h = out_height_; - out_w = out_width_; - } - outputs->Resize( - {inputs[0]->dims()[0], inputs[0]->dims()[1], out_h, out_w}); - } - + int num_cout = inputs[0]->dims()[0]; + int c_cout = inputs[0]->dims()[1]; + outputs->Resize({num_cout, c_cout, out_height_, out_width_}); if (align_corners_) { resize_bilinear_align(inputs, outputs); } else { @@ -229,6 +270,10 @@ class BilinearInterpComputeTester : public arena::TestCase { op_desc->SetInput("X", {input0_}); if (outsize_height_ > 0 && outsize_width_ > 0) { op_desc->SetInput("OutSize", {input1_}); + op_desc->SetInput("SizeTensor", {sizetensor0_, sizetensor1_}); + } + if (height_scale_ > 0) { + op_desc->SetInput("Scale", {input_scale_}); } op_desc->SetOutput("Out", {output_}); op_desc->SetAttr("scale", height_scale_); @@ -250,6 +295,19 @@ class BilinearInterpComputeTester : public arena::TestCase { data1[0] = outsize_height_; data1[1] = outsize_width_; SetCommonTensor(input1_, _dims1_, data1.data()); + + std::vector sizetensor_data(1); + sizetensor_data[0] = outsize_height_; + SetCommonTensor(sizetensor0_, sizetensor_dims_, sizetensor_data.data()); + + sizetensor_data[0] = outsize_width_; + SetCommonTensor(sizetensor1_, sizetensor_dims_, sizetensor_data.data()); + } + + if (height_scale_ > 0) { + std::vector scale_data(1); + scale_data[0] = height_scale_; + SetCommonTensor(input_scale_, scale_dims_, scale_data.data()); } } }; diff --git a/lite/tests/kernels/conv2d_transpose_compute_test.cc b/lite/tests/kernels/conv2d_transpose_compute_test.cc index a287f0bb6610921e0f048fcc4d46f8729dd177c1..6c348076ba82490a599b9916826c59dabf91f870 100644 --- a/lite/tests/kernels/conv2d_transpose_compute_test.cc +++ b/lite/tests/kernels/conv2d_transpose_compute_test.cc @@ -31,8 +31,10 @@ void col2im(const Dtype* data_col, const int width, const int kernel_h, const int kernel_w, - const int pad_h, - const int pad_w, + const int pad_h0, + const int pad_h1, + const int pad_w0, + const int pad_w1, const int stride_h, const int stride_w, const int dilation_h, @@ -40,19 +42,22 @@ void col2im(const Dtype* data_col, Dtype* data_im) { memset(data_im, 0, height * width * channels * sizeof(float)); const int output_h = - (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; + (height + pad_h0 + pad_h1 - (dilation_h * (kernel_h - 1) + 1)) / + stride_h + + 1; const int output_w = - (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; + (width + pad_w0 + pad_w1 - (dilation_w * (kernel_w - 1) + 1)) / stride_w + + 1; const int channel_size = height * width; for (int channel = channels; channel--; data_im += channel_size) { for (int kernel_row = 0; kernel_row < kernel_h; kernel_row++) { for (int kernel_col = 0; kernel_col < kernel_w; kernel_col++) { - int input_row = -pad_h + kernel_row * dilation_h; + int input_row = -pad_h0 + kernel_row * dilation_h; for (int output_rows = output_h; output_rows; output_rows--) { if (!is_a_ge_zero_and_a_lt_b(input_row, height)) { data_col += output_w; } else { - int 
input_col = -pad_w + kernel_col * dilation_w; + int input_col = -pad_w0 + kernel_col * dilation_w; for (int output_col = output_w; output_col; output_col--) { if (is_a_ge_zero_and_a_lt_b(input_col, width)) { data_im[input_row * width + input_col] += *data_col; @@ -104,6 +109,34 @@ void fill_bias_relu(float* tensor, } } +inline void UpdatePaddingAndDilation(std::vector* paddings, + std::vector* dilations, + const std::vector& strides, + const std::string padding_algorithm, + const DDim data_dims, + const std::vector& ksize) { + // when padding_desc is "VALID" or "SAME" + if (padding_algorithm == "SAME") { + for (size_t i = 0; i < strides.size(); ++i) { + int out_size = (data_dims[i + 2] + strides[i] - 1) / strides[i]; + int pad_sum = std::max( + (out_size - 1) * strides[i] + ksize[i + 2] - data_dims[i + 2], + (int64_t)0); + int pad_0 = pad_sum / 2; + int pad_1 = pad_sum - pad_0; + // pad + *(paddings->begin() + i * 2) = pad_0; + *(paddings->begin() + i * 2 + 1) = pad_1; + // dilation + *(dilations->begin() + i) = 1; + } + } else if (padding_algorithm == "VALID") { + for (auto& it : *paddings) { + it = 0; + } + } +} + template static void basic_gemm(int m, int n, @@ -172,8 +205,10 @@ bool deconv_basic(const Dtype1* din, int stride_h, int dila_w, int dila_h, - int pad_w, - int pad_h, + int pad_w0, + int pad_w1, + int pad_h0, + int pad_h1, bool flag_bias, bool flag_relu) { int m = chout * kernel_w * kernel_h / group; @@ -193,8 +228,9 @@ bool deconv_basic(const Dtype1* din, int group_size_coldata = m * n; int group_size_weights = chin * chout * kernel_w * kernel_h / (group * group); bool flag_1x1s1p1 = (kernel_w == 1) && (kernel_h == 1) && (stride_h == 1) && - (stride_w == 1) && (pad_w == 1) && (pad_h == 1) && - (dila_w == 1) && (dila_h == 1); + (stride_w == 1) && (pad_w0 == 0) && (pad_h0 == 0) && + (pad_w1 == 0) && (pad_h1 == 0) && (dila_w == 1) && + (dila_h == 1); for (int i = 0; i < num; ++i) { const Dtype1* din_batch = din + i * chin * hin * win; @@ -204,7 +240,7 @@ bool deconv_basic(const Dtype1* din, if (flag_1x1s1p1) { col_data = dout_batch; } - memset(col_data, 0, sizeof(Dtype2) * group_size_coldata); + memset(col_data, 0, sizeof(Dtype2) * group_size_coldata * group); for (int g = 0; g < group; ++g) { const Dtype1* din_group = din_batch + g * group_size_in; const Dtype1* weights_group = weights + g * group_size_weights; @@ -230,8 +266,10 @@ bool deconv_basic(const Dtype1* din, wout, kernel_h, kernel_w, - pad_h, - pad_w, + pad_h0, + pad_h1, + pad_w0, + pad_w1, stride_h, stride_w, dila_h, @@ -253,9 +291,10 @@ class Conv2DTransposeComputeTester : public arena::TestCase { std::string output_ = "out"; std::string filter_ = "filter"; std::string bias_ = "bias"; + std::string padding_algorithm_ = ""; std::vector strides_{1, 1}; - std::vector paddings_{0, 0}; + std::vector paddings_{0, 0, 0, 0}; int groups_{1}; std::vector dilations_{1, 1}; bool flag_relu_{false}; @@ -280,9 +319,13 @@ class Conv2DTransposeComputeTester : public arena::TestCase { bool flag_relu, int dilation, int stride, - int padding, + int pad_h0, + int pad_h1, + int pad_w0, + int pad_w1, int ks, - int groups) + int groups, + std::string padding_algorithm) : TestCase(place, alias) { n_ = n; ic_ = ic; @@ -291,20 +334,29 @@ class Conv2DTransposeComputeTester : public arena::TestCase { iw_ = iw; ks_ = ks; flag_bias_ = flag_bias; - + padding_algorithm_ = padding_algorithm; strides_ = std::vector({stride, stride}); - paddings_ = std::vector({padding, padding}); - groups_ = groups; + paddings_ = std::vector({pad_h0, pad_h1, pad_w0, 
pad_w1}); dilations_ = std::vector({dilation, dilation}); + groups_ = groups; flag_relu_ = flag_relu; } void RunBaseline(Scope* scope) override { auto* out = scope->NewTensor(output_); CHECK(out); - int oh = (ih_ - 1) * strides_[0] - 2 * paddings_[0] + + auto* x = scope->FindTensor(x_); + auto input_dim = x->dims(); + std::vector ksize({1, 1, ks_, ks_}); + UpdatePaddingAndDilation(&paddings_, + &dilations_, + strides_, + padding_algorithm_, + input_dim, + ksize); + int oh = (ih_ - 1) * strides_[0] - paddings_[0] - paddings_[1] + dilations_[0] * (ks_ - 1) + 1; - int ow = (iw_ - 1) * strides_[1] - 2 * paddings_[1] + + int ow = (iw_ - 1) * strides_[1] - paddings_[2] - paddings_[3] + dilations_[1] * (ks_ - 1) + 1; CHECK(oh > 0 || ow > 0); @@ -313,7 +365,6 @@ class Conv2DTransposeComputeTester : public arena::TestCase { out->Resize(output_dims); auto* output_data = out->mutable_data(); - auto* x = scope->FindTensor(x_); const auto* x_data = x->data(); auto* filter = scope->FindTensor(filter_); const auto* filter_data = filter->data(); @@ -341,8 +392,10 @@ class Conv2DTransposeComputeTester : public arena::TestCase { strides_[0], dilations_[1], dilations_[0], - paddings_[1], + paddings_[2], + paddings_[3], paddings_[0], + paddings_[1], flag_bias_, flag_relu_); } @@ -360,6 +413,7 @@ class Conv2DTransposeComputeTester : public arena::TestCase { op_desc->SetInput("Bias", {bias_}); } op_desc->SetAttr("fuse_relu", flag_relu_); + op_desc->SetAttr("padding_algorithm", padding_algorithm_); } void PrepareData() override { @@ -402,49 +456,66 @@ TEST(conv2d_transpose, precision) { LOG(INFO) << "test conv2d_transpose op"; #ifdef LITE_WITH_ARM Place place(TARGET(kARM)); - for (auto n : {1, 2}) { + for (auto n : {2}) { for (auto ic : {1, 4 /*, 128*/}) { for (auto oc : {1, 4 /*, 128*/}) { LOG(INFO) << "n:" << n << ",ic:" << ic << ",oc:" << oc; - for (auto ih : {8, 16 /*, 56 , 112, 224, 512*/}) { + for (auto ih : {8, 8 /*, 56 , 112, 224, 512*/}) { for (auto iw : {8, 16 /*, 56, 112, 224, 512*/}) { for (auto flag_bias : {false, true}) { for (auto flag_relu : {false, true}) { for (auto dilation : {1, 2}) { for (auto stride : {1, 2}) { - for (auto padding : {0, 2}) { - for (auto ks : {2, 5}) { - for (auto group : {1, 2}) { - // obtain shape - // LOG(INFO) << "n:" << n << ",ic:" << ic << ",oc:" << - // oc - // << ",ih:" << ih << ",iw:" << iw - // << ",flag_bias:" << flag_bias - // << ",flag_relu:" << flag_relu - // << ",dila:" << dilation - // << ",stride:" << stride - // << ",padding:" << padding << ",ks:" << ks - // << ",group:" << group; - if (ic % group != 0 || oc % group != 0) { - group = 1; + for (auto pad_h0 : {0, 1}) { + for (auto pad_h1 : {0, 1}) { + for (auto pad_w0 : {0, 1}) { + for (auto pad_w1 : {0, 1}) { + for (auto ks : {1, 4}) { + for (auto group : {1, 2}) { + for (auto padding_algorithm : + {"", "SAME", "VALID"}) { + // obtain shape + // LOG(INFO) << "n:" << n << ",ic:" << ic << + // ",oc:" << + // oc + // << ",ih:" << ih << ",iw:" << iw + // << ",flag_bias:" << flag_bias + // << ",flag_relu:" << flag_relu + // << ",dila:" << dilation + // << ",stride:" << stride + // << ",padding:" << padding << + // ",ks:" << ks + // << ",group:" << group; + if (ic % group != 0 || oc % group != 0) { + group = 1; + } + std::unique_ptr tester( + new Conv2DTransposeComputeTester( + place, + "def", + n, + ic, + oc, + ih, + iw, + flag_bias, + flag_relu, + dilation, + stride, + pad_h0, + pad_h1, + pad_w0, + pad_w1, + ks, + group, + padding_algorithm)); + arena::Arena arena( + std::move(tester), place, 2e-5); + 
arena.TestPrecision(); + } + } + } } - std::unique_ptr tester( - new Conv2DTransposeComputeTester(place, - "def", - n, - ic, - oc, - ih, - iw, - flag_bias, - flag_relu, - dilation, - stride, - padding, - ks, - group)); - arena::Arena arena(std::move(tester), place, 2e-5); - arena.TestPrecision(); } } } diff --git a/lite/tests/kernels/fill_constant_compute_test.cc b/lite/tests/kernels/fill_constant_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..e211582b04d279b535f0d3873a9b0c537e375a60 --- /dev/null +++ b/lite/tests/kernels/fill_constant_compute_test.cc @@ -0,0 +1,178 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include "lite/api/paddle_use_kernels.h" +#include "lite/api/paddle_use_ops.h" +#include "lite/core/arena/framework.h" + +namespace paddle { +namespace lite { + +class FillConstantComputeTester : public arena::TestCase { + protected: + // common attributes for this op. + std::string out_ = "out"; + int dtype_{static_cast(VarDescAPI::VarDataType::FP32)}; + std::vector shape_{}; + std::string shape_tensor_ = "ShapeTensor"; + std::vector shape_tensor_list_; + bool is_use_shape_tensor_{false}; + bool is_use_shape_tensor_list_{false}; + + float value_{0.0f}; + // useless for x86, keep it for compatibility + bool force_cpu_{false}; + // DDim shape_tensor_data{{5, 3}}; + std::vector shape_tensor_data; + DDim shape_test{{1, 2}}; + + public: + FillConstantComputeTester(const Place& place, + const std::string& alias, + std::vector shape, + const bool is_use_shape_tensor, + const bool is_use_shape_tensor_list, + float value, + bool force_cpu) + : TestCase(place, alias) { + shape_ = shape; + value_ = value; + force_cpu_ = force_cpu; + is_use_shape_tensor_ = is_use_shape_tensor; + is_use_shape_tensor_list_ = is_use_shape_tensor_list; + + for (int i = 0; i < shape_test.size(); i++) { + shape_tensor_data.push_back(i + 1); + } + } + + void RunBaseline(Scope* scope) override { + auto* out = scope->NewTensor(out_); + DDim output_dims{shape_}; + if (is_use_shape_tensor_) { + auto* temp_shape = scope->FindTensor(shape_tensor_); + auto* shape_data = temp_shape->data(); + auto vec_shape = + std::vector(shape_data, shape_data + temp_shape->numel()); + output_dims.ConstructFrom(vec_shape); + } + if (is_use_shape_tensor_list_) { + std::vector vec_shape; + for (int i = 0; i < shape_tensor_list_.size(); i++) { + auto* temp_shape = scope->FindTensor(shape_tensor_list_[i]); + vec_shape.push_back(*temp_shape->data()); + } + + output_dims.ConstructFrom(vec_shape); + } + out->Resize(output_dims); + + auto* output_data = out->mutable_data(); + for (int i = 0; i < out->numel(); i++) { + output_data[i] = value_; + } + } + + void PrepareOpDesc(cpp::OpDesc* op_desc) { + LOG(INFO) << "PrepareOpDesc"; + + op_desc->SetType("fill_constant"); + op_desc->SetAttr("dtype", dtype_); + op_desc->SetAttr("shape", shape_); + op_desc->SetAttr("value", value_); + op_desc->SetAttr("force_cpu", force_cpu_); 
+ if (is_use_shape_tensor_) { + op_desc->SetInput("ShapeTensor", {shape_tensor_}); + } + if (is_use_shape_tensor_list_) { + // std::vector shape_tensor_list_; + for (int i = 0; i < shape_test.size(); ++i) { + shape_tensor_list_.push_back("shape_tensor_list_" + std::to_string(i)); + } + op_desc->SetInput("ShapeTensorList", {shape_tensor_list_}); + } + op_desc->SetOutput("Out", {out_}); + } + + void PrepareData() override { + if (is_use_shape_tensor_) { + // std::vector temp = x_dims_.data(); + // int64_t* data = temp.data(); + SetCommonTensor(shape_tensor_, shape_test, shape_tensor_data.data()); + } + if (is_use_shape_tensor_list_) { + Scope& scope_ = this->scope(); + for (int i = 0; i < shape_test.size(); ++i) { + auto* tensor = + scope_.NewTensor("shape_tensor_list_" + std::to_string(i)); + tensor->Resize(DDim({1})); + auto* d = tensor->mutable_data(); + d[0] = shape_tensor_data[i]; + } + } + } +}; + +TEST(fill_constant, precision) { + LOG(INFO) << "test fill_constant op, kARM"; +#ifdef LITE_WITH_ARM + Place place(TARGET(kARM)); + std::vector shape{1, 2}; + + for (int dtype : {static_cast(VarDescAPI::VarDataType::INT32)}) { + for (float value : {1, 2}) { + for (bool is_use_shape_tensor_list : {false, true}) { + for (bool is_use_shape_tensor : {false, true}) { + if (is_use_shape_tensor && is_use_shape_tensor_list) break; + LOG(INFO) << "value:" << value + << ", is_use_shape_tensor:" << is_use_shape_tensor + << ", is_use_shape_tensor_list:" + << is_use_shape_tensor_list; + + std::unique_ptr tester( + new FillConstantComputeTester(place, + "def", + shape, + is_use_shape_tensor, + is_use_shape_tensor_list, + value, + false)); + arena::Arena arena(std::move(tester), place, 2e-5); + arena.TestPrecision(); + } + } + } + } +#endif + +#ifdef LITE_WITH_X86 + Place place(TARGET(kX86)); + LOG(INFO) << "test concate op, x86"; + for (int axis : {1, 2}) { + for (bool is_use_axis_tensor : {false, true}) { + LOG(INFO) << "axis:" << axis + << ", is_use_axis_tensor:" << is_use_axis_tensor; + std::unique_ptr tester( + new ConcateComputeTester(place, "def", axis, is_use_axis_tensor)); + arena::Arena arena(std::move(tester), place, 2e-5); + arena.TestPrecision(); + } + } + +#endif +} + +} // namespace lite +} // namespace paddle diff --git a/lite/tests/kernels/lrn_compute_test.cc b/lite/tests/kernels/lrn_compute_test.cc index 9ee43c5c60b4703f64e7a2575ec15ba59b618052..e306155514e7423dfcfccb3d7103050b50f9fdbe 100644 --- a/lite/tests/kernels/lrn_compute_test.cc +++ b/lite/tests/kernels/lrn_compute_test.cc @@ -158,7 +158,7 @@ class LrnComputeTester : public arena::TestCase { op_desc->SetOutput("Out", {output_}); op_desc->SetAttr("alpha", alpha_); op_desc->SetAttr("beta", beta_); - op_desc->SetAttr("local_size", local_size_); + op_desc->SetAttr("n", local_size_); op_desc->SetAttr("k", k_); op_desc->SetAttr("norm_region", norm_region_); } diff --git a/lite/tests/kernels/nearest_interp_compute_test.cc b/lite/tests/kernels/nearest_interp_compute_test.cc index 3256ababcab639cd31ef51294a890b7fbdb54d5d..894959f9090cce8a391c146815f550d5f42adcb6 100644 --- a/lite/tests/kernels/nearest_interp_compute_test.cc +++ b/lite/tests/kernels/nearest_interp_compute_test.cc @@ -22,6 +22,28 @@ namespace paddle { namespace lite { +inline std::vector get_new_shape( + const std::vector& list_new_shape_tensor) { + // get tensor from + std::vector vec_new_shape; + for (size_t i = 0; i < list_new_shape_tensor.size(); ++i) { + auto tensor = list_new_shape_tensor[i]; + vec_new_shape.push_back(static_cast(*tensor->data())); + } + + return 
vec_new_shape; +} + +template +inline std::vector get_new_data_from_tensor(const Tensor* new_data_tensor) { + std::vector vec_new_data; + auto* new_data = new_data_tensor->data(); + lite::Tensor cpu_starts_tensor; + vec_new_data = + std::vector(new_data, new_data + new_data_tensor->dims().production()); + return vec_new_data; +} + template void resize_nearest_align(std::vector inputs, lite::Tensor* output, @@ -73,6 +95,9 @@ class NearestInterpComputeTester : public arena::TestCase { protected: // common attributes for this op. std::string input0_ = "X"; + std::string sizetensor0_ = "SizeTensor0"; + std::string sizetensor1_ = "SizeTensor1"; + std::string input_scale_ = "Scale"; std::string input1_ = "OutSize"; std::string output_ = "Out"; @@ -85,6 +110,8 @@ class NearestInterpComputeTester : public arena::TestCase { DDim dims_{{2, 3}}; DDim _dims0_{{2, 3, 3, 2}}; DDim _dims1_{{2}}; + DDim sizetensor_dims_{{1}}; + DDim scale_dims_{{1}}; public: NearestInterpComputeTester(const Place& place, @@ -112,24 +139,54 @@ class NearestInterpComputeTester : public arena::TestCase { inputs.emplace_back(scope->FindTensor(input0_)); inputs.emplace_back(scope->FindTensor(input1_)); - auto outsize_data = inputs[1]->data(); + std::vector SizeTensor(2); + SizeTensor[0] = scope->FindTensor(sizetensor0_); + SizeTensor[1] = scope->FindTensor(sizetensor1_); + const lite::Tensor* input_scale = scope->FindTensor(input_scale_); + + float scale = height_scale_; + int in_h = inputs[0]->dims()[2]; + int in_w = inputs[0]->dims()[3]; + if (SizeTensor.size() > 0) { + auto new_size = get_new_shape(SizeTensor); + out_height_ = new_size[0]; + out_width_ = new_size[1]; + } else { + auto scale_tensor = input_scale; + if (scale_tensor != nullptr) { + auto scale_data = get_new_data_from_tensor(scale_tensor); + scale = scale_data[0]; + } + if (scale > 0) { + out_height_ = static_cast(in_h * scale); + out_width_ = static_cast(in_w * scale); + } + auto out_size = inputs[1]; + if (out_size != nullptr) { + auto out_size_data = get_new_data_from_tensor(out_size); + out_height_ = out_size_data[0]; + out_width_ = out_size_data[1]; + } + } + height_scale_ = scale; + width_scale_ = scale; + if (out_width_ != -1 && out_height_ != -1) { height_scale_ = static_cast(out_height_ / inputs[0]->dims()[2]); width_scale_ = static_cast(out_width_ / inputs[0]->dims()[3]); } - if (inputs.size() > 1) { - int h_out = outsize_data[0]; // HW - int w_out = outsize_data[1]; // HW - int num_cout = outputs->dims()[0]; - int c_cout = outputs->dims()[1]; - outputs->Resize({num_cout, c_cout, h_out, w_out}); - } + int num_cout = inputs[0]->dims()[0]; + int c_cout = inputs[0]->dims()[1]; + outputs->Resize({num_cout, c_cout, out_height_, out_width_}); + resize_nearest_align(inputs, outputs, align_corners_); } void PrepareOpDesc(cpp::OpDesc* op_desc) { op_desc->SetType("nearest_interp"); op_desc->SetInput("X", {input0_}); + op_desc->SetInput("SizeTensor", {sizetensor0_, sizetensor1_}); + op_desc->SetInput("Scale", {input_scale_}); op_desc->SetInput("OutSize", {input1_}); op_desc->SetOutput("Out", {output_}); op_desc->SetAttr("scale", height_scale_); @@ -152,6 +209,17 @@ class NearestInterpComputeTester : public arena::TestCase { SetCommonTensor(input0_, _dims0_, data0.data()); SetCommonTensor(input1_, _dims1_, data1.data()); + + std::vector sizetensor_data(1); + sizetensor_data[0] = out_height_; + SetCommonTensor(sizetensor0_, sizetensor_dims_, sizetensor_data.data()); + + sizetensor_data[0] = out_width_; + SetCommonTensor(sizetensor1_, sizetensor_dims_, 
sizetensor_data.data()); + + std::vector scale_data(1); + scale_data[0] = height_scale_; + SetCommonTensor(input_scale_, scale_dims_, scale_data.data()); } }; diff --git a/lite/tests/kernels/search_aligned_mat_mul_compute_test.cc b/lite/tests/kernels/search_aligned_mat_mul_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..cb824931ae9ae1e8472dd3f368a04c24e72aa291 --- /dev/null +++ b/lite/tests/kernels/search_aligned_mat_mul_compute_test.cc @@ -0,0 +1,220 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include "lite/api/paddle_use_kernels.h" +#include "lite/api/paddle_use_ops.h" +#include "lite/core/arena/framework.h" +#include "lite/tests/utils/fill_data.h" +#include "lite/tests/utils/naive_math_impl.h" + +namespace paddle { +namespace lite { + +class SearchAlignedMatMulComputeTester : public arena::TestCase { + protected: + // common attributes for this op. + std::string x_ = "X"; + std::string y_ = "Y"; + bool x_transpose_; + bool y_transpose_; + float alpha_; + std::string out_ = "Out"; + DDim x_dims_; + DDim y_dims_; + LoD x_lod_; + LoD y_lod_; + + public: + SearchAlignedMatMulComputeTester(const Place& place, + const std::string& alias, + bool x_transpose, + bool y_transpose, + float alpha, + const DDim& x_dims, + const DDim& y_dims, + const LoD& x_lod, + const LoD& y_lod) + : TestCase(place, alias), + x_transpose_(x_transpose), + y_transpose_(y_transpose), + alpha_(alpha), + x_dims_(x_dims), + y_dims_(y_dims), + x_lod_(x_lod), + y_lod_(y_lod) {} + + void RunBaseline(Scope* scope) override { + auto x = scope->FindTensor(x_); + auto y = scope->FindTensor(y_); + CHECK(x); + CHECK(y); + const auto x_data = x->data(); + const auto y_data = y->data(); + auto out = scope->NewTensor(out_); + CHECK(out); + + const auto x_dims = x->dims(); + const auto y_dims = y->dims(); + const auto& x_lod = x->lod(); + const auto& y_lod = y->lod(); + const auto& x_lod_0 = x_lod[0]; + const auto& y_lod_0 = y_lod[0]; + + int seq_num = x_lod_0.size() - 1; + int x_inner_size = x_dims[1]; + int y_inner_size = y_dims[1]; + int x_batch_size = x_lod_0[1]; + int y_batch_size = y_lod_0[1]; + int M = x_transpose_ ? x_inner_size : x_batch_size; + int N = y_transpose_ ? y_batch_size : y_inner_size; + int X_K = x_transpose_ ? x_batch_size : x_inner_size; + int Y_K = y_transpose_ ? y_inner_size : y_batch_size; + CHECK_EQ(X_K, Y_K) << "K of Input(X) and Input(Y) is not equal"; + int K = X_K; + int x_stride = x_batch_size * x_inner_size; + int y_stride = y_batch_size * y_inner_size; + int out_stride = M * N; + int lda = x_transpose_ ? M : K; + int ldb = y_transpose_ ? 
K : N; + int ldc = N; + + LoD out_lod; + std::vector out_lod_0(seq_num + 1); + out_lod_0[0] = 0; + for (int i = 0; i < seq_num; i++) { + out_lod_0[i + 1] = out_lod_0[i] + M; + } + out_lod.push_back(out_lod_0); + DDim out_dims( + {static_cast(out_lod_0.back()), static_cast(N)}); + out->set_lod(out_lod); + out->Resize(out_dims); + + auto out_data = out->mutable_data(); + for (int i = 0; i < seq_num; i++) { + basic_gemm(x_transpose_, + y_transpose_, + M, + N, + K, + alpha_, + x_data + i * x_stride, + lda, + y_data + i * y_stride, + ldb, + 0, + out_data + i * out_stride, + ldc, + nullptr, + false, + false); + } + } + + void PrepareOpDesc(cpp::OpDesc* op_desc) { + op_desc->SetType("search_aligned_mat_mul"); + op_desc->SetInput("X", {x_}); + op_desc->SetInput("Y", {y_}); + op_desc->SetOutput("Out", {out_}); + op_desc->SetAttr("transpose_X", x_transpose_); + op_desc->SetAttr("transpose_Y", y_transpose_); + op_desc->SetAttr("alpha", alpha_); + } + + void PrepareData() override { + std::vector x_data(x_dims_.production()); + std::vector y_data(y_dims_.production()); + fill_data_rand(x_data.data(), -1.f, 1.f, x_dims_.production()); + fill_data_rand(y_data.data(), -1.f, 1.f, y_dims_.production()); + SetCommonTensor(x_, x_dims_, x_data.data(), x_lod_); + SetCommonTensor(y_, y_dims_, y_data.data(), y_lod_); + } +}; + +void test_search_aligned_mat_mul(Place place) { + for (int seq_num : {1, 2}) { + for (int x_batch_size : {1, 3}) { + for (int x_inner_size : {1, 5}) { + for (int out_inner_size : {1, 4}) { + for (bool x_transpose : {true, false}) { + for (bool y_transpose : {true, false}) { + for (float alpha : {1., 2.}) { + // infer x_dims and y_dims + int y_batch_size; + int y_inner_size; + if (x_transpose) { + if (y_transpose) { + y_batch_size = out_inner_size; + y_inner_size = x_batch_size; + } else { + y_batch_size = x_batch_size; + y_inner_size = out_inner_size; + } + } else { + if (y_transpose) { + y_batch_size = out_inner_size; + y_inner_size = x_inner_size; + } else { + y_batch_size = x_inner_size; + y_inner_size = out_inner_size; + } + } + std::vector x_lod_0(seq_num + 1); + std::vector y_lod_0(seq_num + 1); + x_lod_0[0] = 0; + y_lod_0[0] = 0; + for (int i = 0; i < seq_num; i++) { + x_lod_0[i + 1] = x_lod_0[i] + x_batch_size; + y_lod_0[i + 1] = y_lod_0[i] + y_batch_size; + } + LoD x_lod; + LoD y_lod; + x_lod.push_back(x_lod_0); + y_lod.push_back(y_lod_0); + DDim x_dims({static_cast(x_lod_0.back()), + static_cast(x_inner_size)}); + DDim y_dims({static_cast(y_lod_0.back()), + static_cast(y_inner_size)}); + + std::unique_ptr tester( + new SearchAlignedMatMulComputeTester(place, + "def", + x_transpose, + y_transpose, + alpha, + x_dims, + y_dims, + x_lod, + y_lod)); + arena::Arena arena(std::move(tester), place, 5e-4); + arena.TestPrecision(); + } + } + } + } + } + } + } +} + +TEST(SearchAlignedMatMul, precision) { +#ifdef LITE_WITH_X86 + Place place(TARGET(kX86)); + test_search_aligned_mat_mul(place); +#endif +} + +} // namespace lite +} // namespace paddle diff --git a/lite/tests/kernels/search_seq_fc_compute_test.cc b/lite/tests/kernels/search_seq_fc_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..988d3a27cc238a57ae18de81a2dad619f8b4a9f0 --- /dev/null +++ b/lite/tests/kernels/search_seq_fc_compute_test.cc @@ -0,0 +1,177 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include "lite/api/paddle_use_kernels.h" +#include "lite/api/paddle_use_ops.h" +#include "lite/core/arena/framework.h" +#include "lite/tests/utils/fill_data.h" +#include "lite/tests/utils/naive_math_impl.h" + +namespace paddle { +namespace lite { + +class SearchSeqFcOPTest : public arena::TestCase { + protected: + // common attributes for this op. + std::string x_ = "x"; + std::string w_ = "w"; + std::string b_ = "b"; + std::string out_ = "out"; + DDim x_dims_; + DDim w_dims_; + DDim b_dims_; + LoD x_lod_; + bool has_bias_; + int out_size_; + + public: + SearchSeqFcOPTest(const Place& place, + const std::string& alias, + DDim x_dims, + DDim w_dims, + DDim b_dims, + LoD x_lod, + bool has_bias, + int out_size) + : TestCase(place, alias), + x_dims_(x_dims), + w_dims_(w_dims), + b_dims_(b_dims), + x_lod_(x_lod), + has_bias_(has_bias), + out_size_(out_size) {} + + void RunBaseline(Scope* scope) override { + auto x = scope->FindTensor(x_); + auto w = scope->FindTensor(w_); + CHECK(x); + CHECK(w); + auto out = scope->NewTensor(out_); + CHECK(out); + + const auto x_data = x->data(); + const auto w_data = w->data(); + const auto x_dims = x->dims(); + const auto w_dims = w->dims(); + const auto& x_lod = x->lod(); + CHECK_EQ(x_dims.size(), 2) << "The Input(X) should be 2-D tensor."; + CHECK(!x_lod.empty()) << "The Input(X) must hold lod info."; + const auto& x_lod_0 = x_lod[0]; + CHECK_GE(x_lod_0.size(), 2) << "The Input(X)'s lod info is corrupted."; + CHECK_EQ(x_dims[0], static_cast(x_lod_0.back())) + << "The Input(X)'s lod info mismatches the actual tensor shape."; + CHECK_EQ(w_dims.size(), 2) << "W should be 2-D tensor."; + CHECK_EQ(x_dims[1], w_dims[1]) << "Wrong shape: x_dims[1] != w_dims[1]"; + CHECK_EQ(w_dims[0], out_size_) << "Wrong shape: w_dims[0] != out_size"; + + const float* b_data = nullptr; + if (has_bias_) { + auto b = scope->FindTensor(b_); + CHECK(b); + auto b_dims = b->dims(); + CHECK_EQ(b_dims.size(), 1) << "b should be 1-D tensor."; + CHECK_EQ(b_dims[0], w_dims[0]) << "Wrong shape: b_dims[0] != w_dims[0]"; + b_data = b->data(); + } + + out->set_lod(x_lod); + out->Resize({x_dims[0], w_dims[0]}); + + int M = x_dims[0]; + int K = x_dims[1]; + int N = w_dims[0]; + auto out_data = out->mutable_data(); + basic_gemm(false, + true, + M, + N, + K, + 1.f, + x_data, + K, + w_data, + K, + 0, + out_data, + N, + nullptr, + false, + false); + if (b_data != nullptr) { + for (int i = 0; i < M; i++) { + for (int j = 0; j < N; j++) { + out_data[i * N + j] += b_data[j]; + } + } + } + } + + void PrepareOpDesc(cpp::OpDesc* op_desc) { + op_desc->SetType("search_seq_fc"); + op_desc->SetInput("X", {x_}); + op_desc->SetInput("W", {w_}); + if (has_bias_) { + op_desc->SetInput("b", {b_}); + } + op_desc->SetAttr("has_bias", has_bias_); + op_desc->SetAttr("out_size", out_size_); + op_desc->SetOutput("Out", {out_}); + } + + void PrepareData() override { + std::vector x_data(x_dims_.production()); + std::vector w_data(w_dims_.production()); + fill_data_rand(x_data.data(), -1.f, 1.f, x_dims_.production()); + fill_data_rand(w_data.data(), -1.f, 1.f, w_dims_.production()); + 
SetCommonTensor(x_, x_dims_, x_data.data(), x_lod_); + SetCommonTensor(w_, w_dims_, w_data.data()); + if (has_bias_) { + std::vector b_data(b_dims_.production()); + fill_data_rand(b_data.data(), -1.f, 1.f, b_dims_.production()); + SetCommonTensor(b_, b_dims_, b_data.data()); + } + } +}; + +void test_search_seq_fc(Place place) { + for (auto x_lod_0 : {std::vector({0, 1, 3}), + std::vector({0, 3, 4, 5})}) { + for (auto feature_size : {2, 9}) { + for (auto out_size : {3, 5}) { + for (auto has_bias : {true, false}) { + DDim x_dims({static_cast(x_lod_0.back()), feature_size}); + DDim w_dims({out_size, feature_size}); + DDim b_dims({has_bias ? out_size : 0}); + LoD x_lod; + x_lod.push_back(x_lod_0); + std::unique_ptr tester(new SearchSeqFcOPTest( + place, "def", x_dims, w_dims, b_dims, x_lod, has_bias, out_size)); + arena::Arena arena(std::move(tester), place, 6e-5); + arena.TestPrecision(); + } + } + } + } +} + +TEST(SearchSeqFcOP, precision) { +#ifdef LITE_WITH_X86 + Place place(TARGET(kX86)); + test_search_seq_fc(place); +#endif +} + +} // namespace lite +} // namespace paddle diff --git a/lite/tests/kernels/shuffle_channel_compute_test.cc b/lite/tests/kernels/shuffle_channel_compute_test.cc index d0e9912e65de7a0aae10f83c31ba4ab5bbd50890..66123625fae606a9022537698cdc1032abb13451 100644 --- a/lite/tests/kernels/shuffle_channel_compute_test.cc +++ b/lite/tests/kernels/shuffle_channel_compute_test.cc @@ -12,12 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -// TODO(zhengxi) -// shuffle_channel_test can pass on local compilation -// while on ci compilation, the test will be killed immediately. - -/* -#include +// TODO(FrostML): shaffle_channel cannot pass on CI, but ok in local machine. +// Open this. +/*#include #include "lite/api/paddle_use_kernels.h" #include "lite/api/paddle_use_ops.h" #include "lite/core/arena/framework.h" @@ -30,8 +27,8 @@ class ShuffleChannelComputeTester : public arena::TestCase { // common attributes for this op. 
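+  // Default test case enlarged; the channel count (16) is kept divisible by
+  // group_ (4), as shuffle_channel requires.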
std::string input_ = "X"; std::string output_ = "Out"; - int group_ = 1; - DDim dims_{{1, 2}}; + int group_ = 4; + DDim dims_{{10, 16, 4, 4}}; public: ShuffleChannelComputeTester(const Place& place, @@ -87,7 +84,7 @@ class ShuffleChannelComputeTester : public arena::TestCase { }; void test_shuffle_channel(Place place) { - for (int group : {1, 2, 3}) { + for (int group : {4}) { std::unique_ptr tester( new ShuffleChannelComputeTester(place, "def", group)); arena::Arena arena(std::move(tester), place, 2e-5); diff --git a/lite/tests/kernels/unsqueeze_compute_test.cc b/lite/tests/kernels/unsqueeze_compute_test.cc index 9bbf39b70d5aab67454233efb909f932e0b5bec1..22e475672a87dafee29d68a3824e4f8ac0c15615 100644 --- a/lite/tests/kernels/unsqueeze_compute_test.cc +++ b/lite/tests/kernels/unsqueeze_compute_test.cc @@ -125,8 +125,7 @@ class UnsqueezeComputeTester : public arena::TestCase { for (size_t i = 0; i < axes_.size(); i++) { name = name + std::to_string(i); axes_tensor_list_.push_back(name); - std::vector in_data = {axes_[i]}; - SetCommonTensor(name, DDim({1}), in_data.data()); + SetCommonTensor(name, DDim({1}), &axes_[i]); } } } @@ -230,7 +229,7 @@ void test_unsqueeze(Place place) { for (int C : {3}) { for (int H : {1}) { for (int W : {5}) { - for (int input_axes_flag : {1, 2}) { + for (int input_axes_flag : {1, 2, 3}) { LOG(INFO) << N << " " << C << " " << H << " " << W << " " << input_axes_flag; std::unique_ptr tester( diff --git a/lite/tests/math/CMakeLists.txt b/lite/tests/math/CMakeLists.txt index 87324375e09cf2633c1ec2a489b9205666754cc1..7dd4f522dbc0f10e8cfb7d19e95da4354ac4b779 100644 --- a/lite/tests/math/CMakeLists.txt +++ b/lite/tests/math/CMakeLists.txt @@ -1,9 +1,17 @@ if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA) AND (LITE_WITH_X86 OR LITE_WITH_ARM)) lite_cc_test(sgemm_compute_test SRCS sgemm_compute_test.cc DEPS arena_framework ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(sgemv_compute_test SRCS sgemv_compute_test.cc DEPS arena_framework ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(sgemm_c4_compute_test SRCS sgemm_c4_compute_test.cc DEPS arena_framework ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(gemm_int8_compute_test SRCS gemm_int8_compute_test.cc DEPS arena_framework ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(gemv_int8_compute_test SRCS gemv_int8_compute_test.cc DEPS arena_framework ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(conv_compute_test SRCS conv_compute_test.cc DEPS arena_framework ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(conv_transpose_compute_test SRCS conv_transpose_compute_test.cc DEPS arena_framework ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(conv_int8_compute_test SRCS conv_int8_compute_test.cc DEPS arena_framework ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(pool_compute_test SRCS pool_compute_test.cc DEPS arena_framework ${arm_kernels} ${lite_ops} ${host_kernels}) + + if(LITE_BUILD_EXTRA) + lite_cc_test(layout_compute_test SRCS layout_compute_test.cc DEPS arena_framework ${arm_kernels} ${lite_ops} ${host_kernels}) + endif() + + endif() diff --git a/lite/tests/math/conv_compute_test.cc b/lite/tests/math/conv_compute_test.cc index bfb74e6e0a6f5ea0cae199f1c7dc5f1c03e83363..bda50d35633c853ba6e8c8695d0175da38865d1c 100644 --- a/lite/tests/math/conv_compute_test.cc +++ b/lite/tests/math/conv_compute_test.cc @@ -15,10 +15,10 @@ #include #include #include "lite/core/context.h" +#include "lite/core/profile/timer.h" #include "lite/operators/op_params.h" 
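+// Benchmark timing now uses the Timer from lite/core/profile/timer.h (included
+// above); the legacy lite/tests/utils/timer.h helper is removed below.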
#include "lite/tests/utils/naive_math_impl.h" #include "lite/tests/utils/tensor_utils.h" -#include "lite/tests/utils/timer.h" #ifdef LITE_WITH_ARM #include "lite/kernels/arm/conv_compute.h" @@ -59,26 +59,30 @@ DEFINE_bool(flag_bias, true, "with bias"); typedef paddle::lite::DDim DDim; typedef paddle::lite::Tensor Tensor; typedef paddle::lite::operators::ConvParam ConvParam; -using paddle::lite::Timer; +using paddle::lite::profile::Timer; DDim compute_out_dim(const DDim& dim_in, const paddle::lite::operators::ConvParam& param) { DDim dim_out = dim_in; + auto paddings = *param.paddings; + auto dilations = *param.dilations; dim_out[1] = param.filter->dims()[0]; auto kernel_h = param.filter->dims()[2]; auto kernel_w = param.filter->dims()[3]; auto h = dim_in[2]; auto w = dim_in[3]; - int dila_h = param.dilations[0]; - int dila_w = param.dilations[1]; - int pad_h = param.paddings[0]; - int pad_w = param.paddings[1]; + int dila_h = dilations[0]; + int dila_w = dilations[1]; + int pad_top = paddings[0]; + int pad_bottom = paddings[1]; + int pad_left = paddings[2]; + int pad_right = paddings[3]; int stride_h = param.strides[0]; int stride_w = param.strides[1]; auto kernel_exten = dila_h * (kernel_h - 1) + 1; - auto hout = (h + 2 * pad_h - kernel_exten) / stride_h + 1; + auto hout = (h + pad_top + pad_bottom - kernel_exten) / stride_h + 1; kernel_exten = dila_w * (kernel_w - 1) + 1; - auto wout = (w + 2 * pad_w - kernel_exten) / stride_w + 1; + auto wout = (w + pad_left + pad_right - kernel_exten) / stride_w + 1; dim_out[2] = hout; dim_out[3] = wout; return dim_out; @@ -110,8 +114,8 @@ void test_conv_fp32(const std::vector& input_dims, param.bias->set_precision(PRECISION(kFloat)); } param.strides = strides; - param.paddings = pads; - param.dilations = dilas; + param.paddings = std::make_shared>(pads); + param.dilations = std::make_shared>(dilas); param.fuse_relu = flag_relu; param.groups = group; @@ -162,7 +166,7 @@ void test_conv_fp32(const std::vector& input_dims, param.output->Resize(dim_out); paddle::lite::fill_tensor_rand(*param.x, -1.f, 1.f); - // paddle::lite::fill_tensor_const(*param.x, 1.f); + // paddle::lite::fill_tensor_const(*param.x, 1.f); auto din = param.x->data(); Tensor tout_basic; @@ -189,7 +193,7 @@ void test_conv_fp32(const std::vector& input_dims, strides[0], dilas[1], dilas[0], - pads[1], + pads[2], pads[0], flag_bias, flag_relu); @@ -201,19 +205,19 @@ void test_conv_fp32(const std::vector& input_dims, /// compute Timer t0; for (int i = 0; i < FLAGS_repeats; ++i) { - t0.start(); + t0.Start(); conv.Launch(); - t0.end(); + t0.Stop(); } double gops = 2.0 * dim_out.production() * dim_in[1] * weight_dim[2] * weight_dim[3] / param.groups; LOG(INFO) << "conv fp32: input shape: " << dim_in << ", output shape" - << dim_out << ",running time, avg: " << t0.get_average_ms() - << ", min time: " << t0.get_min_time() + << dim_out << ",running time, avg: " << t0.LapTimes().Avg() + << ", min time: " << t0.LapTimes().Min() << ", total GOPS: " << 1e-9 * gops - << " GOPS, avg GOPs: " << 1e-6 * gops / t0.get_average_ms() - << " GOPs, max GOPs: " << 1e-6 * gops / t0.get_min_time(); + << " GOPS, avg GOPs: " << 1e-6 * gops / t0.LapTimes().Avg() + << " GOPs, max GOPs: " << 1e-6 * gops / t0.LapTimes().Min(); if (FLAGS_check_result) { double max_ratio = 0; @@ -235,7 +239,8 @@ void test_conv_fp32(const std::vector& input_dims, LOG(FATAL) << "test fp32 conv: input: " << dim_in << ", output: " << dim_out << ", weight dim: " << weight_dim - << ", pad: " << pads[0] << ", " << pads[1] + << ", pad: " << 
pads[0] << ", " << pads[1] << ", " + << pads[2] << ", " << pads[3] << ", stride: " << strides[0] << ", " << strides[1] << ", dila_: " << dilas[0] << ", " << dilas[1] << ", bias: " << (flag_bias ? "true" : "false") @@ -280,27 +285,33 @@ void test_conv_fp32(const std::vector& input_dims, TEST(TestConv3x3DW, test_conv3x3_depthwise) { if (FLAGS_basic_test) { for (auto& stride : {1, 2}) { - for (auto& pad : {0, 1}) { - for (auto& flag_bias : {false, true}) { - for (auto& flag_relu : {false, true}) { - for (auto& c : {1, 3, 5, 8, 16, 32}) { - std::vector dims; - DDim weights_dim({c, 1, 3, 3}); - for (auto& batch : {1, 2}) { - for (auto& h : {1, 3, 15, 19, 28, 32, 75}) { - dims.push_back(DDim({batch, c, h, h})); + for (auto& pad_left : {0, 1, 2}) { + for (auto& pad_right : {0, 1, 2}) { + for (auto& pad_top : {0, 1, 2}) { + for (auto& pad_bottom : {0, 1, 2}) { + for (auto& flag_bias : {false, true}) { + for (auto& flag_relu : {false, true}) { + for (auto& c : {1, 3, 5, 8, 16, 32}) { + std::vector dims; + DDim weights_dim({c, 1, 3, 3}); + for (auto& batch : {1, 2}) { + for (auto& h : {1, 3, 15, 19, 28, 32, 75}) { + dims.push_back(DDim({batch, c, h, h})); + } + } + test_conv_fp32(dims, + weights_dim, + c, + {stride, stride}, + {pad_top, pad_bottom, pad_left, pad_right}, + {1, 1}, + flag_bias, + flag_relu, + {1, 2, 4}, + {FLAGS_power_mode}); + } } } - test_conv_fp32(dims, - weights_dim, - c, - {stride, stride}, - {pad, pad}, - {1, 1}, - flag_bias, - flag_relu, - {1, 2, 4}, - {FLAGS_power_mode}); } } } @@ -329,7 +340,7 @@ TEST(TestConv5x5DW, test_conv5x5_depthwise) { weights_dim, c, {stride, stride}, - {pad, pad}, + {pad, pad, pad, pad}, {1, 1}, flag_bias, flag_relu, @@ -366,7 +377,7 @@ TEST(TestConv1x1s1, test_conv1x1s1) { weights_dim, g, {1, 1}, - {0, 0}, + {0, 0, 0, 0}, {1, 1}, flag_bias, flag_relu, @@ -386,26 +397,32 @@ TEST(TestConv3x3s1, test_conv_3x3s1) { if (FLAGS_basic_test) { for (auto& cin : {1, 3, 8, 32, 48}) { for (auto& cout : {1, 5, 8, 32, 48}) { - for (auto& pad : {1, 2}) { - for (auto& flag_bias : {false, true}) { - for (auto& flag_relu : {false, true}) { - std::vector dims; - DDim weights_dim({cout, cin, 3, 3}); - for (auto& batch : {1, 2}) { - for (auto& h : {1, 7, 19, 56, 32}) { - dims.push_back(DDim({batch, cin, h, h})); + for (auto& pad_left : {1, 2}) { + for (auto& pad_right : {1, 2}) { + for (auto& pad_top : {1, 2}) { + for (auto& pad_bottom : {1, 2}) { + for (auto& flag_bias : {false, true}) { + for (auto& flag_relu : {false, true}) { + std::vector dims; + DDim weights_dim({cout, cin, 3, 3}); + for (auto& batch : {1, 2}) { + for (auto& h : {1, 7, 19, 56, 32}) { + dims.push_back(DDim({batch, cin, h, h})); + } + } + test_conv_fp32(dims, + weights_dim, + 1, + {1, 1}, + {pad_top, pad_bottom, pad_left, pad_right}, + {1, 1}, + flag_bias, + flag_relu, + {1, 2, 4}, + {FLAGS_power_mode}); + } } } - test_conv_fp32(dims, - weights_dim, - 1, - {1, 1}, - {pad, pad}, - {1, 1}, - flag_bias, - flag_relu, - {1, 2, 4}, - {FLAGS_power_mode}); } } } @@ -420,26 +437,32 @@ TEST(TestConv3x3s2, test_conv_3x3s2) { if (FLAGS_basic_test) { for (auto& cin : {1, 3, 8, 32}) { for (auto& cout : {1, 5, 8, 32}) { - for (auto& pad : {1, 2}) { - for (auto& flag_bias : {false, true}) { - for (auto& flag_relu : {false, true}) { - std::vector dims; - DDim weights_dim({cout, cin, 3, 3}); - for (auto& batch : {1, 2}) { - for (auto& h : {1, 7, 19, 28, 75, 56, 32}) { - dims.push_back(DDim({batch, cin, h, h})); + for (auto& pad_left : {1, 2}) { + for (auto& pad_right : {1, 2}) { + for (auto& pad_top : {1, 2}) { + for 
(auto& pad_bottom : {1, 2}) { + for (auto& flag_bias : {false, true}) { + for (auto& flag_relu : {false, true}) { + std::vector dims; + DDim weights_dim({cout, cin, 3, 3}); + for (auto& batch : {1, 2}) { + for (auto& h : {1, 7, 19, 28, 75, 56, 32}) { + dims.push_back(DDim({batch, cin, h, h})); + } + } + test_conv_fp32(dims, + weights_dim, + 1, + {2, 2}, + {pad_top, pad_bottom, pad_left, pad_right}, + {1, 1}, + flag_bias, + flag_relu, + {1, 2, 4}, + {FLAGS_power_mode}); + } } } - test_conv_fp32(dims, - weights_dim, - 1, - {2, 2}, - {pad, pad}, - {1, 1}, - flag_bias, - flag_relu, - {1, 2, 4}, - {FLAGS_power_mode}); } } } @@ -458,30 +481,37 @@ TEST(TestConvRand, test_conv_rand) { for (auto& kw : {1, 2, 3}) { for (auto& kh : {1, 2, 3}) { for (auto& stride : {1, 2}) { - for (auto& pad : {0, 1, 2}) { - for (auto& dila : {1, 2}) { - for (auto& flag_bias : {false, true}) { - for (auto& flag_relu : {false, true}) { - if (cin % g != 0 || cout % g != 0) { - continue; - } - std::vector dims; - DDim weights_dim({cout, cin / g, kh, kw}); - for (auto& batch : {1, 2}) { - for (auto& h : {1, 3, 19, 32, 28}) { - dims.push_back(DDim({batch, cin, h, h})); + for (auto& pad_left : {0, 1, 2}) { + for (auto& pad_right : {0, 1, 2}) { + for (auto& pad_top : {0, 1, 2}) { + for (auto& pad_bottom : {0, 1, 2}) { + for (auto& dila : {1, 2}) { + for (auto& flag_bias : {false, true}) { + for (auto& flag_relu : {false, true}) { + if (cin % g != 0 || cout % g != 0) { + continue; + } + std::vector dims; + DDim weights_dim({cout, cin / g, kh, kw}); + for (auto& batch : {1, 2}) { + for (auto& h : {1, 3, 19, 32, 28}) { + dims.push_back(DDim({batch, cin, h, h})); + } + } + test_conv_fp32( + dims, + weights_dim, + g, + {stride, stride}, + {pad_top, pad_bottom, pad_left, pad_right}, + {dila, dila}, + flag_bias, + flag_relu, + {1, 2, 4}, + {FLAGS_power_mode}); + } } } - test_conv_fp32(dims, - weights_dim, - g, - {stride, stride}, - {pad, pad}, - {dila, dila}, - flag_bias, - flag_relu, - {1, 2, 4}, - {FLAGS_power_mode}); } } } @@ -510,7 +540,7 @@ TEST(TestConvCustom, test_conv_fp32_custom_size) { FLAGS_kernel_w}), FLAGS_group, {FLAGS_stride_h, FLAGS_stride_w}, - {FLAGS_pad_h, FLAGS_pad_w}, + {FLAGS_pad_h, FLAGS_pad_h, FLAGS_pad_w, FLAGS_pad_w}, {FLAGS_dila_h, FLAGS_dila_w}, FLAGS_flag_bias, FLAGS_flag_relu, diff --git a/lite/tests/math/conv_int8_compute_test.cc b/lite/tests/math/conv_int8_compute_test.cc index e15b7d22bc2a5859db73f21aa54b1bcdaabf4d2c..27c186d7ceffcaab3019cedf7c281c524be73e44 100644 --- a/lite/tests/math/conv_int8_compute_test.cc +++ b/lite/tests/math/conv_int8_compute_test.cc @@ -15,10 +15,10 @@ #include #include #include "lite/core/context.h" +#include "lite/core/profile/timer.h" #include "lite/operators/op_params.h" #include "lite/tests/utils/naive_math_impl.h" #include "lite/tests/utils/tensor_utils.h" -#include "lite/tests/utils/timer.h" #ifdef LITE_WITH_ARM #include "lite/kernels/arm/conv_compute.h" @@ -59,26 +59,26 @@ DEFINE_bool(flag_bias, true, "with bias"); typedef paddle::lite::DDim DDim; typedef paddle::lite::Tensor Tensor; typedef paddle::lite::operators::ConvParam ConvParam; -using paddle::lite::Timer; +using paddle::lite::profile::Timer; DDim compute_out_dim(const DDim& dim_in, const paddle::lite::operators::ConvParam& param) { + auto paddings = *param.paddings; + auto dilations = *param.dilations; DDim dim_out = dim_in; dim_out[1] = param.filter->dims()[0]; auto kernel_h = param.filter->dims()[2]; auto kernel_w = param.filter->dims()[3]; auto h = dim_in[2]; auto w = dim_in[3]; - int dila_h = 
param.dilations[0]; - int dila_w = param.dilations[1]; - int pad_h = param.paddings[0]; - int pad_w = param.paddings[1]; + int dila_h = dilations[0]; + int dila_w = dilations[1]; int stride_h = param.strides[0]; int stride_w = param.strides[1]; auto kernel_exten = dila_h * (kernel_h - 1) + 1; - auto hout = (h + 2 * pad_h - kernel_exten) / stride_h + 1; + auto hout = (h + paddings[0] + paddings[1] - kernel_exten) / stride_h + 1; kernel_exten = dila_w * (kernel_w - 1) + 1; - auto wout = (w + 2 * pad_w - kernel_exten) / stride_w + 1; + auto wout = (w + paddings[2] + paddings[3] - kernel_exten) / stride_w + 1; dim_out[2] = hout; dim_out[3] = wout; return dim_out; @@ -104,8 +104,8 @@ void get_conv_param(const DDim& dim_w, param->bias->set_precision(PRECISION(kFloat)); } param->strides = strides; - param->paddings = pads; - param->dilations = dila; + param->paddings = std::make_shared>(pads); + param->dilations = std::make_shared>(dila); param->fuse_relu = flag_relu; param->groups = g; @@ -288,7 +288,7 @@ void test_conv_int8(const std::vector& input_dims, strides[0], dilas[1], dilas[0], - pads[1], + pads[2], pads[0], flag_bias, flag_relu); @@ -309,30 +309,30 @@ void test_conv_int8(const std::vector& input_dims, /// compute fp32 output Timer t0; for (int i = 0; i < FLAGS_repeats; ++i) { - t0.start(); + t0.Start(); conv_int8_fp32.Launch(); - t0.end(); + t0.Stop(); } LOG(INFO) << "int8 conv, fp32 output: output shape" << dim_out - << ",running time, avg: " << t0.get_average_ms() - << ", min time: " << t0.get_min_time() + << ",running time, avg: " << t0.LapTimes().Avg() + << ", min time: " << t0.LapTimes().Min() << ", total GOPS: " << 1e-9 * gops - << " GOPS, avg GOPs: " << 1e-6 * gops / t0.get_average_ms() - << " GOPs, max GOPs: " << 1e-6 * gops / t0.get_min_time(); + << " GOPS, avg GOPs: " << 1e-6 * gops / t0.LapTimes().Avg() + << " GOPs, max GOPs: " << 1e-6 * gops / t0.LapTimes().Min(); /// compute int8 output - t0.clear(); + t0.Reset(); for (int i = 0; i < FLAGS_repeats; ++i) { - t0.start(); + t0.Start(); conv_int8_int8.Launch(); - t0.end(); + t0.Stop(); } LOG(INFO) << "int8 conv, int8 output: output shape" << dim_out - << ",running time, avg: " << t0.get_average_ms() - << ", min time: " << t0.get_min_time() + << ",running time, avg: " << t0.LapTimes().Avg() + << ", min time: " << t0.LapTimes().Min() << ", total GOPS: " << 1e-9 * gops - << " GOPS, avg GOPs: " << 1e-6 * gops / t0.get_average_ms() - << " GOPs, max GOPs: " << 1e-6 * gops / t0.get_min_time(); + << " GOPS, avg GOPs: " << 1e-6 * gops / t0.LapTimes().Avg() + << " GOPs, max GOPs: " << 1e-6 * gops / t0.LapTimes().Min(); /// compare result fp32 output if (FLAGS_check_result) { @@ -358,7 +358,8 @@ void test_conv_int8(const std::vector& input_dims, LOG(FATAL) << "test int8 conv, fp32 out: input: " << dim_in << ", output: " << dim_out << ", weight dim: " << weight_dim - << ", pad: " << pads[0] << ", " << pads[1] + << ", pad: " << pads[0] << ", " << pads[1] << ", " + << pads[2] << ", " << pads[3] << ", stride: " << strides[0] << ", " << strides[1] << ", dila_: " << dilas[0] << ", " << dilas[1] << ", bias: " << (flag_bias ? 
"true" : "false") @@ -416,7 +417,8 @@ void test_conv_int8(const std::vector& input_dims, LOG(FATAL) << "test int8 conv, int8 out: input: " << dim_in << ", output: " << dim_out << ", weight dim: " << weight_dim - << ", pad: " << pads[0] << ", " << pads[1] + << ", pad: " << pads[0] << ", " << pads[1] << ", " + << pads[2] << ", " << pads[3] << ", stride: " << strides[0] << ", " << strides[1] << ", dila_: " << dilas[0] << ", " << dilas[1] << ", bias: " << (flag_bias ? "true" : "false") @@ -428,9 +430,9 @@ void test_conv_int8(const std::vector& input_dims, } LOG(INFO) << "test int8 conv: input: " << dim_in << ", output: " << dim_out << ", weight dim: " << weight_dim - << ", pad: " << pads[0] << ", " << pads[1] - << ", stride: " << strides[0] << ", " << strides[1] - << ", dila_: " << dilas[0] << ", " << dilas[1] + << ", pad: " << pads[0] << ", " << pads[1] << ", " << pads[2] + << ", " << pads[3] << ", stride: " << strides[0] << ", " + << strides[1] << ", dila_: " << dilas[0] << ", " << dilas[1] << ", bias: " << (flag_bias ? "true" : "false") << ", relu: " << (flag_relu ? "true" : "false") << ", threads: " << th << ", power_mode: " << cls @@ -473,7 +475,7 @@ TEST(TestConv3x3DWInt8, test_conv3x3_depthwise) { weights_dim, c, {stride, stride}, - {pad, pad}, + {pad, pad, pad, pad}, {1, 1}, flag_bias, flag_relu, @@ -507,7 +509,7 @@ TEST(TestConv5x5DWInt8, test_conv5x5_depthwise) { weights_dim, c, {stride, stride}, - {pad, pad}, + {pad, pad, pad, pad}, {1, 1}, flag_bias, flag_relu, @@ -544,7 +546,7 @@ TEST(TestConv1x1s1Int8, test_conv1x1s1) { weights_dim, g, {1, 1}, - {0, 0}, + {0, 0, 0, 0}, {1, 1}, flag_bias, flag_relu, @@ -564,26 +566,32 @@ TEST(TestConv3x3s1Int8, test_conv_3x3s1) { if (FLAGS_basic_test) { for (auto& cin : {1, 3, 8, 32, 48}) { for (auto& cout : {1, 5, 8, 32, 48}) { - for (auto& pad : {1, 2}) { - for (auto& flag_bias : {false, true}) { - for (auto& flag_relu : {false, true}) { - std::vector dims; - DDim weights_dim({cout, cin, 3, 3}); - for (auto& batch : {1, 2}) { - for (auto& h : {1, 7, 19, 56, 32}) { - dims.push_back(DDim({batch, cin, h, h})); + for (auto& pad_top : {1, 2}) { + for (auto& pad_bottom : {1, 2}) { + for (auto& pad_left : {1, 2}) { + for (auto& pad_right : {1, 2}) { + for (auto& flag_bias : {false, true}) { + for (auto& flag_relu : {false, true}) { + std::vector dims; + DDim weights_dim({cout, cin, 3, 3}); + for (auto& batch : {1, 2}) { + for (auto& h : {1, 7, 19, 56, 32}) { + dims.push_back(DDim({batch, cin, h, h})); + } + } + test_conv_int8(dims, + weights_dim, + 1, + {1, 1}, + {pad_top, pad_bottom, pad_left, pad_right}, + {1, 1}, + flag_bias, + flag_relu, + {1, 2, 4}, + {FLAGS_power_mode}); + } } } - test_conv_int8(dims, - weights_dim, - 1, - {1, 1}, - {pad, pad}, - {1, 1}, - flag_bias, - flag_relu, - {1, 2, 4}, - {FLAGS_power_mode}); } } } @@ -598,26 +606,32 @@ TEST(TestConv3x3s2Int8, test_conv_3x3s2) { if (FLAGS_basic_test) { for (auto& cin : {1, 3, 8, 32}) { for (auto& cout : {1, 5, 8, 32}) { - for (auto& pad : {1, 2}) { - for (auto& flag_bias : {false, true}) { - for (auto& flag_relu : {false, true}) { - std::vector dims; - DDim weights_dim({cout, cin, 3, 3}); - for (auto& batch : {1, 2}) { - for (auto& h : {1, 7, 19, 28, 75, 56, 32}) { - dims.push_back(DDim({batch, cin, h, h})); + for (auto& pad_top : {1, 2}) { + for (auto& pad_bottom : {1, 2}) { + for (auto& pad_left : {1, 2}) { + for (auto& pad_right : {1, 2}) { + for (auto& flag_bias : {false, true}) { + for (auto& flag_relu : {false, true}) { + std::vector dims; + DDim weights_dim({cout, cin, 3, 3}); + for 
(auto& batch : {1, 2}) { + for (auto& h : {1, 7, 19, 28, 75, 56, 32}) { + dims.push_back(DDim({batch, cin, h, h})); + } + } + test_conv_int8(dims, + weights_dim, + 1, + {2, 2}, + {pad_top, pad_bottom, pad_left, pad_right}, + {1, 1}, + flag_bias, + flag_relu, + {1, 2, 4}, + {FLAGS_power_mode}); + } } } - test_conv_int8(dims, - weights_dim, - 1, - {2, 2}, - {pad, pad}, - {1, 1}, - flag_bias, - flag_relu, - {1, 2, 4}, - {FLAGS_power_mode}); } } } @@ -636,30 +650,37 @@ TEST(TestConvRandInt8, test_conv_rand) { for (auto& kw : {1, 2, 3}) { for (auto& kh : {1, 2, 3}) { for (auto& stride : {1, 2}) { - for (auto& pad : {0, 1, 2}) { - for (auto& dila : {1, 2}) { - for (auto& flag_bias : {false, true}) { - for (auto& flag_relu : {false, true}) { - if (cin % g != 0 || cout % g != 0) { - continue; - } - std::vector dims; - DDim weights_dim({cout, cin / g, kh, kw}); - for (auto& batch : {1, 2}) { - for (auto& h : {1, 3, 19, 32, 28}) { - dims.push_back(DDim({batch, cin, h, h})); + for (auto& pad_top : {0, 1, 2}) { + for (auto& pad_bottom : {0, 1, 2}) { + for (auto& pad_left : {0, 1, 2}) { + for (auto& pad_right : {0, 1, 2}) { + for (auto& dila : {1, 2}) { + for (auto& flag_bias : {false, true}) { + for (auto& flag_relu : {false, true}) { + if (cin % g != 0 || cout % g != 0) { + continue; + } + std::vector dims; + DDim weights_dim({cout, cin / g, kh, kw}); + for (auto& batch : {1, 2}) { + for (auto& h : {1, 3, 19, 32, 28}) { + dims.push_back(DDim({batch, cin, h, h})); + } + } + test_conv_int8( + dims, + weights_dim, + g, + {stride, stride}, + {pad_top, pad_bottom, pad_left, pad_right}, + {dila, dila}, + flag_bias, + flag_relu, + {1, 2, 4}, + {FLAGS_power_mode}); + } } } - test_conv_int8(dims, - weights_dim, - g, - {stride, stride}, - {pad, pad}, - {dila, dila}, - flag_bias, - flag_relu, - {1, 2, 4}, - {FLAGS_power_mode}); } } } @@ -688,7 +709,7 @@ TEST(TestConvCustomInt8, test_conv_custom_size) { FLAGS_kernel_w}), FLAGS_group, {FLAGS_stride_h, FLAGS_stride_w}, - {FLAGS_pad_h, FLAGS_pad_w}, + {FLAGS_pad_h, FLAGS_pad_h, FLAGS_pad_w, FLAGS_pad_w}, {FLAGS_dila_h, FLAGS_dila_w}, FLAGS_flag_bias, FLAGS_flag_relu, diff --git a/lite/tests/math/conv_transpose_compute_test.cc b/lite/tests/math/conv_transpose_compute_test.cc index e0da07a53462cf902107efc0b6daaeef819f3288..398e745d94bfa71aa8fa2ced227b7add8b24087e 100644 --- a/lite/tests/math/conv_transpose_compute_test.cc +++ b/lite/tests/math/conv_transpose_compute_test.cc @@ -15,10 +15,10 @@ #include #include #include "lite/core/context.h" +#include "lite/core/profile/timer.h" #include "lite/operators/op_params.h" #include "lite/tests/utils/naive_math_impl.h" #include "lite/tests/utils/tensor_utils.h" -#include "lite/tests/utils/timer.h" #ifdef LITE_WITH_ARM #include "lite/kernels/arm/conv_transpose_compute.h" @@ -59,17 +59,19 @@ DEFINE_bool(flag_bias, false, "with bias"); typedef paddle::lite::DDim DDim; typedef paddle::lite::Tensor Tensor; typedef paddle::lite::operators::ConvParam ConvParam; -using paddle::lite::Timer; +using paddle::lite::profile::Timer; DDim compute_out_dim(const DDim& dim_in, const paddle::lite::operators::ConvParam& param) { auto filter_dims = param.filter->dims(); DDim output_shape = dim_in; output_shape[1] = filter_dims[1] * param.groups; + auto paddings = *param.paddings; + auto dilations = *param.dilations; for (int i = 0; i < 2; i++) { - int kernel_extent = param.dilations[i] * (filter_dims[i + 2] - 1) + 1; + int kernel_extent = dilations[i] * (filter_dims[i + 2] - 1) + 1; int output_len = (dim_in[i + 2] - 1) * param.strides[i] + 
kernel_extent - - 2 * param.paddings[i]; + (paddings[2 * i] + paddings[2 * i + 1]); output_shape[i + 2] = output_len; } return output_shape; @@ -101,19 +103,19 @@ void test_conv_transpose_fp32(const std::vector& input_dims, param.bias->set_precision(PRECISION(kFloat)); } param.strides = strides; - param.paddings = pads; - param.dilations = dilas; + param.paddings = std::make_shared>(pads); + param.dilations = std::make_shared>(dilas); param.fuse_relu = flag_relu; param.groups = group; param.output = new Tensor; param.output->set_precision(PRECISION(kFloat)); - // paddle::lite::fill_tensor_rand(*param.filter, -1.f, 1.f); - paddle::lite::fill_tensor_const(*param.filter, 1.f); + paddle::lite::fill_tensor_rand(*param.filter, -1.f, 1.f); + // paddle::lite::fill_tensor_const(*param.filter, 1.f); if (flag_bias) { - // paddle::lite::fill_tensor_rand(*param.bias, -1.f, 1.f); - paddle::lite::fill_tensor_const(*param.bias, 1.f); + paddle::lite::fill_tensor_rand(*param.bias, -1.f, 1.f); + // paddle::lite::fill_tensor_const(*param.bias, 1.f); } Tensor tmp_weights; tmp_weights.Resize(weight_dim); @@ -128,21 +130,8 @@ void test_conv_transpose_fp32(const std::vector& input_dims, new paddle::lite::KernelContext); auto& ctx = ctx1->As(); ctx.SetRunMode(static_cast(cls), th); - /// set param and context - for (auto& dim_in : input_dims) { - param.x->Resize(dim_in); - DDim out_tmp_dims = compute_out_dim(dim_in, param); - if (out_tmp_dims[2] < 1 || out_tmp_dims[3] < 1) { - continue; - } - param.output->Resize(out_tmp_dims); - break; - } conv_t.SetParam(param); conv_t.SetContext(std::move(ctx1)); - /// prepare for run - conv_t.PrepareForRun(); - for (auto& dim_in : input_dims) { CHECK_EQ(weight_dim[0], dim_in[1]) << "input channel must equal to weights channel"; @@ -152,9 +141,11 @@ void test_conv_transpose_fp32(const std::vector& input_dims, } param.x->Resize(dim_in); param.output->Resize(dim_out); - - // paddle::lite::fill_tensor_rand(*param.x, -1.f, 1.f); - paddle::lite::fill_tensor_const(*param.x, 1.f); + param.filter->CopyDataFrom(tmp_weights); + // prepare for run + conv_t.PrepareForRun(); + paddle::lite::fill_tensor_rand(*param.x, -1.f, 1.f); + // paddle::lite::fill_tensor_const(*param.x, 1.f); auto din = param.x->data(); Tensor tout_basic; @@ -182,8 +173,10 @@ void test_conv_transpose_fp32(const std::vector& input_dims, strides[0], dilas[1], dilas[0], - pads[1], + pads[2], + pads[3], pads[0], + pads[1], flag_bias, flag_relu); } @@ -194,19 +187,19 @@ void test_conv_transpose_fp32(const std::vector& input_dims, /// compute Timer t0; for (int i = 0; i < FLAGS_repeats; ++i) { - t0.start(); + t0.Start(); conv_t.Launch(); - t0.end(); + t0.Stop(); } float gops = 2.f * tmp_weights.numel() * dim_in[0] * dim_in[2] * dim_in[3]; LOG(INFO) << "conv fp32: input shape: " << dim_in << ", output shape" - << dim_out << ",running time, avg: " << t0.get_average_ms() - << ", min time: " << t0.get_min_time() + << dim_out << ",running time, avg: " << t0.LapTimes().Avg() + << ", min time: " << t0.LapTimes().Min() << ", total GOPS: " << 1e-9 * gops - << " GOPS, avg GOPs: " << 1e-6 * gops / t0.get_average_ms() - << " GOPs, max GOPs: " << 1e-6 * gops / t0.get_min_time(); + << " GOPS, avg GOPs: " << 1e-6 * gops / t0.LapTimes().Avg() + << " GOPs, max GOPs: " << 1e-6 * gops / t0.LapTimes().Min(); if (FLAGS_check_result) { double max_ratio = 0; @@ -228,7 +221,8 @@ void test_conv_transpose_fp32(const std::vector& input_dims, LOG(FATAL) << "test fp32 conv: input: " << dim_in << ", output: " << dim_out << ", weight dim: " << weight_dim 
- << ", pad: " << pads[0] << ", " << pads[1] + << ", pad: " << pads[0] << ", " << pads[1] << ", " + << pads[2] << ", " << pads[3] << ", stride: " << strides[0] << ", " << strides[1] << ", dila_: " << dilas[0] << ", " << dilas[1] << ", bias: " << (flag_bias ? "true" : "false") @@ -240,9 +234,9 @@ void test_conv_transpose_fp32(const std::vector& input_dims, } LOG(INFO) << "test fp32 conv: input: " << dim_in << ", output: " << dim_out << ", weight dim: " << weight_dim - << ", pad: " << pads[0] << ", " << pads[1] - << ", stride: " << strides[0] << ", " << strides[1] - << ", dila_: " << dilas[0] << ", " << dilas[1] + << ", pad: " << pads[0] << ", " << pads[1] << ", " << pads[2] + << ", " << pads[3] << ", stride: " << strides[0] << ", " + << strides[1] << ", dila_: " << dilas[0] << ", " << dilas[1] << ", bias: " << (flag_bias ? "true" : "false") << ", relu: " << (flag_relu ? "true" : "false") << ", threads: " << th << ", power_mode: " << cls @@ -278,30 +272,37 @@ TEST(TestConvRand, test_conv_transpose_rand) { for (auto& kw : {1, 2, 3}) { for (auto& kh : {1, 2, 3}) { for (auto& stride : {1, 2}) { - for (auto& pad : {0, 1, 2}) { - for (auto& dila : {1, 2}) { - for (auto& flag_bias : {false, true}) { - for (auto& flag_relu : {false, true}) { - if (cin % g != 0 || cout % g != 0) { - continue; - } - std::vector dims; - DDim weights_dim({cin, cout / g, kh, kw}); - for (auto& batch : {1, 2}) { - for (auto& h : {1, 3, 19, 32, 28}) { - dims.push_back(DDim({batch, cin, h, h})); + for (auto& pad_h0 : {0, 1, 2}) { + for (auto& pad_h1 : {0, 1, 2}) { + for (auto& pad_w0 : {0, 1, 2}) { + for (auto& pad_w1 : {0, 1, 2}) { + for (auto& dila : {1, 2}) { + for (auto& flag_bias : {false, true}) { + for (auto& flag_relu : {false, true}) { + if (cin % g != 0 || cout % g != 0) { + continue; + } + std::vector dims; + DDim weights_dim({cin, cout / g, kh, kw}); + for (auto& batch : {1, 2}) { + for (auto& h : {1, 3, 19, 32, 28}) { + dims.push_back(DDim({batch, cin, h, h})); + } + } + test_conv_transpose_fp32( + dims, + weights_dim, + g, + {stride, stride}, + {pad_h0, pad_h1, pad_w0, pad_w1}, + {dila, dila}, + flag_bias, + flag_relu, + {1, 4}, + {FLAGS_power_mode}); + } } } - test_conv_transpose_fp32(dims, - weights_dim, - g, - {stride, stride}, - {pad, pad}, - {dila, dila}, - flag_bias, - flag_relu, - {1, 2, 4}, - {FLAGS_power_mode}); } } } @@ -330,7 +331,7 @@ TEST(TestConvCustom, test_conv_transpose_fp32_custom_size) { FLAGS_kernel_w}), FLAGS_group, {FLAGS_stride_h, FLAGS_stride_w}, - {FLAGS_pad_h, FLAGS_pad_w}, + {FLAGS_pad_h, FLAGS_pad_h, FLAGS_pad_w, FLAGS_pad_w}, {FLAGS_dila_h, FLAGS_dila_w}, FLAGS_flag_bias, FLAGS_flag_relu, diff --git a/lite/tests/math/gemm_int8_compute_test.cc b/lite/tests/math/gemm_int8_compute_test.cc index 06a1a0a65e1e5d0abb4a3eef2a6bf7d1e7ce5db0..fde5aacb1c1c21810c06a51eb6fa1f0cc4c3307a 100644 --- a/lite/tests/math/gemm_int8_compute_test.cc +++ b/lite/tests/math/gemm_int8_compute_test.cc @@ -20,12 +20,12 @@ #include "lite/backends/arm/math/funcs.h" #endif // LITE_WITH_ARM #include "lite/core/context.h" +#include "lite/core/profile/timer.h" #include "lite/core/tensor.h" #include "lite/tests/utils/tensor_utils.h" -#include "lite/tests/utils/timer.h" typedef paddle::lite::Tensor Tensor; -using paddle::lite::Timer; +using paddle::lite::profile::Timer; DEFINE_int32(power_mode, 3, @@ -193,7 +193,7 @@ bool test_gemm_int8(bool tra, dbias_int8[l] = dbias[l] / scale_c[0]; } for (int i = 0; i < FLAGS_repeats; ++i) { - t0.start(); + t0.Start(); paddle::lite::arm::math::gemm_prepack_int8(tpackedA.data(), 
db, dbias_int8, @@ -206,21 +206,21 @@ bool test_gemm_int8(bool tra, trb, scale_merge_int8.data(), &ctx); - t0.end(); + t0.Stop(); } LOG(INFO) << "gemm_int8_int8 output: M: " << m << ", N: " << n << ", K: " << k << ", power_mode: " << cls << ", threads: " << ths << ", GOPS: " << ops * 1e-9f - << " GOPS, avg time: " << t0.get_average_ms() - << " ms, min time: " << t0.get_min_time() - << " ms, mean GOPs: " << ops * 1e-6f / t0.get_average_ms() - << " GOPs, max GOPs: " << ops * 1e-6f / t0.get_min_time() + << " GOPS, avg time: " << t0.LapTimes().Avg() + << " ms, min time: " << t0.LapTimes().Min() + << " ms, mean GOPs: " << ops * 1e-6f / t0.LapTimes().Avg() + << " GOPs, max GOPs: " << ops * 1e-6f / t0.LapTimes().Min() << " GOPs"; /// fp32 output compute - t0.clear(); + t0.Reset(); for (int i = 0; i < FLAGS_repeats; ++i) { - t0.start(); + t0.Start(); paddle::lite::arm::math::gemm_prepack_int8(tpackedA.data(), db, dbias, @@ -233,15 +233,15 @@ bool test_gemm_int8(bool tra, trb, scale_merge_fp32.data(), &ctx); - t0.end(); + t0.Stop(); } LOG(INFO) << "gemm_int8_fp32 output: M: " << m << ", N: " << n << ", K: " << k << ", power_mode: " << cls << ", threads: " << ths << ", GOPS: " << ops * 1e-9f - << " GOPS, avg time: " << t0.get_average_ms() - << " ms, min time: " << t0.get_min_time() - << " ms, mean GOPs: " << ops * 1e-6f / t0.get_average_ms() - << " GOPs, max GOPs: " << ops * 1e-6f / t0.get_min_time() + << " GOPS, avg time: " << t0.LapTimes().Avg() + << " ms, min time: " << t0.LapTimes().Min() + << " ms, mean GOPs: " << ops * 1e-6f / t0.LapTimes().Avg() + << " GOPs, max GOPs: " << ops * 1e-6f / t0.LapTimes().Min() << " GOPs"; if (FLAGS_check_result) { diff --git a/lite/tests/math/gemv_int8_compute_test.cc b/lite/tests/math/gemv_int8_compute_test.cc index c64e78d66a4193f1b20c525120d8b0281afc9a9c..623615c8da16326da3c233687915935aa5a88d64 100644 --- a/lite/tests/math/gemv_int8_compute_test.cc +++ b/lite/tests/math/gemv_int8_compute_test.cc @@ -20,12 +20,12 @@ #include "lite/backends/arm/math/funcs.h" #endif // LITE_WITH_ARM #include "lite/core/context.h" +#include "lite/core/profile/timer.h" #include "lite/core/tensor.h" #include "lite/tests/utils/tensor_utils.h" -#include "lite/tests/utils/timer.h" typedef paddle::lite::Tensor Tensor; -using paddle::lite::Timer; +using paddle::lite::profile::Timer; DEFINE_int32(power_mode, 3, @@ -165,7 +165,7 @@ bool test_gemv_int8( dbias_int8[l] = dbias[l] / scale_c[0]; } for (int i = 0; i < FLAGS_repeats; ++i) { - t0.start(); + t0.Start(); paddle::lite::arm::math::gemv_int8(da, db, dc_fp32, @@ -177,21 +177,21 @@ bool test_gemv_int8( dbias, has_relu, &ctx); - t0.end(); + t0.Stop(); } LOG(INFO) << "gemv_int8_int8 output: M: " << m << ", N: " << n << ", power_mode: " << cls << ", threads: " << ths << ", GOPS: " << ops * 1e-9f - << " GOPS, avg time: " << t0.get_average_ms() - << " ms, min time: " << t0.get_min_time() - << " ms, mean GOPs: " << ops * 1e-6f / t0.get_average_ms() - << " GOPs, max GOPs: " << ops * 1e-6f / t0.get_min_time() + << " GOPS, avg time: " << t0.LapTimes().Avg() + << " ms, min time: " << t0.LapTimes().Min() + << " ms, mean GOPs: " << ops * 1e-6f / t0.LapTimes().Avg() + << " GOPs, max GOPs: " << ops * 1e-6f / t0.LapTimes().Min() << " GOPs"; /// fp32 output compute - t0.clear(); + t0.Reset(); for (int i = 0; i < FLAGS_repeats; ++i) { - t0.start(); + t0.Start(); paddle::lite::arm::math::gemv_int8(da, db, dc_int8, @@ -203,15 +203,15 @@ bool test_gemv_int8( dbias_int8, has_relu, &ctx); - t0.end(); + t0.Stop(); } LOG(INFO) << "gemm_int8_fp32 output: M: " 
<< m << ", N: " << n << ", power_mode: " << cls << ", threads: " << ths << ", GOPS: " << ops * 1e-9f - << " GOPS, avg time: " << t0.get_average_ms() - << " ms, min time: " << t0.get_min_time() - << " ms, mean GOPs: " << ops * 1e-6f / t0.get_average_ms() - << " GOPs, max GOPs: " << ops * 1e-6f / t0.get_min_time() + << " GOPS, avg time: " << t0.LapTimes().Avg() + << " ms, min time: " << t0.LapTimes().Min() + << " ms, mean GOPs: " << ops * 1e-6f / t0.LapTimes().Avg() + << " GOPs, max GOPs: " << ops * 1e-6f / t0.LapTimes().Min() << " GOPs"; if (FLAGS_check_result) { diff --git a/lite/tests/math/layout_compute_test.cc b/lite/tests/math/layout_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..a566924548d6f3adac805eb80a574a9cd5c2afbf --- /dev/null +++ b/lite/tests/math/layout_compute_test.cc @@ -0,0 +1,608 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include "lite/core/context.h" +#include "lite/core/profile/timer.h" +#include "lite/operators/op_params.h" +#include "lite/tests/utils/naive_math_impl.h" +#include "lite/tests/utils/tensor_utils.h" + +#ifdef LITE_WITH_ARM +#include "lite/kernels/arm/layout_compute.h" +#endif // LITE_WITH_ARM + +DEFINE_int32(power_mode, + 3, + "power mode: " + "0 for POWER_HIGH;" + "1 for POWER_LOW;" + "2 for POWER_FULL;" + "3 for NO_BIND"); +DEFINE_int32(threads, 1, "threads num"); +DEFINE_int32(warmup, 0, "warmup times"); +DEFINE_int32(repeats, 1, "repeats times"); +DEFINE_bool(basic_test, false, "do all tests"); +DEFINE_bool(check_result, true, "check the result"); + +DEFINE_int32(batch, 1, "batch size"); +DEFINE_int32(in_channel, 32, "input channel"); +DEFINE_int32(in_height, 112, "input height"); +DEFINE_int32(in_width, 112, "input width"); + +DEFINE_bool(flag_nchw, true, "do nchw to nhwc"); + +typedef paddle::lite::DDim DDim; +typedef paddle::lite::Tensor Tensor; +typedef paddle::lite::operators::LayoutParam LayoutParam; + +using paddle::lite::profile::Timer; + +#define IN(n, c, h, w) \ + input_data[w + h * input_w + c * input_h * input_w + \ + n * input_c * input_h * input_w] +#define OUT(n, c, h, w) \ + output_data[w + h * output_w + c * output_h * output_w + \ + n * output_c * output_h * output_w] + +template +void nchw2nhwc_ref(const Tensor* input, Tensor* output) { + auto* input_data = input->data(); + auto* output_data = output->mutable_data(); + + int input_n = input->dims()[0]; + int input_c = input->dims()[1]; + int input_h = input->dims()[2]; + int input_w = input->dims()[3]; + int output_c = output->dims()[1]; + int output_h = output->dims()[2]; + int output_w = output->dims()[3]; + + for (int n = 0; n < input_n; ++n) { + for (int c = 0; c < input_c; ++c) { + for (int h = 0; h < input_h; ++h) { + for (int w = 0; w < input_w; ++w) { + OUT(n, h, w, c) = IN(n, c, h, w); + } + } + } + } +} +#undef IN +#undef OUT + +#define IN(n, h, w, c) \ + input_data[c + w * input_c + h * input_w * input_c + \ + n * input_h * input_w 
* input_c] +#define OUT(n, h, w, c) \ + output_data[c + w * output_c + h * output_w * output_c + \ + n * output_h * output_w * output_c] +template +void nhwc2nchw_ref(const Tensor* input, Tensor* output) { + auto* input_data = input->data(); + auto* output_data = output->mutable_data(); + + int input_n = input->dims()[0]; + int input_h = input->dims()[1]; + int input_w = input->dims()[2]; + int input_c = input->dims()[3]; + int output_h = output->dims()[1]; + int output_w = output->dims()[2]; + int output_c = output->dims()[3]; + + for (int n = 0; n < input_n; ++n) { + for (int c = 0; c < input_c; ++c) { + for (int h = 0; h < input_h; ++h) { + for (int w = 0; w < input_w; ++w) { + OUT(n, c, h, w) = IN(n, h, w, c); + } + } + } + } +} + +#ifdef LITE_WITH_ARM +void test_layout_fp32_nchw(DDim dim_in, + bool flag_nchw, + const std::vector& thread_num, + const std::vector& power_mode) { +#ifdef LITE_WITH_ARM + paddle::lite::DeviceInfo::Init(); +#endif + LayoutParam param; + param.x = new Tensor; + const_cast(param.x)->set_precision(PRECISION(kFloat)); + + param.y = new Tensor; + param.y->set_precision(PRECISION(kFloat)); + + for (auto& cls : power_mode) { + for (auto& th : thread_num) { + paddle::lite::kernels::arm::NCHWToNHWCCompute layout; + DDim dim_out({dim_in[0], dim_in[2], dim_in[3], dim_in[1]}); + + std::unique_ptr ctx1( + new paddle::lite::KernelContext); + auto& ctx = ctx1->As(); + ctx.SetRunMode(static_cast(cls), th); + /// set param and context + const_cast(param.x)->Resize(dim_in); + param.y->Resize(dim_out); + + layout.SetParam(param); + + paddle::lite::fill_tensor_rand( + *(const_cast(param.x)), -1.f, 1.f); + // paddle::lite::fill_tensor_const(*param.x, 1.f); + + auto din = param.x->data(); + + Tensor tout_basic; + + if (FLAGS_check_result) { + tout_basic.set_precision(PRECISION(kFloat)); + tout_basic.Resize(dim_out); + fill_tensor_const(tout_basic, 0.f); + auto dout_basic = tout_basic.mutable_data(); + nchw2nhwc_ref(param.x, &tout_basic); + } + /// warm up + for (int i = 0; i < FLAGS_warmup; ++i) { + layout.Run(); + } + /// compute + Timer t0; + for (int i = 0; i < FLAGS_repeats; ++i) { + t0.Start(); + layout.Run(); + t0.Stop(); + } + double gops = 2.0 * dim_out.production(); + LOG(INFO) << "layout fp32: input shape: " << dim_in << ", output shape" + << dim_out << ",running time, avg: " << t0.LapTimes().Avg() + << ", min time: " << t0.LapTimes().Min() + << ", total GOPS: " << 1e-9 * gops + << " GOPS, avg GOPs: " << 1e-6 * gops / t0.LapTimes().Avg() + << " GOPs, max GOPs: " << 1e-6 * gops / t0.LapTimes().Min(); + + if (FLAGS_check_result) { + double max_ratio = 0; + double max_diff = 0; + tensor_cmp_host(tout_basic, *param.y, max_ratio, max_diff); + LOG(INFO) << "compare result, max diff: " << max_diff + << ", max ratio: " << max_ratio; + if (std::abs(max_ratio) > 1e-3f) { + if (max_diff > 5e-4f) { + LOG(WARNING) << "din"; + print_tensor(*(const_cast(param.x))); + LOG(WARNING) << "basic result"; + print_tensor(tout_basic); + LOG(WARNING) << "lite result"; + print_tensor(*param.y); + Tensor tdiff; + tdiff.Resize(tout_basic.dims()); + tdiff.set_precision(PRECISION(kFloat)); + tensor_diff(tout_basic, *param.y, tdiff); + print_tensor(tdiff); + LOG(FATAL) << "test fp32 layout: input: " << dim_in + << ", output: " << dim_out << ", flag_nchw: " + << (flag_nchw ? "nchw2nhwc" : "nhwc2nchw") + << ", threads: " << th << ", power_mode: " << cls + << " failed!!\n"; + } + } + LOG(INFO) << "test fp32 layout: input: " << dim_in + << ", output: " << dim_out + << ", flag_nchw: " << (flag_nchw ? 
"nchw2nhwc" : "nhwc2nchw") + << ", threads: " << th << ", power_mode: " << cls + << " successed!!\n"; + } + } + } + + delete param.x; + delete param.y; +} +void test_layout_fp32_nhwc(DDim dim_in, + bool flag_nchw, + const std::vector& thread_num, + const std::vector& power_mode) { +#ifdef LITE_WITH_ARM + paddle::lite::DeviceInfo::Init(); +#endif + + LayoutParam param; + param.x = new Tensor; + const_cast(param.x)->set_precision(PRECISION(kFloat)); + + param.y = new Tensor; + param.y->set_precision(PRECISION(kFloat)); + + for (auto& cls : power_mode) { + for (auto& th : thread_num) { + paddle::lite::kernels::arm::NHWCToNCHWCompute layout; + // n h w c == n c h w + DDim dim_out({dim_in[0], dim_in[3], dim_in[1], dim_in[2]}); + + std::unique_ptr ctx1( + new paddle::lite::KernelContext); + auto& ctx = ctx1->As(); + ctx.SetRunMode(static_cast(cls), th); + /// set param and context + const_cast(param.x)->Resize(dim_in); + param.y->Resize(dim_out); + + layout.SetParam(param); + + paddle::lite::fill_tensor_rand( + *(const_cast(param.x)), -1.f, 1.f); + // paddle::lite::fill_tensor_const(*param.x, 1.f); + + auto din = param.x->data(); + + Tensor tout_basic; + + if (FLAGS_check_result) { + tout_basic.set_precision(PRECISION(kFloat)); + tout_basic.Resize(dim_out); + fill_tensor_const(tout_basic, 0.f); + auto dout_basic = tout_basic.mutable_data(); + nhwc2nchw_ref(param.x, &tout_basic); + } + /// warm up + for (int i = 0; i < FLAGS_warmup; ++i) { + layout.Run(); + } + /// compute + Timer t0; + for (int i = 0; i < FLAGS_repeats; ++i) { + t0.Start(); + layout.Run(); + t0.Stop(); + } + double gops = 2.0 * dim_out.production(); + LOG(INFO) << "layout fp32: input shape: " << dim_in << ", output shape" + << dim_out << ",running time, avg: " << t0.LapTimes().Avg() + << ", min time: " << t0.LapTimes().Min() + << ", total GOPS: " << 1e-9 * gops + << " GOPS, avg GOPs: " << 1e-6 * gops / t0.LapTimes().Avg() + << " GOPs, max GOPs: " << 1e-6 * gops / t0.LapTimes().Min(); + + if (FLAGS_check_result) { + double max_ratio = 0; + double max_diff = 0; + tensor_cmp_host(tout_basic, *param.y, max_ratio, max_diff); + LOG(INFO) << "compare result, max diff: " << max_diff + << ", max ratio: " << max_ratio; + if (std::abs(max_ratio) > 1e-3f) { + if (max_diff > 5e-4f) { + LOG(WARNING) << "din"; + print_tensor(*(const_cast(param.x))); + LOG(WARNING) << "basic result"; + print_tensor(tout_basic); + LOG(WARNING) << "lite result"; + print_tensor(*param.y); + Tensor tdiff; + tdiff.Resize(tout_basic.dims()); + tdiff.set_precision(PRECISION(kFloat)); + tensor_diff(tout_basic, *param.y, tdiff); + print_tensor(tdiff); + LOG(FATAL) << "test fp32 layout: input: " << dim_in + << ", output: " << dim_out << ", flag_nchw: " + << (flag_nchw ? "nchw2nhwc" : "nhwc2nchw") + << ", threads: " << th << ", power_mode: " << cls + << " failed!!\n"; + } + } + LOG(INFO) << "test fp32 layout: input: " << dim_in + << ", output: " << dim_out + << ", flag_nchw: " << (flag_nchw ? 
"nchw2nhwc" : "nhwc2nchw") + << ", threads: " << th << ", power_mode: " << cls + << " successed!!\n"; + } + } + } + + delete param.x; + delete param.y; +} +void test_layout_int8_nchw(DDim dim_in, + bool flag_nchw, + const std::vector& thread_num, + const std::vector& power_mode) { +#ifdef LITE_WITH_ARM + paddle::lite::DeviceInfo::Init(); +#endif + + LayoutParam param; + param.x = new Tensor; + const_cast(param.x)->set_precision(PRECISION(kInt8)); + + param.y = new Tensor; + param.y->set_precision(PRECISION(kInt8)); + + for (auto& cls : power_mode) { + for (auto& th : thread_num) { + paddle::lite::kernels::arm::NCHWToNHWCCompute layout; + DDim dim_out({dim_in[0], dim_in[2], dim_in[3], dim_in[1]}); + + std::unique_ptr ctx1( + new paddle::lite::KernelContext); + auto& ctx = ctx1->As(); + ctx.SetRunMode(static_cast(cls), th); + /// set param and context + const_cast(param.x)->Resize(dim_in); + param.y->Resize(dim_out); + + layout.SetParam(param); + + paddle::lite::fill_tensor_rand(*(const_cast(param.x))); + // paddle::lite::fill_tensor_const(*param.x, 1.f); + + auto din = param.x->data(); + + Tensor tout_basic; + + if (FLAGS_check_result) { + tout_basic.set_precision(PRECISION(kInt8)); + tout_basic.Resize(dim_out); + fill_tensor_const(tout_basic, 0); + auto dout_basic = tout_basic.mutable_data(); + nchw2nhwc_ref(param.x, &tout_basic); + } + LOG(INFO) << "saber compute"; + /// warm up + for (int i = 0; i < FLAGS_warmup; ++i) { + layout.Run(); + } + /// compute + Timer t0; + for (int i = 0; i < FLAGS_repeats; ++i) { + t0.Start(); + layout.Run(); + t0.Stop(); + } + LOG(INFO) << "saber compute end"; + double gops = 2.0 * dim_out.production(); + LOG(INFO) << "layout int8: input shape: " << dim_in << ", output shape" + << dim_out << ",running time, avg: " << t0.LapTimes().Avg() + << ", min time: " << t0.LapTimes().Min() + << ", total GOPS: " << 1e-9 * gops + << " GOPS, avg GOPs: " << 1e-6 * gops / t0.LapTimes().Avg() + << " GOPs, max GOPs: " << 1e-6 * gops / t0.LapTimes().Min(); + + if (FLAGS_check_result) { + double max_ratio = 0; + double max_diff = 0; + tensor_cmp_host(tout_basic, *param.y, max_ratio, max_diff); + LOG(INFO) << "compare result, max diff: " << max_diff + << ", max ratio: " << max_ratio; + if (std::abs(max_ratio) > 1e-3f) { + if (max_diff > 5e-4f) { + LOG(WARNING) << "din"; + print_tensor(*(const_cast(param.x))); + LOG(WARNING) << "basic result"; + print_tensor(tout_basic); + LOG(WARNING) << "lite result"; + print_tensor(*param.y); + Tensor tdiff; + tdiff.Resize(tout_basic.dims()); + tdiff.set_precision(PRECISION(kInt8)); + tensor_diff(tout_basic, *param.y, tdiff); + print_tensor(tdiff); + LOG(FATAL) << "test int8 layout: input: " << dim_in + << ", output: " << dim_out << ", flag_nchw: " + << (flag_nchw ? "nchw2nhwc" : "nhwc2nchw") + << ", threads: " << th << ", power_mode: " << cls + << " failed!!\n"; + } + } + LOG(INFO) << "test int8 layout: input: " << dim_in + << ", output: " << dim_out + << ", flag_nchw: " << (flag_nchw ? 
"nchw2nhwc" : "nhwc2nchw") + << ", threads: " << th << ", power_mode: " << cls + << " successed!!\n"; + } + } + } + + delete param.x; + delete param.y; +} +void test_layout_int8_nhwc(DDim dim_in, + bool flag_nchw, + const std::vector& thread_num, + const std::vector& power_mode) { +#ifdef LITE_WITH_ARM + paddle::lite::DeviceInfo::Init(); +#endif + + LayoutParam param; + param.x = new Tensor; + const_cast(param.x)->set_precision(PRECISION(kInt8)); + + param.y = new Tensor; + param.y->set_precision(PRECISION(kInt8)); + + for (auto& cls : power_mode) { + for (auto& th : thread_num) { + paddle::lite::kernels::arm::NHWCToNCHWCompute layout; + // n h w c == n c h w + DDim dim_out({dim_in[0], dim_in[3], dim_in[1], dim_in[2]}); + + std::unique_ptr ctx1( + new paddle::lite::KernelContext); + auto& ctx = ctx1->As(); + ctx.SetRunMode(static_cast(cls), th); + /// set param and context + const_cast(param.x)->Resize(dim_in); + param.y->Resize(dim_out); + + layout.SetParam(param); + + paddle::lite::fill_tensor_rand(*(const_cast(param.x))); + // paddle::lite::fill_tensor_const(*param.x, 1.f); + + auto din = param.x->data(); + + Tensor tout_basic; + + if (FLAGS_check_result) { + tout_basic.set_precision(PRECISION(kInt8)); + tout_basic.Resize(dim_out); + fill_tensor_const(tout_basic, 0.f); + auto dout_basic = tout_basic.mutable_data(); + nhwc2nchw_ref(param.x, &tout_basic); + } + LOG(INFO) << "saber compute"; + /// warm up + for (int i = 0; i < FLAGS_warmup; ++i) { + layout.Run(); + } + /// compute + Timer t0; + for (int i = 0; i < FLAGS_repeats; ++i) { + t0.Start(); + layout.Run(); + t0.Stop(); + } + LOG(INFO) << "run"; + double gops = 2.0 * dim_out.production(); + LOG(INFO) << "layout int8: input shape: " << dim_in << ", output shape" + << dim_out << ",running time, avg: " << t0.LapTimes().Avg() + << ", min time: " << t0.LapTimes().Min() + << ", total GOPS: " << 1e-9 * gops + << " GOPS, avg GOPs: " << 1e-6 * gops / t0.LapTimes().Avg() + << " GOPs, max GOPs: " << 1e-6 * gops / t0.LapTimes().Min(); + + if (FLAGS_check_result) { + double max_ratio = 0; + double max_diff = 0; + tensor_cmp_host(tout_basic, *param.y, max_ratio, max_diff); + LOG(INFO) << "compare result, max diff: " << max_diff + << ", max ratio: " << max_ratio; + if (std::abs(max_ratio) > 1e-3f) { + if (max_diff > 5e-4f) { + LOG(WARNING) << "din"; + print_tensor(*(const_cast(param.x))); + LOG(WARNING) << "basic result"; + print_tensor(tout_basic); + LOG(WARNING) << "lite result"; + print_tensor(*param.y); + Tensor tdiff; + tdiff.Resize(tout_basic.dims()); + tdiff.set_precision(PRECISION(kInt8)); + tensor_diff(tout_basic, *param.y, tdiff); + print_tensor(tdiff); + LOG(FATAL) << "test int8 layout: input: " << dim_in + << ", output: " << dim_out << ", flag_nchw: " + << (flag_nchw ? "nchw2nhwc" : "nhwc2nchw") + << ", threads: " << th << ", power_mode: " << cls + << " failed!!\n"; + } + } + LOG(INFO) << "test int8 layout: input: " << dim_in + << ", output: " << dim_out + << ", flag_nchw: " << (flag_nchw ? 
"nchw2nhwc" : "nhwc2nchw") + << ", threads: " << th << ", power_mode: " << cls + << " successed!!\n"; + } + } + } + + delete param.x; + delete param.y; +} +#else +void test_layout_fp32_nchw(DDim dim_in, + bool flag_nchw, + const std::vector& thread_num, + const std::vector& power_mode) {} +void test_layout_fp32_nhwc(DDim dim_in, + bool flag_nchw, + const std::vector& thread_num, + const std::vector& power_mode) {} +void test_layout_int8_nchw(DDim dim_in, + bool flag_nchw, + const std::vector& thread_num, + const std::vector& power_mode) {} +void test_layout_int8_nhwc(DDim dim_in, + bool flag_nchw, + const std::vector& thread_num, + const std::vector& power_mode) {} +#endif // LITE_WITH_ARM + +#if 1 // +TEST(TestLayout, test_Layout_fp32) { + if (FLAGS_basic_test) { + for (auto n : {1, 3}) { + for (auto c : {1, 3, 5, 32}) { + for (auto h : {3, 16, 20, 32}) { + for (auto w : {3, 4, 32, 112}) { + for (auto nchw2nhwc : {true, false}) { + DDim dim_in({n, c, h, w}); + if (nchw2nhwc) { + LOG(INFO) << "NCHW2NHWC"; + test_layout_fp32_nchw( + dim_in, nchw2nhwc, {1, 2, 4}, {FLAGS_power_mode}); + } else { + LOG(INFO) << "NHWC2NCHW"; + test_layout_fp32_nhwc( + dim_in, nchw2nhwc, {1, 2, 4}, {FLAGS_power_mode}); + } + } + } + } + } + } + } +} +#endif +#if 1 +TEST(TestLayout, test_Layout_int8) { + if (FLAGS_basic_test) { + for (auto n : {1, 3}) { + for (auto c : {1, 3, 5, 32}) { + for (auto h : {3, 16, 20, 32}) { + for (auto w : {3, 4, 32, 112}) { + for (auto nchw2nhwc : {true, false}) { + DDim dim_in({n, c, h, w}); + if (nchw2nhwc) { + LOG(INFO) << "NCHW2NHWC int8"; + test_layout_int8_nchw( + dim_in, nchw2nhwc, {1, 2, 4}, {FLAGS_power_mode}); + } else { + LOG(INFO) << "NHWC2NCHW int8"; + test_layout_int8_nhwc( + dim_in, nchw2nhwc, {1, 2, 4}, {FLAGS_power_mode}); + } + } + } + } + } + } + } +} +#endif + +#if 1 /// custom +TEST(TestLayoutCustom, test_Layout_custom_size) { + test_layout_fp32_nchw( + {DDim({FLAGS_batch, FLAGS_in_channel, FLAGS_in_height, FLAGS_in_width})}, + true, + {FLAGS_threads}, + {FLAGS_power_mode}); +} +#endif // custom diff --git a/lite/tests/math/pool_compute_test.cc b/lite/tests/math/pool_compute_test.cc index 9f4a9435945f8478a9285a56f03b20e941b3f8d7..73a5ba5606c2635c2df2792a3ccb6544715384a9 100644 --- a/lite/tests/math/pool_compute_test.cc +++ b/lite/tests/math/pool_compute_test.cc @@ -15,10 +15,10 @@ #include #include #include "lite/core/context.h" +#include "lite/core/profile/timer.h" #include "lite/operators/op_params.h" #include "lite/tests/utils/naive_math_impl.h" #include "lite/tests/utils/tensor_utils.h" -#include "lite/tests/utils/timer.h" #ifdef LITE_WITH_ARM #include "lite/kernels/arm/pool_compute.h" @@ -60,7 +60,7 @@ DEFINE_string(pooling_type, "max", "do max pooling"); typedef paddle::lite::DDim DDim; typedef paddle::lite::Tensor Tensor; typedef paddle::lite::operators::PoolParam PoolParam; -using paddle::lite::Timer; +using paddle::lite::profile::Timer; DDim compute_out_dim(const DDim& dim_in, const paddle::lite::operators::PoolParam& param) { @@ -69,8 +69,7 @@ DDim compute_out_dim(const DDim& dim_in, auto kernel_w = param.ksize[1]; auto h = dim_in[2]; auto w = dim_in[3]; - int pad_h = param.paddings[0]; - int pad_w = param.paddings[1]; + auto paddings = *param.paddings; int stride_h = param.strides[0]; int stride_w = param.strides[1]; bool ceil_mode = param.ceil_mode; @@ -79,11 +78,15 @@ DDim compute_out_dim(const DDim& dim_in, int wout = 1; if (!flag_global) { if (!ceil_mode) { - hout = (h - kernel_h + 2 * pad_h) / stride_h + 1; - wout = (w - kernel_w + 2 * pad_w) / 
stride_w + 1; + hout = (h - kernel_h + paddings[0] + paddings[1]) / stride_h + 1; + wout = (w - kernel_w + paddings[2] + paddings[3]) / stride_w + 1; } else { - hout = (h - kernel_h + 2 * pad_h + stride_h - 1) / stride_h + 1; - wout = (w - kernel_w + 2 * pad_w + stride_w - 1) / stride_w + 1; + hout = + (h - kernel_h + paddings[0] + paddings[1] + stride_h - 1) / stride_h + + 1; + wout = + (w - kernel_w + paddings[2] + paddings[3] + stride_w - 1) / stride_w + + 1; } } dim_out[2] = hout; @@ -116,7 +119,7 @@ void pooling_basic(const float* din, int stride_h = strides[0]; int stride_w = strides[1]; int pad_h = paddings[0]; - int pad_w = paddings[1]; + int pad_w = paddings[2]; int size_channel_in = win * hin; int size_channel_out = wout * hout; if (global_pooling) { @@ -195,18 +198,22 @@ void pooling_basic(const float* din, int bh = kernel_h; int bw = kernel_w; if (ew == win) { - bw = sw + kernel_w >= win + pad_w ? win + pad_w - : sw + kernel_w; + bw = (sw + kernel_w) >= (win + paddings[3]) + ? (win + paddings[3]) + : (sw + kernel_w); bw -= sw; - if (sw - pad_w < 0 && sw + kernel_w > win + pad_w) { + if ((sw - pad_w) < 0 && + (sw + kernel_w) > (win + paddings[3])) { bw += pad_w; } } if (eh == hin) { - bh = sh + kernel_h >= hin + pad_h ? hin + pad_h - : sh + kernel_h; + bh = (sh + kernel_h) >= (hin + paddings[1]) + ? (hin + paddings[1]) + : (sh + kernel_h); bh -= sh; - if (sh - pad_h < 0 && sh + kernel_h > hin + pad_h) { + if ((sh - pad_h) < 0 && + (sh + kernel_h) > (hin + paddings[1])) { bh += pad_h; } } @@ -243,7 +250,7 @@ void test_pool_fp32(const std::vector& input_dims, param.ksize = ksize; param.strides = strides; - param.paddings = pads; + param.paddings = std::make_shared>(pads); param.ceil_mode = ceil_mode; param.global_pooling = flag_global; param.pooling_type = pooling_type; @@ -313,18 +320,18 @@ void test_pool_fp32(const std::vector& input_dims, /// compute Timer t0; for (int i = 0; i < FLAGS_repeats; ++i) { - t0.start(); + t0.Start(); pool.Launch(); - t0.end(); + t0.Stop(); } double gops = 2.0 * dim_out.production() * ksize[0] * ksize[1]; LOG(INFO) << "pool fp32: input shape: " << dim_in << ", output shape" - << dim_out << ", running time, avg: " << t0.get_average_ms() - << ", min time: " << t0.get_min_time() + << dim_out << ", running time, avg: " << t0.LapTimes().Avg() + << ", min time: " << t0.LapTimes().Min() << ", total GOPS: " << 1e-9 * gops - << " GOPS, avg GOPs: " << 1e-6 * gops / t0.get_average_ms() - << " GOPs, max GOPs: " << 1e-6 * gops / t0.get_min_time(); + << " GOPS, avg GOPs: " << 1e-6 * gops / t0.LapTimes().Avg() + << " GOPs, max GOPs: " << 1e-6 * gops / t0.LapTimes().Min(); if (FLAGS_check_result) { double max_ratio = 0; @@ -399,31 +406,38 @@ TEST(TestPoolRand, test_pool_rand) { for (auto& kw : {1, 2, 3}) { for (auto& kh : {1, 2, 3}) { for (auto& stride : {1, 2}) { - for (auto& pad : {0, 1, 2}) { - for (auto& flag_global : {false, true}) { - for (auto& exclusive : {false, true}) { - for (auto& ceil_mode : {false, true}) { - for (auto& pooling_type : {"max", "avg"}) { - bool adaptive = false; - bool use_quantizer = false; - std::vector dims; - for (auto& batch : {1, 2}) { - for (auto& h : {1, 2, 3, 4, 11, 19, 32, 28}) { - dims.push_back(DDim({batch, cin, h, h})); + for (auto& pad_top : {0, 1, 2}) { + for (auto& pad_bottom : {0, 1, 2}) { + for (auto& pad_left : {0, 1, 2}) { + for (auto& pad_right : {0, 1, 2}) { + for (auto& flag_global : {false, true}) { + for (auto& exclusive : {false, true}) { + for (auto& ceil_mode : {false, true}) { + for (auto& pooling_type : 
{"max", "avg"}) { + bool adaptive = false; + bool use_quantizer = false; + std::vector dims; + for (auto& batch : {1, 2}) { + for (auto& h : {1, 2, 3, 4, 11, 19, 32, 28}) { + dims.push_back(DDim({batch, cin, h, h})); + } + } + test_pool_fp32( + dims, + {kh, kw}, + {stride, stride}, + {pad_top, pad_bottom, pad_left, pad_right}, + ceil_mode, + flag_global, + exclusive, + adaptive, + use_quantizer, + pooling_type, + {1, 2, 4}, + {FLAGS_power_mode}); + } } } - test_pool_fp32(dims, - {kh, kw}, - {stride, stride}, - {pad, pad}, - ceil_mode, - flag_global, - exclusive, - adaptive, - use_quantizer, - pooling_type, - {1, 2, 4}, - {FLAGS_power_mode}); } } } @@ -443,7 +457,7 @@ TEST(TesPoolCustom, test_pool_fp32_custom_size) { {DDim({FLAGS_batch, FLAGS_in_channel, FLAGS_in_height, FLAGS_in_width})}, {FLAGS_kernel_h, FLAGS_kernel_w}, {FLAGS_stride_h, FLAGS_stride_w}, - {FLAGS_pad_h, FLAGS_pad_w}, + {FLAGS_pad_h, FLAGS_pad_h, FLAGS_pad_w, FLAGS_pad_w}, FLAGS_ceil_mode, FLAGS_flag_global, FLAGS_exclusive, diff --git a/lite/tests/math/sgemm_c4_compute_test.cc b/lite/tests/math/sgemm_c4_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..886dba6ac5a390c5eca4a9b499bfb57e2b077a32 --- /dev/null +++ b/lite/tests/math/sgemm_c4_compute_test.cc @@ -0,0 +1,236 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include "lite/tests/utils/fill_data.h" +#include "lite/tests/utils/naive_math_impl.h" +#ifdef LITE_WITH_ARM +#include "lite/backends/arm/math/funcs.h" +#endif // LITE_WITH_ARM +#include "lite/core/context.h" +#include "lite/core/profile/timer.h" +#include "lite/core/tensor.h" +#include "lite/tests/utils/tensor_utils.h" + +typedef paddle::lite::Tensor Tensor; +using paddle::lite::profile::Timer; + +DEFINE_int32(power_mode, + 3, + "power mode: " + "0 for POWER_HIGH;" + "1 for POWER_LOW;" + "2 for POWER_FULL;" + "3 for NO_BIND"); +DEFINE_int32(threads, 1, "threads num"); +DEFINE_int32(warmup, 0, "warmup times"); +DEFINE_int32(repeats, 1, "repeats times"); +DEFINE_bool(basic_test, false, "do all tests"); +DEFINE_bool(check_result, true, "check the result"); + +DEFINE_int32(M, 512, "gemm_c4: M"); +DEFINE_int32(N, 512, "gemm_c4: N"); +DEFINE_int32(K, 512, "gemm_c4: K"); + +DEFINE_bool(flag_relu, false, "do relu"); +DEFINE_bool(flag_bias, false, "with bias"); + +bool test_sgemm_c4( + int m, int n, int k, bool has_bias, bool has_relu, int cls, int ths) { + int m_round = (m + 3) / 4 * 4; + int k_round = (k + 3) / 4 * 4; + int size_a = m * k; + int size_b = n * k; + int size_a_c4 = m_round * k_round; + int size_b_c4 = k_round * n; + + Tensor ta; + Tensor tb; + Tensor ta_c4; + Tensor tb_c4; + Tensor tc; + Tensor tc_basic; + Tensor tc_backup; + Tensor tbias; + + ta.Resize({size_a}); + tb.Resize({size_b}); + ta_c4.Resize({size_a_c4}); + tb_c4.Resize({size_b_c4}); + tc.Resize({m_round * n}); + tc_basic.Resize({m_round * n}); + tbias.Resize({m}); + + ta.set_precision(PRECISION(kFloat)); + tb.set_precision(PRECISION(kFloat)); + ta_c4.set_precision(PRECISION(kFloat)); + tb_c4.set_precision(PRECISION(kFloat)); + tc.set_precision(PRECISION(kFloat)); + tc_basic.set_precision(PRECISION(kFloat)); + tbias.set_precision(PRECISION(kFloat)); + + fill_tensor_rand(ta, -1.f, 1.f); + fill_tensor_rand(tb, -1.f, 1.f); + fill_tensor_rand(tbias, -1.f, 1.f); + fill_tensor_rand(tc, -1.f, 1.f); + + auto da = ta.mutable_data(); + auto db = tb.mutable_data(); + auto da_c4 = ta_c4.mutable_data(); + auto db_c4 = tb_c4.mutable_data(); + auto dc_basic = tc_basic.mutable_data(); + auto dbias = tbias.mutable_data(); + + // trans A, B to c4 + basic_trans_mat_to_c4(da, da_c4, k, m, k, true); + basic_trans_mat_to_c4(db, db_c4, n, k, n, false); + + LOG(INFO) << "sgemm_c4 M: " << m << ", N: " << n << ", K: " << k + << ", relu: " << (has_relu ? "true" : "false") + << ", bias: " << (has_bias ? "true" : "false"); + + if (FLAGS_check_result) { + basic_gemm_c4(false, + false, + m, + n, + k, + 1.f, + da, + k, + db, + n, + 0.f, + dc_basic, + n, + dbias, + has_bias, + has_relu); + } + Timer t0; +#ifdef LITE_WITH_ARM + //! 
compute + double ops = 2.0 * m_round * n * k_round; + std::unique_ptr ctx1( + new paddle::lite::KernelContext); + auto& ctx = ctx1->As(); + ctx.SetRunMode(static_cast(cls), ths); + auto dc = tc.mutable_data(); + for (int j = 0; j < FLAGS_warmup; ++j) { + paddle::lite::arm::math::sgemm_prepack_c4( + m, n, k, da_c4, db_c4, dc, dbias, has_bias, has_relu, &ctx); + } + + for (int i = 0; i < FLAGS_repeats; ++i) { + t0.Start(); + paddle::lite::arm::math::sgemm_prepack_c4( + m, n, k, da_c4, db_c4, dc, dbias, has_bias, has_relu, &ctx); + t0.Stop(); + } + LOG(INFO) << "M: " << m << ", N: " << n << ", K: " << k + << ", power_mode: " << cls << ", threads: " << ths + << ", GOPS: " << ops * 1e-9f + << " GOPS, avg time: " << t0.LapTimes().Avg() + << " ms, min time: " << t0.LapTimes().Min() + << " ms, mean GOPs: " << ops * 1e-6f / t0.LapTimes().Avg() + << " GOPs, max GOPs: " << ops * 1e-6f / t0.LapTimes().Min() + << " GOPs"; + + if (FLAGS_check_result) { + double max_ratio = 0; + double max_diff = 0; + tensor_cmp_host(tc_basic, tc, max_ratio, max_diff); + LOG(INFO) << "compare result, max diff: " << max_diff + << ", max ratio: " << max_ratio; + if (std::abs(max_ratio) > 1e-4f && std::abs(max_diff) > 5e-5f) { + Tensor tdiff; + tdiff.set_precision(PRECISION(kFloat)); + tdiff.Resize(tc.dims()); + tensor_diff(tc_basic, tc, tdiff); + LOG(INFO) << "a: "; + print_tensor(ta); + LOG(INFO) << "a_c4: "; + print_tensor(ta_c4); + LOG(INFO) << "b: "; + print_tensor(tb); + LOG(INFO) << "b_c4: "; + print_tensor(tb_c4); + LOG(INFO) << "basic result: "; + print_tensor(tc_basic); + LOG(INFO) << "lite result: "; + print_tensor(tc); + LOG(INFO) << "diff result: "; + print_tensor(tdiff); + return false; + } + } +#endif + return true; +} + +TEST(TestSgemmC4, test_func_sgemm_c4_prepacked) { + if (FLAGS_basic_test) { +#ifdef LITE_WITH_ARM + paddle::lite::DeviceInfo::Init(); +#endif + LOG(INFO) << "run basic sgemm_c4 test"; + for (auto& m : {1, 3, 8, 32, 397}) { + for (auto& n : {1, 2, 3, 4, 13, 141, 789}) { + for (auto& k : {1, 3, 8, 59, 234}) { + for (auto& has_bias : {false, true}) { + for (auto& has_relu : {false, true}) { + for (auto& th : {1, 2, 4}) { + auto flag = test_sgemm_c4( + m, n, k, has_bias, has_relu, FLAGS_power_mode, th); + if (flag) { + LOG(INFO) << "test m = " << m << ", n=" << n << ", k=" << k + << ", bias: " << (has_bias ? "true" : "false") + << ", relu: " << (has_relu ? "true" : "false") + << " passed\n"; + } else { + LOG(FATAL) << "test m = " << m << ", n=" << n << ", k=" << k + << ", bias: " << (has_bias ? "true" : "false") + << ", relu: " << (has_relu ? 
"true" : "false") + << " failed\n"; + } + } + } + } + } + } + } + } +} + +TEST(TestSgemmC4Custom, test_func_sgemm_c4_prepacked_custom) { +#ifdef LITE_WITH_ARM + paddle::lite::DeviceInfo::Init(); +#endif + auto flag = test_sgemm_c4(FLAGS_M, + FLAGS_N, + FLAGS_K, + FLAGS_flag_bias, + FLAGS_flag_relu, + FLAGS_power_mode, + FLAGS_threads); + if (!flag) { + LOG(FATAL) << "test m = " << FLAGS_M << ", n=" << FLAGS_N + << ", k=" << FLAGS_K << ", bias: " << FLAGS_flag_bias + << ", relu: " << FLAGS_flag_relu << " failed!!"; + } + LOG(INFO) << "test m = " << FLAGS_M << ", n=" << FLAGS_N << ", k=" << FLAGS_K + << ", bias: " << FLAGS_flag_bias << ", relu: " << FLAGS_flag_relu + << " passed!!"; +} diff --git a/lite/tests/math/sgemm_compute_test.cc b/lite/tests/math/sgemm_compute_test.cc index 1621ceb9047125d0d2a4141a01111eb54892dee9..6df5e671fe5138ab6b6ac5941604b9b91759a661 100644 --- a/lite/tests/math/sgemm_compute_test.cc +++ b/lite/tests/math/sgemm_compute_test.cc @@ -20,12 +20,12 @@ #include "lite/backends/arm/math/funcs.h" #endif // LITE_WITH_ARM #include "lite/core/context.h" +#include "lite/core/profile/timer.h" #include "lite/core/tensor.h" #include "lite/tests/utils/tensor_utils.h" -#include "lite/tests/utils/timer.h" typedef paddle::lite::Tensor Tensor; -using paddle::lite::Timer; +using paddle::lite::profile::Timer; DEFINE_int32(power_mode, 3, @@ -171,7 +171,7 @@ bool test_sgemm(bool tra, if (i == FLAGS_repeats - 1) { memcpy(dc, dc_backup, sizeof(float) * m * ldc); } - t0.start(); + t0.Start(); paddle::lite::arm::math::sgemm_prepack(trb, m, n, @@ -186,15 +186,15 @@ bool test_sgemm(bool tra, has_bias, has_relu, &ctx); - t0.end(); + t0.Stop(); } LOG(INFO) << "M: " << m << ", N: " << n << ", K: " << k << ", power_mode: " << cls << ", threads: " << ths << ", GOPS: " << ops * 1e-9f - << " GOPS, avg time: " << t0.get_average_ms() - << " ms, min time: " << t0.get_min_time() - << " ms, mean GOPs: " << ops * 1e-6f / t0.get_average_ms() - << " GOPs, max GOPs: " << ops * 1e-6f / t0.get_min_time() + << " GOPS, avg time: " << t0.LapTimes().Avg() + << " ms, min time: " << t0.LapTimes().Min() + << " ms, mean GOPs: " << ops * 1e-6f / t0.LapTimes().Avg() + << " GOPs, max GOPs: " << ops * 1e-6f / t0.LapTimes().Min() << " GOPs"; if (FLAGS_check_result) { diff --git a/lite/tests/math/sgemv_compute_test.cc b/lite/tests/math/sgemv_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..5dd2d322955d2c628366075a6dddb31bed2338ee --- /dev/null +++ b/lite/tests/math/sgemv_compute_test.cc @@ -0,0 +1,194 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include "lite/tests/utils/fill_data.h" +#include "lite/tests/utils/naive_math_impl.h" +#ifdef LITE_WITH_ARM +#include "lite/backends/arm/math/funcs.h" +#endif // LITE_WITH_ARM +#include "lite/core/context.h" +#include "lite/core/profile/timer.h" +#include "lite/core/tensor.h" +#include "lite/tests/utils/tensor_utils.h" + +typedef paddle::lite::Tensor Tensor; + +DEFINE_int32(cluster, 3, "cluster id"); +DEFINE_int32(threads, 1, "threads num"); +DEFINE_int32(warmup, 0, "warmup times"); +DEFINE_int32(repeats, 1, "repeats times"); +DEFINE_bool(basic_test, true, "do all tests"); +DEFINE_bool(check_result, true, "check the result"); + +DEFINE_int32(M, 512, "sgemv: M"); +DEFINE_int32(K, 512, "sgemv: K"); + +DEFINE_bool(traA, false, "gemv: A transpose"); + +DEFINE_bool(flag_relu, false, "do relu"); +DEFINE_bool(flag_bias, false, "with bias"); + +bool test_sgemv( + bool tra, int m, int k, bool has_bias, bool has_relu, int cls, int ths) { + Tensor ta; + Tensor tb; + Tensor tc; + Tensor tc_basic; + Tensor tbias; + + ta.Resize({m, k}); + tb.Resize({k, 1}); + tc.Resize({m, 1}); + tc_basic.Resize({m, 1}); + tbias.Resize({m}); + + ta.set_precision(PRECISION(kFloat)); + tb.set_precision(PRECISION(kFloat)); + tc.set_precision(PRECISION(kFloat)); + tc_basic.set_precision(PRECISION(kFloat)); + tbias.set_precision(PRECISION(kFloat)); + + fill_tensor_rand(ta, -1.f, 1.f); + // fill_tensor_const(ta, 1.f); + fill_tensor_rand(tb, -1.f, 1.f); + // fill_tensor_const(tb, 1.f); + fill_tensor_rand(tbias, -1.f, 1.f); + + LOG(INFO) << "sgemv M: " << m << ", K: " << k + << ", transA: " << (tra ? "true" : "false") + << ", relu: " << (has_relu ? "true" : "false") + << ", bias: " << (has_bias ? "true" : "false"); +#ifdef LITE_WITH_ARM + + auto da = ta.mutable_data(); + auto db = tb.mutable_data(); + auto dc = tc.mutable_data(); + auto dc_basic = tc_basic.mutable_data(); + auto dbias = tbias.mutable_data(); + + if (FLAGS_check_result) { + basic_gemv( + m, k, da, db, dbias, dc_basic, 1.f, 0.f, tra, has_bias, has_relu); + } + paddle::lite::profile::Timer t0; + //! 
compute + double ops = 2.0 * m * k; + std::unique_ptr ctx1( + new paddle::lite::KernelContext); + auto& ctx = ctx1->As(); + ctx.SetRunMode(static_cast(cls), ths); + /// warmup + for (int j = 0; j < FLAGS_warmup; ++j) { + paddle::lite::arm::math::sgemv( + da, db, dc, tra, m, k, has_bias, dbias, has_relu, &ctx); + } + + t0.Reset(); + for (int i = 0; i < FLAGS_repeats; ++i) { + t0.Start(); + paddle::lite::arm::math::sgemv( + da, db, dc, tra, m, k, has_bias, dbias, has_relu, &ctx); + t0.Stop(); + } + LOG(INFO) << "gemv output: M: " << m << ", K: " << k << ", cluster: " << cls + << ", threads: " << ths << ", GOPS: " << ops * 1e-9f + << " GOPS, avg time: " << t0.LapTimes().Avg() + << " ms, min time: " << t0.LapTimes().Min() + << " ms, mean GOPs: " << ops * 1e-6f / t0.LapTimes().Avg() + << " GOPs, max GOPs: " << ops * 1e-6f / t0.LapTimes().Min() + << " GOPs"; + + if (FLAGS_check_result) { + double max_ratio = 0; + double max_diff = 0; + /// fp32 result + tensor_cmp_host(tc_basic, tc, max_ratio, max_diff); + LOG(INFO) << "compare result, max diff: " << max_diff + << ", max ratio: " << max_ratio; + if (std::abs(max_ratio) > 1e-4f && std::abs(max_diff) > 5e-5f) { + Tensor tdiff; + tdiff.set_precision(PRECISION(kFloat)); + tdiff.Resize(tc.dims()); + tensor_diff(tc_basic, tc, tdiff); + LOG(INFO) << "basic result: "; + print_tensor(tc_basic); + LOG(INFO) << "saber result: "; + print_tensor(tc); + LOG(INFO) << "diff result: "; + print_tensor(tdiff); + return false; + } + } +#endif + return true; +} + +TEST(TestLiteSgemv, Sgemv) { + if (FLAGS_basic_test) { +#ifdef LITE_WITH_ARM + paddle::lite::DeviceInfo::Init(); +#endif + LOG(INFO) << "run basic sgemv test"; + for (auto& m : {1, 3, 8, 21, 32, 397}) { + for (auto& k : {1, 3, 8, 17, 59, 234}) { + for (auto& tra : {true, false}) { + for (auto& has_bias : {false, true}) { + for (auto& has_relu : {false, true}) { + for (auto& th : {1, 2, 4}) { + auto flag = test_sgemv( + tra, m, k, has_bias, has_relu, FLAGS_cluster, th); + if (flag) { + LOG(INFO) << "test m = " << m << ", k=" << k + << ", bias: " << (has_bias ? "true" : "false") + << ", relu: " << (has_relu ? "true" : "false") + << ", trans A: " << (tra ? "true" : "false") + << ", threads: " << th << " passed\n"; + } else { + LOG(FATAL) << "test m = " << m << ", k=" << k + << ", bias: " << (has_bias ? "true" : "false") + << ", relu: " << (has_relu ? "true" : "false") + << ", trans A: " << (tra ? 
"true" : "false") + << ", threads: " << th << " failed\n"; + } + } + } + } + } + } + } + } +} + +TEST(TestSgemvCustom, Sgemv_custom) { +#ifdef LITE_WITH_ARM + paddle::lite::DeviceInfo::Init(); +#endif + auto flag = test_sgemv(FLAGS_traA, + FLAGS_M, + FLAGS_K, + FLAGS_flag_bias, + FLAGS_flag_relu, + FLAGS_cluster, + FLAGS_threads); + if (!flag) { + LOG(FATAL) << "test m = " << FLAGS_M << ", k=" << FLAGS_K + << ", trans A: " << FLAGS_traA << ", bias: " << FLAGS_flag_bias + << ", relu: " << FLAGS_flag_relu << " failed!!"; + } + LOG(INFO) << "test m = " << FLAGS_M << ", k=" << FLAGS_K + << ", trans A: " << FLAGS_traA << ", bias: " << FLAGS_flag_bias + << ", relu: " << FLAGS_flag_relu << " passed!!"; +} diff --git a/lite/tests/utils/naive_math_impl.h b/lite/tests/utils/naive_math_impl.h index 846126ac247ee685bd8772ede87635c45b52f79a..fd868e85acdbdfe39abc0bcbfa50f85db12a50b6 100644 --- a/lite/tests/utils/naive_math_impl.h +++ b/lite/tests/utils/naive_math_impl.h @@ -14,6 +14,108 @@ #pragma once +template +static void basic_trans_mat_to_c4(const type* input, + type* output, + const int ldin, + const int M, + const int K, + bool pack_k) { + const int m_round = (M + 3) / 4 * 4; + int k_round = (K + 3) / 4 * 4; + if (!pack_k) { + k_round = K; + } + const int m_loop = m_round / 4; + type zero_buf[K]; + memset(zero_buf, 0, K * sizeof(type)); + for (int i = 0; i < m_loop; ++i) { + const type* in0 = input + i * 4 * ldin; + const type* in1 = in0 + ldin; + const type* in2 = in1 + ldin; + const type* in3 = in2 + ldin; + if (4 * (i + 1) - M > 0) { + switch (4 * (i + 1) - M) { + case 3: + in1 = zero_buf; + case 2: + in2 = zero_buf; + case 1: + in3 = zero_buf; + default: + break; + } + } + for (int j = 0; j < K; ++j) { + *output++ = *in0++; + *output++ = *in1++; + *output++ = *in2++; + *output++ = *in3++; + } + for (int j = K; j < k_round; ++j) { + *output++ = static_cast(0); + *output++ = static_cast(0); + *output++ = static_cast(0); + *output++ = static_cast(0); + } + } +} + +template +static void basic_gemm_c4(bool trans_a, + bool trans_b, + int m, + int n, + int k, + type2 alpha, + const type* a, + int lda, + const type* b, + int ldb, + type2 beta, + type2* c, + int ldc, + const type2* bias, + bool flag_bias = false, + bool flag_relu = false) { + type2* tmp_c = reinterpret_cast(malloc(m * ldc * sizeof(type2))); + memset(tmp_c, 0, m * ldc * sizeof(type2)); +#pragma omp parallel for + for (int i = 0; i < m; ++i) { + auto bias_data = static_cast(0); + if (flag_bias) { + bias_data = bias[i]; + } + for (int j = 0; j < n; ++j) { + auto sum = static_cast(0); + for (int l = 0; l < k; ++l) { + type av; + type bv; + if (trans_a) { + av = a[l * lda + i]; + } else { + av = a[i * lda + l]; + } + if (trans_b) { + bv = b[j * ldb + l]; + } else { + bv = b[l * ldb + j]; + } + sum += av * bv; + } + type2 tmp = alpha * sum + beta * tmp_c[i * ldc + j] + bias_data; + if (flag_relu) { + tmp_c[i * ldc + j] = tmp > (type2)0 ? tmp : (type2)0; + } else { + tmp_c[i * ldc + j] = tmp; + } + } + } + //! 
trans c to c4 + basic_trans_mat_to_c4(tmp_c, c, ldc, m, n, false); + free(tmp_c); +} + template static void basic_gemm(bool trans_a, bool trans_b, @@ -228,8 +330,10 @@ static void col2im(const Dtype* data_col, const int width, const int kernel_h, const int kernel_w, - const int pad_h, - const int pad_w, + const int pad_h0, + const int pad_h1, + const int pad_w0, + const int pad_w1, const int stride_h, const int stride_w, const int dilation_h, @@ -237,21 +341,24 @@ static void col2im(const Dtype* data_col, Dtype* data_im) { memset(data_im, 0, height * width * channels * sizeof(Dtype)); const int output_h = - (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; + (height + pad_h0 + pad_h1 - (dilation_h * (kernel_h - 1) + 1)) / + stride_h + + 1; const int output_w = - (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; + (width + pad_w0 + pad_w1 - (dilation_w * (kernel_w - 1) + 1)) / stride_w + + 1; const int channel_size = height * width; for (int channel = channels; channel--; data_im += channel_size) { for (int kernel_row = 0; kernel_row < kernel_h; kernel_row++) { for (int kernel_col = 0; kernel_col < kernel_w; kernel_col++) { - int input_row = -pad_h + kernel_row * dilation_h; + int input_row = -pad_h0 + kernel_row * dilation_h; for (int output_rows = output_h; output_rows; output_rows--) { if (!is_a_ge_zero_and_a_lt_b(input_row, height)) { data_col += output_w; } else { - int input_col = -pad_w + kernel_col * dilation_w; + int input_col = -pad_w0 + kernel_col * dilation_w; for (int output_col = output_w; output_col; output_col--) { if (is_a_ge_zero_and_a_lt_b(input_col, width)) { @@ -289,8 +396,10 @@ void deconv_basic(const Dtype1* din, int stride_h, int dila_w, int dila_h, - int pad_w, - int pad_h, + int pad_w0, + int pad_w1, + int pad_h0, + int pad_h1, bool flag_bias, bool flag_relu) { int m = chout * kernel_w * kernel_h / group; @@ -302,8 +411,9 @@ void deconv_basic(const Dtype1* din, int group_size_coldata = m * n; int group_size_weights = chin * chout * kernel_w * kernel_h / (group * group); bool flag_1x1s1p1 = (kernel_w == 1) && (kernel_h == 1) && (stride_h == 1) && - (stride_w == 1) && (pad_w == 1) && (pad_h == 1) && - (dila_w == 1) && (dila_h == 1); + (stride_w == 1) && (pad_w0 == 0) && (pad_h0 == 0) && + (pad_w1 == 0) && (pad_h1 == 0) && (dila_w == 1) && + (dila_h == 1); Dtype2* workspace_ptr = static_cast(malloc(sizeof(float) * m * n * group)); @@ -316,7 +426,7 @@ void deconv_basic(const Dtype1* din, if (flag_1x1s1p1) { col_data = dout_batch; } - memset(col_data, 0, sizeof(Dtype2) * group_size_coldata); + memset(col_data, 0, sizeof(Dtype2) * group_size_coldata * group); for (int g = 0; g < group; ++g) { const Dtype1* din_group = din_batch + g * group_size_in; const Dtype1* weights_group = weights + g * group_size_weights; @@ -346,8 +456,10 @@ void deconv_basic(const Dtype1* din, wout, kernel_h, kernel_w, - pad_h, - pad_w, + pad_h0, + pad_h1, + pad_w0, + pad_w1, stride_h, stride_w, dila_h, diff --git a/lite/tools/build.sh b/lite/tools/build.sh index 4873e70773f31425d628ee2bbdd36f2cb2f921f1..319f26ff82dd47718a7fc69d64522ca622ecaf3e 100755 --- a/lite/tools/build.sh +++ b/lite/tools/build.sh @@ -20,6 +20,7 @@ BUILD_DIR=$(pwd) OPTMODEL_DIR="" BUILD_TAILOR=OFF BUILD_CV=OFF +SHUTDOWN_LOG=ON readonly THIRDPARTY_TAR=https://paddle-inference-dist.bj.bcebos.com/PaddleLite/third-party-05b862.tar.gz @@ -93,7 +94,7 @@ function make_tiny_publish_so { -DWITH_TESTING=OFF \ -DLITE_WITH_JAVA=$BUILD_JAVA \ -DLITE_WITH_PYTHON=$BUILD_PYTHON \ - 
-DLITE_SHUTDOWN_LOG=ON \ + -DLITE_SHUTDOWN_LOG=$SHUTDOWN_LOG \ -DLITE_ON_TINY_PUBLISH=ON \ -DANDROID_STL_TYPE=$android_stl \ -DLITE_BUILD_EXTRA=$BUILD_EXTRA \ @@ -136,7 +137,7 @@ function make_full_publish_so { -DWITH_TESTING=OFF \ -DLITE_WITH_JAVA=$BUILD_JAVA \ -DLITE_WITH_PYTHON=$BUILD_PYTHON \ - -DLITE_SHUTDOWN_LOG=ON \ + -DLITE_SHUTDOWN_LOG=$SHUTDOWN_LOG \ -DANDROID_STL_TYPE=$android_stl \ -DLITE_BUILD_EXTRA=$BUILD_EXTRA \ -DLITE_WITH_CV=$BUILD_CV \ @@ -236,10 +237,10 @@ function make_cuda { -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=OFF \ -DWITH_TESTING=OFF \ -DLITE_WITH_ARM=OFF \ - -DLITE_WITH_PYTHON=ON \ + -DLITE_WITH_PYTHON=${BUILD_PYTHON} \ -DLITE_BUILD_EXTRA=ON - - make publish_inference_python_lib -j8 + + make publish_inference -j4 cd - } @@ -290,6 +291,7 @@ function print_usage { echo -e " ./build.sh --arm_os= --arm_abi= --arm_lang= test" echo echo -e "optional argument:" + echo -e "--shutdown_log: (OFF|ON); controls whether to shutdown log, default is ON" echo -e "--build_extra: (OFF|ON); controls whether to publish extra operators and kernels for (sequence-related model such as OCR or NLP)" echo -e "--build_python: (OFF|ON); controls whether to publish python api lib (ANDROID and IOS is not supported)" echo -e "--build_java: (OFF|ON); controls whether to publish java api lib (Only ANDROID is supported)" @@ -366,6 +368,10 @@ function main { BUILD_TAILOR="${i#*=}" shift ;; + --shutdown_log=*) + SHUTDOWN_LOG="${i#*=}" + shift + ;; tiny_publish) make_tiny_publish_so $ARM_OS $ARM_ABI $ARM_LANG $ANDROID_STL shift diff --git a/lite/tools/build_npu.sh b/lite/tools/build_npu.sh index 03a74046f17ad03bccc7b6d5050acae9d643686c..1509f563b2e4f2008e7ea4f37ca4e5491464e9cc 100755 --- a/lite/tools/build_npu.sh +++ b/lite/tools/build_npu.sh @@ -5,8 +5,8 @@ set -ex ARM_OS="android" # android only yet ARM_ABI="armv8" # armv8, armv7 ARM_LANG="gcc" # gcc only yet -ANDROID_STL="c++_static" # c++_shared, c++_static -DDK_ROOT="$(pwd)/ai_ddk_lib/" # HIAI SDK from https://developer.huawei.com/consumer/cn/hiai/ +ANDROID_STL="c++_shared" # c++_shared/c++_static, c++_shared is used by HiAI DDK 310 +DDK_ROOT="$(pwd)/ai_ddk_lib/" # HiAI DDK 310 from https://developer.huawei.com/consumer/cn/hiai/ TARGET_NAME="test_npu_pass" # default target BUILD_EXTRA=OFF # ON(with sequence ops)/OFF WITH_JAVA=ON # ON(build jar and jni so)/OFF diff --git a/lite/tools/ci_build.sh b/lite/tools/ci_build.sh index 8be8e6e6b6da1e2aa38b6fcbcf95b23a8543a5be..8b5741a7a68bee3e783dff68e4bd4a8fc7cd8527 100755 --- a/lite/tools/ci_build.sh +++ b/lite/tools/ci_build.sh @@ -1,9 +1,10 @@ #!/bin/bash +# The git version of CI is 2.7.4. This script is not compatible with git version 1.7.1. set -ex TESTS_FILE="./lite_tests.txt" LIBS_FILE="./lite_libs.txt" - +CUDNN_ROOT="/usr/local/cudnn" readonly ADB_WORK_DIR="/data/local/tmp" readonly common_flags="-DWITH_LITE=ON -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=OFF -DWITH_PYTHON=OFF -DWITH_TESTING=ON -DLITE_WITH_ARM=OFF" @@ -162,6 +163,12 @@ function cmake_x86_for_CI { # make test_generated_code -j$NUM_CORES_FOR_COMPILE } +function cmake_cuda_for_CI { + prepare_workspace # fake an empty __generated_code__.cc to pass cmake. + cmake .. -DLITE_WITH_CUDA=ON -DWITH_MKLDNN=OFF -DLITE_WITH_X86=OFF ${common_flags} -DLITE_WITH_PROFILE=ON -DWITH_MKL=OFF \ + -DLITE_BUILD_EXTRA=ON -DCUDNN_ROOT=${CUDNN_ROOT} +} + function cmake_gpu { prepare_workspace cmake .. " -DWITH_GPU=ON {common_flags} -DLITE_WITH_GPU=ON" @@ -195,7 +202,6 @@ function test_server { # Due to the missing of x86 kernels, we skip the following tests temporarily. 
# TODO(xxx) clear the skip list latter local skip_list=("test_paddle_api" "test_cxx_api" - "test_mobilenetv1_lite_x86" "test_mobilenetv2_lite_x86" "test_light_api" "test_apis" "test_model_bin" ) @@ -227,6 +233,16 @@ function build_test_server { test_model_optimize_tool_compile } +# The CUDA version of CI is cuda_10.1.243_418.87.00_linux. +# The cuDNN version is cudnn-10.1-linux-x64-v7.5.0.56. +function build_test_cuda_server { + mkdir -p ./build + cd ./build + export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$PWD/third_party/install/mklml/lib" + cmake_cuda_for_CI + build +} + function build_test_train { mkdir -p ./build cd ./build @@ -951,6 +967,10 @@ function main { test_arm_android $TEST_NAME $ARM_PORT shift ;; + build_test_cuda_server) + build_test_cuda_server + shift + ;; build_test_server) build_test_server shift diff --git a/lite/tools/debug/debug_utils.h b/lite/tools/debug/debug_utils.h index 7f77b90488657aab96c7942d703e86d64723f5fc..ff08c47e524cacee37e95572a7f7a2fb444d4d16 100644 --- a/lite/tools/debug/debug_utils.h +++ b/lite/tools/debug/debug_utils.h @@ -27,7 +27,7 @@ #include "lite/model_parser/pb/var_desc.h" #include "lite/utils/all.h" -DEFINE_string(model_dir, "", "Model dir path"); +DEFINE_string(model_path, "", "Model dir path"); DEFINE_string(input_file, "", "Input datas file path"); DEFINE_string(topo_output_file, "", "Runtime topology order output file path"); DEFINE_bool(output_topo, true, "Dump runtime topology or not"); @@ -185,7 +185,7 @@ void ParseConfig(DebugConfig* conf) { CHECK(conf); #define CHECK_NON_EMPTY(name__) \ CHECK(!FLAGS_##name__.empty()) << "Option " << #name__ << " can't be empty." - CHECK_NON_EMPTY(model_dir); + CHECK_NON_EMPTY(model_path); if (FLAGS_output_topo) { CHECK_NON_EMPTY(topo_output_file); } @@ -193,7 +193,7 @@ void ParseConfig(DebugConfig* conf) { CHECK_NON_EMPTY(tensor_output_file); } #undef CHECK_NON_EMPTY - conf->model_dir = FLAGS_model_dir; + conf->model_dir = FLAGS_model_path; conf->topo_output_file = FLAGS_topo_output_file; conf->tensor_output_file = FLAGS_tensor_output_file; conf->input_file = FLAGS_input_file; diff --git a/lite/utils/cv/paddle_image_preprocess.cc b/lite/utils/cv/paddle_image_preprocess.cc index 0bccfe2804a9ba17473575815bfe4b2e9635f234..f18047556874a82d28c5964a1b5fd2fa8284c814 100644 --- a/lite/utils/cv/paddle_image_preprocess.cc +++ b/lite/utils/cv/paddle_image_preprocess.cc @@ -69,240 +69,6 @@ void ImagePreprocess::imageResize(const uint8_t* src, int dstw, int dsth) { resize(src, dst, srcFormat, srcw, srch, dstw, dsth); - /* - int size = srcw * srch; - if (srcw == dstw && srch == dsth) { - if (srcFormat == NV12 || srcFormat == NV21) { - size = srcw * (floor(1.5 * srch)); - } else if (srcFormat == BGR || srcFormat == RGB) { - size = 3 * srcw * srch; - } else if (srcFormat == BGRA || srcFormat == RGBA) { - size = 4 * srcw * srch; - } - memcpy(dst, src, sizeof(uint8_t) * size); - return; - } - double scale_x = static_cast(srcw / dstw); - double scale_y = static_cast(srch / dsth); - - int* buf = new int[dstw * 2 + dsth * 2]; - - int* xofs = buf; - int* yofs = buf + dstw; - int16_t* ialpha = reinterpret_cast(buf + dstw + dsth); - int16_t* ibeta = reinterpret_cast(buf + 2 * dstw + dsth); - - compute_xy( - srcw, srch, dstw, dsth, scale_x, scale_y, xofs, yofs, ialpha, ibeta); - - int w_out = dstw; - int w_in = srcw; - int num = 1; - int orih = dsth; - if (srcFormat == GRAY) { - num = 1; - } else if (srcFormat == NV12 || srcFormat == NV21) { - num = 1; - int hout = static_cast(0.5 * dsth); - dsth += hout; - } else if (srcFormat == 
BGR || srcFormat == RGB) { - w_in = srcw * 3; - w_out = dstw * 3; - num = 3; - - } else if (srcFormat == BGRA || srcFormat == RGBA) { - w_in = srcw * 4; - w_out = dstw * 4; - num = 4; - } - - int* xofs1 = nullptr; - int* yofs1 = nullptr; - int16_t* ialpha1 = nullptr; - if (orih < dsth) { // uv - int tmp = dsth - orih; - int w = dstw / 2; - xofs1 = new int[w]; - yofs1 = new int[tmp]; - ialpha1 = new int16_t[srcw]; - compute_xy(srcw / 2, - srch / 2, - w, - tmp, - scale_x, - scale_y, - xofs1, - yofs1, - ialpha1, - ibeta + orih); - } - int cnt = w_out >> 3; - int remain = w_out % 8; - int32x4_t _v2 = vdupq_n_s32(2); - #pragma omp parallel for - for (int dy = 0; dy < dsth; dy++) { - int16_t* rowsbuf0 = new int16_t[w_out]; - int16_t* rowsbuf1 = new int16_t[w_out]; - int sy = yofs[dy]; - if (dy >= orih) { - xofs = xofs1; - yofs = yofs1; - ialpha = ialpha1; - } - if (sy < 0) { - memset(rowsbuf0, 0, sizeof(uint16_t) * w_out); - const uint8_t* S1 = src + srcw * (sy + 1); - const int16_t* ialphap = ialpha; - int16_t* rows1p = rowsbuf1; - for (int dx = 0; dx < dstw; dx++) { - int sx = xofs[dx] * num; // num = 4 - int16_t a0 = ialphap[0]; - int16_t a1 = ialphap[1]; - - const uint8_t* S1pl = S1 + sx; - const uint8_t* S1pr = S1 + sx + num; - if (sx < 0) { - S1pl = S1; - } - for (int i = 0; i < num; i++) { - if (sx < 0) { - *rows1p++ = ((*S1pl++) * a1) >> 4; - } else { - *rows1p++ = ((*S1pl++) * a0 + (*S1pr++) * a1) >> 4; - } - } - ialphap += 2; - } - } else { - // hresize two rows - const uint8_t* S0 = src + w_in * (sy); - const uint8_t* S1 = src + w_in * (sy + 1); - const int16_t* ialphap = ialpha; - int16_t* rows0p = rowsbuf0; - int16_t* rows1p = rowsbuf1; - for (int dx = 0; dx < dstw; dx++) { - int sx = xofs[dx] * num; // num = 4 - int16_t a0 = ialphap[0]; - int16_t a1 = ialphap[1]; - - const uint8_t* S0pl = S0 + sx; - const uint8_t* S0pr = S0 + sx + num; - const uint8_t* S1pl = S1 + sx; - const uint8_t* S1pr = S1 + sx + num; - if (sx < 0) { - S0pl = S0; - S1pl = S1; - } - for (int i = 0; i < num; i++) { - if (sx < 0) { - *rows0p = ((*S0pl++) * a1) >> 4; - *rows1p = ((*S1pl++) * a1) >> 4; - rows0p++; - rows1p++; - } else { - *rows0p++ = ((*S0pl++) * a0 + (*S0pr++) * a1) >> 4; - *rows1p++ = ((*S1pl++) * a0 + (*S1pr++) * a1) >> 4; - } - } - ialphap += 2; - } - } - int ind = dy * 2; - int16_t b0 = ibeta[ind]; - int16_t b1 = ibeta[ind + 1]; - int16x8_t _b0 = vdupq_n_s16(b0); - int16x8_t _b1 = vdupq_n_s16(b1); - uint8_t* dp_ptr = dst + dy * w_out; - int16_t* rows0p = rowsbuf0; - int16_t* rows1p = rowsbuf1; - int re_cnt = cnt; - if (re_cnt > 0) { - #ifdef __aarch64__ - asm volatile( - "1: \n" - "ld1 {v0.8h}, [%[rows0p]], #16 \n" - "ld1 {v1.8h}, [%[rows1p]], #16 \n" - "orr v6.16b, %w[_v2].16b, %w[_v2].16b \n" - "orr v7.16b, %w[_v2].16b, %w[_v2].16b \n" - "smull v2.4s, v0.4h, %w[_b0].4h \n" - "smull2 v4.4s, v0.8h, %w[_b0].8h \n" - "smull v3.4s, v1.4h, %w[_b1].4h \n" - "smull2 v5.4s, v1.8h, %w[_b1].8h \n" - - "ssra v6.4s, v2.4s, #16 \n" - "ssra v7.4s, v4.4s, #16 \n" - "ssra v6.4s, v3.4s, #16 \n" - "ssra v7.4s, v5.4s, #16 \n" - - "shrn v0.4h, v6.4s, #2 \n" - "shrn2 v0.8h, v7.4s, #2 \n" - "subs %w[cnt], %w[cnt], #1 \n" - "sqxtun v1.8b, v0.8h \n" - "st1 {v1.8b}, [%[dp]], #8 \n" - "bne 1b \n" - : [rows0p] "+r"(rows0p), - [rows1p] "+r"(rows1p), - [cnt] "+r"(re_cnt), - [dp] "+r"(dp_ptr) - : [_b0] "w"(_b0), [_b1] "w"(_b1), [_v2] "w"(_v2) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); - #else - asm volatile( - "mov r4, #2 \n" - "vdup.s32 q12, r4 \n" - "0: \n" - "vld1.s16 {d2-d3}, [%[rows0p]]!\n" 
- "vld1.s16 {d6-d7}, [%[rows1p]]!\n" - "vorr.s32 q10, q12, q12 \n" - "vorr.s32 q11, q12, q12 \n" - - "vmull.s16 q0, d2, %[_b0] \n" - "vmull.s16 q1, d3, %[_b0] \n" - "vmull.s16 q2, d6, %[_b1] \n" - "vmull.s16 q3, d7, %[_b1] \n" - - "vsra.s32 q10, q0, #16 \n" - "vsra.s32 q11, q1, #16 \n" - "vsra.s32 q10, q2, #16 \n" - "vsra.s32 q11, q3, #16 \n" - - "vshrn.s32 d20, q10, #2 \n" - "vshrn.s32 d21, q11, #2 \n" - "subs %[cnt], #1 \n" - "vqmovun.s16 d20, q10 \n" - "vst1.8 {d20}, [%[dp]]! \n" - "bne 0b \n" - : [rows0p] "+r"(rows0p), - [rows1p] "+r"(rows1p), - [cnt] "+r"(re_cnt), - [dp] "+r"(dp_ptr) - : [_b0] "w"(_b0), [_b1] "w"(_b1) - : "cc", - "memory", - "r4", - "q0", - "q1", - "q2", - "q3", - "q8", - "q9", - "q10", - "q11", - "q12"); - - #endif // __aarch64__ - } - for (int i = 0; i < remain; i++) { - // D[x] = (rows0[x]*b0 + rows1[x]*b1) >> - // INTER_RESIZE_COEF_BITS; - *dp_ptr++ = - (uint8_t)(((int16_t)((b0 * (int16_t)(*rows0p++)) >> 16) + - (int16_t)((b1 * (int16_t)(*rows1p++)) >> 16) + 2) >> - 2); - } - } - delete[] buf; - */ } void ImagePreprocess::imageResize(const uint8_t* src, uint8_t* dst) { diff --git a/lite/utils/cv/paddle_image_preprocess.h b/lite/utils/cv/paddle_image_preprocess.h index 11673e19041883bfa6ca7a45f03ca3feca76dd20..5a46a9e48e8202fe29ec9fc7d950ccf15920cc32 100644 --- a/lite/utils/cv/paddle_image_preprocess.h +++ b/lite/utils/cv/paddle_image_preprocess.h @@ -133,7 +133,7 @@ class ImagePreprocess { * color format support 1-channel image, 3-channel image and 4-channel image * param src: input image data * param dst: output image data - * param srcFormat: input image format, support GRAY, BGR(GRB) and BGRA(RGBA) + * param srcFormat: input image format, support GRAY, BGR(RGB) and BGRA(RGBA) * param srcw: input image width * param srch: input image height * param degree: Rotate degree, support 90, 180 and 270 @@ -158,7 +158,7 @@ class ImagePreprocess { * color format support 1-channel image, 3-channel image and 4-channel image * param src: input image data * param dst: output image data - * param srcFormat: input image format, support GRAY, BGR(GRB) and BGRA(RGBA) + * param srcFormat: input image format, support GRAY, BGR(RGB) and BGRA(RGBA) * param srcw: input image width * param srch: input image height * param flip_param: flip parameter, support X, Y and XY @@ -190,7 +190,7 @@ class ImagePreprocess { * NCHW * param src: input image data * param dstTensor: output tensor data - * param srcFormat: input image format, support BGR(GRB) and BGRA(RGBA) + * param srcFormat: input image format, support BGR(RGB) and BGRA(RGBA) * param srcw: input image width * param srch: input image height * param layout: output tensor layout,support NHWC and NCHW diff --git a/lite/utils/io.h b/lite/utils/io.h index 98a0f39b084c1ec0767299501f6f359dab2017b3..92405cae862f062090665aecc8eb7f207cf059e7 100644 --- a/lite/utils/io.h +++ b/lite/utils/io.h @@ -14,9 +14,12 @@ #pragma once +#include #include +#include #include #include +#include #include "lite/utils/cp_logging.h" #include "lite/utils/string.h" @@ -46,11 +49,68 @@ static void MkDirRecur(const std::string& path) { // read buffer from file static std::string ReadFile(const std::string& filename) { std::ifstream ifile(filename.c_str()); + if (!ifile.is_open()) { + LOG(FATAL) << "Open file: [" << filename << "] failed."; + } std::ostringstream buf; char ch; while (buf && ifile.get(ch)) buf.put(ch); + ifile.close(); return buf.str(); } +// read lines from file +static std::vector ReadLines(const std::string& filename) { + std::ifstream 
ifile(filename.c_str()); + if (!ifile.is_open()) { + LOG(FATAL) << "Open file: [" << filename << "] failed."; + } + std::vector res; + std::string tmp; + while (getline(ifile, tmp)) res.push_back(tmp); + ifile.close(); + return res; +} + +static void WriteLines(const std::vector& lines, + const std::string& filename) { + std::ofstream ofile(filename.c_str()); + if (!ofile.is_open()) { + LOG(FATAL) << "Open file: [" << filename << "] failed."; + } + for (const auto& line : lines) { + ofile << line << "\n"; + } + ofile.close(); +} + +static bool IsDir(const std::string& path) { + DIR* dir_fd = opendir(path.c_str()); + if (dir_fd == nullptr) return false; + closedir(dir_fd); + return true; +} + +static std::vector ListDir(const std::string& path, + bool only_dir = false) { + if (!IsDir(path)) { + LOG(FATAL) << "[" << path << "] is not a valid dir path."; + } + + std::vector paths; + DIR* parent_dir_fd = opendir(path.c_str()); + dirent* dp; + while ((dp = readdir(parent_dir_fd)) != nullptr) { + // Exclude '.', '..' and hidden dir + std::string name(dp->d_name); + if (name == "." || name == ".." || name[0] == '.') continue; + if (IsDir(Join({path, name}, "/"))) { + paths.push_back(name); + } + } + closedir(parent_dir_fd); + return paths; +} + } // namespace lite } // namespace paddle diff --git a/lite/utils/logging.cc b/lite/utils/logging.cc index 6351be95acdb7311f7d5604d9af3cfe8945bc424..e9ee5861baca85966ce53ac1570d7ebc23a002cb 100644 --- a/lite/utils/logging.cc +++ b/lite/utils/logging.cc @@ -43,10 +43,10 @@ void gen_log(STL::ostream& log_stream_, gettimeofday(&tv, NULL); // print date / time - log_stream_ << '[' << level << ' ' << std::setw(2) << 1 + tm_time.tm_mon - << '/' << std::setw(2) << tm_time.tm_mday << ' ' << std::setw(2) - << tm_time.tm_hour << ':' << std::setw(2) << tm_time.tm_min << ':' - << std::setw(2) << tm_time.tm_sec << '.' << std::setw(3) + log_stream_ << '[' << level << ' ' << STL::setw(2) << 1 + tm_time.tm_mon + << '/' << STL::setw(2) << tm_time.tm_mday << ' ' << STL::setw(2) + << tm_time.tm_hour << ':' << STL::setw(2) << tm_time.tm_min << ':' + << STL::setw(2) << tm_time.tm_sec << '.' 
<< STL::setw(3) << tv.tv_usec / 1000 << " "; if (len > kMaxLen) { diff --git a/lite/utils/logging.h b/lite/utils/logging.h index e85753ec301c62152ce484105d6c42ac1b69ab16..c2c999fd70f3eee78c1deaf5ec2c4fea4e4f3fd1 100644 --- a/lite/utils/logging.h +++ b/lite/utils/logging.h @@ -30,6 +30,18 @@ #include #include "lite/utils/replace_stl/stream.h" +#ifdef LITE_WITH_ANDROID +#include +// Android log macors +#define ANDROID_LOG_TAG "Paddle-Lite" +#define ANDROID_LOG_I(msg) \ + __android_log_print(ANDROID_LOG_INFO, ANDROID_LOG_TAG, msg) +#define ANDROID_LOG_W(msg) \ + __android_log_print(ANDROID_LOG_WARN, ANDROID_LOG_TAG, msg) +#define ANDROID_LOG_F(msg) \ + __android_log_print(ANDROID_LOG_FATAL, ANDROID_LOG_TAG, msg) +#endif + // NOLINTFILE() // LOG() @@ -93,11 +105,22 @@ class LogMessage { const char* func, int lineno, const char* level = "I") { + level_ = level; paddle::lite::gen_log(log_stream_, file, func, lineno, level); } ~LogMessage() { log_stream_ << '\n'; +#ifdef LITE_WITH_ANDROID + if (level_ == "I") { + ANDROID_LOG_I(log_stream_.str().c_str()); + } else if (level_ == "W") { + ANDROID_LOG_W(log_stream_.str().c_str()); + } else { + fprintf(stderr, "Unsupported log level: %s", level_.c_str()); + assert(false); + } +#endif fprintf(stderr, "%s", log_stream_.str().c_str()); } @@ -105,6 +128,7 @@ class LogMessage { protected: STL::stringstream log_stream_; + std::string level_; LogMessage(const LogMessage&) = delete; void operator=(const LogMessage&) = delete; @@ -121,7 +145,11 @@ class LogMessageFatal : public LogMessage { ~LogMessageFatal() { log_stream_ << '\n'; +#ifdef LITE_WITH_ANDROID + ANDROID_LOG_F(log_stream_.str().c_str()); +#endif fprintf(stderr, "%s", log_stream_.str().c_str()); + #ifndef LITE_ON_TINY_PUBLISH abort(); #else @@ -152,6 +180,9 @@ class VLogMessage { return; } log_stream_ << '\n'; +#ifdef LITE_WITH_ANDROID + ANDROID_LOG_I(log_stream_.str().c_str()); +#endif fprintf(stderr, "%s", log_stream_.str().c_str()); } diff --git a/lite/utils/replace_stl/stream.cc b/lite/utils/replace_stl/stream.cc index 61999a79e3d9e997b23943e46a419577ee2de44c..d821078e366b1ade8b093e08a63829bcf35c1376 100644 --- a/lite/utils/replace_stl/stream.cc +++ b/lite/utils/replace_stl/stream.cc @@ -13,6 +13,8 @@ // limitations under the License. 
#include "lite/utils/replace_stl/stream.h" +#include +#include #ifdef LITE_ON_TINY_PUBLISH @@ -20,93 +22,119 @@ namespace paddle { namespace lite { namespace replace_stl { +void ostream::pad(const std::string& text) { + if (display_width_ > 0) { + if (display_width_ < text.size()) { + fprintf(stderr, "Replace STL IO display length less than text\n"); + assert(false); + } else { + for (int i = 0; i < display_width_ - text.size(); ++i) { + data_.push_back(' '); + } + display_width_ = -1; + } + } +} + #ifdef LITE_SHUTDOWN_LOG #define ADD_DATA_AS_STRING(data_, obj_) #else -#define ADD_DATA_AS_STRING(data_, obj_) data_ = data_ + std::to_string(obj_) +#define ADD_DATA_AS_STRING(data_, obj_) \ + std::string text = std::to_string(obj_); \ + pad(text); \ + data_ = data_ + text; + #endif template <> ostream& ostream::operator<<(const char* obj) { - _data = _data + std::string(obj); + data_ = data_ + std::string(obj); return *this; } template <> ostream& ostream::operator<<(const char& obj) { - _data = _data + obj; + data_ = data_ + obj; return *this; } template <> ostream& ostream::operator<<(const std::string& obj) { - _data = _data + obj; + data_ = data_ + obj; return *this; } template <> ostream& ostream::operator<<(const int16_t& obj) { - ADD_DATA_AS_STRING(_data, obj); + ADD_DATA_AS_STRING(data_, obj); return *this; } template <> ostream& ostream::operator<<(const int& obj) { - ADD_DATA_AS_STRING(_data, obj); + ADD_DATA_AS_STRING(data_, obj); return *this; } template <> ostream& ostream::operator<<(const bool& obj) { - ADD_DATA_AS_STRING(_data, obj); + ADD_DATA_AS_STRING(data_, obj); return *this; } template <> ostream& ostream::operator<<(const long& obj) { // NOLINT - ADD_DATA_AS_STRING(_data, obj); + ADD_DATA_AS_STRING(data_, obj); return *this; } template <> ostream& ostream::operator<<(const long long& obj) { // NOLINT - ADD_DATA_AS_STRING(_data, obj); + ADD_DATA_AS_STRING(data_, obj); return *this; } template <> ostream& ostream::operator<<(const unsigned& obj) { - ADD_DATA_AS_STRING(_data, obj); + ADD_DATA_AS_STRING(data_, obj); return *this; } template <> ostream& ostream::operator<<(const unsigned long& obj) { // NOLINT - ADD_DATA_AS_STRING(_data, obj); + ADD_DATA_AS_STRING(data_, obj); return *this; } template <> ostream& ostream::operator<<(const unsigned long long& obj) { // NOLINT - ADD_DATA_AS_STRING(_data, obj); + ADD_DATA_AS_STRING(data_, obj); return *this; } template <> ostream& ostream::operator<<(const float& obj) { - ADD_DATA_AS_STRING(_data, obj); + ADD_DATA_AS_STRING(data_, obj); return *this; } template <> ostream& ostream::operator<<(const double& obj) { - ADD_DATA_AS_STRING(_data, obj); + ADD_DATA_AS_STRING(data_, obj); return *this; } template <> ostream& ostream::operator<<(const long double& obj) { - ADD_DATA_AS_STRING(_data, obj); + ADD_DATA_AS_STRING(data_, obj); + return *this; +} + +template <> +ostream& ostream::operator<<(const LiteIoWidth& obj) { + int width = obj.width; + assert(width > 0); + display_width_ = width; return *this; } diff --git a/lite/utils/replace_stl/stream.h b/lite/utils/replace_stl/stream.h index e6bb261706bd7f25943fd3a6fad1ba97b9dfe3a4..3288a1986906b3fd600b91b6a56ae7134644456f 100644 --- a/lite/utils/replace_stl/stream.h +++ b/lite/utils/replace_stl/stream.h @@ -29,18 +29,25 @@ namespace lite { namespace replace_stl { +struct LiteIoWidth { + explicit LiteIoWidth(int value) : width(value) {} + int width; +}; + +static LiteIoWidth setw(int width) { return LiteIoWidth(width); } + class ostream { public: ostream() {} - explicit ostream(const 
std::string& x) : _data(x) {} + explicit ostream(const std::string& x) : data_(x) {} ~ostream() {} - const char* c_str() { return _data.c_str(); } + const char* c_str() { return data_.c_str(); } - const std::string& str() { return _data; } + const std::string& str() { return data_; } const std::string& str(const std::string& x) { - _data = x; - return _data; + data_ = x; + return data_; } template @@ -50,7 +57,9 @@ class ostream { ostream& operator<<(const T* obj); private: - std::string _data; + void pad(const std::string& text); + std::string data_; + int display_width_{-1}; // -1 refers to no setting }; class stringstream : public ostream { diff --git a/mobile/src/fpga/V2/api.cpp b/mobile/src/fpga/V2/api.cpp index f39d012e08c124feacbd72fa2879e60b352c2785..1a90cb5bdc8b0cf96785b59cc37076b2beaa2572 100644 --- a/mobile/src/fpga/V2/api.cpp +++ b/mobile/src/fpga/V2/api.cpp @@ -623,7 +623,7 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input, arg->concat_arg.images_in[i] = (int8_t *)arg->conv_arg[i].output.address; // NOLINT - arg->concat_arg.scales_in[i] = arg->conv_arg[i].output.scale_address; + arg->concat_arg.scales_in[i] = out->scale; arg->concat_arg.channel_num[i] = arg->conv_arg[i].filter_num; expand_conv_arg(&arg->conv_arg[i]); diff --git a/mobile/src/fpga/V2/image.cpp b/mobile/src/fpga/V2/image.cpp old mode 100644 new mode 100755 index dc3c3356e838c88023d0efa1c40bf6f910aece89..917491c371a4433e212f4b7a74707d7350363821 --- a/mobile/src/fpga/V2/image.cpp +++ b/mobile/src/fpga/V2/image.cpp @@ -83,11 +83,6 @@ void concat_images(int8_t **images_in, float **scales_in, void *image_out, height * align_to_x(channel_num[i] * width, IMAGE_ALIGNMENT) * sizeof(int8_t)); - for (j = 0; - j < height * align_to_x(channel_num[i] * width, IMAGE_ALIGNMENT); - j++) { - images_in_tmp[i][j] = (int8_t)(images_in[i][j] * Ck + 0.5); - } } align_each_out_area_cw = align_to_x(each_out_line_channel * width, IMAGE_ALIGNMENT); @@ -102,7 +97,7 @@ void concat_images(int8_t **images_in, float **scales_in, void *image_out, memcpy( (int8_t *)image_out + tmp_channel + // NOLINT k * align_each_out_area_cw_differ, - images_in_tmp[i] + j * channel_num[i] + k * align_each_in_area_cw, + images_in[i] + j * channel_num[i] + k * align_each_in_area_cw, channel_num[i] * sizeof(int8_t)); tmp_channel += channel_num[i]; @@ -110,6 +105,10 @@ void concat_images(int8_t **images_in, float **scales_in, void *image_out, } } fpga_flush(image_out, height * align_each_out_area_cw * sizeof(int8_t)); + for (i = 0; i < image_num; i++) { + fpga_free(images_in_tmp[i]); + } + fpga_free(images_in_tmp); } void split_image(int8_t *image_in, void **images_out, int image_num, diff --git a/mobile/src/fpga/V2/pe.cpp b/mobile/src/fpga/V2/pe.cpp old mode 100644 new mode 100755 index aa150e0c6cecbdf278f3d776ebba4ec81ed003a1..a3c179994a2be8dc4a87441febc7e6db4ecd797c --- a/mobile/src/fpga/V2/pe.cpp +++ b/mobile/src/fpga/V2/pe.cpp @@ -109,7 +109,7 @@ using namespace std; // NOLINT #define REG_POOLING_IMAGE_ROW_MUL_PAD_HEIGHT 0x868 #define REG_POOLING_IMAGE_ROW_MUL_STEP_HEIGHT 0x870 #define REG_POOLING_RESULT_AMOUNT_ALIGN_32 0x878 -#define REG_POOLING_RESULT_AMOUNT_ALIGN_64 0x880 +#define REG_POOLING_RESULT_AMOUNT_ALIGN_16 0x880 #define REG_POOLING_IMAGE_CALCU_HEIGHT 0x888 #define REG_POOLING_IMAGE_PADLEFT_SKIPWINDOW 0x898 #define REG_POOLING_MODE_RECIPROCAL 0x890 @@ -248,8 +248,8 @@ int ComputeBasicConv(const struct ConvArgs &args) { // DLOG << " activation_type:" << active_args.activation_type // << " leaky_relu_negative_slope:" // << 
active_args.leaky_relu_negative_slope; - // DLOG << " reg_ActivationArgs:" << reg_ActivationArgs; - + DLOG << " reg_ActivationArgs:"; + uint64_t bypass_interrupt = reg_readq(REG_INTERRUPT); pthread_mutex_lock(&g_fpgainfo.pe_data->mutex); if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_CONV]->status) { ret = -EIO; @@ -257,6 +257,10 @@ int ComputeBasicConv(const struct ConvArgs &args) { pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex); return ret; } + // reg_writeq(reg_ActivationArgs, + // REG_ACTIVATION_MODE_AND_LEAKY_RELU_FACTOR); // active functoion + + reg_writeq(output_scale, REG_SCALE_PARAMETER); // new reg_writeq((args.driver.row_padding_down << 45) | (args.driver.row_padding_up << 34) | @@ -270,10 +274,10 @@ int ComputeBasicConv(const struct ConvArgs &args) { args.driver.filter_pad_width_mul_channel, REG_CONV_REG1); - reg_writeq((args.driver.stride_h << 48) | (args.driver.skip_window << 28) | - (args.driver.filter_row << 8) | - (args.driver.filter_height << 4) | args.driver.filter_width, - REG_CONV_REG2); + reg_writeq((args.driver.stride_h << 50) | (args.driver.skip_window << 30) | + (args.driver.filter_row << 10) | + (args.driver.filter_height << 5) | args.driver.filter_width, + REG_CONV_REG2); reg_writeq((args.driver.filter_num << 42) | (args.driver.filter_align << 26) | (args.driver.prog_full_cnt << 16) | @@ -358,7 +362,6 @@ int ComputeFpgaPool(const struct PoolingArgs &args) { << " out_scale_address:" << args.output.scale_address; #endif #ifdef PADDLE_MOBILE_ZU5 - DLOG << "Polling"; // return 0; uint64_t output_scale = 0; uint64_t timer_cnt = 0; @@ -366,66 +369,74 @@ int ComputeFpgaPool(const struct PoolingArgs &args) { uint64_t cmd = 0; uint64_t image_physical_address = 0; uint64_t output_physical_address = 0; - - // uint64_t reg_ActivationArgs = 0; - // active function:{none,leakeyrelu,sigmoid,tanh} - // ActivationArgs active_args; - // active_args.activation_type = LEAKYRELU; - // active_args.activation_type = args.output.activation.activation_type; - - // active_args.leaky_relu_negative_slope = - // args.output.activation.leaky_relu_negative_slope; - - // reg_ActivationArgs = (uint64_t(active_args.activation_type) << 32) | - // active_args.leaky_relu_negative_slope; - - // DLOG << " activation_type:" << active_args.activation_type - // << " leaky_relu_negative_slope:" - // << active_args.leaky_relu_negative_slope; - // DLOG << " reg_ActivationArgs:" << reg_ActivationArgs; - - image_physical_address = vaddr_to_paddr_driver(args.image.address); - output_physical_address = vaddr_to_paddr_driver(args.output.address); - uint32_t output_height = (uint32_t)( +uint64_t bypass_interrupt = reg_readq(REG_INTERRUPT); + image_physical_address = vaddr_to_paddr(args.image.address); + output_physical_address = vaddr_to_paddr(args.output.address); + uint64_t C_paral_64 = align_to_x((uint64_t)args.image.channels, 64); + uint64_t C_align_32 = align_to_x((uint64_t)args.image.channels, 32); + uint64_t output_height = (uint64_t)( (args.image.height + args.image.pad_height * 2 - args.kernel.height) / - args.kernel.stride_h + - 1); - uint32_t output_width = (uint32_t)( + args.kernel.stride_h + 1); + uint64_t output_width = (uint64_t)( (args.image.width + args.image.pad_width * 2 - args.kernel.width) / - args.kernel.stride_w + - 1); + args.kernel.stride_w + 1); + uint64_t image_amount_per_row = align_to_x((uint64_t)args.image.width * (uint64_t)args.image.channels, IMAGE_ALIGNMENT); - uint64_t image_one_pad_per_row = - align_to_x((uint64_t)args.image.width * (uint64_t)args.image.channels, - 
FILTER_ELEMENT_ALIGNMENT) + - (uint64_t)args.image.pad_width * (uint64_t)args.image.channels; - uint64_t image_two_pad_per_row = align_to_x( - ((uint64_t)args.image.width + (uint64_t)args.image.pad_width * 2) * - (uint64_t)args.image.channels, - IMAGE_ALIGNMENT); - uint64_t image_row_mul_pooling_hight = - image_amount_per_row * (uint64_t)args.kernel.height; - uint64_t image_row_mul_pad_hight = - image_amount_per_row * (uint64_t)args.image.pad_height; - uint64_t image_row_mul_step_hight = - image_amount_per_row * (uint64_t)args.kernel.stride_h; - uint64_t result_amount_align_32 = - align_to_x((uint64_t)output_width * (uint64_t)args.image.channels, - FILTER_ELEMENT_ALIGNMENT); - uint64_t result_amount_align_64 = align_to_x( - (uint64_t)output_width * (uint64_t)args.image.channels, IMAGE_ALIGNMENT); - uint64_t image_calcu_height = - (uint64_t)args.kernel.height + - ((uint64_t)output_height - 1) * (uint64_t)args.kernel.stride_h; - uint64_t image_pad_left = args.image.channels * args.image.pad_width; - uint64_t image_skip_window = args.image.channels * args.kernel.stride_w; - uint64_t image_padleft_skipwindow = - (image_skip_window << 32) | image_pad_left; - uint64_t mode_reciprocal = (uint64_t)0 | ((uint64_t)args.mode) << 16 | - (((uint64_t)args.kernel_reciprocal)); - + uint64_t image_one_pad_per_row = (uint64_t)args.image.width * + (uint64_t)args.image.channels +(uint64_t)args.image.pad_width * + (uint64_t)args.image.channels; + + uint64_t result_amount_align_32 = align_to_x((uint64_t)output_width * + (uint64_t)args.image.channels, 32); + uint64_t result_addr_row = + (result_amount_align_32 << 32) | output_physical_address; + uint64_t row_padding_down = + (uint64_t)args.image.height + (uint64_t)args.image.pad_height; + uint64_t kernel_width_sub1 = + (uint64_t)args.kernel.width - 1; + uint64_t kernel_padding_step = row_padding_down | + ((uint64_t)args.image.pad_height << 16) | + ((uint64_t)args.kernel.stride_h << 24) | + ((uint64_t)kernel_width_sub1<<32) | + ((uint64_t)args.kernel.height << 40) | + ((uint64_t)(args.kernel.height-1) << 48); + uint64_t image_calcu_height = (uint64_t)args.kernel.height + + (output_height - 1) * (uint64_t)args.kernel.stride_h; + uint64_t result_size_calcu_height = (output_height - 1) | + ((output_width - 1) << 16) | (image_calcu_height << 32); + uint64_t col_padding_down = ((uint64_t)args.image.width + + (uint64_t)args.image.pad_width) * (uint64_t)args.image.channels; + + uint64_t image_row_col_padding_down = + image_amount_per_row | (col_padding_down << 32); + uint64_t image_rowXpadding_h = + image_amount_per_row * (uint64_t)args.image.pad_height; + uint64_t image_rowXstep_h = + image_amount_per_row * (uint64_t)args.kernel.stride_h; + uint64_t image_rowXpad_h_rowXstep_h = + image_rowXpadding_h | (image_rowXstep_h << 32); + uint64_t channelXpad_w = + (uint64_t)args.image.channels * (uint64_t)args.image.pad_width; + uint64_t channelXstep_w = + (uint64_t)args.image.channels * (uint64_t)args.kernel.stride_w; + uint64_t channelXpad_w_channelXstep_w = + channelXpad_w | (channelXstep_w << 32); + uint64_t filter_row_align = + C_align_32 * (uint64_t)args.kernel.width; + uint64_t sub_filter_amount_align = C_align_32 * + (uint64_t)args.kernel.width * (uint64_t)args.kernel.height; + uint64_t mult_factor = 0; + float average_reciprocal = args.kernel_reciprocal; + uint32_t* kernel_reciprocal; + kernel_reciprocal =(reinterpret_cast(&average_reciprocal)); + if (args.mode == 1) + mult_factor = (uint64_t)(*kernel_reciprocal) | + ((uint64_t)1 << 32) | ((uint64_t)1 << 40); + else + 
mult_factor = + (uint64_t)0x3f800000 | ((uint64_t)1 << 32) | ((uint64_t)1 << 40); pthread_mutex_lock(&g_fpgainfo.pe_data->mutex); if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_POOLING]->status) { ret = -EIO; @@ -433,41 +444,21 @@ int ComputeFpgaPool(const struct PoolingArgs &args) { pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex); return ret; } - - // reg_writeq(reg_ActivationArgs, - // REG_ACTIVATION_MODE_AND_LEAKY_RELU_FACTOR); // active functoion - - // reg_writeq(output_scale, REG_SCALE_PARAMETER); - reg_writeq(image_physical_address, REG_POOLING_IMAGE_BASE_ADDR); - reg_writeq(output_physical_address, REG_POOLING_RESULT_BASE_ADDR); - reg_writeq( - ((uint64_t)args.image.height) | (((uint64_t)args.image.width) << 32), - REG_POOLING_IMAGE_PIXEL); - reg_writeq( - ((uint64_t)args.kernel.height) | (((uint64_t)args.kernel.width) << 32), - REG_POOLING_WINDOW_SIZE); - reg_writeq(((uint64_t)output_height) | (((uint64_t)output_width) << 32), - REG_POOLING_RESULT_PIXEL); - reg_writeq(((uint64_t)args.image.pad_height) | - (((uint64_t)args.image.pad_width) << 32), - REG_POOLING_PAD_PIXEL); - reg_writeq(((uint64_t)args.kernel.stride_h) | - (((uint64_t)args.kernel.stride_w) << 32), - REG_POOLING_STEP_PIXEL); - reg_writeq((uint64_t)args.image.channels, REG_POOLING_CHANNEL_NUMBER); - reg_writeq(image_amount_per_row, REG_POOLING_IMAGE_AMOUNT_PER_ROW); - reg_writeq(image_one_pad_per_row, REG_POOLING_IMAGE_ONE_PAD_PER_ROW); - reg_writeq(image_two_pad_per_row, REG_POOLING_IMAGE_TWO_PAD_PER_ROW); - reg_writeq(image_row_mul_pooling_hight, - REG_POOLING_IMAGE_ROW_MUL_WINDOW_HEIGHT); - reg_writeq(image_row_mul_pad_hight, REG_POOLING_IMAGE_ROW_MUL_PAD_HEIGHT); - reg_writeq(image_row_mul_step_hight, REG_POOLING_IMAGE_ROW_MUL_STEP_HEIGHT); - reg_writeq(result_amount_align_32, REG_POOLING_RESULT_AMOUNT_ALIGN_32); - reg_writeq(result_amount_align_64, REG_POOLING_RESULT_AMOUNT_ALIGN_64); - reg_writeq(image_calcu_height, REG_POOLING_IMAGE_CALCU_HEIGHT); - reg_writeq(image_padleft_skipwindow, REG_POOLING_IMAGE_PADLEFT_SKIPWINDOW); - reg_writeq(mode_reciprocal, REG_POOLING_MODE_RECIPROCAL); - reg_writeq(cmd, REG_POOLING_CMD); + reg_writeq(output_scale, REG_SCALE_PARAMETER); + reg_writeq(image_physical_address, 0x808); + reg_writeq(result_addr_row, 0x810); + reg_writeq(kernel_padding_step, 0x818); + reg_writeq(result_size_calcu_height, 0x820); + reg_writeq((uint64_t)args.image.channels, 0x828); + reg_writeq(image_row_col_padding_down, 0x830); + reg_writeq(image_rowXpad_h_rowXstep_h, 0x838); + reg_writeq(mult_factor, 0x840); // dw donot care + reg_writeq(channelXpad_w_channelXstep_w, 0x848); + if (args.mode == 1) + cmd = (uint64_t)4; + else + cmd = (uint64_t)8; + reg_writeq(cmd, 0x800); DLOG << "before reg poll"; if (0 != fpga_regpoll(REG_INTERRUPT, INTERRUPT_POOLING, PE_IRQ_TIMEOUT)) { @@ -478,14 +469,6 @@ int ComputeFpgaPool(const struct PoolingArgs &args) { } DLOG << "after reg poll"; - // *(args.output.scale_address) = reg_readq(REG_SCALE_PARAMETER); - // output_scale = reg_readq(REG_SCALE_PARAMETER); - // output_scale = (output_scale << 32) | (output_scale >> 32); - // fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2); - - // active_args.activation_type = NONE; - // reg_writeq(reg_ActivationArgs, REG_ACTIVATION_MODE_AND_LEAKY_RELU_FACTOR); - pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex); return ret; @@ -518,19 +501,7 @@ int ComputeFpgaEWAdd(const struct EWAddArgs &args) { #endif #ifdef PADDLE_MOBILE_ZU5 int ret = 0; - uint64_t output_scale = 0; - - // uint64_t reg_ActivationArgs = 0; - // 
ActivationArgs active_args; - // active_args.activation_type = args.output.activation.activation_type; - // active_args.leaky_relu_negative_slope = - // args.output.activation.leaky_relu_negative_slope; - // reg_ActivationArgs = (uint64_t(active_args.activation_type) << 32) | - // active_args.leaky_relu_negative_slope; - // DLOG << " activation_type:" << active_args.activation_type - // << " leaky_relu_negative_slope:" - // << active_args.leaky_relu_negative_slope; - // DLOG << " reg_ActivationArgs:" << reg_ActivationArgs; +uint64_t bypass_interrupt = reg_readq(REG_INTERRUPT); pthread_mutex_lock(&g_fpgainfo.pe_data->mutex); if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_EW]->status) { @@ -540,18 +511,47 @@ int ComputeFpgaEWAdd(const struct EWAddArgs &args) { return ret; } - // reg_writeq(reg_ActivationArgs, - // REG_ACTIVATION_MODE_AND_LEAKY_RELU_FACTOR); // active functoion - reg_writeq(output_scale, REG_SCALE_PARAMETER); - reg_writeq(args.driver.image0_address_phy, REG_EW_IMAGE0_BASE_ADDR); - reg_writeq(args.driver.image1_address_phy, REG_EW_IMAGE1_BASE_ADDR); - reg_writeq(args.driver.datalen, REG_EW_DATA_LEN); - reg_writeq(args.driver.image_image_pixel, REG_EW_IMAGE_PIXEL); - reg_writeq(args.driver.image_amount_per_row, REG_EW_IMAGE_AMOUNT_PER_ROW); - reg_writeq(args.driver.output_address_phy, REG_EW_RESULT_BASE_ADDR); - reg_writeq(args.driver.coefficient, REG_EW_COEFFICIENT); - reg_writeq(args.driver.cmd, REG_EW_CMD); + uint64_t image0_physical_address = 0; + uint64_t image1_physical_address = 0; + uint64_t image_physical_address = 0; + uint64_t output_physical_address = 0; + image0_physical_address = vaddr_to_paddr(args.image0.address); + image1_physical_address = vaddr_to_paddr(args.image1.address); + image_physical_address = + image0_physical_address | (image1_physical_address << 32); + output_physical_address = vaddr_to_paddr(args.output.address); + uint64_t image_amount_per_row = + align_to_x((uint64_t)args.image0.width * + (uint64_t)args.image0.channels, IMAGE_ALIGNMENT); + uint64_t result_addr_row = + output_physical_address | (image_amount_per_row << 32); + uint64_t kernel_padding_step = 0; + kernel_padding_step = ((uint64_t)args.image0.height * 2) | + ((uint64_t)2 << 24) | ((uint64_t)2 << 40) | ((uint64_t)1 << 48); + uint64_t result_size_calcu_height = ((uint64_t)args.image0.height - 1) | + ((image_amount_per_row / 32 - 1) << 16) | + (((uint64_t)args.image0.height * 2) << 32); + uint64_t image_row_col_padding_down = image_amount_per_row | + (image_amount_per_row << 32); + float quantParam = (args.output.scale_address)[0]; + uint32_t* ew_scale = reinterpret_cast(&quantParam); + uint64_t ew_scale_mult_factor = (*ew_scale) | + ((uint64_t)args.const0 << 32) | ((uint64_t)args.const1 << 40); + reg_writeq(0ul, REG_SCALE_PARAMETER); + reg_writeq(image_physical_address, 0x808); + reg_writeq(result_addr_row, 0x810); + reg_writeq(kernel_padding_step, 0x818); + reg_writeq(result_size_calcu_height, 0x820); + reg_writeq(32, 0x828); + reg_writeq(image_row_col_padding_down, 0x830); + reg_writeq(((image_amount_per_row*2) << 32), 0x838); + reg_writeq(ew_scale_mult_factor, 0x840); // dw donot care + reg_writeq(((uint64_t)32 << 32), 0x848); + reg_writeq(0, 0x858); + uint64_t cmd = 0; + cmd = (uint64_t)2 | (((uint64_t)args.relu_enabled) << 8); + reg_writeq(cmd, 0x800); if (0 != fpga_regpoll(REG_INTERRUPT, INTERRUPT_POOLING, PE_IRQ_TIMEOUT)) { g_fpgainfo.pe_data->pes[PE_IDX_EW]->status = ERROR; @@ -560,12 +560,6 @@ int ComputeFpgaEWAdd(const struct EWAddArgs &args) { PADDLE_MOBILE_ENFORCE(0, "EW Wait 
Irq Timeout!"); } - // output_scale = reg_readq(REG_SCALE_PARAMETER); - // output_scale = (output_scale << 32) | (output_scale >> 32); - // fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2); - // active_args.activation_type = NONE; - // reg_writeq(reg_ActivationArgs, REG_ACTIVATION_MODE_AND_LEAKY_RELU_FACTOR); - pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex); return ret; #endif @@ -870,7 +864,7 @@ int ComputeFpgaDeconv(const struct DeconvArgs &args) { #endif } - if (sub_conv_num > 1) { + /*if (sub_conv_num > 1) { float max_scale = -1.0f; #ifdef COST_TIME_PRINT gettimeofday(&start, NULL); @@ -894,19 +888,7 @@ int ComputeFpgaDeconv(const struct DeconvArgs &args) { << " cost time: " << (dif_sec * 1000000 + dif_usec) << "us" << std::endl; #endif - - // fpga_flush(args.output.scale_address, 2 * sizeof(float)); - /*#ifdef COST_TIME_PRINT - gettimeofday(&start,NULL); - #endif - //deconv_post_process(args); - #ifdef COST_TIME_PRINT - gettimeofday(&end,NULL); - dif_sec = end.tv_sec - start.tv_sec; - dif_usec = end.tv_usec - start.tv_usec; - std::cout << "deconv_post_process " << " cost time: " << - (dif_sec*1000000+dif_usec) << "us" << std::endl; #endif*/ - } + }*/ return 0; } // ComputeFpgaDeconv @@ -940,8 +922,8 @@ int ComputeDWConv(const struct DWconvArgs &args) { << " image_width:" << args.image.width << " pad_height:" << args.image.pad_height << " pad_width:" << args.image.pad_width; - DLOG << " filter_address:" << args.filter_address - << " bias_address:" << args.bias_address; + DLOG << " filter_address:" << args.filter_address; + //<< " bias_address:" << args.bias_address; DLOG << " kernel_height:" << args.kernel.height << " kernel_width:" << args.kernel.width << " stride_h:" << args.kernel.stride_h @@ -951,11 +933,10 @@ int ComputeDWConv(const struct DWconvArgs &args) { #endif #ifdef PADDLE_MOBILE_ZU5 DLOG << "DWConv"; + uint64_t bypass_interrupt = reg_readq(REG_INTERRUPT); // return 0; - uint64_t output_scale = 0; uint64_t timer_cnt = 0; int ret = 0; - // uint64_t cmd = args.relu_enabled; uint64_t cmd = 0; uint64_t image_physical_address = 0; uint64_t output_physical_address = 0; @@ -966,57 +947,69 @@ int ComputeDWConv(const struct DWconvArgs &args) { output_physical_address = vaddr_to_paddr(args.output.address); filter_physical_address = vaddr_to_paddr(args.filter_address); bias_physical_address = vaddr_to_paddr(args.bias_address); - uint64_t filter_N_align = - align_to_x((uint64_t)args.image.channels, IMAGE_ALIGNMENT); - uint64_t filter_amount_per_row_align = - filter_N_align * (uint64_t)args.kernel.width; - uint64_t sub_filter_amount_align = filter_N_align * - (uint64_t)args.kernel.width * - (uint64_t)args.kernel.height; - uint64_t filter_amount_align = - sub_filter_amount_align * (uint64_t)args.sub_conv_num; - - uint32_t output_height = (uint32_t)( - (args.image.height + args.image.pad_height * 2 - args.kernel.height) / - args.kernel.stride_h + - 1); - uint32_t output_width = (uint32_t)( - ((args.image.width + args.image.pad_width * 2 - args.kernel.width) / - args.kernel.stride_w + - 1) * - args.sub_conv_num); + uint64_t C_align_64 = align_to_x((uint64_t)args.image.channels, 64); + uint64_t C_align_32 = align_to_x((uint64_t)args.image.channels, 32); + uint64_t output_height = (uint64_t) + ((args.image.height + args.image.pad_height * 2 - + args.kernel.height) / args.kernel.stride_h +1); + uint64_t output_width = (uint64_t) + (((args.image.width + args.image.pad_width * 2 - args.kernel.width) / + args.kernel.stride_w + 1) * args.sub_conv_num); uint64_t 
image_amount_per_row = - align_to_x((uint64_t)args.image.width * (uint64_t)args.image.channels, - IMAGE_ALIGNMENT); + align_to_x((uint64_t)args.image.width * + (uint64_t)args.image.channels, IMAGE_ALIGNMENT); uint64_t image_one_pad_per_row = - align_to_x((uint64_t)args.image.width * (uint64_t)args.image.channels, - FILTER_ELEMENT_ALIGNMENT) + - (uint64_t)args.image.pad_width * (uint64_t)args.image.channels; - uint64_t image_two_pad_per_row = align_to_x( - ((uint64_t)args.image.width + (uint64_t)args.image.pad_width * 2) * - (uint64_t)args.image.channels, - IMAGE_ALIGNMENT); - uint64_t image_row_mul_pooling_hight = - image_amount_per_row * (uint64_t)args.kernel.height; - uint64_t image_row_mul_pad_hight = - image_amount_per_row * (uint64_t)args.image.pad_height; - uint64_t image_row_mul_step_hight = - image_amount_per_row * (uint64_t)args.kernel.stride_h; - uint64_t result_amount_align_32 = - align_to_x((uint64_t)output_width * (uint64_t)args.image.channels, - FILTER_ELEMENT_ALIGNMENT); - uint64_t result_amount_align_64 = align_to_x( - (uint64_t)output_width * (uint64_t)args.image.channels, IMAGE_ALIGNMENT); - uint64_t image_calcu_height = - (uint64_t)args.kernel.height + - ((uint64_t)output_height - 1) * (uint64_t)args.kernel.stride_h; - uint64_t image_pad_left = args.image.channels * args.image.pad_width; - uint64_t image_skip_window = args.image.channels * args.kernel.stride_w; - - uint64_t image_padleft_skipwindow = - (image_skip_window << 32) | image_pad_left; - + (uint64_t)args.image.width * (uint64_t)args.image.channels + + (uint64_t)args.image.pad_width * (uint64_t)args.image.channels; + + uint64_t result_amount_align_32 = align_to_x( + (uint64_t)output_width * (uint64_t)args.image.channels, 32); + uint64_t result_addr_row = + (result_amount_align_32 << 32) | output_physical_address; + uint64_t row_padding_down = + (uint64_t)args.image.height + (uint64_t)args.image.pad_height; + uint64_t kernel_width_sub1 = (uint64_t)args.kernel.width - 1; + uint64_t kernel_padding_step = row_padding_down | + ((uint64_t)args.image.pad_height << 16) | + ((uint64_t)args.kernel.stride_h << 24) | + ((uint64_t)kernel_width_sub1<<32) | + ((uint64_t)args.kernel.height << 40) | + ((uint64_t)(args.kernel.height-1) << 48); + uint64_t image_calcu_height = (uint64_t)args.kernel.height + + (output_height - 1) * (uint64_t)args.kernel.stride_h; + uint64_t result_size_calcu_height = (output_height - 1) | + ((output_width - 1) << 16) | (image_calcu_height << 32); + uint64_t col_padding_down = ((uint64_t)args.image.width + + (uint64_t)args.image.pad_width) * (uint64_t)args.image.channels; + + uint64_t image_row_col_padding_down = + image_amount_per_row | (col_padding_down << 32); + uint64_t image_rowXpadding_h = + image_amount_per_row * (uint64_t)args.image.pad_height; + uint64_t image_rowXstep_h = + image_amount_per_row * (uint64_t)args.kernel.stride_h; + uint64_t image_rowXpad_h_rowXstep_h = + image_rowXpadding_h | (image_rowXstep_h << 32); + uint64_t channelXpad_w = + (uint64_t)args.image.channels * (uint64_t)args.image.pad_width; + uint64_t channelXstep_w = + (uint64_t)args.image.channels * (uint64_t)args.kernel.stride_w; + uint64_t channelXpad_w_channelXstep_w = + channelXpad_w | (channelXstep_w << 32); + + uint64_t filter_row_align = + C_align_64 * (uint64_t)args.kernel.width; + uint64_t sub_filter_amount_align = C_align_64 * + (uint64_t)args.kernel.width * + (uint64_t)args.kernel.height; + uint64_t filter_amount_align = + sub_filter_amount_align * (uint64_t)args.sub_conv_num; + uint64_t filter_param = 
filter_row_align | (filter_amount_align << 16) | + (sub_filter_amount_align << 32) | + (((uint64_t)args.sub_conv_num -1) << 48); + uint64_t channel_parameter = + (uint64_t)args.image.channels | (C_align_64 << 16); pthread_mutex_lock(&g_fpgainfo.pe_data->mutex); if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_POOLING]->status) { ret = -EIO; @@ -1024,73 +1017,31 @@ int ComputeDWConv(const struct DWconvArgs &args) { pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex); return ret; } - - /*restart scale*/ - reg_writeq(output_scale, REG_SCALE_PARAMETER); - - reg_writeq(image_physical_address, REG_POOLING_IMAGE_BASE_ADDR); - reg_writeq(output_physical_address, REG_POOLING_RESULT_BASE_ADDR); - reg_writeq((bias_physical_address << 32 | filter_physical_address), - REG_DWCONV_FILTER_BASE_ADDR); - reg_writeq(filter_amount_per_row_align | (filter_amount_align << 32), - REG_DWCONV_FILTER_SHAPE); - reg_writeq(sub_filter_amount_align | (((uint64_t)args.sub_conv_num) << 32), - REG_DWCONV_FILTER_SUBNUMBER); - reg_writeq(filter_N_align, REG_DWCONV_FILTER_N_ALIGN); - - reg_writeq( - ((uint64_t)args.image.height) | (((uint64_t)args.image.width) << 32), - REG_POOLING_IMAGE_PIXEL); - reg_writeq( - ((uint64_t)args.kernel.height) | (((uint64_t)args.kernel.width) << 32), - REG_POOLING_WINDOW_SIZE); - - reg_writeq(((uint64_t)output_height) | (((uint64_t)output_width) << 32), - REG_POOLING_RESULT_PIXEL); - - reg_writeq(((uint64_t)args.image.pad_height) | - (((uint64_t)args.image.pad_width) << 32), - REG_POOLING_PAD_PIXEL); - reg_writeq(((uint64_t)args.kernel.stride_h) | - (((uint64_t)args.kernel.stride_w) << 32), - REG_POOLING_STEP_PIXEL); - - reg_writeq((uint64_t)args.image.channels, REG_POOLING_CHANNEL_NUMBER); - - reg_writeq(image_amount_per_row, REG_POOLING_IMAGE_AMOUNT_PER_ROW); - reg_writeq(image_one_pad_per_row, REG_POOLING_IMAGE_ONE_PAD_PER_ROW); - reg_writeq(image_two_pad_per_row, REG_POOLING_IMAGE_TWO_PAD_PER_ROW); - - reg_writeq(image_row_mul_pooling_hight, - REG_POOLING_IMAGE_ROW_MUL_WINDOW_HEIGHT); - reg_writeq(image_row_mul_pad_hight, REG_POOLING_IMAGE_ROW_MUL_PAD_HEIGHT); - reg_writeq(image_row_mul_step_hight, REG_POOLING_IMAGE_ROW_MUL_STEP_HEIGHT); - - reg_writeq(result_amount_align_32, REG_POOLING_RESULT_AMOUNT_ALIGN_32); - reg_writeq(result_amount_align_64, REG_POOLING_RESULT_AMOUNT_ALIGN_64); - - reg_writeq(image_calcu_height, REG_POOLING_IMAGE_CALCU_HEIGHT); - - reg_writeq(image_padleft_skipwindow, REG_POOLING_IMAGE_PADLEFT_SKIPWINDOW); - - /*SDK刷Cache保证数据一致性*/ - - reg_writeq(cmd, REG_DWCONV_CMD); + reg_writeq(0ul, REG_SCALE_PARAMETER); + reg_writeq(image_physical_address, 0x808); + reg_writeq(result_addr_row, 0x810); + reg_writeq(kernel_padding_step, 0x818); + reg_writeq(result_size_calcu_height, 0x820); + reg_writeq(channel_parameter, 0x828); + reg_writeq(image_row_col_padding_down, 0x830); + reg_writeq(image_rowXpad_h_rowXstep_h, 0x838); + reg_writeq(0, 0x840); + reg_writeq(channelXpad_w_channelXstep_w, 0x848); + reg_writeq(filter_physical_address, 0x850); + reg_writeq(filter_param, 0x858); + reg_writeq(((bias_physical_address+C_align_64*4) | + (bias_physical_address << 32)), 0x860); + cmd = (uint64_t)1 | (((uint64_t)args.relu_enabled) << 8); + reg_writeq(cmd, 0x800); DLOG << "before reg poll"; if (0 != fpga_regpoll(REG_INTERRUPT, INTERRUPT_POOLING, PE_IRQ_TIMEOUT)) { g_fpgainfo.pe_data->pes[PE_IDX_POOLING]->status = ERROR; ret = -EIO; - DLOG << "Pooling Wait Irq Timeout!"; + DLOG << "DWconv Wait Irq Timeout!"; PADDLE_MOBILE_ENFORCE(0, "DWConv Wait Irq Timeout"); } DLOG << "after reg poll"; - - // 
*(args.output.scale_address) = reg_readq(REG_SCALE_PARAMETER); - output_scale = reg_readq(REG_SCALE_PARAMETER); - output_scale = (output_scale << 32) | (output_scale >> 32); - fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2); - DLOG << "output_scale:" << output_scale; pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex); return ret; #endif diff --git a/mobile/src/fpga/common/driver.cpp b/mobile/src/fpga/common/driver.cpp old mode 100644 new mode 100755 index 911704965aac3b6897b70dc60cb23fb4f3e59979..b7ce4d32474465988f0e2c02763d21bfdf9a7530 --- a/mobile/src/fpga/common/driver.cpp +++ b/mobile/src/fpga/common/driver.cpp @@ -134,9 +134,9 @@ int fpga_regpoll(uint64_t reg, uint64_t val, int time) { uint64_t i = 0; /*timeout精确性待确认*/ int64_t timeout = time * 6; - usleep(1); for (i = 0; i < timeout; i++) { + usleep(1); if (val == reg_readq(reg)) { break; } diff --git a/mobile/src/fpga/common/fpga_common.h b/mobile/src/fpga/common/fpga_common.h old mode 100644 new mode 100755 index a798d54459b86f67a28c158dc30c82131ea48626..a767cd2606bb351b42f8d2a6bc944c66a2fa39a7 --- a/mobile/src/fpga/common/fpga_common.h +++ b/mobile/src/fpga/common/fpga_common.h @@ -211,6 +211,7 @@ struct ConcatArgs { uint32_t out_channel; uint32_t height; uint32_t width; + std::vector> vector_concat_space; }; struct SplitConvArgs { diff --git a/mobile/src/operators/kernel/fpga/V2/anchor_generator_kernel.cpp b/mobile/src/operators/kernel/fpga/V2/anchor_generator_kernel.cpp old mode 100644 new mode 100755 index 951fbb5f3708bf511bfcbbb0669fb7a56a4eb7c4..56cc8927f035b16963b639bc960b20532b931f44 --- a/mobile/src/operators/kernel/fpga/V2/anchor_generator_kernel.cpp +++ b/mobile/src/operators/kernel/fpga/V2/anchor_generator_kernel.cpp @@ -37,7 +37,7 @@ bool AnchorGeneratorKernel::Init( int anchors_offset[] = {-2, -2, 18, 18, -10, -9, 26, 25, -23, -20, 39, 36, -43, -34, 59, 49, -63, -54, 79, 69, -96, -77, 112, 93, -137, -118, 153, - 134, -204, -188, 220, 204, -281, -395, 296, 441}; + 134, -204, -188, 220, 204, -281, -395, 296, 411}; int anchors_offset2[] = {0, 0, 51, 77, 0, 0, 30, 35, 0, 0, 81, 103, 0, 0, 20, 21, 0, 0, 36, 44, 0, 0, 43, 58, diff --git a/mobile/src/operators/kernel/fpga/V2/concat_kernel.cpp b/mobile/src/operators/kernel/fpga/V2/concat_kernel.cpp old mode 100644 new mode 100755 index 716531fcab47252c86486d2cb1f325ca97423935..8442eef8b2314d5035d673c12dd87590cfb8064d --- a/mobile/src/operators/kernel/fpga/V2/concat_kernel.cpp +++ b/mobile/src/operators/kernel/fpga/V2/concat_kernel.cpp @@ -53,6 +53,15 @@ bool ConcatKernel::Init(ConcatParam *param) { concatArgs.channel_num = channel_num; concatArgs.height = height; concatArgs.width = width; + + auto deleter = [](void *p) { fpga::fpga_free(p); }; + concatArgs.vector_concat_space.push_back(std::shared_ptr( + reinterpret_cast(concatArgs.images_in), deleter)); + concatArgs.vector_concat_space.push_back(std::shared_ptr( + reinterpret_cast(concatArgs.scales_in), deleter)); + concatArgs.vector_concat_space.push_back(std::shared_ptr( + reinterpret_cast(concatArgs.channel_num), deleter)); + param->SetFpgaArgs(concatArgs); return true; } diff --git a/mobile/src/operators/kernel/fpga/V2/elementwise_add_kernel.cpp b/mobile/src/operators/kernel/fpga/V2/elementwise_add_kernel.cpp old mode 100644 new mode 100755 index 43b9355c99be4a22781cac10309a24c7dd3ac76c..57ccf9f00d9e4ab04bbed16af8b02e4aaa537847 --- a/mobile/src/operators/kernel/fpga/V2/elementwise_add_kernel.cpp +++ b/mobile/src/operators/kernel/fpga/V2/elementwise_add_kernel.cpp @@ -12,12 +12,9 @@ WITHOUT WARRANTIES 
OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #ifdef ELEMENTWISEADD_OP - +#include #include "operators/kernel/elementwise_add_kernel.h" -#include -#include "fpga/V2/api.h" - namespace paddle_mobile { namespace operators { @@ -60,10 +57,36 @@ bool ElementwiseAddKernel::Init(ElementwiseAddParam *param) { return true; } +void ComputeCPUEWAdd(fpga::EWAddArgs ewaddArgs) { + int inputc = ewaddArgs.image0.channels; + int inputh = ewaddArgs.image0.height; + int inputw = ewaddArgs.image0.width; + float inScale0 = + (reinterpret_cast(ewaddArgs.image0.scale_address))[0]; + float inScale1 = + (reinterpret_cast(ewaddArgs.image1.scale_address))[0]; + float outScale = + (reinterpret_cast(ewaddArgs.output.scale_address))[0]; + int8_t* inPtr0 = reinterpret_cast(ewaddArgs.image0.address); + int8_t* inPtr1 = reinterpret_cast(ewaddArgs.image1.address); + int8_t* outPtr = reinterpret_cast(ewaddArgs.output.address); + int datasize = inputc * inputh * inputw; + float const0 = inScale0 / outScale; + float const1 = inScale1 / outScale; + fpga::fpga_invalidate(inPtr0, datasize * sizeof(int8_t)); + fpga::fpga_invalidate(inPtr1, datasize * sizeof(int8_t)); + for (int i = 0; i < datasize; i++) { + float tmpF = inPtr0[i] * const0 + inPtr1[i] * const1; + int tmpI = static_cast(round(tmpF)); + outPtr[i] = (int8_t)((tmpI > 127 ? 127 : (tmpI < -127 ? -127 : tmpI))); + } + fpga::fpga_flush(outPtr, datasize * sizeof(int8_t)); +} template <> void ElementwiseAddKernel::Compute( const ElementwiseAddParam ¶m) { - fpga::ComputeFpgaEWAdd(param.FpgaArgs()); + // fpga::ComputeFpgaEWAdd(param.FpgaArgs()); + ComputeCPUEWAdd(param.FpgaArgs()); } } // namespace operators } // namespace paddle_mobile diff --git a/mobile/src/operators/kernel/fpga/V2/elementwise_add_relu_kernel.cpp b/mobile/src/operators/kernel/fpga/V2/elementwise_add_relu_kernel.cpp old mode 100644 new mode 100755 index 6d5ad505732f58cfc9a50f8627a07956cd96d45c..de603418742da5b9672259a1bb414567853a8cb5 --- a/mobile/src/operators/kernel/fpga/V2/elementwise_add_relu_kernel.cpp +++ b/mobile/src/operators/kernel/fpga/V2/elementwise_add_relu_kernel.cpp @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #ifdef FUSION_ELEMENTWISEADDRELU_OP - +#include #include "operators/kernel/elementwise_add_relu_kernel.h" namespace paddle_mobile { @@ -58,10 +58,37 @@ bool ElementwiseAddReluKernel::Init( return true; } +void ComputeCPUEWAddRelu(fpga::EWAddArgs ewaddArgs) { + int inputc = ewaddArgs.image0.channels; + int inputh = ewaddArgs.image0.height; + int inputw = ewaddArgs.image0.width; + float inScale0 = + (reinterpret_cast(ewaddArgs.image0.scale_address))[0]; + float inScale1 = + (reinterpret_cast(ewaddArgs.image1.scale_address))[0]; + float outScale = + (reinterpret_cast(ewaddArgs.output.scale_address))[0]; + int8_t* inPtr0 = reinterpret_cast(ewaddArgs.image0.address); + int8_t* inPtr1 = reinterpret_cast(ewaddArgs.image1.address); + int8_t* outPtr = reinterpret_cast(ewaddArgs.output.address); + int datasize = inputc * inputh * inputw; + float const0 = inScale0 / outScale; + float const1 = inScale1 / outScale; + fpga::fpga_invalidate(inPtr0, datasize * sizeof(int8_t)); + fpga::fpga_invalidate(inPtr1, datasize * sizeof(int8_t)); + for (int i = 0; i < datasize; i++) { + float tmpF = inPtr0[i] * const0 + inPtr1[i] * const1; + int tmpI = static_cast(round(tmpF)); + outPtr[i] = (int8_t)((tmpI > 127 ? 127 : (tmpI < 0 ? 0 : tmpI))); + } + fpga::fpga_flush(outPtr, datasize * sizeof(int8_t)); +} + template <> void ElementwiseAddReluKernel::Compute( const ElementwiseAddReluParam ¶m) { - fpga::ComputeFpgaEWAdd(param.FpgaArgs()); + // fpga::ComputeFpgaEWAdd(param.FpgaArgs()); + ComputeCPUEWAddRelu(param.FpgaArgs()); } } // namespace operators } // namespace paddle_mobile diff --git a/mobile/src/operators/kernel/fpga/V2/reshape2_kernel.cpp b/mobile/src/operators/kernel/fpga/V2/reshape2_kernel.cpp old mode 100644 new mode 100755 index fcf0889b4a66919efc677e211a1da453fd761de4..c7cd6575e4010f7ba9aa12882a8968cd558049b9 --- a/mobile/src/operators/kernel/fpga/V2/reshape2_kernel.cpp +++ b/mobile/src/operators/kernel/fpga/V2/reshape2_kernel.cpp @@ -110,7 +110,27 @@ void Reshape2Kernel::Compute(const Reshape2Param ¶m) { } } output->Resize(framework::make_ddim(shape)); + + bool reshapeNeedFlg = 1; if (output->dims() == input->dims()) { + reshapeNeedFlg = 0; + } else if (output->dims().size() != input->dims().size()) { + auto inputdimsize = input->dims().size(); + auto outputdimsize = output->dims().size(); + int smallersize = + inputdimsize > outputdimsize ? 
outputdimsize : inputdimsize; + int i = 0; + for (i = 0; i < smallersize; i++) { + if ((input->dims())[i] != (output->dims())[i]) + break; + } + if (i == smallersize) { + reshapeNeedFlg = 0; + } + } + if (reshapeNeedFlg) { + reshape(input, output); + } else { DLOG << "No need to reshape"; output->ShareDataWith(*input); framework::LoD lod = input->lod(); @@ -118,9 +138,6 @@ void Reshape2Kernel::Compute(const Reshape2Param ¶m) { output->scale[0] = input->scale[0]; return; } - - reshape(input, output); - // } } // namespace operators diff --git a/mobile/src/operators/kernel/fpga/V2/sigmoid_kernel.cpp b/mobile/src/operators/kernel/fpga/V2/sigmoid_kernel.cpp index 194fd5a30565b866ca702b296981d0b8302a1c16..44aae4be321db6797d3450cec7c2f159b5e5124b 100644 --- a/mobile/src/operators/kernel/fpga/V2/sigmoid_kernel.cpp +++ b/mobile/src/operators/kernel/fpga/V2/sigmoid_kernel.cpp @@ -48,7 +48,7 @@ bool SigmoidKernel::Init(SigmoidParam *param) { template <> void SigmoidKernel::Compute(const SigmoidParam ¶m) { fpga::PerformBypass(param.FpgaArgs()); - param.Out()->scale[0] = 127.0; + param.Out()->scale[0] = 1.0; } } // namespace operators diff --git a/mobile/src/operators/kernel/fpga/V2/slice_kernel.cpp b/mobile/src/operators/kernel/fpga/V2/slice_kernel.cpp old mode 100644 new mode 100755 index a1500ecdb0246d4c7235de490437945ec381d5a4..d32dddb3072b9b7181da4d871fe6cea37db5de04 --- a/mobile/src/operators/kernel/fpga/V2/slice_kernel.cpp +++ b/mobile/src/operators/kernel/fpga/V2/slice_kernel.cpp @@ -30,6 +30,7 @@ bool SliceKernel::Init(SliceParam* param) { } return true; } + template <> void SliceKernel::Compute(const SliceParam& param) { // Only support slicing in channel dimension @@ -38,6 +39,8 @@ void SliceKernel::Compute(const SliceParam& param) { auto input = param.input_; auto output = param.output_; + int H = input->dims()[2]; + int W = input->dims()[3]; int HW = input->dims()[2] * input->dims()[3]; int channel = input->dims()[1]; auto input_ptr = input->data(); @@ -53,10 +56,32 @@ void SliceKernel::Compute(const SliceParam& param) { end = end > channel ? 
channel : end; int len = end - start; size_t size = len * sizeof(int8_t); + DLOG << input->fpga_data_num; + fpga::fpga_invalidate(input_ptr, input->fpga_data_num*sizeof(int8_t)); + DLOG << output->fpga_data_num; + fpga::fpga_invalidate(output_ptr, output->fpga_data_num*sizeof(int8_t)); + int unalignedWC = len * W; + int alignedWC = fpga::align_to_x(W * len, IMAGE_ALIGNMENT); - for (int i = 0; i < HW; i++) { - memcpy(output_ptr + len * i, input_ptr + i * channel + start, size); + if (unalignedWC != alignedWC) { + auto tmpOutput = reinterpret_cast + (fpga::fpga_malloc(len*HW * sizeof(int8_t))); + for (int i = 0; i < HW; i++) { + memcpy(tmpOutput + len * i, input_ptr + i * channel + start, size); + } + for (int i = 0; i < H; i++) { + for (int j = 0; j < unalignedWC; j++) { + *(output_ptr + alignedWC * i + j) = + *(tmpOutput + unalignedWC * i + j); + } + } + fpga::fpga_free(tmpOutput); + } else { + for (int i = 0; i < HW; i++) { + memcpy(output_ptr + len * i, input_ptr + i * channel + start, size); + } } + fpga::fpga_flush(output_ptr, output->fpga_data_num*sizeof(int8_t)); } } // namespace operators } // namespace paddle_mobile diff --git a/mobile/src/operators/math/depthwise_conv3x3.cpp b/mobile/src/operators/math/depthwise_conv3x3.cpp index 11fce286051dbaa158ae9db917452c4987122f32..4f8b7a7b3000a9130dac1c755a3beb16e7c98c59 100644 --- a/mobile/src/operators/math/depthwise_conv3x3.cpp +++ b/mobile/src/operators/math/depthwise_conv3x3.cpp @@ -150,7 +150,8 @@ void DepthwiseConv3x3S1(const framework::Tensor &input, const int out_image_size = output_h * output_w; const int valid_h_start = padding_h; const int valid_h_end = output_h - valid_h_start; - const int valid_h = valid_h_end - valid_h_start; + const int valid_h = + valid_h_end - valid_h_start > 0 ? valid_h_end - valid_h_start : 0; const int valid_w_start = padding_w; const int valid_w_end = output_w - valid_w_start; const int valid_w = valid_w_end - valid_w_start; @@ -631,7 +632,7 @@ void DepthwiseConv3x3S1(const framework::Tensor &input, } } // pad bottom - for (int h = valid_h_end; h < output_h; ++h) { + for (int h = valid_h_end; (h < output_h) && (h > valid_h_start - 1); ++h) { DepthwiseConv3x3NormalRow<1, 1>(input_ptr, filter_ptr, h, input_h, input_w, padding_h, padding_w, output_w, output_ptr, _ker); @@ -659,7 +660,8 @@ void DepthwiseConv3x3S2(const framework::Tensor &input, const int valid_h_start = (padding_h + 1) / 2; const int valid_h_end = std::max((input_h + padding_h - 1) / 2, valid_h_start); - const int valid_h = valid_h_end - valid_h_start; + const int valid_h = + valid_h_end - valid_h_start > 0 ? valid_h_end - valid_h_start : 0; const int valid_w_start = (padding_w + 1) / 2; const int valid_w_end = std::max((input_w + padding_w - 1) / 2, valid_w_start); @@ -1045,7 +1047,7 @@ void DepthwiseConv3x3S2(const framework::Tensor &input, } } // pad bottom - for (int h = valid_h_end; h < output_h; ++h) { + for (int h = valid_h_end; (h < output_h) && (h > valid_h_start - 1); ++h) { DepthwiseConv3x3NormalRow<2, 2>(input_ptr, filter_ptr, h, input_h, input_w, padding_h, padding_w, output_w, output_ptr, _ker);
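
The final hunks clamp the depthwise-conv "valid" row bookkeeping so that over-sized paddings can no longer yield a negative `valid_h` or let the pad-bottom loop begin before the valid region. Below is a minimal standalone sketch of that clamping logic under the same stride-1 assumptions; the helper names and the `main()` driver are illustrative only and are not part of the patch.

    // Sketch of the clamped valid-row range used by the DepthwiseConv3x3S1 fix.
    // Names below (ValidRegion, ComputeValidRows) are illustrative, not from the patch.
    #include <algorithm>
    #include <cstdio>

    struct ValidRegion {
      int valid_h_start;
      int valid_h_end;
      int valid_h;  // clamped to >= 0 so downstream loops cannot run backwards
    };

    // Stride-1 case: rows untouched by padding span [padding_h, output_h - padding_h).
    // With large paddings this range is empty, hence the clamp to zero.
    ValidRegion ComputeValidRows(int output_h, int padding_h) {
      ValidRegion r;
      r.valid_h_start = padding_h;
      r.valid_h_end = output_h - padding_h;
      r.valid_h = std::max(r.valid_h_end - r.valid_h_start, 0);
      return r;
    }

    int main() {
      // Normal case: 8 output rows, padding 1 -> rows 1..6 are valid.
      ValidRegion a = ComputeValidRows(8, 1);
      // Degenerate case: padding larger than half the output height -> empty range.
      ValidRegion b = ComputeValidRows(3, 2);
      std::printf("a: start=%d end=%d valid=%d\n", a.valid_h_start, a.valid_h_end, a.valid_h);
      std::printf("b: start=%d end=%d valid=%d\n", b.valid_h_start, b.valid_h_end, b.valid_h);

      // Pad-bottom loop guarded the same way as in the patch: it only visits rows
      // strictly after valid_h_start - 1, so it never runs when valid_h_end has
      // collapsed below valid_h_start.
      for (int h = b.valid_h_end; h < 3 && h > b.valid_h_start - 1; ++h) {
        std::printf("pad-bottom row %d\n", h);
      }
      return 0;
    }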