diff --git a/CMakeLists.txt b/CMakeLists.txt index 199b3bda17f4ac22c1d657b6794446832d448440..1ec5352fa4009144b9f572ecbe061aba11e884d3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -59,6 +59,7 @@ lite_option(LITE_WITH_CUDA "Enable CUDA in lite mode" OFF) lite_option(LITE_WITH_X86 "Enable X86 in lite mode" ON) lite_option(LITE_WITH_ARM "Enable ARM in lite mode" OFF) lite_option(LITE_WITH_NPU "Enable NPU in lite mode" OFF) +lite_option(LITE_WITH_XPU "Enable XPU in lite mode" OFF) lite_option(LITE_WITH_OPENMP "Enable OpenMP in lite framework" ON) lite_option(LITE_WITH_OPENCL "Enable OpenCL support in lite" OFF) lite_option(LITE_WITH_FPGA "Enable FPGA support in lite" OFF) @@ -184,6 +185,10 @@ if(LITE_WITH_CUDA) include(cuda) endif() +if(LITE_WITH_XPU) + include(xpu) +endif() + include(generic) # simplify cmake module include(ccache) # set ccache for compilation include(util) # set unittest and link libs diff --git a/cmake/configure.cmake b/cmake/configure.cmake index 67830fe2e0ec3c35064acb4c00ec152989ddb655..5dbb7f3fca4a2ecdab943cd49f34ee97f9bac9b0 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -127,6 +127,10 @@ if (LITE_WITH_NPU) add_definitions("-DLITE_WITH_NPU") endif() +if (LITE_WITH_XPU) + add_definitions("-DLITE_WITH_XPU") +endif() + if (LITE_WITH_OPENCL) add_definitions("-DLITE_WITH_OPENCL") endif() diff --git a/cmake/cross_compiling/npu.cmake b/cmake/cross_compiling/npu.cmake index 863200986c93ea09d3fa3049fe684b32c2fb52dd..25aa4d2bc8c1c145e7a103c9164e1c9e231a8f9e 100644 --- a/cmake/cross_compiling/npu.cmake +++ b/cmake/cross_compiling/npu.cmake @@ -50,9 +50,6 @@ find_library(NPU_DDK_IR_FILE NAMES hiai_ir find_library(NPU_DDK_IR_BUILD_FILE NAMES hiai_ir_build PATHS ${NPU_DDK_ROOT}/${NPU_SUB_LIB_PATH}) -find_library(NPU_DDK_PROTO_FILE NAMES protobuf-lite - PATHS ${NPU_DDK_ROOT}/${NPU_SUB_LIB_PATH}) - if(NOT NPU_DDK_HIAI_FILE) message(FATAL_ERROR "Can not find NPU_DDK_HIAI_FILE in ${NPU_DDK_ROOT}") else() @@ -77,14 +74,8 @@ else() set_property(TARGET npu_ddk_ir_build PROPERTY IMPORTED_LOCATION ${NPU_DDK_IR_BUILD_FILE}) endif() -if(NOT NPU_DDK_PROTO_FILE) - message(FATAL_ERROR "Can not find NPU_DDK_PROTO_FILE in ${NPU_DDK_ROOT}") -else() - message(STATUS "Found NPU_DDK Protobuf Library: ${NPU_DDK_PROTO_FILE}") - add_library(npu_ddk_proto SHARED IMPORTED GLOBAL) - set_property(TARGET npu_ddk_proto PROPERTY IMPORTED_LOCATION ${NPU_DDK_PROTO_FILE}) -endif() +set(npu_runtime_libs npu_ddk_hiai CACHE INTERNAL "npu ddk runtime libs") +set(npu_builder_libs npu_ddk_ir npu_ddk_ir_build CACHE INTERNAL "npu ddk builder libs") -set(npu_ddk_libs npu_ddk_hiai npu_ddk_ir npu_ddk_ir_build npu_ddk_proto CACHE INTERNAL "npu ddk libs") diff --git a/cmake/lite.cmake b/cmake/lite.cmake index 7d8641d96da86cf9a2be442b797507ac79058efa..9b6fab3f6261ff13361bda35cfa9cd681075c77d 100644 --- a/cmake/lite.cmake +++ b/cmake/lite.cmake @@ -22,7 +22,7 @@ endfunction() function (lite_deps TARGET) set(options "") set(oneValueArgs "") - set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS CL_DEPS FPGA_DEPS NPU_DEPS ARGS) + set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS CL_DEPS FPGA_DEPS NPU_DEPS XPU_DEPS ARGS) cmake_parse_arguments(lite_deps "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) set(deps ${lite_deps_DEPS}) @@ -83,6 +83,12 @@ function (lite_deps TARGET) endforeach(var) endif() + if (LITE_WITH_XPU) + foreach(var ${lite_deps_XPU_DEPS}) + set(deps ${deps} ${var}) + endforeach(var) + endif() + set(${TARGET} ${deps} 
PARENT_SCOPE) endfunction() @@ -107,7 +113,7 @@ file(WRITE ${offline_lib_registry_file} "") # clean function(lite_cc_library TARGET) set(options SHARED shared STATIC static MODULE module) set(oneValueArgs "") - set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS NPU_DEPS ARM_DEPS FPGA_DEPS PROFILE_DEPS LIGHT_DEPS + set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS NPU_DEPS XPU_DEPS ARM_DEPS FPGA_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS) cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) @@ -118,6 +124,7 @@ function(lite_cc_library TARGET) CUDA_DEPS ${args_CUDA_DEPS} CL_DEPS ${args_CL_DEPS} NPU_DEPS ${args_NPU_DEPS} + XPU_DEPS ${args_XPU_DEPS} ARM_DEPS ${args_ARM_DEPS} FPGA_DEPS ${args_FPGA_DEPS} PROFILE_DEPS ${args_PROFILE_DEPS} @@ -236,6 +243,7 @@ set(arm_kernels CACHE INTERNAL "arm kernels") set(x86_kernels CACHE INTERNAL "x86 kernels") set(fpga_kernels CACHE INTERNAL "fpga kernels") set(npu_kernels CACHE INTERNAL "npu kernels") +set(xpu_kernels CACHE INTERNAL "xpu kernels") set(opencl_kernels CACHE INTERNAL "opencl kernels") set(host_kernels CACHE INTERNAL "host kernels") @@ -305,6 +313,12 @@ function(add_kernel TARGET device level) endif() set(npu_kernels "${npu_kernels};${TARGET}" CACHE INTERNAL "") endif() + if ("${device}" STREQUAL "XPU") + if (NOT LITE_WITH_XPU) + return() + endif() + set(xpu_kernels "${xpu_kernels};${TARGET}" CACHE INTERNAL "") + endif() if ("${device}" STREQUAL "FPGA") if (NOT LITE_WITH_FPGA) return() @@ -338,6 +352,7 @@ function(add_kernel TARGET device level) lite_cc_library(${TARGET} SRCS ${args_SRCS} DEPS ${args_DEPS} X86_DEPS ${args_X86_DEPS} + XPU_DEPS ${args_XPU_DEPS} CUDA_DEPS ${args_CUDA_DEPS} CL_DEPS ${args_CL_DEPS} ARM_DEPS ${args_ARM_DEPS} @@ -386,6 +401,7 @@ function(add_operator TARGET level) lite_cc_library(${TARGET} SRCS ${args_SRCS} DEPS ${args_DEPS} X86_DEPS ${args_X86_DEPS} + XPU_DEPS ${args_XPU_DEPS} CUDA_DEPS ${args_CUDA_DEPS} CL_DEPS ${args_CL_DEPS} ARM_DEPS ${args_ARM_DEPS} diff --git a/cmake/xpu.cmake b/cmake/xpu.cmake new file mode 100644 index 0000000000000000000000000000000000000000..7bf63f93d3646a2a1f009bd51b369e6bc014091a --- /dev/null +++ b/cmake/xpu.cmake @@ -0,0 +1,103 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +if(NOT LITE_WITH_XPU) + return() +endif() + +if(NOT DEFINED XPU_SDK_ROOT) + set(XPU_SDK_ROOT $ENV{XPU_SDK_ROOT}) + if(NOT XPU_SDK_ROOT) + message(FATAL_ERROR "Must set XPU_SDK_ROOT or env XPU_SDK_ROOT when LITE_WITH_XPU=ON") + endif() +endif() + +message(STATUS "XPU_SDK_ROOT: ${XPU_SDK_ROOT}") +find_path(XPU_SDK_INC NAMES xtcl.h + PATHS ${XPU_SDK_ROOT}/XTCL/include/xtcl NO_DEFAULT_PATH) +if(NOT XPU_SDK_INC) + message(FATAL_ERROR "Can not find xtcl.h in ${XPU_SDK_ROOT}/include") +endif() + +include_directories("${XPU_SDK_ROOT}/XTCL/include") +include_directories("${XPU_SDK_ROOT}/XTDK/include") + +find_library(XPU_SDK_XTCL_FILE NAMES xtcl + PATHS ${XPU_SDK_ROOT}/XTCL/so) + +if(NOT XPU_SDK_XTCL_FILE) + message(FATAL_ERROR "Can not find XPU XTCL Library in ${XPU_SDK_ROOT}") +else() + message(STATUS "Found XPU XTCL Library: ${XPU_SDK_XTCL_FILE}") + add_library(xpu_sdk_xtcl SHARED IMPORTED GLOBAL) + set_property(TARGET xpu_sdk_xtcl PROPERTY IMPORTED_LOCATION ${XPU_SDK_XTCL_FILE}) +endif() + +find_library(XPU_SDK_TVM_FILE NAMES tvm + PATHS ${XPU_SDK_ROOT}/XTCL/so) + +if(NOT XPU_SDK_TVM_FILE) + message(FATAL_ERROR "Can not find XPU TVM Library in ${XPU_SDK_ROOT}") +else() + message(STATUS "Found XPU TVM Library: ${XPU_SDK_TVM_FILE}") + add_library(xpu_sdk_tvm SHARED IMPORTED GLOBAL) + set_property(TARGET xpu_sdk_tvm PROPERTY IMPORTED_LOCATION ${XPU_SDK_TVM_FILE}) +endif() + +find_library(XPU_SDK_XPU_API_FILE NAMES xpuapi + PATHS ${XPU_SDK_ROOT}/XTDK/shlib) + +if(NOT XPU_SDK_XPU_API_FILE) + message(FATAL_ERROR "Can not find XPU API Library in ${XPU_SDK_ROOT}") +else() + message(STATUS "Found XPU API Library: ${XPU_SDK_XPU_API_FILE}") + add_library(xpu_sdk_xpu_api SHARED IMPORTED GLOBAL) + set_property(TARGET xpu_sdk_xpu_api PROPERTY IMPORTED_LOCATION ${XPU_SDK_XPU_API_FILE}) +endif() + +find_library(XPU_SDK_XPU_RT_FILE NAMES xpurt + PATHS ${XPU_SDK_ROOT}/XTDK/shlib) + +if(NOT XPU_SDK_XPU_RT_FILE) + message(FATAL_ERROR "Can not find XPU RT Library in ${XPU_SDK_ROOT}") +else() + message(STATUS "Found XPU RT Library: ${XPU_SDK_XPU_RT_FILE}") + add_library(xpu_sdk_xpu_rt SHARED IMPORTED GLOBAL) + set_property(TARGET xpu_sdk_xpu_rt PROPERTY IMPORTED_LOCATION ${XPU_SDK_XPU_RT_FILE}) +endif() + +find_library(XPU_SDK_XPU_JITC_FILE NAMES xpujitc + PATHS ${XPU_SDK_ROOT}/XTDK/shlib) + +if(NOT XPU_SDK_XPU_JITC_FILE) + message(FATAL_ERROR "Can not find XPU JITC Library in ${XPU_SDK_ROOT}") +else() + message(STATUS "Found XPU JITC Library: ${XPU_SDK_XPU_JITC_FILE}") + add_library(xpu_sdk_xpu_jitc SHARED IMPORTED GLOBAL) + set_property(TARGET xpu_sdk_xpu_jitc PROPERTY IMPORTED_LOCATION ${XPU_SDK_XPU_JITC_FILE}) +endif() + +find_library(XPU_SDK_LLVM_FILE NAMES LLVM-8 + PATHS ${XPU_SDK_ROOT}/XTDK/shlib) + +if(NOT XPU_SDK_LLVM_FILE) + message(FATAL_ERROR "Can not find LLVM Library in ${XPU_SDK_ROOT}") +else() + message(STATUS "Found XPU LLVM Library: ${XPU_SDK_LLVM_FILE}") + add_library(xpu_sdk_llvm SHARED IMPORTED GLOBAL) + set_property(TARGET xpu_sdk_llvm PROPERTY IMPORTED_LOCATION ${XPU_SDK_LLVM_FILE}) +endif() + +set(xpu_runtime_libs xpu_sdk_xtcl xpu_sdk_tvm xpu_sdk_xpu_api xpu_sdk_xpu_rt xpu_sdk_xpu_jitc xpu_sdk_llvm CACHE INTERNAL "xpu runtime libs") +set(xpu_builder_libs xpu_sdk_xtcl xpu_sdk_tvm xpu_sdk_xpu_api xpu_sdk_xpu_rt xpu_sdk_xpu_jitc xpu_sdk_llvm CACHE INTERNAL "xpu builder libs") diff --git a/lite/CMakeLists.txt b/lite/CMakeLists.txt index ecd6f6634251246b3a759f52dbb538e66505025c..bff5a231388b62772c20d194ec140518d9765b27 100644 --- a/lite/CMakeLists.txt +++ b/lite/CMakeLists.txt @@ -6,6 +6,7 @@ 
message(STATUS "LITE_WITH_CUDA:\t${LITE_WITH_CUDA}") message(STATUS "LITE_WITH_X86:\t${LITE_WITH_X86}") message(STATUS "LITE_WITH_ARM:\t${LITE_WITH_ARM}") message(STATUS "LITE_WITH_NPU:\t${LITE_WITH_NPU}") +message(STATUS "LITE_WITH_XPU:\t${LITE_WITH_XPU}") message(STATUS "LITE_WITH_FPGA:\t${LITE_WITH_FPGA}") message(STATUS "LITE_WITH_PROFILE:\t${LITE_WITH_PROFILE}") diff --git a/lite/api/CMakeLists.txt b/lite/api/CMakeLists.txt index a4ea1b8ff9d09268d76e63f0d032a17d390f90a2..4e768731d295452f424e69b80cb6ef167e6b013f 100644 --- a/lite/api/CMakeLists.txt +++ b/lite/api/CMakeLists.txt @@ -26,11 +26,21 @@ if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_X86 OR ARM_TARGET_OS STREQUAL "and DEPS ${light_lib_DEPS} ARM_DEPS ${arm_kernels} NPU_DEPS ${npu_kernels}) target_link_libraries(paddle_light_api_shared ${light_lib_DEPS} ${arm_kernels} ${npu_kernels}) + if (LITE_WITH_NPU) + # Strips the symbols of our protobuf functions to fix the conflicts during + # loading HIAI builder libs (libhiai_ir.so and libhiai_ir_build.so) + set(LINK_FLAGS "-Wl,--version-script ${PADDLE_SOURCE_DIR}/lite/core/lite.map") + set_target_properties(paddle_light_api_shared PROPERTIES LINK_FLAGS "${LINK_FLAGS}") + endif() else() if ((ARM_TARGET_OS STREQUAL "android") OR (ARM_TARGET_OS STREQUAL "armlinux")) add_library(paddle_light_api_shared SHARED "") target_sources(paddle_light_api_shared PUBLIC ${__lite_cc_files} paddle_api.cc light_api.cc light_api_impl.cc) add_dependencies(paddle_light_api_shared op_list_h kernel_list_h) + if (LITE_WITH_NPU) + # Need to add HIAI runtime libs (libhiai.so) dependency + target_link_libraries(paddle_light_api_shared ${npu_runtime_libs}) + endif() endif() endif() @@ -39,7 +49,8 @@ if (WITH_TESTING) DEPS scope optimizer target_wrapper_host model_parser program ${ops} ${host_kernels} CUDA_DEPS ${cuda_kernels} - X86_DEPS ${x86_kernels}) + X86_DEPS ${x86_kernels} + XPU_DEPS ${xpu_kernels}) endif() if(LITE_WITH_FPGA) set(light_api_deps ${light_api_deps} ${fpga_deps}) @@ -51,6 +62,7 @@ message(STATUS "get X86 kernels ${x86_kernels}") message(STATUS "get Host kernels ${host_kernels}") message(STATUS "get ARM kernels ${arm_kernels}") message(STATUS "get NPU kernels ${npu_kernels}") +message(STATUS "get XPU kernels ${xpu_kernels}") message(STATUS "get FPGA kernels ${fpga_kernels}") # for full api @@ -63,6 +75,7 @@ if (NOT LITE_ON_TINY_PUBLISH) X86_DEPS ${x86_kernels} ARM_DEPS ${arm_kernels} NPU_DEPS ${npu_kernels} ${npu_bridges} npu_pass + XPU_DEPS ${xpu_kernels} ${xpu_bridges} xpu_pass CL_DEPS ${opencl_kenrels} FPGA_DEPS ${fpga_kenrels}) endif() @@ -82,6 +95,7 @@ lite_cc_library(light_api SRCS light_api.cc X86_DEPS ${x86_kernels} ARM_DEPS ${arm_kernels} NPU_DEPS ${npu_kernels} + XPU_DEPS ${xpu_kernels} CL_DEPS ${opencl_kenrels} FPGA_DEPS ${fpga_kenrels}) @@ -96,6 +110,7 @@ if(WITH_TESTING) X86_DEPS ${x86_kernels} ARM_DEPS ${arm_kernels} NPU_DEPS ${npu_kernels} + XPU_DEPS ${xpu_kernels} CL_DEPS ${opencl_kernels} FPGA_DEPS ${fpga_kernels} EXCLUDE_COMPILE_DEPS "ON" @@ -223,6 +238,7 @@ lite_cc_test(test_apis SRCS apis_test.cc DEPS cxx_api light_api ${ops} paddle_api_light CL_DEPS ${opencl_kernels} X86_DEPS ${x86_kernels} + XPU_DEPS ${xpu_kernels} FPGA_DEPS ${fpga_kernels} ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model --optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL) @@ -250,6 +266,7 @@ lite_cc_test(test_paddle_api SRCS paddle_api_test.cc DEPS paddle_api_full paddle ${ops} ARM_DEPS ${arm_kernels} NPU_DEPS ${npu_kernels} + XPU_DEPS ${xpu_kernels} CL_DEPS ${opencl_kernels} X86_DEPS 
${x86_kernels} FPGA_DEPS ${fpga_kernels} @@ -264,6 +281,7 @@ if(NOT IOS) ${ops} ${host_kernels} ARM_DEPS ${arm_kernels} NPU_DEPS ${npu_kernels} + XPU_DEPS ${xpu_kernels} CL_DEPS ${opencl_kernels} FPGA_DEPS ${fpga_kernels} X86_DEPS ${x86_kernels}) @@ -271,6 +289,7 @@ if(NOT IOS) ${ops} ${host_kernels} ARM_DEPS ${arm_kernels} NPU_DEPS ${npu_kernels} + XPU_DEPS ${xpu_kernels} CL_DEPS ${opencl_kernels} FPGA_DEPS ${fpga_kernels} X86_DEPS ${x86_kernels}) diff --git a/lite/api/android/jni/native/CMakeLists.txt b/lite/api/android/jni/native/CMakeLists.txt index afe051a437f4de83931bdaa3f2d03427b78d13ad..3efa980332f25d786d5c880fab9b3ba5af0a1013 100644 --- a/lite/api/android/jni/native/CMakeLists.txt +++ b/lite/api/android/jni/native/CMakeLists.txt @@ -17,10 +17,20 @@ if (NOT LITE_ON_TINY_PUBLISH) # Unlike static library, module library has to link target to be able to work # as a single .so lib. target_link_libraries(paddle_lite_jni ${lib_DEPS} ${arm_kernels} ${npu_kernels}) + if (LITE_WITH_NPU) + # Strips the symbols of our protobuf functions to fix the conflicts during + # loading HIAI builder libs (libhiai_ir.so and libhiai_ir_build.so) + set(LINK_FLAGS "-Wl,--version-script ${PADDLE_SOURCE_DIR}/lite/core/lite.map") + set_target_properties(paddle_lite_jni PROPERTIES LINK_FLAGS "${LINK_FLAGS}") + endif() else() add_library(paddle_lite_jni SHARED "") target_sources(paddle_lite_jni PUBLIC ${__lite_cc_files} paddle_lite_jni.cc tensor_jni.cc) add_dependencies(paddle_lite_jni op_list_h kernel_list_h) + if (LITE_WITH_NPU) + # Need to add HIAI runtime libs (libhiai.so) dependency + target_link_libraries(paddle_lite_jni ${npu_runtime_libs}) + endif() endif() if (APPLE) diff --git a/lite/api/paddle_place.cc b/lite/api/paddle_place.cc index dbdf9ff269b372cd3dcd59769b15526b7631a5e5..ccacb027d682b5388e44b05075b66f436c3e2668 100644 --- a/lite/api/paddle_place.cc +++ b/lite/api/paddle_place.cc @@ -46,8 +46,16 @@ std::string Place::DebugString() const { } const std::string& TargetToStr(TargetType target) { - static const std::string target2string[] = { - "unk", "host", "x86", "cuda", "arm", "opencl", "any", "fpga", "npu"}; + static const std::string target2string[] = {"unk", + "host", + "x86", + "cuda", + "arm", + "opencl", + "any", + "fpga", + "npu", + "xpu"}; auto x = static_cast(target); CHECK_LT(x, static_cast(TARGET(NUM))); return target2string[x]; @@ -84,7 +92,8 @@ const std::string& TargetRepr(TargetType target) { "kOpenCL", "kAny", "kFPGA", - "kNPU"}; + "kNPU", + "kXPU"}; auto x = static_cast(target); CHECK_LT(x, static_cast(TARGET(NUM))); return target2string[x]; diff --git a/lite/api/paddle_place.h b/lite/api/paddle_place.h index 5e4f2ed21c8298ac15a912672e3d15633d0a3ecb..19ec5c6e8b5e39d1c68f9a20968472cbc66e89a2 100644 --- a/lite/api/paddle_place.h +++ b/lite/api/paddle_place.h @@ -50,8 +50,9 @@ enum class TargetType : int { kOpenCL = 5, kFPGA = 7, kNPU = 8, + kXPU = 9, kAny = 6, // any target - NUM = 9, // number of fields. + NUM = 10, // number of fields. 
}; enum class PrecisionType : int { kUnk = 0, diff --git a/lite/backends/CMakeLists.txt b/lite/backends/CMakeLists.txt index 70b4f0bbf794ed7ca537177f48fee34a5955aba5..dec63e6efa0e4c4548646ebdd6f6de24f046d6d0 100644 --- a/lite/backends/CMakeLists.txt +++ b/lite/backends/CMakeLists.txt @@ -5,3 +5,4 @@ add_subdirectory(cuda) add_subdirectory(fpga) add_subdirectory(host) add_subdirectory(npu) +add_subdirectory(xpu) diff --git a/lite/backends/npu/CMakeLists.txt b/lite/backends/npu/CMakeLists.txt index 370f620b919d9cdb7458a704b205951caf4bf8af..426ff5698146c773c818b2bfd598d6bbbdf7867f 100644 --- a/lite/backends/npu/CMakeLists.txt +++ b/lite/backends/npu/CMakeLists.txt @@ -2,4 +2,5 @@ if(NOT LITE_WITH_NPU) return() endif() -lite_cc_library(npu_runtime SRCS runtime.cc DEPS npu_ddk_hiai) +lite_cc_library(npu_runtime SRCS runtime.cc DEPS ${npu_runtime_libs}) +lite_cc_library(npu_builder SRCS builder.cc DEPS ${npu_builder_libs} npu_runtime tensor op scope) diff --git a/lite/kernels/npu/bridges/utils.cc b/lite/backends/npu/builder.cc similarity index 92% rename from lite/kernels/npu/bridges/utils.cc rename to lite/backends/npu/builder.cc index 933d8188c99a36bd537ac4a5ee5f584a2b79956a..80ab6e486b6cd9a67f4162ffb11d7bdac959eca9 100644 --- a/lite/kernels/npu/bridges/utils.cc +++ b/lite/backends/npu/builder.cc @@ -12,21 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/npu/bridges/utils.h" +#include "lite/backends/npu/builder.h" #include // NOLINT #include -#include "ai_ddk_lib/include/graph/buffer.h" -#include "ai_ddk_lib/include/graph/model.h" -#include "ai_ddk_lib/include/graph/op/all_ops.h" // for ge::op::Data -#include "ai_ddk_lib/include/graph/tensor.h" // for ge::TensorUtils -#include "ai_ddk_lib/include/hiai_ir_build.h" #include "lite/backends/npu/runtime.h" namespace paddle { namespace lite { -namespace kernels { namespace npu { -namespace bridges { // Build HIAI IR graph to om model, and store om model data into lite tensor bool BuildModel(std::vector& inputs, // NOLINT @@ -165,8 +158,6 @@ bool HasInputArg(const OpInfo* op_info, } } -} // namespace bridges } // namespace npu -} // namespace kernels } // namespace lite } // namespace paddle diff --git a/lite/backends/npu/builder.h b/lite/backends/npu/builder.h new file mode 100644 index 0000000000000000000000000000000000000000..a245a8517b1c8e20a4630d370da5ca0b203adb71 --- /dev/null +++ b/lite/backends/npu/builder.h @@ -0,0 +1,254 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
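# Illustration (not part of this patch): with the kXPU entry added to TargetType above,
# a predictor can request XPU kernels through the public API. The kX86/kHost places are
# assumed fallbacks here for ops that have no XPU bridge.
#
#   #include "lite/api/paddle_api.h"
#   #include "lite/api/paddle_place.h"
#
#   // Hypothetical helper: prefer XPU kernels, fall back to X86/Host.
#   void SetXpuValidPlaces(paddle::lite_api::CxxConfig* config) {
#     config->set_valid_places({
#         paddle::lite_api::Place{TARGET(kXPU), PRECISION(kFloat)},
#         paddle::lite_api::Place{TARGET(kX86), PRECISION(kFloat)},
#         paddle::lite_api::Place{TARGET(kHost), PRECISION(kFloat)},
#     });
#   }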
+ +#pragma once + +#include +#include +#include +#include +#include "ai_ddk_lib/include/graph/buffer.h" +#include "ai_ddk_lib/include/graph/graph.h" +#include "ai_ddk_lib/include/graph/model.h" +#include "ai_ddk_lib/include/graph/op/all_ops.h" +#include "ai_ddk_lib/include/graph/operator.h" +#include "ai_ddk_lib/include/graph/operator_reg.h" +#include "ai_ddk_lib/include/hiai_ir_build.h" +#include "lite/core/op_lite.h" +#include "lite/core/target_wrapper.h" +#include "lite/core/tensor.h" + +// Extended Ops of HIAI DDK +namespace ge { +/** + * Multiply the matrix x1 by the matrix x2 to generate x1 * x2. + * The inputs must be two-dimensional matrices and the inner dimension of "x1" + * (after being transposed if transpose_x1 is true) must match the outer + * dimension of "x2" (after being transposed if transposed_x2 is true). + * x : the first input tensor, must be non const op. + * w : the second input tensor, must be const op. + * bias: the optional bias tensor, must be const op. + * + * y : the output tensor. + * + * has_bias: If true, enable input bias. + */ +REG_OP(MatMul) + .INPUT(x, TensorType({DT_FLOAT})) + .INPUT(w, TensorType({DT_FLOAT})) + .OPTIONAL_INPUT(bias, TensorType({DT_FLOAT})) // bias must be const input + .OUTPUT(y, TensorType({DT_FLOAT})) + .ATTR(has_bias, AttrValue::BOOL{false}) // when has input::bias,set true + .OP_END(); + +/** + * Computes the gradients of convolution with respect to the input. + * + * input_sizes : An integer vector representing the shape of input, + * where input is a 4-D [batch, height, width, channels] tensor. + * filter : the filter tensor, with shape [H , W, filter_channel, + * filter_number], filter_channel must be same as x channel. + * x : The input tensor. + * + * y : The output tensor. + * + * format: 0: NCHW. 1: NHWC + * group : 1: default + * num_output : 0: default, num_output must be equal to + * (filter_channel * group) + * pad : Padding for the beginning and ending along each axis + * stride : Stride along each axis. + * dilation : dilation value along each axis of the filter. + * pad_mode : 0:NOTSET, 5:VALID 6:SAME. defaul value is 0:NOTSET + * bias_term : 0: default + * kernel : The shape of the convolution kernel + */ +REG_OP(Deconvolution) + .INPUT(input_sizes, TensorType({DT_UINT8})) + .INPUT(filter, TensorType({DT_FLOAT})) + .INPUT(x, TensorType({DT_FLOAT})) + .OPTIONAL_INPUT(b, TensorType({DT_FLOAT})) + .OUTPUT(y, TensorType({DT_FLOAT})) + .ATTR(mode, AttrValue::INT{1}) + .ATTR(format, AttrValue::INT{1}) + .ATTR(group, AttrValue::INT{1}) + .ATTR(num_output, AttrValue::INT{0}) + .ATTR(pad, AttrValue::LIST_INT({0, 0, 0, 0})) + .ATTR(stride, AttrValue::LIST_INT({1, 1})) + .ATTR(dilation, AttrValue::LIST_INT({1, 1})) + .ATTR(pad_mode, AttrValue::INT{0}) + .ATTR(bias_term, AttrValue::INT{0}) + .ATTR(kernel, AttrValue::LIST_INT({0, 0})) + .OP_END(); + +/** + * Resize images to size using bilinear interpolation. + * + * x : The tensor of 4-D + * w : A int32 Tensor of 2 elements: [height, width]. + * + * y : the output tensor + * + * align_corners : If true, the centers of the 4 corner pixels of the + * input and output tensors are aligned, preserving the values at the corner + * pixels. + * output_dim_mode : Defaults 2, including 0: zoom_factor , 1: + * shrink_factor, 2: height/width. when output_dim_mode=2, the output-dim is + * controled by the [height, width] of w. + * shrink_factor : shrink factor. + * zoom_factor : zoom factor. + * pad_begin : begin of pad. + * pad_end : end of pad. 
+ */ +REG_OP(ResizeBilinear) + .INPUT(x, TensorType({DT_FLOAT, DT_INT32})) + .INPUT(w, TensorType({DT_FLOAT, DT_INT32})) + .OUTPUT(y, TensorType({DT_FLOAT, DT_INT32})) + .ATTR(align_corners, AttrValue::BOOL{false}) + .ATTR(output_dim_mode, AttrValue::INT{2}) + .ATTR(shrink_factor, AttrValue::INT{1}) + .ATTR(zoom_factor, AttrValue::INT{1}) + .ATTR(pad_begin, AttrValue::INT{0}) + .ATTR(pad_end, AttrValue::INT{0}) + .OP_END(); + +/** + * Resize images to size using nearest neighbor interpolation. + * + * image : Resize images to size using nearest neighbor interpolation. + * size : Must be one dimension and two elements + * + * output : the output tensor + * + * align_corners : If true, the centers of the 4 corner pixels of the + * input and output tensors are aligned, preserving the values at the corner + * pixels. Defaults to false + */ +REG_OP(ResizeNearestNeighbor) + .INPUT(image, TensorType({DT_FLOAT, DT_INT32, DT_UINT8, DT_BOOL})) + .INPUT(size, TensorType({DT_INT32})) + .OUTPUT(output, TensorType({DT_FLOAT, DT_INT32, DT_UINT8, DT_BOOL})) + .ATTR(align_corners, AttrValue::BOOL{false}) + .OP_END(); + +/** + * Pads a tensor. + * + * x : the input tensor + * padding : the input tensor must be 2-D + * constant_values : constant values must be a scalar + * + * output : the output tensor + * + * t_paddings : Default DT_INT32 , t_paddings must be the same with + * datatype of the padding + * mode : 0: CONSTANT, 1: REFLECT, 2: SYMMETRIC + * T : datatype of constant_values DT_INT32:3 DT_FLOAT:0 + */ +REG_OP(Pad) + .INPUT(x, TensorType({DT_FLOAT, DT_INT32})) + .INPUT(padding, TensorType({DT_INT32})) + .OPTIONAL_INPUT(constant_values, TensorType({DT_INT32, DT_FLOAT})) + .OUTPUT(output, TensorType({DT_FLOAT, DT_INT32})) + .ATTR(t_paddings, AttrValue::INT{3}) + .ATTR(mode, AttrValue::INT{0}) + .REQUIRED_ATTR(T, AttrValue::INT) + .OP_END(); + +} // namespace ge + +namespace paddle { +namespace lite { +namespace npu { + +class OpList { + public: + static OpList& Global() { + static thread_local OpList x; + return x; + } + void clear() { lists_.clear(); } + void add(std::shared_ptr p) { lists_.push_back(p); } + + private: + std::vector> lists_; +}; + +// Build HIAI IR graph to om model, and store om model data into lite tensor +bool BuildModel(std::vector& inputs, // NOLINT + std::vector& outputs, // NOLINT + lite::Tensor* model_data); + +std::string UniqueName(const std::string& prefix); + +ge::DataType PrecisionConverter(PrecisionType itype); + +ge::Format DataLayoutConverter(DataLayoutType itype); + +ge::TensorPtr CvtFromLiteTensor(Tensor* in_tensor, + std::vector out_shape = {}, + PrecisionType in_ptype = PRECISION(kFloat), + DataLayoutType in_ltype = DATALAYOUT(kNCHW)); + +template +ge::TensorPtr CreateTensorAndFillData(std::vector data, + std::vector shape = {}, + ge::Format format = ge::FORMAT_NCHW) { + const std::type_info& info = typeid(T); + ge::DataType type = ge::DT_FLOAT; + if (info == typeid(float)) { + type = ge::DT_FLOAT; + } else if (info == typeid(int8_t)) { + type = ge::DT_INT8; + } else if (info == typeid(int32_t)) { + type = ge::DT_INT32; + } else { + LOG(FATAL) << "Unknow value type " << info.name(); + } + if (shape.empty()) { + shape = {static_cast(data.size())}; + } else { + int size = 1; + for (auto i : shape) { + size *= i; + } + CHECK_EQ(data.size(), size); + } + ge::TensorDesc desc(ge::Shape(shape), format, type); + ge::TensorPtr tensor = std::make_shared(); + tensor->SetTensorDesc(desc); + tensor->SetData(reinterpret_cast(data.data()), + data.size() * sizeof(T)); + return 
tensor; +} + +template +ge::TensorPtr CreateTensorAndFillData(T value, + std::vector shape = {1}, + ge::Format format = ge::FORMAT_NCHW) { + int64_t size = 1; + for (auto i : shape) { + size *= i; + } + std::vector data(size, value); + return CreateTensorAndFillData(data, shape, format); +} + +bool HasInputArg(const OpInfo* op_info, + const Scope* scope, + const std::string& argname); + +} // namespace npu +} // namespace lite +} // namespace paddle diff --git a/lite/backends/x86/math/CMakeLists.txt b/lite/backends/x86/math/CMakeLists.txt index 5cc4a9f0770e79683197bce0bb83336b8c79c364..2dea4364d5ee2d11d6d266935fad2a1180954369 100644 --- a/lite/backends/x86/math/CMakeLists.txt +++ b/lite/backends/x86/math/CMakeLists.txt @@ -32,8 +32,8 @@ math_library(sampler) math_library(gru_compute DEPS activation_functions math_function) math_library(lstm_compute DEPS activation_functions) -lite_cc_library(blas SRCS blas.cc DEPS cblas framework_proto eigen3) -math_library(math_function DEPS blas) +lite_cc_library(blas SRCS blas.cc DEPS cblas framework_proto eigen3 dynload_mklml) +math_library(math_function DEPS blas dynload_mklml) math_library(maxouting) math_library(pooling) math_library(selected_rows_functor DEPS selected_rows math_function blas) diff --git a/lite/backends/xpu/CMakeLists.txt b/lite/backends/xpu/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..f911f8e0e7c61481e1d4e309bc0635718be11206 --- /dev/null +++ b/lite/backends/xpu/CMakeLists.txt @@ -0,0 +1,6 @@ +if(NOT LITE_WITH_XPU) + return() +endif() + +lite_cc_library(xpu_runtime SRCS runtime.cc DEPS ${xpu_runtime_libs}) +lite_cc_library(xpu_builder SRCS builder.cc DEPS ${xpu_builder_libs} xpu_runtime tensor op scope) diff --git a/lite/backends/xpu/builder.cc b/lite/backends/xpu/builder.cc new file mode 100644 index 0000000000000000000000000000000000000000..796eaf9c46ceb3d29f1ffdc4c86ac45509f07ba1 --- /dev/null +++ b/lite/backends/xpu/builder.cc @@ -0,0 +1,189 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
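# Usage sketch (assumption, for illustration only) of the CreateTensorAndFillData helpers
# declared above in lite/backends/npu/builder.h; an NPU bridge would typically attach the
# resulting constant tensors to a const op via set_attr_value().
#
#   #include <vector>
#   #include "lite/backends/npu/builder.h"
#
#   // Hypothetical bridge snippet: build constant HIAI tensors.
#   void FillConstInputs() {
#     // Vector overload: the shape defaults to {data.size()} when omitted.
#     std::vector<float> scale_data = {0.5f, 0.25f, 0.125f};
#     ge::TensorPtr scale =
#         paddle::lite::npu::CreateTensorAndFillData(scale_data, {1, 3, 1, 1});
#     // Scalar overload: broadcasts a single value over the given shape.
#     ge::TensorPtr bias =
#         paddle::lite::npu::CreateTensorAndFillData(0.0f, {1, 3, 1, 1});
#     (void)scale;
#     (void)bias;
#   }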
+ +#include "lite/backends/xpu/builder.h" +#include // NOLINT +#include +#include "lite/backends/xpu/runtime.h" + +namespace paddle { +namespace lite { +namespace xpu { + +bool HasInputArg(const OpInfo* op_info, + const Scope* scope, + const std::string& argname) { + auto iarg_names = op_info->input_argnames(); + if (std::find(iarg_names.begin(), iarg_names.end(), argname) != + iarg_names.end()) { + auto inputs = op_info->Input(argname); + if (inputs.empty()) { + return false; + } + auto var_name = inputs.front(); + auto var = scope->FindVar(var_name); + return var != nullptr; + } else { + return false; + } +} + +std::string UniqueName(const std::string& prefix) { + static std::mutex counter_mtx; + static std::unordered_map counter_map; + std::unique_lock counter_lck(counter_mtx); + int counter = 1; + auto it = counter_map.find(prefix); + if (it == counter_map.end()) { + counter_map[prefix] = counter; + } else { + counter = ++(it->second); + } + return prefix + "_" + std::to_string(counter); +} + +xtcl::DataType CvtPrecisionType(PrecisionType in_type) { + xtcl::DataType out_type = ::xtcl::Float(32); + switch (in_type) { + case PRECISION(kFloat): + out_type = ::xtcl::Float(32); + break; + case PRECISION(kInt8): + out_type = ::xtcl::Int(8); + break; + case PRECISION(kInt32): + out_type = ::xtcl::Int(32); + break; + default: + LOG(FATAL) << "Can not convert precision type(" << PrecisionToStr(in_type) + << ") from Lite to XPU"; + break; + } + return out_type; +} + +DLDataType CvtDataType(PrecisionType in_type) { + DLDataType out_type = {kDLFloat, 32, 1}; + switch (in_type) { + case PRECISION(kFloat): + out_type = {kDLFloat, 32, 1}; + break; + case PRECISION(kInt8): + out_type = {kDLInt, 8, 1}; + break; + case PRECISION(kInt32): + out_type = {kDLInt, 32, 1}; + break; + default: + LOG(FATAL) << "Can not convert data type(" << PrecisionToStr(in_type) + << ") from Lite to XPU"; + break; + } + return out_type; +} + +xtcl::Array CvtShape(const std::vector& in_shape) { + xtcl::Array out_shape; + for (auto dim : in_shape) { + out_shape.push_back(dim); + } + return out_shape; +} + +xtcl::Array CvtShape(const std::vector& in_shape) { + return CvtShape(std::vector(in_shape.begin(), in_shape.end())); +} + +xtcl::Array CvtShape(const DDim& in_dims) { + return CvtShape(in_dims.Vectorize()); +} + +std::shared_ptr CvtTensor(lite::Tensor* in_tensor, + std::vector out_shape, + PrecisionType in_ptype, + DataLayoutType in_ltype) { + uint8_t* in_data = nullptr; + auto in_size = in_tensor->dims().production(); + auto in_shape = in_tensor->dims().Vectorize(); + if (out_shape.empty()) { + out_shape = in_shape; + } + int in_bytes; + if (in_ptype == PRECISION(kFloat)) { + in_data = reinterpret_cast(in_tensor->mutable_data()); + in_bytes = in_size * sizeof(float); + } else if (in_ptype == PRECISION(kInt32)) { + in_data = reinterpret_cast(in_tensor->mutable_data()); + in_bytes = in_size * sizeof(int32_t); + } else if (in_ptype == PRECISION(kInt8)) { + in_data = reinterpret_cast(in_tensor->mutable_data()); + in_bytes = in_size * sizeof(int8_t); + } else { + LOG(FATAL) << "Unknow precision type " << PrecisionToStr(in_ptype); + } + auto out_tensor = std::make_shared( + xtcl::xNDArray::Empty(out_shape, CvtDataType(in_ptype), {kDLCPU, 0})); + auto out_data = + reinterpret_cast(out_tensor->ToDLPack()->dl_tensor.data); + std::memcpy(out_data, in_data, in_bytes); + return out_tensor; +} + +// Build the XPU subgraph to the XPU model, store the model data into the +// weight tensor of the graph op, and the model data will be 
loaded again +// by the graph computing kernel when the graph op is executed for inference. +// Due to the lack of XPU APIs for building and outputing the model data, +// the compiled XPU runtime object will be managed by the global variable +// 'DeviceInfo' and the key name for finding the runtime object will be +// stored in the weight tensor of graph op. +// TODO(hong19860320) Compile the XPU subgraph and output the compiled model +// data to the weight tensor of graph op. +bool BuildModel( + std::shared_ptr builder, + std::shared_ptr params, + std::vector>* outputs, + lite::Tensor* model) { + LOG(INFO) << "[XPU] Build Model."; + CHECK(builder != nullptr); + CHECK(outputs != nullptr); + CHECK_GT(outputs->size(), 0); + CHECK(model != nullptr); + + // build graph and fill all of constant params + xtcl::xNetwork network = builder->FinalizeNetwork(*((*outputs)[0])); + auto target = xtcl::Target::Create("llvm"); + auto compiler = xtcl::network::xTensorCompiler(network, target); + compiler.SetParams(*params); // set the data of constant tensors + compiler.Build(); + + // create and register runtime + auto runtime = std::make_shared( + compiler.CreateRuntimeInstance()); + if (runtime == nullptr) { + LOG(WARNING) << "[XPU] Build Model failed!"; + return false; + } + std::string name = UniqueName("xpu"); + LOG(INFO) << "[XPU] Model Name: " << name; + DeviceInfo::Global().Insert(name, runtime); + model->Resize({static_cast(name.length() + 1)}); + memcpy(model->mutable_data(), + reinterpret_cast(name.c_str()), + name.length() + 1); + return true; +} + +} // namespace xpu +} // namespace lite +} // namespace paddle diff --git a/lite/backends/xpu/builder.h b/lite/backends/xpu/builder.h new file mode 100644 index 0000000000000000000000000000000000000000..f0ac2b303aac7fa7f827e6e2f8f0fdf614b604b5 --- /dev/null +++ b/lite/backends/xpu/builder.h @@ -0,0 +1,60 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include +#include +#include +#include +#include "lite/core/op_lite.h" +#include "lite/core/target_wrapper.h" +#include "lite/core/tensor.h" + +namespace paddle { +namespace lite { +namespace xpu { + +bool HasInputArg(const OpInfo* op_info, + const Scope* scope, + const std::string& argname); + +std::string UniqueName(const std::string& prefix); + +xtcl::DataType CvtPrecisionType(PrecisionType in_type); + +DLDataType CvtDataType(PrecisionType in_type); + +xtcl::Array CvtShape(const std::vector& in_shape); + +xtcl::Array CvtShape(const std::vector& in_shape); + +xtcl::Array CvtShape(const DDim& in_dims); + +std::shared_ptr CvtTensor( + Tensor* in_tensor, + std::vector out_shape = {}, + PrecisionType in_ptype = PRECISION(kFloat), + DataLayoutType in_ltype = DATALAYOUT(kNCHW)); + +bool BuildModel( + std::shared_ptr builder, + std::shared_ptr params, + std::vector>* outputs, + lite::Tensor* model); + +} // namespace xpu +} // namespace lite +} // namespace paddle diff --git a/lite/backends/xpu/runtime.cc b/lite/backends/xpu/runtime.cc new file mode 100644 index 0000000000000000000000000000000000000000..a2c34b95758e8abf81c8294507d0ca60aad7c021 --- /dev/null +++ b/lite/backends/xpu/runtime.cc @@ -0,0 +1,46 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/backends/xpu/runtime.h" +#include +#include "lite/utils/cp_logging.h" + +namespace paddle { +namespace lite { +namespace xpu { + +// Extract the model data and recover the XPU model for inference, the function +// is called by the graph computing kernel when the graph op is executed. +// Due to the lack of XPU APIs for loading and recovering the XPU model from +// memory, the key name is obtained from the weight tensor of graph op, to get +// the runtime object for inference from the global variable 'DeviceInfo'. +// TODO(hong19860320) Recover the XPU model from the weight tensor of graph op. +bool LoadModel(const lite::Tensor &model, + std::shared_ptr *runtime) { + LOG(INFO) << "[XPU] Load Model."; + CHECK_GT(model.dims().production(), 0); + std::string name(reinterpret_cast(model.data())); + LOG(INFO) << "[XPU] Model Name: " << name; + CHECK(runtime != nullptr); + *runtime = DeviceInfo::Global().Find(name); + if (*runtime == nullptr) { + LOG(WARNING) << "[XPU] Load Model failed!"; + return false; + } + return true; +} + +} // namespace xpu +} // namespace lite +} // namespace paddle diff --git a/lite/backends/xpu/runtime.h b/lite/backends/xpu/runtime.h new file mode 100644 index 0000000000000000000000000000000000000000..4ff8d75bce6156d51a4988d427058da34460443f --- /dev/null +++ b/lite/backends/xpu/runtime.h @@ -0,0 +1,69 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
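# Sketch (assumption) of the convention used by lite::xpu::BuildModel above: the weight
# tensor of the graph op only carries a null-terminated key, and the actual runtime is
# looked up in the DeviceInfo registry declared in lite/backends/xpu/runtime.h just below.
#
#   #include <string>
#   #include "lite/backends/xpu/runtime.h"
#
#   // Hypothetical check used by a graph kernel before running inference.
#   bool XpuRuntimeExists(const paddle::lite::Tensor& model_weight) {
#     std::string key(
#         reinterpret_cast<const char*>(model_weight.data<int8_t>()));
#     return paddle::lite::xpu::DeviceInfo::Global().Find(key) != nullptr;
#   }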
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include "lite/core/tensor.h" + +namespace paddle { +namespace lite { +namespace xpu { + +class DeviceInfo { + public: + static DeviceInfo& Global() { + static DeviceInfo x; + return x; + } + DeviceInfo() {} + + void Insert(const std::string& name, + std::shared_ptr runtime) { + if (runtimes_.find(name) != runtimes_.end()) { + LOG(WARNING) << "[XPU] Model " << name << " already exists."; + return; + } + runtimes_.emplace(std::make_pair(name, runtime)); + } + + void Clear() { runtimes_.clear(); } + + std::shared_ptr Find( + const std::string& name) const { + if (runtimes_.find(name) != runtimes_.end()) { + return runtimes_.at(name); + } else { + return nullptr; + } + } + + private: + int device_id_{0}; + std::string device_name_{"default"}; + std::unordered_map> + runtimes_; +}; + +bool LoadModel(const lite::Tensor& model, + std::shared_ptr* runtime); + +} // namespace xpu +} // namespace lite +} // namespace paddle diff --git a/lite/core/CMakeLists.txt b/lite/core/CMakeLists.txt index a5b581335047ff18c31ea9d1c03a9785e4ddf2ed..5eecf1d815d30fe0ef10a55c6b6b351795fe63ae 100644 --- a/lite/core/CMakeLists.txt +++ b/lite/core/CMakeLists.txt @@ -35,7 +35,7 @@ lite_cc_library(device_info SRCS device_info.cc DEPS tensor) if (LITE_WITH_ARM) lite_cc_library(context SRCS context.cc DEPS tensor any device_info CL_DEPS cl_context gflags NPU_DEPS npu_runtime) else() -lite_cc_library(context SRCS context.cc DEPS tensor any device_info eigen3 CL_DEPS cl_context gflags) +lite_cc_library(context SRCS context.cc DEPS tensor any device_info eigen3 CL_DEPS cl_context gflags XPU_DEPS xpu_runtime) endif() #-------------------------------------------- GET CODE META INFO ------------------------------------------ diff --git a/lite/core/arena/CMakeLists.txt b/lite/core/arena/CMakeLists.txt index 127e2ea11c159217e6d943d852af5849d85a74b3..bc77afd81e0859b9492b2068ce681098a9393923 100644 --- a/lite/core/arena/CMakeLists.txt +++ b/lite/core/arena/CMakeLists.txt @@ -5,6 +5,6 @@ endif() lite_cc_library(arena_framework SRCS framework.cc DEPS program gtest) -if(NOT LITE_WITH_OPENCL AND (LITE_WITH_X86 OR LITE_WITH_ARM)) +if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_XPU) AND (LITE_WITH_X86 OR LITE_WITH_ARM)) lite_cc_test(test_arena_framework SRCS framework_test.cc DEPS arena_framework ${x86_kernels} ${fpga_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) endif() diff --git a/lite/core/context.h b/lite/core/context.h index 281a9e0d267b43b3c7a50f3172908909b362811a..f798dc3a60705828c3ea1606e76145d91216ae95 100644 --- a/lite/core/context.h +++ b/lite/core/context.h @@ -28,6 +28,9 @@ #ifdef LITE_WITH_NPU #include "lite/backends/npu/runtime.h" #endif +#ifdef LITE_WITH_XPU +#include "lite/backends/xpu/runtime.h" +#endif #include #include @@ -55,6 +58,7 @@ using X86Context = Context; using CUDAContext = Context; using ARMContext = Context; using NPUContext = Context; +using XPUContext = Context; using OpenCLContext = Context; using FPGAContext = Context; @@ -84,6 +88,20 @@ class Context { }; #endif +#ifdef LITE_WITH_XPU +template <> +class Context { 
+ public: + Context() {} + explicit Context(const NPUContext& ctx); + // NOTE: InitOnce should only be used by ContextScheduler + void InitOnce() {} + void CopySharedTo(XPUContext* ctx) {} + + std::string name() const { return "XPUContext"; } +}; +#endif + #ifdef LITE_WITH_ARM template <> class Context { @@ -340,6 +358,12 @@ class ContextScheduler { &ctx->As()); break; #endif +#ifdef LITE_WITH_XPU + case TARGET(kXPU): + kernel_contexts_[TargetType::kXPU].As().CopySharedTo( + &ctx->As()); + break; +#endif #ifdef LITE_WITH_OPENCL case TARGET(kOpenCL): kernel_contexts_[TargetType::kOpenCL].As().CopySharedTo( @@ -386,6 +410,9 @@ class ContextScheduler { #endif #ifdef LITE_WITH_NPU InitContext(); +#endif +#ifdef LITE_WITH_XPU + InitContext(); #endif } diff --git a/lite/core/mir/pass_utils.cc b/lite/core/mir/pass_utils.cc index 804d4e1b5bc94f0e7804fa588e107a298210143b..cfa43f8d6e9dc4585a4618a003cb8e0bd9709642 100644 --- a/lite/core/mir/pass_utils.cc +++ b/lite/core/mir/pass_utils.cc @@ -53,6 +53,7 @@ void ExpandPlaces(std::set* places, const Place& place) { TARGET(kARM), TARGET(kOpenCL), TARGET(kNPU), + TARGET(kXPU), TARGET(kFPGA)}); static const Types precision_set( {PRECISION(kFloat), PRECISION(kInt8), PRECISION(kFP16), PRECISION(kAny)}); diff --git a/lite/core/mir/subgraph/CMakeLists.txt b/lite/core/mir/subgraph/CMakeLists.txt index 76588b7027764a6afd54c33158a37589525ba8c0..95b5fe5ae13e03940bda8d83fcfc252b4ca490ab 100644 --- a/lite/core/mir/subgraph/CMakeLists.txt +++ b/lite/core/mir/subgraph/CMakeLists.txt @@ -16,7 +16,7 @@ set(subgraph_passes subgraph_pass) if(LITE_WITH_NPU) lite_cc_library(npu_pass SRCS generate_npu_program_pass.cc - DEPS mir_pass types context ${mir_fusers} ${npu_bridges} ${npu_ddk_libs} graph_op subgraph_pass) + DEPS mir_pass types context ${mir_fusers} ${npu_bridges} graph_op subgraph_pass) list(APPEND subgraph_passes npu_pass) lite_cc_test(test_npu_pass SRCS generate_npu_program_pass_test.cc DEPS npu_pass mir_passes paddle_api_full paddle_api_light gflags @@ -30,5 +30,21 @@ if(LITE_WITH_NPU) endif() endif() +if(LITE_WITH_XPU) + lite_cc_library(xpu_pass SRCS generate_xpu_program_pass.cc + DEPS mir_pass types context ${mir_fusers} ${xpu_bridges} ${xpu_builder_libs} graph_op subgraph_pass) + list(APPEND subgraph_passes xpu_pass) + lite_cc_test(test_xpu_pass SRCS generate_xpu_program_pass_test.cc + DEPS xpu_pass mir_passes paddle_api_full gflags + ARGS --model_dir=${LITE_MODEL_DIR}/mobilenet_v1 + --optimized_model=${LITE_MODEL_DIR}/lite_npu_model_opt SERIAL) + if (WITH_TESTING) + add_dependencies(test_xpu_pass extern_lite_download_mobilenet_v1_tar_gz) + add_dependencies(test_subgraph_pass extern_lite_download_mobilenet_v2_relu_tar_gz) + set(LINK_FLAGS "-Wl,--version-script ${PADDLE_SOURCE_DIR}/lite/core/lite.map") + set_target_properties(test_xpu_pass PROPERTIES LINK_FLAGS "${LINK_FLAGS}") + endif() +endif() + set(subgraph_passes ${subgraph_passes} CACHE INTERNAL "subgraph_passes") message(STATUS "----> subgraph_passes: ${subgraph_passes}") diff --git a/lite/core/mir/subgraph/generate_npu_program_pass.cc b/lite/core/mir/subgraph/generate_npu_program_pass.cc index c47ab60d634672c9092cec83d4a7bfc74cf1a747..c5465a5edaa28d3cc2cfb4a7ffe0cca2e3c1bc79 100644 --- a/lite/core/mir/subgraph/generate_npu_program_pass.cc +++ b/lite/core/mir/subgraph/generate_npu_program_pass.cc @@ -22,14 +22,9 @@ #include "lite/core/mir/pass_registry.h" #include "lite/core/mir/pattern_matcher.h" -#include "ai_ddk_lib/include/HiAiModelManagerService.h" -#include "ai_ddk_lib/include/graph/graph.h" 
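# Minimal sketch (assuming the existing ContextScheduler::NewContext API) of how a kernel
# obtains the new XPUContext added above.
#
#   #include "lite/core/context.h"
#
#   // Hypothetical: create a kernel context bound to the XPU target.
#   void MakeXpuKernelContext() {
#     auto ctx =
#         paddle::lite::ContextScheduler::Global().NewContext(TARGET(kXPU));
#     auto& xpu_ctx = ctx->As<paddle::lite::XPUContext>();
#     (void)xpu_ctx;  // xpu_ctx.name() returns "XPUContext"
#   }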
-#include "ai_ddk_lib/include/graph/model.h" -#include "ai_ddk_lib/include/graph/op/all_ops.h" // for ge::op::Data -#include "ai_ddk_lib/include/graph/operator_reg.h" +#include "lite/backends/npu/builder.h" #include "lite/kernels/npu/bridges/paddle_use_npu_bridges.h" #include "lite/kernels/npu/bridges/registry.h" -#include "lite/kernels/npu/bridges/utils.h" namespace paddle { namespace lite { @@ -51,7 +46,7 @@ std::shared_ptr GenerateNPUProgramPass::CvtVarNode( auto wgt = std::make_shared(arg.name); LOG(INFO) << "in convert const:" << arg.name; VLOG(4) << dims; - wgt->set_attr_value(lite::kernels::npu::bridges::CvtFromLiteTensor(tensor)); + wgt->set_attr_value(lite::npu::CvtFromLiteTensor(tensor)); return wgt; } else { CHECK_EQ(dims.size(), 4); @@ -132,7 +127,7 @@ std::string GenerateNPUProgramPass::BuildNPUGraph( // Compiling IR graph to NPU model and store mode data into weight tensor with // persistable=true, Sothat the model parser can recognize it and save it to // param files - if (!lite::kernels::npu::bridges::BuildModel(inputs, outputs, weight)) { + if (!lite::npu::BuildModel(inputs, outputs, weight)) { LOG(WARNING) << "Build NPU failed subgraph " << sub_id; throw std::runtime_error("Build NPU failed subgraph."); } diff --git a/lite/core/mir/subgraph/generate_npu_program_pass.h b/lite/core/mir/subgraph/generate_npu_program_pass.h index be6b1aa24c8bf6ccab9bbdac198814350195b1b1..823ca5f1f624a9e920a5f395a9d5098c5ea52929 100644 --- a/lite/core/mir/subgraph/generate_npu_program_pass.h +++ b/lite/core/mir/subgraph/generate_npu_program_pass.h @@ -20,10 +20,10 @@ #include #include #include +#include "lite/backends/npu/builder.h" #include "lite/core/mir/pass.h" #include "lite/core/mir/subgraph/subgraph_program_pass.h" #include "lite/kernels/npu/bridges/registry.h" -#include "lite/kernels/npu/bridges/utils.h" namespace paddle { namespace lite { diff --git a/lite/core/mir/subgraph/generate_npu_program_pass_test.cc b/lite/core/mir/subgraph/generate_npu_program_pass_test.cc index 88095df502fe05a51b548dde7ce09700855ffae3..95339d6175c98f22d542db24f02d6d714ccbe2a8 100644 --- a/lite/core/mir/subgraph/generate_npu_program_pass_test.cc +++ b/lite/core/mir/subgraph/generate_npu_program_pass_test.cc @@ -93,11 +93,13 @@ void CompareOutputTensor( auto ref_output_tensor_size = ShapeProduction(ref_output_tensor->shape()); EXPECT_EQ(tar_output_tensor_size, ref_output_tensor_size); for (size_t j = 0; j < ref_output_tensor_size; j++) { - auto diff = - std::fabs(tar_output_tensor_data[j] - ref_output_tensor_data[j]) / - (std::fabs(ref_output_tensor_data[j]) + 1e-6); - VLOG(3) << diff; - EXPECT_LT(diff, 0.1); + auto abs_diff = + std::fabs(tar_output_tensor_data[j] - ref_output_tensor_data[j]); + auto rel_diff = abs_diff / (std::fabs(ref_output_tensor_data[j]) + 1e-6); + VLOG(3) << "val: " << tar_output_tensor_data[j] + << " ref: " << ref_output_tensor_data[j] + << " abs_diff: " << abs_diff << " rel_diff: " << rel_diff; + EXPECT_LT(rel_diff, 0.1); } } } diff --git a/lite/core/mir/subgraph/generate_xpu_program_pass.cc b/lite/core/mir/subgraph/generate_xpu_program_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..319e1e51feb917b803753807ddbb1f72c2cb7084 --- /dev/null +++ b/lite/core/mir/subgraph/generate_xpu_program_pass.cc @@ -0,0 +1,206 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/core/mir/subgraph/generate_xpu_program_pass.h" +#include +#include +#include +#include +#include +#include "lite/core/mir/graph_visualize_pass.h" +#include "lite/core/mir/pass_registry.h" +#include "lite/core/mir/pattern_matcher.h" + +#include "lite/backends/xpu/builder.h" +#include "lite/kernels/xpu/bridges/paddle_use_xpu_bridges.h" +#include "lite/kernels/xpu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace mir { +namespace subgraph { + +std::shared_ptr GenerateXPUProgramPass::CvtVarNode( + lite::kernels::xpu::bridges::graph_ctx_type* graph_ctx, + lite::mir::Node* var_node, + const Scope* scope) { + CHECK(var_node->IsArg()); + const auto& arg = var_node->AsArg(); + auto var_name = arg.name; + VLOG(4) << "[XPU] Convert var node " << var_name; + + auto* var = scope->FindVar(var_name); + CHECK(var); + auto* tensor = var->GetMutable(); + CHECK(tensor); + auto dims = tensor->dims(); + auto cvted_var_node = + std::make_shared(graph_ctx->builder->CreateTensor( + var_name, lite::xpu::CvtShape(dims), ::xtcl::Float(32))); + if (arg.is_weight) { + auto cvted_var_tensor = lite::xpu::CvtTensor(tensor); + graph_ctx->params->emplace(std::make_pair(var_name, *cvted_var_tensor)); + } + return cvted_var_node; +} + +void GenerateXPUProgramPass::CvtAllOpNodes( + const std::vector& op_nodes, + lite::kernels::xpu::bridges::graph_ctx_type* graph_ctx, + lite::kernels::xpu::bridges::node_map_type* cvted_var_nodes) { + const auto& bridges = lite::kernels::xpu::bridges::Factory::Instance(); + const auto& supported_lists = bridges.AllFunctions(); + // return record all converted vars + // op node's inputs must be found in converted_vars + for (auto& node : op_nodes) { + lite::kernels::xpu::bridges::node_map_type input_nodes; + auto& stmt = node->AsStmt(); + for (auto& var_node : node->inlinks) { + auto& arg = var_node->AsArg(); + // weight should be handled in the converter, so skip here + if (arg.is_weight) { + continue; + } + auto var_name = arg.name; + if (!cvted_var_nodes->count(var_name)) { + cvted_var_nodes->insert(std::make_pair( + var_name, CvtVarNode(graph_ctx, var_node, stmt.op()->scope()))); + } + input_nodes.insert(*cvted_var_nodes->find(var_name)); + } + auto output_nodes = + supported_lists.at(stmt.op_type())(stmt.op(), graph_ctx, input_nodes); + cvted_var_nodes->insert(output_nodes.begin(), output_nodes.end()); + } +} + +std::string GenerateXPUProgramPass::BuildXPUGraph( + const std::unordered_set& op_nodes, + const std::unordered_set& in_data_vars, + const std::unordered_set& out_data_vars, + int sub_id) { + auto ordered_op_nodes = GetTopologicalOrder(op_nodes); + lite::kernels::xpu::bridges::graph_ctx_type graph_ctx; + graph_ctx.builder = std::make_shared(); + graph_ctx.params = + std::make_shared(); + lite::kernels::xpu::bridges::node_map_type cvted_var_nodes; + CvtAllOpNodes(ordered_op_nodes, &graph_ctx, &cvted_var_nodes); + + std::string weight_var_name = "graph" + std::to_string(sub_id) + "_weights"; + auto any_op = (*op_nodes.begin())->AsStmt().op(); + auto weight = any_op->scope()->Var(weight_var_name)->GetMutable(); + 
weight->set_persistable(true); + weight->set_precision(PRECISION(kInt8)); + // Compiling graph to XPU model and store mode data into weight tensor with + // persistable=true, Sothat the model parser can recognize it and save it to + // param files + std::vector> ordered_cvted_var_nodes; + for (auto out_data_var : out_data_vars) { + auto var_name = out_data_var->AsArg().name; + ordered_cvted_var_nodes.push_back(cvted_var_nodes[var_name]); + } + if (!lite::xpu::BuildModel(graph_ctx.builder, + graph_ctx.params, + &ordered_cvted_var_nodes, + weight)) { + LOG(WARNING) << "[XPU] Build XPU graph failed (subgraph=" << sub_id << ")"; + throw std::runtime_error("[XPU] Build XPU graph failed."); + } + LOG(INFO) << "[XPU] Build XPU graph success (subgraph=" << sub_id << ")"; + return weight_var_name; +} + +void GenerateXPUProgramPass::GenXPUSubgraph( + const std::unique_ptr& graph, + const std::unordered_set& op_nodes, + int sub_id) { + std::unordered_set in_data_vars; + std::unordered_set in_wgt_vars; + std::unordered_set out_data_vars; + std::unordered_set out_unused_vars; + FindInputOutputVars( + op_nodes, &in_data_vars, &in_wgt_vars, &out_data_vars, &out_unused_vars); + + auto weight_var_name = + BuildXPUGraph(op_nodes, in_data_vars, out_data_vars, sub_id); + + auto any_op = (*op_nodes.begin())->AsStmt().op(); + InsertNewNode(graph, + weight_var_name, + any_op->scope(), + any_op->valid_places(), + in_data_vars, + in_wgt_vars, + out_data_vars, + out_unused_vars); + + auto nodes2rm = GetNode2rm( + op_nodes, {in_data_vars, in_wgt_vars, out_data_vars, out_unused_vars}); + + GraphSafeRemoveNodes(graph.get(), nodes2rm); +} + +void GenerateXPUProgramPass::Apply(const std::unique_ptr& graph) { + LOG(INFO) << "[XPU] Before XPU Pass \n" << Visualize(graph.get()); + const auto& bridges = lite::kernels::xpu::bridges::Factory::Instance(); + const auto& op_map = bridges.AllFunctions(); + std::vector supported_op_types; + for (auto& i : op_map) { + LOG(INFO) << "[XPU] Supported type: " << i.first; + supported_op_types.push_back(i.first); + } + + try { + int num_subgraph = FuseSubgraph(graph, supported_op_types); + InferOnce(graph); + auto op_nodes_all = ClassifySubgraph(graph); + CHECK_EQ(op_nodes_all.size(), num_subgraph); + int id = 1; + for (auto& op_nodes : op_nodes_all) { + LOG(INFO) << "[XPU] Converting Subgraph " << id; + GenXPUSubgraph(graph, op_nodes.second, id); + LOG(INFO) << "[XPU] After XPU Pass Subgraph " << id << "\n" + << Visualize(graph.get()); + id++; + } + } catch (...) 
{ + LOG(WARNING) << "[XPU] Build XPU graph failed."; + throw std::runtime_error("[XPU] Build XPU graph failed."); + } + + for (auto& item : graph->StmtTopologicalOrder()) { + if (item->IsStmt()) { + auto& stmt = item->AsStmt(); + LOG(INFO) << stmt; + insts_.emplace_back(stmt.op(), std::move(stmt.kernels().front())); + } + } +} + +std::unique_ptr GenerateXPUProgramPass::GenProgram() { + LOG(INFO) << "[XPU] program insts.size=" << insts_.size(); + std::unique_ptr program( + new RuntimeProgram(std::move(insts_))); + return program; +} + +} // namespace subgraph +} // namespace mir +} // namespace lite +} // namespace paddle + +REGISTER_MIR_PASS(generate_xpu_program_pass, + paddle::lite::mir::subgraph::GenerateXPUProgramPass) + .BindTargets({TARGET(kXPU)}); diff --git a/lite/core/mir/subgraph/generate_xpu_program_pass.h b/lite/core/mir/subgraph/generate_xpu_program_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..cf121ae9503201e8cf6be40fe9054ccaf6e4b172 --- /dev/null +++ b/lite/core/mir/subgraph/generate_xpu_program_pass.h @@ -0,0 +1,69 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include +#include "lite/backends/xpu/builder.h" +#include "lite/core/mir/pass.h" +#include "lite/core/mir/subgraph/subgraph_program_pass.h" +#include "lite/kernels/xpu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace mir { +namespace subgraph { + +class GenerateXPUProgramPass : public SubgraphProgramPass { + public: + using key2nodes_t = std::map; + + void Apply(const std::unique_ptr& graph) override; + std::unique_ptr GenProgram(); + + protected: + // nodes2cvt: op nodes to convert + // return cvted_vars: converted var nodes + void CvtAllOpNodes( + const std::vector& op_nodes, + lite::kernels::xpu::bridges::graph_ctx_type* graph_ctx, + lite::kernels::xpu::bridges::node_map_type* cvted_var_nodes); + + std::shared_ptr CvtVarNode( + lite::kernels::xpu::bridges::graph_ctx_type* graph_ctx, + lite::mir::Node* var_node, + const Scope* scope); + + std::string BuildXPUGraph(const std::unordered_set& op_nodes, + const std::unordered_set& in_data_vars, + const std::unordered_set& out_data_vars, + int sub_id); + + void GenXPUSubgraph(const std::unique_ptr& graph, + const std::unordered_set& op_nodes, + int sub_id); + + private: + std::vector insts_; +}; + +} // namespace subgraph +} // namespace mir +} // namespace lite +} // namespace paddle diff --git a/lite/core/mir/subgraph/generate_xpu_program_pass_test.cc b/lite/core/mir/subgraph/generate_xpu_program_pass_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..728ecbc6b77666accd432b1ad82a03860588ab40 --- /dev/null +++ b/lite/core/mir/subgraph/generate_xpu_program_pass_test.cc @@ -0,0 +1,172 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include "lite/api/paddle_api.h" +#include "lite/api/paddle_use_kernels.h" +#include "lite/api/paddle_use_ops.h" +#include "lite/api/paddle_use_passes.h" +#include "lite/api/test_helper.h" +#include "lite/utils/cp_logging.h" + +DEFINE_string(model_file, "", "model file path of combined protobuf model"); +DEFINE_string(params_file, "", "params file path of combined protobuf model"); +DEFINE_string(optimized_model_dir, "", "path of optimized naive buffer model"); +DEFINE_string(input_tensor_shape, "1,3,224,224", "shapes of input tensors"); +DEFINE_int32(output_tensor_num, 1, "number of output tensors"); + +namespace paddle { +namespace lite { + +std::vector> ParseShape(std::string txt) { + std::vector> shape; + while (!txt.empty()) { + size_t idx = txt.find_first_of(":"); + std::string dims = txt.substr(0, idx); + std::vector s; + while (!dims.empty()) { + size_t idx = dims.find_first_of(","); + int d = atoi(dims.substr(0, idx).c_str()); + VLOG(3) << d; + s.push_back(d); + if (idx == std::string::npos) { + break; + } else { + dims = dims.substr(idx + 1); + } + } + shape.push_back(s); + if (idx == std::string::npos) { + break; + } else { + txt = txt.substr(idx + 1); + } + } + return shape; +} + +int64_t ShapeProduction(std::vector shape) { + int64_t s = 1; + for (int64_t dim : shape) { + s *= dim; + } + return s; +} + +void FillInputTensor( + const std::shared_ptr& predictor, + const std::vector>& input_tensor_shape, + const float value) { + for (int i = 0; i < input_tensor_shape.size(); i++) { + auto input_tensor = predictor->GetInput(i); + input_tensor->Resize(input_tensor_shape[i]); + auto input_tensor_data = input_tensor->mutable_data(); + auto input_tensor_size = ShapeProduction(input_tensor->shape()); + for (int j = 0; j < input_tensor_size; j++) { + input_tensor_data[j] = value; + } + } +} + +void CompareOutputTensor( + const std::shared_ptr& tar_predictor, + const std::shared_ptr& ref_predictor, + const int output_tensor_num) { + for (int i = 0; i < output_tensor_num; i++) { + auto tar_output_tensor = tar_predictor->GetOutput(i); + auto ref_output_tensor = ref_predictor->GetOutput(i); + auto tar_output_tensor_data = tar_output_tensor->data(); + auto ref_output_tensor_data = ref_output_tensor->data(); + auto tar_output_tensor_size = ShapeProduction(tar_output_tensor->shape()); + auto ref_output_tensor_size = ShapeProduction(ref_output_tensor->shape()); + EXPECT_EQ(tar_output_tensor_size, ref_output_tensor_size); + for (size_t j = 0; j < ref_output_tensor_size; j++) { + auto diff = + std::fabs(tar_output_tensor_data[j] - ref_output_tensor_data[j]) / + (std::fabs(ref_output_tensor_data[j]) + 1e-6); + VLOG(3) << diff; + EXPECT_LT(diff, 0.1); + } + } +} + +std::shared_ptr TestModel( + const std::string& model_dir, + const std::string& model_file, + const std::string& params_file, + const std::vector& valid_places, + const std::vector>& input_tensor_shape, + const std::string& optimized_model_dir) { + // generate 
optimized model + lite_api::CxxConfig cxx_config; + cxx_config.set_model_dir(model_dir); + cxx_config.set_model_file(model_file); + cxx_config.set_param_file(params_file); + cxx_config.set_valid_places(valid_places); + auto predictor = lite_api::CreatePaddlePredictor(cxx_config); + FillInputTensor(predictor, input_tensor_shape, -1); + predictor->SaveOptimizedModel(optimized_model_dir, + lite_api::LiteModelType::kNaiveBuffer); +#if 0 // TODO(hong19860320) supports light api for XPU + // load optimized model + lite_api::MobileConfig mobile_config; + mobile_config.set_model_dir(optimized_model_dir); + mobile_config.set_power_mode(lite_api::PowerMode::LITE_POWER_HIGH); + mobile_config.set_threads(1); + predictor = lite_api::CreatePaddlePredictor(mobile_config); + FillInputTensor(predictor, input_tensor_shape, 1); +#endif + // run optimized model + for (int i = 0; i < FLAGS_warmup; i++) { + predictor->Run(); + } + for (int i = 0; i < FLAGS_repeats; i++) { + auto start = GetCurrentUS(); + predictor->Run(); + LOG(INFO) << i << ", " << GetCurrentUS() - start << "us"; + } + return predictor; +} + +TEST(XPUSubgraph, compare) { + // parsing input tensor shape, supported formats: "1,3,224,224" + // "1,3,224,224:1,80" + std::vector> input_tensor_shape = + ParseShape(FLAGS_input_tensor_shape); + // generate and run optimized CPU model + LOG(INFO) << " ================ CPU ================== "; + auto cpu_predictor = + TestModel(FLAGS_model_dir, + FLAGS_model_file, + FLAGS_params_file, + {lite_api::Place{TARGET(kX86), PRECISION(kFloat)}}, + input_tensor_shape, + FLAGS_optimized_model_dir + "/CPU"); + // generate and run optimized XPU model + LOG(INFO) << " ================ XPU ================== "; + auto xpu_predictor = + TestModel(FLAGS_model_dir, + FLAGS_model_file, + FLAGS_params_file, + {lite_api::Place{TARGET(kXPU), PRECISION(kFloat)}, + lite_api::Place{TARGET(kX86), PRECISION(kFloat)}}, + input_tensor_shape, + FLAGS_optimized_model_dir + "/XPU"); + // verify results + CompareOutputTensor(xpu_predictor, cpu_predictor, FLAGS_output_tensor_num); +} + +} // namespace lite +} // namespace paddle diff --git a/lite/core/mir/subgraph/subgraph_program_pass.cc b/lite/core/mir/subgraph/subgraph_program_pass.cc index 31c28ad89cd419090fd14bfc367a9ef5eeaf9b15..0cb2261a3fca7aa47119b18900d38ecfd8229299 100644 --- a/lite/core/mir/subgraph/subgraph_program_pass.cc +++ b/lite/core/mir/subgraph/subgraph_program_pass.cc @@ -207,8 +207,26 @@ void SubgraphProgramPass::InferOnce(const std::unique_ptr& graph) { if (!item->IsStmt()) continue; auto& stmt = item->AsStmt(); auto& op = stmt.op(); + auto scope = op->scope(); std::string op_type = op->op_info()->Type(); - if (op_type == "feed" || op_type == "fetch") continue; + // check the dimension of input variables in the scope, must not be empty ! 
+ if (op_type == "feed") { + auto input_var_names = op->op_info()->output_names(); + CHECK_GE(input_var_names.size(), 1); + for (auto input_var_name : input_var_names) { + auto input_var = scope->FindVar(input_var_name); + CHECK(input_var) << "No input variable '" << input_var_name + << "' found in scope " << scope; + auto input = input_var->GetMutable(); + CHECK(!input->dims().empty()) << "The dimension of input variable '" + << input_var_name + << "' cannot be empty."; + } + continue; + } + if (op_type == "fetch") { + continue; + } op->CheckShape(); op->InferShape(); // TODO(xxx): remove Launch() eventually diff --git a/lite/core/mir/subgraph/subgraph_program_pass_test.cc b/lite/core/mir/subgraph/subgraph_program_pass_test.cc index 625c9ac92435296ddb9a9ad2b116aef7fe6ea3f8..22e20b81d831ff25df090a7565e671b9139122f7 100644 --- a/lite/core/mir/subgraph/subgraph_program_pass_test.cc +++ b/lite/core/mir/subgraph/subgraph_program_pass_test.cc @@ -46,6 +46,9 @@ TEST(SubgraphTest, models) { #endif #ifdef LITE_WITH_NPU Place{TARGET(kNPU), PRECISION(kFloat)}, +#endif +#ifdef LITE_WITH_XPU + Place{TARGET(kXPU), PRECISION(kFloat)}, #endif }); lite::Program program(program_desc, scope, valid_places); diff --git a/lite/core/op_registry.cc b/lite/core/op_registry.cc index 0fdce27e3b5381cb455a346800a47e2a42e9f4ba..ad974a781c7c899428015907a4166d8d0c351c76 100644 --- a/lite/core/op_registry.cc +++ b/lite/core/op_registry.cc @@ -78,6 +78,9 @@ std::list> KernelRegistry::Create( case TARGET(kNPU): { CREATE_KERNEL(kNPU); } break; + case TARGET(kXPU): { + CREATE_KERNEL(kXPU); + } break; case TARGET(kFPGA): { CREATE_KERNEL(kFPGA); } break; @@ -142,6 +145,11 @@ KernelRegistry::KernelRegistry() INIT_FOR(kNPU, kAny, kNCHW); INIT_FOR(kNPU, kAny, kAny); + INIT_FOR(kXPU, kFloat, kNCHW); + INIT_FOR(kXPU, kInt8, kNCHW); + INIT_FOR(kXPU, kAny, kNCHW); + INIT_FOR(kXPU, kAny, kAny); + INIT_FOR(kFPGA, kFP16, kNHWC); INIT_FOR(kFPGA, kFP16, kAny); INIT_FOR(kFPGA, kFloat, kNHWC); diff --git a/lite/core/op_registry.h b/lite/core/op_registry.h index 25375b8a8f795e58194d6223f617273beac3b78e..1c67ee8f3dcafe30d9bda587d62233d0e715071e 100644 --- a/lite/core/op_registry.h +++ b/lite/core/op_registry.h @@ -178,6 +178,16 @@ class KernelRegistry final { PRECISION(kInt8), DATALAYOUT(kNCHW)> *, // + KernelRegistryForTarget *, // + KernelRegistryForTarget *, // + KernelRegistryForTarget *, // + KernelRegistryForTarget *, // diff --git a/lite/core/optimizer.h b/lite/core/optimizer.h index 93d8a148c909c1d4682664eca2fe7dc172f4f280..739615e2763f509f2dec97f5ab3e536aca7acc4f 100644 --- a/lite/core/optimizer.h +++ b/lite/core/optimizer.h @@ -28,6 +28,9 @@ #ifdef LITE_WITH_NPU #include "lite/core/mir/subgraph/generate_npu_program_pass.h" #endif +#ifdef LITE_WITH_XPU +#include "lite/core/mir/subgraph/generate_xpu_program_pass.h" +#endif namespace paddle { namespace lite { @@ -106,7 +109,8 @@ class Optimizer { "runtime_context_assign_pass", "argument_type_display_pass", // -#if !defined(LITE_WITH_OPENCL) && !defined(LITE_WITH_NPU) +#if !defined(LITE_WITH_OPENCL) && !defined(LITE_WITH_NPU) && \ + !defined(LITE_WITH_XPU) // TODO(ysh329): cause CL_INVALID_MEM_OBJECT when setArg in kernel "memory_optimize_pass", #endif @@ -121,14 +125,27 @@ class Optimizer { // Generate a new program based on the mir graph. 
std::unique_ptr GenRuntimeProgram() { +#if defined(LITE_WITH_NPU) || defined(LITE_WITH_XPU) + auto target_place = Place{ #ifdef LITE_WITH_NPU - if (std::find(valid_places_.begin(), - valid_places_.end(), - Place{TARGET(kNPU), PRECISION(kFloat)}) != + TARGET(kNPU), +#endif +#ifdef LITE_WITH_XPU + TARGET(kXPU), +#endif + PRECISION(kFloat)}; + if (std::find(valid_places_.begin(), valid_places_.end(), target_place) != valid_places_.end()) { +#ifdef LITE_WITH_NPU auto pass = mir::PassManager::Global() .LookUp( "generate_npu_program_pass"); +#endif +#ifdef LITE_WITH_XPU + auto pass = mir::PassManager::Global() + .LookUp( + "generate_xpu_program_pass"); +#endif try { pass->Apply(graph_); auto program = pass->GenProgram(); @@ -136,7 +153,8 @@ class Optimizer { program->set_exec_scope(exec_scope_); return program; } catch (...) { - LOG(WARNING) << "Build NPU graph failed"; + LOG(WARNING) << "Build " << TargetToStr(target_place.target) + << " program failed!"; } } #endif diff --git a/lite/gen_code/CMakeLists.txt b/lite/gen_code/CMakeLists.txt index d83657ad3e24eb5661225a4a0684c141e40a6163..40c95415546d99a66abf2d6f3595ae8695c4df86 100644 --- a/lite/gen_code/CMakeLists.txt +++ b/lite/gen_code/CMakeLists.txt @@ -15,6 +15,7 @@ lite_cc_test(test_gen_code SRCS gen_code_test.cc X86_DEPS ${x86_kernels} ARM_DEPS ${arm_kernels} NPU_DEPS ${npu_kernels} + XPU_DEPS ${xpu_kernels} CL_DEPS ${opencl_kernels} FPGA_DEPS ${fpga_kernels} EXCLUDE_COMPILE_DEPS "ON" @@ -42,6 +43,7 @@ lite_cc_test(test_generated_code SRCS generated_code_test.cc DEPS __generated_co X86_DEPS ${x86_kernels} ARM_DEPS ${arm_kernels} NPU_DEPS ${npu_kernels} + XPU_DEPS ${xpu_kernels} CL_DEPS ${opencl_kernels} FPGA_DEPS ${fpga_kernels} EXCLUDE_COMPILE_DEPS "ON" diff --git a/lite/kernels/CMakeLists.txt b/lite/kernels/CMakeLists.txt index 1996f50133acc6f3bdf651e8c0daae5b68c96832..0bfd39ae9a0bdf6e8af606711fd4dcc6011994b5 100644 --- a/lite/kernels/CMakeLists.txt +++ b/lite/kernels/CMakeLists.txt @@ -9,3 +9,4 @@ add_subdirectory(x86) add_subdirectory(opencl) add_subdirectory(fpga) add_subdirectory(npu) +add_subdirectory(xpu) diff --git a/lite/kernels/npu/bridges/CMakeLists.txt b/lite/kernels/npu/bridges/CMakeLists.txt index 77d0097c6955c43c12bdaac8ce0410b24c5cf526..032de819743f4aba02e442dd71c26b950d1435b6 100644 --- a/lite/kernels/npu/bridges/CMakeLists.txt +++ b/lite/kernels/npu/bridges/CMakeLists.txt @@ -1,7 +1,6 @@ -lite_cc_library(npu_bridge_registry SRCS registry.cc DEPS ${npu_ddk_libs}) -lite_cc_library(npu_bridge_utils SRCS utils.cc DEPS ${npu_ddk_libs} npu_runtime tensor op scope) +lite_cc_library(npu_bridge_registry SRCS registry.cc) -set(npu_bridge_deps npu_bridge_registry npu_bridge_utils op) +set(npu_bridge_deps npu_bridge_registry npu_builder op) lite_cc_library(npu_bridge_fc_op SRCS fc_op.cc DEPS ${npu_bridge_deps}) lite_cc_library(npu_bridge_conv_op SRCS conv_op.cc DEPS ${npu_bridge_deps}) @@ -23,7 +22,6 @@ lite_cc_library(npu_bridge_pad2d_op SRCS pad2d_op.cc DEPS ${npu_bridge_deps}) set(npu_bridges npu_bridge_registry - npu_bridge_utils npu_bridge_fc_op npu_bridge_conv_op npu_bridge_mul_op @@ -43,7 +41,7 @@ set(npu_bridges npu_bridge_pad2d_op CACHE INTERNAL "npu_bridges") -set(npu_bridge_test_deps ${npu_ddk_libs} ${npu_bridges} ${npu_kernels} ${ops}) +set(npu_bridge_test_deps ${npu_bridges} ${npu_kernels} ${ops}) lite_cc_test(test_npu_bridge_fc_op SRCS fc_op_test.cc test_helper.cc DEPS ${npu_bridge_test_deps}) lite_cc_test(test_npu_bridge_conv_op SRCS conv_op_test.cc test_helper.cc DEPS ${npu_bridge_test_deps}) diff --git 
a/lite/kernels/npu/bridges/act_op.cc b/lite/kernels/npu/bridges/act_op.cc index 1e8500ef28eed25cd8514846b98e7ebfacb946a2..2b3a415ad72d5629d343678f65e2e0040fafda14 100644 --- a/lite/kernels/npu/bridges/act_op.cc +++ b/lite/kernels/npu/bridges/act_op.cc @@ -12,14 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "ai_ddk_lib/include/graph/buffer.h" -#include "ai_ddk_lib/include/graph/graph.h" -#include "ai_ddk_lib/include/graph/model.h" -#include "ai_ddk_lib/include/graph/op/all_ops.h" -#include "ai_ddk_lib/include/graph/operator.h" -#include "ai_ddk_lib/include/graph/operator_reg.h" +#include "lite/backends/npu/builder.h" #include "lite/kernels/npu/bridges/registry.h" -#include "lite/kernels/npu/bridges/utils.h" namespace paddle { namespace lite { @@ -32,7 +26,7 @@ node_map_type ActConverter(const std::shared_ptr act_op, auto scope = act_op->scope(); auto op_info = act_op->op_info(); auto op_type = op_info->Type(); - auto unique_op_type = UniqueName(op_type); + auto unique_op_type = lite::npu::UniqueName(op_type); LOG(INFO) << "Converting " + op_type + "..."; // create act node and set input node from inputs_map @@ -40,8 +34,8 @@ node_map_type ActConverter(const std::shared_ptr act_op, auto act_node = std::make_shared(unique_op_type); CHECK(inputs_map.count(x_var_name)); act_node->set_input_x(*inputs_map.at(x_var_name)); - OpList::Global().add(inputs_map.at(x_var_name)); - OpList::Global().add(act_node); + lite::npu::OpList::Global().add(inputs_map.at(x_var_name)); + lite::npu::OpList::Global().add(act_node); // parse and set activation type int act_mode = 1; diff --git a/lite/kernels/npu/bridges/batch_norm_op.cc b/lite/kernels/npu/bridges/batch_norm_op.cc index 9f3a506d764eb9a635f46e30715b00e17b62d572..5b3cbd52133b61f0c0e37e2ba9bf2f6775f7a2b4 100644 --- a/lite/kernels/npu/bridges/batch_norm_op.cc +++ b/lite/kernels/npu/bridges/batch_norm_op.cc @@ -12,14 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "ai_ddk_lib/include/graph/buffer.h" -#include "ai_ddk_lib/include/graph/graph.h" -#include "ai_ddk_lib/include/graph/model.h" -#include "ai_ddk_lib/include/graph/op/all_ops.h" -#include "ai_ddk_lib/include/graph/operator.h" -#include "ai_ddk_lib/include/graph/operator_reg.h" +#include "lite/backends/npu/builder.h" #include "lite/kernels/npu/bridges/registry.h" -#include "lite/kernels/npu/bridges/utils.h" namespace paddle { namespace lite { @@ -33,7 +27,7 @@ node_map_type BatchNormConverter( auto scope = batch_norm_op->scope(); auto op_info = batch_norm_op->op_info(); auto op_type = op_info->Type(); - auto unique_op_type = UniqueName(op_type); + auto unique_op_type = lite::npu::UniqueName(op_type); LOG(INFO) << "Converting " + op_type + "..."; std::shared_ptr batch_norm_node = @@ -43,27 +37,27 @@ node_map_type BatchNormConverter( auto scale_var_name = op_info->Input("Scale").front(); lite::Tensor* scale = scope->FindVar(scale_var_name)->GetMutable(); auto npu_scale = std::make_shared(scale_var_name); - npu_scale->set_attr_value(CvtFromLiteTensor(scale)); - OpList::Global().add(npu_scale); + npu_scale->set_attr_value(lite::npu::CvtFromLiteTensor(scale)); + lite::npu::OpList::Global().add(npu_scale); auto bias_var_name = op_info->Input("Bias").front(); lite::Tensor* bias = scope->FindVar(bias_var_name)->GetMutable(); auto npu_bias = std::make_shared(bias_var_name); - npu_bias->set_attr_value(CvtFromLiteTensor(bias)); - OpList::Global().add(npu_bias); + npu_bias->set_attr_value(lite::npu::CvtFromLiteTensor(bias)); + lite::npu::OpList::Global().add(npu_bias); auto mean_var_name = op_info->Input("Mean").front(); lite::Tensor* mean = scope->FindVar(mean_var_name)->GetMutable(); auto npu_mean = std::make_shared(mean_var_name); - npu_mean->set_attr_value(CvtFromLiteTensor(mean)); - OpList::Global().add(npu_mean); + npu_mean->set_attr_value(lite::npu::CvtFromLiteTensor(mean)); + lite::npu::OpList::Global().add(npu_mean); auto variance_var_name = op_info->Input("Variance").front(); lite::Tensor* variance = scope->FindVar(variance_var_name)->GetMutable(); auto npu_variance = std::make_shared(variance_var_name); - npu_variance->set_attr_value(CvtFromLiteTensor(variance)); - OpList::Global().add(npu_variance); + npu_variance->set_attr_value(lite::npu::CvtFromLiteTensor(variance)); + lite::npu::OpList::Global().add(npu_variance); float npu_momentum = op_info->GetAttr("momentum"); float npu_epsilon = op_info->GetAttr("epsilon"); @@ -80,8 +74,8 @@ node_map_type BatchNormConverter( batch_norm_node->set_attr_mode(npu_mode); batch_norm_node->set_attr_use_global_stats(npu_use_global_stats); - OpList::Global().add(inputs_map.at(x_var_name)); - OpList::Global().add(batch_norm_node); + lite::npu::OpList::Global().add(inputs_map.at(x_var_name)); + lite::npu::OpList::Global().add(batch_norm_node); node_map_type outputs_map; outputs_map[op_info->Output("Y").front()] = batch_norm_node; diff --git a/lite/kernels/npu/bridges/concat_op.cc b/lite/kernels/npu/bridges/concat_op.cc index 9684031ac777cc524b5324d07e7a54ce9d954453..9be47339354c5602f98583b5163d11e037570321 100644 --- a/lite/kernels/npu/bridges/concat_op.cc +++ b/lite/kernels/npu/bridges/concat_op.cc @@ -12,14 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "ai_ddk_lib/include/graph/buffer.h" -#include "ai_ddk_lib/include/graph/graph.h" -#include "ai_ddk_lib/include/graph/model.h" -#include "ai_ddk_lib/include/graph/op/all_ops.h" -#include "ai_ddk_lib/include/graph/operator.h" -#include "ai_ddk_lib/include/graph/operator_reg.h" +#include "lite/backends/npu/builder.h" #include "lite/kernels/npu/bridges/registry.h" -#include "lite/kernels/npu/bridges/utils.h" namespace paddle { namespace lite { @@ -32,7 +26,7 @@ node_map_type ConcatConverter(const std::shared_ptr concat_op, lite::Scope* scope = concat_op->scope(); const lite::OpInfo* op_info = concat_op->op_info(); auto op_type = op_info->Type(); - auto unique_op_type = UniqueName(op_type); + auto unique_op_type = lite::npu::UniqueName(op_type); LOG(INFO) << "converting " << op_type << " ... "; auto x_var_names = op_info->Input("X"); @@ -48,17 +42,17 @@ node_map_type ConcatConverter(const std::shared_ptr concat_op, for (auto x_var_name : x_var_names) { if (inputs_map.find(x_var_name) != inputs_map.end()) { output_node->set_dynamic_input_x(index + 1, *inputs_map.at(x_var_name)); - OpList::Global().add(inputs_map.at(x_var_name)); + lite::npu::OpList::Global().add(inputs_map.at(x_var_name)); } else { auto consty = std::make_shared(x_var_name); auto* x = scope->FindVar(x_var_name)->GetMutable(); - consty->set_attr_value(CvtFromLiteTensor(x)); + consty->set_attr_value(lite::npu::CvtFromLiteTensor(x)); output_node->set_dynamic_input_x(index + 1, *consty); - OpList::Global().add(consty); + lite::npu::OpList::Global().add(consty); } index++; } - OpList::Global().add(output_node); + lite::npu::OpList::Global().add(output_node); node_map_type outputs_map; outputs_map[op_info->Output("Out").front()] = output_node; diff --git a/lite/kernels/npu/bridges/conv_op.cc b/lite/kernels/npu/bridges/conv_op.cc index db1f72ed69d5a9ce73625308fbac7fcb54cc137f..2a4ae56a515b8119324d944e14d20f5ad4295fd3 100644 --- a/lite/kernels/npu/bridges/conv_op.cc +++ b/lite/kernels/npu/bridges/conv_op.cc @@ -12,14 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "ai_ddk_lib/include/graph/buffer.h" -#include "ai_ddk_lib/include/graph/graph.h" -#include "ai_ddk_lib/include/graph/model.h" -#include "ai_ddk_lib/include/graph/op/all_ops.h" -#include "ai_ddk_lib/include/graph/operator.h" -#include "ai_ddk_lib/include/graph/operator_reg.h" +#include "lite/backends/npu/builder.h" #include "lite/kernels/npu/bridges/registry.h" -#include "lite/kernels/npu/bridges/utils.h" namespace paddle { namespace lite { @@ -32,7 +26,7 @@ node_map_type ConvConverter(const std::shared_ptr conv_op, auto scope = conv_op->scope(); auto op_info = conv_op->op_info(); auto op_type = op_info->Type(); - auto unique_op_type = UniqueName(op_type); + auto unique_op_type = lite::npu::UniqueName(op_type); LOG(INFO) << "Converting " << op_type << "... 
"; // get input, filter and op attributes @@ -78,13 +72,13 @@ node_map_type ConvConverter(const std::shared_ptr conv_op, // check input CHECK(inputs_map.count(input_var_name)); - OpList::Global().add(inputs_map.at(input_var_name)); + lite::npu::OpList::Global().add(inputs_map.at(input_var_name)); // create filter node CHECK(!inputs_map.count(filter_var_name)); auto filter_const_node = std::make_shared(filter_var_name); - filter_const_node->set_attr_value(CvtFromLiteTensor(filter)); - OpList::Global().add(filter_const_node); + filter_const_node->set_attr_value(lite::npu::CvtFromLiteTensor(filter)); + lite::npu::OpList::Global().add(filter_const_node); // create bias node if has bias // supports the bias nodes with the following dimensions @@ -93,7 +87,7 @@ node_map_type ConvConverter(const std::shared_ptr conv_op, // 2: {n, oc, oh, ow} std::shared_ptr bias_node = nullptr; bool is_channel_bias = false; - if (HasInputArg(op_info, scope, "Bias")) { + if (lite::npu::HasInputArg(op_info, scope, "Bias")) { auto bias_var_name = op_info->Input("Bias").front(); auto* bias = scope->FindVar(bias_var_name)->GetMutable(); auto bias_dims = bias->dims(); @@ -121,10 +115,11 @@ node_map_type ConvConverter(const std::shared_ptr conv_op, } else { // bias node with const data auto bias_const_node = std::make_shared(bias_var_name); - bias_const_node->set_attr_value(CvtFromLiteTensor(bias, bias_shape)); + bias_const_node->set_attr_value( + lite::npu::CvtFromLiteTensor(bias, bias_shape)); bias_node = bias_const_node; } - OpList::Global().add(bias_node); + lite::npu::OpList::Global().add(bias_node); } // create conv node and set input, filter, bias nodes and attributes @@ -147,7 +142,7 @@ node_map_type ConvConverter(const std::shared_ptr conv_op, ge::AttrValue::LIST_INT({strides[0], strides[1]})); depthwise_conv_node->set_attr_kernel( ge::AttrValue::LIST_INT({filter_dims[2], filter_dims[3]})); - OpList::Global().add(depthwise_conv_node); + lite::npu::OpList::Global().add(depthwise_conv_node); conv_node = depthwise_conv_node; // ConvolutionDepthwise Op doesn't support bias, so append Add node to // support bias @@ -155,7 +150,7 @@ node_map_type ConvConverter(const std::shared_ptr conv_op, auto add_node = std::make_shared(unique_op_type + "/add"); add_node->set_input_x1(*depthwise_conv_node); add_node->set_input_x2(*bias_node); - OpList::Global().add(add_node); + lite::npu::OpList::Global().add(add_node); conv_node = add_node; } } else { @@ -174,7 +169,7 @@ node_map_type ConvConverter(const std::shared_ptr conv_op, ge::AttrValue::LIST_INT({strides[0], strides[1]})); common_conv_node->set_attr_kernel( ge::AttrValue::LIST_INT({filter_dims[2], filter_dims[3]})); - OpList::Global().add(common_conv_node); + lite::npu::OpList::Global().add(common_conv_node); conv_node = common_conv_node; // Convolution Op only support bias with dimension {1, oc, 1, 1}, // so append Add node if dimension is {1, oc, oh, ow} or (n, oc, oh, ow) @@ -185,7 +180,7 @@ node_map_type ConvConverter(const std::shared_ptr conv_op, auto add_node = std::make_shared(unique_op_type + "/add"); add_node->set_input_x1(*common_conv_node); add_node->set_input_x2(*bias_node); - OpList::Global().add(add_node); + lite::npu::OpList::Global().add(add_node); conv_node = add_node; } } @@ -199,7 +194,7 @@ node_map_type ConvConverter(const std::shared_ptr conv_op, std::make_shared(unique_op_type + "/relu"); relu_node->set_input_x(*conv_node); relu_node->set_attr_mode(1); - OpList::Global().add(relu_node); + lite::npu::OpList::Global().add(relu_node); 
outputs_map[op_info->Output("Output").front()] = relu_node; } else { outputs_map[op_info->Output("Output").front()] = conv_node; diff --git a/lite/kernels/npu/bridges/conv_transpose_op.cc b/lite/kernels/npu/bridges/conv_transpose_op.cc index 21e3c73d324a45a4bebea23368eae3542f7a5ab6..f8392ec8d9b08c86a571b47187715c5bb251570f 100644 --- a/lite/kernels/npu/bridges/conv_transpose_op.cc +++ b/lite/kernels/npu/bridges/conv_transpose_op.cc @@ -12,14 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "ai_ddk_lib/include/graph/buffer.h" -#include "ai_ddk_lib/include/graph/graph.h" -#include "ai_ddk_lib/include/graph/model.h" -#include "ai_ddk_lib/include/graph/op/all_ops.h" -#include "ai_ddk_lib/include/graph/operator.h" -#include "ai_ddk_lib/include/graph/operator_reg.h" +#include "lite/backends/npu/builder.h" #include "lite/kernels/npu/bridges/registry.h" -#include "lite/kernels/npu/bridges/utils.h" namespace paddle { namespace lite { @@ -33,7 +27,7 @@ node_map_type ConvTransposeConverter( auto scope = conv_transpose_op->scope(); auto op_info = conv_transpose_op->op_info(); auto op_type = op_info->Type(); - auto unique_op_type = UniqueName(op_type); + auto unique_op_type = lite::npu::UniqueName(op_type); LOG(INFO) << "Converting " << op_type << "... "; // get input, output and op attributes @@ -70,21 +64,22 @@ node_map_type ConvTransposeConverter( } auto input_sizes_const_node = std::make_shared(unique_op_type + "/input_size"); - input_sizes_const_node->set_attr_value(CreateTensorAndFillData(output_shape)); + input_sizes_const_node->set_attr_value( + lite::npu::CreateTensorAndFillData(output_shape)); conv_transpose_node->set_input_input_sizes(*input_sizes_const_node); - OpList::Global().add(input_sizes_const_node); + lite::npu::OpList::Global().add(input_sizes_const_node); // create filter node CHECK(!inputs_map.count(filter_var_name)); auto filter_const_node = std::make_shared(filter_var_name); - filter_const_node->set_attr_value(CvtFromLiteTensor(filter)); + filter_const_node->set_attr_value(lite::npu::CvtFromLiteTensor(filter)); conv_transpose_node->set_input_filter(*filter_const_node); - OpList::Global().add(filter_const_node); + lite::npu::OpList::Global().add(filter_const_node); // set input node CHECK(inputs_map.count(input_var_name)); conv_transpose_node->set_input_x(*inputs_map.at(input_var_name)); - OpList::Global().add(inputs_map.at(input_var_name)); + lite::npu::OpList::Global().add(inputs_map.at(input_var_name)); // set attributes conv_transpose_node->set_attr_mode(1); @@ -99,11 +94,11 @@ node_map_type ConvTransposeConverter( ge::AttrValue::LIST_INT({strides[0], strides[1]})); conv_transpose_node->set_attr_kernel( ge::AttrValue::LIST_INT({filter_shape[2], filter_shape[3]})); - OpList::Global().add(conv_transpose_node); + lite::npu::OpList::Global().add(conv_transpose_node); // append add node to add bias if has bias std::shared_ptr output_node = conv_transpose_node; - if (HasInputArg(op_info, scope, "Bias")) { + if (lite::npu::HasInputArg(op_info, scope, "Bias")) { // create bias node auto bias_var_name = op_info->Input("Bias").front(); CHECK(!inputs_map.count(bias_var_name)); @@ -112,13 +107,13 @@ node_map_type ConvTransposeConverter( CHECK_EQ(channel_size, filter_shape[1] * groups); auto bias_const_node = std::make_shared(bias_var_name); bias_const_node->set_attr_value( - CvtFromLiteTensor(bias, {1, channel_size, 1, 1})); - OpList::Global().add(bias_const_node); + lite::npu::CvtFromLiteTensor(bias, {1, 
channel_size, 1, 1})); + lite::npu::OpList::Global().add(bias_const_node); // append add node to add bias node auto add_node = std::make_shared(unique_op_type + "/add"); add_node->set_input_x1(*conv_transpose_node); add_node->set_input_x2(*bias_const_node); - OpList::Global().add(add_node); + lite::npu::OpList::Global().add(add_node); output_node = add_node; } @@ -129,7 +124,7 @@ node_map_type ConvTransposeConverter( std::make_shared(unique_op_type + "/relu"); relu_node->set_input_x(*output_node); relu_node->set_attr_mode(1); - OpList::Global().add(relu_node); + lite::npu::OpList::Global().add(relu_node); outputs_map[op_info->Output("Output").front()] = relu_node; } else { outputs_map[op_info->Output("Output").front()] = output_node; diff --git a/lite/kernels/npu/bridges/elementwise_ops.cc b/lite/kernels/npu/bridges/elementwise_ops.cc index e42a933e0972041eb835c8435188db2d47c77180..6ba7acc254c0c352fe46aeee77ac3a5d25c4582f 100644 --- a/lite/kernels/npu/bridges/elementwise_ops.cc +++ b/lite/kernels/npu/bridges/elementwise_ops.cc @@ -12,14 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "ai_ddk_lib/include/graph/buffer.h" -#include "ai_ddk_lib/include/graph/graph.h" -#include "ai_ddk_lib/include/graph/model.h" -#include "ai_ddk_lib/include/graph/op/all_ops.h" -#include "ai_ddk_lib/include/graph/operator.h" -#include "ai_ddk_lib/include/graph/operator_reg.h" +#include "lite/backends/npu/builder.h" #include "lite/kernels/npu/bridges/registry.h" -#include "lite/kernels/npu/bridges/utils.h" namespace paddle { namespace lite { @@ -33,7 +27,7 @@ node_map_type ElementwiseConverter( auto scope = elementwise_op->scope(); auto op_info = elementwise_op->op_info(); auto op_type = op_info->Type(); - auto unique_op_type = UniqueName(op_type); + auto unique_op_type = lite::npu::UniqueName(op_type); LOG(INFO) << "converting elementwise..."; std::shared_ptr elementwise_node = @@ -47,20 +41,20 @@ node_map_type ElementwiseConverter( CHECK(inputs_map.find(x_var_name) != inputs_map.end()); elementwise_node->set_input_x1(*inputs_map.at(x_var_name)); - OpList::Global().add(inputs_map.at(x_var_name)); + lite::npu::OpList::Global().add(inputs_map.at(x_var_name)); if (inputs_map.find(y_var_name) != inputs_map.end()) { elementwise_node->set_input_x2(*inputs_map.at(y_var_name)); - OpList::Global().add(inputs_map.at(y_var_name)); + lite::npu::OpList::Global().add(inputs_map.at(y_var_name)); } else { auto consty = std::make_shared(y_var_name); auto* y = scope->FindVar(y_var_name)->GetMutable(); - consty->set_attr_value(CvtFromLiteTensor(y)); + consty->set_attr_value(lite::npu::CvtFromLiteTensor(y)); elementwise_node->set_input_x2(*consty); - OpList::Global().add(consty); + lite::npu::OpList::Global().add(consty); } - OpList::Global().add(elementwise_node); + lite::npu::OpList::Global().add(elementwise_node); // paddlelite has sum only elementwise_node->set_attr_mode(1); diff --git a/lite/kernels/npu/bridges/fc_op.cc b/lite/kernels/npu/bridges/fc_op.cc index b96d62fd27cd2d40938d6396df7276fa8c64b377..1233ccedd4086bfca36fa4f1ba996814cc68127d 100644 --- a/lite/kernels/npu/bridges/fc_op.cc +++ b/lite/kernels/npu/bridges/fc_op.cc @@ -12,14 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "ai_ddk_lib/include/graph/buffer.h" -#include "ai_ddk_lib/include/graph/graph.h" -#include "ai_ddk_lib/include/graph/model.h" -#include "ai_ddk_lib/include/graph/op/all_ops.h" -#include "ai_ddk_lib/include/graph/operator.h" -#include "ai_ddk_lib/include/graph/operator_reg.h" +#include "lite/backends/npu/builder.h" #include "lite/kernels/npu/bridges/registry.h" -#include "lite/kernels/npu/bridges/utils.h" namespace paddle { namespace lite { @@ -29,19 +23,22 @@ namespace bridges { node_map_type FCConverter(const std::shared_ptr fc_op, const node_map_type& inputs_map) { - LOG(INFO) << "Converting fc..."; - lite::Scope* scope = fc_op->scope(); - const lite::OpInfo* op_info = fc_op->op_info(); - auto output_node = std::make_shared(UniqueName("fc")); + auto scope = fc_op->scope(); + auto op_info = fc_op->op_info(); + auto op_type = op_info->Type(); + auto unique_op_type = lite::npu::UniqueName(op_type); + LOG(INFO) << "Converting " + op_type + "..."; + + auto fc_node = std::make_shared(unique_op_type); auto x_var_name = op_info->Input("Input").front(); auto w_var_name = op_info->Input("W").front(); int in_num_col_dims = op_info->GetAttr("in_num_col_dims"); - auto* xtensor = scope->FindVar(x_var_name)->GetMutable(); - auto* wtensor = scope->FindVar(w_var_name)->GetMutable(); - auto x_dims = xtensor->dims(); - auto w_dims = wtensor->dims(); + auto x = scope->FindVar(x_var_name)->GetMutable(); + auto w = scope->FindVar(w_var_name)->GetMutable(); + auto x_dims = x->dims(); + auto w_dims = w->dims(); CHECK_GE(x_dims.size(), 2UL); CHECK_EQ(w_dims.size(), 2UL); @@ -49,65 +46,69 @@ node_map_type FCConverter(const std::shared_ptr fc_op, int m = x_dims.Slice(0, in_num_col_dims).production(); int k = x_dims.Slice(in_num_col_dims, x_dims.size()).production(); int n = w_dims[1]; + CHECK_EQ(k * n, w_dims.production()); + VLOG(3) << "x dims: " << x_dims << " w dims: " << w_dims << " m: " << m + << " k: " << k << " n: " << n; CHECK(inputs_map.count(x_var_name)); CHECK(!inputs_map.count(w_var_name)); - LOG(INFO) << "m:" << m << ",n:" << n << ",k:" << k; - LOG(INFO) << "x_var_name:" << x_var_name - << ", is data: " << inputs_map.count(x_var_name); - LOG(INFO) << "w_var_name:" << w_var_name - << ", is data: " << inputs_map.count(w_var_name); - - auto xsrc = inputs_map.at(x_var_name); - auto reshapex = std::make_shared(x_var_name + "_reshape"); - reshapex->set_input_tensor(*xsrc); - reshapex->set_attr_shape({m, k}); - reshapex->set_attr_axis(0); - OpList::Global().add(xsrc); - OpList::Global().add(reshapex); - output_node->set_input_x(*reshapex); - - auto wconst = std::make_shared(w_var_name); - ge::TensorDesc wdesc(ge::Shape({k, n}), ge::FORMAT_NCHW, ge::DT_FLOAT); - auto size = wdesc.GetShape().GetShapeSize(); - CHECK_EQ(size, w_dims.production()); - ge::TensorPtr ptensor = std::make_shared(); - ptensor->SetTensorDesc(wdesc); - auto* pdata = reinterpret_cast(wtensor->mutable_data()); - ptensor->SetData(pdata, size * sizeof(float)); - wconst->set_attr_value(ptensor); - OpList::Global().add(wconst); - output_node->set_input_w(*wconst); - - if (HasInputArg(op_info, scope, "Bias")) { - auto b_var_name = op_info->Input("Bias").front(); - auto* btensor = scope->FindVar(b_var_name)->GetMutable(); - - LOG(INFO) << "b_var_name:" << b_var_name - << ", is data: " << inputs_map.count(b_var_name); - CHECK(!inputs_map.count(b_var_name)); - CHECK_EQ(btensor->numel(), n); - - auto bconst = std::make_shared(b_var_name); - ge::TensorDesc bdesc( - ge::Shape({1, n, 1, 1}), ge::FORMAT_NCHW, ge::DT_FLOAT); - auto size = 
bdesc.GetShape().GetShapeSize(); - CHECK_EQ(size, n); - ge::TensorPtr ptensor = std::make_shared(); - ptensor->SetTensorDesc(bdesc); - auto* pdata = reinterpret_cast(btensor->mutable_data()); - ptensor->SetData(pdata, size * sizeof(float)); - bconst->set_attr_value(ptensor); - OpList::Global().add(bconst); - output_node->set_input_bias(*bconst); - output_node->set_attr_has_bias(ge::AttrValue::BOOL{true}); + // reshape x to (m, k, 1, 1) + auto reshaped_x_node = + std::make_shared(x_var_name + "_reshape"); + reshaped_x_node->set_input_tensor(*inputs_map.at(x_var_name)); + reshaped_x_node->set_attr_shape({m, k, 1, 1}); + reshaped_x_node->set_attr_axis(0); + fc_node->set_input_x(*reshaped_x_node); + lite::npu::OpList::Global().add(inputs_map.at(x_var_name)); + lite::npu::OpList::Global().add(reshaped_x_node); + + // create w const node, set its shape to (k, n, 1, 1) and fill with + // the transposed w tensor + auto w_const_node = std::make_shared(w_var_name); + ge::TensorDesc w_const_desc( + ge::Shape({n, k, 1, 1}), ge::FORMAT_NCHW, ge::DT_FLOAT); + ge::TensorPtr w_const_tensor = std::make_shared(); + w_const_tensor->SetTensorDesc(w_const_desc); + auto w_data = w->mutable_data(); + std::vector transposed_w_data(w_dims.production()); + for (int i = 0; i < k; i++) { + for (int j = 0; j < n; j++) { + transposed_w_data[j * k + i] = w_data[i * n + j]; + } + } + w_const_tensor->SetData(reinterpret_cast(transposed_w_data.data()), + transposed_w_data.size() * sizeof(float)); + w_const_node->set_attr_value(w_const_tensor); + fc_node->set_input_w(*w_const_node); + lite::npu::OpList::Global().add(w_const_node); + + // add bias node if bias tensor exists + if (lite::npu::HasInputArg(op_info, scope, "Bias")) { + auto bias_var_name = op_info->Input("Bias").front(); + auto bias = scope->FindVar(bias_var_name)->GetMutable(); + auto bias_dims = bias->dims(); + CHECK(!inputs_map.count(bias_var_name)); + CHECK_EQ(bias_dims.production(), n); + + auto bias_const_node = std::make_shared(bias_var_name); + bias_const_node->set_attr_value( + lite::npu::CvtFromLiteTensor(bias, {1, n, 1, 1})); + fc_node->set_input_b(*bias_const_node); + lite::npu::OpList::Global().add(bias_const_node); } + lite::npu::OpList::Global().add(fc_node); - OpList::Global().add(output_node); + // reshape output of fc_node from (m, n, 1, 1) to (m, n) + auto reshaped_fc_node = + std::make_shared(unique_op_type + "_reshape"); + reshaped_fc_node->set_input_tensor(*fc_node); + reshaped_fc_node->set_attr_shape({m, n}); + reshaped_fc_node->set_attr_axis(0); + lite::npu::OpList::Global().add(reshaped_fc_node); node_map_type outputs_map; - outputs_map[op_info->Output("Out").front()] = output_node; + outputs_map[op_info->Output("Out").front()] = reshaped_fc_node; return outputs_map; } diff --git a/lite/kernels/npu/bridges/fc_op_test.cc b/lite/kernels/npu/bridges/fc_op_test.cc index 92936dc6bfdb73df104e93b213f26ac6eedcd4b1..77015236e2eed847d0ec0ea5c06e646e5893f29a 100644 --- a/lite/kernels/npu/bridges/fc_op_test.cc +++ b/lite/kernels/npu/bridges/fc_op_test.cc @@ -126,6 +126,7 @@ TEST(NPUBridges, fc) { test_fc({1, 8, 8, 1}, {8, 4}, 2, use_bias); test_fc({1, 5, 5, 1}, {5, 7}, 2, use_bias); test_fc({1, 4, 1, 1}, {4, 8}, 1, use_bias); + test_fc({1, 1024, 1, 1}, {1024, 1000}, 1, use_bias); } } diff --git a/lite/kernels/npu/bridges/interpolate_op.cc b/lite/kernels/npu/bridges/interpolate_op.cc index 8f3e20b023621ab3a1257a7e92686ca32bcdade2..b0cfa1c28fae68ec936e8715fb25d59853d063bc 100644 --- a/lite/kernels/npu/bridges/interpolate_op.cc +++ 
b/lite/kernels/npu/bridges/interpolate_op.cc @@ -12,14 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "ai_ddk_lib/include/graph/buffer.h" -#include "ai_ddk_lib/include/graph/graph.h" -#include "ai_ddk_lib/include/graph/model.h" -#include "ai_ddk_lib/include/graph/op/all_ops.h" -#include "ai_ddk_lib/include/graph/operator.h" -#include "ai_ddk_lib/include/graph/operator_reg.h" +#include "lite/backends/npu/builder.h" #include "lite/kernels/npu/bridges/registry.h" -#include "lite/kernels/npu/bridges/utils.h" namespace paddle { namespace lite { @@ -33,13 +27,13 @@ node_map_type InterpolateConverter( auto scope = interpolate_op->scope(); auto op_info = interpolate_op->op_info(); auto op_type = op_info->Type(); - auto unique_op_type = UniqueName(op_type); + auto unique_op_type = lite::npu::UniqueName(op_type); LOG(INFO) << "Converting " + op_type + "..."; // get input, output and attributes from lite op auto x_var_name = op_info->Input("X").front(); CHECK(inputs_map.count(x_var_name)); - OpList::Global().add(inputs_map.at(x_var_name)); + lite::npu::OpList::Global().add(inputs_map.at(x_var_name)); auto x = scope->FindVar(x_var_name)->GetMutable(); auto x_dims = x->dims(); @@ -64,7 +58,7 @@ node_map_type InterpolateConverter( // update out_h and out_w if has OutSize bool inputs_map_has_w = false; - if (HasInputArg(op_info, scope, "OutSize")) { + if (lite::npu::HasInputArg(op_info, scope, "OutSize")) { auto out_size_var_name = op_info->Input("OutSize").front(); if (inputs_map.count(out_size_var_name)) { inputs_map_has_w = true; @@ -83,12 +77,12 @@ node_map_type InterpolateConverter( auto interp_method = op_info->GetAttr("interp_method"); if (interp_method == "bilinear") { auto interp_node = std::make_shared(unique_op_type); - OpList::Global().add(interp_node); + lite::npu::OpList::Global().add(interp_node); interp_node->set_input_x(*inputs_map.at(x_var_name)); if (inputs_map_has_w) { auto out_size_var_name = op_info->Input("OutSize").front(); interp_node->set_input_w(*inputs_map.at(out_size_var_name)); - OpList::Global().add(inputs_map.at(out_size_var_name)); + lite::npu::OpList::Global().add(inputs_map.at(out_size_var_name)); } else { const float largest_multiple = 7.0f; float multiple = static_cast(x_h * x_w) / (out_h * out_w); @@ -99,9 +93,9 @@ node_map_type InterpolateConverter( auto w_const_node = std::make_shared(unique_op_type + "/w"); w_const_node->set_attr_value( - CreateTensorAndFillData(std::vector({out_h, out_w}))); + lite::npu::CreateTensorAndFillData(std::vector({out_h, out_w}))); interp_node->set_input_w(*w_const_node); - OpList::Global().add(w_const_node); + lite::npu::OpList::Global().add(w_const_node); } interp_node->set_attr_output_dim_mode( 2); // 0: zoom_factor, 1: shrink_factor, 2: height/width @@ -110,19 +104,19 @@ node_map_type InterpolateConverter( } else if (interp_method == "nearest") { auto interp_node = std::make_shared(unique_op_type); - OpList::Global().add(interp_node); + lite::npu::OpList::Global().add(interp_node); interp_node->set_input_image(*inputs_map.at(x_var_name)); if (inputs_map_has_w) { auto out_size_var_name = op_info->Input("OutSize").front(); interp_node->set_input_size(*inputs_map.at(out_size_var_name)); - OpList::Global().add(inputs_map.at(out_size_var_name)); + lite::npu::OpList::Global().add(inputs_map.at(out_size_var_name)); } else { auto w_const_node = std::make_shared(unique_op_type + "/w"); w_const_node->set_attr_value( - CreateTensorAndFillData(std::vector({out_h, 
out_w}))); + lite::npu::CreateTensorAndFillData(std::vector({out_h, out_w}))); interp_node->set_input_size(*w_const_node); - OpList::Global().add(w_const_node); + lite::npu::OpList::Global().add(w_const_node); } interp_node->set_attr_align_corners(align_corners); outputs_map[op_info->Output("Out").front()] = interp_node; diff --git a/lite/kernels/npu/bridges/mul_op.cc b/lite/kernels/npu/bridges/mul_op.cc index f22c0d611da90a230fbe070ea880a23994fbebfd..ce1662c71d62a6d73a7a3b9ce594b0dd80b6fec1 100644 --- a/lite/kernels/npu/bridges/mul_op.cc +++ b/lite/kernels/npu/bridges/mul_op.cc @@ -12,14 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "ai_ddk_lib/include/graph/buffer.h" -#include "ai_ddk_lib/include/graph/graph.h" -#include "ai_ddk_lib/include/graph/model.h" -#include "ai_ddk_lib/include/graph/op/all_ops.h" -#include "ai_ddk_lib/include/graph/operator.h" -#include "ai_ddk_lib/include/graph/operator_reg.h" +#include "lite/backends/npu/builder.h" #include "lite/kernels/npu/bridges/registry.h" -#include "lite/kernels/npu/bridges/utils.h" namespace paddle { namespace lite { @@ -34,7 +28,8 @@ node_map_type MulConverter(const std::shared_ptr mul_op, LOG(INFO) << "converting mul..."; lite::Scope* scope = mul_op->scope(); const lite::OpInfo* op_info = mul_op->op_info(); - auto output_node = std::make_shared(UniqueName("mul")); + auto output_node = + std::make_shared(lite::npu::UniqueName("mul")); auto x_var_name = op_info->Input("X").front(); auto y_var_name = op_info->Input("Y").front(); @@ -66,8 +61,8 @@ node_map_type MulConverter(const std::shared_ptr mul_op, reshapex->set_input_tensor(*xsrc); reshapex->set_attr_shape({m, k}); reshapex->set_attr_axis(0); - OpList::Global().add(xsrc); - OpList::Global().add(reshapex); + lite::npu::OpList::Global().add(xsrc); + lite::npu::OpList::Global().add(reshapex); output_node->set_input_x(*reshapex); } else { auto constx = std::make_shared(x_var_name); @@ -79,7 +74,7 @@ node_map_type MulConverter(const std::shared_ptr mul_op, auto* pdata = reinterpret_cast(xtensor->mutable_data()); ptensor->SetData(pdata, size * sizeof(float)); constx->set_attr_value(ptensor); - OpList::Global().add(constx); + lite::npu::OpList::Global().add(constx); output_node->set_input_x(*constx); } @@ -89,8 +84,8 @@ node_map_type MulConverter(const std::shared_ptr mul_op, reshapey->set_input_tensor(*ysrc); reshapey->set_attr_shape({k, n}); reshapey->set_attr_axis(0); - OpList::Global().add(ysrc); - OpList::Global().add(reshapey); + lite::npu::OpList::Global().add(ysrc); + lite::npu::OpList::Global().add(reshapey); output_node->set_input_w(*reshapey); } else { auto consty = std::make_shared(y_var_name); @@ -102,11 +97,11 @@ node_map_type MulConverter(const std::shared_ptr mul_op, auto* pdata = reinterpret_cast(ytensor->mutable_data()); ptensor->SetData(pdata, size * sizeof(float)); consty->set_attr_value(ptensor); - OpList::Global().add(consty); + lite::npu::OpList::Global().add(consty); output_node->set_input_w(*consty); } - OpList::Global().add(output_node); + lite::npu::OpList::Global().add(output_node); node_map_type outputs_map; outputs_map[op_info->Output("Out").front()] = output_node; diff --git a/lite/kernels/npu/bridges/pad2d_op.cc b/lite/kernels/npu/bridges/pad2d_op.cc index a8eefbbc9e688183d065c7e13961b79edfa85d77..acc3b6adf9a89ffc4d984082d7330c30d46362ba 100644 --- a/lite/kernels/npu/bridges/pad2d_op.cc +++ b/lite/kernels/npu/bridges/pad2d_op.cc @@ -12,14 +12,8 @@ // See the License for the specific 
language governing permissions and // limitations under the License. -#include "ai_ddk_lib/include/graph/buffer.h" -#include "ai_ddk_lib/include/graph/graph.h" -#include "ai_ddk_lib/include/graph/model.h" -#include "ai_ddk_lib/include/graph/op/all_ops.h" -#include "ai_ddk_lib/include/graph/operator.h" -#include "ai_ddk_lib/include/graph/operator_reg.h" +#include "lite/backends/npu/builder.h" #include "lite/kernels/npu/bridges/registry.h" -#include "lite/kernels/npu/bridges/utils.h" namespace paddle { namespace lite { @@ -32,15 +26,15 @@ node_map_type Pad2dConverter(const std::shared_ptr pad2d_op, auto scope = pad2d_op->scope(); auto op_info = pad2d_op->op_info(); auto op_type = op_info->Type(); - auto unique_op_type = UniqueName(op_type); + auto unique_op_type = lite::npu::UniqueName(op_type); LOG(INFO) << "Converting " + op_type + "..."; std::shared_ptr pad2d_node = std::make_shared(unique_op_type); auto x_var_name = op_info->Input("X").front(); pad2d_node->set_input_x(*inputs_map.at(x_var_name)); - OpList::Global().add(inputs_map.at(x_var_name)); - OpList::Global().add(pad2d_node); + lite::npu::OpList::Global().add(inputs_map.at(x_var_name)); + lite::npu::OpList::Global().add(pad2d_node); auto mode = op_info->GetAttr("mode"); if (mode == "constant") { @@ -59,17 +53,19 @@ node_map_type Pad2dConverter(const std::shared_ptr pad2d_op, padding.insert(padding.begin(), xds * 2 - 4, 0); auto npu_padding = std::make_shared(unique_op_type + "/padding"); - npu_padding->set_attr_value(CreateTensorAndFillData(padding, {xds, 2})); + npu_padding->set_attr_value( + lite::npu::CreateTensorAndFillData(padding, {xds, 2})); pad2d_node->set_input_padding(*npu_padding); - OpList::Global().add(npu_padding); + lite::npu::OpList::Global().add(npu_padding); if (mode == "constant") { auto pad_value = op_info->GetAttr("pad_value"); auto npu_pad_value = std::make_shared(unique_op_type + "/pad_value"); - npu_pad_value->set_attr_value(CreateTensorAndFillData({pad_value})); + npu_pad_value->set_attr_value( + lite::npu::CreateTensorAndFillData({pad_value})); pad2d_node->set_input_constant_values(*npu_pad_value); - OpList::Global().add(npu_pad_value); + lite::npu::OpList::Global().add(npu_pad_value); pad2d_node->set_attr_T(0); // type of pad_value: 0:float 3:int32 } diff --git a/lite/kernels/npu/bridges/pool_op.cc b/lite/kernels/npu/bridges/pool_op.cc index e4d6658432d8fbf6c7dc30ecc305b74b9bf81393..66cb27d7c34be707129f78ff15eaf4848f6878c0 100644 --- a/lite/kernels/npu/bridges/pool_op.cc +++ b/lite/kernels/npu/bridges/pool_op.cc @@ -12,14 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "ai_ddk_lib/include/graph/buffer.h" -#include "ai_ddk_lib/include/graph/graph.h" -#include "ai_ddk_lib/include/graph/model.h" -#include "ai_ddk_lib/include/graph/op/all_ops.h" -#include "ai_ddk_lib/include/graph/operator.h" -#include "ai_ddk_lib/include/graph/operator_reg.h" +#include "lite/backends/npu/builder.h" #include "lite/kernels/npu/bridges/registry.h" -#include "lite/kernels/npu/bridges/utils.h" namespace paddle { namespace lite { @@ -32,7 +26,7 @@ node_map_type PoolConverter(const std::shared_ptr pool_op, auto scope = pool_op->scope(); auto op_info = pool_op->op_info(); auto op_type = op_info->Type(); - auto unique_op_type = UniqueName(op_type); + auto unique_op_type = lite::npu::UniqueName(op_type); LOG(INFO) << "Converting " + op_type + "..."; std::shared_ptr pool_node = @@ -73,8 +67,8 @@ node_map_type PoolConverter(const std::shared_ptr pool_op, pool_node->set_attr_ceil_mode(npu_ceil_mode); // output_node->set_attr_data_mode(npu_data_mode); - OpList::Global().add(inputs_map.at(x_var_name)); - OpList::Global().add(pool_node); + lite::npu::OpList::Global().add(inputs_map.at(x_var_name)); + lite::npu::OpList::Global().add(pool_node); node_map_type outputs_map; outputs_map[op_info->Output("Out").front()] = pool_node; diff --git a/lite/kernels/npu/bridges/reshape_op.cc b/lite/kernels/npu/bridges/reshape_op.cc index 081c49a30393ffa333e08676cc8a123307455180..50111222dd6e22ad13e675864fc4c8999ee474ff 100644 --- a/lite/kernels/npu/bridges/reshape_op.cc +++ b/lite/kernels/npu/bridges/reshape_op.cc @@ -13,14 +13,8 @@ // limitations under the License. #include "lite/operators/reshape_op.h" -#include "ai_ddk_lib/include/graph/buffer.h" -#include "ai_ddk_lib/include/graph/graph.h" -#include "ai_ddk_lib/include/graph/model.h" -#include "ai_ddk_lib/include/graph/op/all_ops.h" -#include "ai_ddk_lib/include/graph/operator.h" -#include "ai_ddk_lib/include/graph/operator_reg.h" +#include "lite/backends/npu/builder.h" #include "lite/kernels/npu/bridges/registry.h" -#include "lite/kernels/npu/bridges/utils.h" namespace paddle { namespace lite { @@ -33,7 +27,7 @@ node_map_type ReshapeConverter(const std::shared_ptr reshape_op, auto scope = reshape_op->scope(); auto op_info = reshape_op->op_info(); auto op_type = op_info->Type(); - auto unique_op_type = UniqueName(op_type); + auto unique_op_type = lite::npu::UniqueName(op_type); LOG(INFO) << "Converting " + op_type + "..."; // get input, output and op attributes @@ -45,10 +39,10 @@ node_map_type ReshapeConverter(const std::shared_ptr reshape_op, auto reshape_node = std::make_shared(unique_op_type); CHECK(inputs_map.count(x_var_name)); reshape_node->set_input_tensor(*inputs_map.at(x_var_name)); - OpList::Global().add(inputs_map.at(x_var_name)); + lite::npu::OpList::Global().add(inputs_map.at(x_var_name)); // read shape from actual shape tensor as input "w" if 'Shape' is found - if (HasInputArg(op_info, scope, "Shape")) { + if (lite::npu::HasInputArg(op_info, scope, "Shape")) { auto actual_shape_var_name = op_info->Input("Shape").front(); if (!inputs_map.count(actual_shape_var_name)) { auto actual_shape = @@ -67,13 +61,14 @@ node_map_type ReshapeConverter(const std::shared_ptr reshape_op, } auto actual_shape_const_node = std::make_shared(actual_shape_var_name); - actual_shape_const_node->set_attr_value(CreateTensorAndFillData( - std::vector(out_shape.begin(), out_shape.end()))); + actual_shape_const_node->set_attr_value( + lite::npu::CreateTensorAndFillData( + std::vector(out_shape.begin(), out_shape.end()))); 
reshape_node->set_input_w(*actual_shape_const_node); - OpList::Global().add(actual_shape_const_node); + lite::npu::OpList::Global().add(actual_shape_const_node); } else { reshape_node->set_input_w(*inputs_map.at(actual_shape_var_name)); - OpList::Global().add(inputs_map.at(actual_shape_var_name)); + lite::npu::OpList::Global().add(inputs_map.at(actual_shape_var_name)); } } else { auto shape = op_info->GetAttr>("shape"); @@ -87,7 +82,7 @@ node_map_type ReshapeConverter(const std::shared_ptr reshape_op, reshape_node->set_attr_shape( ge::AttrValue::LIST_INT(out_shape.begin(), out_shape.end())); } - OpList::Global().add(reshape_node); + lite::npu::OpList::Global().add(reshape_node); node_map_type outputs_map; outputs_map[op_info->Output("Out").front()] = reshape_node; @@ -107,7 +102,7 @@ node_map_type ReshapeConverter(const std::shared_ptr reshape_op, xshape_node->set_input_tensor(*inputs_map.at(x_var_name)); xshape_node->set_attr_shape( ge::AttrValue::LIST_INT(xshape_dims.begin(), xshape_dims.end())); - OpList::Global().add(xshape_node); + lite::npu::OpList::Global().add(xshape_node); outputs_map[op_info->Output("XShape").front()] = xshape_node; } return outputs_map; diff --git a/lite/kernels/npu/bridges/scale_op.cc b/lite/kernels/npu/bridges/scale_op.cc index af45e6102b83b9f2e30c98d461a71488f8cd3d13..4e305b15f2f485317d5040be11cd92269d08baa8 100644 --- a/lite/kernels/npu/bridges/scale_op.cc +++ b/lite/kernels/npu/bridges/scale_op.cc @@ -12,14 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "ai_ddk_lib/include/graph/buffer.h" -#include "ai_ddk_lib/include/graph/graph.h" -#include "ai_ddk_lib/include/graph/model.h" -#include "ai_ddk_lib/include/graph/op/all_ops.h" -#include "ai_ddk_lib/include/graph/operator.h" -#include "ai_ddk_lib/include/graph/operator_reg.h" +#include "lite/backends/npu/builder.h" #include "lite/kernels/npu/bridges/registry.h" -#include "lite/kernels/npu/bridges/utils.h" namespace paddle { namespace lite { @@ -32,7 +26,7 @@ node_map_type ScaleConverter(const std::shared_ptr scale_op, auto scope = scale_op->scope(); auto op_info = scale_op->op_info(); auto op_type = op_info->Type(); - auto unique_op_type = UniqueName(op_type); + auto unique_op_type = lite::npu::UniqueName(op_type); LOG(INFO) << "Converting " + op_type + "..."; // get input, output and op attributes @@ -52,26 +46,26 @@ node_map_type ScaleConverter(const std::shared_ptr scale_op, auto scale_node = std::make_shared(unique_op_type); CHECK(inputs_map.count(x_var_name)); scale_node->set_input_x(*inputs_map.at(x_var_name)); - OpList::Global().add(inputs_map.at(x_var_name)); - OpList::Global().add(scale_node); + lite::npu::OpList::Global().add(inputs_map.at(x_var_name)); + lite::npu::OpList::Global().add(scale_node); // add filter node(fill with scale) auto filter_const_node = std::make_shared(unique_op_type + "/filter"); filter_const_node->set_attr_value( - CreateTensorAndFillData(scale, scale_bias_shape)); + lite::npu::CreateTensorAndFillData(scale, scale_bias_shape)); scale_node->set_input_filter(*filter_const_node); - OpList::Global().add(filter_const_node); + lite::npu::OpList::Global().add(filter_const_node); // add bias node(fill with bias) if (fabs(bias) > 1e-6f) { auto bias_const_node = std::make_shared(unique_op_type + "/bias"); bias_const_node->set_attr_value( - CreateTensorAndFillData(bias, scale_bias_shape)); + lite::npu::CreateTensorAndFillData(bias, scale_bias_shape)); scale_node->set_input_bias(*bias_const_node); 
scale_node->set_attr_has_bias_value(true); - OpList::Global().add(bias_const_node); + lite::npu::OpList::Global().add(bias_const_node); } scale_node->set_attr_axis(1); diff --git a/lite/kernels/npu/bridges/shuffle_channel_op.cc b/lite/kernels/npu/bridges/shuffle_channel_op.cc index 5c95fd53d53ab6d95e8e0f85edb54fa5b48bd637..d1e7bc83dd90f07fd1e0f2811a1492e9bfcc0660 100644 --- a/lite/kernels/npu/bridges/shuffle_channel_op.cc +++ b/lite/kernels/npu/bridges/shuffle_channel_op.cc @@ -12,14 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "ai_ddk_lib/include/graph/buffer.h" -#include "ai_ddk_lib/include/graph/graph.h" -#include "ai_ddk_lib/include/graph/model.h" -#include "ai_ddk_lib/include/graph/op/all_ops.h" -#include "ai_ddk_lib/include/graph/operator.h" -#include "ai_ddk_lib/include/graph/operator_reg.h" +#include "lite/backends/npu/builder.h" #include "lite/kernels/npu/bridges/registry.h" -#include "lite/kernels/npu/bridges/utils.h" namespace paddle { namespace lite { @@ -33,7 +27,7 @@ node_map_type ShuffleChannelConverter( auto scope = shuffle_channel_op->scope(); auto op_info = shuffle_channel_op->op_info(); auto op_type = op_info->Type(); - auto unique_op_type = UniqueName(op_type); + auto unique_op_type = lite::npu::UniqueName(op_type); LOG(INFO) << "Converting " + op_type + "..."; std::shared_ptr shuffle_channel_node = @@ -43,8 +37,8 @@ node_map_type ShuffleChannelConverter( shuffle_channel_node->set_input_x(*inputs_map.at(x_var_name)); shuffle_channel_node->set_attr_group(op_info->GetAttr("group")); - OpList::Global().add(inputs_map.at(x_var_name)); - OpList::Global().add(shuffle_channel_node); + lite::npu::OpList::Global().add(inputs_map.at(x_var_name)); + lite::npu::OpList::Global().add(shuffle_channel_node); node_map_type outputs_map; outputs_map[op_info->Output("Out").front()] = shuffle_channel_node; diff --git a/lite/kernels/npu/bridges/softmax_op.cc b/lite/kernels/npu/bridges/softmax_op.cc index 7473a8ea39bda7a38277da803f699b8bc94b2ede..24712315646d8d83349c47d415ab41cdfcadad88 100644 --- a/lite/kernels/npu/bridges/softmax_op.cc +++ b/lite/kernels/npu/bridges/softmax_op.cc @@ -12,14 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "ai_ddk_lib/include/graph/buffer.h" -#include "ai_ddk_lib/include/graph/graph.h" -#include "ai_ddk_lib/include/graph/model.h" -#include "ai_ddk_lib/include/graph/op/all_ops.h" -#include "ai_ddk_lib/include/graph/operator.h" -#include "ai_ddk_lib/include/graph/operator_reg.h" +#include "lite/backends/npu/builder.h" #include "lite/kernels/npu/bridges/registry.h" -#include "lite/kernels/npu/bridges/utils.h" namespace paddle { namespace lite { @@ -32,7 +26,7 @@ node_map_type SoftmaxConverter(const std::shared_ptr softmax_op, auto scope = softmax_op->scope(); auto op_info = softmax_op->op_info(); auto op_type = op_info->Type(); - auto unique_op_type = UniqueName(op_type); + auto unique_op_type = lite::npu::UniqueName(op_type); LOG(INFO) << "Converting " + op_type + "..."; std::shared_ptr softmax_node = @@ -51,8 +45,8 @@ node_map_type SoftmaxConverter(const std::shared_ptr softmax_op, softmax_node->set_input_x(*inputs_map.at(x_var_name)); softmax_node->set_attr_axis(axis); - OpList::Global().add(inputs_map.at(x_var_name)); - OpList::Global().add(softmax_node); + lite::npu::OpList::Global().add(inputs_map.at(x_var_name)); + lite::npu::OpList::Global().add(softmax_node); node_map_type outputs_map; outputs_map[op_info->Output("Out").front()] = softmax_node; diff --git a/lite/kernels/npu/bridges/split_op.cc b/lite/kernels/npu/bridges/split_op.cc index 97b6c19156d0ca9736966cfc8018b4d600f7b807..0caa51c53035ef46b0f29be5a3047860c900a403 100644 --- a/lite/kernels/npu/bridges/split_op.cc +++ b/lite/kernels/npu/bridges/split_op.cc @@ -12,14 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "ai_ddk_lib/include/graph/buffer.h" -#include "ai_ddk_lib/include/graph/graph.h" -#include "ai_ddk_lib/include/graph/model.h" -#include "ai_ddk_lib/include/graph/op/all_ops.h" -#include "ai_ddk_lib/include/graph/operator.h" -#include "ai_ddk_lib/include/graph/operator_reg.h" +#include "lite/backends/npu/builder.h" #include "lite/kernels/npu/bridges/registry.h" -#include "lite/kernels/npu/bridges/utils.h" namespace paddle { namespace lite { @@ -32,7 +26,7 @@ node_map_type SplitConverter(const std::shared_ptr split_op, lite::Scope* scope = split_op->scope(); const lite::OpInfo* op_info = split_op->op_info(); auto op_type = op_info->Type(); - auto unique_op_type = UniqueName(op_type); + auto unique_op_type = lite::npu::UniqueName(op_type); LOG(INFO) << "Converting " << op_type << " ... 
"; auto x_var_name = op_info->Input("X").front(); @@ -45,7 +39,7 @@ node_map_type SplitConverter(const std::shared_ptr split_op, std::make_shared(unique_op_type); CHECK(inputs_map.count(x_var_name)); output_node->set_input_x(*inputs_map.at(x_var_name)); - OpList::Global().add(inputs_map.at(x_var_name)); + lite::npu::OpList::Global().add(inputs_map.at(x_var_name)); output_node->set_attr_axis(static_cast(axis)); if (num > 0) { @@ -63,18 +57,18 @@ node_map_type SplitConverter(const std::shared_ptr split_op, for (auto out_var_name : out_var_names) { auto const_node = std::make_shared( unique_op_type + "/const_zero" + std::to_string(index)); - const_node->set_attr_value(CreateTensorAndFillData(0)); - OpList::Global().add(const_node); + const_node->set_attr_value(lite::npu::CreateTensorAndFillData(0)); + lite::npu::OpList::Global().add(const_node); auto add_node = std::make_shared(unique_op_type + "/add" + std::to_string(index)); add_node->set_input_x1(*output_node, "y" + std::to_string(index)); add_node->set_input_x2(*const_node); outputs_map[out_var_name] = add_node; - OpList::Global().add(add_node); + lite::npu::OpList::Global().add(add_node); index++; } - OpList::Global().add(output_node); + lite::npu::OpList::Global().add(output_node); return outputs_map; } diff --git a/lite/kernels/npu/bridges/test_helper.cc b/lite/kernels/npu/bridges/test_helper.cc index 594b2db47457cf3af4f2c4786d2bd94e57815c6e..b410a4190d86f2ddf020e7f223787acc0108a398 100644 --- a/lite/kernels/npu/bridges/test_helper.cc +++ b/lite/kernels/npu/bridges/test_helper.cc @@ -14,10 +14,9 @@ #include "lite/kernels/npu/bridges/test_helper.h" #include -#include "ai_ddk_lib/include/graph/op/all_ops.h" +#include "lite/backends/npu/builder.h" #include "lite/core/op_registry.h" #include "lite/kernels/npu/bridges/registry.h" -#include "lite/kernels/npu/bridges/utils.h" #include "lite/operators/graph_op.h" namespace paddle { @@ -44,7 +43,7 @@ void LauchOp(const std::shared_ptr op, ge::Shape(input->dims().Vectorize()), ge::FORMAT_NCHW, ge::DT_FLOAT); auto input_node = std::make_shared(input_var_name); input_node->update_input_desc_x(input_desc); - OpList::Global().add(input_node); + lite::npu::OpList::Global().add(input_node); inputs_map[input_var_name] = input_node; } auto outputs_map = supported_lists.at(op_type)(op, inputs_map); @@ -63,7 +62,7 @@ void LauchOp(const std::shared_ptr op, auto weight = scope->Var(weight_var_name)->GetMutable(); weight->set_persistable(true); weight->set_precision(PRECISION(kInt8)); - CHECK(BuildModel(graph_inputs, graph_outputs, weight)); + CHECK(lite::npu::BuildModel(graph_inputs, graph_outputs, weight)); CHECK_GT(weight->numel(), 0); CHECK_NE(weight->data(), 0); @@ -94,7 +93,7 @@ void LauchOp(const std::shared_ptr op, graph_kernel->Launch(); // release all of resources of generated model - OpList::Global().clear(); + lite::npu::OpList::Global().clear(); } } // namespace bridges diff --git a/lite/kernels/npu/bridges/transpose_op.cc b/lite/kernels/npu/bridges/transpose_op.cc index ac243a060158b2edcf7354276d62998ee1dc6b31..5e9a69837b9e253845e6a1df35a897cfe342a84e 100644 --- a/lite/kernels/npu/bridges/transpose_op.cc +++ b/lite/kernels/npu/bridges/transpose_op.cc @@ -12,14 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "ai_ddk_lib/include/graph/buffer.h" -#include "ai_ddk_lib/include/graph/graph.h" -#include "ai_ddk_lib/include/graph/model.h" -#include "ai_ddk_lib/include/graph/op/all_ops.h" -#include "ai_ddk_lib/include/graph/operator.h" -#include "ai_ddk_lib/include/graph/operator_reg.h" +#include "lite/backends/npu/builder.h" #include "lite/kernels/npu/bridges/registry.h" -#include "lite/kernels/npu/bridges/utils.h" namespace paddle { namespace lite { @@ -33,7 +27,7 @@ node_map_type TransposeConverter( auto scope = transpose_op->scope(); auto op_info = transpose_op->op_info(); auto op_type = op_info->Type(); - auto unique_op_type = UniqueName(op_type); + auto unique_op_type = lite::npu::UniqueName(op_type); LOG(INFO) << "Converting " + op_type + "..."; std::shared_ptr transpose_node = @@ -50,8 +44,8 @@ node_map_type TransposeConverter( w_data[i] = 1.f; } auto npu_w = std::make_shared(w_var_name); - npu_w->set_attr_value(CvtFromLiteTensor(w)); - OpList::Global().add(npu_w); + npu_w->set_attr_value(lite::npu::CvtFromLiteTensor(w)); + lite::npu::OpList::Global().add(npu_w); auto axis = op_info->GetAttr>("axis"); auto npu_axis = ge::AttrValue::LIST_INT(axis.begin(), axis.end()); @@ -61,8 +55,8 @@ node_map_type TransposeConverter( transpose_node->set_input_w(*npu_w); transpose_node->set_attr_order(npu_axis); - OpList::Global().add(inputs_map.at(x_var_name)); - OpList::Global().add(transpose_node); + lite::npu::OpList::Global().add(inputs_map.at(x_var_name)); + lite::npu::OpList::Global().add(transpose_node); node_map_type outputs_map; outputs_map[op_info->Output("Out").front()] = transpose_node; diff --git a/lite/kernels/npu/bridges/utils.h b/lite/kernels/npu/bridges/utils.h deleted file mode 100644 index 382879f649fd086221eef2c62ee48bcf8da48c9d..0000000000000000000000000000000000000000 --- a/lite/kernels/npu/bridges/utils.h +++ /dev/null @@ -1,113 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include -#include -#include -#include -#include "ai_ddk_lib/include/graph/operator_reg.h" -#include "lite/core/op_lite.h" -#include "lite/core/target_wrapper.h" -#include "lite/core/tensor.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace npu { -namespace bridges { - -class OpList { - public: - static OpList& Global() { - static thread_local OpList x; - return x; - } - void clear() { lists_.clear(); } - void add(std::shared_ptr p) { lists_.push_back(p); } - - private: - std::vector> lists_; -}; - -// Build HIAI IR graph to om model, and store om model data into lite tensor -bool BuildModel(std::vector& inputs, // NOLINT - std::vector& outputs, // NOLINT - lite::Tensor* model_data); - -std::string UniqueName(const std::string& prefix); - -ge::DataType PrecisionConverter(PrecisionType itype); - -ge::Format DataLayoutConverter(DataLayoutType itype); - -ge::TensorPtr CvtFromLiteTensor(Tensor* in_tensor, - std::vector out_shape = {}, - PrecisionType in_ptype = PRECISION(kFloat), - DataLayoutType in_ltype = DATALAYOUT(kNCHW)); - -template -ge::TensorPtr CreateTensorAndFillData(std::vector data, - std::vector shape = {}, - ge::Format format = ge::FORMAT_NCHW) { - const std::type_info& info = typeid(T); - ge::DataType type = ge::DT_FLOAT; - if (info == typeid(float)) { - type = ge::DT_FLOAT; - } else if (info == typeid(int8_t)) { - type = ge::DT_INT8; - } else if (info == typeid(int32_t)) { - type = ge::DT_INT32; - } else { - LOG(FATAL) << "Unknow value type " << info.name(); - } - if (shape.empty()) { - shape = {static_cast(data.size())}; - } else { - int size = 1; - for (auto i : shape) { - size *= i; - } - CHECK_EQ(data.size(), size); - } - ge::TensorDesc desc(ge::Shape(shape), format, type); - ge::TensorPtr tensor = std::make_shared(); - tensor->SetTensorDesc(desc); - tensor->SetData(reinterpret_cast(data.data()), - data.size() * sizeof(T)); - return tensor; -} - -template -ge::TensorPtr CreateTensorAndFillData(T value, - std::vector shape = {1}, - ge::Format format = ge::FORMAT_NCHW) { - int64_t size = 1; - for (auto i : shape) { - size *= i; - } - std::vector data(size, value); - return CreateTensorAndFillData(data, shape, format); -} - -bool HasInputArg(const OpInfo* op_info, - const Scope* scope, - const std::string& argname); - -} // namespace bridges -} // namespace npu -} // namespace kernels -} // namespace lite -} // namespace paddle diff --git a/lite/kernels/npu/graph_compute.cc b/lite/kernels/npu/graph_compute.cc index 41a36238cc787625e7d6191d98800ba88cbef508..f2b42c658d11edfed65eea2af48a3c0202ba3114 100644 --- a/lite/kernels/npu/graph_compute.cc +++ b/lite/kernels/npu/graph_compute.cc @@ -49,8 +49,8 @@ void GraphCompute::PrepareForRun() { VLOG(3) << "npu_idims[" << i << "]: " << npu_idims_[i].GetNumber() << "," << npu_idims_[i].GetChannel() << "," << npu_idims_[i].GetHeight() << "," << npu_idims_[i].GetWidth(); - VLOG(3) << "lite_idims[" << i << "]: " << param.inputs[i]->dims(); - CHECK_EQ(param.inputs[i]->dims().production(), + VLOG(3) << "lite_idims[" << i << "]: " << param.inputs[i].second->dims(); + CHECK_EQ(param.inputs[i].second->dims().production(), npu_idims_[i].GetNumber() * npu_idims_[i].GetChannel() * npu_idims_[i].GetHeight() * npu_idims_[i].GetWidth()); npu_itensors_[i].reset(new hiai::AiTensor); @@ -61,16 +61,16 @@ void GraphCompute::PrepareForRun() { VLOG(3) << "npu_odims[" << i << "]: " << npu_odims_[i].GetNumber() << "," << npu_odims_[i].GetChannel() << "," << npu_odims_[i].GetHeight() << "," << npu_odims_[i].GetWidth(); 
- VLOG(3) << "lite_odims[" << i << "]: " << param.outputs[i]->dims(); + VLOG(3) << "lite_odims[" << i << "]: " << param.outputs[i].second->dims(); auto out_size = npu_odims_[i].GetNumber() * npu_odims_[i].GetChannel() * npu_odims_[i].GetHeight() * npu_odims_[i].GetWidth(); - if (param.outputs[i]->dims().production() != out_size) { - param.outputs[i]->Resize({npu_odims_[i].GetNumber(), - npu_odims_[i].GetChannel(), - npu_odims_[i].GetHeight(), - npu_odims_[i].GetWidth()}); + if (param.outputs[i].second->dims().production() != out_size) { + param.outputs[i].second->Resize({npu_odims_[i].GetNumber(), + npu_odims_[i].GetChannel(), + npu_odims_[i].GetHeight(), + npu_odims_[i].GetWidth()}); } - LOG(INFO) << param.outputs[i]->dims(); + LOG(INFO) << param.outputs[i].second->dims(); npu_otensors_[i].reset(new hiai::AiTensor); npu_otensors_[i]->Init(&(npu_odims_[i])); } @@ -80,7 +80,7 @@ bool GraphCompute::input_dims_changed() const { auto& param = this->Param(); CHECK_EQ(param.inputs.size(), npu_idims_.size()); for (size_t i = 0; i < param.inputs.size(); ++i) { - auto param_idims = param.inputs[i]->dims(); + auto param_idims = param.inputs[i].second->dims(); CHECK(!param_idims.empty()); CHECK_EQ(param_idims.size(), 4); std::vector idims{static_cast(npu_idims_[i].GetNumber()), @@ -105,7 +105,7 @@ void GraphCompute::Run() { CHECK_EQ(param.outputs.size(), npu_otensors_.size()); for (size_t i = 0; i < param.inputs.size(); ++i) { - auto* itensor = param.inputs[i]; + auto* itensor = param.inputs[i].second; CHECK(itensor); const auto* i_data = itensor->data(); std::memcpy( @@ -126,10 +126,10 @@ void GraphCompute::Run() { CHECK_EQ(hiai::AI_SUCCESS, model_client_->Process( model_context_, npu_itensors_, npu_otensors_, 1000, istamp)); - LOG(INFO) << "[NPU] Process cost " << GetCurrentUS() - start_time << " us"; + VLOG(3) << "[NPU] Process cost " << GetCurrentUS() - start_time << " us"; for (size_t i = 0; i < param.outputs.size(); ++i) { - auto* otensor = param.outputs[i]; + auto* otensor = param.outputs[i].second; CHECK(otensor); auto* o_data = otensor->mutable_data(); auto* npu_obuffer = static_cast(npu_otensors_[i]->GetBuffer()); diff --git a/lite/kernels/x86/CMakeLists.txt b/lite/kernels/x86/CMakeLists.txt index 60219e3b18665280ece5c0b77723bc311cb8eebd..6d47c880c8daf1ec8981dfb4083324b79c25cec1 100644 --- a/lite/kernels/x86/CMakeLists.txt +++ b/lite/kernels/x86/CMakeLists.txt @@ -1,4 +1,4 @@ -add_kernel(activation_compute_x86 X86 basic SRCS activation_compute.cc DEPS ${lite_kernel_deps} activation_ops) +add_kernel(activation_compute_x86 X86 basic SRCS activation_compute.cc DEPS ${lite_kernel_deps} activation_ops math_function) # lite_cc_library(mean_compute_x86 SRCS mean_compute.cc DEPS ${lite_kernel_deps}) # lite_cc_library(fill_constant_compute_x86 SRCS fill_constant_compute.cc DEPS ${lite_kernel_deps}) # lite_cc_library(sgd_compute_x86 SRCS sgd_compute.cc DEPS ${lite_kernel_deps}) @@ -55,6 +55,8 @@ lite_cc_test(test_batch_norm_compute_x86 SRCS batch_norm_compute_test.cc DEPS ba lite_cc_test(test_softmax_compute_x86 SRCS softmax_compute_test.cc DEPS softmax_compute_x86) lite_cc_test(test_elementwise_compute_x86 SRCS elementwise_compute_test.cc DEPS elementwise_compute_x86) lite_cc_test(test_relu_compute_x86 SRCS relu_compute_test.cc DEPS activation_compute_x86) +lite_cc_test(test_tanh_compute_x86 SRCS tanh_compute_test.cc DEPS activation_compute_x86) +lite_cc_test(test_gelu_compute_x86 SRCS gelu_compute_test.cc DEPS activation_compute_x86) lite_cc_test(test_sequence_expand_as_compute_x86 SRCS 
sequence_expand_as_compute_test.cc DEPS sequence_expand_as_compute_x86) lite_cc_test(test_gru_compute_x86 SRCS gru_compute_test.cc DEPS gru_compute_x86) lite_cc_test(test_matmul_compute_x86 SRCS matmul_compute_test.cc DEPS matmul_compute_x86) diff --git a/lite/kernels/x86/activation_compute.cc b/lite/kernels/x86/activation_compute.cc index 0ed09c43a5df9a087e5a21c6c9566b7b785a5afa..b4a053419c5c6f04b4b053d7bf902a57e9562518 100644 --- a/lite/kernels/x86/activation_compute.cc +++ b/lite/kernels/x86/activation_compute.cc @@ -35,3 +35,25 @@ REGISTER_LITE_KERNEL(relu, .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) .Finalize(); + +// float +REGISTER_LITE_KERNEL(tanh, + kX86, + kFloat, + kNCHW, + paddle::lite::kernels::x86::TanhCompute<float>, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) + .Finalize(); + +// float +REGISTER_LITE_KERNEL(gelu, + kX86, + kFloat, + kNCHW, + paddle::lite::kernels::x86::GeluCompute<float>, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) + .Finalize(); diff --git a/lite/kernels/x86/activation_compute.h b/lite/kernels/x86/activation_compute.h index 27752401949a30234a36854260793a33b4487eba..482684b0672c1ed7f0d571f852e134e92ddcaafa 100644 --- a/lite/kernels/x86/activation_compute.h +++ b/lite/kernels/x86/activation_compute.h @@ -13,8 +13,10 @@ // limitations under the License. #pragma once +#include #include #include +#include "lite/backends/x86/math/blas.h" #include "lite/core/kernel.h" #include "lite/core/op_lite.h" #include "lite/core/op_registry.h" @@ -115,6 +117,76 @@ class ReluCompute : public KernelLite { virtual ~ReluCompute() = default; }; +// tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x)) +template <typename T> +struct TanhFunctor : public BaseActivationFunctor<T> { + template <typename Device, typename X, typename Out> + void operator()(Device d, X x, Out out) const { + out.device(d) = x.tanh(); + } +}; + +template <typename T> +class TanhCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> { + public: + using param_t = operators::ActivationParam; + + void Run() override { + auto& param = *param_.get_mutable(); + + param.Out->template mutable_data<T>(); + Activate<TanhFunctor<T>>(param.X, param.Out); + } + + virtual ~TanhCompute() = default; +}; + +// gelu(x) = 0.5 * x * (1 + erf(x / sqrt(2))) +template <typename T> +struct GeluFunctor : public BaseActivationFunctor<T> { + template <typename Device, typename X, typename Out> + void operator()(Device d, X x, Out out) const { +// Because the execution or device context cannot be delivered here, it keeps +// the macro for NVCC.
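+// The MKLML branch below computes gelu(x) = 0.5 * x * (1 + erf(x / sqrt(2)))
+// in place: AXPY writes x * M_SQRT1_2 into the zeroed output buffer, VMERF
+// applies erf element-wise, the first scalar loop adds 1, VMUL multiplies by
+// x, and the final loop scales by 0.5. The Eigen branch in the #else computes
+// the same expression directly.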
+#if defined(PADDLE_WITH_MKLML) && !defined(_WIN32) && !defined(__APPLE__) && \ + !defined(__OSX__) && !defined(PADDLE_WITH_CUDA) + auto x_data = x.data(); + auto out_data = out.data(); + int n = std::min(x.size(), out.size()); + + std::memset(out_data, 0, n * sizeof(T)); + paddle::lite::x86::math::CBlas::AXPY( + n, static_cast(M_SQRT1_2), x_data, 1, out_data, 1); + paddle::lite::x86::math::CBlas::VMERF(n, out_data, out_data, VML_LA); + for (int i = 0; i < n; i++) { + out_data[i] += static_cast(1); + } + paddle::lite::x86::math::CBlas::VMUL(n, x_data, out_data, out_data); + for (int i = 0; i < n; i++) { + out_data[i] *= static_cast(0.5); + } +#else + auto temp = (x * static_cast(M_SQRT1_2)).erf(); + out.device(d) = x * static_cast(0.5) * (static_cast(1) + temp); +#endif + } +}; + +template +class GeluCompute : public KernelLite { + public: + using param_t = operators::ActivationParam; + + void Run() override { + auto& param = *param_.get_mutable(); + + param.Out->template mutable_data(); + Activate>(param.X, param.Out); + } + + virtual ~GeluCompute() = default; +}; + } // namespace x86 } // namespace kernels } // namespace lite diff --git a/lite/kernels/x86/gelu_compute_test.cc b/lite/kernels/x86/gelu_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..20479760e916613f14745d8b7316e094950f6a46 --- /dev/null +++ b/lite/kernels/x86/gelu_compute_test.cc @@ -0,0 +1,92 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include +#include "lite/core/op_registry.h" +#include "lite/kernels/x86/activation_compute.cc" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +TEST(gelu_x86, retrive_op) { + auto gelu = + KernelRegistry::Global().Create("gelu"); + ASSERT_FALSE(gelu.empty()); + ASSERT_TRUE(gelu.front()); +} + +TEST(gelu_x86, init) { + GeluCompute gelu; + ASSERT_EQ(gelu.precision(), PRECISION(kFloat)); + ASSERT_EQ(gelu.target(), TARGET(kX86)); +} + +TEST(gelu_x86, run_test) { + lite::Tensor x, out; + constexpr int batch_size = 1; + std::vector x_shape{batch_size, 3, 2, 2}; + x.Resize(lite::DDim(x_shape)); + std::vector out_shape{batch_size, 3, 2, 2}; + out.Resize(lite::DDim(out_shape)); + + auto x_data = x.mutable_data(); + auto out_data = out.mutable_data(); + + for (int64_t i = 0; i < x.dims().production(); i++) { + int sign = i % 2 == 0 ? 
1 : -1; + x_data[i] = static_cast(i * sign) * 0.8f; + } + // GeluCompute gelu; + GeluCompute gelu; + operators::ActivationParam param; + + param.X = &x; + param.Out = &out; + std::unique_ptr ctx(new KernelContext); + ctx->As(); + gelu.SetContext(std::move(ctx)); + gelu.SetParam(param); + gelu.Run(); + + LOG(INFO) << "output: "; + std::vector ref_data{0., + -0.169484, + 1.512321, + -0.019674, + 3.197801, + -0.000126719, + 4.8, + -0., + 6.4000001, + -0., + 8., + -0.}; + for (int i = 0; i < out.dims().production(); i++) { + LOG(INFO) << out_data[i]; + EXPECT_NEAR(out_data[i], ref_data[i], 1e-5); + } +} + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(gelu, kX86, kFloat, kNCHW, def); diff --git a/lite/kernels/x86/tanh_compute_test.cc b/lite/kernels/x86/tanh_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..fa65ca02df27642fc0114a075ad8a4249f3b70de --- /dev/null +++ b/lite/kernels/x86/tanh_compute_test.cc @@ -0,0 +1,92 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include +#include "lite/core/op_registry.h" +#include "lite/kernels/x86/activation_compute.cc" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +TEST(tanh_x86, retrive_op) { + auto tanh = + KernelRegistry::Global().Create("tanh"); + ASSERT_FALSE(tanh.empty()); + ASSERT_TRUE(tanh.front()); +} + +TEST(tanh_x86, init) { + TanhCompute tanh; + ASSERT_EQ(tanh.precision(), PRECISION(kFloat)); + ASSERT_EQ(tanh.target(), TARGET(kX86)); +} + +TEST(tanh_x86, run_test) { + lite::Tensor x, out; + constexpr int batch_size = 1; + std::vector x_shape{batch_size, 3, 2, 2}; + x.Resize(lite::DDim(x_shape)); + std::vector out_shape{batch_size, 3, 2, 2}; + out.Resize(lite::DDim(out_shape)); + + auto x_data = x.mutable_data(); + auto out_data = out.mutable_data(); + + for (int64_t i = 0; i < x.dims().production(); i++) { + int sign = i % 2 == 0 ? 
1 : -1; + x_data[i] = static_cast(i * sign) * 0.08f; + } + // TanhCompute tanh; + TanhCompute tanh; + operators::ActivationParam param; + + param.X = &x; + param.Out = &out; + std::unique_ptr ctx(new KernelContext); + ctx->As(); + tanh.SetContext(std::move(ctx)); + tanh.SetParam(param); + tanh.Run(); + + LOG(INFO) << "output: "; + std::vector ref_data{0., + -0.079829, + 0.158648, + -0.235495, + 0.309506, + -0.379949, + 0.446243, + -0.507977, + 0.564899, + -0.616909, + 0.664036, + -0.706419}; + for (int i = 0; i < out.dims().production(); i++) { + LOG(INFO) << out_data[i]; + EXPECT_NEAR(out_data[i], ref_data[i], 1e-5); + } +} + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(tanh, kX86, kFloat, kNCHW, def); diff --git a/lite/kernels/xpu/CMakeLists.txt b/lite/kernels/xpu/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..72c48ceab079bc65e4f2363a1702de52586733d6 --- /dev/null +++ b/lite/kernels/xpu/CMakeLists.txt @@ -0,0 +1,9 @@ + +if(NOT LITE_WITH_XPU) + return () +endif() + +add_kernel(graph_compute_xpu XPU basic SRCS graph_compute.cc DEPS ${lite_kernel_deps} xpu_runtime) +# lite_cc_test(test_graph_compute_xpu SRCS graph_compute_test.cc DEPS graph_compute_xpu) + +add_subdirectory(bridges) diff --git a/lite/kernels/xpu/bridges/CMakeLists.txt b/lite/kernels/xpu/bridges/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..a1f7b67be3b0b1798ea50daa6638873500786912 --- /dev/null +++ b/lite/kernels/xpu/bridges/CMakeLists.txt @@ -0,0 +1,29 @@ +lite_cc_library(xpu_bridge_registry SRCS registry.cc) + +set(xpu_bridge_deps xpu_bridge_registry xpu_builder op) + +lite_cc_library(xpu_bridge_act_op SRCS act_op.cc DEPS ${xpu_bridge_deps}) +lite_cc_library(xpu_bridge_conv_op SRCS conv_op.cc DEPS ${xpu_bridge_deps}) +lite_cc_library(xpu_bridge_elementwise_ops SRCS elementwise_ops.cc DEPS ${xpu_bridge_deps}) +lite_cc_library(xpu_bridge_pool_op SRCS pool_op.cc DEPS ${xpu_bridge_deps}) +lite_cc_library(xpu_bridge_softmax_op SRCS softmax_op.cc DEPS ${xpu_bridge_deps}) +lite_cc_library(xpu_bridge_mul_op SRCS mul_op.cc DEPS ${xpu_bridge_deps}) + +set(xpu_bridges + xpu_bridge_registry + xpu_bridge_act_op + xpu_bridge_conv_op + xpu_bridge_elementwise_ops + xpu_bridge_pool_op + xpu_bridge_softmax_op + xpu_bridge_mul_op + CACHE INTERNAL "xpu_bridges") + +set(xpu_bridge_test_deps ${xpu_bridges} ${xpu_kernels} ${ops}) + +lite_cc_test(test_xpu_bridge_act_op SRCS act_op_test.cc test_helper.cc DEPS ${xpu_bridge_test_deps}) +lite_cc_test(test_xpu_bridge_conv_op SRCS conv_op_test.cc test_helper.cc DEPS ${xpu_bridge_test_deps}) +lite_cc_test(test_xpu_bridge_elementwise_ops SRCS elementwise_ops_test.cc test_helper.cc DEPS ${xpu_bridge_test_deps}) +lite_cc_test(test_xpu_bridge_pool_op SRCS pool_op_test.cc test_helper.cc DEPS ${xpu_bridge_test_deps}) +lite_cc_test(test_xpu_bridge_softmax_op SRCS softmax_op_test.cc test_helper.cc DEPS ${xpu_bridge_test_deps}) +lite_cc_test(test_xpu_bridge_mul_op SRCS mul_op_test.cc test_helper.cc DEPS ${xpu_bridge_test_deps}) diff --git a/lite/kernels/xpu/bridges/act_op.cc b/lite/kernels/xpu/bridges/act_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..d8e11caa96fdbff3a853a192a8d16f2eccd96337 --- /dev/null +++ b/lite/kernels/xpu/bridges/act_op.cc @@ -0,0 +1,62 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/backends/xpu/builder.h" +#include "lite/kernels/xpu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { +namespace bridges { + +node_map_type ActConverter(const std::shared_ptr op, + graph_ctx_type* graph_ctx, + const node_map_type& input_nodes) { + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto unique_op_type = lite::xpu::UniqueName(op_type); + LOG(INFO) << "[XPU] Converting " + op_type + "..."; + + // check context + CHECK(graph_ctx != nullptr); + CHECK(graph_ctx->builder != nullptr); + CHECK(graph_ctx->params != nullptr); + + // create act node and set params from op + auto x_var_name = op_info->Input("X").front(); + CHECK(input_nodes.count(x_var_name)); + std::shared_ptr act_node = nullptr; + if (op_type == "relu") { + act_node = std::make_shared( + graph_ctx->builder->CreateRelu(*input_nodes.at(x_var_name))); + } else { + // TODO(hong19860320) supports more activation ops + LOG(FATAL) << "[XPU] Unsupported activation type " << op_type; + } + graph_ctx->builder->SetLayer(unique_op_type); + + // output converted nodes + node_map_type output_nodes; + output_nodes[op_info->Output("Out").front()] = act_node; + return output_nodes; +} + +} // namespace bridges +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_XPU_BRIDGE(relu, paddle::lite::kernels::xpu::bridges::ActConverter); diff --git a/lite/kernels/xpu/bridges/act_op_test.cc b/lite/kernels/xpu/bridges/act_op_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..1a3efab46e3c7caee08bf646a560a0ab9abcf5c7 --- /dev/null +++ b/lite/kernels/xpu/bridges/act_op_test.cc @@ -0,0 +1,102 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include "lite/core/op_registry.h" +#include "lite/kernels/xpu/bridges/registry.h" +#include "lite/kernels/xpu/bridges/test_helper.h" +#include "lite/operators/activation_ops.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { +namespace bridges { + +void relu_ref(const std::shared_ptr op) { + Scope* scope = op->scope(); + const OpInfo* op_info = op->op_info(); + auto x = scope->FindVar(op_info->Input("X").front())->GetMutable(); + auto out = + scope->FindVar(op_info->Output("Out").front())->GetMutable(); + auto x_data = x->data(); + auto out_data = out->mutable_data(); + DDim x_dims = x->dims(); + DDim out_dims = out->dims(); + CHECK_EQ(x_dims.production(), out_dims.production()); + for (int i = 0; i < out_dims.production(); i++) { + out_data[i] = std::max(0.f, x_data[i]); + } +} + +void test_relu(int bs, int ic, int ih, int iw) { + // prepare input&output variables + Scope scope; + std::string x_var_name("x"); + std::string out_var_name("out"); + std::string out_ref_var_name("out_ref"); + auto* x = scope.Var(x_var_name)->GetMutable(); + auto* out = scope.Var(out_var_name)->GetMutable(); + auto* out_ref = scope.Var(out_ref_var_name)->GetMutable(); + x->Resize({bs, ic, ih, iw}); + + // initialize input&output data + FillTensor(x); + + // initialize op desc + cpp::OpDesc opdesc; + opdesc.SetType("relu"); + opdesc.SetInput("X", {x_var_name}); + opdesc.SetOutput("Out", {out_var_name}); + + // create and convert op to XPU model, and run it on XPU + auto op = CreateOp(opdesc, &scope); + LauchOp(op, {x_var_name}, {out_var_name}); + out_ref->CopyDataFrom(*out); + + // execute reference implementation and save to output tensor + relu_ref(op); + + // compare results + auto* out_data = out->mutable_data(); + auto* out_ref_data = out_ref->mutable_data(); + for (int i = 0; i < out->dims().production(); i++) { + VLOG(5) << i; + EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-5); + } +} + +TEST(NPUBridges, relu) { + for (auto bs : {1, 3}) { + for (auto ic : {3, 4}) { + for (auto ih : {2, 5}) { + for (auto iw : {5, 9}) { + VLOG(3) << "bs: " << bs << " ic: " << ic << " ih: " << ih + << " iw: " << iw; + test_relu(bs, ic, ih, iw); + } + } + } + } +} + +} // namespace bridges +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_OP(relu); +USE_XPU_BRIDGE(relu); diff --git a/lite/kernels/xpu/bridges/conv_op.cc b/lite/kernels/xpu/bridges/conv_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..c7c2f0ca5f303555eaa74ea04dad27c9de70d89a --- /dev/null +++ b/lite/kernels/xpu/bridges/conv_op.cc @@ -0,0 +1,170 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
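+// ConvConverter below maps conv2d/depthwise_conv2d onto the XPU graph builder:
+// the filter (and optional bias) tensors are registered as constant params,
+// a Conv2D node is built from the op's strides/paddings/dilations/groups,
+// a per-channel bias is applied via CreateBiasAdd while an element-wise bias
+// falls back to a binary "add", and a Relu node is appended when fuse_relu is
+// set.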
+ +#include "lite/backends/xpu/builder.h" +#include "lite/kernels/xpu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { +namespace bridges { + +node_map_type ConvConverter(const std::shared_ptr op, + graph_ctx_type* graph_ctx, + const node_map_type& input_nodes) { + auto scope = op->scope(); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto unique_op_type = lite::xpu::UniqueName(op_type); + LOG(INFO) << "[XPU] Converting " << op_type << "... "; + + // get input, filter and op attributes + auto input_var_name = op_info->Input("Input").front(); + auto input = scope->FindVar(input_var_name)->GetMutable(); + auto input_dims = input->dims(); + auto filter_var_name = op_info->Input("Filter").front(); + auto filter = scope->FindVar(filter_var_name)->GetMutable(); + auto filter_dims = filter->dims(); + auto bs = input_dims[0]; + auto oc = filter_dims[0]; + CHECK_EQ(input_dims.size(), 4); + CHECK_EQ(filter_dims.size(), 4); + auto strides = op_info->GetAttr>("strides"); + auto paddings = op_info->GetAttr>("paddings"); + auto groups = op_info->GetAttr("groups"); + auto dilations = op_info->GetAttr>("dilations"); + auto fuse_relu = op_info->GetAttr("fuse_relu"); + CHECK_EQ(strides.size(), 2); + CHECK_EQ(paddings.size(), 2); + CHECK_EQ(dilations.size(), 2); + std::vector output_shape({bs, oc}); + for (size_t i = 0; i < 2; i++) { + const int dkernel = dilations[i] * (filter_dims[2 + i] - 1) + 1; + output_shape.push_back( + (input_dims[i + 2] + 2 * paddings[i] - dkernel) / strides[i] + 1); + } + DDim output_dims(output_shape); + + // check context + CHECK(graph_ctx != nullptr); + CHECK(graph_ctx->builder != nullptr); + CHECK(graph_ctx->params != nullptr); + + // create filter node + CHECK(!input_nodes.count(filter_var_name)); + auto filter_const_node = std::make_shared( + graph_ctx->builder->CreateTensor(filter_var_name, + lite::xpu::CvtShape(filter_dims), + ::xtcl::Float(32))); + auto filter_const_tensor = lite::xpu::CvtTensor(filter); + graph_ctx->params->emplace( + std::make_pair(filter_var_name, *filter_const_tensor)); + + // create conv node and set input, filter, bias nodes and attributes + auto conv_attrs = xtcl::make_node(); + conv_attrs->strides = std::move(lite::xpu::CvtShape(strides)); + conv_attrs->padding = std::move(lite::xpu::CvtShape(paddings)); + conv_attrs->dilation = std::move(lite::xpu::CvtShape(dilations)); + conv_attrs->groups = groups; + // conv_attrs->channels = nullptr; + conv_attrs->kernel_size = std::move(xtcl::Array(nullptr)); + conv_attrs->data_layout = "NCHW"; + conv_attrs->kernel_layout = "OIHW"; + conv_attrs->out_layout = ""; + // conv_attrs->out_dtype = ""; + CHECK(input_nodes.count(input_var_name)); + auto conv_node = + std::make_shared(graph_ctx->builder->CreateConv2D( + *input_nodes.at(input_var_name), *filter_const_node, conv_attrs)); + graph_ctx->builder->SetLayer(unique_op_type); + + // create bias node if has bias + // supports the bias nodes with the following dimensions + // 0: {oc} + // 1: {1, oc, oh, ow} + // 2: {n, oc, oh, ow} + if (lite::xpu::HasInputArg(op_info, scope, "Bias")) { + auto bias_var_name = op_info->Input("Bias").front(); + auto* bias = scope->FindVar(bias_var_name)->GetMutable(); + auto bias_dims = bias->dims(); + auto bias_data_size = bias_dims.production(); + auto output_data_size = output_dims.production(); + std::vector bias_shape; + bool is_channel_bias = false; + if (bias_data_size == oc) { + // 0: {oc} + bias_shape = {oc}; + is_channel_bias = true; + } else if 
(bias_data_size == output_data_size / bs) { + // 1: {1, oc, oh, ow} + bias_shape = {1, output_dims[1], output_dims[2], output_dims[3]}; + } else if (bias_data_size == output_data_size) { + // 2: {n, oc, oh, ow} + bias_shape = output_dims.Vectorize(); + } else { + LOG(ERROR) << "bias dimension " << bias_dims + << " isn't supported in conv2d Op when output dimension is " + << output_dims; + } + std::shared_ptr bias_node = nullptr; + if (input_nodes.count(bias_var_name)) { + // bias node from input node + bias_node = input_nodes.at(bias_var_name); + } else { + // bias node with const tensor + auto bias_const_node = std::make_shared( + graph_ctx->builder->CreateTensor(bias_var_name, + lite::xpu::CvtShape(bias_shape), + ::xtcl::Float(32))); + auto bias_const_tensor = lite::xpu::CvtTensor(bias, bias_shape); + graph_ctx->params->emplace( + std::make_pair(bias_var_name, *bias_const_tensor)); + bias_node = bias_const_node; + } + std::shared_ptr add_node = nullptr; + if (is_channel_bias) { + add_node = std::make_shared( + graph_ctx->builder->CreateBiasAdd(*conv_node, *bias_node, 1)); + } else { + add_node = std::make_shared( + graph_ctx->builder->CreateBinaryOp("add", *conv_node, *bias_node)); + } + graph_ctx->builder->SetLayer(unique_op_type + "/add"); + conv_node = add_node; + } + + // output converted nodes + node_map_type output_nodes; + if (fuse_relu) { + // append relu node if fuse_relu is true + auto relu_node = std::make_shared( + graph_ctx->builder->CreateRelu(*conv_node)); + graph_ctx->builder->SetLayer(unique_op_type + "/relu"); + output_nodes[op_info->Output("Output").front()] = relu_node; + } else { + output_nodes[op_info->Output("Output").front()] = conv_node; + } + return output_nodes; +} + +} // namespace bridges +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_XPU_BRIDGE(conv2d, paddle::lite::kernels::xpu::bridges::ConvConverter); +REGISTER_XPU_BRIDGE(depthwise_conv2d, + paddle::lite::kernels::xpu::bridges::ConvConverter); diff --git a/lite/kernels/xpu/bridges/conv_op_test.cc b/lite/kernels/xpu/bridges/conv_op_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..ebdb67bd0d2801a9036696f52790f7104279b0cb --- /dev/null +++ b/lite/kernels/xpu/bridges/conv_op_test.cc @@ -0,0 +1,281 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/operators/conv_op.h" +#include +#include +#include "lite/core/op_registry.h" +#include "lite/kernels/xpu/bridges/registry.h" +#include "lite/kernels/xpu/bridges/test_helper.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { +namespace bridges { + +void conv_ref(const std::shared_ptr op) { + Scope* scope = op->scope(); + const OpInfo* op_info = op->op_info(); + auto input = + scope->FindVar(op_info->Input("Input").front())->GetMutable(); + auto filter = + scope->FindVar(op_info->Input("Filter").front())->GetMutable(); + auto output = + scope->FindVar(op_info->Output("Output").front())->GetMutable(); + std::vector strides = + op_info->GetAttr>("strides"); + std::vector paddings = + op_info->GetAttr>("paddings"); + int32_t groups = op_info->GetAttr("groups"); + std::vector dilations = + op_info->GetAttr>("dilations"); + bool fuse_relu = op_info->GetAttr("fuse_relu"); + auto input_dims = input->dims(); + auto filter_dims = filter->dims(); + auto output_dims = output->dims(); + auto input_data = input->mutable_data(); + auto filter_data = filter->mutable_data(); + auto output_data = output->mutable_data(); + int kernel_w = filter_dims[3]; + int kernel_h = filter_dims[2]; + int stride_w = strides[1]; + int stride_h = strides[0]; + int dila_w = dilations[1]; + int dila_h = dilations[0]; + int pad_w = paddings[1]; + int pad_h = paddings[0]; + int batch_size = input_dims[0]; + int in_ch_size = input_dims[1]; + int in_h = input_dims[2]; + int in_w = input_dims[3]; + int out_ch_size = output_dims[1]; + int out_h = output_dims[2]; + int out_w = output_dims[3]; + int out_c_group = out_ch_size / groups; + int in_c_group = in_ch_size / groups; + Tensor* bias = nullptr; + float* bias_data = nullptr; + bool is_channel_bias = false; + if (op_info->HasInput("Bias")) { + auto bias_var_names = op_info->Input("Bias"); + if (bias_var_names.size() > 0) { + auto bias_var_name = bias_var_names.front(); + bias = scope->FindVar(bias_var_name)->GetMutable(); + auto bias_dims = bias->dims(); + is_channel_bias = bias_dims.production() == out_ch_size; + bias_data = bias->mutable_data(); + } + } + for (int n = 0; n < batch_size; ++n) { + for (int g = 0; g < groups; ++g) { + for (int oc = 0; oc < out_c_group; ++oc) { + for (int oh = 0; oh < out_h; ++oh) { + for (int ow = 0; ow < out_w; ++ow) { + int out_idx = n * groups * out_c_group * out_h * out_w + + g * out_c_group * out_h * out_w + oc * out_h * out_w + + oh * out_w + ow; + float out_value = + bias_data != nullptr + ? (is_channel_bias ? bias_data[g * out_c_group + oc] + : bias_data[out_idx]) + : 0; + // + out_value *= beta; + for (int ic = 0; ic < in_c_group; ++ic) { + for (int kh = 0; kh < kernel_h; ++kh) { + for (int kw = 0; kw < kernel_w; ++kw) { + int iw = ow * stride_w - pad_w + kw * (dila_w); + int ih = oh * stride_h - pad_h + kh * (dila_h); + if (iw < 0 || iw >= in_w) continue; + if (ih < 0 || ih >= in_h) continue; + int in_idx = n * in_ch_size * in_h * in_w + + g * in_c_group * in_h * in_w + ic * in_h * in_w + + ih * in_w + iw; + int filter_idx = + g * out_c_group * in_c_group * kernel_h * kernel_w + + oc * in_c_group * kernel_h * kernel_w + + ic * kernel_h * kernel_w + kh * kernel_w + kw; + out_value += input_data[in_idx] * filter_data[filter_idx]; + } + } + } + if (fuse_relu) { + out_value = out_value > 0 ? 
out_value : 0; + } + output_data[out_idx] = out_value; + } + } + } + } + } +} + +void test_conv(int bs, + int ic, + int oc, + int ih, + int iw, + bool has_bias, + bool is_channel_bias, + bool fuse_relu, + bool depthwise, + int dilation, + int stride, + int padding, + int kernel) { + // prepare input&output variables + Scope scope; + std::string input_var_name("input"); + std::string filter_var_name("filter"); + std::string bias_var_name("bias"); + std::string output_var_name("output"); + std::string output_ref_var_name("output_ref"); + auto* input = scope.Var(input_var_name)->GetMutable(); + auto* filter = scope.Var(filter_var_name)->GetMutable(); + auto* bias = scope.Var(bias_var_name)->GetMutable(); + auto* output = scope.Var(output_var_name)->GetMutable(); + auto* output_ref = scope.Var(output_ref_var_name)->GetMutable(); + + // get group size and input&filter shape + int groups = 1; + if (depthwise) { // depthwise convolution ? + groups = oc = ic; + } + std::vector input_shape = {bs, ic, ih, iw}; + std::vector filter_shape = {oc, ic / groups, kernel, kernel}; + std::vector output_shape({bs, oc}); + for (size_t i = 0; i < 2; i++) { + const int dkernel = dilation * (kernel - 1) + 1; + int output_size = (input_shape[i + 2] + 2 * padding - dkernel) / stride + 1; + output_shape.push_back(output_size); + } + input->Resize(input_shape); + filter->Resize(filter_shape); + + // initialize input&output data + FillTensor(input); + FillTensor(filter); + + // initialize op desc + cpp::OpDesc opdesc; + opdesc.SetType(depthwise ? "depthwise_conv2d" : "conv2d"); + opdesc.SetInput("Input", {input_var_name}); + opdesc.SetInput("Filter", {filter_var_name}); + opdesc.SetOutput("Output", {output_var_name}); + opdesc.SetAttr("dilations", std::vector({dilation, dilation})); + opdesc.SetAttr("strides", std::vector({stride, stride})); + opdesc.SetAttr("paddings", std::vector({padding, padding})); + opdesc.SetAttr("groups", groups); + opdesc.SetAttr("fuse_relu", static_cast(fuse_relu)); + if (has_bias) { + if (is_channel_bias) { + bias->Resize({1, oc, 1, 1}); + } else { + bias->Resize({1, output_shape[1], output_shape[2], output_shape[3]}); + } + FillTensor(bias); + opdesc.SetInput("Bias", {bias_var_name}); + } + + // create and convert op to NPU model, then run it on NPU + auto op = CreateOp(opdesc, &scope); + LauchOp(op, {input_var_name}, {output_var_name}); + output_ref->CopyDataFrom(*output); + + // execute reference implementation and save to output tensor('out') + conv_ref(op); + + // compare results + auto* output_data = output->mutable_data(); + auto* output_ref_data = output_ref->mutable_data(); + for (int i = 0; i < output->dims().production(); i++) { + VLOG(5) << i; + EXPECT_NEAR(output_data[i], output_ref_data[i], 1e-5); + } +} + +TEST(NPUBridges, conv) { +#if 0 + for (auto bs : {1, 2}) { + for (auto ic : {3, 6}) { + for (auto oc : {6, 9}) { + for (auto ih : {14, 28}) { + for (auto iw : {14, 28}) { + for (auto has_bias : {false, true}) { + for (auto is_channel_bias : {false, true}) { + for (auto fuse_relu : {false, true}) { + for (auto depthwise : {false, true}) { + for (auto dilation : {1, 2}) { + for (auto stride : {1, 2}) { + for (auto kernel : {1, 3, 5}) { + std::vector paddings = {kernel / 2}; + if (kernel / 2 != 0) { + paddings.push_back(0); + } + for (auto padding : paddings) { + VLOG(3) << "bs: " << bs << " ic: " << ic + << " oc: " << oc << " ih: " << ih + << " iw: " << iw + << " has_bias: " << has_bias + << " is_channel_bias: " << is_channel_bias + << " fuse_relu: " << fuse_relu + << " 
depthwise: " << depthwise + << " dilation: " << dilation + << " stride: " << stride + << " padding: " << padding + << " kernel: " << kernel; + test_conv(bs, + ic, + oc, + ih, + iw, + has_bias, + is_channel_bias, + fuse_relu, + depthwise, + dilation, + stride, + padding, + kernel); + } + } + } + } + } + } + } + } + } + } + } + } + } +#else + test_conv(1, 1, 1, 4, 4, false, false, false, false, 1, 1, 1, 3); + test_conv(1, 1, 1, 4, 4, true, true, false, false, 1, 1, 1, 3); + test_conv(1, 1, 1, 4, 4, true, false, false, false, 1, 1, 1, 3); +#endif +} + +} // namespace bridges +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_OP(conv2d); +USE_XPU_BRIDGE(conv2d); + +USE_LITE_OP(depthwise_conv2d); +USE_XPU_BRIDGE(depthwise_conv2d); diff --git a/lite/kernels/xpu/bridges/elementwise_ops.cc b/lite/kernels/xpu/bridges/elementwise_ops.cc new file mode 100644 index 0000000000000000000000000000000000000000..70906b5ec9fb155efe3edcb885926a25936f41be --- /dev/null +++ b/lite/kernels/xpu/bridges/elementwise_ops.cc @@ -0,0 +1,96 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/backends/xpu/builder.h" +#include "lite/kernels/xpu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { +namespace bridges { + +node_map_type ElementwiseConverter(const std::shared_ptr op, + graph_ctx_type* graph_ctx, + const node_map_type& input_nodes) { + auto scope = op->scope(); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto unique_op_type = lite::xpu::UniqueName(op_type); + LOG(INFO) << "[XPU] Converting " + op_type + "..."; + + // check context + CHECK(graph_ctx != nullptr); + CHECK(graph_ctx->builder != nullptr); + CHECK(graph_ctx->params != nullptr); + + // get input, and attributes + auto x_var_name = op_info->Input("X").front(); + auto y_var_name = op_info->Input("Y").front(); + auto axis = op_info->GetAttr("axis"); + auto x_tensor = scope->FindMutableTensor(x_var_name); + auto y_tensor = scope->FindMutableTensor(y_var_name); + auto x_dims = x_tensor->dims(); + auto y_dims = y_tensor->dims(); + + // create x and y node + std::shared_ptr x_node = nullptr; + if (input_nodes.count(x_var_name)) { + x_node = input_nodes.at(x_var_name); + } else { + x_node = std::make_shared(graph_ctx->builder->CreateTensor( + x_var_name, lite::xpu::CvtShape(x_dims), ::xtcl::Float(32))); + auto x_const_tensor = lite::xpu::CvtTensor(x_tensor); + graph_ctx->params->emplace(std::make_pair(x_var_name, *x_const_tensor)); + } + + std::shared_ptr y_node = nullptr; + if (input_nodes.count(y_var_name)) { + y_node = input_nodes.at(y_var_name); + } else { + y_node = std::make_shared(graph_ctx->builder->CreateTensor( + y_var_name, lite::xpu::CvtShape(y_dims), ::xtcl::Float(32))); + auto y_const_tensor = lite::xpu::CvtTensor(y_tensor); + graph_ctx->params->emplace(std::make_pair(y_var_name, *y_const_tensor)); + } + + // create elementwise node and 
set input, attributes + std::shared_ptr elementwise_node = nullptr; + if (y_dims.size() == 1) { + elementwise_node = std::make_shared( + graph_ctx->builder->CreateBiasAdd(*x_node, *y_node, axis)); + } else if (x_dims.size() == y_dims.size()) { + elementwise_node = std::make_shared( + graph_ctx->builder->CreateBinaryOp("add", *x_node, *y_node)); + } else { + LOG(ERROR) << "XPU elementwise_add only supports y of one dimension, or x " + "and y of the same dimension. But received x's dimension: " + << x_dims << ", y's dimension: " << y_dims << ", axis: " << axis; + } + graph_ctx->builder->SetLayer(unique_op_type); + + // output converted nodes + node_map_type output_nodes; + output_nodes[op_info->Output("Out").front()] = elementwise_node; + return output_nodes; +} + +} // namespace bridges +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_XPU_BRIDGE(elementwise_add, + paddle::lite::kernels::xpu::bridges::ElementwiseConverter); diff --git a/lite/kernels/xpu/bridges/elementwise_ops_test.cc b/lite/kernels/xpu/bridges/elementwise_ops_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..2abda822e3ae380ad376e92db99b5ad204a2a2a4 --- /dev/null +++ b/lite/kernels/xpu/bridges/elementwise_ops_test.cc @@ -0,0 +1,188 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/operators/elementwise_ops.h" +#include +#include +#include "lite/core/op_registry.h" +#include "lite/kernels/xpu/bridges/registry.h" +#include "lite/kernels/xpu/bridges/test_helper.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { +namespace bridges { + +template <typename dtype> +void elementwise_add_ref(const std::shared_ptr op) { + Scope* scope = op->scope(); + const OpInfo* op_info = op->op_info(); + auto x = scope->FindVar(op_info->Input("X").front())->GetMutable(); + auto y = scope->FindVar(op_info->Input("Y").front())->GetMutable(); + auto out = + scope->FindVar(op_info->Output("Out").front())->GetMutable(); + + auto x_data = x->data(); + auto y_data = y->data(); + dtype* out_data = out->mutable_data(); + + auto x_dims = x->dims(); + auto y_dims = y->dims(); + int axis = op_info->GetAttr("axis"); + + if (axis < 0) { + axis = x_dims.size() - y_dims.size(); + } + int batch = 1; + int channels = 1; + int num = 1; + for (int i = 0; i < axis; ++i) { + batch *= x_dims[i]; + } + for (int i = 0; i < y_dims.size(); ++i) { + channels *= y_dims[i]; + } + for (int i = y_dims.size() + axis; i < x_dims.size(); ++i) { + num *= x_dims[i]; + } + // do elementwise add/sub/max...
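+  // Note: elt_type is fixed to "add" just below, so only the add branch runs
+  // in this test; the sub/mul/max branches are kept as reference for other
+  // elementwise ops. y is broadcast over the `channels` extent computed from
+  // `axis` above.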
+ std::string elt_type = "add"; + if (elt_type == "add") { + for (int i = 0; i < batch; ++i) { + for (int j = 0; j < channels; ++j) { + int offset = (i * channels + j) * num; + const dtype* din_ptr = x_data + offset; + const dtype diny_data = y_data[j]; + dtype* dout_ptr = out_data + offset; + for (int k = 0; k < num; ++k) { + *dout_ptr = *din_ptr + diny_data; + dout_ptr++; + din_ptr++; + } + } + } + } else if (elt_type == "sub") { + for (int i = 0; i < batch; ++i) { + for (int j = 0; j < channels; ++j) { + int offset = (i * channels + j) * num; + const dtype* din_ptr = x_data + offset; + const dtype diny_data = y_data[j]; + dtype* dout_ptr = out_data + offset; + for (int k = 0; k < num; ++k) { + *dout_ptr = *din_ptr - diny_data; + dout_ptr++; + din_ptr++; + } + } + } + } else if (elt_type == "mul") { + for (int i = 0; i < batch; ++i) { + for (int j = 0; j < channels; ++j) { + int offset = (i * channels + j) * num; + const dtype* din_ptr = x_data + offset; + const dtype diny_data = y_data[j]; + dtype* dout_ptr = out_data + offset; + for (int k = 0; k < num; ++k) { + *dout_ptr = *din_ptr * diny_data; + dout_ptr++; + din_ptr++; + } + } + } + } else if (elt_type == "max") { + for (int i = 0; i < batch; ++i) { + for (int j = 0; j < channels; ++j) { + int offset = (i * channels + j) * num; + const dtype* din_ptr = x_data + offset; + const dtype diny_data = y_data[j]; + dtype* dout_ptr = out_data + offset; + for (int k = 0; k < num; ++k) { + *dout_ptr = std::max(*din_ptr, diny_data); + dout_ptr++; + din_ptr++; + } + } + } + } else { + LOG(FATAL) << "unsupported Elementwise type: " << elt_type; + } +} + +void test_elementwise_add(std::vector x_dims, + std::vector y_dims, + int axis) { + // prepare input&output variables + Scope scope; + std::string x_var_name = "x"; + std::string y_var_name = "y"; + std::string out_var_name = "out"; + std::string out_ref_var_name = "out_ref"; + auto* x = scope.Var(x_var_name)->GetMutable(); + auto* y = scope.Var(y_var_name)->GetMutable(); + auto* out = scope.Var(out_var_name)->GetMutable(); + auto* out_ref = scope.Var(out_ref_var_name)->GetMutable(); + x->Resize(x_dims); + if (y_dims.size() == 0) { + y->Resize(x_dims); + } else { + y->Resize(y_dims); + } + + // initialize input&output data + FillTensor(x); + FillTensor(y); + + // initialize op desc + cpp::OpDesc opdesc; + opdesc.SetType("elementwise_add"); + opdesc.SetInput("X", {x_var_name}); + opdesc.SetInput("Y", {y_var_name}); + opdesc.SetOutput("Out", {out_var_name}); + opdesc.SetAttr("axis", axis); + + // create and convert op to XPU model, then run it on XPU + auto op = CreateOp(opdesc, &scope); + LauchOp(op, {x_var_name, y_var_name}, {out_var_name}); + out_ref->CopyDataFrom(*out); + + // execute reference implementation and save to output tensor + elementwise_add_ref(op); + + // compare results + auto* out_data = out->mutable_data(); + auto* out_ref_data = out_ref->mutable_data(); + for (int i = 0; i < out->dims().production(); i++) { + EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-5); + } +} + +// xpu's bias_add only support y with one dimension +TEST(XPUBridges, elementwise_add) { + test_elementwise_add({1, 2, 3, 4}, {1}, 0); + test_elementwise_add({1, 2, 3, 4}, {2}, 1); + test_elementwise_add({2, 2, 3, 4}, {3}, 2); + test_elementwise_add({2, 2, 3, 4}, {4}, 3); + test_elementwise_add({2, 2, 3, 4}, {4}, -1); + test_elementwise_add({2, 2, 3, 4}, {}, -1); +} + +} // namespace bridges +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_OP(elementwise_add); 
+USE_XPU_BRIDGE(elementwise_add); diff --git a/lite/kernels/xpu/bridges/mul_op.cc b/lite/kernels/xpu/bridges/mul_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..edf44f78bbfb54cf4316d3b9d7d9be2a121669d7 --- /dev/null +++ b/lite/kernels/xpu/bridges/mul_op.cc @@ -0,0 +1,97 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/backends/xpu/builder.h" +#include "lite/kernels/xpu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { +namespace bridges { + +node_map_type MulConverter(const std::shared_ptr op, + graph_ctx_type* graph_ctx, + const node_map_type& input_nodes) { + auto scope = op->scope(); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto unique_op_type = lite::xpu::UniqueName(op_type); + LOG(INFO) << "[XPU] Converting " + op_type + "..."; + + // check context + CHECK(graph_ctx != nullptr); + CHECK(graph_ctx->builder != nullptr); + CHECK(graph_ctx->params != nullptr); + + // get input and attributes + auto x_var_name = op_info->Input("X").front(); + auto y_var_name = op_info->Input("Y").front(); + auto y_tensor = scope->FindMutableTensor(y_var_name); + auto y_dims = y_tensor->dims(); + CHECK_EQ(y_dims.size(), 2) << "xpu now only supports y_dims.size() == 2"; + + auto x_num_col_dims = op_info->GetAttr<int>("x_num_col_dims"); + CHECK_EQ(x_num_col_dims, 1) << "xpu now only supports x_num_col_dims == 1"; + auto y_num_col_dims = op_info->GetAttr<int>("y_num_col_dims"); + CHECK_EQ(y_num_col_dims, 1) << "xpu now only supports y_num_col_dims == 1"; + + // create x node + std::shared_ptr x_node = nullptr; + x_node = std::make_shared( + graph_ctx->builder->CreateBatchFlatten(*input_nodes.at(x_var_name))); + graph_ctx->builder->SetLayer(unique_op_type + "/X"); + + // transpose y + DDimLite y_dims_t(std::vector<int64_t>{1, 1}); + y_dims_t[0] = y_dims[1]; + y_dims_t[1] = y_dims[0]; + auto y_var_name_t = unique_op_type + "/Y"; + Tensor* y_tensor_t = new Tensor(); + y_tensor_t->Resize(y_dims_t); + auto y_data_t = y_tensor_t->mutable_data<float>(); + auto y_data = y_tensor->mutable_data<float>(); + for (int i = 0; i < y_dims_t[0]; i++) { + for (int j = 0; j < y_dims_t[1]; j++) { + y_data_t[i * y_dims_t[1] + j] = y_data[j * y_dims_t[0] + i]; + } + } + + // create y node + std::shared_ptr y_const_node = nullptr; + y_const_node = std::make_shared(graph_ctx->builder->CreateTensor( + y_var_name_t, lite::xpu::CvtShape(y_dims_t), ::xtcl::Float(32))); + auto y_const_tensor = lite::xpu::CvtTensor(y_tensor_t); + graph_ctx->params->emplace(std::make_pair(y_var_name_t, *y_const_tensor)); + delete y_tensor_t; + + // create mul node and set params from op + std::shared_ptr mul_node = nullptr; + mul_node = std::make_shared(graph_ctx->builder->CreateDense( + *x_node, *y_const_node, static_cast(y_dims[1]))); + graph_ctx->builder->SetLayer(unique_op_type); + + // output converted nodes + node_map_type output_nodes; + output_nodes[op_info->Output("Out").front()] = 
mul_node; + return output_nodes; +} + +} // namespace bridges +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_XPU_BRIDGE(mul, paddle::lite::kernels::xpu::bridges::MulConverter); diff --git a/lite/kernels/xpu/bridges/mul_op_test.cc b/lite/kernels/xpu/bridges/mul_op_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..cd439b68cb7286a919a8fce97371443f53ed40db --- /dev/null +++ b/lite/kernels/xpu/bridges/mul_op_test.cc @@ -0,0 +1,113 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/operators/mul_op.h" +#include +#include "lite/core/op_registry.h" +#include "lite/kernels/xpu/bridges/registry.h" +#include "lite/kernels/xpu/bridges/test_helper.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { +namespace bridges { + +void mul_ref(const std::shared_ptr op) { + Scope* scope = op->scope(); + const OpInfo* op_info = op->op_info(); + auto x = scope->FindVar(op_info->Input("X").front())->GetMutable(); + auto y = scope->FindVar(op_info->Input("Y").front())->GetMutable(); + auto out = + scope->FindVar(op_info->Output("Out").front())->GetMutable(); + int32_t x_num_col_dims = op_info->GetAttr("x_num_col_dims"); + int32_t y_num_col_dims = op_info->GetAttr("y_num_col_dims"); + auto x_data = x->mutable_data(); + auto y_data = y->mutable_data(); + auto out_data = out->mutable_data(); + auto x_mat_dims = x->dims().Flatten2D(x_num_col_dims); + auto y_mat_dims = y->dims().Flatten2D(y_num_col_dims); + CHECK_EQ(x_mat_dims[1], y_mat_dims[0]); + const int M = x_mat_dims[0]; + const int K = x_mat_dims[1]; + const int N = y_mat_dims[1]; + for (int m = 0; m < M; ++m) { + for (int n = 0; n < N; ++n) { + out_data[m * N + n] = 0; + for (int k = 0; k < K; ++k) { + out_data[m * N + n] += x_data[m * K + k] * y_data[k * N + n]; + } + } + } +} + +void test_mul(const std::vector& x_shape, + const std::vector& y_shape, + int x_num_col_dims, + int y_num_col_dims) { + Scope scope; + std::string x_var_name("X"); + std::string y_var_name("Y"); + std::string out_var_name("Out"); + std::string out_ref_var_name("out_ref"); + auto* x = scope.Var(x_var_name)->GetMutable(); + auto* y = scope.Var(y_var_name)->GetMutable(); + auto* out = scope.Var(out_var_name)->GetMutable(); + auto* out_ref = scope.Var(out_ref_var_name)->GetMutable(); + x->Resize(x_shape); + y->Resize(y_shape); + + FillTensor(x); + FillTensor(y); + + // create mul op + cpp::OpDesc mul_op_desc; + mul_op_desc.SetType("mul"); + mul_op_desc.SetInput("X", {x_var_name}); + mul_op_desc.SetInput("Y", {y_var_name}); + mul_op_desc.SetOutput("Out", {out_var_name}); + mul_op_desc.SetAttr("x_num_col_dims", static_cast(x_num_col_dims)); + mul_op_desc.SetAttr("y_num_col_dims", static_cast(y_num_col_dims)); + + auto mul_op = CreateOp(mul_op_desc, &scope); + LauchOp(mul_op, {x_var_name}, {out_var_name}); + out_ref->CopyDataFrom(*out); + + mul_ref(mul_op); + + // compare results + auto* out_data 
= out->mutable_data(); + auto* out_ref_data = out_ref->mutable_data(); + for (int i = 0; i < out->dims().production(); i++) { + EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-5); + } +} + +TEST(XPUBridges, mul) { + test_mul({1, 2, 3, 4}, {24, 2}, 1, 1); + test_mul({2, 2, 3, 4}, {24, 2}, 1, 1); + test_mul({2, 7}, {7, 3}, 1, 1); + // test_mul({1, 8, 8, 1}, {1, 8, 2, 2}, 2, 2); + // test_mul({1, 5, 5, 1}, {1, 5, 7, 7}, 2, 2); + // test_mul({1, 4, 1, 1}, {4, 8}, 1, 1); +} + +} // namespace bridges +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_OP(mul); +USE_XPU_BRIDGE(mul); diff --git a/lite/kernels/xpu/bridges/paddle_use_xpu_bridges.h b/lite/kernels/xpu/bridges/paddle_use_xpu_bridges.h new file mode 100644 index 0000000000000000000000000000000000000000..27e936eaaa125f26b0bdab43f5c38d60769cfd88 --- /dev/null +++ b/lite/kernels/xpu/bridges/paddle_use_xpu_bridges.h @@ -0,0 +1,25 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "lite/kernels/xpu/bridges/registry.h" + +USE_XPU_BRIDGE(relu); +USE_XPU_BRIDGE(conv2d); +USE_XPU_BRIDGE(depthwise_conv2d); +USE_XPU_BRIDGE(elementwise_add); +USE_XPU_BRIDGE(pool2d); +USE_XPU_BRIDGE(softmax); +USE_XPU_BRIDGE(mul); diff --git a/lite/kernels/xpu/bridges/pool_op.cc b/lite/kernels/xpu/bridges/pool_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..fbc6a9919c446508afa5a3b8a1c35352f9b8ecfa --- /dev/null +++ b/lite/kernels/xpu/bridges/pool_op.cc @@ -0,0 +1,97 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/backends/xpu/builder.h" +#include "lite/kernels/xpu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { +namespace bridges { + +node_map_type PoolConverter(const std::shared_ptr op, + graph_ctx_type* graph_ctx, + const node_map_type& input_nodes) { + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto unique_op_type = lite::xpu::UniqueName(op_type); + LOG(INFO) << "[XPU] Converting " + op_type + "..."; + + // check context + CHECK(graph_ctx != nullptr); + CHECK(graph_ctx->builder != nullptr); + CHECK(graph_ctx->params != nullptr); + + // get input, and attributes + auto x_var_name = op_info->Input("X").front(); + auto pooling_type = op_info->GetAttr("pooling_type"); + auto ceil_mode = op_info->GetAttr("ceil_mode"); + auto paddings = op_info->GetAttr>("paddings"); + auto global_pooling = op_info->GetAttr("global_pooling"); + auto ksize = op_info->GetAttr>("ksize"); + auto strides = op_info->GetAttr>("strides"); + auto exclusive = op_info->GetAttr("exclusive"); + + // create pool node and set params from op + CHECK(input_nodes.count(x_var_name)); + std::shared_ptr pool_node = nullptr; + if (pooling_type == "max") { + if (global_pooling) { + pool_node = std::make_shared( + graph_ctx->builder->CreateGlobalMaxPool2D( + *input_nodes.at(x_var_name))); + } else { + pool_node = std::make_shared( + graph_ctx->builder->CreateMaxPool2D(*input_nodes.at(x_var_name), + lite::xpu::CvtShape(ksize), + lite::xpu::CvtShape(strides), + lite::xpu::CvtShape(paddings), + "NCHW", + ceil_mode)); + } + } else if (pooling_type == "avg") { + if (global_pooling) { + pool_node = std::make_shared( + graph_ctx->builder->CreateGlobalAvgPool2D( + *input_nodes.at(x_var_name))); + } else { + pool_node = std::make_shared( + // !exclusive ---> count_include_pad + graph_ctx->builder->CreateAvgPool2D(*input_nodes.at(x_var_name), + lite::xpu::CvtShape(ksize), + lite::xpu::CvtShape(strides), + lite::xpu::CvtShape(paddings), + "NCHW", + ceil_mode, + !exclusive)); + } + } else { + LOG(FATAL) << "Unsupported pooling type: " << pooling_type; + } + graph_ctx->builder->SetLayer(unique_op_type); + + // output converted nodes + node_map_type output_nodes; + output_nodes[op_info->Output("Out").front()] = pool_node; + return output_nodes; +} + +} // namespace bridges +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_XPU_BRIDGE(pool2d, paddle::lite::kernels::xpu::bridges::PoolConverter); diff --git a/lite/kernels/xpu/bridges/pool_op_test.cc b/lite/kernels/xpu/bridges/pool_op_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..ed5f922d59b5ca5e387076c9a533c4b4c251cc87 --- /dev/null +++ b/lite/kernels/xpu/bridges/pool_op_test.cc @@ -0,0 +1,267 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/operators/pool_op.h" +#include +#include "lite/core/op_registry.h" +#include "lite/kernels/xpu/bridges/registry.h" +#include "lite/kernels/xpu/bridges/test_helper.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { +namespace bridges { + +void pool_ref(const std::shared_ptr op) { + Scope* scope = op->scope(); + const OpInfo* op_info = op->op_info(); + auto x = scope->FindVar(op_info->Input("X").front())->GetMutable(); + auto out = + scope->FindVar(op_info->Output("Out").front())->GetMutable(); + auto& in_dims = x->dims(); + auto& out_dims = out->dims(); + + const float* src_ptr = x->data(); + float* dst_ptr = out->mutable_data(); + + std::vector ksize = op_info->GetAttr>("ksize"); + std::vector strides = op_info->GetAttr>("strides"); + std::vector paddings = op_info->GetAttr>("paddings"); + bool exclusive = op_info->GetAttr("exclusive"); + std::string pooling_type = op_info->GetAttr("pooling_type"); + bool global_pooling = op_info->GetAttr("global_pooling"); + + int in_n = in_dims[0]; + int in_c = in_dims[1]; + int in_h = in_dims[2]; + int in_w = in_dims[3]; + int size_in_n = in_c * in_h * in_w; + int size_in_c = in_h * in_w; + + int out_h = out_dims[2]; + int out_w = out_dims[3]; + int size_out_n = in_c * out_h * out_w; + int size_out_c = out_h * out_w; + + int window_h = ksize[0]; + int window_w = ksize[1]; + int stride_h = strides[0]; + int stride_w = strides[1]; + int pad_h = paddings[0]; + int pad_w = paddings[1]; + + if (global_pooling == true) { + for (int n = 0; n < in_n; ++n) { + for (int c = 0; c < in_c; ++c) { + const float* src = src_ptr + n * size_in_n + c * size_in_c; + float res = src[0]; + if (pooling_type == "max") { + for (int i = 1; i < size_in_c; ++i) { + float cur_val = src[i]; + res = cur_val > res ? cur_val : res; + } + } else if (pooling_type == "avg") { + for (int i = 1; i < size_in_c; ++i) { + float cur_val = src[i]; + res += cur_val; + } + res /= size_in_c; + } + dst_ptr[n * size_out_n + c] = res; + } + } + } else { + for (int n = 0; n < in_n; ++n) { + for (int c = 0; c < in_c; ++c) { + for (int h = 0; h < out_h; ++h) { + int sh = h * stride_h; + int eh = sh + window_h; + sh = (sh - pad_h) < 0 ? 0 : sh - pad_h; + eh = (eh - pad_h) > in_h ? in_h : eh - pad_h; + for (int w = 0; w < out_w; ++w) { + int sw = w * stride_w; + int ew = sw + window_w; + sw = (sw - pad_w) < 0 ? 0 : sw - pad_w; + ew = (ew - pad_w) > in_w ? in_w : ew - pad_w; + int pooling_size = (ew - sw) * (eh - sh); + if (pooling_size == 0) continue; + float res = 0.f; + for (int kh = sh; kh < eh; ++kh) { + for (int kw = sw; kw < ew; ++kw) { + int src_idx = n * size_in_n + c * size_in_c + kh * in_w + kw; + if (kh == sh && kw == sw) { + res = src_ptr[src_idx]; + } else { + if (pooling_type == "max") { + res = res >= src_ptr[src_idx] ? 
res : src_ptr[src_idx]; + } + if (pooling_type == "avg") { + res += src_ptr[src_idx]; + } + } + } + } + if (pooling_type == "avg") { + if (exclusive) { + res /= pooling_size; + } else { + res /= window_h * window_w; + } + } + dst_ptr[n * size_out_n + c * size_out_c + h * out_w + w] = res; + } + } + } + } + } +} + +void test_pool(int bs, + int ic, + int ih, + int iw, + std::string pooling_type, + bool ceil_mode, + bool global_pooling, + bool exclusive, + int ksize, + int stride, + int padding) { + // prepare input&output variables + Scope scope; + std::string x_var_name = "x"; + std::string out_var_name = "out"; + std::string out_ref_var_name = "out_ref"; + auto* x = scope.Var(x_var_name)->GetMutable(); + auto* out = scope.Var(out_var_name)->GetMutable(); + auto* out_ref = scope.Var(out_ref_var_name)->GetMutable(); + x->Resize({bs, ic, ih, iw}); + + // initialize input&output data + FillTensor(x); + + // initialize op desc + cpp::OpDesc opdesc; + opdesc.SetType("pool2d"); + opdesc.SetInput("X", {x_var_name}); + opdesc.SetOutput("Out", {out_var_name}); + opdesc.SetAttr("pooling_type", pooling_type); + opdesc.SetAttr("ksize", std::vector({ksize, ksize})); + opdesc.SetAttr("global_pooling", global_pooling); + opdesc.SetAttr("exclusive", exclusive); + opdesc.SetAttr("strides", std::vector({stride, stride})); + opdesc.SetAttr("paddings", std::vector({padding, padding})); + opdesc.SetAttr("ceil_mode", ceil_mode); + + // create and convert op to XPU model, then run it on XPU + auto op = CreateOp(opdesc, &scope); + LauchOp(op, {x_var_name}, {out_var_name}); + out_ref->CopyDataFrom(*out); + + // execute reference implementation and save to output tensor + pool_ref(op); + + // compare results + auto* out_data = out->mutable_data(); + auto* out_ref_data = out_ref->mutable_data(); + for (int i = 0; i < out->dims().production(); i++) { + EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-5); + } +} + +TEST(XPUBridges, pool) { + for (auto pooling_type : {"max", "avg"}) { + for (auto bs : {1, 3}) { + for (auto ic : {2}) { + for (auto ih : {3}) { + for (auto iw : {4}) { + test_pool(bs, ic, ih, iw, pooling_type, true, true, true, 0, 1, 0); + } + } + } + } + } + + for (auto pooling_type : {"max"}) { + for (auto ceil_mode : {true, false}) { + for (auto ksize : {2, 3}) { + for (auto stride : {1, 2}) { + for (auto padding : {0, 1}) { + for (auto bs : {1, 3}) { + for (auto ic : {2}) { + for (auto ih : {3}) { + for (auto iw : {4}) { + test_pool(bs, + ic, + ih, + iw, + pooling_type, + ceil_mode, + false, + true, + ksize, + stride, + padding); + } + } + } + } + } + } + } + } + } + + for (auto pooling_type : {"avg"}) { + for (auto ceil_mode : {true, false}) { + for (auto exclusive : {true, false}) { + for (auto ksize : {2, 3}) { + for (auto stride : {1, 2}) { + for (auto padding : {0, 1}) { + for (auto bs : {1, 3}) { + for (auto ic : {2}) { + for (auto ih : {3}) { + for (auto iw : {4}) { + test_pool(bs, + ic, + ih, + iw, + pooling_type, + ceil_mode, + false, + exclusive, + ksize, + stride, + padding); + } + } + } + } + } + } + } + } + } + } +} + +} // namespace bridges +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_OP(pool2d); +USE_XPU_BRIDGE(pool2d); diff --git a/lite/kernels/xpu/bridges/registry.cc b/lite/kernels/xpu/bridges/registry.cc new file mode 100644 index 0000000000000000000000000000000000000000..4ab1b69a25a29aeb1c1ceaff25525459ef2e94cd --- /dev/null +++ b/lite/kernels/xpu/bridges/registry.cc @@ -0,0 +1,41 @@ +// Copyright (c) 2019 PaddlePaddle Authors. 
All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/xpu/bridges/registry.h" +#include + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { +namespace bridges { + +Factory& Factory::Instance() { + static Factory g_xpu_bridge; + return g_xpu_bridge; +} + +bool Factory::HasType(const std::string& op_type) const { + return map_.count(op_type); +} + +void Factory::Insert(const std::string& op_type, const func_type& func_name) { + map_.insert(std::make_pair(op_type, func_name)); +} + +} // namespace bridges +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/xpu/bridges/registry.h b/lite/kernels/xpu/bridges/registry.h new file mode 100644 index 0000000000000000000000000000000000000000..c990399c1cdeb865dc214d2f1c6d1970b6d27b85 --- /dev/null +++ b/lite/kernels/xpu/bridges/registry.h @@ -0,0 +1,93 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include +#include +#include +#include +#include +#include "lite/core/op_lite.h" +#include "lite/utils/macros.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { +namespace bridges { + +// xpu network builder and constant tensors +class graph_ctx_type { + public: + std::shared_ptr builder; + std::shared_ptr params; +}; + +// var_name, xpu node pointer +using node_map_type = + std::unordered_map>; + +using func_type = std::function, graph_ctx_type*, const node_map_type&)>; +using cvt_map_type = std::unordered_map; +class Factory { + public: + static Factory& Instance(); + + const cvt_map_type& AllFunctions() const { return map_; } + bool HasType(const std::string& op_type) const; + void Insert(const std::string& op_type, const func_type& func_name); + Factory() = default; + + private: + cvt_map_type map_; + DISALLOW_COPY_AND_ASSIGN(Factory); +}; + +} // namespace bridges +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +// some platform-independent defintion +#if defined(_WIN32) +#define UNUSED +#define __builtin_expect(EXP, C) (EXP) +#else +#define UNUSED __attribute__((unused)) +#endif + +#define STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE(uniq_name, msg) \ + struct __test_global_namespace_##uniq_name##__ {}; \ + static_assert(std::is_same<::__test_global_namespace_##uniq_name##__, \ + __test_global_namespace_##uniq_name##__>::value, \ + msg) + +#define REGISTER_XPU_BRIDGE(op_type, cvt_func_name) \ + STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE( \ + __reg_xpu_bridge_##op_type##__, \ + "REGISTER_XPU_BRIDGE must be called in global namespace only once!"); \ + int __reg_xpu_bridge_##op_type##_Insert() { \ + paddle::lite::kernels::xpu::bridges::Factory::Instance().Insert( \ + #op_type, cvt_func_name); \ + return 0; \ + } + +#define USE_XPU_BRIDGE(op_type) \ + extern int __reg_xpu_bridge_##op_type##_Insert(); \ + static int __reg_xpu_bridge_##op_type##_Insert_return UNUSED = \ + __reg_xpu_bridge_##op_type##_Insert(); diff --git a/lite/kernels/xpu/bridges/softmax_op.cc b/lite/kernels/xpu/bridges/softmax_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..3972496762a1d399ab59e7a69b0e9e18a9c28300 --- /dev/null +++ b/lite/kernels/xpu/bridges/softmax_op.cc @@ -0,0 +1,61 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/backends/xpu/builder.h" +#include "lite/kernels/xpu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { +namespace bridges { + +node_map_type SoftmaxConverter(const std::shared_ptr op, + graph_ctx_type* graph_ctx, + const node_map_type& input_nodes) { + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto unique_op_type = lite::xpu::UniqueName(op_type); + LOG(INFO) << "[XPU] Converting " + op_type + "..."; + + // check context + CHECK(graph_ctx != nullptr); + CHECK(graph_ctx->builder != nullptr); + CHECK(graph_ctx->params != nullptr); + + // get op's attributes + auto x_var_name = op_info->Input("X").front(); + auto axis = op_info->GetAttr("axis"); + + // create softmax node and set params from ops + CHECK(input_nodes.count(x_var_name)); + std::shared_ptr softmax_node = nullptr; + softmax_node = std::make_shared( + graph_ctx->builder->CreateSoftmax(*input_nodes.at(x_var_name), axis)); + graph_ctx->builder->SetLayer(unique_op_type); + + // output converted nodes + node_map_type output_nodes; + output_nodes[op_info->Output("Out").front()] = softmax_node; + return output_nodes; +} + +} // namespace bridges +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_XPU_BRIDGE(softmax, + paddle::lite::kernels::xpu::bridges::SoftmaxConverter); diff --git a/lite/kernels/xpu/bridges/softmax_op_test.cc b/lite/kernels/xpu/bridges/softmax_op_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..2cd12cbf4e8dc108ac43fec55a568ecec72a51ab --- /dev/null +++ b/lite/kernels/xpu/bridges/softmax_op_test.cc @@ -0,0 +1,134 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/operators/softmax_op.h" +#include +#include "lite/core/op_registry.h" +#include "lite/kernels/xpu/bridges/registry.h" +#include "lite/kernels/xpu/bridges/test_helper.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { +namespace bridges { + +template +void softmax_ref(const std::shared_ptr op) { + Scope* scope = op->scope(); + const OpInfo* op_info = op->op_info(); + auto x = scope->FindVar(op_info->Input("X").front())->GetMutable(); + auto out = + scope->FindVar(op_info->Output("Out").front())->GetMutable(); + auto x_data = x->data(); + auto out_data = out->mutable_data(); + DDim x_dims = x->dims(); + + auto x_rank = x_dims.size(); + int axis = op_info->GetAttr("axis"); + if (axis < 0) { + axis += x_rank; + } + int axis_size = x_dims[axis]; + int outer_num = x_dims.Slice(0, axis).production(); + int inner_num = x_dims.Slice(axis + 1, x_rank).production(); + int compute_size = outer_num * inner_num; + for (int i = 0; i < compute_size; i++) { + int idx_inner = i % inner_num; + int idx_outer = (i / inner_num) * axis_size; + int start = idx_outer * inner_num + idx_inner; + int offset; + + offset = start; + dtype max_data = std::numeric_limits::lowest(); + for (int j = 0; j < axis_size; j++) { + max_data = x_data[offset] > max_data ? x_data[offset] : max_data; + offset += inner_num; + } + + offset = start; + dtype sum_data = (dtype)0; + for (int j = 0; j < axis_size; j++) { + out_data[offset] = exp(x_data[offset] - max_data); + sum_data += out_data[offset]; + offset += inner_num; + } + + offset = start; + for (int j = 0; j < axis_size; j++) { + out_data[offset] /= sum_data; + offset += inner_num; + } + } +} + +void test_softmax(int bs, int ic, int ih, int iw, int axis) { + // prepare input&output variables + Scope scope; + std::string x_var_name = "x"; + std::string out_var_name = "out"; + std::string out_ref_var_name = "out_ref"; + auto* x = scope.Var(x_var_name)->GetMutable(); + auto* out = scope.Var(out_var_name)->GetMutable(); + auto* out_ref = scope.Var(out_ref_var_name)->GetMutable(); + x->Resize({bs, ic, ih, iw}); + + // initialize input&output data + FillTensor(x); + + // initialize op desc + cpp::OpDesc opdesc; + opdesc.SetType("softmax"); + opdesc.SetInput("X", {x_var_name}); + opdesc.SetOutput("Out", {out_var_name}); + opdesc.SetAttr("axis", axis); + + // create and convert op to XPU model, then run it on XPU + auto op = CreateOp(opdesc, &scope); + LauchOp(op, {x_var_name}, {out_var_name}); + out_ref->CopyDataFrom(*out); + + // execute reference implementation and save to output tensor + softmax_ref(op); + + // compare results + auto* out_data = out->mutable_data(); + auto* out_ref_data = out_ref->mutable_data(); + for (int i = 0; i < out->dims().production(); i++) { + EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-5); + } +} + +TEST(XPUBridges, softmax) { + for (auto bs : {2, 3}) { + for (auto ic : {4}) { + for (auto ih : {5}) { + for (auto iw : {6}) { + for (auto axis : {-3, -1, 0, 1, 2, 3}) { + test_softmax(bs, ic, ih, iw, axis); + } + } + } + } + } +} + +} // namespace bridges +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_OP(softmax); +USE_XPU_BRIDGE(softmax); diff --git a/lite/kernels/xpu/bridges/test_helper.cc b/lite/kernels/xpu/bridges/test_helper.cc new file mode 100644 index 0000000000000000000000000000000000000000..1a19324b946203c008093136d7a207ffaf23fbd6 --- /dev/null +++ b/lite/kernels/xpu/bridges/test_helper.cc @@ -0,0 +1,104 @@ +// Copyright (c) 2019 PaddlePaddle Authors. 
All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/xpu/bridges/test_helper.h" +#include +#include "lite/backends/xpu/builder.h" +#include "lite/core/op_registry.h" +#include "lite/kernels/xpu/bridges/registry.h" +#include "lite/operators/graph_op.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { +namespace bridges { + +void LauchOp(const std::shared_ptr op, + const std::vector& input_var_names, + const std::vector& output_var_names) { + auto scope = op->scope(); + auto op_type = op->op_info()->Type(); + + // convert lite op to XPU op + const auto& bridges = lite::kernels::xpu::bridges::Factory::Instance(); + const auto& supported_lists = bridges.AllFunctions(); + CHECK(bridges.HasType(op_type)); + graph_ctx_type graph_ctx; + graph_ctx.builder = std::make_shared(); + graph_ctx.params = + std::make_shared(); + node_map_type input_nodes; + for (auto input_var_name : input_var_names) { + auto input = scope->FindVar(input_var_name)->GetMutable(); + auto input_node = std::make_shared( + graph_ctx.builder->CreateTensor(input_var_name, + lite::xpu::CvtShape(input->dims()), + ::xtcl::Float(32))); + input_nodes[input_var_name] = input_node; + } + auto output_nodes = supported_lists.at(op_type)(op, &graph_ctx, input_nodes); + CHECK_GT(output_nodes.size(), 0); + + // build network graph and output model data + std::vector> ordered_output_nodes; + for (auto output_var_name : output_var_names) { + ordered_output_nodes.push_back(output_nodes.at(output_var_name)); + } + std::string weight_var_name = "weight"; + auto weight = scope->Var(weight_var_name)->GetMutable(); + weight->set_persistable(true); + weight->set_precision(PRECISION(kInt8)); + CHECK(lite::xpu::BuildModel( + graph_ctx.builder, graph_ctx.params, &ordered_output_nodes, weight)); + CHECK_GT(weight->numel(), 0); + CHECK(weight->data() != nullptr); + + // create graph op and set inputs and outputs + cpp::OpDesc graph_op_desc; + graph_op_desc.SetType("graph_op"); + graph_op_desc.SetInput("Inputs", input_var_names); + graph_op_desc.SetInput("Weight", {weight_var_name}); + graph_op_desc.SetOutput("Outputs", output_var_names); + + auto graph_op = + std::make_shared(graph_op_desc.Type()); + graph_op->SetValidPlaces({Place{TARGET(kXPU), PRECISION(kFloat)}}); + CHECK(graph_op->Attach(graph_op_desc, scope)); + CHECK(graph_op->CheckShape()); + CHECK(graph_op->InferShape()); + + // create graph op kernel and set XPU context + auto graph_kernels = + graph_op->CreateKernels({Place{TARGET(kXPU), PRECISION(kFloat)}}); + CHECK(!graph_kernels.empty()); + auto graph_kernel = + std::move(graph_kernels.front()); // use the first kernel by default + auto graph_device = ContextScheduler::Global().NewContext(TARGET(kXPU)); + graph_kernel->SetContext(std::move(graph_device)); + + // perform graph op kernel and store to output variables + graph_kernel->Launch(); + + lite::xpu::DeviceInfo::Global().Clear(); +} + +} // namespace bridges +} // namespace xpu +} // namespace kernels +} // 
namespace lite +} // namespace paddle + +USE_LITE_OP(graph_op); +USE_LITE_KERNEL(graph_op, kXPU, kFloat, kNCHW, def); diff --git a/lite/kernels/xpu/bridges/test_helper.h b/lite/kernels/xpu/bridges/test_helper.h new file mode 100644 index 0000000000000000000000000000000000000000..c8bba5da66550a9eccaefa8b2d9a31a233f5f706 --- /dev/null +++ b/lite/kernels/xpu/bridges/test_helper.h @@ -0,0 +1,66 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include "lite/core/op_lite.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { +namespace bridges { + +template +std::shared_ptr CreateOp(const cpp::OpDesc& opdesc, lite::Scope* scope) { + auto op = std::make_shared(opdesc.Type()); + op->SetValidPlaces({Place{TARGET(kHost), PRECISION(kFloat)}, + Place{TARGET(kARM), PRECISION(kFloat)}, + Place{TARGET(kXPU), PRECISION(kFloat)}}); + CHECK(op->Attach(opdesc, scope)); + CHECK(op->CheckShape()); + CHECK(op->InferShape()); + return op; +} + +// T is the target data type +// R is the range data type, e.g. int, half +template +void FillTensor(Tensor* x, + T lower = static_cast(-2), + T upper = static_cast(2)) { + static unsigned int seed = 100; + std::mt19937 rng(seed++); + std::uniform_real_distribution uniform_dist(0, 1); + + T* x_data = x->mutable_data(); + for (int i = 0; i < x->dims().production(); ++i) { + auto r = uniform_dist(rng) * (upper - lower) + lower; + x_data[i] = static_cast(static_cast(r)); + } +} + +void LauchOp(const std::shared_ptr op, + const std::vector& input_var_names, + const std::vector& output_var_names); + +} // namespace bridges +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/xpu/graph_compute.cc b/lite/kernels/xpu/graph_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..b9e5be1a1d5c764c378f3fdf29d73148743962a4 --- /dev/null +++ b/lite/kernels/xpu/graph_compute.cc @@ -0,0 +1,99 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/kernels/xpu/graph_compute.h" +#include +#include +#include +#include +#include "lite/backends/xpu/runtime.h" +#include "lite/core/op_registry.h" +#include "lite/core/type_system.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +void GraphCompute::PrepareForRun() { + // auto& ctx = this->ctx_->template As(); + auto& param = this->Param(); + CHECK(param.weight); + CHECK(lite::xpu::LoadModel(*param.weight, &runtime_)); + CHECK(runtime_ != nullptr); +} + +void GraphCompute::Run() { + auto& param = this->Param(); + auto GetCurrentUS = []() -> double { + struct timeval time; + gettimeofday(&time, NULL); + return 1e+6 * time.tv_sec + time.tv_usec; + }; + auto start_time = GetCurrentUS(); + for (int i = 0; i < param.inputs.size(); i++) { + auto input_var_name = param.inputs[i].first; + auto input_tensor = param.inputs[i].second; + LOG(INFO) << "input dims[" << i << ":" << input_var_name + << "]: " << input_tensor->dims(); + auto input_tensor_data = input_tensor->data(); + for (int j = 0; j < input_tensor->dims().production(); j++) { + VLOG(3) << input_tensor_data[j]; + } + auto input_ndarray = xtcl::xNDArray::Empty( + input_tensor->dims().Vectorize(), {kDLFloat, 32, 1}, {kDLCPU, 0}); + auto input_ndarray_data = + static_cast(input_ndarray.ToDLPack()->dl_tensor.data); + std::memcpy(input_ndarray_data, + input_tensor_data, + sizeof(float) * input_tensor->dims().production()); + runtime_->SetInputZeroCopy(input_var_name, + &input_ndarray.ToDLPack()->dl_tensor); + } + runtime_->Run(); + for (int i = 0; i < param.outputs.size(); i++) { + auto output_ndarray = runtime_->GetOutput(i); + auto output_var_name = param.outputs[i].first; + auto output_tensor = param.outputs[i].second; + output_tensor->Resize(output_ndarray.Shape()); + LOG(INFO) << "output dims[" << i << ":" << output_var_name + << "]: " << output_tensor->dims(); + auto output_ndarray_data = + static_cast(output_ndarray.ToDLPack()->dl_tensor.data); + auto output_tensor_data = output_tensor->mutable_data(); + std::memcpy(output_tensor_data, + output_ndarray_data, + sizeof(float) * output_tensor->dims().production()); + for (int j = 0; j < output_tensor->dims().production(); j++) { + VLOG(3) << output_tensor_data[j]; + } + } + LOG(INFO) << "[XPU] Process cost " << GetCurrentUS() - start_time << " us"; +} + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(graph_op, + kXPU, + kFloat, + kNCHW, + paddle::lite::kernels::xpu::GraphCompute, + def) + .BindInput("Inputs", {LiteType::GetTensorTy(TARGET(kHost))}) + .BindInput("Weight", {LiteType::GetTensorTy(TARGET(kHost))}) + .BindOutput("Outputs", {LiteType::GetTensorTy(TARGET(kHost))}) + .Finalize(); diff --git a/lite/kernels/xpu/graph_compute.h b/lite/kernels/xpu/graph_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..5406daa8a1b757989d006f4e0ea09baedc809e33 --- /dev/null +++ b/lite/kernels/xpu/graph_compute.h @@ -0,0 +1,47 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" +#include "lite/core/types.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +class GraphCompute : public KernelLite { + public: + using param_t = operators::GraphParam; + + void PrepareForRun() override; + + void Run() override; + + virtual ~GraphCompute() = default; + + private: + std::shared_ptr runtime_{nullptr}; +}; + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/operators/graph_op.cc b/lite/operators/graph_op.cc index 8fd3fe8e6dc07a677d48dc54d330fd6568698de2..018ce264e2f18862549a4abc0444d02dcbb573ee 100644 --- a/lite/operators/graph_op.cc +++ b/lite/operators/graph_op.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "lite/operators/graph_op.h" +#include #include "lite/core/op_registry.h" namespace paddle { @@ -34,7 +35,8 @@ bool GraphOpLite::AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) { for (auto var : inputs) { CHECK(scope->FindVar(var)); - param_.inputs.push_back(scope->FindVar(var)->GetMutable()); + param_.inputs.push_back( + std::make_pair(var, scope->FindVar(var)->GetMutable())); } param_.weight = scope->FindVar(weight.front())->GetMutable(); @@ -42,7 +44,8 @@ bool GraphOpLite::AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) { for (auto var : outputs) { CHECK(scope->FindVar(var)); - param_.outputs.push_back(scope->FindVar(var)->GetMutable()); + param_.outputs.push_back( + std::make_pair(var, scope->FindVar(var)->GetMutable())); } return true; diff --git a/lite/operators/op_params.h b/lite/operators/op_params.h index 45d53f17f91ce5a7b42e9e54829640b9c94005db..097dd91163357d9fa43818c68687a48de06fe8aa 100644 --- a/lite/operators/op_params.h +++ b/lite/operators/op_params.h @@ -14,6 +14,7 @@ #pragma once #include +#include #include #include "lite/api/paddle_place.h" #include "lite/core/scope.h" @@ -69,9 +70,9 @@ struct CalibParam { }; struct GraphParam { - std::vector inputs{}; + std::vector> inputs{}; lite::Tensor* weight{}; - std::vector outputs{}; + std::vector> outputs{}; }; /// -------------------------- NN operators ------------------------------------ diff --git a/lite/tests/kernels/CMakeLists.txt b/lite/tests/kernels/CMakeLists.txt index 1e5fdbb34de3fd0b986e6ec635545fd114f42e5f..f2c2c9a71666b539248c955c6e75470c5933b5c9 100644 --- a/lite/tests/kernels/CMakeLists.txt +++ b/lite/tests/kernels/CMakeLists.txt @@ -1,4 +1,4 @@ -if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA) AND (LITE_WITH_X86 OR LITE_WITH_ARM)) +if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA AND NOT LITE_WITH_XPU) AND (LITE_WITH_X86 OR LITE_WITH_ARM)) lite_cc_test(test_kernel_scale_compute SRCS scale_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(test_kernel_power_compute SRCS power_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(test_kernel_shuffle_channel_compute SRCS shuffle_channel_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) diff --git a/lite/tools/build_npu.sh b/lite/tools/build_npu.sh index 30fd812fb5f49d141786d9dab0f64788e27d07fc..03a74046f17ad03bccc7b6d5050acae9d643686c 100755 --- a/lite/tools/build_npu.sh +++ b/lite/tools/build_npu.sh @@ -5,12 +5,13 @@ set -ex 
ARM_OS="android" # android only yet ARM_ABI="armv8" # armv8, armv7 ARM_LANG="gcc" # gcc only yet -ANDROID_STL="c++_shared" # c++_shared, c++_static +ANDROID_STL="c++_static" # c++_shared, c++_static DDK_ROOT="$(pwd)/ai_ddk_lib/" # HIAI SDK from https://developer.huawei.com/consumer/cn/hiai/ TARGET_NAME="test_npu_pass" # default target BUILD_EXTRA=OFF # ON(with sequence ops)/OFF WITH_JAVA=ON # ON(build jar and jni so)/OFF WITH_TESTING=ON # ON/OFF +SHUTDOWN_LOG=OFF # ON(disable logging)/OFF ON_TINY_PUBLISH=OFF # ON(tiny publish)/OFF(full publish) function print_usage { @@ -75,6 +76,7 @@ function build_npu { fi if [[ "${ON_TINY_PUBLISH}" == "ON" ]]; then WITH_TESTING=OFF + SHUTDOWN_LOG=ON publish_dir="tiny_publish" else publish_dir="full_publish" @@ -97,6 +99,7 @@ function build_npu { -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=ON \ -DWITH_TESTING=${WITH_TESTING} \ -DLITE_WITH_JAVA=${WITH_JAVA} \ + -DLITE_SHUTDOWN_LOG=${SHUTDOWN_LOG} \ -DLITE_WITH_NPU=ON \ -DLITE_ON_TINY_PUBLISH=${ON_TINY_PUBLISH} \ -DANDROID_API_LEVEL=24 \ diff --git a/lite/tools/build_xpu.sh b/lite/tools/build_xpu.sh new file mode 100755 index 0000000000000000000000000000000000000000..62a123c82b2945147fa8616ad8faf0af33a32302 --- /dev/null +++ b/lite/tools/build_xpu.sh @@ -0,0 +1,116 @@ +#!/bin/bash +set -ex + +# global variables with default value +XPU_SDK_ROOT="$(pwd)/../XPU_SDK" # XPU SDK +TARGET_NAME="lite_compile_deps" # default target +BUILD_EXTRA=ON # ON(with sequence ops)/OFF +WITH_TESTING=ON # ON/OFF + +function print_usage { + echo -e "\nUSAGE:" + echo + echo "----------------------------------------" + echo -e "--xpu_sdk_root=" + echo -e "--target_name=" + echo "----------------------------------------" + echo +} + +# readonly variables with default value +readonly CMAKE_COMMON_OPTIONS="-DWITH_LITE=ON \ + -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=OFF \ + -DWITH_PYTHON=OFF \ + -DLITE_WITH_ARM=OFF" + +readonly NUM_CORES_FOR_COMPILE=${LITE_BUILD_THREADS:-1} + +readonly THIRDPARTY_TAR=https://paddle-inference-dist.bj.bcebos.com/PaddleLite/third-party-05b862.tar.gz +readonly workspace=$(pwd) + +function prepare_thirdparty { + if [ ! -d $workspace/third-party -o -f $workspace/third-party-05b862.tar.gz ]; then + rm -rf $workspace/third-party + + if [ ! -f $workspace/third-party-05b862.tar.gz ]; then + wget $THIRDPARTY_TAR + fi + tar xzf third-party-05b862.tar.gz + else + git submodule update --init --recursive + fi +} + +# for code gen, a source file is generated after a test, but is dependended by some targets in cmake. +# here we fake an empty file to make cmake works. +function prepare_workspace { + # in build directory + # 1. Prepare gen_code file + GEN_CODE_PATH_PREFIX=lite/gen_code + mkdir -p ./${GEN_CODE_PATH_PREFIX} + touch ./${GEN_CODE_PATH_PREFIX}/__generated_code__.cc + + # 2.Prepare debug tool + DEBUG_TOOL_PATH_PREFIX=lite/tools/debug + mkdir -p ./${DEBUG_TOOL_PATH_PREFIX} + cp ../${DEBUG_TOOL_PATH_PREFIX}/analysis_tool.py ./${DEBUG_TOOL_PATH_PREFIX}/ + + # clone submodule + # git submodule update --init --recursive + prepare_thirdparty +} + +function build_xpu { + build_dir=${workspace}/build.lite.xpu + mkdir -p $build_dir + cd $build_dir + + export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$PWD/third_party/install/mklml/lib" + prepare_workspace + cmake .. 
\ + ${CMAKE_COMMON_OPTIONS} \ + -DWITH_GPU=OFF \ + -DWITH_MKLDNN=OFF \ + -DLITE_WITH_X86=ON \ + -DWITH_MKL=ON \ + -DLITE_BUILD_EXTRA=ON \ + -DLITE_WITH_XPU=ON \ + -DWITH_TESTING=${WITH_TESTING} \ + -DXPU_SDK_ROOT=${XPU_SDK_ROOT} + + make $TARGET_NAME -j$NUM_CORES_FOR_COMPILE + + cd - + echo "Done" +} + +function main { + # Parse command line. + for i in "$@"; do + case $i in + --target_name=*) + TARGET_NAME="${i#*=}" + shift + ;; + --build_extra=*) + BUILD_EXTRA="${i#*=}" + shift + ;; + --xpu_sdk_root=*) + XPU_SDK_ROOT="${i#*=}" + shift + ;; + build) + build_xpu + shift + ;; + *) + # unknown option + print_usage + exit 1 + ;; + esac + done +} + +main $@ diff --git a/lite/tools/ci_build.sh b/lite/tools/ci_build.sh index 0e8f75f10ace88a1fa57ebce8f158ab3416546b6..8be8e6e6b6da1e2aa38b6fcbcf95b23a8543a5be 100755 --- a/lite/tools/ci_build.sh +++ b/lite/tools/ci_build.sh @@ -248,6 +248,63 @@ function build_test_train { } +function cmake_xpu { + export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$PWD/third_party/install/mklml/lib" + prepare_workspace + cmake .. \ + ${common_flags} \ + -DWITH_GPU=OFF \ + -DWITH_MKLDNN=OFF \ + -DLITE_WITH_X86=ON \ + -DWITH_MKL=ON \ + -DLITE_BUILD_EXTRA=ON \ + -DLITE_WITH_XPU=ON \ + -DXPU_SDK_ROOT="$(pwd)/../../XPU_SDK" +} + +function build_xpu { + make lite_compile_deps -j$NUM_CORES_FOR_COMPILE +} + +# It will eagerly test all lite related unittests. +function test_xpu { + # Due to the missing of xpu kernels, we skip the following tests temporarily. + # TODO(xxx) clear the skip list latter + local skip_list=("test_paddle_api" "test_cxx_api" "test_googlenet" + "test_mobilenetv1_lite_x86" "test_mobilenetv2_lite_x86" + "test_inceptionv4_lite_x86" "test_light_api" + "test_apis" "test_model_bin" + ) + local to_skip=0 + for _test in $(cat $TESTS_FILE); do + to_skip=0 + for skip_name in ${skip_list[@]}; do + if [ $skip_name = $_test ]; then + echo "to skip " $skip_name + to_skip=1 + fi + done + + if [ $to_skip -eq 0 ]; then + ctest -R $_test -V + fi + done +} + +# Build the code and run lite server tests. This is executed in the CI system. +function build_test_xpu { + cur_dir=$(pwd) + + build_dir=$cur_dir/build.lite.xpu + mkdir -p $build_dir + cd $build_dir + + cmake_xpu + build_xpu + + test_xpu +} + # test_arm_android function test_arm_android { local test_name=$1 @@ -850,6 +907,10 @@ function main { cmake_x86 shift ;; + cmake_xpu) + cmake_xpu + shift + ;; cmake_opencl) cmake_opencl $ARM_OS $ARM_ABI $ARM_LANG shift @@ -874,6 +935,10 @@ function main { test_server shift ;; + test_xpu) + test_xpu + shift + ;; test_arm) test_arm $ARM_OS $ARM_ABI $ARM_LANG $ARM_PORT shift @@ -890,6 +955,10 @@ function main { build_test_server shift ;; + build_test_xpu) + build_test_xpu + shift + ;; build_test_train) build_test_train shift diff --git a/lite/tools/debug/CMakeLists.txt b/lite/tools/debug/CMakeLists.txt index ae098b05a66668e1cd4166c4b174feec538d8b37..43c0812ab91f6ddcba02f93d2eea60f5a5268341 100644 --- a/lite/tools/debug/CMakeLists.txt +++ b/lite/tools/debug/CMakeLists.txt @@ -13,6 +13,7 @@ if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK OR LITE_ON_MODEL_OPTIMIZE_TOOL) X86_DEPS ${x86_kernels} ARM_DEPS ${arm_kernels} NPU_DEPS ${npu_kernels} + XPU_DEPS ${xpu_kernels} FPGA_DEPS ${fpga_kernels} CL_DEPS ${opencl_kernels}) endif()
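For reference, a minimal usage sketch of the new lite/tools/build_xpu.sh script added above; the SDK path below is a placeholder and the target name simply repeats the script's default. Note that the flags must come before the trailing "build" argument, because the script processes its arguments in order and starts the build as soon as it sees "build":

    # assumes an XPU SDK has been unpacked to /path/to/XPU_SDK (placeholder path)
    ./lite/tools/build_xpu.sh --xpu_sdk_root=/path/to/XPU_SDK --target_name=lite_compile_deps build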