Commit 20ba19fe authored by: N nhzlx

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle-Lite into develop

...@@ -59,6 +59,7 @@ lite_option(LITE_WITH_CUDA "Enable CUDA in lite mode" OFF)
lite_option(LITE_WITH_X86 "Enable X86 in lite mode" ON)
lite_option(LITE_WITH_ARM "Enable ARM in lite mode" OFF)
lite_option(LITE_WITH_NPU "Enable NPU in lite mode" OFF)
lite_option(LITE_WITH_XPU "Enable XPU in lite mode" OFF)
lite_option(LITE_WITH_OPENMP "Enable OpenMP in lite framework" ON)
lite_option(LITE_WITH_OPENCL "Enable OpenCL support in lite" OFF)
lite_option(LITE_WITH_FPGA "Enable FPGA support in lite" OFF)
...@@ -184,6 +185,10 @@ if(LITE_WITH_CUDA)
include(cuda)
endif()
if(LITE_WITH_XPU)
include(xpu)
endif()
include(generic) # simplify cmake module
include(ccache) # set ccache for compilation
include(util) # set unittest and link libs
......
...@@ -127,6 +127,10 @@ if (LITE_WITH_NPU)
add_definitions("-DLITE_WITH_NPU")
endif()
if (LITE_WITH_XPU)
add_definitions("-DLITE_WITH_XPU")
endif()
if (LITE_WITH_OPENCL)
add_definitions("-DLITE_WITH_OPENCL")
endif()
......
...@@ -50,9 +50,6 @@ find_library(NPU_DDK_IR_FILE NAMES hiai_ir
find_library(NPU_DDK_IR_BUILD_FILE NAMES hiai_ir_build
PATHS ${NPU_DDK_ROOT}/${NPU_SUB_LIB_PATH})
find_library(NPU_DDK_PROTO_FILE NAMES protobuf-lite
PATHS ${NPU_DDK_ROOT}/${NPU_SUB_LIB_PATH})
if(NOT NPU_DDK_HIAI_FILE)
message(FATAL_ERROR "Can not find NPU_DDK_HIAI_FILE in ${NPU_DDK_ROOT}")
else()
...@@ -77,14 +74,8 @@ else()
set_property(TARGET npu_ddk_ir_build PROPERTY IMPORTED_LOCATION ${NPU_DDK_IR_BUILD_FILE})
endif()
if(NOT NPU_DDK_PROTO_FILE)
message(FATAL_ERROR "Can not find NPU_DDK_PROTO_FILE in ${NPU_DDK_ROOT}")
else()
message(STATUS "Found NPU_DDK Protobuf Library: ${NPU_DDK_PROTO_FILE}")
add_library(npu_ddk_proto SHARED IMPORTED GLOBAL)
set_property(TARGET npu_ddk_proto PROPERTY IMPORTED_LOCATION ${NPU_DDK_PROTO_FILE})
endif()
set(npu_ddk_libs npu_ddk_hiai npu_ddk_ir npu_ddk_ir_build npu_ddk_proto CACHE INTERNAL "npu ddk libs")
set(npu_runtime_libs npu_ddk_hiai CACHE INTERNAL "npu ddk runtime libs")
set(npu_builder_libs npu_ddk_ir npu_ddk_ir_build CACHE INTERNAL "npu ddk builder libs")
...@@ -22,7 +22,7 @@ endfunction()
function (lite_deps TARGET)
set(options "")
set(oneValueArgs "")
set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS CL_DEPS FPGA_DEPS NPU_DEPS ARGS)
set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS CL_DEPS FPGA_DEPS NPU_DEPS XPU_DEPS ARGS)
cmake_parse_arguments(lite_deps "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
set(deps ${lite_deps_DEPS})
...@@ -83,6 +83,12 @@ function (lite_deps TARGET)
endforeach(var)
endif()
if (LITE_WITH_XPU)
foreach(var ${lite_deps_XPU_DEPS})
set(deps ${deps} ${var})
endforeach(var)
endif()
set(${TARGET} ${deps} PARENT_SCOPE)
endfunction()
...@@ -107,7 +113,7 @@ file(WRITE ${offline_lib_registry_file} "") # clean
function(lite_cc_library TARGET)
set(options SHARED shared STATIC static MODULE module)
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS NPU_DEPS ARM_DEPS FPGA_DEPS PROFILE_DEPS LIGHT_DEPS
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS NPU_DEPS XPU_DEPS ARM_DEPS FPGA_DEPS PROFILE_DEPS LIGHT_DEPS
HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
...@@ -118,6 +124,7 @@ function(lite_cc_library TARGET)
CUDA_DEPS ${args_CUDA_DEPS}
CL_DEPS ${args_CL_DEPS}
NPU_DEPS ${args_NPU_DEPS}
XPU_DEPS ${args_XPU_DEPS}
ARM_DEPS ${args_ARM_DEPS}
FPGA_DEPS ${args_FPGA_DEPS}
PROFILE_DEPS ${args_PROFILE_DEPS}
...@@ -236,6 +243,7 @@ set(arm_kernels CACHE INTERNAL "arm kernels")
set(x86_kernels CACHE INTERNAL "x86 kernels")
set(fpga_kernels CACHE INTERNAL "fpga kernels")
set(npu_kernels CACHE INTERNAL "npu kernels")
set(xpu_kernels CACHE INTERNAL "xpu kernels")
set(opencl_kernels CACHE INTERNAL "opencl kernels")
set(host_kernels CACHE INTERNAL "host kernels")
...@@ -305,6 +313,12 @@ function(add_kernel TARGET device level)
endif()
set(npu_kernels "${npu_kernels};${TARGET}" CACHE INTERNAL "")
endif()
if ("${device}" STREQUAL "XPU")
if (NOT LITE_WITH_XPU)
return()
endif()
set(xpu_kernels "${xpu_kernels};${TARGET}" CACHE INTERNAL "")
endif()
if ("${device}" STREQUAL "FPGA") if ("${device}" STREQUAL "FPGA")
if (NOT LITE_WITH_FPGA) if (NOT LITE_WITH_FPGA)
return() return()
...@@ -338,6 +352,7 @@ function(add_kernel TARGET device level) ...@@ -338,6 +352,7 @@ function(add_kernel TARGET device level)
lite_cc_library(${TARGET} SRCS ${args_SRCS} lite_cc_library(${TARGET} SRCS ${args_SRCS}
DEPS ${args_DEPS} DEPS ${args_DEPS}
X86_DEPS ${args_X86_DEPS} X86_DEPS ${args_X86_DEPS}
XPU_DEPS ${args_XPU_DEPS}
CUDA_DEPS ${args_CUDA_DEPS} CUDA_DEPS ${args_CUDA_DEPS}
CL_DEPS ${args_CL_DEPS} CL_DEPS ${args_CL_DEPS}
ARM_DEPS ${args_ARM_DEPS} ARM_DEPS ${args_ARM_DEPS}
...@@ -386,6 +401,7 @@ function(add_operator TARGET level) ...@@ -386,6 +401,7 @@ function(add_operator TARGET level)
lite_cc_library(${TARGET} SRCS ${args_SRCS} lite_cc_library(${TARGET} SRCS ${args_SRCS}
DEPS ${args_DEPS} DEPS ${args_DEPS}
X86_DEPS ${args_X86_DEPS} X86_DEPS ${args_X86_DEPS}
XPU_DEPS ${args_XPU_DEPS}
CUDA_DEPS ${args_CUDA_DEPS} CUDA_DEPS ${args_CUDA_DEPS}
CL_DEPS ${args_CL_DEPS} CL_DEPS ${args_CL_DEPS}
ARM_DEPS ${args_ARM_DEPS} ARM_DEPS ${args_ARM_DEPS}
......
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
if(NOT LITE_WITH_XPU)
return()
endif()
if(NOT DEFINED XPU_SDK_ROOT)
set(XPU_SDK_ROOT $ENV{XPU_SDK_ROOT})
if(NOT XPU_SDK_ROOT)
message(FATAL_ERROR "Must set XPU_SDK_ROOT or env XPU_SDK_ROOT when LITE_WITH_XPU=ON")
endif()
endif()
message(STATUS "XPU_SDK_ROOT: ${XPU_SDK_ROOT}")
find_path(XPU_SDK_INC NAMES xtcl.h
PATHS ${XPU_SDK_ROOT}/XTCL/include/xtcl NO_DEFAULT_PATH)
if(NOT XPU_SDK_INC)
message(FATAL_ERROR "Can not find xtcl.h in ${XPU_SDK_ROOT}/include")
endif()
include_directories("${XPU_SDK_ROOT}/XTCL/include")
include_directories("${XPU_SDK_ROOT}/XTDK/include")
find_library(XPU_SDK_XTCL_FILE NAMES xtcl
PATHS ${XPU_SDK_ROOT}/XTCL/so)
if(NOT XPU_SDK_XTCL_FILE)
message(FATAL_ERROR "Can not find XPU XTCL Library in ${XPU_SDK_ROOT}")
else()
message(STATUS "Found XPU XTCL Library: ${XPU_SDK_XTCL_FILE}")
add_library(xpu_sdk_xtcl SHARED IMPORTED GLOBAL)
set_property(TARGET xpu_sdk_xtcl PROPERTY IMPORTED_LOCATION ${XPU_SDK_XTCL_FILE})
endif()
find_library(XPU_SDK_TVM_FILE NAMES tvm
PATHS ${XPU_SDK_ROOT}/XTCL/so)
if(NOT XPU_SDK_TVM_FILE)
message(FATAL_ERROR "Can not find XPU TVM Library in ${XPU_SDK_ROOT}")
else()
message(STATUS "Found XPU TVM Library: ${XPU_SDK_TVM_FILE}")
add_library(xpu_sdk_tvm SHARED IMPORTED GLOBAL)
set_property(TARGET xpu_sdk_tvm PROPERTY IMPORTED_LOCATION ${XPU_SDK_TVM_FILE})
endif()
find_library(XPU_SDK_XPU_API_FILE NAMES xpuapi
PATHS ${XPU_SDK_ROOT}/XTDK/shlib)
if(NOT XPU_SDK_XPU_API_FILE)
message(FATAL_ERROR "Can not find XPU API Library in ${XPU_SDK_ROOT}")
else()
message(STATUS "Found XPU API Library: ${XPU_SDK_XPU_API_FILE}")
add_library(xpu_sdk_xpu_api SHARED IMPORTED GLOBAL)
set_property(TARGET xpu_sdk_xpu_api PROPERTY IMPORTED_LOCATION ${XPU_SDK_XPU_API_FILE})
endif()
find_library(XPU_SDK_XPU_RT_FILE NAMES xpurt
PATHS ${XPU_SDK_ROOT}/XTDK/shlib)
if(NOT XPU_SDK_XPU_RT_FILE)
message(FATAL_ERROR "Can not find XPU RT Library in ${XPU_SDK_ROOT}")
else()
message(STATUS "Found XPU RT Library: ${XPU_SDK_XPU_RT_FILE}")
add_library(xpu_sdk_xpu_rt SHARED IMPORTED GLOBAL)
set_property(TARGET xpu_sdk_xpu_rt PROPERTY IMPORTED_LOCATION ${XPU_SDK_XPU_RT_FILE})
endif()
find_library(XPU_SDK_XPU_JITC_FILE NAMES xpujitc
PATHS ${XPU_SDK_ROOT}/XTDK/shlib)
if(NOT XPU_SDK_XPU_JITC_FILE)
message(FATAL_ERROR "Can not find XPU JITC Library in ${XPU_SDK_ROOT}")
else()
message(STATUS "Found XPU JITC Library: ${XPU_SDK_XPU_JITC_FILE}")
add_library(xpu_sdk_xpu_jitc SHARED IMPORTED GLOBAL)
set_property(TARGET xpu_sdk_xpu_jitc PROPERTY IMPORTED_LOCATION ${XPU_SDK_XPU_JITC_FILE})
endif()
find_library(XPU_SDK_LLVM_FILE NAMES LLVM-8
PATHS ${XPU_SDK_ROOT}/XTDK/shlib)
if(NOT XPU_SDK_LLVM_FILE)
message(FATAL_ERROR "Can not find LLVM Library in ${XPU_SDK_ROOT}")
else()
message(STATUS "Found XPU LLVM Library: ${XPU_SDK_LLVM_FILE}")
add_library(xpu_sdk_llvm SHARED IMPORTED GLOBAL)
set_property(TARGET xpu_sdk_llvm PROPERTY IMPORTED_LOCATION ${XPU_SDK_LLVM_FILE})
endif()
set(xpu_runtime_libs xpu_sdk_xtcl xpu_sdk_tvm xpu_sdk_xpu_api xpu_sdk_xpu_rt xpu_sdk_xpu_jitc xpu_sdk_llvm CACHE INTERNAL "xpu runtime libs")
set(xpu_builder_libs xpu_sdk_xtcl xpu_sdk_tvm xpu_sdk_xpu_api xpu_sdk_xpu_rt xpu_sdk_xpu_jitc xpu_sdk_llvm CACHE INTERNAL "xpu builder libs")
...@@ -6,6 +6,7 @@ message(STATUS "LITE_WITH_CUDA:\t${LITE_WITH_CUDA}")
message(STATUS "LITE_WITH_X86:\t${LITE_WITH_X86}")
message(STATUS "LITE_WITH_ARM:\t${LITE_WITH_ARM}")
message(STATUS "LITE_WITH_NPU:\t${LITE_WITH_NPU}")
message(STATUS "LITE_WITH_XPU:\t${LITE_WITH_XPU}")
message(STATUS "LITE_WITH_FPGA:\t${LITE_WITH_FPGA}")
message(STATUS "LITE_WITH_PROFILE:\t${LITE_WITH_PROFILE}")
......
...@@ -26,11 +26,21 @@ if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_X86 OR ARM_TARGET_OS STREQUAL "and
DEPS ${light_lib_DEPS}
ARM_DEPS ${arm_kernels} NPU_DEPS ${npu_kernels})
target_link_libraries(paddle_light_api_shared ${light_lib_DEPS} ${arm_kernels} ${npu_kernels})
if (LITE_WITH_NPU)
# Strips the symbols of our protobuf functions to fix the conflicts during
# loading HIAI builder libs (libhiai_ir.so and libhiai_ir_build.so)
set(LINK_FLAGS "-Wl,--version-script ${PADDLE_SOURCE_DIR}/lite/core/lite.map")
set_target_properties(paddle_light_api_shared PROPERTIES LINK_FLAGS "${LINK_FLAGS}")
endif()
else()
if ((ARM_TARGET_OS STREQUAL "android") OR (ARM_TARGET_OS STREQUAL "armlinux"))
add_library(paddle_light_api_shared SHARED "")
target_sources(paddle_light_api_shared PUBLIC ${__lite_cc_files} paddle_api.cc light_api.cc light_api_impl.cc)
add_dependencies(paddle_light_api_shared op_list_h kernel_list_h)
if (LITE_WITH_NPU)
# Need to add HIAI runtime libs (libhiai.so) dependency
target_link_libraries(paddle_light_api_shared ${npu_runtime_libs})
endif()
endif()
endif()
...@@ -39,7 +49,8 @@ if (WITH_TESTING)
DEPS scope optimizer target_wrapper_host model_parser program
${ops} ${host_kernels}
CUDA_DEPS ${cuda_kernels}
X86_DEPS ${x86_kernels})
X86_DEPS ${x86_kernels}
XPU_DEPS ${xpu_kernels})
endif()
if(LITE_WITH_FPGA)
set(light_api_deps ${light_api_deps} ${fpga_deps})
...@@ -51,6 +62,7 @@ message(STATUS "get X86 kernels ${x86_kernels}")
message(STATUS "get Host kernels ${host_kernels}")
message(STATUS "get ARM kernels ${arm_kernels}")
message(STATUS "get NPU kernels ${npu_kernels}")
message(STATUS "get XPU kernels ${xpu_kernels}")
message(STATUS "get FPGA kernels ${fpga_kernels}") message(STATUS "get FPGA kernels ${fpga_kernels}")
# for full api # for full api
...@@ -63,6 +75,7 @@ if (NOT LITE_ON_TINY_PUBLISH) ...@@ -63,6 +75,7 @@ if (NOT LITE_ON_TINY_PUBLISH)
X86_DEPS ${x86_kernels} X86_DEPS ${x86_kernels}
ARM_DEPS ${arm_kernels} ARM_DEPS ${arm_kernels}
NPU_DEPS ${npu_kernels} ${npu_bridges} npu_pass NPU_DEPS ${npu_kernels} ${npu_bridges} npu_pass
XPU_DEPS ${xpu_kernels} ${xpu_bridges} xpu_pass
CL_DEPS ${opencl_kenrels} CL_DEPS ${opencl_kenrels}
FPGA_DEPS ${fpga_kenrels}) FPGA_DEPS ${fpga_kenrels})
endif() endif()
...@@ -82,6 +95,7 @@ lite_cc_library(light_api SRCS light_api.cc
X86_DEPS ${x86_kernels}
ARM_DEPS ${arm_kernels}
NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels}
CL_DEPS ${opencl_kenrels}
FPGA_DEPS ${fpga_kenrels})
...@@ -96,6 +110,7 @@ if(WITH_TESTING)
X86_DEPS ${x86_kernels}
ARM_DEPS ${arm_kernels}
NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels}
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels}
EXCLUDE_COMPILE_DEPS "ON"
...@@ -223,6 +238,7 @@ lite_cc_test(test_apis SRCS apis_test.cc
DEPS cxx_api light_api ${ops} paddle_api_light
CL_DEPS ${opencl_kernels}
X86_DEPS ${x86_kernels}
XPU_DEPS ${xpu_kernels}
FPGA_DEPS ${fpga_kernels}
ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model
--optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL)
...@@ -250,6 +266,7 @@ lite_cc_test(test_paddle_api SRCS paddle_api_test.cc DEPS paddle_api_full paddle
${ops}
ARM_DEPS ${arm_kernels}
NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels}
CL_DEPS ${opencl_kernels}
X86_DEPS ${x86_kernels}
FPGA_DEPS ${fpga_kernels}
...@@ -264,6 +281,7 @@ if(NOT IOS)
${ops} ${host_kernels}
ARM_DEPS ${arm_kernels}
NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels}
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels})
...@@ -271,6 +289,7 @@ if(NOT IOS)
${ops} ${host_kernels}
ARM_DEPS ${arm_kernels}
NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels}
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels})
......
...@@ -17,10 +17,20 @@ if (NOT LITE_ON_TINY_PUBLISH)
# Unlike static library, module library has to link target to be able to work
# as a single .so lib.
target_link_libraries(paddle_lite_jni ${lib_DEPS} ${arm_kernels} ${npu_kernels})
if (LITE_WITH_NPU)
# Strips the symbols of our protobuf functions to fix the conflicts during
# loading HIAI builder libs (libhiai_ir.so and libhiai_ir_build.so)
set(LINK_FLAGS "-Wl,--version-script ${PADDLE_SOURCE_DIR}/lite/core/lite.map")
set_target_properties(paddle_lite_jni PROPERTIES LINK_FLAGS "${LINK_FLAGS}")
endif()
else()
add_library(paddle_lite_jni SHARED "")
target_sources(paddle_lite_jni PUBLIC ${__lite_cc_files} paddle_lite_jni.cc tensor_jni.cc)
add_dependencies(paddle_lite_jni op_list_h kernel_list_h)
if (LITE_WITH_NPU)
# Need to add HIAI runtime libs (libhiai.so) dependency
target_link_libraries(paddle_lite_jni ${npu_runtime_libs})
endif()
endif()
if (APPLE)
......
...@@ -46,8 +46,16 @@ std::string Place::DebugString() const {
}
const std::string& TargetToStr(TargetType target) {
static const std::string target2string[] = {
    "unk", "host", "x86", "cuda", "arm", "opencl", "any", "fpga", "npu"};
static const std::string target2string[] = {
    "unk", "host", "x86", "cuda", "arm", "opencl", "any", "fpga", "npu", "xpu"};
auto x = static_cast<int>(target);
CHECK_LT(x, static_cast<int>(TARGET(NUM)));
return target2string[x];
...@@ -84,7 +92,8 @@ const std::string& TargetRepr(TargetType target) {
"kOpenCL",
"kAny",
"kFPGA",
"kNPU"};
"kNPU",
"kXPU"};
auto x = static_cast<int>(target);
CHECK_LT(x, static_cast<int>(TARGET(NUM)));
return target2string[x];
......
...@@ -50,8 +50,9 @@ enum class TargetType : int {
kOpenCL = 5,
kFPGA = 7,
kNPU = 8,
kXPU = 9,
kAny = 6, // any target
NUM = 9, // number of fields.
NUM = 10, // number of fields.
};
enum class PrecisionType : int {
kUnk = 0,
......
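The enum change above (kXPU = 9, with NUM bumped to 10) is what lets the rest of the framework name the new target. A hedged sketch of how a caller might request it, assuming the existing CxxConfig/Place API from lite/api/paddle_api.h (none of these calls are part of this commit):

// Sketch only: prefer XPU kernels, fall back to x86/host for unsupported ops.
#include <memory>
#include <string>
#include "lite/api/paddle_api.h"

std::shared_ptr<paddle::lite_api::PaddlePredictor> BuildXpuPredictor(
    const std::string& model_dir) {
  paddle::lite_api::CxxConfig config;
  config.set_model_dir(model_dir);
  config.set_valid_places({
      paddle::lite_api::Place{TARGET(kXPU), PRECISION(kFloat)},
      paddle::lite_api::Place{TARGET(kX86), PRECISION(kFloat)},
      paddle::lite_api::Place{TARGET(kHost), PRECISION(kFloat)},
  });
  return paddle::lite_api::CreatePaddlePredictor(config);
}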
...@@ -5,3 +5,4 @@ add_subdirectory(cuda)
add_subdirectory(fpga)
add_subdirectory(host)
add_subdirectory(npu)
add_subdirectory(xpu)
...@@ -2,4 +2,5 @@ if(NOT LITE_WITH_NPU)
return()
endif()
lite_cc_library(npu_runtime SRCS runtime.cc DEPS npu_ddk_hiai)
lite_cc_library(npu_runtime SRCS runtime.cc DEPS ${npu_runtime_libs})
lite_cc_library(npu_builder SRCS builder.cc DEPS ${npu_builder_libs} npu_runtime tensor op scope)
...@@ -12,21 +12,14 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/npu/bridges/utils.h"
#include "lite/backends/npu/builder.h"
#include <mutex> // NOLINT
#include <utility>
#include "ai_ddk_lib/include/graph/buffer.h"
#include "ai_ddk_lib/include/graph/model.h"
#include "ai_ddk_lib/include/graph/op/all_ops.h" // for ge::op::Data
#include "ai_ddk_lib/include/graph/tensor.h" // for ge::TensorUtils
#include "ai_ddk_lib/include/hiai_ir_build.h"
#include "lite/backends/npu/runtime.h" #include "lite/backends/npu/runtime.h"
namespace paddle { namespace paddle {
namespace lite { namespace lite {
namespace kernels {
namespace npu { namespace npu {
namespace bridges {
// Build HIAI IR graph to om model, and store om model data into lite tensor // Build HIAI IR graph to om model, and store om model data into lite tensor
bool BuildModel(std::vector<ge::Operator>& inputs, // NOLINT bool BuildModel(std::vector<ge::Operator>& inputs, // NOLINT
...@@ -165,8 +158,6 @@ bool HasInputArg(const OpInfo* op_info, ...@@ -165,8 +158,6 @@ bool HasInputArg(const OpInfo* op_info,
} }
} }
} // namespace bridges
} // namespace npu
} // namespace kernels
} // namespace lite
} // namespace paddle
...@@ -18,16 +18,159 @@
#include <string>
#include <unordered_map>
#include <vector>
#include "ai_ddk_lib/include/graph/buffer.h"
#include "ai_ddk_lib/include/graph/graph.h"
#include "ai_ddk_lib/include/graph/model.h"
#include "ai_ddk_lib/include/graph/op/all_ops.h"
#include "ai_ddk_lib/include/graph/operator.h"
#include "ai_ddk_lib/include/graph/operator_reg.h" #include "ai_ddk_lib/include/graph/operator_reg.h"
#include "ai_ddk_lib/include/hiai_ir_build.h"
#include "lite/core/op_lite.h" #include "lite/core/op_lite.h"
#include "lite/core/target_wrapper.h" #include "lite/core/target_wrapper.h"
#include "lite/core/tensor.h" #include "lite/core/tensor.h"
// Extended Ops of HIAI DDK
namespace ge {
/**
* Multiply the matrix x1 by the matrix x2 to generate x1 * x2.
* The inputs must be two-dimensional matrices and the inner dimension of "x1"
* (after being transposed if transpose_x1 is true) must match the outer
* dimension of "x2" (after being transposed if transposed_x2 is true). <Input>
* x : the first input tensor, must be non const op.
* w : the second input tensor, must be const op.
* bias: the optional bias tensor, must be const op.
* <Output>
* y : the output tensor.
* <Attr>
* has_bias: If true, enable input bias.
*/
REG_OP(MatMul)
.INPUT(x, TensorType({DT_FLOAT}))
.INPUT(w, TensorType({DT_FLOAT}))
.OPTIONAL_INPUT(bias, TensorType({DT_FLOAT})) // bias must be const input
.OUTPUT(y, TensorType({DT_FLOAT}))
.ATTR(has_bias, AttrValue::BOOL{false}) // when has input::bias,set true
.OP_END();
/**
* Computes the gradients of convolution with respect to the input.
* <Input>
* input_sizes : An integer vector representing the shape of input,
* where input is a 4-D [batch, height, width, channels] tensor.
* filter : the filter tensor, with shape [H , W, filter_channel,
* filter_number], filter_channel must be same as x channel.
* x : The input tensor.
* <Output>
* y : The output tensor.
* <Attr>
* format: 0: NCHW. 1: NHWC
* group : 1: default
* num_output : 0: default, num_output must be equal to
* (filter_channel * group)
* pad : Padding for the beginning and ending along each axis
* stride : Stride along each axis.
* dilation : dilation value along each axis of the filter.
* pad_mode : 0:NOTSET, 5:VALID 6:SAME. defaul value is 0:NOTSET
* bias_term : 0: default
* kernel : The shape of the convolution kernel
*/
REG_OP(Deconvolution)
.INPUT(input_sizes, TensorType({DT_UINT8}))
.INPUT(filter, TensorType({DT_FLOAT}))
.INPUT(x, TensorType({DT_FLOAT}))
.OPTIONAL_INPUT(b, TensorType({DT_FLOAT}))
.OUTPUT(y, TensorType({DT_FLOAT}))
.ATTR(mode, AttrValue::INT{1})
.ATTR(format, AttrValue::INT{1})
.ATTR(group, AttrValue::INT{1})
.ATTR(num_output, AttrValue::INT{0})
.ATTR(pad, AttrValue::LIST_INT({0, 0, 0, 0}))
.ATTR(stride, AttrValue::LIST_INT({1, 1}))
.ATTR(dilation, AttrValue::LIST_INT({1, 1}))
.ATTR(pad_mode, AttrValue::INT{0})
.ATTR(bias_term, AttrValue::INT{0})
.ATTR(kernel, AttrValue::LIST_INT({0, 0}))
.OP_END();
/**
* Resize images to size using bilinear interpolation.
* <Input>
* x : The tensor of 4-D
* w : A int32 Tensor of 2 elements: [height, width].
* <Output>
* y : the output tensor
* <Attr>
* align_corners : If true, the centers of the 4 corner pixels of the
* input and output tensors are aligned, preserving the values at the corner
* pixels.
* output_dim_mode : Defaults 2, including 0: zoom_factor , 1:
* shrink_factor, 2: height/width. when output_dim_mode=2, the output-dim is
* controled by the [height, width] of w.
* shrink_factor : shrink factor.
* zoom_factor : zoom factor.
* pad_begin : begin of pad.
* pad_end : end of pad.
*/
REG_OP(ResizeBilinear)
.INPUT(x, TensorType({DT_FLOAT, DT_INT32}))
.INPUT(w, TensorType({DT_FLOAT, DT_INT32}))
.OUTPUT(y, TensorType({DT_FLOAT, DT_INT32}))
.ATTR(align_corners, AttrValue::BOOL{false})
.ATTR(output_dim_mode, AttrValue::INT{2})
.ATTR(shrink_factor, AttrValue::INT{1})
.ATTR(zoom_factor, AttrValue::INT{1})
.ATTR(pad_begin, AttrValue::INT{0})
.ATTR(pad_end, AttrValue::INT{0})
.OP_END();
/**
* Resize images to size using nearest neighbor interpolation.
* <Input>
* image : Resize images to size using nearest neighbor interpolation.
* size : Must be one dimension and two elements
* <Output>
* output : the output tensor
* <Attr>
* align_corners : If true, the centers of the 4 corner pixels of the
* input and output tensors are aligned, preserving the values at the corner
* pixels. Defaults to false
*/
REG_OP(ResizeNearestNeighbor)
.INPUT(image, TensorType({DT_FLOAT, DT_INT32, DT_UINT8, DT_BOOL}))
.INPUT(size, TensorType({DT_INT32}))
.OUTPUT(output, TensorType({DT_FLOAT, DT_INT32, DT_UINT8, DT_BOOL}))
.ATTR(align_corners, AttrValue::BOOL{false})
.OP_END();
/**
* Pads a tensor.
* <Input>
* x : the input tensor
* padding : the input tensor must be 2-D
* constant_values : constant values must be a scalar
* <Output>
* output : the output tensor
* <Attr>
* t_paddings : Default DT_INT32 , t_paddings must be the same with
* datatype of the padding
* mode : 0: CONSTANT, 1: REFLECT, 2: SYMMETRIC
* T : datatype of constant_values DT_INT32:3 DT_FLOAT:0
*/
REG_OP(Pad)
.INPUT(x, TensorType({DT_FLOAT, DT_INT32}))
.INPUT(padding, TensorType({DT_INT32}))
.OPTIONAL_INPUT(constant_values, TensorType({DT_INT32, DT_FLOAT}))
.OUTPUT(output, TensorType({DT_FLOAT, DT_INT32}))
.ATTR(t_paddings, AttrValue::INT{3})
.ATTR(mode, AttrValue::INT{0})
.REQUIRED_ATTR(T, AttrValue::INT)
.OP_END();
} // namespace ge
namespace paddle {
namespace lite {
namespace kernels {
namespace npu {
namespace bridges {
class OpList {
public:
...@@ -106,8 +249,6 @@ bool HasInputArg(const OpInfo* op_info,
const Scope* scope,
const std::string& argname);
} // namespace bridges
} // namespace npu
} // namespace kernels
} // namespace lite
} // namespace paddle
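The REG_OP blocks in this header only declare the extended HIAI ops; a bridge then uses the DDK-generated setters to wire them into a graph. A hedged usage sketch (the set_input_*/set_attr_* accessor names are assumed to follow the REG_OP(MatMul) declaration above; they are not part of this diff):

// Sketch only: create a MatMul node for a fully-connected layer.
#include <memory>
#include "lite/backends/npu/builder.h"

std::shared_ptr<ge::op::MatMul> CreateFcNode(ge::Operator& x_node,   // NOLINT
                                             ge::Operator& w_node) { // NOLINT
  auto matmul_node = std::make_shared<ge::op::MatMul>("fc_0");
  matmul_node->set_input_x(x_node);       // non-const activation input
  matmul_node->set_input_w(w_node);       // const weight input
  matmul_node->set_attr_has_bias(false);  // no bias input supplied
  return matmul_node;
}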
...@@ -32,8 +32,8 @@ math_library(sampler)
math_library(gru_compute DEPS activation_functions math_function)
math_library(lstm_compute DEPS activation_functions)
lite_cc_library(blas SRCS blas.cc DEPS cblas framework_proto eigen3)
lite_cc_library(blas SRCS blas.cc DEPS cblas framework_proto eigen3 dynload_mklml)
math_library(math_function DEPS blas)
math_library(math_function DEPS blas dynload_mklml)
math_library(maxouting)
math_library(pooling)
math_library(selected_rows_functor DEPS selected_rows math_function blas)
......
if(NOT LITE_WITH_XPU)
return()
endif()
lite_cc_library(xpu_runtime SRCS runtime.cc DEPS ${xpu_runtime_libs})
lite_cc_library(xpu_builder SRCS builder.cc DEPS ${xpu_builder_libs} xpu_runtime tensor op scope)
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/backends/xpu/builder.h"
#include <mutex> // NOLINT
#include <utility>
#include "lite/backends/xpu/runtime.h"
namespace paddle {
namespace lite {
namespace xpu {
bool HasInputArg(const OpInfo* op_info,
const Scope* scope,
const std::string& argname) {
auto iarg_names = op_info->input_argnames();
if (std::find(iarg_names.begin(), iarg_names.end(), argname) !=
iarg_names.end()) {
auto inputs = op_info->Input(argname);
if (inputs.empty()) {
return false;
}
auto var_name = inputs.front();
auto var = scope->FindVar(var_name);
return var != nullptr;
} else {
return false;
}
}
std::string UniqueName(const std::string& prefix) {
static std::mutex counter_mtx;
static std::unordered_map<std::string, int> counter_map;
std::unique_lock<std::mutex> counter_lck(counter_mtx);
int counter = 1;
auto it = counter_map.find(prefix);
if (it == counter_map.end()) {
counter_map[prefix] = counter;
} else {
counter = ++(it->second);
}
return prefix + "_" + std::to_string(counter);
}
xtcl::DataType CvtPrecisionType(PrecisionType in_type) {
xtcl::DataType out_type = ::xtcl::Float(32);
switch (in_type) {
case PRECISION(kFloat):
out_type = ::xtcl::Float(32);
break;
case PRECISION(kInt8):
out_type = ::xtcl::Int(8);
break;
case PRECISION(kInt32):
out_type = ::xtcl::Int(32);
break;
default:
LOG(FATAL) << "Can not convert precision type(" << PrecisionToStr(in_type)
<< ") from Lite to XPU";
break;
}
return out_type;
}
DLDataType CvtDataType(PrecisionType in_type) {
DLDataType out_type = {kDLFloat, 32, 1};
switch (in_type) {
case PRECISION(kFloat):
out_type = {kDLFloat, 32, 1};
break;
case PRECISION(kInt8):
out_type = {kDLInt, 8, 1};
break;
case PRECISION(kInt32):
out_type = {kDLInt, 32, 1};
break;
default:
LOG(FATAL) << "Can not convert data type(" << PrecisionToStr(in_type)
<< ") from Lite to XPU";
break;
}
return out_type;
}
xtcl::Array<xtcl::xIndexExpr> CvtShape(const std::vector<int>& in_shape) {
xtcl::Array<xtcl::xIndexExpr> out_shape;
for (auto dim : in_shape) {
out_shape.push_back(dim);
}
return out_shape;
}
xtcl::Array<xtcl::xIndexExpr> CvtShape(const std::vector<int64_t>& in_shape) {
return CvtShape(std::vector<int>(in_shape.begin(), in_shape.end()));
}
xtcl::Array<xtcl::xIndexExpr> CvtShape(const DDim& in_dims) {
return CvtShape(in_dims.Vectorize());
}
std::shared_ptr<xtcl::xNDArray> CvtTensor(lite::Tensor* in_tensor,
std::vector<int64_t> out_shape,
PrecisionType in_ptype,
DataLayoutType in_ltype) {
uint8_t* in_data = nullptr;
auto in_size = in_tensor->dims().production();
auto in_shape = in_tensor->dims().Vectorize();
if (out_shape.empty()) {
out_shape = in_shape;
}
int in_bytes;
if (in_ptype == PRECISION(kFloat)) {
in_data = reinterpret_cast<uint8_t*>(in_tensor->mutable_data<float>());
in_bytes = in_size * sizeof(float);
} else if (in_ptype == PRECISION(kInt32)) {
in_data = reinterpret_cast<uint8_t*>(in_tensor->mutable_data<int32_t>());
in_bytes = in_size * sizeof(int32_t);
} else if (in_ptype == PRECISION(kInt8)) {
in_data = reinterpret_cast<uint8_t*>(in_tensor->mutable_data<int8_t>());
in_bytes = in_size * sizeof(int8_t);
} else {
LOG(FATAL) << "Unknow precision type " << PrecisionToStr(in_ptype);
}
auto out_tensor = std::make_shared<xtcl::xNDArray>(
xtcl::xNDArray::Empty(out_shape, CvtDataType(in_ptype), {kDLCPU, 0}));
auto out_data =
reinterpret_cast<uint8_t*>(out_tensor->ToDLPack()->dl_tensor.data);
std::memcpy(out_data, in_data, in_bytes);
return out_tensor;
}
// Build the XPU subgraph to the XPU model, store the model data into the
// weight tensor of the graph op, and the model data will be loaded again
// by the graph computing kernel when the graph op is executed for inference.
// Due to the lack of XPU APIs for building and outputing the model data,
// the compiled XPU runtime object will be managed by the global variable
// 'DeviceInfo' and the key name for finding the runtime object will be
// stored in the weight tensor of graph op.
// TODO(hong19860320) Compile the XPU subgraph and output the compiled model
// data to the weight tensor of graph op.
bool BuildModel(
std::shared_ptr<xtcl::network::xNetworkBuilder> builder,
std::shared_ptr<xtcl::network::xTensorCompiler::ParamNDArrayMap> params,
std::vector<std::shared_ptr<xtcl::xExpr>>* outputs,
lite::Tensor* model) {
LOG(INFO) << "[XPU] Build Model.";
CHECK(builder != nullptr);
CHECK(outputs != nullptr);
CHECK_GT(outputs->size(), 0);
CHECK(model != nullptr);
// build graph and fill all of constant params
xtcl::xNetwork network = builder->FinalizeNetwork(*((*outputs)[0]));
auto target = xtcl::Target::Create("llvm");
auto compiler = xtcl::network::xTensorCompiler(network, target);
compiler.SetParams(*params); // set the data of constant tensors
compiler.Build();
// create and register runtime
auto runtime = std::make_shared<xtcl::network::xRuntimeInstance>(
compiler.CreateRuntimeInstance());
if (runtime == nullptr) {
LOG(WARNING) << "[XPU] Build Model failed!";
return false;
}
std::string name = UniqueName("xpu");
LOG(INFO) << "[XPU] Model Name: " << name;
DeviceInfo::Global().Insert(name, runtime);
model->Resize({static_cast<int64_t>(name.length() + 1)});
memcpy(model->mutable_data<int8_t>(),
reinterpret_cast<const int8_t*>(name.c_str()),
name.length() + 1);
return true;
}
} // namespace xpu
} // namespace lite
} // namespace paddle
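The Cvt* helpers above are the glue the XPU bridges use to translate Lite types into XTCL terms. A brief, hedged usage sketch under the same assumptions (a float tensor already filled on the host; function and variable names here are illustrative only):

// Sketch only: converting a Lite tensor and its metadata for XTCL.
#include "lite/backends/xpu/builder.h"

void ConvertForXtcl(paddle::lite::Tensor* t) {
  // Lite DDim -> xtcl::Array<xtcl::xIndexExpr>
  auto xtcl_shape = paddle::lite::xpu::CvtShape(t->dims());
  // Lite precision -> XTCL dtype (unsupported precisions LOG(FATAL))
  auto xtcl_dtype = paddle::lite::xpu::CvtPrecisionType(PRECISION(kFloat));
  // Copy the tensor contents into a host-side xtcl::xNDArray
  auto ndarray = paddle::lite::xpu::CvtTensor(t);
  (void)xtcl_shape;
  (void)xtcl_dtype;
  (void)ndarray;
}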
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <xtcl/xtcl.h>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "lite/core/op_lite.h"
#include "lite/core/target_wrapper.h"
#include "lite/core/tensor.h"
namespace paddle {
namespace lite {
namespace xpu {
bool HasInputArg(const OpInfo* op_info,
const Scope* scope,
const std::string& argname);
std::string UniqueName(const std::string& prefix);
xtcl::DataType CvtPrecisionType(PrecisionType in_type);
DLDataType CvtDataType(PrecisionType in_type);
xtcl::Array<xtcl::xIndexExpr> CvtShape(const std::vector<int>& in_shape);
xtcl::Array<xtcl::xIndexExpr> CvtShape(const std::vector<int64_t>& in_shape);
xtcl::Array<xtcl::xIndexExpr> CvtShape(const DDim& in_dims);
std::shared_ptr<xtcl::xNDArray> CvtTensor(
Tensor* in_tensor,
std::vector<int64_t> out_shape = {},
PrecisionType in_ptype = PRECISION(kFloat),
DataLayoutType in_ltype = DATALAYOUT(kNCHW));
bool BuildModel(
std::shared_ptr<xtcl::network::xNetworkBuilder> builder,
std::shared_ptr<xtcl::network::xTensorCompiler::ParamNDArrayMap> params,
std::vector<std::shared_ptr<xtcl::xExpr>>* outputs,
lite::Tensor* model);
} // namespace xpu
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/backends/xpu/runtime.h"
#include <vector>
#include "lite/utils/cp_logging.h"
namespace paddle {
namespace lite {
namespace xpu {
// Extract the model data and recover the XPU model for inference, the function
// is called by the graph computing kernel when the graph op is executed.
// Due to the lack of XPU APIs for loading and recovering the XPU model from
// memory, the key name is obtained from the weight tensor of graph op, to get
// the runtime object for inference from the global variable 'DeviceInfo'.
// TODO(hong19860320) Recover the XPU model from the weight tensor of graph op.
bool LoadModel(const lite::Tensor &model,
std::shared_ptr<xtcl::network::xRuntimeInstance> *runtime) {
LOG(INFO) << "[XPU] Load Model.";
CHECK_GT(model.dims().production(), 0);
std::string name(reinterpret_cast<const char *>(model.data<int8_t>()));
LOG(INFO) << "[XPU] Model Name: " << name;
CHECK(runtime != nullptr);
*runtime = DeviceInfo::Global().Find(name);
if (*runtime == nullptr) {
LOG(WARNING) << "[XPU] Load Model failed!";
return false;
}
return true;
}
} // namespace xpu
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <xtcl/xtcl.h>
#include <memory>
#include <string>
#include <unordered_map>
#include <utility>
#include "lite/core/tensor.h"
namespace paddle {
namespace lite {
namespace xpu {
class DeviceInfo {
public:
static DeviceInfo& Global() {
static DeviceInfo x;
return x;
}
DeviceInfo() {}
void Insert(const std::string& name,
std::shared_ptr<xtcl::network::xRuntimeInstance> runtime) {
if (runtimes_.find(name) != runtimes_.end()) {
LOG(WARNING) << "[XPU] Model " << name << " already exists.";
return;
}
runtimes_.emplace(std::make_pair(name, runtime));
}
void Clear() { runtimes_.clear(); }
std::shared_ptr<xtcl::network::xRuntimeInstance> Find(
const std::string& name) const {
if (runtimes_.find(name) != runtimes_.end()) {
return runtimes_.at(name);
} else {
return nullptr;
}
}
private:
int device_id_{0};
std::string device_name_{"default"};
std::unordered_map<std::string,
std::shared_ptr<xtcl::network::xRuntimeInstance>>
runtimes_;
};
bool LoadModel(const lite::Tensor& model,
std::shared_ptr<xtcl::network::xRuntimeInstance>* runtime);
} // namespace xpu
} // namespace lite
} // namespace paddle
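builder.cc and runtime.h together form a registry round trip: BuildModel compiles the subgraph, registers the xRuntimeInstance in DeviceInfo under a UniqueName key, and writes that key into the graph op's weight tensor; LoadModel later recovers the runtime from the same tensor. A minimal, hedged sketch of the lookup side using only the helpers declared above (the caller name is hypothetical):

// Sketch only: recovering the runtime that lite::xpu::BuildModel registered.
#include <memory>
#include "lite/backends/xpu/runtime.h"

bool RunXpuSubgraph(const paddle::lite::Tensor& model) {
  std::shared_ptr<xtcl::network::xRuntimeInstance> runtime;
  if (!paddle::lite::xpu::LoadModel(model, &runtime)) {
    return false;  // key was never registered in DeviceInfo::Global()
  }
  // A graph kernel would now bind inputs/outputs on `runtime` and execute;
  // those xtcl calls are outside the scope of this sketch.
  return true;
}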
...@@ -35,7 +35,7 @@ lite_cc_library(device_info SRCS device_info.cc DEPS tensor)
if (LITE_WITH_ARM)
lite_cc_library(context SRCS context.cc DEPS tensor any device_info CL_DEPS cl_context gflags NPU_DEPS npu_runtime)
else()
lite_cc_library(context SRCS context.cc DEPS tensor any device_info eigen3 CL_DEPS cl_context gflags)
lite_cc_library(context SRCS context.cc DEPS tensor any device_info eigen3 CL_DEPS cl_context gflags XPU_DEPS xpu_runtime)
endif()
#-------------------------------------------- GET CODE META INFO ------------------------------------------
......
...@@ -5,6 +5,6 @@ endif()
lite_cc_library(arena_framework SRCS framework.cc DEPS program gtest)
if(NOT LITE_WITH_OPENCL AND (LITE_WITH_X86 OR LITE_WITH_ARM))
if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_XPU) AND (LITE_WITH_X86 OR LITE_WITH_ARM))
lite_cc_test(test_arena_framework SRCS framework_test.cc DEPS arena_framework ${x86_kernels} ${fpga_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
endif()
...@@ -28,6 +28,9 @@
#ifdef LITE_WITH_NPU
#include "lite/backends/npu/runtime.h"
#endif
#ifdef LITE_WITH_XPU
#include "lite/backends/xpu/runtime.h"
#endif
#include <map>
#include <memory>
...@@ -55,6 +58,7 @@ using X86Context = Context<TargetType::kX86>;
using CUDAContext = Context<TargetType::kCUDA>;
using ARMContext = Context<TargetType::kARM>;
using NPUContext = Context<TargetType::kNPU>;
using XPUContext = Context<TargetType::kXPU>;
using OpenCLContext = Context<TargetType::kOpenCL>;
using FPGAContext = Context<TargetType::kFPGA>;
...@@ -84,6 +88,20 @@ class Context<TargetType::kNPU> {
};
#endif
#ifdef LITE_WITH_XPU
template <>
class Context<TargetType::kXPU> {
public:
Context() {}
explicit Context(const NPUContext& ctx);
// NOTE: InitOnce should only be used by ContextScheduler
void InitOnce() {}
void CopySharedTo(XPUContext* ctx) {}
std::string name() const { return "XPUContext"; }
};
#endif
#ifdef LITE_WITH_ARM
template <>
class Context<TargetType::kARM> {
...@@ -340,6 +358,12 @@ class ContextScheduler {
&ctx->As<NPUContext>());
break;
#endif
#ifdef LITE_WITH_XPU
case TARGET(kXPU):
kernel_contexts_[TargetType::kXPU].As<XPUContext>().CopySharedTo(
&ctx->As<XPUContext>());
break;
#endif
#ifdef LITE_WITH_OPENCL
case TARGET(kOpenCL):
kernel_contexts_[TargetType::kOpenCL].As<OpenCLContext>().CopySharedTo(
...@@ -386,6 +410,9 @@ class ContextScheduler {
#endif
#ifdef LITE_WITH_NPU
InitContext<TargetType::kNPU, NPUContext>();
#endif
#ifdef LITE_WITH_XPU
InitContext<TargetType::kXPU, XPUContext>();
#endif
}
......
...@@ -53,6 +53,7 @@ void ExpandPlaces(std::set<Place>* places, const Place& place) {
TARGET(kARM),
TARGET(kOpenCL),
TARGET(kNPU),
TARGET(kXPU),
TARGET(kFPGA)});
static const Types<PrecisionType> precision_set(
{PRECISION(kFloat), PRECISION(kInt8), PRECISION(kFP16), PRECISION(kAny)});
......
...@@ -16,7 +16,7 @@ set(subgraph_passes subgraph_pass)
if(LITE_WITH_NPU)
lite_cc_library(npu_pass SRCS generate_npu_program_pass.cc
DEPS mir_pass types context ${mir_fusers} ${npu_bridges} ${npu_ddk_libs} graph_op subgraph_pass)
DEPS mir_pass types context ${mir_fusers} ${npu_bridges} graph_op subgraph_pass)
list(APPEND subgraph_passes npu_pass)
lite_cc_test(test_npu_pass SRCS generate_npu_program_pass_test.cc
DEPS npu_pass mir_passes paddle_api_full paddle_api_light gflags
...@@ -30,5 +30,21 @@ if(LITE_WITH_NPU)
endif()
endif()
if(LITE_WITH_XPU)
lite_cc_library(xpu_pass SRCS generate_xpu_program_pass.cc
DEPS mir_pass types context ${mir_fusers} ${xpu_bridges} ${xpu_builder_libs} graph_op subgraph_pass)
list(APPEND subgraph_passes xpu_pass)
lite_cc_test(test_xpu_pass SRCS generate_xpu_program_pass_test.cc
DEPS xpu_pass mir_passes paddle_api_full gflags
ARGS --model_dir=${LITE_MODEL_DIR}/mobilenet_v1
--optimized_model=${LITE_MODEL_DIR}/lite_npu_model_opt SERIAL)
if (WITH_TESTING)
add_dependencies(test_xpu_pass extern_lite_download_mobilenet_v1_tar_gz)
add_dependencies(test_subgraph_pass extern_lite_download_mobilenet_v2_relu_tar_gz)
set(LINK_FLAGS "-Wl,--version-script ${PADDLE_SOURCE_DIR}/lite/core/lite.map")
set_target_properties(test_xpu_pass PROPERTIES LINK_FLAGS "${LINK_FLAGS}")
endif()
endif()
set(subgraph_passes ${subgraph_passes} CACHE INTERNAL "subgraph_passes")
message(STATUS "----> subgraph_passes: ${subgraph_passes}")
...@@ -22,14 +22,9 @@
#include "lite/core/mir/pass_registry.h"
#include "lite/core/mir/pattern_matcher.h"
#include "ai_ddk_lib/include/HiAiModelManagerService.h"
#include "lite/backends/npu/builder.h"
#include "ai_ddk_lib/include/graph/graph.h"
#include "ai_ddk_lib/include/graph/model.h"
#include "ai_ddk_lib/include/graph/op/all_ops.h" // for ge::op::Data
#include "ai_ddk_lib/include/graph/operator_reg.h"
#include "lite/kernels/npu/bridges/paddle_use_npu_bridges.h" #include "lite/kernels/npu/bridges/paddle_use_npu_bridges.h"
#include "lite/kernels/npu/bridges/registry.h" #include "lite/kernels/npu/bridges/registry.h"
#include "lite/kernels/npu/bridges/utils.h"
namespace paddle {
namespace lite {
...@@ -51,7 +46,7 @@ std::shared_ptr<ge::Operator> GenerateNPUProgramPass::CvtVarNode(
auto wgt = std::make_shared<ge::op::Const>(arg.name);
LOG(INFO) << "in convert const:" << arg.name;
VLOG(4) << dims;
wgt->set_attr_value(lite::kernels::npu::bridges::CvtFromLiteTensor(tensor));
wgt->set_attr_value(lite::npu::CvtFromLiteTensor(tensor));
return wgt;
} else {
CHECK_EQ(dims.size(), 4);
...@@ -132,7 +127,7 @@ std::string GenerateNPUProgramPass::BuildNPUGraph(
// Compiling IR graph to NPU model and store mode data into weight tensor with
// persistable=true, Sothat the model parser can recognize it and save it to
// param files
if (!lite::kernels::npu::bridges::BuildModel(inputs, outputs, weight)) {
if (!lite::npu::BuildModel(inputs, outputs, weight)) {
LOG(WARNING) << "Build NPU failed subgraph " << sub_id;
throw std::runtime_error("Build NPU failed subgraph.");
}
......
...@@ -20,10 +20,10 @@
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "lite/backends/npu/builder.h"
#include "lite/core/mir/pass.h"
#include "lite/core/mir/subgraph/subgraph_program_pass.h"
#include "lite/kernels/npu/bridges/registry.h"
#include "lite/kernels/npu/bridges/utils.h"
namespace paddle {
namespace lite {
......
...@@ -93,11 +93,13 @@ void CompareOutputTensor(
auto ref_output_tensor_size = ShapeProduction(ref_output_tensor->shape());
EXPECT_EQ(tar_output_tensor_size, ref_output_tensor_size);
for (size_t j = 0; j < ref_output_tensor_size; j++) {
auto diff =
    std::fabs(tar_output_tensor_data[j] - ref_output_tensor_data[j]) /
    (std::fabs(ref_output_tensor_data[j]) + 1e-6);
VLOG(3) << diff;
EXPECT_LT(diff, 0.1);
auto abs_diff =
    std::fabs(tar_output_tensor_data[j] - ref_output_tensor_data[j]);
auto rel_diff = abs_diff / (std::fabs(ref_output_tensor_data[j]) + 1e-6);
VLOG(3) << "val: " << tar_output_tensor_data[j]
        << " ref: " << ref_output_tensor_data[j]
        << " abs_diff: " << abs_diff << " rel_diff: " << rel_diff;
EXPECT_LT(rel_diff, 0.1);
}
}
}
......
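The reworked check above splits the absolute and relative error so a failing element can be read straight from the log, while the pass criterion remains a 10% relative tolerance. A tiny self-contained illustration of the same formula (hypothetical values, not taken from the test):

// Sketch only: the relative-error tolerance used by CompareOutputTensor.
#include <cassert>
#include <cmath>

int main() {
  float tar = 1.05f;  // value from the target (e.g. XPU/NPU) kernel
  float ref = 1.00f;  // reference value from the host/x86 kernel
  float abs_diff = std::fabs(tar - ref);                 // 0.05
  float rel_diff = abs_diff / (std::fabs(ref) + 1e-6f);  // ~0.05
  assert(rel_diff < 0.1f);  // within the 10% tolerance
  return 0;
}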
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/core/mir/subgraph/generate_xpu_program_pass.h"
#include <memory>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
#include "lite/core/mir/graph_visualize_pass.h"
#include "lite/core/mir/pass_registry.h"
#include "lite/core/mir/pattern_matcher.h"
#include "lite/backends/xpu/builder.h"
#include "lite/kernels/xpu/bridges/paddle_use_xpu_bridges.h"
#include "lite/kernels/xpu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace mir {
namespace subgraph {
std::shared_ptr<xtcl::xExpr> GenerateXPUProgramPass::CvtVarNode(
lite::kernels::xpu::bridges::graph_ctx_type* graph_ctx,
lite::mir::Node* var_node,
const Scope* scope) {
CHECK(var_node->IsArg());
const auto& arg = var_node->AsArg();
auto var_name = arg.name;
VLOG(4) << "[XPU] Convert var node " << var_name;
auto* var = scope->FindVar(var_name);
CHECK(var);
auto* tensor = var->GetMutable<lite::Tensor>();
CHECK(tensor);
auto dims = tensor->dims();
auto cvted_var_node =
std::make_shared<xtcl::xExpr>(graph_ctx->builder->CreateTensor(
var_name, lite::xpu::CvtShape(dims), ::xtcl::Float(32)));
if (arg.is_weight) {
auto cvted_var_tensor = lite::xpu::CvtTensor(tensor);
graph_ctx->params->emplace(std::make_pair(var_name, *cvted_var_tensor));
}
return cvted_var_node;
}
void GenerateXPUProgramPass::CvtAllOpNodes(
const std::vector<Node*>& op_nodes,
lite::kernels::xpu::bridges::graph_ctx_type* graph_ctx,
lite::kernels::xpu::bridges::node_map_type* cvted_var_nodes) {
const auto& bridges = lite::kernels::xpu::bridges::Factory::Instance();
const auto& supported_lists = bridges.AllFunctions();
// return record all converted vars
// op node's inputs must be found in converted_vars
for (auto& node : op_nodes) {
lite::kernels::xpu::bridges::node_map_type input_nodes;
auto& stmt = node->AsStmt();
for (auto& var_node : node->inlinks) {
auto& arg = var_node->AsArg();
// weight should be handled in the converter, so skip here
if (arg.is_weight) {
continue;
}
auto var_name = arg.name;
if (!cvted_var_nodes->count(var_name)) {
cvted_var_nodes->insert(std::make_pair(
var_name, CvtVarNode(graph_ctx, var_node, stmt.op()->scope())));
}
input_nodes.insert(*cvted_var_nodes->find(var_name));
}
auto output_nodes =
supported_lists.at(stmt.op_type())(stmt.op(), graph_ctx, input_nodes);
cvted_var_nodes->insert(output_nodes.begin(), output_nodes.end());
}
}
std::string GenerateXPUProgramPass::BuildXPUGraph(
const std::unordered_set<Node*>& op_nodes,
const std::unordered_set<Node*>& in_data_vars,
const std::unordered_set<Node*>& out_data_vars,
int sub_id) {
auto ordered_op_nodes = GetTopologicalOrder(op_nodes);
lite::kernels::xpu::bridges::graph_ctx_type graph_ctx;
graph_ctx.builder = std::make_shared<xtcl::network::xNetworkBuilder>();
graph_ctx.params =
std::make_shared<xtcl::network::xTensorCompiler::ParamNDArrayMap>();
lite::kernels::xpu::bridges::node_map_type cvted_var_nodes;
CvtAllOpNodes(ordered_op_nodes, &graph_ctx, &cvted_var_nodes);
std::string weight_var_name = "graph" + std::to_string(sub_id) + "_weights";
auto any_op = (*op_nodes.begin())->AsStmt().op();
auto weight = any_op->scope()->Var(weight_var_name)->GetMutable<Tensor>();
weight->set_persistable(true);
weight->set_precision(PRECISION(kInt8));
// Compiling graph to XPU model and store mode data into weight tensor with
// persistable=true, Sothat the model parser can recognize it and save it to
// param files
std::vector<std::shared_ptr<xtcl::xExpr>> ordered_cvted_var_nodes;
for (auto out_data_var : out_data_vars) {
auto var_name = out_data_var->AsArg().name;
ordered_cvted_var_nodes.push_back(cvted_var_nodes[var_name]);
}
if (!lite::xpu::BuildModel(graph_ctx.builder,
graph_ctx.params,
&ordered_cvted_var_nodes,
weight)) {
LOG(WARNING) << "[XPU] Build XPU graph failed (subgraph=" << sub_id << ")";
throw std::runtime_error("[XPU] Build XPU graph failed.");
}
LOG(INFO) << "[XPU] Build XPU graph success (subgraph=" << sub_id << ")";
return weight_var_name;
}
void GenerateXPUProgramPass::GenXPUSubgraph(
const std::unique_ptr<SSAGraph>& graph,
const std::unordered_set<Node*>& op_nodes,
int sub_id) {
std::unordered_set<Node*> in_data_vars;
std::unordered_set<Node*> in_wgt_vars;
std::unordered_set<Node*> out_data_vars;
std::unordered_set<Node*> out_unused_vars;
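  // Classify the var nodes around the subgraph into input data vars, input
  // weight vars, output data vars and unused output vars.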
FindInputOutputVars(
op_nodes, &in_data_vars, &in_wgt_vars, &out_data_vars, &out_unused_vars);
auto weight_var_name =
BuildXPUGraph(op_nodes, in_data_vars, out_data_vars, sub_id);
auto any_op = (*op_nodes.begin())->AsStmt().op();
InsertNewNode(graph,
weight_var_name,
any_op->scope(),
any_op->valid_places(),
in_data_vars,
in_wgt_vars,
out_data_vars,
out_unused_vars);
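  // Remove the original op nodes and the var nodes that are now fully covered
  // by the new graph op node.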
auto nodes2rm = GetNode2rm(
op_nodes, {in_data_vars, in_wgt_vars, out_data_vars, out_unused_vars});
GraphSafeRemoveNodes(graph.get(), nodes2rm);
}
void GenerateXPUProgramPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
LOG(INFO) << "[XPU] Before XPU Pass \n" << Visualize(graph.get());
const auto& bridges = lite::kernels::xpu::bridges::Factory::Instance();
const auto& op_map = bridges.AllFunctions();
std::vector<std::string> supported_op_types;
for (auto& i : op_map) {
LOG(INFO) << "[XPU] Supported type: " << i.first;
supported_op_types.push_back(i.first);
}
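  // Fuse supported ops into subgraphs, run shape inference once to populate
  // tensor dims, then convert each subgraph into a single XPU graph op.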
try {
int num_subgraph = FuseSubgraph(graph, supported_op_types);
InferOnce(graph);
auto op_nodes_all = ClassifySubgraph(graph);
CHECK_EQ(op_nodes_all.size(), num_subgraph);
int id = 1;
for (auto& op_nodes : op_nodes_all) {
LOG(INFO) << "[XPU] Converting Subgraph " << id;
GenXPUSubgraph(graph, op_nodes.second, id);
LOG(INFO) << "[XPU] After XPU Pass Subgraph " << id << "\n"
<< Visualize(graph.get());
id++;
}
} catch (...) {
LOG(WARNING) << "[XPU] Build XPU graph failed.";
throw std::runtime_error("[XPU] Build XPU graph failed.");
}
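  // Collect the remaining statements in topological order to build the
  // runtime program.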
for (auto& item : graph->StmtTopologicalOrder()) {
if (item->IsStmt()) {
auto& stmt = item->AsStmt();
LOG(INFO) << stmt;
insts_.emplace_back(stmt.op(), std::move(stmt.kernels().front()));
}
}
}
std::unique_ptr<RuntimeProgram> GenerateXPUProgramPass::GenProgram() {
LOG(INFO) << "[XPU] program insts.size=" << insts_.size();
std::unique_ptr<RuntimeProgram> program(
new RuntimeProgram(std::move(insts_)));
return program;
}
} // namespace subgraph
} // namespace mir
} // namespace lite
} // namespace paddle
REGISTER_MIR_PASS(generate_xpu_program_pass,
paddle::lite::mir::subgraph::GenerateXPUProgramPass)
.BindTargets({TARGET(kXPU)});
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <map>
#include <memory>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "lite/backends/xpu/builder.h"
#include "lite/core/mir/pass.h"
#include "lite/core/mir/subgraph/subgraph_program_pass.h"
#include "lite/kernels/xpu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace mir {
namespace subgraph {
class GenerateXPUProgramPass : public SubgraphProgramPass {
public:
using key2nodes_t = std::map<std::string, Node*>;
void Apply(const std::unique_ptr<SSAGraph>& graph) override;
std::unique_ptr<RuntimeProgram> GenProgram();
protected:
  // op_nodes: op nodes to convert
  // cvted_var_nodes: returns the converted var nodes
void CvtAllOpNodes(
const std::vector<Node*>& op_nodes,
lite::kernels::xpu::bridges::graph_ctx_type* graph_ctx,
lite::kernels::xpu::bridges::node_map_type* cvted_var_nodes);
std::shared_ptr<xtcl::xExpr> CvtVarNode(
lite::kernels::xpu::bridges::graph_ctx_type* graph_ctx,
lite::mir::Node* var_node,
const Scope* scope);
std::string BuildXPUGraph(const std::unordered_set<Node*>& op_nodes,
const std::unordered_set<Node*>& in_data_vars,
const std::unordered_set<Node*>& out_data_vars,
int sub_id);
void GenXPUSubgraph(const std::unique_ptr<SSAGraph>& graph,
const std::unordered_set<Node*>& op_nodes,
int sub_id);
private:
std::vector<Instruction> insts_;
};
} // namespace subgraph
} // namespace mir
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gtest/gtest.h>
#include <cmath>
#include "lite/api/paddle_api.h"
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/api/paddle_use_passes.h"
#include "lite/api/test_helper.h"
#include "lite/utils/cp_logging.h"
DEFINE_string(model_file, "", "model file path of combined protobuf model");
DEFINE_string(params_file, "", "params file path of combined protobuf model");
DEFINE_string(optimized_model_dir, "", "path of optimized naive buffer model");
DEFINE_string(input_tensor_shape, "1,3,224,224", "shapes of input tensors");
DEFINE_int32(output_tensor_num, 1, "number of output tensors");
namespace paddle {
namespace lite {
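// Parse input tensor shapes from a string such as "1,3,224,224" or
// "1,3,224,224:1,80", where ':' separates tensors and ',' separates dims.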
std::vector<std::vector<int64_t>> ParseShape(std::string txt) {
std::vector<std::vector<int64_t>> shape;
while (!txt.empty()) {
size_t idx = txt.find_first_of(":");
std::string dims = txt.substr(0, idx);
std::vector<int64_t> s;
while (!dims.empty()) {
size_t idx = dims.find_first_of(",");
int d = atoi(dims.substr(0, idx).c_str());
VLOG(3) << d;
s.push_back(d);
if (idx == std::string::npos) {
break;
} else {
dims = dims.substr(idx + 1);
}
}
shape.push_back(s);
if (idx == std::string::npos) {
break;
} else {
txt = txt.substr(idx + 1);
}
}
return shape;
}
int64_t ShapeProduction(std::vector<int64_t> shape) {
int64_t s = 1;
for (int64_t dim : shape) {
s *= dim;
}
return s;
}
void FillInputTensor(
const std::shared_ptr<lite_api::PaddlePredictor>& predictor,
const std::vector<std::vector<int64_t>>& input_tensor_shape,
const float value) {
for (int i = 0; i < input_tensor_shape.size(); i++) {
auto input_tensor = predictor->GetInput(i);
input_tensor->Resize(input_tensor_shape[i]);
auto input_tensor_data = input_tensor->mutable_data<float>();
auto input_tensor_size = ShapeProduction(input_tensor->shape());
for (int j = 0; j < input_tensor_size; j++) {
input_tensor_data[j] = value;
}
}
}
void CompareOutputTensor(
const std::shared_ptr<lite_api::PaddlePredictor>& tar_predictor,
const std::shared_ptr<lite_api::PaddlePredictor>& ref_predictor,
const int output_tensor_num) {
for (int i = 0; i < output_tensor_num; i++) {
auto tar_output_tensor = tar_predictor->GetOutput(i);
auto ref_output_tensor = ref_predictor->GetOutput(i);
auto tar_output_tensor_data = tar_output_tensor->data<float>();
auto ref_output_tensor_data = ref_output_tensor->data<float>();
auto tar_output_tensor_size = ShapeProduction(tar_output_tensor->shape());
auto ref_output_tensor_size = ShapeProduction(ref_output_tensor->shape());
EXPECT_EQ(tar_output_tensor_size, ref_output_tensor_size);
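    // Compare with a relative error: the small epsilon in the denominator
    // avoids division by zero, and up to 10% deviation is tolerated.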
for (size_t j = 0; j < ref_output_tensor_size; j++) {
auto diff =
std::fabs(tar_output_tensor_data[j] - ref_output_tensor_data[j]) /
(std::fabs(ref_output_tensor_data[j]) + 1e-6);
VLOG(3) << diff;
EXPECT_LT(diff, 0.1);
}
}
}
std::shared_ptr<lite_api::PaddlePredictor> TestModel(
const std::string& model_dir,
const std::string& model_file,
const std::string& params_file,
const std::vector<lite_api::Place>& valid_places,
const std::vector<std::vector<int64_t>>& input_tensor_shape,
const std::string& optimized_model_dir) {
// generate optimized model
lite_api::CxxConfig cxx_config;
cxx_config.set_model_dir(model_dir);
cxx_config.set_model_file(model_file);
cxx_config.set_param_file(params_file);
cxx_config.set_valid_places(valid_places);
auto predictor = lite_api::CreatePaddlePredictor(cxx_config);
FillInputTensor(predictor, input_tensor_shape, -1);
predictor->SaveOptimizedModel(optimized_model_dir,
lite_api::LiteModelType::kNaiveBuffer);
#if 0 // TODO(hong19860320) support light api for XPU
// load optimized model
lite_api::MobileConfig mobile_config;
mobile_config.set_model_dir(optimized_model_dir);
mobile_config.set_power_mode(lite_api::PowerMode::LITE_POWER_HIGH);
mobile_config.set_threads(1);
predictor = lite_api::CreatePaddlePredictor(mobile_config);
FillInputTensor(predictor, input_tensor_shape, 1);
#endif
// run optimized model
for (int i = 0; i < FLAGS_warmup; i++) {
predictor->Run();
}
for (int i = 0; i < FLAGS_repeats; i++) {
auto start = GetCurrentUS();
predictor->Run();
LOG(INFO) << i << ", " << GetCurrentUS() - start << "us";
}
return predictor;
}
TEST(XPUSubgraph, compare) {
  // parse the input tensor shapes; supported formats: "1,3,224,224" or
  // "1,3,224,224:1,80" for multiple input tensors
std::vector<std::vector<int64_t>> input_tensor_shape =
ParseShape(FLAGS_input_tensor_shape);
// generate and run optimized CPU model
LOG(INFO) << " ================ CPU ================== ";
auto cpu_predictor =
TestModel(FLAGS_model_dir,
FLAGS_model_file,
FLAGS_params_file,
{lite_api::Place{TARGET(kX86), PRECISION(kFloat)}},
input_tensor_shape,
FLAGS_optimized_model_dir + "/CPU");
// generate and run optimized XPU model
LOG(INFO) << " ================ XPU ================== ";
auto xpu_predictor =
TestModel(FLAGS_model_dir,
FLAGS_model_file,
FLAGS_params_file,
{lite_api::Place{TARGET(kXPU), PRECISION(kFloat)},
lite_api::Place{TARGET(kX86), PRECISION(kFloat)}},
input_tensor_shape,
FLAGS_optimized_model_dir + "/XPU");
// verify results
CompareOutputTensor(xpu_predictor, cpu_predictor, FLAGS_output_tensor_num);
}
} // namespace lite
} // namespace paddle
...@@ -207,8 +207,26 @@ void SubgraphProgramPass::InferOnce(const std::unique_ptr<SSAGraph>& graph) { ...@@ -207,8 +207,26 @@ void SubgraphProgramPass::InferOnce(const std::unique_ptr<SSAGraph>& graph) {
if (!item->IsStmt()) continue; if (!item->IsStmt()) continue;
auto& stmt = item->AsStmt(); auto& stmt = item->AsStmt();
auto& op = stmt.op(); auto& op = stmt.op();
auto scope = op->scope();
std::string op_type = op->op_info()->Type(); std::string op_type = op->op_info()->Type();
    if (op_type == "feed" || op_type == "fetch") continue; // check that the dimensions of the input variables in the scope are not empty
if (op_type == "feed") {
auto input_var_names = op->op_info()->output_names();
CHECK_GE(input_var_names.size(), 1);
for (auto input_var_name : input_var_names) {
auto input_var = scope->FindVar(input_var_name);
CHECK(input_var) << "No input variable '" << input_var_name
<< "' found in scope " << scope;
auto input = input_var->GetMutable<lite::Tensor>();
CHECK(!input->dims().empty()) << "The dimension of input variable '"
<< input_var_name
<< "' can not be empty.";
}
continue;
}
if (op_type == "fetch") {
continue;
}
op->CheckShape(); op->CheckShape();
op->InferShape(); op->InferShape();
// TOOD(xxx): remove Launch() at last // TOOD(xxx): remove Launch() at last
......
...@@ -46,6 +46,9 @@ TEST(SubgraphTest, models) { ...@@ -46,6 +46,9 @@ TEST(SubgraphTest, models) {
#endif #endif
#ifdef LITE_WITH_NPU #ifdef LITE_WITH_NPU
Place{TARGET(kNPU), PRECISION(kFloat)}, Place{TARGET(kNPU), PRECISION(kFloat)},
#endif
#ifdef LITE_WITH_XPU
Place{TARGET(kXPU), PRECISION(kFloat)},
#endif #endif
}); });
lite::Program program(program_desc, scope, valid_places); lite::Program program(program_desc, scope, valid_places);
......
...@@ -78,6 +78,9 @@ std::list<std::unique_ptr<KernelBase>> KernelRegistry::Create( ...@@ -78,6 +78,9 @@ std::list<std::unique_ptr<KernelBase>> KernelRegistry::Create(
case TARGET(kNPU): { case TARGET(kNPU): {
CREATE_KERNEL(kNPU); CREATE_KERNEL(kNPU);
} break; } break;
case TARGET(kXPU): {
CREATE_KERNEL(kXPU);
} break;
case TARGET(kFPGA): { case TARGET(kFPGA): {
CREATE_KERNEL(kFPGA); CREATE_KERNEL(kFPGA);
} break; } break;
...@@ -142,6 +145,11 @@ KernelRegistry::KernelRegistry() ...@@ -142,6 +145,11 @@ KernelRegistry::KernelRegistry()
INIT_FOR(kNPU, kAny, kNCHW); INIT_FOR(kNPU, kAny, kNCHW);
INIT_FOR(kNPU, kAny, kAny); INIT_FOR(kNPU, kAny, kAny);
INIT_FOR(kXPU, kFloat, kNCHW);
INIT_FOR(kXPU, kInt8, kNCHW);
INIT_FOR(kXPU, kAny, kNCHW);
INIT_FOR(kXPU, kAny, kAny);
INIT_FOR(kFPGA, kFP16, kNHWC); INIT_FOR(kFPGA, kFP16, kNHWC);
INIT_FOR(kFPGA, kFP16, kAny); INIT_FOR(kFPGA, kFP16, kAny);
INIT_FOR(kFPGA, kFloat, kNHWC); INIT_FOR(kFPGA, kFloat, kNHWC);
......
...@@ -178,6 +178,16 @@ class KernelRegistry final { ...@@ -178,6 +178,16 @@ class KernelRegistry final {
PRECISION(kInt8), PRECISION(kInt8),
DATALAYOUT(kNCHW)> *, // DATALAYOUT(kNCHW)> *, //
KernelRegistryForTarget<TARGET(kXPU),
PRECISION(kAny),
DATALAYOUT(kAny)> *, //
KernelRegistryForTarget<TARGET(kXPU),
PRECISION(kFloat),
DATALAYOUT(kNCHW)> *, //
KernelRegistryForTarget<TARGET(kXPU),
PRECISION(kInt8),
DATALAYOUT(kNCHW)> *, //
KernelRegistryForTarget<TARGET(kFPGA), KernelRegistryForTarget<TARGET(kFPGA),
PRECISION(kFloat), PRECISION(kFloat),
DATALAYOUT(kNCHW)> *, // DATALAYOUT(kNCHW)> *, //
......
...@@ -28,6 +28,9 @@ ...@@ -28,6 +28,9 @@
#ifdef LITE_WITH_NPU #ifdef LITE_WITH_NPU
#include "lite/core/mir/subgraph/generate_npu_program_pass.h" #include "lite/core/mir/subgraph/generate_npu_program_pass.h"
#endif #endif
#ifdef LITE_WITH_XPU
#include "lite/core/mir/subgraph/generate_xpu_program_pass.h"
#endif
namespace paddle { namespace paddle {
namespace lite { namespace lite {
...@@ -106,7 +109,8 @@ class Optimizer { ...@@ -106,7 +109,8 @@ class Optimizer {
"runtime_context_assign_pass", "runtime_context_assign_pass",
"argument_type_display_pass", // "argument_type_display_pass", //
#if !defined(LITE_WITH_OPENCL) && !defined(LITE_WITH_NPU) #if !defined(LITE_WITH_OPENCL) && !defined(LITE_WITH_NPU) && \
!defined(LITE_WITH_XPU)
// TODO(ysh329): cause CL_INVALID_MEM_OBJECT when setArg in kernel // TODO(ysh329): cause CL_INVALID_MEM_OBJECT when setArg in kernel
"memory_optimize_pass", "memory_optimize_pass",
#endif #endif
...@@ -121,14 +125,27 @@ class Optimizer { ...@@ -121,14 +125,27 @@ class Optimizer {
// Generate a new program based on the mir graph. // Generate a new program based on the mir graph.
std::unique_ptr<RuntimeProgram> GenRuntimeProgram() { std::unique_ptr<RuntimeProgram> GenRuntimeProgram() {
#if defined(LITE_WITH_NPU) || defined(LITE_WITH_XPU)
auto target_place = Place{
#ifdef LITE_WITH_NPU #ifdef LITE_WITH_NPU
if (std::find(valid_places_.begin(), TARGET(kNPU),
valid_places_.end(), #endif
Place{TARGET(kNPU), PRECISION(kFloat)}) != #ifdef LITE_WITH_XPU
TARGET(kXPU),
#endif
PRECISION(kFloat)};
if (std::find(valid_places_.begin(), valid_places_.end(), target_place) !=
valid_places_.end()) { valid_places_.end()) {
#ifdef LITE_WITH_NPU
auto pass = mir::PassManager::Global() auto pass = mir::PassManager::Global()
.LookUp<mir::subgraph::GenerateNPUProgramPass>( .LookUp<mir::subgraph::GenerateNPUProgramPass>(
"generate_npu_program_pass"); "generate_npu_program_pass");
#endif
#ifdef LITE_WITH_XPU
auto pass = mir::PassManager::Global()
.LookUp<mir::subgraph::GenerateXPUProgramPass>(
"generate_xpu_program_pass");
#endif
try { try {
pass->Apply(graph_); pass->Apply(graph_);
auto program = pass->GenProgram(); auto program = pass->GenProgram();
...@@ -136,7 +153,8 @@ class Optimizer { ...@@ -136,7 +153,8 @@ class Optimizer {
program->set_exec_scope(exec_scope_); program->set_exec_scope(exec_scope_);
return program; return program;
} catch (...) { } catch (...) {
LOG(WARNING) << "Build NPU graph failed"; LOG(WARNING) << "Build " << TargetToStr(target_place.target)
<< " program failed!";
} }
} }
#endif #endif
......
...@@ -15,6 +15,7 @@ lite_cc_test(test_gen_code SRCS gen_code_test.cc ...@@ -15,6 +15,7 @@ lite_cc_test(test_gen_code SRCS gen_code_test.cc
X86_DEPS ${x86_kernels} X86_DEPS ${x86_kernels}
ARM_DEPS ${arm_kernels} ARM_DEPS ${arm_kernels}
NPU_DEPS ${npu_kernels} NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels}
CL_DEPS ${opencl_kernels} CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels} FPGA_DEPS ${fpga_kernels}
EXCLUDE_COMPILE_DEPS "ON" EXCLUDE_COMPILE_DEPS "ON"
...@@ -42,6 +43,7 @@ lite_cc_test(test_generated_code SRCS generated_code_test.cc DEPS __generated_co ...@@ -42,6 +43,7 @@ lite_cc_test(test_generated_code SRCS generated_code_test.cc DEPS __generated_co
X86_DEPS ${x86_kernels} X86_DEPS ${x86_kernels}
ARM_DEPS ${arm_kernels} ARM_DEPS ${arm_kernels}
NPU_DEPS ${npu_kernels} NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels}
CL_DEPS ${opencl_kernels} CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels} FPGA_DEPS ${fpga_kernels}
EXCLUDE_COMPILE_DEPS "ON" EXCLUDE_COMPILE_DEPS "ON"
......
...@@ -9,3 +9,4 @@ add_subdirectory(x86) ...@@ -9,3 +9,4 @@ add_subdirectory(x86)
add_subdirectory(opencl) add_subdirectory(opencl)
add_subdirectory(fpga) add_subdirectory(fpga)
add_subdirectory(npu) add_subdirectory(npu)
add_subdirectory(xpu)
lite_cc_library(npu_bridge_registry SRCS registry.cc DEPS ${npu_ddk_libs}) lite_cc_library(npu_bridge_registry SRCS registry.cc)
lite_cc_library(npu_bridge_utils SRCS utils.cc DEPS ${npu_ddk_libs} npu_runtime tensor op scope)
set(npu_bridge_deps npu_bridge_registry npu_bridge_utils op) set(npu_bridge_deps npu_bridge_registry npu_builder op)
lite_cc_library(npu_bridge_fc_op SRCS fc_op.cc DEPS ${npu_bridge_deps}) lite_cc_library(npu_bridge_fc_op SRCS fc_op.cc DEPS ${npu_bridge_deps})
lite_cc_library(npu_bridge_conv_op SRCS conv_op.cc DEPS ${npu_bridge_deps}) lite_cc_library(npu_bridge_conv_op SRCS conv_op.cc DEPS ${npu_bridge_deps})
...@@ -23,7 +22,6 @@ lite_cc_library(npu_bridge_pad2d_op SRCS pad2d_op.cc DEPS ${npu_bridge_deps}) ...@@ -23,7 +22,6 @@ lite_cc_library(npu_bridge_pad2d_op SRCS pad2d_op.cc DEPS ${npu_bridge_deps})
set(npu_bridges set(npu_bridges
npu_bridge_registry npu_bridge_registry
npu_bridge_utils
npu_bridge_fc_op npu_bridge_fc_op
npu_bridge_conv_op npu_bridge_conv_op
npu_bridge_mul_op npu_bridge_mul_op
...@@ -43,7 +41,7 @@ set(npu_bridges ...@@ -43,7 +41,7 @@ set(npu_bridges
npu_bridge_pad2d_op npu_bridge_pad2d_op
CACHE INTERNAL "npu_bridges") CACHE INTERNAL "npu_bridges")
set(npu_bridge_test_deps ${npu_ddk_libs} ${npu_bridges} ${npu_kernels} ${ops}) set(npu_bridge_test_deps ${npu_bridges} ${npu_kernels} ${ops})
lite_cc_test(test_npu_bridge_fc_op SRCS fc_op_test.cc test_helper.cc DEPS ${npu_bridge_test_deps}) lite_cc_test(test_npu_bridge_fc_op SRCS fc_op_test.cc test_helper.cc DEPS ${npu_bridge_test_deps})
lite_cc_test(test_npu_bridge_conv_op SRCS conv_op_test.cc test_helper.cc DEPS ${npu_bridge_test_deps}) lite_cc_test(test_npu_bridge_conv_op SRCS conv_op_test.cc test_helper.cc DEPS ${npu_bridge_test_deps})
......
...@@ -12,14 +12,8 @@ ...@@ -12,14 +12,8 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "ai_ddk_lib/include/graph/buffer.h" #include "lite/backends/npu/builder.h"
#include "ai_ddk_lib/include/graph/graph.h"
#include "ai_ddk_lib/include/graph/model.h"
#include "ai_ddk_lib/include/graph/op/all_ops.h"
#include "ai_ddk_lib/include/graph/operator.h"
#include "ai_ddk_lib/include/graph/operator_reg.h"
#include "lite/kernels/npu/bridges/registry.h" #include "lite/kernels/npu/bridges/registry.h"
#include "lite/kernels/npu/bridges/utils.h"
namespace paddle { namespace paddle {
namespace lite { namespace lite {
...@@ -32,7 +26,7 @@ node_map_type ActConverter(const std::shared_ptr<lite::OpLite> act_op, ...@@ -32,7 +26,7 @@ node_map_type ActConverter(const std::shared_ptr<lite::OpLite> act_op,
auto scope = act_op->scope(); auto scope = act_op->scope();
auto op_info = act_op->op_info(); auto op_info = act_op->op_info();
auto op_type = op_info->Type(); auto op_type = op_info->Type();
auto unique_op_type = UniqueName(op_type); auto unique_op_type = lite::npu::UniqueName(op_type);
LOG(INFO) << "Converting " + op_type + "..."; LOG(INFO) << "Converting " + op_type + "...";
// create act node and set input node from inputs_map // create act node and set input node from inputs_map
...@@ -40,8 +34,8 @@ node_map_type ActConverter(const std::shared_ptr<lite::OpLite> act_op, ...@@ -40,8 +34,8 @@ node_map_type ActConverter(const std::shared_ptr<lite::OpLite> act_op,
auto act_node = std::make_shared<ge::op::Activation>(unique_op_type); auto act_node = std::make_shared<ge::op::Activation>(unique_op_type);
CHECK(inputs_map.count(x_var_name)); CHECK(inputs_map.count(x_var_name));
act_node->set_input_x(*inputs_map.at(x_var_name)); act_node->set_input_x(*inputs_map.at(x_var_name));
OpList::Global().add(inputs_map.at(x_var_name)); lite::npu::OpList::Global().add(inputs_map.at(x_var_name));
OpList::Global().add(act_node); lite::npu::OpList::Global().add(act_node);
// parse and set activation type // parse and set activation type
int act_mode = 1; int act_mode = 1;
......
...@@ -12,14 +12,8 @@ ...@@ -12,14 +12,8 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "ai_ddk_lib/include/graph/buffer.h" #include "lite/backends/npu/builder.h"
#include "ai_ddk_lib/include/graph/graph.h"
#include "ai_ddk_lib/include/graph/model.h"
#include "ai_ddk_lib/include/graph/op/all_ops.h"
#include "ai_ddk_lib/include/graph/operator.h"
#include "ai_ddk_lib/include/graph/operator_reg.h"
#include "lite/kernels/npu/bridges/registry.h" #include "lite/kernels/npu/bridges/registry.h"
#include "lite/kernels/npu/bridges/utils.h"
namespace paddle { namespace paddle {
namespace lite { namespace lite {
...@@ -33,7 +27,7 @@ node_map_type BatchNormConverter( ...@@ -33,7 +27,7 @@ node_map_type BatchNormConverter(
auto scope = batch_norm_op->scope(); auto scope = batch_norm_op->scope();
auto op_info = batch_norm_op->op_info(); auto op_info = batch_norm_op->op_info();
auto op_type = op_info->Type(); auto op_type = op_info->Type();
auto unique_op_type = UniqueName(op_type); auto unique_op_type = lite::npu::UniqueName(op_type);
LOG(INFO) << "Converting " + op_type + "..."; LOG(INFO) << "Converting " + op_type + "...";
std::shared_ptr<ge::op::BatchNorm> batch_norm_node = std::shared_ptr<ge::op::BatchNorm> batch_norm_node =
...@@ -43,27 +37,27 @@ node_map_type BatchNormConverter( ...@@ -43,27 +37,27 @@ node_map_type BatchNormConverter(
auto scale_var_name = op_info->Input("Scale").front(); auto scale_var_name = op_info->Input("Scale").front();
lite::Tensor* scale = scope->FindVar(scale_var_name)->GetMutable<Tensor>(); lite::Tensor* scale = scope->FindVar(scale_var_name)->GetMutable<Tensor>();
auto npu_scale = std::make_shared<ge::op::Const>(scale_var_name); auto npu_scale = std::make_shared<ge::op::Const>(scale_var_name);
npu_scale->set_attr_value(CvtFromLiteTensor(scale)); npu_scale->set_attr_value(lite::npu::CvtFromLiteTensor(scale));
OpList::Global().add(npu_scale); lite::npu::OpList::Global().add(npu_scale);
auto bias_var_name = op_info->Input("Bias").front(); auto bias_var_name = op_info->Input("Bias").front();
lite::Tensor* bias = scope->FindVar(bias_var_name)->GetMutable<Tensor>(); lite::Tensor* bias = scope->FindVar(bias_var_name)->GetMutable<Tensor>();
auto npu_bias = std::make_shared<ge::op::Const>(bias_var_name); auto npu_bias = std::make_shared<ge::op::Const>(bias_var_name);
npu_bias->set_attr_value(CvtFromLiteTensor(bias)); npu_bias->set_attr_value(lite::npu::CvtFromLiteTensor(bias));
OpList::Global().add(npu_bias); lite::npu::OpList::Global().add(npu_bias);
auto mean_var_name = op_info->Input("Mean").front(); auto mean_var_name = op_info->Input("Mean").front();
lite::Tensor* mean = scope->FindVar(mean_var_name)->GetMutable<Tensor>(); lite::Tensor* mean = scope->FindVar(mean_var_name)->GetMutable<Tensor>();
auto npu_mean = std::make_shared<ge::op::Const>(mean_var_name); auto npu_mean = std::make_shared<ge::op::Const>(mean_var_name);
npu_mean->set_attr_value(CvtFromLiteTensor(mean)); npu_mean->set_attr_value(lite::npu::CvtFromLiteTensor(mean));
OpList::Global().add(npu_mean); lite::npu::OpList::Global().add(npu_mean);
auto variance_var_name = op_info->Input("Variance").front(); auto variance_var_name = op_info->Input("Variance").front();
lite::Tensor* variance = lite::Tensor* variance =
scope->FindVar(variance_var_name)->GetMutable<Tensor>(); scope->FindVar(variance_var_name)->GetMutable<Tensor>();
auto npu_variance = std::make_shared<ge::op::Const>(variance_var_name); auto npu_variance = std::make_shared<ge::op::Const>(variance_var_name);
npu_variance->set_attr_value(CvtFromLiteTensor(variance)); npu_variance->set_attr_value(lite::npu::CvtFromLiteTensor(variance));
OpList::Global().add(npu_variance); lite::npu::OpList::Global().add(npu_variance);
float npu_momentum = op_info->GetAttr<float>("momentum"); float npu_momentum = op_info->GetAttr<float>("momentum");
float npu_epsilon = op_info->GetAttr<float>("epsilon"); float npu_epsilon = op_info->GetAttr<float>("epsilon");
...@@ -80,8 +74,8 @@ node_map_type BatchNormConverter( ...@@ -80,8 +74,8 @@ node_map_type BatchNormConverter(
batch_norm_node->set_attr_mode(npu_mode); batch_norm_node->set_attr_mode(npu_mode);
batch_norm_node->set_attr_use_global_stats(npu_use_global_stats); batch_norm_node->set_attr_use_global_stats(npu_use_global_stats);
OpList::Global().add(inputs_map.at(x_var_name)); lite::npu::OpList::Global().add(inputs_map.at(x_var_name));
OpList::Global().add(batch_norm_node); lite::npu::OpList::Global().add(batch_norm_node);
node_map_type outputs_map; node_map_type outputs_map;
outputs_map[op_info->Output("Y").front()] = batch_norm_node; outputs_map[op_info->Output("Y").front()] = batch_norm_node;
......
...@@ -12,14 +12,8 @@ ...@@ -12,14 +12,8 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "ai_ddk_lib/include/graph/buffer.h" #include "lite/backends/npu/builder.h"
#include "ai_ddk_lib/include/graph/graph.h"
#include "ai_ddk_lib/include/graph/model.h"
#include "ai_ddk_lib/include/graph/op/all_ops.h"
#include "ai_ddk_lib/include/graph/operator.h"
#include "ai_ddk_lib/include/graph/operator_reg.h"
#include "lite/kernels/npu/bridges/registry.h" #include "lite/kernels/npu/bridges/registry.h"
#include "lite/kernels/npu/bridges/utils.h"
namespace paddle { namespace paddle {
namespace lite { namespace lite {
...@@ -32,7 +26,7 @@ node_map_type ConcatConverter(const std::shared_ptr<lite::OpLite> concat_op, ...@@ -32,7 +26,7 @@ node_map_type ConcatConverter(const std::shared_ptr<lite::OpLite> concat_op,
lite::Scope* scope = concat_op->scope(); lite::Scope* scope = concat_op->scope();
const lite::OpInfo* op_info = concat_op->op_info(); const lite::OpInfo* op_info = concat_op->op_info();
auto op_type = op_info->Type(); auto op_type = op_info->Type();
auto unique_op_type = UniqueName(op_type); auto unique_op_type = lite::npu::UniqueName(op_type);
LOG(INFO) << "converting " << op_type << " ... "; LOG(INFO) << "converting " << op_type << " ... ";
auto x_var_names = op_info->Input("X"); auto x_var_names = op_info->Input("X");
...@@ -48,17 +42,17 @@ node_map_type ConcatConverter(const std::shared_ptr<lite::OpLite> concat_op, ...@@ -48,17 +42,17 @@ node_map_type ConcatConverter(const std::shared_ptr<lite::OpLite> concat_op,
for (auto x_var_name : x_var_names) { for (auto x_var_name : x_var_names) {
if (inputs_map.find(x_var_name) != inputs_map.end()) { if (inputs_map.find(x_var_name) != inputs_map.end()) {
output_node->set_dynamic_input_x(index + 1, *inputs_map.at(x_var_name)); output_node->set_dynamic_input_x(index + 1, *inputs_map.at(x_var_name));
OpList::Global().add(inputs_map.at(x_var_name)); lite::npu::OpList::Global().add(inputs_map.at(x_var_name));
} else { } else {
auto consty = std::make_shared<ge::op::Const>(x_var_name); auto consty = std::make_shared<ge::op::Const>(x_var_name);
auto* x = scope->FindVar(x_var_name)->GetMutable<Tensor>(); auto* x = scope->FindVar(x_var_name)->GetMutable<Tensor>();
consty->set_attr_value(CvtFromLiteTensor(x)); consty->set_attr_value(lite::npu::CvtFromLiteTensor(x));
output_node->set_dynamic_input_x(index + 1, *consty); output_node->set_dynamic_input_x(index + 1, *consty);
OpList::Global().add(consty); lite::npu::OpList::Global().add(consty);
} }
index++; index++;
} }
OpList::Global().add(output_node); lite::npu::OpList::Global().add(output_node);
node_map_type outputs_map; node_map_type outputs_map;
outputs_map[op_info->Output("Out").front()] = output_node; outputs_map[op_info->Output("Out").front()] = output_node;
......
...@@ -12,14 +12,8 @@ ...@@ -12,14 +12,8 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "ai_ddk_lib/include/graph/buffer.h" #include "lite/backends/npu/builder.h"
#include "ai_ddk_lib/include/graph/graph.h"
#include "ai_ddk_lib/include/graph/model.h"
#include "ai_ddk_lib/include/graph/op/all_ops.h"
#include "ai_ddk_lib/include/graph/operator.h"
#include "ai_ddk_lib/include/graph/operator_reg.h"
#include "lite/kernels/npu/bridges/registry.h" #include "lite/kernels/npu/bridges/registry.h"
#include "lite/kernels/npu/bridges/utils.h"
namespace paddle { namespace paddle {
namespace lite { namespace lite {
...@@ -32,7 +26,7 @@ node_map_type ConvConverter(const std::shared_ptr<lite::OpLite> conv_op, ...@@ -32,7 +26,7 @@ node_map_type ConvConverter(const std::shared_ptr<lite::OpLite> conv_op,
auto scope = conv_op->scope(); auto scope = conv_op->scope();
auto op_info = conv_op->op_info(); auto op_info = conv_op->op_info();
auto op_type = op_info->Type(); auto op_type = op_info->Type();
auto unique_op_type = UniqueName(op_type); auto unique_op_type = lite::npu::UniqueName(op_type);
LOG(INFO) << "Converting " << op_type << "... "; LOG(INFO) << "Converting " << op_type << "... ";
// get input, filter and op attributes // get input, filter and op attributes
...@@ -78,13 +72,13 @@ node_map_type ConvConverter(const std::shared_ptr<lite::OpLite> conv_op, ...@@ -78,13 +72,13 @@ node_map_type ConvConverter(const std::shared_ptr<lite::OpLite> conv_op,
// check input // check input
CHECK(inputs_map.count(input_var_name)); CHECK(inputs_map.count(input_var_name));
OpList::Global().add(inputs_map.at(input_var_name)); lite::npu::OpList::Global().add(inputs_map.at(input_var_name));
// create filter node // create filter node
CHECK(!inputs_map.count(filter_var_name)); CHECK(!inputs_map.count(filter_var_name));
auto filter_const_node = std::make_shared<ge::op::Const>(filter_var_name); auto filter_const_node = std::make_shared<ge::op::Const>(filter_var_name);
filter_const_node->set_attr_value(CvtFromLiteTensor(filter)); filter_const_node->set_attr_value(lite::npu::CvtFromLiteTensor(filter));
OpList::Global().add(filter_const_node); lite::npu::OpList::Global().add(filter_const_node);
// create bias node if has bias // create bias node if has bias
// supports the bias nodes with the following dimensions // supports the bias nodes with the following dimensions
...@@ -93,7 +87,7 @@ node_map_type ConvConverter(const std::shared_ptr<lite::OpLite> conv_op, ...@@ -93,7 +87,7 @@ node_map_type ConvConverter(const std::shared_ptr<lite::OpLite> conv_op,
// 2: {n, oc, oh, ow} // 2: {n, oc, oh, ow}
std::shared_ptr<ge::Operator> bias_node = nullptr; std::shared_ptr<ge::Operator> bias_node = nullptr;
bool is_channel_bias = false; bool is_channel_bias = false;
if (HasInputArg(op_info, scope, "Bias")) { if (lite::npu::HasInputArg(op_info, scope, "Bias")) {
auto bias_var_name = op_info->Input("Bias").front(); auto bias_var_name = op_info->Input("Bias").front();
auto* bias = scope->FindVar(bias_var_name)->GetMutable<lite::Tensor>(); auto* bias = scope->FindVar(bias_var_name)->GetMutable<lite::Tensor>();
auto bias_dims = bias->dims(); auto bias_dims = bias->dims();
...@@ -121,10 +115,11 @@ node_map_type ConvConverter(const std::shared_ptr<lite::OpLite> conv_op, ...@@ -121,10 +115,11 @@ node_map_type ConvConverter(const std::shared_ptr<lite::OpLite> conv_op,
} else { } else {
// bias node with const data // bias node with const data
auto bias_const_node = std::make_shared<ge::op::Const>(bias_var_name); auto bias_const_node = std::make_shared<ge::op::Const>(bias_var_name);
bias_const_node->set_attr_value(CvtFromLiteTensor(bias, bias_shape)); bias_const_node->set_attr_value(
lite::npu::CvtFromLiteTensor(bias, bias_shape));
bias_node = bias_const_node; bias_node = bias_const_node;
} }
OpList::Global().add(bias_node); lite::npu::OpList::Global().add(bias_node);
} }
// create conv node and set input, filter, bias nodes and attributes // create conv node and set input, filter, bias nodes and attributes
...@@ -147,7 +142,7 @@ node_map_type ConvConverter(const std::shared_ptr<lite::OpLite> conv_op, ...@@ -147,7 +142,7 @@ node_map_type ConvConverter(const std::shared_ptr<lite::OpLite> conv_op,
ge::AttrValue::LIST_INT({strides[0], strides[1]})); ge::AttrValue::LIST_INT({strides[0], strides[1]}));
depthwise_conv_node->set_attr_kernel( depthwise_conv_node->set_attr_kernel(
ge::AttrValue::LIST_INT({filter_dims[2], filter_dims[3]})); ge::AttrValue::LIST_INT({filter_dims[2], filter_dims[3]}));
OpList::Global().add(depthwise_conv_node); lite::npu::OpList::Global().add(depthwise_conv_node);
conv_node = depthwise_conv_node; conv_node = depthwise_conv_node;
// ConvolutionDepthwise Op doesn't support bias, so append Add node to // ConvolutionDepthwise Op doesn't support bias, so append Add node to
// support bias // support bias
...@@ -155,7 +150,7 @@ node_map_type ConvConverter(const std::shared_ptr<lite::OpLite> conv_op, ...@@ -155,7 +150,7 @@ node_map_type ConvConverter(const std::shared_ptr<lite::OpLite> conv_op,
auto add_node = std::make_shared<ge::op::Add>(unique_op_type + "/add"); auto add_node = std::make_shared<ge::op::Add>(unique_op_type + "/add");
add_node->set_input_x1(*depthwise_conv_node); add_node->set_input_x1(*depthwise_conv_node);
add_node->set_input_x2(*bias_node); add_node->set_input_x2(*bias_node);
OpList::Global().add(add_node); lite::npu::OpList::Global().add(add_node);
conv_node = add_node; conv_node = add_node;
} }
} else { } else {
...@@ -174,7 +169,7 @@ node_map_type ConvConverter(const std::shared_ptr<lite::OpLite> conv_op, ...@@ -174,7 +169,7 @@ node_map_type ConvConverter(const std::shared_ptr<lite::OpLite> conv_op,
ge::AttrValue::LIST_INT({strides[0], strides[1]})); ge::AttrValue::LIST_INT({strides[0], strides[1]}));
common_conv_node->set_attr_kernel( common_conv_node->set_attr_kernel(
ge::AttrValue::LIST_INT({filter_dims[2], filter_dims[3]})); ge::AttrValue::LIST_INT({filter_dims[2], filter_dims[3]}));
OpList::Global().add(common_conv_node); lite::npu::OpList::Global().add(common_conv_node);
conv_node = common_conv_node; conv_node = common_conv_node;
// Convolution Op only support bias with dimension {1, oc, 1, 1}, // Convolution Op only support bias with dimension {1, oc, 1, 1},
// so append Add node if dimension is {1, oc, oh, ow} or (n, oc, oh, ow) // so append Add node if dimension is {1, oc, oh, ow} or (n, oc, oh, ow)
...@@ -185,7 +180,7 @@ node_map_type ConvConverter(const std::shared_ptr<lite::OpLite> conv_op, ...@@ -185,7 +180,7 @@ node_map_type ConvConverter(const std::shared_ptr<lite::OpLite> conv_op,
auto add_node = std::make_shared<ge::op::Add>(unique_op_type + "/add"); auto add_node = std::make_shared<ge::op::Add>(unique_op_type + "/add");
add_node->set_input_x1(*common_conv_node); add_node->set_input_x1(*common_conv_node);
add_node->set_input_x2(*bias_node); add_node->set_input_x2(*bias_node);
OpList::Global().add(add_node); lite::npu::OpList::Global().add(add_node);
conv_node = add_node; conv_node = add_node;
} }
} }
...@@ -199,7 +194,7 @@ node_map_type ConvConverter(const std::shared_ptr<lite::OpLite> conv_op, ...@@ -199,7 +194,7 @@ node_map_type ConvConverter(const std::shared_ptr<lite::OpLite> conv_op,
std::make_shared<ge::op::Activation>(unique_op_type + "/relu"); std::make_shared<ge::op::Activation>(unique_op_type + "/relu");
relu_node->set_input_x(*conv_node); relu_node->set_input_x(*conv_node);
relu_node->set_attr_mode(1); relu_node->set_attr_mode(1);
OpList::Global().add(relu_node); lite::npu::OpList::Global().add(relu_node);
outputs_map[op_info->Output("Output").front()] = relu_node; outputs_map[op_info->Output("Output").front()] = relu_node;
} else { } else {
outputs_map[op_info->Output("Output").front()] = conv_node; outputs_map[op_info->Output("Output").front()] = conv_node;
......
...@@ -12,14 +12,8 @@ ...@@ -12,14 +12,8 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "ai_ddk_lib/include/graph/buffer.h" #include "lite/backends/npu/builder.h"
#include "ai_ddk_lib/include/graph/graph.h"
#include "ai_ddk_lib/include/graph/model.h"
#include "ai_ddk_lib/include/graph/op/all_ops.h"
#include "ai_ddk_lib/include/graph/operator.h"
#include "ai_ddk_lib/include/graph/operator_reg.h"
#include "lite/kernels/npu/bridges/registry.h" #include "lite/kernels/npu/bridges/registry.h"
#include "lite/kernels/npu/bridges/utils.h"
namespace paddle { namespace paddle {
namespace lite { namespace lite {
...@@ -33,7 +27,7 @@ node_map_type ConvTransposeConverter( ...@@ -33,7 +27,7 @@ node_map_type ConvTransposeConverter(
auto scope = conv_transpose_op->scope(); auto scope = conv_transpose_op->scope();
auto op_info = conv_transpose_op->op_info(); auto op_info = conv_transpose_op->op_info();
auto op_type = op_info->Type(); auto op_type = op_info->Type();
auto unique_op_type = UniqueName(op_type); auto unique_op_type = lite::npu::UniqueName(op_type);
LOG(INFO) << "Converting " << op_type << "... "; LOG(INFO) << "Converting " << op_type << "... ";
// get input, output and op attributes // get input, output and op attributes
...@@ -70,21 +64,22 @@ node_map_type ConvTransposeConverter( ...@@ -70,21 +64,22 @@ node_map_type ConvTransposeConverter(
} }
auto input_sizes_const_node = auto input_sizes_const_node =
std::make_shared<ge::op::Const>(unique_op_type + "/input_size"); std::make_shared<ge::op::Const>(unique_op_type + "/input_size");
input_sizes_const_node->set_attr_value(CreateTensorAndFillData(output_shape)); input_sizes_const_node->set_attr_value(
lite::npu::CreateTensorAndFillData(output_shape));
conv_transpose_node->set_input_input_sizes(*input_sizes_const_node); conv_transpose_node->set_input_input_sizes(*input_sizes_const_node);
OpList::Global().add(input_sizes_const_node); lite::npu::OpList::Global().add(input_sizes_const_node);
// create filter node // create filter node
CHECK(!inputs_map.count(filter_var_name)); CHECK(!inputs_map.count(filter_var_name));
auto filter_const_node = std::make_shared<ge::op::Const>(filter_var_name); auto filter_const_node = std::make_shared<ge::op::Const>(filter_var_name);
filter_const_node->set_attr_value(CvtFromLiteTensor(filter)); filter_const_node->set_attr_value(lite::npu::CvtFromLiteTensor(filter));
conv_transpose_node->set_input_filter(*filter_const_node); conv_transpose_node->set_input_filter(*filter_const_node);
OpList::Global().add(filter_const_node); lite::npu::OpList::Global().add(filter_const_node);
// set input node // set input node
CHECK(inputs_map.count(input_var_name)); CHECK(inputs_map.count(input_var_name));
conv_transpose_node->set_input_x(*inputs_map.at(input_var_name)); conv_transpose_node->set_input_x(*inputs_map.at(input_var_name));
OpList::Global().add(inputs_map.at(input_var_name)); lite::npu::OpList::Global().add(inputs_map.at(input_var_name));
// set attributes // set attributes
conv_transpose_node->set_attr_mode(1); conv_transpose_node->set_attr_mode(1);
...@@ -99,11 +94,11 @@ node_map_type ConvTransposeConverter( ...@@ -99,11 +94,11 @@ node_map_type ConvTransposeConverter(
ge::AttrValue::LIST_INT({strides[0], strides[1]})); ge::AttrValue::LIST_INT({strides[0], strides[1]}));
conv_transpose_node->set_attr_kernel( conv_transpose_node->set_attr_kernel(
ge::AttrValue::LIST_INT({filter_shape[2], filter_shape[3]})); ge::AttrValue::LIST_INT({filter_shape[2], filter_shape[3]}));
OpList::Global().add(conv_transpose_node); lite::npu::OpList::Global().add(conv_transpose_node);
// append add node to add bias if has bias // append add node to add bias if has bias
std::shared_ptr<ge::Operator> output_node = conv_transpose_node; std::shared_ptr<ge::Operator> output_node = conv_transpose_node;
if (HasInputArg(op_info, scope, "Bias")) { if (lite::npu::HasInputArg(op_info, scope, "Bias")) {
// create bias node // create bias node
auto bias_var_name = op_info->Input("Bias").front(); auto bias_var_name = op_info->Input("Bias").front();
CHECK(!inputs_map.count(bias_var_name)); CHECK(!inputs_map.count(bias_var_name));
...@@ -112,13 +107,13 @@ node_map_type ConvTransposeConverter( ...@@ -112,13 +107,13 @@ node_map_type ConvTransposeConverter(
CHECK_EQ(channel_size, filter_shape[1] * groups); CHECK_EQ(channel_size, filter_shape[1] * groups);
auto bias_const_node = std::make_shared<ge::op::Const>(bias_var_name); auto bias_const_node = std::make_shared<ge::op::Const>(bias_var_name);
bias_const_node->set_attr_value( bias_const_node->set_attr_value(
CvtFromLiteTensor(bias, {1, channel_size, 1, 1})); lite::npu::CvtFromLiteTensor(bias, {1, channel_size, 1, 1}));
OpList::Global().add(bias_const_node); lite::npu::OpList::Global().add(bias_const_node);
// append add node to add bias node // append add node to add bias node
auto add_node = std::make_shared<ge::op::Add>(unique_op_type + "/add"); auto add_node = std::make_shared<ge::op::Add>(unique_op_type + "/add");
add_node->set_input_x1(*conv_transpose_node); add_node->set_input_x1(*conv_transpose_node);
add_node->set_input_x2(*bias_const_node); add_node->set_input_x2(*bias_const_node);
OpList::Global().add(add_node); lite::npu::OpList::Global().add(add_node);
output_node = add_node; output_node = add_node;
} }
...@@ -129,7 +124,7 @@ node_map_type ConvTransposeConverter( ...@@ -129,7 +124,7 @@ node_map_type ConvTransposeConverter(
std::make_shared<ge::op::Activation>(unique_op_type + "/relu"); std::make_shared<ge::op::Activation>(unique_op_type + "/relu");
relu_node->set_input_x(*output_node); relu_node->set_input_x(*output_node);
relu_node->set_attr_mode(1); relu_node->set_attr_mode(1);
OpList::Global().add(relu_node); lite::npu::OpList::Global().add(relu_node);
outputs_map[op_info->Output("Output").front()] = relu_node; outputs_map[op_info->Output("Output").front()] = relu_node;
} else { } else {
outputs_map[op_info->Output("Output").front()] = output_node; outputs_map[op_info->Output("Output").front()] = output_node;
......
...@@ -12,14 +12,8 @@ ...@@ -12,14 +12,8 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "ai_ddk_lib/include/graph/buffer.h" #include "lite/backends/npu/builder.h"
#include "ai_ddk_lib/include/graph/graph.h"
#include "ai_ddk_lib/include/graph/model.h"
#include "ai_ddk_lib/include/graph/op/all_ops.h"
#include "ai_ddk_lib/include/graph/operator.h"
#include "ai_ddk_lib/include/graph/operator_reg.h"
#include "lite/kernels/npu/bridges/registry.h" #include "lite/kernels/npu/bridges/registry.h"
#include "lite/kernels/npu/bridges/utils.h"
namespace paddle { namespace paddle {
namespace lite { namespace lite {
...@@ -33,7 +27,7 @@ node_map_type ElementwiseConverter( ...@@ -33,7 +27,7 @@ node_map_type ElementwiseConverter(
auto scope = elementwise_op->scope(); auto scope = elementwise_op->scope();
auto op_info = elementwise_op->op_info(); auto op_info = elementwise_op->op_info();
auto op_type = op_info->Type(); auto op_type = op_info->Type();
auto unique_op_type = UniqueName(op_type); auto unique_op_type = lite::npu::UniqueName(op_type);
LOG(INFO) << "converting elementwise..."; LOG(INFO) << "converting elementwise...";
std::shared_ptr<ge::op::Eltwise> elementwise_node = std::shared_ptr<ge::op::Eltwise> elementwise_node =
...@@ -47,20 +41,20 @@ node_map_type ElementwiseConverter( ...@@ -47,20 +41,20 @@ node_map_type ElementwiseConverter(
CHECK(inputs_map.find(x_var_name) != inputs_map.end()); CHECK(inputs_map.find(x_var_name) != inputs_map.end());
elementwise_node->set_input_x1(*inputs_map.at(x_var_name)); elementwise_node->set_input_x1(*inputs_map.at(x_var_name));
OpList::Global().add(inputs_map.at(x_var_name)); lite::npu::OpList::Global().add(inputs_map.at(x_var_name));
if (inputs_map.find(y_var_name) != inputs_map.end()) { if (inputs_map.find(y_var_name) != inputs_map.end()) {
elementwise_node->set_input_x2(*inputs_map.at(y_var_name)); elementwise_node->set_input_x2(*inputs_map.at(y_var_name));
OpList::Global().add(inputs_map.at(y_var_name)); lite::npu::OpList::Global().add(inputs_map.at(y_var_name));
} else { } else {
auto consty = std::make_shared<ge::op::Const>(y_var_name); auto consty = std::make_shared<ge::op::Const>(y_var_name);
auto* y = scope->FindVar(y_var_name)->GetMutable<Tensor>(); auto* y = scope->FindVar(y_var_name)->GetMutable<Tensor>();
consty->set_attr_value(CvtFromLiteTensor(y)); consty->set_attr_value(lite::npu::CvtFromLiteTensor(y));
elementwise_node->set_input_x2(*consty); elementwise_node->set_input_x2(*consty);
OpList::Global().add(consty); lite::npu::OpList::Global().add(consty);
} }
OpList::Global().add(elementwise_node); lite::npu::OpList::Global().add(elementwise_node);
// paddlelite has sum only // paddlelite has sum only
elementwise_node->set_attr_mode(1); elementwise_node->set_attr_mode(1);
......
...@@ -12,14 +12,8 @@ ...@@ -12,14 +12,8 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "ai_ddk_lib/include/graph/buffer.h" #include "lite/backends/npu/builder.h"
#include "ai_ddk_lib/include/graph/graph.h"
#include "ai_ddk_lib/include/graph/model.h"
#include "ai_ddk_lib/include/graph/op/all_ops.h"
#include "ai_ddk_lib/include/graph/operator.h"
#include "ai_ddk_lib/include/graph/operator_reg.h"
#include "lite/kernels/npu/bridges/registry.h" #include "lite/kernels/npu/bridges/registry.h"
#include "lite/kernels/npu/bridges/utils.h"
namespace paddle { namespace paddle {
namespace lite { namespace lite {
...@@ -29,19 +23,22 @@ namespace bridges { ...@@ -29,19 +23,22 @@ namespace bridges {
node_map_type FCConverter(const std::shared_ptr<lite::OpLite> fc_op, node_map_type FCConverter(const std::shared_ptr<lite::OpLite> fc_op,
const node_map_type& inputs_map) { const node_map_type& inputs_map) {
LOG(INFO) << "Converting fc..."; auto scope = fc_op->scope();
lite::Scope* scope = fc_op->scope(); auto op_info = fc_op->op_info();
const lite::OpInfo* op_info = fc_op->op_info(); auto op_type = op_info->Type();
auto output_node = std::make_shared<ge::op::MatMul>(UniqueName("fc")); auto unique_op_type = lite::npu::UniqueName(op_type);
LOG(INFO) << "Converting " + op_type + "...";
auto fc_node = std::make_shared<ge::op::FullConnection>(unique_op_type);
auto x_var_name = op_info->Input("Input").front(); auto x_var_name = op_info->Input("Input").front();
auto w_var_name = op_info->Input("W").front(); auto w_var_name = op_info->Input("W").front();
int in_num_col_dims = op_info->GetAttr<int>("in_num_col_dims"); int in_num_col_dims = op_info->GetAttr<int>("in_num_col_dims");
auto* xtensor = scope->FindVar(x_var_name)->GetMutable<lite::Tensor>(); auto x = scope->FindVar(x_var_name)->GetMutable<lite::Tensor>();
auto* wtensor = scope->FindVar(w_var_name)->GetMutable<lite::Tensor>(); auto w = scope->FindVar(w_var_name)->GetMutable<lite::Tensor>();
auto x_dims = xtensor->dims(); auto x_dims = x->dims();
auto w_dims = wtensor->dims(); auto w_dims = w->dims();
CHECK_GE(x_dims.size(), 2UL); CHECK_GE(x_dims.size(), 2UL);
CHECK_EQ(w_dims.size(), 2UL); CHECK_EQ(w_dims.size(), 2UL);
...@@ -49,65 +46,69 @@ node_map_type FCConverter(const std::shared_ptr<lite::OpLite> fc_op, ...@@ -49,65 +46,69 @@ node_map_type FCConverter(const std::shared_ptr<lite::OpLite> fc_op,
int m = x_dims.Slice(0, in_num_col_dims).production(); int m = x_dims.Slice(0, in_num_col_dims).production();
int k = x_dims.Slice(in_num_col_dims, x_dims.size()).production(); int k = x_dims.Slice(in_num_col_dims, x_dims.size()).production();
int n = w_dims[1]; int n = w_dims[1];
CHECK_EQ(k * n, w_dims.production());
VLOG(3) << "x dims: " << x_dims << " w dims: " << w_dims << " m: " << m
<< " k: " << k << " n: " << n;
CHECK(inputs_map.count(x_var_name)); CHECK(inputs_map.count(x_var_name));
CHECK(!inputs_map.count(w_var_name)); CHECK(!inputs_map.count(w_var_name));
LOG(INFO) << "m:" << m << ",n:" << n << ",k:" << k; // reshape x to (m, k, 1, 1)
LOG(INFO) << "x_var_name:" << x_var_name auto reshaped_x_node =
<< ", is data: " << inputs_map.count(x_var_name); std::make_shared<ge::op::Reshape>(x_var_name + "_reshape");
LOG(INFO) << "w_var_name:" << w_var_name reshaped_x_node->set_input_tensor(*inputs_map.at(x_var_name));
<< ", is data: " << inputs_map.count(w_var_name); reshaped_x_node->set_attr_shape({m, k, 1, 1});
reshaped_x_node->set_attr_axis(0);
auto xsrc = inputs_map.at(x_var_name); fc_node->set_input_x(*reshaped_x_node);
auto reshapex = std::make_shared<ge::op::Reshape>(x_var_name + "_reshape"); lite::npu::OpList::Global().add(inputs_map.at(x_var_name));
reshapex->set_input_tensor(*xsrc); lite::npu::OpList::Global().add(reshaped_x_node);
reshapex->set_attr_shape({m, k});
reshapex->set_attr_axis(0); // create w const node, set its shape to (k, n, 1, 1) and fill with
OpList::Global().add(xsrc); // the transposed w tensor
OpList::Global().add(reshapex); auto w_const_node = std::make_shared<ge::op::Const>(w_var_name);
output_node->set_input_x(*reshapex); ge::TensorDesc w_const_desc(
ge::Shape({n, k, 1, 1}), ge::FORMAT_NCHW, ge::DT_FLOAT);
auto wconst = std::make_shared<ge::op::Const>(w_var_name); ge::TensorPtr w_const_tensor = std::make_shared<ge::Tensor>();
ge::TensorDesc wdesc(ge::Shape({k, n}), ge::FORMAT_NCHW, ge::DT_FLOAT); w_const_tensor->SetTensorDesc(w_const_desc);
auto size = wdesc.GetShape().GetShapeSize(); auto w_data = w->mutable_data<float>();
CHECK_EQ(size, w_dims.production()); std::vector<float> transposed_w_data(w_dims.production());
ge::TensorPtr ptensor = std::make_shared<ge::Tensor>(); for (int i = 0; i < k; i++) {
ptensor->SetTensorDesc(wdesc); for (int j = 0; j < n; j++) {
auto* pdata = reinterpret_cast<uint8_t*>(wtensor->mutable_data<float>()); transposed_w_data[j * k + i] = w_data[i * n + j];
ptensor->SetData(pdata, size * sizeof(float)); }
wconst->set_attr_value(ptensor); }
OpList::Global().add(wconst); w_const_tensor->SetData(reinterpret_cast<uint8_t*>(transposed_w_data.data()),
output_node->set_input_w(*wconst); transposed_w_data.size() * sizeof(float));
w_const_node->set_attr_value(w_const_tensor);
if (HasInputArg(op_info, scope, "Bias")) { fc_node->set_input_w(*w_const_node);
auto b_var_name = op_info->Input("Bias").front(); lite::npu::OpList::Global().add(w_const_node);
auto* btensor = scope->FindVar(b_var_name)->GetMutable<lite::Tensor>();
// add bias node if bias tensor exists
LOG(INFO) << "b_var_name:" << b_var_name if (lite::npu::HasInputArg(op_info, scope, "Bias")) {
<< ", is data: " << inputs_map.count(b_var_name); auto bias_var_name = op_info->Input("Bias").front();
CHECK(!inputs_map.count(b_var_name)); auto bias = scope->FindVar(bias_var_name)->GetMutable<lite::Tensor>();
CHECK_EQ(btensor->numel(), n); auto bias_dims = bias->dims();
CHECK(!inputs_map.count(bias_var_name));
auto bconst = std::make_shared<ge::op::Const>(b_var_name); CHECK_EQ(bias_dims.production(), n);
ge::TensorDesc bdesc(
ge::Shape({1, n, 1, 1}), ge::FORMAT_NCHW, ge::DT_FLOAT); auto bias_const_node = std::make_shared<ge::op::Const>(bias_var_name);
auto size = bdesc.GetShape().GetShapeSize(); bias_const_node->set_attr_value(
CHECK_EQ(size, n); lite::npu::CvtFromLiteTensor(bias, {1, n, 1, 1}));
ge::TensorPtr ptensor = std::make_shared<ge::Tensor>(); fc_node->set_input_b(*bias_const_node);
ptensor->SetTensorDesc(bdesc); lite::npu::OpList::Global().add(bias_const_node);
auto* pdata = reinterpret_cast<uint8_t*>(btensor->mutable_data<float>());
ptensor->SetData(pdata, size * sizeof(float));
bconst->set_attr_value(ptensor);
OpList::Global().add(bconst);
output_node->set_input_bias(*bconst);
output_node->set_attr_has_bias(ge::AttrValue::BOOL{true});
} }
lite::npu::OpList::Global().add(fc_node);
OpList::Global().add(output_node); // reshape output of fc_node from (m, n, 1, 1) to (m, n)
auto reshaped_fc_node =
std::make_shared<ge::op::Reshape>(unique_op_type + "_reshape");
reshaped_fc_node->set_input_tensor(*fc_node);
reshaped_fc_node->set_attr_shape({m, n});
reshaped_fc_node->set_attr_axis(0);
lite::npu::OpList::Global().add(reshaped_fc_node);
node_map_type outputs_map; node_map_type outputs_map;
outputs_map[op_info->Output("Out").front()] = output_node; outputs_map[op_info->Output("Out").front()] = reshaped_fc_node;
return outputs_map; return outputs_map;
} }
......
...@@ -126,6 +126,7 @@ TEST(NPUBridges, fc) { ...@@ -126,6 +126,7 @@ TEST(NPUBridges, fc) {
test_fc({1, 8, 8, 1}, {8, 4}, 2, use_bias); test_fc({1, 8, 8, 1}, {8, 4}, 2, use_bias);
test_fc({1, 5, 5, 1}, {5, 7}, 2, use_bias); test_fc({1, 5, 5, 1}, {5, 7}, 2, use_bias);
test_fc({1, 4, 1, 1}, {4, 8}, 1, use_bias); test_fc({1, 4, 1, 1}, {4, 8}, 1, use_bias);
test_fc({1, 1024, 1, 1}, {1024, 1000}, 1, use_bias);
} }
} }
......
...@@ -12,14 +12,8 @@ ...@@ -12,14 +12,8 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "ai_ddk_lib/include/graph/buffer.h" #include "lite/backends/npu/builder.h"
#include "ai_ddk_lib/include/graph/graph.h"
#include "ai_ddk_lib/include/graph/model.h"
#include "ai_ddk_lib/include/graph/op/all_ops.h"
#include "ai_ddk_lib/include/graph/operator.h"
#include "ai_ddk_lib/include/graph/operator_reg.h"
#include "lite/kernels/npu/bridges/registry.h" #include "lite/kernels/npu/bridges/registry.h"
#include "lite/kernels/npu/bridges/utils.h"
namespace paddle { namespace paddle {
namespace lite { namespace lite {
...@@ -33,13 +27,13 @@ node_map_type InterpolateConverter( ...@@ -33,13 +27,13 @@ node_map_type InterpolateConverter(
auto scope = interpolate_op->scope(); auto scope = interpolate_op->scope();
auto op_info = interpolate_op->op_info(); auto op_info = interpolate_op->op_info();
auto op_type = op_info->Type(); auto op_type = op_info->Type();
auto unique_op_type = UniqueName(op_type); auto unique_op_type = lite::npu::UniqueName(op_type);
LOG(INFO) << "Converting " + op_type + "..."; LOG(INFO) << "Converting " + op_type + "...";
// get input, output and attributes from lite op // get input, output and attributes from lite op
auto x_var_name = op_info->Input("X").front(); auto x_var_name = op_info->Input("X").front();
CHECK(inputs_map.count(x_var_name)); CHECK(inputs_map.count(x_var_name));
OpList::Global().add(inputs_map.at(x_var_name)); lite::npu::OpList::Global().add(inputs_map.at(x_var_name));
auto x = scope->FindVar(x_var_name)->GetMutable<lite::Tensor>(); auto x = scope->FindVar(x_var_name)->GetMutable<lite::Tensor>();
auto x_dims = x->dims(); auto x_dims = x->dims();
...@@ -64,7 +58,7 @@ node_map_type InterpolateConverter( ...@@ -64,7 +58,7 @@ node_map_type InterpolateConverter(
// update out_h and out_w if has OutSize // update out_h and out_w if has OutSize
bool inputs_map_has_w = false; bool inputs_map_has_w = false;
if (HasInputArg(op_info, scope, "OutSize")) { if (lite::npu::HasInputArg(op_info, scope, "OutSize")) {
auto out_size_var_name = op_info->Input("OutSize").front(); auto out_size_var_name = op_info->Input("OutSize").front();
if (inputs_map.count(out_size_var_name)) { if (inputs_map.count(out_size_var_name)) {
inputs_map_has_w = true; inputs_map_has_w = true;
...@@ -83,12 +77,12 @@ node_map_type InterpolateConverter( ...@@ -83,12 +77,12 @@ node_map_type InterpolateConverter(
auto interp_method = op_info->GetAttr<std::string>("interp_method"); auto interp_method = op_info->GetAttr<std::string>("interp_method");
if (interp_method == "bilinear") { if (interp_method == "bilinear") {
auto interp_node = std::make_shared<ge::op::ResizeBilinear>(unique_op_type); auto interp_node = std::make_shared<ge::op::ResizeBilinear>(unique_op_type);
OpList::Global().add(interp_node); lite::npu::OpList::Global().add(interp_node);
interp_node->set_input_x(*inputs_map.at(x_var_name)); interp_node->set_input_x(*inputs_map.at(x_var_name));
if (inputs_map_has_w) { if (inputs_map_has_w) {
auto out_size_var_name = op_info->Input("OutSize").front(); auto out_size_var_name = op_info->Input("OutSize").front();
interp_node->set_input_w(*inputs_map.at(out_size_var_name)); interp_node->set_input_w(*inputs_map.at(out_size_var_name));
OpList::Global().add(inputs_map.at(out_size_var_name)); lite::npu::OpList::Global().add(inputs_map.at(out_size_var_name));
} else { } else {
const float largest_multiple = 7.0f; const float largest_multiple = 7.0f;
float multiple = static_cast<float>(x_h * x_w) / (out_h * out_w); float multiple = static_cast<float>(x_h * x_w) / (out_h * out_w);
...@@ -99,9 +93,9 @@ node_map_type InterpolateConverter( ...@@ -99,9 +93,9 @@ node_map_type InterpolateConverter(
auto w_const_node = auto w_const_node =
std::make_shared<ge::op::Const>(unique_op_type + "/w"); std::make_shared<ge::op::Const>(unique_op_type + "/w");
w_const_node->set_attr_value( w_const_node->set_attr_value(
CreateTensorAndFillData(std::vector<int>({out_h, out_w}))); lite::npu::CreateTensorAndFillData(std::vector<int>({out_h, out_w})));
interp_node->set_input_w(*w_const_node); interp_node->set_input_w(*w_const_node);
OpList::Global().add(w_const_node); lite::npu::OpList::Global().add(w_const_node);
} }
interp_node->set_attr_output_dim_mode( interp_node->set_attr_output_dim_mode(
2); // 0: zoom_factor, 1: shrink_factor, 2: height/width 2); // 0: zoom_factor, 1: shrink_factor, 2: height/width
...@@ -110,19 +104,19 @@ node_map_type InterpolateConverter( ...@@ -110,19 +104,19 @@ node_map_type InterpolateConverter(
} else if (interp_method == "nearest") { } else if (interp_method == "nearest") {
auto interp_node = auto interp_node =
std::make_shared<ge::op::ResizeNearestNeighbor>(unique_op_type); std::make_shared<ge::op::ResizeNearestNeighbor>(unique_op_type);
OpList::Global().add(interp_node); lite::npu::OpList::Global().add(interp_node);
interp_node->set_input_image(*inputs_map.at(x_var_name)); interp_node->set_input_image(*inputs_map.at(x_var_name));
if (inputs_map_has_w) { if (inputs_map_has_w) {
auto out_size_var_name = op_info->Input("OutSize").front(); auto out_size_var_name = op_info->Input("OutSize").front();
interp_node->set_input_size(*inputs_map.at(out_size_var_name)); interp_node->set_input_size(*inputs_map.at(out_size_var_name));
OpList::Global().add(inputs_map.at(out_size_var_name)); lite::npu::OpList::Global().add(inputs_map.at(out_size_var_name));
} else { } else {
auto w_const_node = auto w_const_node =
std::make_shared<ge::op::Const>(unique_op_type + "/w"); std::make_shared<ge::op::Const>(unique_op_type + "/w");
w_const_node->set_attr_value( w_const_node->set_attr_value(
CreateTensorAndFillData(std::vector<int>({out_h, out_w}))); lite::npu::CreateTensorAndFillData(std::vector<int>({out_h, out_w})));
interp_node->set_input_size(*w_const_node); interp_node->set_input_size(*w_const_node);
OpList::Global().add(w_const_node); lite::npu::OpList::Global().add(w_const_node);
} }
interp_node->set_attr_align_corners(align_corners); interp_node->set_attr_align_corners(align_corners);
outputs_map[op_info->Output("Out").front()] = interp_node; outputs_map[op_info->Output("Out").front()] = interp_node;
......
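Note on the bilinear branch above: when OutSize is not wired in, the converter emits a Const "w" node holding {out_h, out_w} and guards the down-scaling ratio with largest_multiple = 7.0f. A minimal sketch of that guard, assuming it is enforced with a CHECK right after the lines shown (the helper name is illustrative, not from the patch):

// Reject overly aggressive shrinking before handing the op to ResizeBilinear.
static bool ShrinkRatioSupported(int x_h, int x_w, int out_h, int out_w) {
  const float largest_multiple = 7.0f;
  float multiple = static_cast<float>(x_h * x_w) / (out_h * out_w);
  return multiple <= largest_multiple;
}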
...@@ -12,14 +12,8 @@ ...@@ -12,14 +12,8 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "ai_ddk_lib/include/graph/buffer.h" #include "lite/backends/npu/builder.h"
#include "ai_ddk_lib/include/graph/graph.h"
#include "ai_ddk_lib/include/graph/model.h"
#include "ai_ddk_lib/include/graph/op/all_ops.h"
#include "ai_ddk_lib/include/graph/operator.h"
#include "ai_ddk_lib/include/graph/operator_reg.h"
#include "lite/kernels/npu/bridges/registry.h" #include "lite/kernels/npu/bridges/registry.h"
#include "lite/kernels/npu/bridges/utils.h"
namespace paddle { namespace paddle {
namespace lite { namespace lite {
...@@ -34,7 +28,8 @@ node_map_type MulConverter(const std::shared_ptr<lite::OpLite> mul_op, ...@@ -34,7 +28,8 @@ node_map_type MulConverter(const std::shared_ptr<lite::OpLite> mul_op,
LOG(INFO) << "converting mul..."; LOG(INFO) << "converting mul...";
lite::Scope* scope = mul_op->scope(); lite::Scope* scope = mul_op->scope();
const lite::OpInfo* op_info = mul_op->op_info(); const lite::OpInfo* op_info = mul_op->op_info();
auto output_node = std::make_shared<ge::op::MatMul>(UniqueName("mul")); auto output_node =
std::make_shared<ge::op::MatMul>(lite::npu::UniqueName("mul"));
auto x_var_name = op_info->Input("X").front(); auto x_var_name = op_info->Input("X").front();
auto y_var_name = op_info->Input("Y").front(); auto y_var_name = op_info->Input("Y").front();
...@@ -66,8 +61,8 @@ node_map_type MulConverter(const std::shared_ptr<lite::OpLite> mul_op, ...@@ -66,8 +61,8 @@ node_map_type MulConverter(const std::shared_ptr<lite::OpLite> mul_op,
reshapex->set_input_tensor(*xsrc); reshapex->set_input_tensor(*xsrc);
reshapex->set_attr_shape({m, k}); reshapex->set_attr_shape({m, k});
reshapex->set_attr_axis(0); reshapex->set_attr_axis(0);
OpList::Global().add(xsrc); lite::npu::OpList::Global().add(xsrc);
OpList::Global().add(reshapex); lite::npu::OpList::Global().add(reshapex);
output_node->set_input_x(*reshapex); output_node->set_input_x(*reshapex);
} else { } else {
auto constx = std::make_shared<ge::op::Const>(x_var_name); auto constx = std::make_shared<ge::op::Const>(x_var_name);
...@@ -79,7 +74,7 @@ node_map_type MulConverter(const std::shared_ptr<lite::OpLite> mul_op, ...@@ -79,7 +74,7 @@ node_map_type MulConverter(const std::shared_ptr<lite::OpLite> mul_op,
auto* pdata = reinterpret_cast<uint8_t*>(xtensor->mutable_data<float>()); auto* pdata = reinterpret_cast<uint8_t*>(xtensor->mutable_data<float>());
ptensor->SetData(pdata, size * sizeof(float)); ptensor->SetData(pdata, size * sizeof(float));
constx->set_attr_value(ptensor); constx->set_attr_value(ptensor);
OpList::Global().add(constx); lite::npu::OpList::Global().add(constx);
output_node->set_input_x(*constx); output_node->set_input_x(*constx);
} }
...@@ -89,8 +84,8 @@ node_map_type MulConverter(const std::shared_ptr<lite::OpLite> mul_op, ...@@ -89,8 +84,8 @@ node_map_type MulConverter(const std::shared_ptr<lite::OpLite> mul_op,
reshapey->set_input_tensor(*ysrc); reshapey->set_input_tensor(*ysrc);
reshapey->set_attr_shape({k, n}); reshapey->set_attr_shape({k, n});
reshapey->set_attr_axis(0); reshapey->set_attr_axis(0);
OpList::Global().add(ysrc); lite::npu::OpList::Global().add(ysrc);
OpList::Global().add(reshapey); lite::npu::OpList::Global().add(reshapey);
output_node->set_input_w(*reshapey); output_node->set_input_w(*reshapey);
} else { } else {
auto consty = std::make_shared<ge::op::Const>(y_var_name); auto consty = std::make_shared<ge::op::Const>(y_var_name);
...@@ -102,11 +97,11 @@ node_map_type MulConverter(const std::shared_ptr<lite::OpLite> mul_op, ...@@ -102,11 +97,11 @@ node_map_type MulConverter(const std::shared_ptr<lite::OpLite> mul_op,
auto* pdata = reinterpret_cast<uint8_t*>(ytensor->mutable_data<float>()); auto* pdata = reinterpret_cast<uint8_t*>(ytensor->mutable_data<float>());
ptensor->SetData(pdata, size * sizeof(float)); ptensor->SetData(pdata, size * sizeof(float));
consty->set_attr_value(ptensor); consty->set_attr_value(ptensor);
OpList::Global().add(consty); lite::npu::OpList::Global().add(consty);
output_node->set_input_w(*consty); output_node->set_input_w(*consty);
} }
OpList::Global().add(output_node); lite::npu::OpList::Global().add(output_node);
node_map_type outputs_map; node_map_type outputs_map;
outputs_map[op_info->Output("Out").front()] = output_node; outputs_map[op_info->Output("Out").front()] = output_node;
......
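The MatMul bridge above first flattens X to an {m, k} matrix and Y to {k, n}. A minimal sketch of that flattening rule, assuming the usual x_num_col_dims / y_num_col_dims semantics of the mul op (FlattenTo2D is an illustrative helper, not part of the patch):

#include <cstdint>
#include <utility>
#include <vector>

// The first `num_col_dims` axes form the rows, the remaining axes the columns.
static std::pair<int64_t, int64_t> FlattenTo2D(const std::vector<int64_t>& dims,
                                               int num_col_dims) {
  int64_t rows = 1, cols = 1;
  for (int i = 0; i < num_col_dims; ++i) rows *= dims[i];
  for (size_t i = num_col_dims; i < dims.size(); ++i) cols *= dims[i];
  return {rows, cols};
}
// e.g. X = {1, 8, 8, 1} with x_num_col_dims = 2 -> {m, k} = {8, 8},
//      Y = {8, 4}       with y_num_col_dims = 1 -> {k, n} = {8, 4}.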
...@@ -12,14 +12,8 @@ ...@@ -12,14 +12,8 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "ai_ddk_lib/include/graph/buffer.h" #include "lite/backends/npu/builder.h"
#include "ai_ddk_lib/include/graph/graph.h"
#include "ai_ddk_lib/include/graph/model.h"
#include "ai_ddk_lib/include/graph/op/all_ops.h"
#include "ai_ddk_lib/include/graph/operator.h"
#include "ai_ddk_lib/include/graph/operator_reg.h"
#include "lite/kernels/npu/bridges/registry.h" #include "lite/kernels/npu/bridges/registry.h"
#include "lite/kernels/npu/bridges/utils.h"
namespace paddle { namespace paddle {
namespace lite { namespace lite {
...@@ -32,15 +26,15 @@ node_map_type Pad2dConverter(const std::shared_ptr<lite::OpLite> pad2d_op, ...@@ -32,15 +26,15 @@ node_map_type Pad2dConverter(const std::shared_ptr<lite::OpLite> pad2d_op,
auto scope = pad2d_op->scope(); auto scope = pad2d_op->scope();
auto op_info = pad2d_op->op_info(); auto op_info = pad2d_op->op_info();
auto op_type = op_info->Type(); auto op_type = op_info->Type();
auto unique_op_type = UniqueName(op_type); auto unique_op_type = lite::npu::UniqueName(op_type);
LOG(INFO) << "Converting " + op_type + "..."; LOG(INFO) << "Converting " + op_type + "...";
std::shared_ptr<ge::op::Pad> pad2d_node = std::shared_ptr<ge::op::Pad> pad2d_node =
std::make_shared<ge::op::Pad>(unique_op_type); std::make_shared<ge::op::Pad>(unique_op_type);
auto x_var_name = op_info->Input("X").front(); auto x_var_name = op_info->Input("X").front();
pad2d_node->set_input_x(*inputs_map.at(x_var_name)); pad2d_node->set_input_x(*inputs_map.at(x_var_name));
OpList::Global().add(inputs_map.at(x_var_name)); lite::npu::OpList::Global().add(inputs_map.at(x_var_name));
OpList::Global().add(pad2d_node); lite::npu::OpList::Global().add(pad2d_node);
auto mode = op_info->GetAttr<std::string>("mode"); auto mode = op_info->GetAttr<std::string>("mode");
if (mode == "constant") { if (mode == "constant") {
...@@ -59,17 +53,19 @@ node_map_type Pad2dConverter(const std::shared_ptr<lite::OpLite> pad2d_op, ...@@ -59,17 +53,19 @@ node_map_type Pad2dConverter(const std::shared_ptr<lite::OpLite> pad2d_op,
padding.insert(padding.begin(), xds * 2 - 4, 0); padding.insert(padding.begin(), xds * 2 - 4, 0);
auto npu_padding = auto npu_padding =
std::make_shared<ge::op::Const>(unique_op_type + "/padding"); std::make_shared<ge::op::Const>(unique_op_type + "/padding");
npu_padding->set_attr_value(CreateTensorAndFillData<int>(padding, {xds, 2})); npu_padding->set_attr_value(
lite::npu::CreateTensorAndFillData<int>(padding, {xds, 2}));
pad2d_node->set_input_padding(*npu_padding); pad2d_node->set_input_padding(*npu_padding);
OpList::Global().add(npu_padding); lite::npu::OpList::Global().add(npu_padding);
if (mode == "constant") { if (mode == "constant") {
auto pad_value = op_info->GetAttr<float>("pad_value"); auto pad_value = op_info->GetAttr<float>("pad_value");
auto npu_pad_value = auto npu_pad_value =
std::make_shared<ge::op::Const>(unique_op_type + "/pad_value"); std::make_shared<ge::op::Const>(unique_op_type + "/pad_value");
npu_pad_value->set_attr_value(CreateTensorAndFillData<float>({pad_value})); npu_pad_value->set_attr_value(
lite::npu::CreateTensorAndFillData<float>({pad_value}));
pad2d_node->set_input_constant_values(*npu_pad_value); pad2d_node->set_input_constant_values(*npu_pad_value);
OpList::Global().add(npu_pad_value); lite::npu::OpList::Global().add(npu_pad_value);
pad2d_node->set_attr_T(0); // type of pad_value: 0:float 3:int32 pad2d_node->set_attr_T(0); // type of pad_value: 0:float 3:int32
} }
......
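The pad2d bridge above packs the paddings into an {rank, 2} Const tensor, prefixing zeros so that only the spatial axes are padded. A small sketch of that expansion, assuming the lite attribute carries the four H/W paddings in {top, bottom, left, right} order (illustrative helper):

#include <vector>

// Leading axes (e.g. N and C of an NCHW input) get {0, 0}; the last two rows
// carry the H and W paddings, matching the {xds, 2} tensor built above.
static std::vector<int> ExpandPadding(const std::vector<int>& pad_hw, int rank) {
  std::vector<int> padding = pad_hw;
  padding.insert(padding.begin(), rank * 2 - 4, 0);
  return padding;  // rank 4 -> {0, 0, 0, 0, top, bottom, left, right}
}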
...@@ -12,14 +12,8 @@ ...@@ -12,14 +12,8 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "ai_ddk_lib/include/graph/buffer.h" #include "lite/backends/npu/builder.h"
#include "ai_ddk_lib/include/graph/graph.h"
#include "ai_ddk_lib/include/graph/model.h"
#include "ai_ddk_lib/include/graph/op/all_ops.h"
#include "ai_ddk_lib/include/graph/operator.h"
#include "ai_ddk_lib/include/graph/operator_reg.h"
#include "lite/kernels/npu/bridges/registry.h" #include "lite/kernels/npu/bridges/registry.h"
#include "lite/kernels/npu/bridges/utils.h"
namespace paddle { namespace paddle {
namespace lite { namespace lite {
...@@ -32,7 +26,7 @@ node_map_type PoolConverter(const std::shared_ptr<lite::OpLite> pool_op, ...@@ -32,7 +26,7 @@ node_map_type PoolConverter(const std::shared_ptr<lite::OpLite> pool_op,
auto scope = pool_op->scope(); auto scope = pool_op->scope();
auto op_info = pool_op->op_info(); auto op_info = pool_op->op_info();
auto op_type = op_info->Type(); auto op_type = op_info->Type();
auto unique_op_type = UniqueName(op_type); auto unique_op_type = lite::npu::UniqueName(op_type);
LOG(INFO) << "Converting " + op_type + "..."; LOG(INFO) << "Converting " + op_type + "...";
std::shared_ptr<ge::op::Pooling> pool_node = std::shared_ptr<ge::op::Pooling> pool_node =
...@@ -73,8 +67,8 @@ node_map_type PoolConverter(const std::shared_ptr<lite::OpLite> pool_op, ...@@ -73,8 +67,8 @@ node_map_type PoolConverter(const std::shared_ptr<lite::OpLite> pool_op,
pool_node->set_attr_ceil_mode(npu_ceil_mode); pool_node->set_attr_ceil_mode(npu_ceil_mode);
// output_node->set_attr_data_mode(npu_data_mode); // output_node->set_attr_data_mode(npu_data_mode);
OpList::Global().add(inputs_map.at(x_var_name)); lite::npu::OpList::Global().add(inputs_map.at(x_var_name));
OpList::Global().add(pool_node); lite::npu::OpList::Global().add(pool_node);
node_map_type outputs_map; node_map_type outputs_map;
outputs_map[op_info->Output("Out").front()] = pool_node; outputs_map[op_info->Output("Out").front()] = pool_node;
......
...@@ -13,14 +13,8 @@ ...@@ -13,14 +13,8 @@
// limitations under the License. // limitations under the License.
#include "lite/operators/reshape_op.h" #include "lite/operators/reshape_op.h"
#include "ai_ddk_lib/include/graph/buffer.h" #include "lite/backends/npu/builder.h"
#include "ai_ddk_lib/include/graph/graph.h"
#include "ai_ddk_lib/include/graph/model.h"
#include "ai_ddk_lib/include/graph/op/all_ops.h"
#include "ai_ddk_lib/include/graph/operator.h"
#include "ai_ddk_lib/include/graph/operator_reg.h"
#include "lite/kernels/npu/bridges/registry.h" #include "lite/kernels/npu/bridges/registry.h"
#include "lite/kernels/npu/bridges/utils.h"
namespace paddle { namespace paddle {
namespace lite { namespace lite {
...@@ -33,7 +27,7 @@ node_map_type ReshapeConverter(const std::shared_ptr<lite::OpLite> reshape_op, ...@@ -33,7 +27,7 @@ node_map_type ReshapeConverter(const std::shared_ptr<lite::OpLite> reshape_op,
auto scope = reshape_op->scope(); auto scope = reshape_op->scope();
auto op_info = reshape_op->op_info(); auto op_info = reshape_op->op_info();
auto op_type = op_info->Type(); auto op_type = op_info->Type();
auto unique_op_type = UniqueName(op_type); auto unique_op_type = lite::npu::UniqueName(op_type);
LOG(INFO) << "Converting " + op_type + "..."; LOG(INFO) << "Converting " + op_type + "...";
// get input, output and op attributes // get input, output and op attributes
...@@ -45,10 +39,10 @@ node_map_type ReshapeConverter(const std::shared_ptr<lite::OpLite> reshape_op, ...@@ -45,10 +39,10 @@ node_map_type ReshapeConverter(const std::shared_ptr<lite::OpLite> reshape_op,
auto reshape_node = std::make_shared<ge::op::Reshape>(unique_op_type); auto reshape_node = std::make_shared<ge::op::Reshape>(unique_op_type);
CHECK(inputs_map.count(x_var_name)); CHECK(inputs_map.count(x_var_name));
reshape_node->set_input_tensor(*inputs_map.at(x_var_name)); reshape_node->set_input_tensor(*inputs_map.at(x_var_name));
OpList::Global().add(inputs_map.at(x_var_name)); lite::npu::OpList::Global().add(inputs_map.at(x_var_name));
// read shape from actual shape tensor as input "w" if 'Shape' is found // read shape from actual shape tensor as input "w" if 'Shape' is found
if (HasInputArg(op_info, scope, "Shape")) { if (lite::npu::HasInputArg(op_info, scope, "Shape")) {
auto actual_shape_var_name = op_info->Input("Shape").front(); auto actual_shape_var_name = op_info->Input("Shape").front();
if (!inputs_map.count(actual_shape_var_name)) { if (!inputs_map.count(actual_shape_var_name)) {
auto actual_shape = auto actual_shape =
...@@ -67,13 +61,14 @@ node_map_type ReshapeConverter(const std::shared_ptr<lite::OpLite> reshape_op, ...@@ -67,13 +61,14 @@ node_map_type ReshapeConverter(const std::shared_ptr<lite::OpLite> reshape_op,
} }
auto actual_shape_const_node = auto actual_shape_const_node =
std::make_shared<ge::op::Const>(actual_shape_var_name); std::make_shared<ge::op::Const>(actual_shape_var_name);
actual_shape_const_node->set_attr_value(CreateTensorAndFillData( actual_shape_const_node->set_attr_value(
lite::npu::CreateTensorAndFillData(
std::vector<int>(out_shape.begin(), out_shape.end()))); std::vector<int>(out_shape.begin(), out_shape.end())));
reshape_node->set_input_w(*actual_shape_const_node); reshape_node->set_input_w(*actual_shape_const_node);
OpList::Global().add(actual_shape_const_node); lite::npu::OpList::Global().add(actual_shape_const_node);
} else { } else {
reshape_node->set_input_w(*inputs_map.at(actual_shape_var_name)); reshape_node->set_input_w(*inputs_map.at(actual_shape_var_name));
OpList::Global().add(inputs_map.at(actual_shape_var_name)); lite::npu::OpList::Global().add(inputs_map.at(actual_shape_var_name));
} }
} else { } else {
auto shape = op_info->GetAttr<std::vector<int>>("shape"); auto shape = op_info->GetAttr<std::vector<int>>("shape");
...@@ -87,7 +82,7 @@ node_map_type ReshapeConverter(const std::shared_ptr<lite::OpLite> reshape_op, ...@@ -87,7 +82,7 @@ node_map_type ReshapeConverter(const std::shared_ptr<lite::OpLite> reshape_op,
reshape_node->set_attr_shape( reshape_node->set_attr_shape(
ge::AttrValue::LIST_INT(out_shape.begin(), out_shape.end())); ge::AttrValue::LIST_INT(out_shape.begin(), out_shape.end()));
} }
OpList::Global().add(reshape_node); lite::npu::OpList::Global().add(reshape_node);
node_map_type outputs_map; node_map_type outputs_map;
outputs_map[op_info->Output("Out").front()] = reshape_node; outputs_map[op_info->Output("Out").front()] = reshape_node;
...@@ -107,7 +102,7 @@ node_map_type ReshapeConverter(const std::shared_ptr<lite::OpLite> reshape_op, ...@@ -107,7 +102,7 @@ node_map_type ReshapeConverter(const std::shared_ptr<lite::OpLite> reshape_op,
xshape_node->set_input_tensor(*inputs_map.at(x_var_name)); xshape_node->set_input_tensor(*inputs_map.at(x_var_name));
xshape_node->set_attr_shape( xshape_node->set_attr_shape(
ge::AttrValue::LIST_INT(xshape_dims.begin(), xshape_dims.end())); ge::AttrValue::LIST_INT(xshape_dims.begin(), xshape_dims.end()));
OpList::Global().add(xshape_node); lite::npu::OpList::Global().add(xshape_node);
outputs_map[op_info->Output("XShape").front()] = xshape_node; outputs_map[op_info->Output("XShape").front()] = xshape_node;
} }
return outputs_map; return outputs_map;
......
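When no Shape input is present, the converter above falls back to the "shape" attribute. For reference, the usual Paddle reshape resolution (0 copies the corresponding input dim, a single -1 is inferred from the remaining elements) is sketched below; ResolveShape is an illustrative helper, not code from the patch:

#include <cstdint>
#include <vector>

static std::vector<int64_t> ResolveShape(const std::vector<int>& shape,
                                         const std::vector<int64_t>& x_dims) {
  std::vector<int64_t> out(shape.begin(), shape.end());
  int64_t total = 1;
  for (auto d : x_dims) total *= d;
  int64_t known = 1;
  int neg_idx = -1;
  for (size_t i = 0; i < out.size(); ++i) {
    if (out[i] == 0) out[i] = x_dims[i];          // 0: keep the input dim
    if (out[i] == -1) { neg_idx = static_cast<int>(i); continue; }  // -1: infer
    known *= out[i];
  }
  if (neg_idx >= 0) out[neg_idx] = total / known;
  return out;
}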
...@@ -12,14 +12,8 @@ ...@@ -12,14 +12,8 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "ai_ddk_lib/include/graph/buffer.h" #include "lite/backends/npu/builder.h"
#include "ai_ddk_lib/include/graph/graph.h"
#include "ai_ddk_lib/include/graph/model.h"
#include "ai_ddk_lib/include/graph/op/all_ops.h"
#include "ai_ddk_lib/include/graph/operator.h"
#include "ai_ddk_lib/include/graph/operator_reg.h"
#include "lite/kernels/npu/bridges/registry.h" #include "lite/kernels/npu/bridges/registry.h"
#include "lite/kernels/npu/bridges/utils.h"
namespace paddle { namespace paddle {
namespace lite { namespace lite {
...@@ -32,7 +26,7 @@ node_map_type ScaleConverter(const std::shared_ptr<lite::OpLite> scale_op, ...@@ -32,7 +26,7 @@ node_map_type ScaleConverter(const std::shared_ptr<lite::OpLite> scale_op,
auto scope = scale_op->scope(); auto scope = scale_op->scope();
auto op_info = scale_op->op_info(); auto op_info = scale_op->op_info();
auto op_type = op_info->Type(); auto op_type = op_info->Type();
auto unique_op_type = UniqueName(op_type); auto unique_op_type = lite::npu::UniqueName(op_type);
LOG(INFO) << "Converting " + op_type + "..."; LOG(INFO) << "Converting " + op_type + "...";
// get input, output and op attributes // get input, output and op attributes
...@@ -52,26 +46,26 @@ node_map_type ScaleConverter(const std::shared_ptr<lite::OpLite> scale_op, ...@@ -52,26 +46,26 @@ node_map_type ScaleConverter(const std::shared_ptr<lite::OpLite> scale_op,
auto scale_node = std::make_shared<ge::op::Scale>(unique_op_type); auto scale_node = std::make_shared<ge::op::Scale>(unique_op_type);
CHECK(inputs_map.count(x_var_name)); CHECK(inputs_map.count(x_var_name));
scale_node->set_input_x(*inputs_map.at(x_var_name)); scale_node->set_input_x(*inputs_map.at(x_var_name));
OpList::Global().add(inputs_map.at(x_var_name)); lite::npu::OpList::Global().add(inputs_map.at(x_var_name));
OpList::Global().add(scale_node); lite::npu::OpList::Global().add(scale_node);
// add filter node(fill with scale) // add filter node(fill with scale)
auto filter_const_node = auto filter_const_node =
std::make_shared<ge::op::Const>(unique_op_type + "/filter"); std::make_shared<ge::op::Const>(unique_op_type + "/filter");
filter_const_node->set_attr_value( filter_const_node->set_attr_value(
CreateTensorAndFillData(scale, scale_bias_shape)); lite::npu::CreateTensorAndFillData(scale, scale_bias_shape));
scale_node->set_input_filter(*filter_const_node); scale_node->set_input_filter(*filter_const_node);
OpList::Global().add(filter_const_node); lite::npu::OpList::Global().add(filter_const_node);
// add bias node(fill with bias) // add bias node(fill with bias)
if (fabs(bias) > 1e-6f) { if (fabs(bias) > 1e-6f) {
auto bias_const_node = auto bias_const_node =
std::make_shared<ge::op::Const>(unique_op_type + "/bias"); std::make_shared<ge::op::Const>(unique_op_type + "/bias");
bias_const_node->set_attr_value( bias_const_node->set_attr_value(
CreateTensorAndFillData(bias, scale_bias_shape)); lite::npu::CreateTensorAndFillData(bias, scale_bias_shape));
scale_node->set_input_bias(*bias_const_node); scale_node->set_input_bias(*bias_const_node);
scale_node->set_attr_has_bias_value(true); scale_node->set_attr_has_bias_value(true);
OpList::Global().add(bias_const_node); lite::npu::OpList::Global().add(bias_const_node);
} }
scale_node->set_attr_axis(1); scale_node->set_attr_axis(1);
......
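For reference, the scale op lowered above is a plain affine transform; the bias Const node is only emitted when |bias| > 1e-6. A minimal sketch of the element-wise computation, assuming the bias-after-scale form the converter targets:

#include <vector>

// y[i] = scale * x[i] + bias
static std::vector<float> ScaleRef(const std::vector<float>& x,
                                   float scale, float bias) {
  std::vector<float> y(x.size());
  for (size_t i = 0; i < x.size(); ++i) y[i] = scale * x[i] + bias;
  return y;
}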
...@@ -12,14 +12,8 @@ ...@@ -12,14 +12,8 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "ai_ddk_lib/include/graph/buffer.h" #include "lite/backends/npu/builder.h"
#include "ai_ddk_lib/include/graph/graph.h"
#include "ai_ddk_lib/include/graph/model.h"
#include "ai_ddk_lib/include/graph/op/all_ops.h"
#include "ai_ddk_lib/include/graph/operator.h"
#include "ai_ddk_lib/include/graph/operator_reg.h"
#include "lite/kernels/npu/bridges/registry.h" #include "lite/kernels/npu/bridges/registry.h"
#include "lite/kernels/npu/bridges/utils.h"
namespace paddle { namespace paddle {
namespace lite { namespace lite {
...@@ -33,7 +27,7 @@ node_map_type ShuffleChannelConverter( ...@@ -33,7 +27,7 @@ node_map_type ShuffleChannelConverter(
auto scope = shuffle_channel_op->scope(); auto scope = shuffle_channel_op->scope();
auto op_info = shuffle_channel_op->op_info(); auto op_info = shuffle_channel_op->op_info();
auto op_type = op_info->Type(); auto op_type = op_info->Type();
auto unique_op_type = UniqueName(op_type); auto unique_op_type = lite::npu::UniqueName(op_type);
LOG(INFO) << "Converting " + op_type + "..."; LOG(INFO) << "Converting " + op_type + "...";
std::shared_ptr<ge::op::ShuffleChannel> shuffle_channel_node = std::shared_ptr<ge::op::ShuffleChannel> shuffle_channel_node =
...@@ -43,8 +37,8 @@ node_map_type ShuffleChannelConverter( ...@@ -43,8 +37,8 @@ node_map_type ShuffleChannelConverter(
shuffle_channel_node->set_input_x(*inputs_map.at(x_var_name)); shuffle_channel_node->set_input_x(*inputs_map.at(x_var_name));
shuffle_channel_node->set_attr_group(op_info->GetAttr<int>("group")); shuffle_channel_node->set_attr_group(op_info->GetAttr<int>("group"));
OpList::Global().add(inputs_map.at(x_var_name)); lite::npu::OpList::Global().add(inputs_map.at(x_var_name));
OpList::Global().add(shuffle_channel_node); lite::npu::OpList::Global().add(shuffle_channel_node);
node_map_type outputs_map; node_map_type outputs_map;
outputs_map[op_info->Output("Out").front()] = shuffle_channel_node; outputs_map[op_info->Output("Out").front()] = shuffle_channel_node;
......
...@@ -12,14 +12,8 @@ ...@@ -12,14 +12,8 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "ai_ddk_lib/include/graph/buffer.h" #include "lite/backends/npu/builder.h"
#include "ai_ddk_lib/include/graph/graph.h"
#include "ai_ddk_lib/include/graph/model.h"
#include "ai_ddk_lib/include/graph/op/all_ops.h"
#include "ai_ddk_lib/include/graph/operator.h"
#include "ai_ddk_lib/include/graph/operator_reg.h"
#include "lite/kernels/npu/bridges/registry.h" #include "lite/kernels/npu/bridges/registry.h"
#include "lite/kernels/npu/bridges/utils.h"
namespace paddle { namespace paddle {
namespace lite { namespace lite {
...@@ -32,7 +26,7 @@ node_map_type SoftmaxConverter(const std::shared_ptr<lite::OpLite> softmax_op, ...@@ -32,7 +26,7 @@ node_map_type SoftmaxConverter(const std::shared_ptr<lite::OpLite> softmax_op,
auto scope = softmax_op->scope(); auto scope = softmax_op->scope();
auto op_info = softmax_op->op_info(); auto op_info = softmax_op->op_info();
auto op_type = op_info->Type(); auto op_type = op_info->Type();
auto unique_op_type = UniqueName(op_type); auto unique_op_type = lite::npu::UniqueName(op_type);
LOG(INFO) << "Converting " + op_type + "..."; LOG(INFO) << "Converting " + op_type + "...";
std::shared_ptr<ge::op::Softmax> softmax_node = std::shared_ptr<ge::op::Softmax> softmax_node =
...@@ -51,8 +45,8 @@ node_map_type SoftmaxConverter(const std::shared_ptr<lite::OpLite> softmax_op, ...@@ -51,8 +45,8 @@ node_map_type SoftmaxConverter(const std::shared_ptr<lite::OpLite> softmax_op,
softmax_node->set_input_x(*inputs_map.at(x_var_name)); softmax_node->set_input_x(*inputs_map.at(x_var_name));
softmax_node->set_attr_axis(axis); softmax_node->set_attr_axis(axis);
OpList::Global().add(inputs_map.at(x_var_name)); lite::npu::OpList::Global().add(inputs_map.at(x_var_name));
OpList::Global().add(softmax_node); lite::npu::OpList::Global().add(softmax_node);
node_map_type outputs_map; node_map_type outputs_map;
outputs_map[op_info->Output("Out").front()] = softmax_node; outputs_map[op_info->Output("Out").front()] = softmax_node;
......
...@@ -12,14 +12,8 @@ ...@@ -12,14 +12,8 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "ai_ddk_lib/include/graph/buffer.h" #include "lite/backends/npu/builder.h"
#include "ai_ddk_lib/include/graph/graph.h"
#include "ai_ddk_lib/include/graph/model.h"
#include "ai_ddk_lib/include/graph/op/all_ops.h"
#include "ai_ddk_lib/include/graph/operator.h"
#include "ai_ddk_lib/include/graph/operator_reg.h"
#include "lite/kernels/npu/bridges/registry.h" #include "lite/kernels/npu/bridges/registry.h"
#include "lite/kernels/npu/bridges/utils.h"
namespace paddle { namespace paddle {
namespace lite { namespace lite {
...@@ -32,7 +26,7 @@ node_map_type SplitConverter(const std::shared_ptr<lite::OpLite> split_op, ...@@ -32,7 +26,7 @@ node_map_type SplitConverter(const std::shared_ptr<lite::OpLite> split_op,
lite::Scope* scope = split_op->scope(); lite::Scope* scope = split_op->scope();
const lite::OpInfo* op_info = split_op->op_info(); const lite::OpInfo* op_info = split_op->op_info();
auto op_type = op_info->Type(); auto op_type = op_info->Type();
auto unique_op_type = UniqueName(op_type); auto unique_op_type = lite::npu::UniqueName(op_type);
LOG(INFO) << "Converting " << op_type << " ... "; LOG(INFO) << "Converting " << op_type << " ... ";
auto x_var_name = op_info->Input("X").front(); auto x_var_name = op_info->Input("X").front();
...@@ -45,7 +39,7 @@ node_map_type SplitConverter(const std::shared_ptr<lite::OpLite> split_op, ...@@ -45,7 +39,7 @@ node_map_type SplitConverter(const std::shared_ptr<lite::OpLite> split_op,
std::make_shared<ge::op::Split>(unique_op_type); std::make_shared<ge::op::Split>(unique_op_type);
CHECK(inputs_map.count(x_var_name)); CHECK(inputs_map.count(x_var_name));
output_node->set_input_x(*inputs_map.at(x_var_name)); output_node->set_input_x(*inputs_map.at(x_var_name));
OpList::Global().add(inputs_map.at(x_var_name)); lite::npu::OpList::Global().add(inputs_map.at(x_var_name));
output_node->set_attr_axis(static_cast<int64_t>(axis)); output_node->set_attr_axis(static_cast<int64_t>(axis));
if (num > 0) { if (num > 0) {
...@@ -63,18 +57,18 @@ node_map_type SplitConverter(const std::shared_ptr<lite::OpLite> split_op, ...@@ -63,18 +57,18 @@ node_map_type SplitConverter(const std::shared_ptr<lite::OpLite> split_op,
for (auto out_var_name : out_var_names) { for (auto out_var_name : out_var_names) {
auto const_node = std::make_shared<ge::op::Const>( auto const_node = std::make_shared<ge::op::Const>(
unique_op_type + "/const_zero" + std::to_string(index)); unique_op_type + "/const_zero" + std::to_string(index));
const_node->set_attr_value(CreateTensorAndFillData(0)); const_node->set_attr_value(lite::npu::CreateTensorAndFillData(0));
OpList::Global().add(const_node); lite::npu::OpList::Global().add(const_node);
auto add_node = std::make_shared<ge::op::Add>(unique_op_type + "/add" + auto add_node = std::make_shared<ge::op::Add>(unique_op_type + "/add" +
std::to_string(index)); std::to_string(index));
add_node->set_input_x1(*output_node, "y" + std::to_string(index)); add_node->set_input_x1(*output_node, "y" + std::to_string(index));
add_node->set_input_x2(*const_node); add_node->set_input_x2(*const_node);
outputs_map[out_var_name] = add_node; outputs_map[out_var_name] = add_node;
OpList::Global().add(add_node); lite::npu::OpList::Global().add(add_node);
index++; index++;
} }
OpList::Global().add(output_node); lite::npu::OpList::Global().add(output_node);
return outputs_map; return outputs_map;
} }
......
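Each ge::op::Split output is only reachable as a named sub-output ("y0", "y1", ...), so the converter above appears to chain an add-zero node onto every branch so that each lite output variable maps to its own graph node. The slice sizes themselves follow the usual split semantics; a sketch, ignoring the -1 placeholder the full op also accepts (SplitSizes is illustrative):

#include <cstdint>
#include <vector>

// Either `num` equal slices along `axis`, or the explicit `sections` list.
static std::vector<int64_t> SplitSizes(int64_t dim_size, int num,
                                       const std::vector<int64_t>& sections) {
  if (num > 0) return std::vector<int64_t>(num, dim_size / num);
  return sections;
}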
...@@ -14,10 +14,9 @@ ...@@ -14,10 +14,9 @@
#include "lite/kernels/npu/bridges/test_helper.h" #include "lite/kernels/npu/bridges/test_helper.h"
#include <utility> #include <utility>
#include "ai_ddk_lib/include/graph/op/all_ops.h" #include "lite/backends/npu/builder.h"
#include "lite/core/op_registry.h" #include "lite/core/op_registry.h"
#include "lite/kernels/npu/bridges/registry.h" #include "lite/kernels/npu/bridges/registry.h"
#include "lite/kernels/npu/bridges/utils.h"
#include "lite/operators/graph_op.h" #include "lite/operators/graph_op.h"
namespace paddle { namespace paddle {
...@@ -44,7 +43,7 @@ void LauchOp(const std::shared_ptr<lite::OpLite> op, ...@@ -44,7 +43,7 @@ void LauchOp(const std::shared_ptr<lite::OpLite> op,
ge::Shape(input->dims().Vectorize()), ge::FORMAT_NCHW, ge::DT_FLOAT); ge::Shape(input->dims().Vectorize()), ge::FORMAT_NCHW, ge::DT_FLOAT);
auto input_node = std::make_shared<ge::op::Data>(input_var_name); auto input_node = std::make_shared<ge::op::Data>(input_var_name);
input_node->update_input_desc_x(input_desc); input_node->update_input_desc_x(input_desc);
OpList::Global().add(input_node); lite::npu::OpList::Global().add(input_node);
inputs_map[input_var_name] = input_node; inputs_map[input_var_name] = input_node;
} }
auto outputs_map = supported_lists.at(op_type)(op, inputs_map); auto outputs_map = supported_lists.at(op_type)(op, inputs_map);
...@@ -63,7 +62,7 @@ void LauchOp(const std::shared_ptr<lite::OpLite> op, ...@@ -63,7 +62,7 @@ void LauchOp(const std::shared_ptr<lite::OpLite> op,
auto weight = scope->Var(weight_var_name)->GetMutable<Tensor>(); auto weight = scope->Var(weight_var_name)->GetMutable<Tensor>();
weight->set_persistable(true); weight->set_persistable(true);
weight->set_precision(PRECISION(kInt8)); weight->set_precision(PRECISION(kInt8));
CHECK(BuildModel(graph_inputs, graph_outputs, weight)); CHECK(lite::npu::BuildModel(graph_inputs, graph_outputs, weight));
CHECK_GT(weight->numel(), 0); CHECK_GT(weight->numel(), 0);
CHECK_NE(weight->data<uint8_t>(), 0); CHECK_NE(weight->data<uint8_t>(), 0);
...@@ -94,7 +93,7 @@ void LauchOp(const std::shared_ptr<lite::OpLite> op, ...@@ -94,7 +93,7 @@ void LauchOp(const std::shared_ptr<lite::OpLite> op,
graph_kernel->Launch(); graph_kernel->Launch();
// release all of resources of generated model // release all of resources of generated model
OpList::Global().clear(); lite::npu::OpList::Global().clear();
} }
} // namespace bridges } // namespace bridges
......
...@@ -12,14 +12,8 @@ ...@@ -12,14 +12,8 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "ai_ddk_lib/include/graph/buffer.h" #include "lite/backends/npu/builder.h"
#include "ai_ddk_lib/include/graph/graph.h"
#include "ai_ddk_lib/include/graph/model.h"
#include "ai_ddk_lib/include/graph/op/all_ops.h"
#include "ai_ddk_lib/include/graph/operator.h"
#include "ai_ddk_lib/include/graph/operator_reg.h"
#include "lite/kernels/npu/bridges/registry.h" #include "lite/kernels/npu/bridges/registry.h"
#include "lite/kernels/npu/bridges/utils.h"
namespace paddle { namespace paddle {
namespace lite { namespace lite {
...@@ -33,7 +27,7 @@ node_map_type TransposeConverter( ...@@ -33,7 +27,7 @@ node_map_type TransposeConverter(
auto scope = transpose_op->scope(); auto scope = transpose_op->scope();
auto op_info = transpose_op->op_info(); auto op_info = transpose_op->op_info();
auto op_type = op_info->Type(); auto op_type = op_info->Type();
auto unique_op_type = UniqueName(op_type); auto unique_op_type = lite::npu::UniqueName(op_type);
LOG(INFO) << "Converting " + op_type + "..."; LOG(INFO) << "Converting " + op_type + "...";
std::shared_ptr<ge::op::Permute> transpose_node = std::shared_ptr<ge::op::Permute> transpose_node =
...@@ -50,8 +44,8 @@ node_map_type TransposeConverter( ...@@ -50,8 +44,8 @@ node_map_type TransposeConverter(
w_data[i] = 1.f; w_data[i] = 1.f;
} }
auto npu_w = std::make_shared<ge::op::Const>(w_var_name); auto npu_w = std::make_shared<ge::op::Const>(w_var_name);
npu_w->set_attr_value(CvtFromLiteTensor(w)); npu_w->set_attr_value(lite::npu::CvtFromLiteTensor(w));
OpList::Global().add(npu_w); lite::npu::OpList::Global().add(npu_w);
auto axis = op_info->GetAttr<std::vector<int>>("axis"); auto axis = op_info->GetAttr<std::vector<int>>("axis");
auto npu_axis = ge::AttrValue::LIST_INT(axis.begin(), axis.end()); auto npu_axis = ge::AttrValue::LIST_INT(axis.begin(), axis.end());
...@@ -61,8 +55,8 @@ node_map_type TransposeConverter( ...@@ -61,8 +55,8 @@ node_map_type TransposeConverter(
transpose_node->set_input_w(*npu_w); transpose_node->set_input_w(*npu_w);
transpose_node->set_attr_order(npu_axis); transpose_node->set_attr_order(npu_axis);
OpList::Global().add(inputs_map.at(x_var_name)); lite::npu::OpList::Global().add(inputs_map.at(x_var_name));
OpList::Global().add(transpose_node); lite::npu::OpList::Global().add(transpose_node);
node_map_type outputs_map; node_map_type outputs_map;
outputs_map[op_info->Output("Out").front()] = transpose_node; outputs_map[op_info->Output("Out").front()] = transpose_node;
......
...@@ -49,8 +49,8 @@ void GraphCompute::PrepareForRun() { ...@@ -49,8 +49,8 @@ void GraphCompute::PrepareForRun() {
VLOG(3) << "npu_idims[" << i << "]: " << npu_idims_[i].GetNumber() << "," VLOG(3) << "npu_idims[" << i << "]: " << npu_idims_[i].GetNumber() << ","
<< npu_idims_[i].GetChannel() << "," << npu_idims_[i].GetHeight() << npu_idims_[i].GetChannel() << "," << npu_idims_[i].GetHeight()
<< "," << npu_idims_[i].GetWidth(); << "," << npu_idims_[i].GetWidth();
VLOG(3) << "lite_idims[" << i << "]: " << param.inputs[i]->dims(); VLOG(3) << "lite_idims[" << i << "]: " << param.inputs[i].second->dims();
CHECK_EQ(param.inputs[i]->dims().production(), CHECK_EQ(param.inputs[i].second->dims().production(),
npu_idims_[i].GetNumber() * npu_idims_[i].GetChannel() * npu_idims_[i].GetNumber() * npu_idims_[i].GetChannel() *
npu_idims_[i].GetHeight() * npu_idims_[i].GetWidth()); npu_idims_[i].GetHeight() * npu_idims_[i].GetWidth());
npu_itensors_[i].reset(new hiai::AiTensor); npu_itensors_[i].reset(new hiai::AiTensor);
...@@ -61,16 +61,16 @@ void GraphCompute::PrepareForRun() { ...@@ -61,16 +61,16 @@ void GraphCompute::PrepareForRun() {
VLOG(3) << "npu_odims[" << i << "]: " << npu_odims_[i].GetNumber() << "," VLOG(3) << "npu_odims[" << i << "]: " << npu_odims_[i].GetNumber() << ","
<< npu_odims_[i].GetChannel() << "," << npu_odims_[i].GetHeight() << npu_odims_[i].GetChannel() << "," << npu_odims_[i].GetHeight()
<< "," << npu_odims_[i].GetWidth(); << "," << npu_odims_[i].GetWidth();
VLOG(3) << "lite_odims[" << i << "]: " << param.outputs[i]->dims(); VLOG(3) << "lite_odims[" << i << "]: " << param.outputs[i].second->dims();
auto out_size = npu_odims_[i].GetNumber() * npu_odims_[i].GetChannel() * auto out_size = npu_odims_[i].GetNumber() * npu_odims_[i].GetChannel() *
npu_odims_[i].GetHeight() * npu_odims_[i].GetWidth(); npu_odims_[i].GetHeight() * npu_odims_[i].GetWidth();
if (param.outputs[i]->dims().production() != out_size) { if (param.outputs[i].second->dims().production() != out_size) {
param.outputs[i]->Resize({npu_odims_[i].GetNumber(), param.outputs[i].second->Resize({npu_odims_[i].GetNumber(),
npu_odims_[i].GetChannel(), npu_odims_[i].GetChannel(),
npu_odims_[i].GetHeight(), npu_odims_[i].GetHeight(),
npu_odims_[i].GetWidth()}); npu_odims_[i].GetWidth()});
} }
LOG(INFO) << param.outputs[i]->dims(); LOG(INFO) << param.outputs[i].second->dims();
npu_otensors_[i].reset(new hiai::AiTensor); npu_otensors_[i].reset(new hiai::AiTensor);
npu_otensors_[i]->Init(&(npu_odims_[i])); npu_otensors_[i]->Init(&(npu_odims_[i]));
} }
...@@ -80,7 +80,7 @@ bool GraphCompute::input_dims_changed() const { ...@@ -80,7 +80,7 @@ bool GraphCompute::input_dims_changed() const {
auto& param = this->Param<param_t>(); auto& param = this->Param<param_t>();
CHECK_EQ(param.inputs.size(), npu_idims_.size()); CHECK_EQ(param.inputs.size(), npu_idims_.size());
for (size_t i = 0; i < param.inputs.size(); ++i) { for (size_t i = 0; i < param.inputs.size(); ++i) {
auto param_idims = param.inputs[i]->dims(); auto param_idims = param.inputs[i].second->dims();
CHECK(!param_idims.empty()); CHECK(!param_idims.empty());
CHECK_EQ(param_idims.size(), 4); CHECK_EQ(param_idims.size(), 4);
std::vector<int> idims{static_cast<int>(npu_idims_[i].GetNumber()), std::vector<int> idims{static_cast<int>(npu_idims_[i].GetNumber()),
...@@ -105,7 +105,7 @@ void GraphCompute::Run() { ...@@ -105,7 +105,7 @@ void GraphCompute::Run() {
CHECK_EQ(param.outputs.size(), npu_otensors_.size()); CHECK_EQ(param.outputs.size(), npu_otensors_.size());
for (size_t i = 0; i < param.inputs.size(); ++i) { for (size_t i = 0; i < param.inputs.size(); ++i) {
auto* itensor = param.inputs[i]; auto* itensor = param.inputs[i].second;
CHECK(itensor); CHECK(itensor);
const auto* i_data = itensor->data<float>(); const auto* i_data = itensor->data<float>();
std::memcpy( std::memcpy(
...@@ -126,10 +126,10 @@ void GraphCompute::Run() { ...@@ -126,10 +126,10 @@ void GraphCompute::Run() {
CHECK_EQ(hiai::AI_SUCCESS, CHECK_EQ(hiai::AI_SUCCESS,
model_client_->Process( model_client_->Process(
model_context_, npu_itensors_, npu_otensors_, 1000, istamp)); model_context_, npu_itensors_, npu_otensors_, 1000, istamp));
LOG(INFO) << "[NPU] Process cost " << GetCurrentUS() - start_time << " us"; VLOG(3) << "[NPU] Process cost " << GetCurrentUS() - start_time << " us";
for (size_t i = 0; i < param.outputs.size(); ++i) { for (size_t i = 0; i < param.outputs.size(); ++i) {
auto* otensor = param.outputs[i]; auto* otensor = param.outputs[i].second;
CHECK(otensor); CHECK(otensor);
auto* o_data = otensor->mutable_data<float>(); auto* o_data = otensor->mutable_data<float>();
auto* npu_obuffer = static_cast<float*>(npu_otensors_[i]->GetBuffer()); auto* npu_obuffer = static_cast<float*>(npu_otensors_[i]->GetBuffer());
......
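The kernel changes above reflect a param type switch: param.inputs and param.outputs now hold (variable name, tensor pointer) pairs instead of bare pointers, hence the `.second` accessors. A minimal sketch of walking the new layout, with types reduced to placeholders:

#include <string>
#include <utility>
#include <vector>

struct Tensor;  // stand-in for lite::Tensor in this sketch

using NamedTensors = std::vector<std::pair<std::string, Tensor*>>;

static void Visit(const NamedTensors& inputs) {
  for (const auto& named : inputs) {
    const std::string& var_name = named.first;  // graph variable name
    Tensor* tensor = named.second;              // payload used for dims/memcpy
    (void)var_name;
    (void)tensor;
  }
}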
add_kernel(activation_compute_x86 X86 basic SRCS activation_compute.cc DEPS ${lite_kernel_deps} activation_ops) add_kernel(activation_compute_x86 X86 basic SRCS activation_compute.cc DEPS ${lite_kernel_deps} activation_ops math_function)
# lite_cc_library(mean_compute_x86 SRCS mean_compute.cc DEPS ${lite_kernel_deps}) # lite_cc_library(mean_compute_x86 SRCS mean_compute.cc DEPS ${lite_kernel_deps})
# lite_cc_library(fill_constant_compute_x86 SRCS fill_constant_compute.cc DEPS ${lite_kernel_deps}) # lite_cc_library(fill_constant_compute_x86 SRCS fill_constant_compute.cc DEPS ${lite_kernel_deps})
# lite_cc_library(sgd_compute_x86 SRCS sgd_compute.cc DEPS ${lite_kernel_deps}) # lite_cc_library(sgd_compute_x86 SRCS sgd_compute.cc DEPS ${lite_kernel_deps})
...@@ -55,6 +55,8 @@ lite_cc_test(test_batch_norm_compute_x86 SRCS batch_norm_compute_test.cc DEPS ba ...@@ -55,6 +55,8 @@ lite_cc_test(test_batch_norm_compute_x86 SRCS batch_norm_compute_test.cc DEPS ba
lite_cc_test(test_softmax_compute_x86 SRCS softmax_compute_test.cc DEPS softmax_compute_x86) lite_cc_test(test_softmax_compute_x86 SRCS softmax_compute_test.cc DEPS softmax_compute_x86)
lite_cc_test(test_elementwise_compute_x86 SRCS elementwise_compute_test.cc DEPS elementwise_compute_x86) lite_cc_test(test_elementwise_compute_x86 SRCS elementwise_compute_test.cc DEPS elementwise_compute_x86)
lite_cc_test(test_relu_compute_x86 SRCS relu_compute_test.cc DEPS activation_compute_x86) lite_cc_test(test_relu_compute_x86 SRCS relu_compute_test.cc DEPS activation_compute_x86)
lite_cc_test(test_tanh_compute_x86 SRCS tanh_compute_test.cc DEPS activation_compute_x86)
lite_cc_test(test_gelu_compute_x86 SRCS gelu_compute_test.cc DEPS activation_compute_x86)
lite_cc_test(test_sequence_expand_as_compute_x86 SRCS sequence_expand_as_compute_test.cc DEPS sequence_expand_as_compute_x86) lite_cc_test(test_sequence_expand_as_compute_x86 SRCS sequence_expand_as_compute_test.cc DEPS sequence_expand_as_compute_x86)
lite_cc_test(test_gru_compute_x86 SRCS gru_compute_test.cc DEPS gru_compute_x86) lite_cc_test(test_gru_compute_x86 SRCS gru_compute_test.cc DEPS gru_compute_x86)
lite_cc_test(test_matmul_compute_x86 SRCS matmul_compute_test.cc DEPS matmul_compute_x86) lite_cc_test(test_matmul_compute_x86 SRCS matmul_compute_test.cc DEPS matmul_compute_x86)
......
...@@ -35,3 +35,25 @@ REGISTER_LITE_KERNEL(relu, ...@@ -35,3 +35,25 @@ REGISTER_LITE_KERNEL(relu,
.BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))})
.Finalize(); .Finalize();
// float
REGISTER_LITE_KERNEL(tanh,
kX86,
kFloat,
kNCHW,
paddle::lite::kernels::x86::TanhCompute<float>,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))})
.Finalize();
// float
REGISTER_LITE_KERNEL(gelu,
kX86,
kFloat,
kNCHW,
paddle::lite::kernels::x86::GeluCompute<float>,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))})
.Finalize();
...@@ -13,8 +13,10 @@ ...@@ -13,8 +13,10 @@
// limitations under the License. // limitations under the License.
#pragma once #pragma once
#include <algorithm>
#include <utility> #include <utility>
#include <vector> #include <vector>
#include "lite/backends/x86/math/blas.h"
#include "lite/core/kernel.h" #include "lite/core/kernel.h"
#include "lite/core/op_lite.h" #include "lite/core/op_lite.h"
#include "lite/core/op_registry.h" #include "lite/core/op_registry.h"
...@@ -115,6 +117,76 @@ class ReluCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> { ...@@ -115,6 +117,76 @@ class ReluCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
virtual ~ReluCompute() = default; virtual ~ReluCompute() = default;
}; };
// tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x))
template <typename T>
struct TanhFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out>
void operator()(Device d, X x, Out out) const {
out.device(d) = x.tanh();
}
};
template <typename T>
class TanhCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
public:
using param_t = operators::ActivationParam;
void Run() override {
auto& param = *param_.get_mutable<operators::ActivationParam>();
param.Out->template mutable_data<T>();
Activate<TanhFunctor<T>>(param.X, param.Out);
}
virtual ~TanhCompute() = default;
};
// gelu(x) = 0.5 * x * (1 + erf(x / sqrt(2)))
template <typename T>
struct GeluFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out>
void operator()(Device d, X x, Out out) const {
// Because the execution or device context cannot be delivered here, keep the
// macro for NVCC.
#if defined(PADDLE_WITH_MKLML) && !defined(_WIN32) && !defined(__APPLE__) && \
!defined(__OSX__) && !defined(PADDLE_WITH_CUDA)
auto x_data = x.data();
auto out_data = out.data();
int n = std::min(x.size(), out.size());
std::memset(out_data, 0, n * sizeof(T));
paddle::lite::x86::math::CBlas<T>::AXPY(
n, static_cast<T>(M_SQRT1_2), x_data, 1, out_data, 1);
paddle::lite::x86::math::CBlas<T>::VMERF(n, out_data, out_data, VML_LA);
for (int i = 0; i < n; i++) {
out_data[i] += static_cast<T>(1);
}
paddle::lite::x86::math::CBlas<T>::VMUL(n, x_data, out_data, out_data);
for (int i = 0; i < n; i++) {
out_data[i] *= static_cast<T>(0.5);
}
#else
auto temp = (x * static_cast<T>(M_SQRT1_2)).erf();
out.device(d) = x * static_cast<T>(0.5) * (static_cast<T>(1) + temp);
#endif
}
};
template <typename T>
class GeluCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
public:
using param_t = operators::ActivationParam;
void Run() override {
auto& param = *param_.get_mutable<operators::ActivationParam>();
param.Out->template mutable_data<T>();
Activate<GeluFunctor<T>>(param.X, param.Out);
}
virtual ~GeluCompute() = default;
};
} // namespace x86 } // namespace x86
} // namespace kernels } // namespace kernels
} // namespace lite } // namespace lite
......
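The two new functors follow the standard definitions; the gelu path additionally has an MKL branch built from AXPY, VMERF and VMUL. A tiny scalar reference of the math, independent of Eigen and MKL (names are illustrative):

#include <cmath>

// tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x))
static float TanhRef(float x) { return std::tanh(x); }

// gelu(x) = 0.5 * x * (1 + erf(x / sqrt(2)))
static float GeluRef(float x) {
  return 0.5f * x * (1.0f + std::erf(x / std::sqrt(2.0f)));
}
// e.g. GeluRef(1.6f) ~= 1.512, matching the reference data in the x86 test below.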
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gtest/gtest.h>
#include <iostream>
#include <memory>
#include <utility>
#include <vector>
#include "lite/core/op_registry.h"
#include "lite/kernels/x86/activation_compute.cc"
namespace paddle {
namespace lite {
namespace kernels {
namespace x86 {
TEST(gelu_x86, retrive_op) {
auto gelu =
KernelRegistry::Global().Create<TARGET(kX86), PRECISION(kFloat)>("gelu");
ASSERT_FALSE(gelu.empty());
ASSERT_TRUE(gelu.front());
}
TEST(gelu_x86, init) {
GeluCompute<float> gelu;
ASSERT_EQ(gelu.precision(), PRECISION(kFloat));
ASSERT_EQ(gelu.target(), TARGET(kX86));
}
TEST(gelu_x86, run_test) {
lite::Tensor x, out;
constexpr int batch_size = 1;
std::vector<int64_t> x_shape{batch_size, 3, 2, 2};
x.Resize(lite::DDim(x_shape));
std::vector<int64_t> out_shape{batch_size, 3, 2, 2};
out.Resize(lite::DDim(out_shape));
auto x_data = x.mutable_data<float>();
auto out_data = out.mutable_data<float>();
for (int64_t i = 0; i < x.dims().production(); i++) {
int sign = i % 2 == 0 ? 1 : -1;
x_data[i] = static_cast<float>(i * sign) * 0.8f;
}
// GeluCompute gelu;
GeluCompute<float> gelu;
operators::ActivationParam param;
param.X = &x;
param.Out = &out;
std::unique_ptr<KernelContext> ctx(new KernelContext);
ctx->As<X86Context>();
gelu.SetContext(std::move(ctx));
gelu.SetParam(param);
gelu.Run();
LOG(INFO) << "output: ";
std::vector<float> ref_data{0.,
-0.169484,
1.512321,
-0.019674,
3.197801,
-0.000126719,
4.8,
-0.,
6.4000001,
-0.,
8.,
-0.};
for (int i = 0; i < out.dims().production(); i++) {
LOG(INFO) << out_data[i];
EXPECT_NEAR(out_data[i], ref_data[i], 1e-5);
}
}
} // namespace x86
} // namespace kernels
} // namespace lite
} // namespace paddle
USE_LITE_KERNEL(gelu, kX86, kFloat, kNCHW, def);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gtest/gtest.h>
#include <iostream>
#include <memory>
#include <utility>
#include <vector>
#include "lite/core/op_registry.h"
#include "lite/kernels/x86/activation_compute.cc"
namespace paddle {
namespace lite {
namespace kernels {
namespace x86 {
TEST(tanh_x86, retrive_op) {
auto tanh =
KernelRegistry::Global().Create<TARGET(kX86), PRECISION(kFloat)>("tanh");
ASSERT_FALSE(tanh.empty());
ASSERT_TRUE(tanh.front());
}
TEST(tanh_x86, init) {
TanhCompute<float> tanh;
ASSERT_EQ(tanh.precision(), PRECISION(kFloat));
ASSERT_EQ(tanh.target(), TARGET(kX86));
}
TEST(tanh_x86, run_test) {
lite::Tensor x, out;
constexpr int batch_size = 1;
std::vector<int64_t> x_shape{batch_size, 3, 2, 2};
x.Resize(lite::DDim(x_shape));
std::vector<int64_t> out_shape{batch_size, 3, 2, 2};
out.Resize(lite::DDim(out_shape));
auto x_data = x.mutable_data<float>();
auto out_data = out.mutable_data<float>();
for (int64_t i = 0; i < x.dims().production(); i++) {
int sign = i % 2 == 0 ? 1 : -1;
x_data[i] = static_cast<float>(i * sign) * 0.08f;
}
// TanhCompute tanh;
TanhCompute<float> tanh;
operators::ActivationParam param;
param.X = &x;
param.Out = &out;
std::unique_ptr<KernelContext> ctx(new KernelContext);
ctx->As<X86Context>();
tanh.SetContext(std::move(ctx));
tanh.SetParam(param);
tanh.Run();
LOG(INFO) << "output: ";
std::vector<float> ref_data{0.,
-0.079829,
0.158648,
-0.235495,
0.309506,
-0.379949,
0.446243,
-0.507977,
0.564899,
-0.616909,
0.664036,
-0.706419};
for (int i = 0; i < out.dims().production(); i++) {
LOG(INFO) << out_data[i];
EXPECT_NEAR(out_data[i], ref_data[i], 1e-5);
}
}
} // namespace x86
} // namespace kernels
} // namespace lite
} // namespace paddle
USE_LITE_KERNEL(tanh, kX86, kFloat, kNCHW, def);
if(NOT LITE_WITH_XPU)
return ()
endif()
add_kernel(graph_compute_xpu XPU basic SRCS graph_compute.cc DEPS ${lite_kernel_deps} xpu_runtime)
# lite_cc_test(test_graph_compute_xpu SRCS graph_compute_test.cc DEPS graph_compute_xpu)
add_subdirectory(bridges)
lite_cc_library(xpu_bridge_registry SRCS registry.cc)
set(xpu_bridge_deps xpu_bridge_registry xpu_builder op)
lite_cc_library(xpu_bridge_act_op SRCS act_op.cc DEPS ${xpu_bridge_deps})
lite_cc_library(xpu_bridge_conv_op SRCS conv_op.cc DEPS ${xpu_bridge_deps})
lite_cc_library(xpu_bridge_elementwise_ops SRCS elementwise_ops.cc DEPS ${xpu_bridge_deps})
lite_cc_library(xpu_bridge_pool_op SRCS pool_op.cc DEPS ${xpu_bridge_deps})
lite_cc_library(xpu_bridge_softmax_op SRCS softmax_op.cc DEPS ${xpu_bridge_deps})
lite_cc_library(xpu_bridge_mul_op SRCS mul_op.cc DEPS ${xpu_bridge_deps})
set(xpu_bridges
xpu_bridge_registry
xpu_bridge_act_op
xpu_bridge_conv_op
xpu_bridge_elementwise_ops
xpu_bridge_pool_op
xpu_bridge_softmax_op
xpu_bridge_mul_op
CACHE INTERNAL "xpu_bridges")
set(xpu_bridge_test_deps ${xpu_bridges} ${xpu_kernels} ${ops})
lite_cc_test(test_xpu_bridge_act_op SRCS act_op_test.cc test_helper.cc DEPS ${xpu_bridge_test_deps})
lite_cc_test(test_xpu_bridge_conv_op SRCS conv_op_test.cc test_helper.cc DEPS ${xpu_bridge_test_deps})
lite_cc_test(test_xpu_bridge_elementwise_ops SRCS elementwise_ops_test.cc test_helper.cc DEPS ${xpu_bridge_test_deps})
lite_cc_test(test_xpu_bridge_pool_op SRCS pool_op_test.cc test_helper.cc DEPS ${xpu_bridge_test_deps})
lite_cc_test(test_xpu_bridge_softmax_op SRCS softmax_op_test.cc test_helper.cc DEPS ${xpu_bridge_test_deps})
lite_cc_test(test_xpu_bridge_mul_op SRCS mul_op_test.cc test_helper.cc DEPS ${xpu_bridge_test_deps})
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/backends/xpu/builder.h"
#include "lite/kernels/xpu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
namespace bridges {
node_map_type ActConverter(const std::shared_ptr<lite::OpLite> op,
graph_ctx_type* graph_ctx,
const node_map_type& input_nodes) {
auto op_info = op->op_info();
auto op_type = op_info->Type();
auto unique_op_type = lite::xpu::UniqueName(op_type);
LOG(INFO) << "[XPU] Converting " + op_type + "...";
// check context
CHECK(graph_ctx != nullptr);
CHECK(graph_ctx->builder != nullptr);
CHECK(graph_ctx->params != nullptr);
// create act node and set params from op
auto x_var_name = op_info->Input("X").front();
CHECK(input_nodes.count(x_var_name));
std::shared_ptr<xtcl::xExpr> act_node = nullptr;
if (op_type == "relu") {
act_node = std::make_shared<xtcl::xExpr>(
graph_ctx->builder->CreateRelu(*input_nodes.at(x_var_name)));
} else {
// TODO(hong19860320) support more activation ops
LOG(FATAL) << "[XPU] Unsupported activation type " << op_type;
}
graph_ctx->builder->SetLayer(unique_op_type);
// output converted nodes
node_map_type output_nodes;
output_nodes[op_info->Output("Out").front()] = act_node;
return output_nodes;
}
} // namespace bridges
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_XPU_BRIDGE(relu, paddle::lite::kernels::xpu::bridges::ActConverter);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gtest/gtest.h>
#include <random>
#include "lite/core/op_registry.h"
#include "lite/kernels/xpu/bridges/registry.h"
#include "lite/kernels/xpu/bridges/test_helper.h"
#include "lite/operators/activation_ops.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
namespace bridges {
void relu_ref(const std::shared_ptr<operators::ActivationOp> op) {
Scope* scope = op->scope();
const OpInfo* op_info = op->op_info();
auto x = scope->FindVar(op_info->Input("X").front())->GetMutable<Tensor>();
auto out =
scope->FindVar(op_info->Output("Out").front())->GetMutable<Tensor>();
auto x_data = x->data<float>();
auto out_data = out->mutable_data<float>();
DDim x_dims = x->dims();
DDim out_dims = out->dims();
CHECK_EQ(x_dims.production(), out_dims.production());
for (int i = 0; i < out_dims.production(); i++) {
out_data[i] = std::max(0.f, x_data[i]);
}
}
void test_relu(int bs, int ic, int ih, int iw) {
// prepare input&output variables
Scope scope;
std::string x_var_name("x");
std::string out_var_name("out");
std::string out_ref_var_name("out_ref");
auto* x = scope.Var(x_var_name)->GetMutable<Tensor>();
auto* out = scope.Var(out_var_name)->GetMutable<Tensor>();
auto* out_ref = scope.Var(out_ref_var_name)->GetMutable<Tensor>();
x->Resize({bs, ic, ih, iw});
// initialize input&output data
FillTensor<float, int>(x);
// initialize op desc
cpp::OpDesc opdesc;
opdesc.SetType("relu");
opdesc.SetInput("X", {x_var_name});
opdesc.SetOutput("Out", {out_var_name});
// create and convert op to XPU model, and run it on XPU
auto op = CreateOp<operators::ActivationOp>(opdesc, &scope);
LauchOp(op, {x_var_name}, {out_var_name});
out_ref->CopyDataFrom(*out);
// execute reference implementation and save to output tensor
relu_ref(op);
// compare results
auto* out_data = out->mutable_data<float>();
auto* out_ref_data = out_ref->mutable_data<float>();
for (int i = 0; i < out->dims().production(); i++) {
VLOG(5) << i;
EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-5);
}
}
TEST(XPUBridges, relu) {
for (auto bs : {1, 3}) {
for (auto ic : {3, 4}) {
for (auto ih : {2, 5}) {
for (auto iw : {5, 9}) {
VLOG(3) << "bs: " << bs << " ic: " << ic << " ih: " << ih
<< " iw: " << iw;
test_relu(bs, ic, ih, iw);
}
}
}
}
}
} // namespace bridges
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
USE_LITE_OP(relu);
USE_XPU_BRIDGE(relu);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/backends/xpu/builder.h"
#include "lite/kernels/xpu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
namespace bridges {
node_map_type ConvConverter(const std::shared_ptr<lite::OpLite> op,
graph_ctx_type* graph_ctx,
const node_map_type& input_nodes) {
auto scope = op->scope();
auto op_info = op->op_info();
auto op_type = op_info->Type();
auto unique_op_type = lite::xpu::UniqueName(op_type);
LOG(INFO) << "[XPU] Converting " << op_type << "... ";
// get input, filter and op attributes
auto input_var_name = op_info->Input("Input").front();
auto input = scope->FindVar(input_var_name)->GetMutable<lite::Tensor>();
auto input_dims = input->dims();
auto filter_var_name = op_info->Input("Filter").front();
auto filter = scope->FindVar(filter_var_name)->GetMutable<lite::Tensor>();
auto filter_dims = filter->dims();
auto bs = input_dims[0];
auto oc = filter_dims[0];
CHECK_EQ(input_dims.size(), 4);
CHECK_EQ(filter_dims.size(), 4);
auto strides = op_info->GetAttr<std::vector<int>>("strides");
auto paddings = op_info->GetAttr<std::vector<int>>("paddings");
auto groups = op_info->GetAttr<int>("groups");
auto dilations = op_info->GetAttr<std::vector<int>>("dilations");
auto fuse_relu = op_info->GetAttr<bool>("fuse_relu");
CHECK_EQ(strides.size(), 2);
CHECK_EQ(paddings.size(), 2);
CHECK_EQ(dilations.size(), 2);
std::vector<int64_t> output_shape({bs, oc});
for (size_t i = 0; i < 2; i++) {
const int dkernel = dilations[i] * (filter_dims[2 + i] - 1) + 1;
output_shape.push_back(
(input_dims[i + 2] + 2 * paddings[i] - dkernel) / strides[i] + 1);
}
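// For example, with a 3x3 filter, padding 1, stride 1 and dilation 1 on a
// 4x4 input (as in the tests below), dkernel = 1 * (3 - 1) + 1 = 3 and the
// spatial output size is (4 + 2 * 1 - 3) / 1 + 1 = 4.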
DDim output_dims(output_shape);
// check context
CHECK(graph_ctx != nullptr);
CHECK(graph_ctx->builder != nullptr);
CHECK(graph_ctx->params != nullptr);
// create filter node
CHECK(!input_nodes.count(filter_var_name));
auto filter_const_node = std::make_shared<xtcl::xExpr>(
graph_ctx->builder->CreateTensor(filter_var_name,
lite::xpu::CvtShape(filter_dims),
::xtcl::Float(32)));
auto filter_const_tensor = lite::xpu::CvtTensor(filter);
graph_ctx->params->emplace(
std::make_pair(filter_var_name, *filter_const_tensor));
// create conv node and set input, filter, bias nodes and attributes
auto conv_attrs = xtcl::make_node<xtcl::network::Conv2DAttrs>();
conv_attrs->strides = std::move(lite::xpu::CvtShape(strides));
conv_attrs->padding = std::move(lite::xpu::CvtShape(paddings));
conv_attrs->dilation = std::move(lite::xpu::CvtShape(dilations));
conv_attrs->groups = groups;
// conv_attrs->channels = nullptr;
conv_attrs->kernel_size = std::move(xtcl::Array<xtcl::xIndexExpr>(nullptr));
conv_attrs->data_layout = "NCHW";
conv_attrs->kernel_layout = "OIHW";
conv_attrs->out_layout = "";
// conv_attrs->out_dtype = "";
CHECK(input_nodes.count(input_var_name));
auto conv_node =
std::make_shared<xtcl::xExpr>(graph_ctx->builder->CreateConv2D(
*input_nodes.at(input_var_name), *filter_const_node, conv_attrs));
graph_ctx->builder->SetLayer(unique_op_type);
// create bias node if has bias
// supports the bias nodes with the following dimensions
// 0: {oc}
// 1: {1, oc, oh, ow}
// 2: {n, oc, oh, ow}
if (lite::xpu::HasInputArg(op_info, scope, "Bias")) {
auto bias_var_name = op_info->Input("Bias").front();
auto* bias = scope->FindVar(bias_var_name)->GetMutable<lite::Tensor>();
auto bias_dims = bias->dims();
auto bias_data_size = bias_dims.production();
auto output_data_size = output_dims.production();
std::vector<int64_t> bias_shape;
bool is_channel_bias = false;
if (bias_data_size == oc) {
// 0: {oc}
bias_shape = {oc};
is_channel_bias = true;
} else if (bias_data_size == output_data_size / bs) {
// 1: {1, oc, oh, ow}
bias_shape = {1, output_dims[1], output_dims[2], output_dims[3]};
} else if (bias_data_size == output_data_size) {
// 2: {n, oc, oh, ow}
bias_shape = output_dims.Vectorize();
} else {
LOG(ERROR) << "bias dimension " << bias_dims
<< " isn't supported in conv2d Op when output dimension is "
<< output_dims;
}
std::shared_ptr<xtcl::xExpr> bias_node = nullptr;
if (input_nodes.count(bias_var_name)) {
// bias node from input node
bias_node = input_nodes.at(bias_var_name);
} else {
// bias node with const tensor
auto bias_const_node = std::make_shared<xtcl::xExpr>(
graph_ctx->builder->CreateTensor(bias_var_name,
lite::xpu::CvtShape(bias_shape),
::xtcl::Float(32)));
auto bias_const_tensor = lite::xpu::CvtTensor(bias, bias_shape);
graph_ctx->params->emplace(
std::make_pair(bias_var_name, *bias_const_tensor));
bias_node = bias_const_node;
}
std::shared_ptr<xtcl::xExpr> add_node = nullptr;
if (is_channel_bias) {
add_node = std::make_shared<xtcl::xExpr>(
graph_ctx->builder->CreateBiasAdd(*conv_node, *bias_node, 1));
} else {
add_node = std::make_shared<xtcl::xExpr>(
graph_ctx->builder->CreateBinaryOp("add", *conv_node, *bias_node));
}
graph_ctx->builder->SetLayer(unique_op_type + "/add");
conv_node = add_node;
}
// output converted nodes
node_map_type output_nodes;
if (fuse_relu) {
// append relu node if fuse_relu is true
auto relu_node = std::make_shared<xtcl::xExpr>(
graph_ctx->builder->CreateRelu(*conv_node));
graph_ctx->builder->SetLayer(unique_op_type + "/relu");
output_nodes[op_info->Output("Output").front()] = relu_node;
} else {
output_nodes[op_info->Output("Output").front()] = conv_node;
}
return output_nodes;
}
} // namespace bridges
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_XPU_BRIDGE(conv2d, paddle::lite::kernels::xpu::bridges::ConvConverter);
REGISTER_XPU_BRIDGE(depthwise_conv2d,
paddle::lite::kernels::xpu::bridges::ConvConverter);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/operators/conv_op.h"
#include <gtest/gtest.h>
#include <random>
#include "lite/core/op_registry.h"
#include "lite/kernels/xpu/bridges/registry.h"
#include "lite/kernels/xpu/bridges/test_helper.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
namespace bridges {
void conv_ref(const std::shared_ptr<operators::ConvOpLite> op) {
Scope* scope = op->scope();
const OpInfo* op_info = op->op_info();
auto input =
scope->FindVar(op_info->Input("Input").front())->GetMutable<Tensor>();
auto filter =
scope->FindVar(op_info->Input("Filter").front())->GetMutable<Tensor>();
auto output =
scope->FindVar(op_info->Output("Output").front())->GetMutable<Tensor>();
std::vector<int32_t> strides =
op_info->GetAttr<std::vector<int32_t>>("strides");
std::vector<int32_t> paddings =
op_info->GetAttr<std::vector<int32_t>>("paddings");
int32_t groups = op_info->GetAttr<int32_t>("groups");
std::vector<int32_t> dilations =
op_info->GetAttr<std::vector<int32_t>>("dilations");
bool fuse_relu = op_info->GetAttr<bool>("fuse_relu");
auto input_dims = input->dims();
auto filter_dims = filter->dims();
auto output_dims = output->dims();
auto input_data = input->mutable_data<float>();
auto filter_data = filter->mutable_data<float>();
auto output_data = output->mutable_data<float>();
int kernel_w = filter_dims[3];
int kernel_h = filter_dims[2];
int stride_w = strides[1];
int stride_h = strides[0];
int dila_w = dilations[1];
int dila_h = dilations[0];
int pad_w = paddings[1];
int pad_h = paddings[0];
int batch_size = input_dims[0];
int in_ch_size = input_dims[1];
int in_h = input_dims[2];
int in_w = input_dims[3];
int out_ch_size = output_dims[1];
int out_h = output_dims[2];
int out_w = output_dims[3];
int out_c_group = out_ch_size / groups;
int in_c_group = in_ch_size / groups;
Tensor* bias = nullptr;
float* bias_data = nullptr;
bool is_channel_bias = false;
if (op_info->HasInput("Bias")) {
auto bias_var_names = op_info->Input("Bias");
if (bias_var_names.size() > 0) {
auto bias_var_name = bias_var_names.front();
bias = scope->FindVar(bias_var_name)->GetMutable<lite::Tensor>();
auto bias_dims = bias->dims();
is_channel_bias = bias_dims.production() == out_ch_size;
bias_data = bias->mutable_data<float>();
}
}
for (int n = 0; n < batch_size; ++n) {
for (int g = 0; g < groups; ++g) {
for (int oc = 0; oc < out_c_group; ++oc) {
for (int oh = 0; oh < out_h; ++oh) {
for (int ow = 0; ow < out_w; ++ow) {
int out_idx = n * groups * out_c_group * out_h * out_w +
g * out_c_group * out_h * out_w + oc * out_h * out_w +
oh * out_w + ow;
float out_value =
bias_data != nullptr
? (is_channel_bias ? bias_data[g * out_c_group + oc]
: bias_data[out_idx])
: 0;
// + out_value *= beta;
for (int ic = 0; ic < in_c_group; ++ic) {
for (int kh = 0; kh < kernel_h; ++kh) {
for (int kw = 0; kw < kernel_w; ++kw) {
int iw = ow * stride_w - pad_w + kw * (dila_w);
int ih = oh * stride_h - pad_h + kh * (dila_h);
if (iw < 0 || iw >= in_w) continue;
if (ih < 0 || ih >= in_h) continue;
int in_idx = n * in_ch_size * in_h * in_w +
g * in_c_group * in_h * in_w + ic * in_h * in_w +
ih * in_w + iw;
int filter_idx =
g * out_c_group * in_c_group * kernel_h * kernel_w +
oc * in_c_group * kernel_h * kernel_w +
ic * kernel_h * kernel_w + kh * kernel_w + kw;
out_value += input_data[in_idx] * filter_data[filter_idx];
}
}
}
if (fuse_relu) {
out_value = out_value > 0 ? out_value : 0;
}
output_data[out_idx] = out_value;
}
}
}
}
}
}
void test_conv(int bs,
int ic,
int oc,
int ih,
int iw,
bool has_bias,
bool is_channel_bias,
bool fuse_relu,
bool depthwise,
int dilation,
int stride,
int padding,
int kernel) {
// prepare input&output variables
Scope scope;
std::string input_var_name("input");
std::string filter_var_name("filter");
std::string bias_var_name("bias");
std::string output_var_name("output");
std::string output_ref_var_name("output_ref");
auto* input = scope.Var(input_var_name)->GetMutable<Tensor>();
auto* filter = scope.Var(filter_var_name)->GetMutable<Tensor>();
auto* bias = scope.Var(bias_var_name)->GetMutable<Tensor>();
auto* output = scope.Var(output_var_name)->GetMutable<Tensor>();
auto* output_ref = scope.Var(output_ref_var_name)->GetMutable<Tensor>();
// get group size and input&filter shape
int groups = 1;
if (depthwise) { // depthwise convolution ?
groups = oc = ic;
}
std::vector<int64_t> input_shape = {bs, ic, ih, iw};
std::vector<int64_t> filter_shape = {oc, ic / groups, kernel, kernel};
std::vector<int64_t> output_shape({bs, oc});
for (size_t i = 0; i < 2; i++) {
const int dkernel = dilation * (kernel - 1) + 1;
int output_size = (input_shape[i + 2] + 2 * padding - dkernel) / stride + 1;
output_shape.push_back(output_size);
}
input->Resize(input_shape);
filter->Resize(filter_shape);
// initialize input&output data
FillTensor<float, int>(input);
FillTensor<float, int>(filter);
// initialize op desc
cpp::OpDesc opdesc;
opdesc.SetType(depthwise ? "depthwise_conv2d" : "conv2d");
opdesc.SetInput("Input", {input_var_name});
opdesc.SetInput("Filter", {filter_var_name});
opdesc.SetOutput("Output", {output_var_name});
opdesc.SetAttr("dilations", std::vector<int32_t>({dilation, dilation}));
opdesc.SetAttr("strides", std::vector<int32_t>({stride, stride}));
opdesc.SetAttr("paddings", std::vector<int32_t>({padding, padding}));
opdesc.SetAttr("groups", groups);
opdesc.SetAttr("fuse_relu", static_cast<bool>(fuse_relu));
if (has_bias) {
if (is_channel_bias) {
bias->Resize({1, oc, 1, 1});
} else {
bias->Resize({1, output_shape[1], output_shape[2], output_shape[3]});
}
FillTensor<float, int>(bias);
opdesc.SetInput("Bias", {bias_var_name});
}
// create and convert op to XPU model, then run it on XPU
auto op = CreateOp<operators::ConvOpLite>(opdesc, &scope);
LauchOp(op, {input_var_name}, {output_var_name});
output_ref->CopyDataFrom(*output);
// execute reference implementation and save to output tensor('out')
conv_ref(op);
// compare results
auto* output_data = output->mutable_data<float>();
auto* output_ref_data = output_ref->mutable_data<float>();
for (int i = 0; i < output->dims().production(); i++) {
VLOG(5) << i;
EXPECT_NEAR(output_data[i], output_ref_data[i], 1e-5);
}
}
TEST(XPUBridges, conv) {
#if 0
for (auto bs : {1, 2}) {
for (auto ic : {3, 6}) {
for (auto oc : {6, 9}) {
for (auto ih : {14, 28}) {
for (auto iw : {14, 28}) {
for (auto has_bias : {false, true}) {
for (auto is_channel_bias : {false, true}) {
for (auto fuse_relu : {false, true}) {
for (auto depthwise : {false, true}) {
for (auto dilation : {1, 2}) {
for (auto stride : {1, 2}) {
for (auto kernel : {1, 3, 5}) {
std::vector<int> paddings = {kernel / 2};
if (kernel / 2 != 0) {
paddings.push_back(0);
}
for (auto padding : paddings) {
VLOG(3) << "bs: " << bs << " ic: " << ic
<< " oc: " << oc << " ih: " << ih
<< " iw: " << iw
<< " has_bias: " << has_bias
<< " is_channel_bias: " << is_channel_bias
<< " fuse_relu: " << fuse_relu
<< " depthwise: " << depthwise
<< " dilation: " << dilation
<< " stride: " << stride
<< " padding: " << padding
<< " kernel: " << kernel;
test_conv(bs,
ic,
oc,
ih,
iw,
has_bias,
is_channel_bias,
fuse_relu,
depthwise,
dilation,
stride,
padding,
kernel);
}
}
}
}
}
}
}
}
}
}
}
}
}
#else
test_conv(1, 1, 1, 4, 4, false, false, false, false, 1, 1, 1, 3);
test_conv(1, 1, 1, 4, 4, true, true, false, false, 1, 1, 1, 3);
test_conv(1, 1, 1, 4, 4, true, false, false, false, 1, 1, 1, 3);
#endif
}
} // namespace bridges
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
USE_LITE_OP(conv2d);
USE_XPU_BRIDGE(conv2d);
USE_LITE_OP(depthwise_conv2d);
USE_XPU_BRIDGE(depthwise_conv2d);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/backends/xpu/builder.h"
#include "lite/kernels/xpu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
namespace bridges {
node_map_type ElementwiseConverter(const std::shared_ptr<lite::OpLite> op,
graph_ctx_type* graph_ctx,
const node_map_type& input_nodes) {
auto scope = op->scope();
auto op_info = op->op_info();
auto op_type = op_info->Type();
auto unique_op_type = lite::xpu::UniqueName(op_type);
LOG(INFO) << "[XPU] Converting " + op_type + "...";
// check context
CHECK(graph_ctx != nullptr);
CHECK(graph_ctx->builder != nullptr);
CHECK(graph_ctx->params != nullptr);
// get input, and attributes
auto x_var_name = op_info->Input("X").front();
auto y_var_name = op_info->Input("Y").front();
auto axis = op_info->GetAttr<int>("axis");
auto x_tensor = scope->FindMutableTensor(x_var_name);
auto y_tensor = scope->FindMutableTensor(y_var_name);
auto x_dims = x_tensor->dims();
auto y_dims = y_tensor->dims();
// create x and y node
std::shared_ptr<xtcl::xExpr> x_node = nullptr;
if (input_nodes.count(x_var_name)) {
x_node = input_nodes.at(x_var_name);
} else {
x_node = std::make_shared<xtcl::xExpr>(graph_ctx->builder->CreateTensor(
x_var_name, lite::xpu::CvtShape(x_dims), ::xtcl::Float(32)));
auto x_const_tensor = lite::xpu::CvtTensor(x_tensor);
graph_ctx->params->emplace(std::make_pair(x_var_name, *x_const_tensor));
}
std::shared_ptr<xtcl::xExpr> y_node = nullptr;
if (input_nodes.count(y_var_name)) {
y_node = input_nodes.at(y_var_name);
} else {
y_node = std::make_shared<xtcl::xExpr>(graph_ctx->builder->CreateTensor(
y_var_name, lite::xpu::CvtShape(y_dims), ::xtcl::Float(32)));
auto y_const_tensor = lite::xpu::CvtTensor(y_tensor);
graph_ctx->params->emplace(std::make_pair(y_var_name, *y_const_tensor));
}
// create elementwise node and set input, attributes
std::shared_ptr<xtcl::xExpr> elementwise_node = nullptr;
if (y_dims.size() == 1) {
elementwise_node = std::make_shared<xtcl::xExpr>(
graph_ctx->builder->CreateBiasAdd(*x_node, *y_node, axis));
} else if (x_dims.size() == y_dims.size()) {
elementwise_node = std::make_shared<xtcl::xExpr>(
graph_ctx->builder->CreateBinaryOp("add", *x_node, *y_node));
} else {
LOG(ERROR) << "XPU elementwise_add only support y of one dimension, or x "
"and y of the same dimension. But recieved x's dimension: "
<< x_dims << ", y's dimension: " << y_dims << ", axis: " << axis;
}
graph_ctx->builder->SetLayer(unique_op_type);
// output converted nodes
node_map_type output_nodes;
output_nodes[op_info->Output("Out").front()] = elementwise_node;
return output_nodes;
}
} // namespace bridges
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_XPU_BRIDGE(elementwise_add,
paddle::lite::kernels::xpu::bridges::ElementwiseConverter);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/operators/elementwise_ops.h"
#include <gtest/gtest.h>
#include <random>
#include "lite/core/op_registry.h"
#include "lite/kernels/xpu/bridges/registry.h"
#include "lite/kernels/xpu/bridges/test_helper.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
namespace bridges {
template <typename dtype>
void elementwise_add_ref(const std::shared_ptr<operators::ElementwiseOp> op) {
Scope* scope = op->scope();
const OpInfo* op_info = op->op_info();
auto x = scope->FindVar(op_info->Input("X").front())->GetMutable<Tensor>();
auto y = scope->FindVar(op_info->Input("Y").front())->GetMutable<Tensor>();
auto out =
scope->FindVar(op_info->Output("Out").front())->GetMutable<Tensor>();
auto x_data = x->data<dtype>();
auto y_data = y->data<dtype>();
dtype* out_data = out->mutable_data<dtype>();
auto x_dims = x->dims();
auto y_dims = y->dims();
int axis = op_info->GetAttr<int>("axis");
if (axis < 0) {
axis = x_dims.size() - y_dims.size();
}
int batch = 1;
int channels = 1;
int num = 1;
for (int i = 0; i < axis; ++i) {
batch *= x_dims[i];
}
for (int i = 0; i < y_dims.size(); ++i) {
channels *= y_dims[i];
}
for (int i = y_dims.size() + axis; i < x_dims.size(); ++i) {
num *= x_dims[i];
}
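// As a worked example, x dims {2, 2, 3, 4}, y dims {3} and axis 2 give
// batch = 2 * 2 = 4, channels = 3 and num = 4, so each y value is broadcast
// over 4 contiguous output elements.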
// do elementwise add/sub/max...
std::string elt_type = "add";
if (elt_type == "add") {
for (int i = 0; i < batch; ++i) {
for (int j = 0; j < channels; ++j) {
int offset = (i * channels + j) * num;
const dtype* din_ptr = x_data + offset;
const dtype diny_data = y_data[j];
dtype* dout_ptr = out_data + offset;
for (int k = 0; k < num; ++k) {
*dout_ptr = *din_ptr + diny_data;
dout_ptr++;
din_ptr++;
}
}
}
} else if (elt_type == "sub") {
for (int i = 0; i < batch; ++i) {
for (int j = 0; j < channels; ++j) {
int offset = (i * channels + j) * num;
const dtype* din_ptr = x_data + offset;
const dtype diny_data = y_data[j];
dtype* dout_ptr = out_data + offset;
for (int k = 0; k < num; ++k) {
*dout_ptr = *din_ptr - diny_data;
dout_ptr++;
din_ptr++;
}
}
}
} else if (elt_type == "mul") {
for (int i = 0; i < batch; ++i) {
for (int j = 0; j < channels; ++j) {
int offset = (i * channels + j) * num;
const dtype* din_ptr = x_data + offset;
const dtype diny_data = y_data[j];
dtype* dout_ptr = out_data + offset;
for (int k = 0; k < num; ++k) {
*dout_ptr = *din_ptr * diny_data;
dout_ptr++;
din_ptr++;
}
}
}
} else if (elt_type == "max") {
for (int i = 0; i < batch; ++i) {
for (int j = 0; j < channels; ++j) {
int offset = (i * channels + j) * num;
const dtype* din_ptr = x_data + offset;
const dtype diny_data = y_data[j];
dtype* dout_ptr = out_data + offset;
for (int k = 0; k < num; ++k) {
*dout_ptr = std::max(*din_ptr, diny_data);
dout_ptr++;
din_ptr++;
}
}
}
} else {
LOG(FATAL) << "unsupported Elementwise type: " << elt_type;
}
}
void test_elementwise_add(std::vector<int64_t> x_dims,
std::vector<int64_t> y_dims,
int axis) {
// prepare input&output variables
Scope scope;
std::string x_var_name = "x";
std::string y_var_name = "y";
std::string out_var_name = "out";
std::string out_ref_var_name = "out_ref";
auto* x = scope.Var(x_var_name)->GetMutable<Tensor>();
auto* y = scope.Var(y_var_name)->GetMutable<Tensor>();
auto* out = scope.Var(out_var_name)->GetMutable<Tensor>();
auto* out_ref = scope.Var(out_ref_var_name)->GetMutable<Tensor>();
x->Resize(x_dims);
if (y_dims.size() == 0) {
y->Resize(x_dims);
} else {
y->Resize(y_dims);
}
// initialize input&output data
FillTensor<float>(x);
FillTensor<float>(y);
// initialize op desc
cpp::OpDesc opdesc;
opdesc.SetType("elementwise_add");
opdesc.SetInput("X", {x_var_name});
opdesc.SetInput("Y", {y_var_name});
opdesc.SetOutput("Out", {out_var_name});
opdesc.SetAttr("axis", axis);
// create and convert op to XPU model, then run it on XPU
auto op = CreateOp<operators::ElementwiseOp>(opdesc, &scope);
LauchOp(op, {x_var_name, y_var_name}, {out_var_name});
out_ref->CopyDataFrom(*out);
// execute reference implementation and save to output tensor
elementwise_add_ref<float>(op);
// compare results
auto* out_data = out->mutable_data<float>();
auto* out_ref_data = out_ref->mutable_data<float>();
for (int i = 0; i < out->dims().production(); i++) {
EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-5);
}
}
// xpu's bias_add only supports y with one dimension
TEST(XPUBridges, elementwise_add) {
test_elementwise_add({1, 2, 3, 4}, {1}, 0);
test_elementwise_add({1, 2, 3, 4}, {2}, 1);
test_elementwise_add({2, 2, 3, 4}, {3}, 2);
test_elementwise_add({2, 2, 3, 4}, {4}, 3);
test_elementwise_add({2, 2, 3, 4}, {4}, -1);
test_elementwise_add({2, 2, 3, 4}, {}, -1);
}
} // namespace bridges
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
USE_LITE_OP(elementwise_add);
USE_XPU_BRIDGE(elementwise_add);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/backends/xpu/builder.h"
#include "lite/kernels/xpu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
namespace bridges {
node_map_type MulConverter(const std::shared_ptr<lite::OpLite> op,
graph_ctx_type* graph_ctx,
const node_map_type& input_nodes) {
auto scope = op->scope();
auto op_info = op->op_info();
auto op_type = op_info->Type();
auto unique_op_type = lite::xpu::UniqueName(op_type);
LOG(INFO) << "[XPU] Converting " + op_type + "...";
// check context
CHECK(graph_ctx != nullptr);
CHECK(graph_ctx->builder != nullptr);
CHECK(graph_ctx->params != nullptr);
// get input, and attributes
auto x_var_name = op_info->Input("X").front();
auto y_var_name = op_info->Input("Y").front();
auto y_tensor = scope->FindMutableTensor(y_var_name);
auto y_dims = y_tensor->dims();
CHECK_EQ(y_dims.size(), 2) << "xpu now only supports y_dims.size() == 2";
auto x_num_col_dims = op_info->GetAttr<int>("x_num_col_dims");
CHECK_EQ(x_num_col_dims, 1) << "xpu now only supports x_num_col_dims == 1";
auto y_num_col_dims = op_info->GetAttr<int>("y_num_col_dims");
CHECK_EQ(y_num_col_dims, 1) << "xpu now only supports y_num_col_dims == 1";
// create x node
std::shared_ptr<xtcl::xExpr> x_node = nullptr;
x_node = std::make_shared<xtcl::xExpr>(
graph_ctx->builder->CreateBatchFlatten(*input_nodes.at(x_var_name)));
graph_ctx->builder->SetLayer(unique_op_type + "/X");
// transpose y
DDimLite y_dims_t(std::vector<int64_t>{1, 1});
y_dims_t[0] = y_dims[1];
y_dims_t[1] = y_dims[0];
auto y_var_name_t = unique_op_type + "/Y";
Tensor* y_tensor_t = new Tensor();
y_tensor_t->Resize(y_dims_t);
auto y_data_t = y_tensor_t->mutable_data<float>();
auto y_data = y_tensor->mutable_data<float>();
for (int i = 0; i < y_dims_t[0]; i++) {
for (int j = 0; j < y_dims_t[1]; j++) {
y_data_t[i * y_dims_t[1] + j] = y_data[j * y_dims_t[0] + i];
}
}
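// Note: y (laid out as {K, N} in the mul op) is repacked into a {N, K}
// constant here, which appears to be the weight layout expected by the
// CreateDense call below; e.g. a {24, 2} weight becomes a {2, 24} tensor
// registered under "<unique_op_type>/Y".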
// create y node
std::shared_ptr<xtcl::xExpr> y_const_node = nullptr;
y_const_node = std::make_shared<xtcl::xExpr>(graph_ctx->builder->CreateTensor(
y_var_name_t, lite::xpu::CvtShape(y_dims_t), ::xtcl::Float(32)));
auto y_const_tensor = lite::xpu::CvtTensor(y_tensor_t);
graph_ctx->params->emplace(std::make_pair(y_var_name_t, *y_const_tensor));
delete y_tensor_t;
// create mul node and set params from op
std::shared_ptr<xtcl::xExpr> mul_node = nullptr;
mul_node = std::make_shared<xtcl::xExpr>(graph_ctx->builder->CreateDense(
*x_node, *y_const_node, static_cast<int>(y_dims[1])));
graph_ctx->builder->SetLayer(unique_op_type);
// output converted nodes
node_map_type output_nodes;
output_nodes[op_info->Output("Out").front()] = mul_node;
return output_nodes;
}
} // namespace bridges
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_XPU_BRIDGE(mul, paddle::lite::kernels::xpu::bridges::MulConverter);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/operators/mul_op.h"
#include <gtest/gtest.h>
#include "lite/core/op_registry.h"
#include "lite/kernels/xpu/bridges/registry.h"
#include "lite/kernels/xpu/bridges/test_helper.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
namespace bridges {
void mul_ref(const std::shared_ptr<operators::MulOpLite> op) {
Scope* scope = op->scope();
const OpInfo* op_info = op->op_info();
auto x = scope->FindVar(op_info->Input("X").front())->GetMutable<Tensor>();
auto y = scope->FindVar(op_info->Input("Y").front())->GetMutable<Tensor>();
auto out =
scope->FindVar(op_info->Output("Out").front())->GetMutable<Tensor>();
int32_t x_num_col_dims = op_info->GetAttr<int32_t>("x_num_col_dims");
int32_t y_num_col_dims = op_info->GetAttr<int32_t>("y_num_col_dims");
auto x_data = x->mutable_data<float>();
auto y_data = y->mutable_data<float>();
auto out_data = out->mutable_data<float>();
auto x_mat_dims = x->dims().Flatten2D(x_num_col_dims);
auto y_mat_dims = y->dims().Flatten2D(y_num_col_dims);
CHECK_EQ(x_mat_dims[1], y_mat_dims[0]);
const int M = x_mat_dims[0];
const int K = x_mat_dims[1];
const int N = y_mat_dims[1];
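// For example, x dims {2, 2, 3, 4} with x_num_col_dims = 1 flatten to a
// {2, 24} matrix and y dims {24, 2} with y_num_col_dims = 1 stay {24, 2},
// giving M = 2, K = 24, N = 2 as in the test_mul cases below.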
for (int m = 0; m < M; ++m) {
for (int n = 0; n < N; ++n) {
out_data[m * N + n] = 0;
for (int k = 0; k < K; ++k) {
out_data[m * N + n] += x_data[m * K + k] * y_data[k * N + n];
}
}
}
}
void test_mul(const std::vector<int64_t>& x_shape,
const std::vector<int64_t>& y_shape,
int x_num_col_dims,
int y_num_col_dims) {
Scope scope;
std::string x_var_name("X");
std::string y_var_name("Y");
std::string out_var_name("Out");
std::string out_ref_var_name("out_ref");
auto* x = scope.Var(x_var_name)->GetMutable<Tensor>();
auto* y = scope.Var(y_var_name)->GetMutable<Tensor>();
auto* out = scope.Var(out_var_name)->GetMutable<Tensor>();
auto* out_ref = scope.Var(out_ref_var_name)->GetMutable<Tensor>();
x->Resize(x_shape);
y->Resize(y_shape);
FillTensor<float>(x);
FillTensor<float>(y);
// create mul op
cpp::OpDesc mul_op_desc;
mul_op_desc.SetType("mul");
mul_op_desc.SetInput("X", {x_var_name});
mul_op_desc.SetInput("Y", {y_var_name});
mul_op_desc.SetOutput("Out", {out_var_name});
mul_op_desc.SetAttr("x_num_col_dims", static_cast<int>(x_num_col_dims));
mul_op_desc.SetAttr("y_num_col_dims", static_cast<int>(y_num_col_dims));
auto mul_op = CreateOp<operators::MulOpLite>(mul_op_desc, &scope);
LauchOp(mul_op, {x_var_name}, {out_var_name});
out_ref->CopyDataFrom(*out);
mul_ref(mul_op);
// compare results
auto* out_data = out->mutable_data<float>();
auto* out_ref_data = out_ref->mutable_data<float>();
for (int i = 0; i < out->dims().production(); i++) {
EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-5);
}
}
TEST(XPUBridges, mul) {
test_mul({1, 2, 3, 4}, {24, 2}, 1, 1);
test_mul({2, 2, 3, 4}, {24, 2}, 1, 1);
test_mul({2, 7}, {7, 3}, 1, 1);
// test_mul({1, 8, 8, 1}, {1, 8, 2, 2}, 2, 2);
// test_mul({1, 5, 5, 1}, {1, 5, 7, 7}, 2, 2);
// test_mul({1, 4, 1, 1}, {4, 8}, 1, 1);
}
} // namespace bridges
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
USE_LITE_OP(mul);
USE_XPU_BRIDGE(mul);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "lite/kernels/xpu/bridges/registry.h"
USE_XPU_BRIDGE(relu);
USE_XPU_BRIDGE(conv2d);
USE_XPU_BRIDGE(depthwise_conv2d);
USE_XPU_BRIDGE(elementwise_add);
USE_XPU_BRIDGE(pool2d);
USE_XPU_BRIDGE(softmax);
USE_XPU_BRIDGE(mul);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/backends/xpu/builder.h"
#include "lite/kernels/xpu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
namespace bridges {
node_map_type PoolConverter(const std::shared_ptr<lite::OpLite> op,
graph_ctx_type* graph_ctx,
const node_map_type& input_nodes) {
auto op_info = op->op_info();
auto op_type = op_info->Type();
auto unique_op_type = lite::xpu::UniqueName(op_type);
LOG(INFO) << "[XPU] Converting " + op_type + "...";
// check context
CHECK(graph_ctx != nullptr);
CHECK(graph_ctx->builder != nullptr);
CHECK(graph_ctx->params != nullptr);
// get input, and attributes
auto x_var_name = op_info->Input("X").front();
auto pooling_type = op_info->GetAttr<std::string>("pooling_type");
auto ceil_mode = op_info->GetAttr<bool>("ceil_mode");
auto paddings = op_info->GetAttr<std::vector<int>>("paddings");
auto global_pooling = op_info->GetAttr<bool>("global_pooling");
auto ksize = op_info->GetAttr<std::vector<int>>("ksize");
auto strides = op_info->GetAttr<std::vector<int>>("strides");
auto exclusive = op_info->GetAttr<bool>("exclusive");
// create pool node and set params from op
CHECK(input_nodes.count(x_var_name));
std::shared_ptr<xtcl::xExpr> pool_node = nullptr;
if (pooling_type == "max") {
if (global_pooling) {
pool_node = std::make_shared<xtcl::xExpr>(
graph_ctx->builder->CreateGlobalMaxPool2D(
*input_nodes.at(x_var_name)));
} else {
pool_node = std::make_shared<xtcl::xExpr>(
graph_ctx->builder->CreateMaxPool2D(*input_nodes.at(x_var_name),
lite::xpu::CvtShape(ksize),
lite::xpu::CvtShape(strides),
lite::xpu::CvtShape(paddings),
"NCHW",
ceil_mode));
}
} else if (pooling_type == "avg") {
if (global_pooling) {
pool_node = std::make_shared<xtcl::xExpr>(
graph_ctx->builder->CreateGlobalAvgPool2D(
*input_nodes.at(x_var_name)));
} else {
pool_node = std::make_shared<xtcl::xExpr>(
// !exclusive ---> count_include_pad
graph_ctx->builder->CreateAvgPool2D(*input_nodes.at(x_var_name),
lite::xpu::CvtShape(ksize),
lite::xpu::CvtShape(strides),
lite::xpu::CvtShape(paddings),
"NCHW",
ceil_mode,
!exclusive));
}
} else {
LOG(FATAL) << "Unsupported pooling type: " << pooling_type;
}
graph_ctx->builder->SetLayer(unique_op_type);
// output converted nodes
node_map_type output_nodes;
output_nodes[op_info->Output("Out").front()] = pool_node;
return output_nodes;
}
} // namespace bridges
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_XPU_BRIDGE(pool2d, paddle::lite::kernels::xpu::bridges::PoolConverter);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/operators/pool_op.h"
#include <gtest/gtest.h>
#include "lite/core/op_registry.h"
#include "lite/kernels/xpu/bridges/registry.h"
#include "lite/kernels/xpu/bridges/test_helper.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
namespace bridges {
void pool_ref(const std::shared_ptr<operators::PoolOpLite> op) {
Scope* scope = op->scope();
const OpInfo* op_info = op->op_info();
auto x = scope->FindVar(op_info->Input("X").front())->GetMutable<Tensor>();
auto out =
scope->FindVar(op_info->Output("Out").front())->GetMutable<Tensor>();
auto& in_dims = x->dims();
auto& out_dims = out->dims();
const float* src_ptr = x->data<const float>();
float* dst_ptr = out->mutable_data<float>();
std::vector<int> ksize = op_info->GetAttr<std::vector<int>>("ksize");
std::vector<int> strides = op_info->GetAttr<std::vector<int>>("strides");
std::vector<int> paddings = op_info->GetAttr<std::vector<int>>("paddings");
bool exclusive = op_info->GetAttr<bool>("exclusive");
std::string pooling_type = op_info->GetAttr<std::string>("pooling_type");
bool global_pooling = op_info->GetAttr<bool>("global_pooling");
int in_n = in_dims[0];
int in_c = in_dims[1];
int in_h = in_dims[2];
int in_w = in_dims[3];
int size_in_n = in_c * in_h * in_w;
int size_in_c = in_h * in_w;
int out_h = out_dims[2];
int out_w = out_dims[3];
int size_out_n = in_c * out_h * out_w;
int size_out_c = out_h * out_w;
int window_h = ksize[0];
int window_w = ksize[1];
int stride_h = strides[0];
int stride_w = strides[1];
int pad_h = paddings[0];
int pad_w = paddings[1];
if (global_pooling == true) {
for (int n = 0; n < in_n; ++n) {
for (int c = 0; c < in_c; ++c) {
const float* src = src_ptr + n * size_in_n + c * size_in_c;
float res = src[0];
if (pooling_type == "max") {
for (int i = 1; i < size_in_c; ++i) {
float cur_val = src[i];
res = cur_val > res ? cur_val : res;
}
} else if (pooling_type == "avg") {
for (int i = 1; i < size_in_c; ++i) {
float cur_val = src[i];
res += cur_val;
}
res /= size_in_c;
}
dst_ptr[n * size_out_n + c] = res;
}
}
} else {
for (int n = 0; n < in_n; ++n) {
for (int c = 0; c < in_c; ++c) {
for (int h = 0; h < out_h; ++h) {
int sh = h * stride_h;
int eh = sh + window_h;
sh = (sh - pad_h) < 0 ? 0 : sh - pad_h;
eh = (eh - pad_h) > in_h ? in_h : eh - pad_h;
for (int w = 0; w < out_w; ++w) {
int sw = w * stride_w;
int ew = sw + window_w;
sw = (sw - pad_w) < 0 ? 0 : sw - pad_w;
ew = (ew - pad_w) > in_w ? in_w : ew - pad_w;
int pooling_size = (ew - sw) * (eh - sh);
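// For instance, with in_h = 3, window_h = 2, stride_h = 2, pad_h = 0 and
// ceil_mode on, the window at h = 1 starts at sh = 2 and is clamped to
// eh = 3, so it covers a single input row and pooling_size shrinks
// accordingly.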
if (pooling_size == 0) continue;
float res = 0.f;
for (int kh = sh; kh < eh; ++kh) {
for (int kw = sw; kw < ew; ++kw) {
int src_idx = n * size_in_n + c * size_in_c + kh * in_w + kw;
if (kh == sh && kw == sw) {
res = src_ptr[src_idx];
} else {
if (pooling_type == "max") {
res = res >= src_ptr[src_idx] ? res : src_ptr[src_idx];
}
if (pooling_type == "avg") {
res += src_ptr[src_idx];
}
}
}
}
if (pooling_type == "avg") {
if (exclusive) {
res /= pooling_size;
} else {
res /= window_h * window_w;
}
}
dst_ptr[n * size_out_n + c * size_out_c + h * out_w + w] = res;
}
}
}
}
}
}
void test_pool(int bs,
int ic,
int ih,
int iw,
std::string pooling_type,
bool ceil_mode,
bool global_pooling,
bool exclusive,
int ksize,
int stride,
int padding) {
// prepare input&output variables
Scope scope;
std::string x_var_name = "x";
std::string out_var_name = "out";
std::string out_ref_var_name = "out_ref";
auto* x = scope.Var(x_var_name)->GetMutable<Tensor>();
auto* out = scope.Var(out_var_name)->GetMutable<Tensor>();
auto* out_ref = scope.Var(out_ref_var_name)->GetMutable<Tensor>();
x->Resize({bs, ic, ih, iw});
// initialize input&output data
FillTensor<float>(x);
// initialize op desc
cpp::OpDesc opdesc;
opdesc.SetType("pool2d");
opdesc.SetInput("X", {x_var_name});
opdesc.SetOutput("Out", {out_var_name});
opdesc.SetAttr("pooling_type", pooling_type);
opdesc.SetAttr("ksize", std::vector<int>({ksize, ksize}));
opdesc.SetAttr("global_pooling", global_pooling);
opdesc.SetAttr("exclusive", exclusive);
opdesc.SetAttr("strides", std::vector<int>({stride, stride}));
opdesc.SetAttr("paddings", std::vector<int>({padding, padding}));
opdesc.SetAttr("ceil_mode", ceil_mode);
// create and convert op to XPU model, then run it on XPU
auto op = CreateOp<operators::PoolOpLite>(opdesc, &scope);
LauchOp(op, {x_var_name}, {out_var_name});
out_ref->CopyDataFrom(*out);
// execute reference implementation and save to output tensor
pool_ref(op);
// compare results
auto* out_data = out->mutable_data<float>();
auto* out_ref_data = out_ref->mutable_data<float>();
for (int i = 0; i < out->dims().production(); i++) {
EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-5);
}
}
TEST(XPUBridges, pool) {
for (auto pooling_type : {"max", "avg"}) {
for (auto bs : {1, 3}) {
for (auto ic : {2}) {
for (auto ih : {3}) {
for (auto iw : {4}) {
test_pool(bs, ic, ih, iw, pooling_type, true, true, true, 0, 1, 0);
}
}
}
}
}
for (auto pooling_type : {"max"}) {
for (auto ceil_mode : {true, false}) {
for (auto ksize : {2, 3}) {
for (auto stride : {1, 2}) {
for (auto padding : {0, 1}) {
for (auto bs : {1, 3}) {
for (auto ic : {2}) {
for (auto ih : {3}) {
for (auto iw : {4}) {
test_pool(bs,
ic,
ih,
iw,
pooling_type,
ceil_mode,
false,
true,
ksize,
stride,
padding);
}
}
}
}
}
}
}
}
}
for (auto pooling_type : {"avg"}) {
for (auto ceil_mode : {true, false}) {
for (auto exclusive : {true, false}) {
for (auto ksize : {2, 3}) {
for (auto stride : {1, 2}) {
for (auto padding : {0, 1}) {
for (auto bs : {1, 3}) {
for (auto ic : {2}) {
for (auto ih : {3}) {
for (auto iw : {4}) {
test_pool(bs,
ic,
ih,
iw,
pooling_type,
ceil_mode,
false,
exclusive,
ksize,
stride,
padding);
}
}
}
}
}
}
}
}
}
}
}
} // namespace bridges
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
USE_LITE_OP(pool2d);
USE_XPU_BRIDGE(pool2d);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/xpu/bridges/registry.h"
#include <utility>
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
namespace bridges {
Factory& Factory::Instance() {
static Factory g_xpu_bridge;
return g_xpu_bridge;
}
bool Factory::HasType(const std::string& op_type) const {
return map_.count(op_type);
}
void Factory::Insert(const std::string& op_type, const func_type& func_name) {
map_.insert(std::make_pair(op_type, func_name));
}
} // namespace bridges
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <xtcl/xtcl.h>
#include <functional>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "lite/core/op_lite.h"
#include "lite/utils/macros.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
namespace bridges {
// xpu network builder and constant tensors
class graph_ctx_type {
public:
std::shared_ptr<xtcl::network::xNetworkBuilder> builder;
std::shared_ptr<xtcl::network::xTensorCompiler::ParamNDArrayMap> params;
};
// var_name, xpu node pointer
using node_map_type =
std::unordered_map<std::string, std::shared_ptr<xtcl::xExpr>>;
using func_type = std::function<node_map_type(
const std::shared_ptr<OpLite>, graph_ctx_type*, const node_map_type&)>;
using cvt_map_type = std::unordered_map<std::string, func_type>;
class Factory {
public:
static Factory& Instance();
const cvt_map_type& AllFunctions() const { return map_; }
bool HasType(const std::string& op_type) const;
void Insert(const std::string& op_type, const func_type& func_name);
Factory() = default;
private:
cvt_map_type map_;
DISALLOW_COPY_AND_ASSIGN(Factory);
};
} // namespace bridges
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
// some platform-independent definitions
#if defined(_WIN32)
#define UNUSED
#define __builtin_expect(EXP, C) (EXP)
#else
#define UNUSED __attribute__((unused))
#endif
#define STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE(uniq_name, msg) \
struct __test_global_namespace_##uniq_name##__ {}; \
static_assert(std::is_same<::__test_global_namespace_##uniq_name##__, \
__test_global_namespace_##uniq_name##__>::value, \
msg)
#define REGISTER_XPU_BRIDGE(op_type, cvt_func_name) \
STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE( \
__reg_xpu_bridge_##op_type##__, \
"REGISTER_XPU_BRIDGE must be called in global namespace only once!"); \
int __reg_xpu_bridge_##op_type##_Insert() { \
paddle::lite::kernels::xpu::bridges::Factory::Instance().Insert( \
#op_type, cvt_func_name); \
return 0; \
}
#define USE_XPU_BRIDGE(op_type) \
extern int __reg_xpu_bridge_##op_type##_Insert(); \
static int __reg_xpu_bridge_##op_type##_Insert_return UNUSED = \
__reg_xpu_bridge_##op_type##_Insert();
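// Typical usage, as seen in the bridge files above: a converter is registered
// once in the global namespace with
// REGISTER_XPU_BRIDGE(relu, paddle::lite::kernels::xpu::bridges::ActConverter);
// and any binary that needs it references USE_XPU_BRIDGE(relu); so that the
// registration function runs during static initialization.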
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/backends/xpu/builder.h"
#include "lite/kernels/xpu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
namespace bridges {
node_map_type SoftmaxConverter(const std::shared_ptr<lite::OpLite> op,
graph_ctx_type* graph_ctx,
const node_map_type& input_nodes) {
auto op_info = op->op_info();
auto op_type = op_info->Type();
auto unique_op_type = lite::xpu::UniqueName(op_type);
LOG(INFO) << "[XPU] Converting " + op_type + "...";
// check context
CHECK(graph_ctx != nullptr);
CHECK(graph_ctx->builder != nullptr);
CHECK(graph_ctx->params != nullptr);
// get op's attributes
auto x_var_name = op_info->Input("X").front();
auto axis = op_info->GetAttr<int>("axis");
// create softmax node and set params from ops
CHECK(input_nodes.count(x_var_name));
std::shared_ptr<xtcl::xExpr> softmax_node = nullptr;
softmax_node = std::make_shared<xtcl::xExpr>(
graph_ctx->builder->CreateSoftmax(*input_nodes.at(x_var_name), axis));
graph_ctx->builder->SetLayer(unique_op_type);
// output converted nodes
node_map_type output_nodes;
output_nodes[op_info->Output("Out").front()] = softmax_node;
return output_nodes;
}
} // namespace bridges
} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_XPU_BRIDGE(softmax,
paddle::lite::kernels::xpu::bridges::SoftmaxConverter);
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
// limitations under the License. // limitations under the License.
#include "lite/operators/graph_op.h" #include "lite/operators/graph_op.h"
#include <utility>
#include "lite/core/op_registry.h" #include "lite/core/op_registry.h"
namespace paddle { namespace paddle {
...@@ -34,7 +35,8 @@ bool GraphOpLite::AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) { ...@@ -34,7 +35,8 @@ bool GraphOpLite::AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) {
for (auto var : inputs) { for (auto var : inputs) {
CHECK(scope->FindVar(var)); CHECK(scope->FindVar(var));
param_.inputs.push_back(scope->FindVar(var)->GetMutable<lite::Tensor>()); param_.inputs.push_back(
std::make_pair(var, scope->FindVar(var)->GetMutable<lite::Tensor>()));
} }
param_.weight = scope->FindVar(weight.front())->GetMutable<lite::Tensor>(); param_.weight = scope->FindVar(weight.front())->GetMutable<lite::Tensor>();
...@@ -42,7 +44,8 @@ bool GraphOpLite::AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) { ...@@ -42,7 +44,8 @@ bool GraphOpLite::AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) {
for (auto var : outputs) { for (auto var : outputs) {
CHECK(scope->FindVar(var)); CHECK(scope->FindVar(var));
param_.outputs.push_back(scope->FindVar(var)->GetMutable<lite::Tensor>()); param_.outputs.push_back(
std::make_pair(var, scope->FindVar(var)->GetMutable<lite::Tensor>()));
} }
return true; return true;
......
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
#pragma once #pragma once
#include <string> #include <string>
#include <utility>
#include <vector> #include <vector>
#include "lite/api/paddle_place.h" #include "lite/api/paddle_place.h"
#include "lite/core/scope.h" #include "lite/core/scope.h"
...@@ -69,9 +70,9 @@ struct CalibParam { ...@@ -69,9 +70,9 @@ struct CalibParam {
}; };
struct GraphParam { struct GraphParam {
std::vector<const lite::Tensor*> inputs{}; std::vector<std::pair<std::string, const lite::Tensor*>> inputs{};
lite::Tensor* weight{}; lite::Tensor* weight{};
std::vector<lite::Tensor*> outputs{}; std::vector<std::pair<std::string, lite::Tensor*>> outputs{};
}; };
/// -------------------------- NN operators ------------------------------------ /// -------------------------- NN operators ------------------------------------
......