From 08b43cce6d2d5e2f57a4317461eb26f88af9bd3c Mon Sep 17 00:00:00 2001
From: Chen Weihang
Date: Tue, 1 Mar 2022 11:24:52 +0800
Subject: [PATCH] [Phi] Support kps backend and kernel registry (#39941)

* support kps backend and compile

* resolve conflict

* fix kps backend trans

* test in xpu2 device

* remove dummy kernel
---
 cmake/generic.cmake                     |  1 +
 cmake/phi.cmake                         | 60 +++++++++++++++++++++----
 paddle/fluid/framework/phi_utils.cc     |  4 ++
 paddle/phi/backends/gpu/gpu_context.h   |  8 ++++
 paddle/phi/backends/xpu/xpu_context.h   |  8 ++++
 paddle/phi/common/backend.h             |  8 ++++
 paddle/phi/core/compat/convert_utils.cc |  8 ++++
 paddle/phi/tests/common/test_backend.cc |  4 ++
 8 files changed, 93 insertions(+), 8 deletions(-)

diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index f7c17bd7cf..51ed537ce5 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -667,6 +667,7 @@ function(xpu_library TARGET_NAME)
       else()
         xpu_add_library(${TARGET_NAME} STATIC ${xpu_library_SRCS} DEPENDS ${xpu_library_DEPS})
         find_fluid_modules(${TARGET_NAME})
+        find_phi_modules(${TARGET_NAME})
       endif()
       if (xpu_library_DEPS)
         add_dependencies(${TARGET_NAME} ${xpu_library_DEPS})
diff --git a/cmake/phi.cmake b/cmake/phi.cmake
index d9132b8445..f6e1575837 100644
--- a/cmake/phi.cmake
+++ b/cmake/phi.cmake
@@ -83,6 +83,8 @@ function(kernel_declare TARGET_LIST)
         file(APPEND ${kernel_declare_file} "PD_DECLARE_KERNEL(${kernel_name}, XPU, ALL_LAYOUT);\n")
       elseif (${kernel_path} MATCHES "./gpudnn\/")
         file(APPEND ${kernel_declare_file} "PD_DECLARE_KERNEL(${kernel_name}, GPUDNN, ALL_LAYOUT);\n")
+      elseif (${kernel_path} MATCHES "./kps\/")
+        file(APPEND ${kernel_declare_file} "PD_DECLARE_KERNEL(${kernel_name}, KPS, ALL_LAYOUT);\n")
       else ()
         # deal with device independent kernel, now we use CPU temporaary
         file(APPEND ${kernel_declare_file} "PD_DECLARE_KERNEL(${kernel_name}, CPU, ALL_LAYOUT);\n")
@@ -97,6 +99,7 @@ function(kernel_library TARGET)
   set(gpu_srcs)
   set(xpu_srcs)
   set(gpudnn_srcs)
+  set(kps_srcs)
   set(selected_rows_srcs)
   # parse and save the deps kerenl targets
   set(all_srcs)
@@ -128,6 +131,9 @@ function(kernel_library TARGET)
       if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/gpu/${TARGET}.cu.cc)
         list(APPEND gpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/gpu/${TARGET}.cu.cc)
       endif()
+      if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/kps/${TARGET}.cu)
+        list(APPEND gpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/kps/${TARGET}.cu)
+      endif()
       if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/gpudnn/${TARGET}_gpudnn.cu)
         list(APPEND gpudnn_srcs ${CMAKE_CURRENT_SOURCE_DIR}/gpudnn/${TARGET}_gpudnn.cu)
       endif()
@@ -137,6 +143,15 @@ function(kernel_library TARGET)
         list(APPEND xpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/xpu/${TARGET}.cc)
       endif()
     endif()
+    if (WITH_XPU_KP)
+      if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/kps/${TARGET}.cu)
+        # Change XPU2 file suffix
+        # NOTE(chenweihang): If we can be sure that the *.kps suffix is no longer used, it can be copied directly to *.xpu
+        file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/kps/${TARGET}.cu DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/kps)
+        file(RENAME ${CMAKE_CURRENT_BINARY_DIR}/kps/${TARGET}.cu ${CMAKE_CURRENT_BINARY_DIR}/kps/${TARGET}.kps)
+        list(APPEND kps_srcs ${CMAKE_CURRENT_BINARY_DIR}/kps/${TARGET}.kps)
+      endif()
+    endif()
   else()
     # TODO(chenweihang): impl compile by source later
   endif()
@@ -150,6 +165,7 @@ function(kernel_library TARGET)
   list(APPEND all_srcs ${gpu_srcs})
   list(APPEND all_srcs ${xpu_srcs})
   list(APPEND all_srcs ${gpudnn_srcs})
+  list(APPEND all_srcs ${kps_srcs})
   foreach(src ${all_srcs})
     file(READ ${src} target_content)
     string(REGEX MATCHALL "#include \"paddle\/phi\/kernels\/[a-z0-9_]+_kernel.h\"" include_kernels ${target_content})
@@ -159,11 +175,11 @@ function(kernel_library TARGET)
       string(REGEX MATCHALL "#include \"paddle\/phi\/kernels\/${kernel_library_SUB_DIR}\/[a-z0-9_]+_kernel.h\"" include_kernels ${target_content})
     endif()
     foreach(include_kernel ${include_kernels})
-    if ("${kernel_library_SUB_DIR}" STREQUAL "")
-      string(REGEX REPLACE "#include \"paddle\/phi\/kernels\/" "" kernel_name ${include_kernel})
-    else()
-      string(REGEX REPLACE "#include \"paddle\/phi\/kernels\/${kernel_library_SUB_DIR}\/" "" kernel_name ${include_kernel})
-    endif()
+      if ("${kernel_library_SUB_DIR}" STREQUAL "")
+        string(REGEX REPLACE "#include \"paddle\/phi\/kernels\/" "" kernel_name ${include_kernel})
+      else()
+        string(REGEX REPLACE "#include \"paddle\/phi\/kernels\/${kernel_library_SUB_DIR}\/" "" kernel_name ${include_kernel})
+      endif()
       string(REGEX REPLACE ".h\"" "" kernel_name ${kernel_name})
       list(APPEND kernel_deps ${kernel_name})
     endforeach()
@@ -176,11 +192,20 @@ function(kernel_library TARGET)
   list(LENGTH gpu_srcs gpu_srcs_len)
   list(LENGTH xpu_srcs xpu_srcs_len)
   list(LENGTH gpudnn_srcs gpudnn_srcs_len)
+  list(LENGTH kps_srcs kps_srcs_len)
   list(LENGTH selected_rows_srcs selected_rows_srcs_len)
 
+  # kernel source file level
+  # level 1: base device kernel
+  # - cpu_srcs / gpu_srcs / xpu_srcs / kps_srcs
+  # level 2: device-independent kernel
+  # - common_srcs
+  # level 3: Kernel implemented by reusing device-independent kernel
+  # - selected_rows_srcs
+
   # Build Target according different src organization
   if((${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR
-      ${xpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0) AND
+      ${xpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0 OR ${kps_srcs_len} GREATER 0) AND
       (${common_srcs_len} GREATER 0 OR ${selected_rows_srcs_len} GREATER 0))
     # If the common_srcs/selected_rows_srcs depends on specific device srcs, build target using this rule.
     if (WITH_GPU)
@@ -193,6 +218,11 @@ function(kernel_library TARGET)
         hip_library(${TARGET}_part SRCS ${cpu_srcs} ${gpu_srcs} ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
         hip_library(${TARGET} SRCS ${common_srcs} ${selected_rows_srcs} DEPS ${TARGET}_part)
       endif()
+    elseif (WITH_XPU_KP)
+      if (${cpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR ${kps_srcs_len} GREATER 0)
+        xpu_library(${TARGET}_part SRCS ${cpu_srcs} ${xpu_srcs} ${kps_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
+        xpu_library(${TARGET} SRCS ${common_srcs} ${selected_rows_srcs} DEPS ${TARGET}_part)
+      endif()
     else()
       if (${cpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0)
         cc_library(${TARGET}_part SRCS ${cpu_srcs} ${xpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
@@ -200,7 +230,7 @@ function(kernel_library TARGET)
       endif()
     endif()
   # If there are only specific device srcs, build target using this rule.
-  elseif (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0)
+  elseif (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0 OR ${kps_srcs_len} GREATER 0)
     if (WITH_GPU)
       if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0)
         nv_library(${TARGET} SRCS ${cpu_srcs} ${gpu_srcs} ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
@@ -209,6 +239,10 @@ function(kernel_library TARGET)
       if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0)
         hip_library(${TARGET} SRCS ${cpu_srcs} ${gpu_srcs} ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
       endif()
+    elseif (WITH_XPU_KP)
+      if (${cpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR ${kps_srcs_len} GREATER 0)
+        xpu_library(${TARGET} SRCS ${cpu_srcs} ${xpu_srcs} ${kps_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
+      endif()
     else()
       if (${cpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0)
         cc_library(${TARGET} SRCS ${cpu_srcs} ${xpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
@@ -222,6 +256,9 @@ function(kernel_library TARGET)
     elseif (WITH_ROCM)
       hip_library(${TARGET}_part SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
       hip_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${TARGET}_part)
+    elseif (WITH_XPU_KP)
+      xpu_library(${TARGET}_part SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
+      xpu_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${TARGET}_part)
     else()
       cc_library(${TARGET}_part SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
       cc_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${TARGET}_part)
@@ -232,6 +269,8 @@ function(kernel_library TARGET)
       nv_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
     elseif (WITH_ROCM)
       hip_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
+    elseif (WITH_XPU_KP)
+      xpu_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
     else()
       cc_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
     endif()
@@ -240,6 +279,8 @@ function(kernel_library TARGET)
       nv_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
     elseif (WITH_ROCM)
      hip_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
+    elseif (WITH_XPU_KP)
+      xpu_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
     else()
       cc_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
     endif()
@@ -249,7 +290,7 @@ function(kernel_library TARGET)
 
   if (${target_build_flag} EQUAL 1)
     if (${common_srcs_len} GREATER 0 OR ${cpu_srcs_len} GREATER 0 OR
-        ${gpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR
+        ${gpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR ${kps_srcs_len} GREATER 0 OR
         ${gpudnn_srcs_len} GREATER 0 OR ${selected_rows_srcs_len} GREATER 0)
       # append target into PHI_KERNELS property
       get_property(phi_kernels GLOBAL PROPERTY PHI_KERNELS)
@@ -275,6 +316,9 @@ function(kernel_library TARGET)
     if (${gpudnn_srcs_len} GREATER 0)
       kernel_declare(${gpudnn_srcs})
     endif()
+    if (${kps_srcs_len} GREATER 0)
+      kernel_declare(${kps_srcs})
+    endif()
     if (${selected_rows_srcs_len} GREATER 0)
       kernel_declare(${selected_rows_srcs})
     endif()
diff --git a/paddle/fluid/framework/phi_utils.cc b/paddle/fluid/framework/phi_utils.cc
index 355291beb6..1a39a87fb9 100644
--- a/paddle/fluid/framework/phi_utils.cc
+++ b/paddle/fluid/framework/phi_utils.cc
@@ -68,6 +68,8 @@ OpKernelType TransPhiKernelKeyToOpKernelType(const phi::KernelKey& kernel_key) {
     library_type = LibraryType::kMKLDNN;
   } else if (kernel_key.backend() == phi::Backend::GPUDNN) {
     library_type = LibraryType::kCUDNN;
+  } else if (kernel_key.backend() == phi::Backend::KPS) {
+    library_type = LibraryType::kKP;
   } else {
     // do nothing
   }
@@ -82,6 +84,8 @@ phi::KernelKey TransOpKernelTypeToPhiKernelKey(
     backend = phi::Backend::MKLDNN;
   } else if (kernel_type.library_type_ == LibraryType::kCUDNN) {
     backend = phi::Backend::GPUDNN;
+  } else if (kernel_type.library_type_ == LibraryType::kKP) {
+    backend = phi::Backend::KPS;
   } else {
     // do
   }
diff --git a/paddle/phi/backends/gpu/gpu_context.h b/paddle/phi/backends/gpu/gpu_context.h
index 603ce0817c..b9d843982d 100644
--- a/paddle/phi/backends/gpu/gpu_context.h
+++ b/paddle/phi/backends/gpu/gpu_context.h
@@ -227,4 +227,12 @@ class GPUContext : public DeviceContext {
 // must use different function name for cudnn kernel
 using GPUDNNContext = GPUContext;
 
+// KPS (Kernel PrimitiveS API) needs to exist as a kind of backend,
+// because we want to implement a KPS-based kernel and make it run
+// on GPU and XPU at the same time, so we need KPSContext when registering
+// KPS Kernel. Note: XPU and GPU cannot be compiled at the same time!
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+using KPSContext = GPUContext;
+#endif
+
 }  // namespace phi
diff --git a/paddle/phi/backends/xpu/xpu_context.h b/paddle/phi/backends/xpu/xpu_context.h
index 3005d1707e..b87489c567 100644
--- a/paddle/phi/backends/xpu/xpu_context.h
+++ b/paddle/phi/backends/xpu/xpu_context.h
@@ -66,4 +66,12 @@ class XPUContext : public DeviceContext {
   std::unique_ptr<Impl> impl_;
 };
 
+// KPS (Kernel PrimitiveS API) needs to exist as a kind of backend,
+// because we want to implement a KPS-based kernel and make it run
+// on GPU and XPU at the same time, so we need KPSContext when registering
+// KPS Kernel. Note: XPU and GPU cannot be compiled at the same time!
+#if PADDLE_WITH_XPU_KP
+using KPSContext = XPUContext;
+#endif
+
 }  // namespace phi
diff --git a/paddle/phi/common/backend.h b/paddle/phi/common/backend.h
index 4b7bf65be3..a9e12f5d81 100644
--- a/paddle/phi/common/backend.h
+++ b/paddle/phi/common/backend.h
@@ -52,6 +52,9 @@ enum class Backend : uint8_t {
   MKLDNN,
   GPUDNN,  // cuDNN and hipDNN
 
+  // paddle kernel primitives backend
+  KPS,
+
   // end of backend types
   NUM_BACKENDS,
 
@@ -115,6 +118,9 @@ inline std::ostream& operator<<(std::ostream& os, Backend backend) {
     case Backend::GPUDNN:
       os << "GPUDNN";
       break;
+    case Backend::KPS:
+      os << "KPS";
+      break;
     default: {
       size_t device_type_id_ = static_cast<size_t>(backend) -
                                static_cast<size_t>(Backend::NUM_BACKENDS);
@@ -147,6 +153,8 @@ inline Backend StringToBackend(const char* backend_cstr) {
     return Backend::MKLDNN;
   } else if (s == std::string("GPUDNN")) {
     return Backend::GPUDNN;
+  } else if (s == std::string("KPS")) {
+    return Backend::KPS;
   } else {
     return static_cast<Backend>(static_cast<size_t>(Backend::NUM_BACKENDS) +
                                 phi::GetOrRegisterGlobalDeviceTypeId(s));
diff --git a/paddle/phi/core/compat/convert_utils.cc b/paddle/phi/core/compat/convert_utils.cc
index 3b7a733ede..b85db07bd9 100644
--- a/paddle/phi/core/compat/convert_utils.cc
+++ b/paddle/phi/core/compat/convert_utils.cc
@@ -66,6 +66,14 @@ phi::Place TransToPhiPlace(const Backend& backend, bool set_device_id) {
     case phi::Backend::XPU:
       return phi::XPUPlace(
           set_device_id ? phi::backends::xpu::GetXPUCurrentDeviceId() : 0);
+#endif
+    case phi::Backend::KPS:
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+      return phi::GPUPlace(
+          set_device_id ? phi::backends::gpu::GetCurrentDeviceId() : 0);
+#elif defined(PADDLE_WITH_XPU_KP)
+      return phi::XPUPlace(
+          set_device_id ? phi::backends::xpu::GetXPUCurrentDeviceId() : 0);
 #endif
     default: {
 #ifdef PADDLE_WITH_CUSTOM_DEVICE
diff --git a/paddle/phi/tests/common/test_backend.cc b/paddle/phi/tests/common/test_backend.cc
index fa4ffc84bf..5d6862c368 100644
--- a/paddle/phi/tests/common/test_backend.cc
+++ b/paddle/phi/tests/common/test_backend.cc
@@ -44,6 +44,9 @@ TEST(Backend, OStream) {
   oss << phi::Backend::GPUDNN;
   EXPECT_EQ(oss.str(), "GPUDNN");
   oss.str("");
+  oss << phi::Backend::KPS;
+  EXPECT_EQ(oss.str(), "KPS");
+  oss.str("");
   try {
     oss << phi::Backend::NUM_BACKENDS;
   } catch (const std::exception& exception) {
@@ -61,6 +64,7 @@ TEST(Backend, StringToBackend) {
   EXPECT_EQ(phi::Backend::NPU, pexp::StringToBackend("NPU"));
   EXPECT_EQ(phi::Backend::MKLDNN, pexp::StringToBackend("MKLDNN"));
   EXPECT_EQ(phi::Backend::GPUDNN, pexp::StringToBackend("GPUDNN"));
+  EXPECT_EQ(phi::Backend::KPS, pexp::StringToBackend("KPS"));
   EXPECT_EQ(static_cast<phi::Backend>(
                 static_cast<size_t>(phi::Backend::NUM_BACKENDS) + 1),
             pexp::StringToBackend("CustomBackend"));
-- 
GitLab
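
How the pieces above fit together: Backend::KPS is a virtual backend. A kernel source placed under a kps/ directory is declared by kernel_declare() as PD_DECLARE_KERNEL(name, KPS, ALL_LAYOUT), and the KPSContext alias resolves to GPUContext on CUDA/HIP builds or to XPUContext on WITH_XPU_KP builds, so one Kernel Primitives implementation can be registered once and dispatched to either device. The snippet below is a minimal sketch of the resulting behavior and is not part of the patch: it assumes it is compiled inside the Paddle source tree (next to paddle/phi/tests/common/test_backend.cc, for example), that pexp aliases paddle::experimental as in that test, and the file name kps_backend_demo.cc is hypothetical.

// kps_backend_demo.cc (hypothetical file, illustration only)
#include <cassert>
#include <iostream>
#include <sstream>

#include "paddle/phi/common/backend.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/compat/convert_utils.h"

namespace pexp = paddle::experimental;

int main() {
  // The new enum value round-trips through the helpers extended in backend.h.
  assert(pexp::StringToBackend("KPS") == phi::Backend::KPS);
  std::ostringstream oss;
  oss << phi::Backend::KPS;
  assert(oss.str() == "KPS");

  // TransToPhiPlace (convert_utils.cc) resolves KPS at build time: it yields a
  // GPUPlace when compiled with CUDA/HIP and an XPUPlace when compiled with
  // WITH_XPU_KP; the two configurations are never enabled together.
  phi::Place place = phi::TransToPhiPlace(phi::Backend::KPS);
  std::cout << "Backend::KPS resolves to " << place << std::endl;
  return 0;
}

Registering a kernel against KPS rather than GPU or XPU is what lets a single kps/*.cu source serve both devices; on XPU2 builds the cmake rules above simply copy the file and re-suffix it to *.kps for the KP toolchain.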