diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index df2e59b7647bf0231362a4220e8610f50243f1c5..2684529930e7ce2b1dba0bbfb3fb95968e0eadc7 100755 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -25,7 +25,7 @@ repos: description: Format files with ClangFormat. entry: bash ./tools/codestyle/clang_format.hook -i language: system - files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto)$ + files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|xpu|kps)$ - repo: local hooks: - id: cpplint-cpp-source @@ -48,7 +48,7 @@ repos: name: copyright_checker entry: python ./tools/codestyle/copyright.hook language: system - files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|py|sh)$ + files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|xpu|kps|py|sh)$ exclude: | (?x)^( paddle/utils/.* diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake index 5e60f1f2b99fee38fefc9b584f0d1d75b7c05e5b..415c0fe9bef9eab89e670d8b3f6f7c330b316ed8 100644 --- a/cmake/external/xpu.cmake +++ b/cmake/external/xpu.cmake @@ -36,7 +36,7 @@ ENDIF() if(NOT DEFINED XPU_BASE_URL) SET(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev") - SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220215") + SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220219") else() SET(XPU_BASE_URL "${XPU_BASE_URL}") endif() diff --git a/cmake/operators.cmake b/cmake/operators.cmake index 8469dc4c02ee37b333254d6d35b0eb48354d4b86..8843dd2628767e8cac167db0ff115d0b63aca53a 100644 --- a/cmake/operators.cmake +++ b/cmake/operators.cmake @@ -125,6 +125,9 @@ function(op_library TARGET) if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.xpu) list(APPEND xpu_kp_cc_srcs ${TARGET}.xpu) endif() + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.kps) + list(APPEND xpu_kp_cc_srcs ${TARGET}.kps) + endif() endif() if(WITH_ASCEND_CL) string(REPLACE "_op" "_op_npu" NPU_FILE "${TARGET}") @@ -162,6 +165,8 @@ function(op_library TARGET) list(APPEND xpu_cc_srcs ${src}) elseif(WITH_XPU_KP AND ${src} MATCHES ".*\\.xpu$") list(APPEND xpu_kp_cc_srcs ${src}) + elseif(WITH_XPU_KP AND ${src} MATCHES ".*\\.kps$") + list(APPEND xpu_kp_cc_srcs ${src}) elseif(WITH_ASCEND_CL AND ${src} MATCHES ".*_op_npu.cc$") list(APPEND npu_cc_srcs ${src}) elseif(WITH_MLU AND ${src} MATCHES ".*_op_mlu.cc$") @@ -384,7 +389,15 @@ function(op_library TARGET) # pybind USE_OP_DEVICE_KERNEL for XPU KP if (WITH_XPU_KP AND ${xpu_kp_cc_srcs_len} GREATER 0) - file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, KP);\n") + foreach(xpu_kp_src ${xpu_kp_cc_srcs}) + set(op_name "") + find_register(${xpu_kp_src} "REGISTER_OP_KERNEL" op_name) + if(NOT ${op_name} EQUAL "") + file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${op_name}, KP);\n") + message(STATUS "Building KP Target: ${op_name}") + set(pybind_flag 1) + endif() + endforeach() endif() # pybind USE_OP_DEVICE_KERNEL for NPU diff --git a/cmake/pten.cmake b/cmake/pten.cmake index 6049f6e21e5662a8b45e6f77898f10c2220a70b5..5645ac6cfa3039afdad0514abade5c9ea9b35408 100644 --- a/cmake/pten.cmake +++ b/cmake/pten.cmake @@ -58,26 +58,32 @@ endfunction() function(kernel_declare TARGET_LIST) foreach(kernel_path ${TARGET_LIST}) file(READ ${kernel_path} kernel_impl) - # TODO(chenweihang): rename PT_REGISTER_KERNEL to PT_REGISTER_KERNEL - # NOTE(chenweihang): now we don't recommend to use digit in kernel name - string(REGEX MATCH "(PT_REGISTER_KERNEL|PT_REGISTER_GENERAL_KERNEL)\\([ \t\r\n]*[a-z0-9_]*," first_registry "${kernel_impl}") + string(REGEX MATCH "(PD_REGISTER_KERNEL|PD_REGISTER_GENERAL_KERNEL)\\([ \t\r\n]*[a-z0-9_]*,[ 
\t\r\n\/]*[a-z0-9_]*" first_registry "${kernel_impl}") if (NOT first_registry STREQUAL "") + # some gpu kernel only can run on cuda, not support rocm, so we add this branch + if (WITH_ROCM) + string(FIND "${first_registry}" "cuda_only" pos) + if(pos GREATER 1) + continue() + endif() + endif() # parse the first kernel name - string(REPLACE "PT_REGISTER_KERNEL(" "" kernel_name "${first_registry}") - string(REPLACE "PT_REGISTER_GENERAL_KERNEL(" "" kernel_name "${kernel_name}") + string(REPLACE "PD_REGISTER_KERNEL(" "" kernel_name "${first_registry}") + string(REPLACE "PD_REGISTER_GENERAL_KERNEL(" "" kernel_name "${kernel_name}") string(REPLACE "," "" kernel_name "${kernel_name}") string(REGEX REPLACE "[ \t\r\n]+" "" kernel_name "${kernel_name}") + string(REGEX REPLACE "//cuda_only" "" kernel_name "${kernel_name}") # append kernel declare into declarations.h # TODO(chenweihang): default declare ALL_LAYOUT for each kernel if (${kernel_path} MATCHES "./cpu\/") - file(APPEND ${kernel_declare_file} "PT_DECLARE_KERNEL(${kernel_name}, CPU, ALL_LAYOUT);\n") + file(APPEND ${kernel_declare_file} "PD_DECLARE_KERNEL(${kernel_name}, CPU, ALL_LAYOUT);\n") elseif (${kernel_path} MATCHES "./gpu\/") - file(APPEND ${kernel_declare_file} "PT_DECLARE_KERNEL(${kernel_name}, GPU, ALL_LAYOUT);\n") + file(APPEND ${kernel_declare_file} "PD_DECLARE_KERNEL(${kernel_name}, GPU, ALL_LAYOUT);\n") elseif (${kernel_path} MATCHES "./xpu\/") - file(APPEND ${kernel_declare_file} "PT_DECLARE_KERNEL(${kernel_name}, XPU, ALL_LAYOUT);\n") + file(APPEND ${kernel_declare_file} "PD_DECLARE_KERNEL(${kernel_name}, XPU, ALL_LAYOUT);\n") else () # deal with device independent kernel, now we use CPU temporaary - file(APPEND ${kernel_declare_file} "PT_DECLARE_KERNEL(${kernel_name}, CPU, ALL_LAYOUT);\n") + file(APPEND ${kernel_declare_file} "PD_DECLARE_KERNEL(${kernel_name}, CPU, ALL_LAYOUT);\n") endif() endif() endforeach() @@ -285,9 +291,9 @@ endfunction() function(append_op_util_declare TARGET) file(READ ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET} target_content) - string(REGEX MATCH "(PT_REGISTER_BASE_KERNEL_NAME|PT_REGISTER_ARG_MAPPING_FN)\\([ \t\r\n]*[a-z0-9_]*" util_registrar "${target_content}") - string(REPLACE "PT_REGISTER_ARG_MAPPING_FN" "PT_DECLARE_ARG_MAPPING_FN" util_declare "${util_registrar}") - string(REPLACE "PT_REGISTER_BASE_KERNEL_NAME" "PT_DECLARE_BASE_KERNEL_NAME" util_declare "${util_declare}") + string(REGEX MATCH "(PD_REGISTER_BASE_KERNEL_NAME|PD_REGISTER_ARG_MAPPING_FN)\\([ \t\r\n]*[a-z0-9_]*" util_registrar "${target_content}") + string(REPLACE "PD_REGISTER_ARG_MAPPING_FN" "PD_DECLARE_ARG_MAPPING_FN" util_declare "${util_registrar}") + string(REPLACE "PD_REGISTER_BASE_KERNEL_NAME" "PD_DECLARE_BASE_KERNEL_NAME" util_declare "${util_declare}") string(APPEND util_declare ");\n") file(APPEND ${op_utils_header} "${util_declare}") endfunction() diff --git a/cmake/xpu_kp.cmake b/cmake/xpu_kp.cmake index f8ab9693db0c9ebe845ae7d77a562fd005f5130d..adab3e1423c91522092dac5503d8c58dcc8370db 100644 --- a/cmake/xpu_kp.cmake +++ b/cmake/xpu_kp.cmake @@ -17,7 +17,7 @@ if(NOT WITH_XPU_KP) endif() if(NOT XPU_TOOLCHAIN) - set(XPU_TOOLCHAIN /workspace/paddle/xpu-demo/XTDK) + set(XPU_TOOLCHAIN /workspace/output/XTDK-ubuntu_x86_64) get_filename_component(XPU_TOOLCHAIN ${XPU_TOOLCHAIN} REALPATH) endif() if(NOT IS_DIRECTORY ${XPU_TOOLCHAIN}) @@ -102,7 +102,7 @@ macro(compile_kernel COMPILE_ARGS) set(XTDK_DIR ${XPU_TOOLCHAIN}) set(CXX_DIR ${HOST_SYSROOT}) - set(XPU_CXX_FLAGS -Wno-error=pessimizing-move -Wno-error=constant-conversion 
-Wno-error=c++11-narrowing -Wno-error=shift-count-overflow -Wno-error=unused-local-typedef -Wno-error=deprecated-declarations -Wno-deprecated-declarations -std=c++14 -m64 -fPIC -fno-omit-frame-pointer -Wall -Wno-inconsistent-missing-override -Wextra -Wnon-virtual-dtor -Wdelete-non-virtual-dtor -Wno-unused-parameter -Wno-unused-function -Wno-error=unused-local-typedefs -Wno-error=ignored-attributes -Wno-error=int-in-bool-context -Wno-error=parentheses -Wno-error=address -Wno-ignored-qualifiers -Wno-ignored-attributes -Wno-parentheses -DNDEBUG ) + set(XPU_CXX_FLAGS -fforce-enable-int128 -Wno-error=pessimizing-move -Wno-error=constant-conversion -Wno-error=c++11-narrowing -Wno-error=shift-count-overflow -Wno-error=unused-local-typedef -Wno-error=deprecated-declarations -Wno-deprecated-declarations -std=c++14 -m64 -fPIC -fno-omit-frame-pointer -Wall -Wno-inconsistent-missing-override -Wextra -Wnon-virtual-dtor -Wdelete-non-virtual-dtor -Wno-unused-parameter -Wno-unused-function -Wno-error=unused-local-typedefs -Wno-error=ignored-attributes -Wno-error=int-in-bool-context -Wno-error=parentheses -Wno-error=address -Wno-ignored-qualifiers -Wno-ignored-attributes -Wno-parentheses -DNDEBUG ) #include path get_property(dirs DIRECTORY ${CMAKE_SOURCE_DIR} PROPERTY INCLUDE_DIRECTORIES) @@ -127,9 +127,11 @@ macro(compile_kernel COMPILE_ARGS) kernel_build/${kernel_name}.bin.o COMMAND ${CMAKE_COMMAND} -E make_directory kernel_build + COMMAND + cp ${kernel_path}/${kernel_name}.kps kernel_build/${kernel_name}.xpu COMMAND ${XPU_CLANG} --sysroot=${CXX_DIR} -std=c++11 -D_GLIBCXX_USE_CXX11_ABI=1 ${OPT_LEVEL} -fno-builtin -mcpu=xpu2 -fPIC ${XPU_CXX_DEFINES} ${XPU_CXX_FLAGS} ${XPU_CXX_INCLUDES} - -I. -o kernel_build/${kernel_name}.bin.o.sec ${kernel_path}/${kernel_name}.xpu + -I. -o kernel_build/${kernel_name}.bin.o.sec kernel_build/${kernel_name}.xpu --xpu-device-only -c -v COMMAND ${XTDK_DIR}/bin/xpu2-elfconv kernel_build/${kernel_name}.bin.o.sec kernel_build/${kernel_name}.bin.o ${XPU_CLANG} --sysroot=${CXX_DIR} @@ -148,9 +150,11 @@ macro(compile_kernel COMPILE_ARGS) kernel_build/${kernel_name}.host.o COMMAND ${CMAKE_COMMAND} -E make_directory kernel_build + COMMAND + cp ${kernel_path}/${kernel_name}.kps kernel_build/${kernel_name}.xpu COMMAND ${XPU_CLANG} --sysroot=${CXX_DIR} -std=c++11 -D_GLIBCXX_USE_CXX11_ABI=1 ${OPT_LEVEL} -fno-builtin -mcpu=xpu2 -fPIC ${XPU_CXX_DEFINES} ${XPU_CXX_FLAGS} ${XPU_CXX_INCLUDES} - -I. -o kernel_build/${kernel_name}.host.o ${kernel_path}/${kernel_name}.xpu + -I. 
-o kernel_build/${kernel_name}.host.o kernel_build/${kernel_name}.xpu --xpu-host-only -c -v WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} @@ -185,7 +189,7 @@ macro(xpu_add_library TARGET_NAME) # Distinguish .xpu file from other files foreach(cur_xpu_src IN LISTS xpu_srcs_lists) get_filename_component(language_type_name ${cur_xpu_src} EXT) - if(${language_type_name} STREQUAL ".xpu") + if(${language_type_name} STREQUAL ".kps") list(APPEND xpu_kernel_lists ${cur_xpu_src}) else() list(APPEND cc_kernel_lists ${cur_xpu_src}) diff --git a/paddle/fluid/distributed/CMakeLists.txt b/paddle/fluid/distributed/CMakeLists.txt index 5ae2e26e87c7b33a75325f5b585ca115bd3b6308..06b0583eddf24e344b4494f17472ad4bc9c18881 100644 --- a/paddle/fluid/distributed/CMakeLists.txt +++ b/paddle/fluid/distributed/CMakeLists.txt @@ -1,3 +1,5 @@ +add_subdirectory(collective) +add_subdirectory(store) if(NOT WITH_PSCORE) add_subdirectory(fleet_executor) return() diff --git a/paddle/fluid/distributed/collective/CMakeLists.txt b/paddle/fluid/distributed/collective/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..5daaf29ae2895234374c736b39b2bacf50051562 --- /dev/null +++ b/paddle/fluid/distributed/collective/CMakeLists.txt @@ -0,0 +1,5 @@ +cc_library(processgroup SRCS ProcessGroup.cc DEPS pten pten_api eager_api) + +if(WITH_NCCL) + cc_library(processgroup_nccl SRCS ProcessGroupNCCL.cc DEPS place cuda_stream enforce collective_helper device_context pten pten_api eager_api) +endif() diff --git a/paddle/fluid/distributed/collective/NCCLTools.h b/paddle/fluid/distributed/collective/NCCLTools.h new file mode 100644 index 0000000000000000000000000000000000000000..f30b96e72d4536b0773c9b69b6cb90b2c8c2dc87 --- /dev/null +++ b/paddle/fluid/distributed/collective/NCCLTools.h @@ -0,0 +1,198 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include + +#include "boost/variant.hpp" +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/platform/cuda_device_guard.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/dynload/nccl.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace distributed { + +#define NCCLCHECK(cmd) \ + do { \ + ncclResult_t r = cmd; \ + if (r != ncclSuccess) { \ + printf("Failed, NCCL error %s:%d '%s'\n", __FILE__, __LINE__, \ + platform::dynload::ncclGetErrorString(r)); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + +// NOTE(shenliang03): EventManager are movable not copyable CudaEvent wrapper. +// EventManage is different from paddle::platform::CudaEvent. +// It uses lazy initialization and is only created when the +// Record() method is called for the first time; it also monitors +// device information to ensure that recorded stream and event +// are on the same device. 
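A rough usage sketch for the EventManager declared just below (compute_ctx and comm_ctx are hypothetical CUDADeviceContext instances on the same device, not part of this header):

    EventManager ev;
    ev.Record(compute_ctx);   // first Record() lazily creates the event, then records compute_ctx's stream
    ev.Block(comm_ctx);       // comm_ctx's stream waits for the recorded work
    if (ev.Query()) {
      // the recorded work has already finished on the device
    }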
+ +class EventManager { + public: + EventManager() {} + explicit EventManager(unsigned int flags) : flags_{flags} {} + + ~EventManager() { + if (is_created_) { + platform::CUDADeviceGuard guard(device_index_); + cudaEventDestroy(event_); + } + } + + EventManager(const EventManager&) = delete; + EventManager& operator=(const EventManager&) = delete; + + EventManager(EventManager&& other) { + std::swap(flags_, other.flags_); + std::swap(is_created_, other.is_created_); + std::swap(device_index_, other.device_index_); + std::swap(event_, other.event_); + } + + EventManager& operator=(EventManager&& other) { + std::swap(flags_, other.flags_); + std::swap(is_created_, other.is_created_); + std::swap(device_index_, other.device_index_); + std::swap(event_, other.event_); + return *this; + } + + bool IsCreated() const { return is_created_; } + bool DeviceId() const { return device_index_; } + gpuEvent_t GetRawCudaEvent() const { return event_; } + + void Record(const paddle::platform::CUDADeviceContext& ctx) { + auto device_index = ctx.GetPlace().device; + if (!is_created_) { + CreateEvent(device_index); + } + PADDLE_ENFORCE_EQ(device_index, device_index_, + platform::errors::PreconditionNotMet( + "CUDADeviceContext's device %d does not match" + "Event's device %d", + device_index, device_index_)); + + platform::CUDADeviceGuard guard(device_index_); + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event_, ctx.stream())); + } + + bool Query() const { + gpuError_t err = cudaEventQuery(event_); + if (err == cudaSuccess) { + return true; + } else if (err == cudaErrorNotReady) { + return false; + } else { + PADDLE_ENFORCE_GPU_SUCCESS(err); + return false; + } + } + + void Synchronize() const { + if (is_created_) { + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventSynchronize(event_)); + } + } + + void Block(const paddle::platform::CUDADeviceContext& ctx) const { + if (is_created_) { + auto device_index = ctx.GetPlace().device; + PADDLE_ENFORCE_EQ(device_index, device_index_, + platform::errors::PreconditionNotMet( + "CUDADeviceContext's device %d does not match" + "Event's device %d", + device_index, device_index_)); + platform::CUDADeviceGuard guard(device_index_); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamWaitEvent(ctx.stream(), event_, 0)); + } + } + + private: + unsigned int flags_ = cudaEventDefault; + bool is_created_{false}; + gpuEvent_t event_{}; + int8_t device_index_{0}; + + private: + void CreateEvent(int device_index) { + device_index_ = device_index; + platform::CUDADeviceGuard guard(device_index); + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventCreateWithFlags(&event_, flags_)); + is_created_ = true; + } +}; + +// NOTE(shenliang03): NCCLCommManager is more lightweight than +// platform::NCCLComm + +class NCCLCommManager { + public: + explicit NCCLCommManager(ncclComm_t ncclComm) : nccl_comm_(ncclComm) {} + + NCCLCommManager() : NCCLCommManager(nullptr) {} + + ~NCCLCommManager() noexcept { + std::unique_lock lock(mutex_); + if (nccl_comm_) { + platform::dynload::ncclCommDestroy(nccl_comm_); + } + } + + static std::shared_ptr Create(int num_ranks, int rank, + ncclUniqueId comm_id) { + auto nccl_manager = std::make_shared(); + NCCLCHECK(platform::dynload::ncclCommInitRank(&(nccl_manager->nccl_comm_), + num_ranks, comm_id, rank)); + + nccl_manager->nccl_id_ = comm_id; + nccl_manager->rank_ = rank; + return nccl_manager; + } + + ncclUniqueId GetNcclId() const { + std::unique_lock lock(mutex_); + return nccl_id_; + } + + ncclComm_t GetNcclComm() const { + std::unique_lock lock(mutex_); + return nccl_comm_; + } + + 
NCCLCommManager(const NCCLCommManager&) = delete; + NCCLCommManager& operator=(const NCCLCommManager&) = delete; + NCCLCommManager& operator=(NCCLCommManager&& other) = delete; + + NCCLCommManager(NCCLCommManager&& other) { + std::unique_lock lock(other.mutex_); + std::swap(nccl_comm_, other.nccl_comm_); + } + + protected: + ncclComm_t nccl_comm_; + ncclUniqueId nccl_id_; + int rank_; + mutable std::mutex mutex_; +}; + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/collective/ProcessGroup.cc b/paddle/fluid/distributed/collective/ProcessGroup.cc new file mode 100644 index 0000000000000000000000000000000000000000..42ca3bd5f5be49e72662d563ba6e20f3097840ef --- /dev/null +++ b/paddle/fluid/distributed/collective/ProcessGroup.cc @@ -0,0 +1,40 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/distributed/collective/ProcessGroup.h" + +namespace paddle { +namespace distributed { + +ProcessGroup::Task::Task(int rank, const std::vector& inputTensors, + CommType comm_type) + : rank_(rank), comm_type_(comm_type) {} + +ProcessGroup::Task::~Task() = default; + +bool ProcessGroup::Task::IsCompleted() { + std::lock_guard lock(mutex_); + return is_completed_; +} + +bool ProcessGroup::Task::Wait(std::chrono::milliseconds timeout) { + return false; +} + +void ProcessGroup::Task::Synchronize() {} + +ProcessGroup::ProcessGroup(int rank, int size) : rank_(rank), size_(size) {} + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/collective/ProcessGroup.h b/paddle/fluid/distributed/collective/ProcessGroup.h new file mode 100644 index 0000000000000000000000000000000000000000..dde8622d9007e1372739d0fedde4938f85eda323 --- /dev/null +++ b/paddle/fluid/distributed/collective/ProcessGroup.h @@ -0,0 +1,108 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
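As an illustrative sketch of how the NCCLCommManager above is meant to be used (rank and world_size are assumed to come from the surrounding process group; the unique id has to be shared to every rank out of band, e.g. via the id broadcast added in ProcessGroupNCCL.cc below):

    ncclUniqueId id;
    if (rank == 0) {
      PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGetUniqueId(&id));
    }
    // ... the id is exchanged so that every process holds the same value ...
    auto comm = NCCLCommManager::Create(world_size, rank, id);
    ncclComm_t raw_comm = comm->GetNcclComm();  // handed to ncclAllReduce and friends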
+ +#pragma once + +#include +#include +#include +#include + +#include "paddle/fluid/distributed/collective/Types.h" +#include "paddle/fluid/eager/api/utils/tensor_utils.h" + +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/platform/enforce.h" + +constexpr auto kWaitTimeout = std::chrono::milliseconds(0); + +namespace paddle { +namespace distributed { + +using Tensor = paddle::experimental::Tensor; + +enum class CommType : std::uint8_t { + BROADCAST = 0, + ALLREDUCE = 1, + ALLREDUCE_SPARSE = 2, // TODO(shenliang03): to support sparse in allreduce + REDUCE = 3, + ALLGATHER = 4, + GATHER = 5, + SCATTER = 6, + REDUCE_SCATTER = 7, + ALLTOALL = 8, + SEND = 9, + RECV = 10, + BARRIER = 11, + UNKNOWN = 100, +}; + +struct ProcessGroupStrategy { + int nranks_{1}; + int local_rank_{0}; + std::vector trainer_endpoints_{}; + std::string current_endpoint_{""}; + int nrings_{1}; +}; + +class ProcessGroup { + public: + class Task { + public: + Task(int rank, const std::vector& inputTensors, + CommType opType = CommType::UNKNOWN); + + virtual ~Task(); + virtual bool IsCompleted(); + virtual bool Wait(std::chrono::milliseconds timeout = kWaitTimeout); + virtual void Synchronize(); + + protected: + const int rank_; + CommType comm_type_; + std::mutex mutex_; + bool is_completed_ = false; + }; + + explicit ProcessGroup(int rank, int size); + virtual ~ProcessGroup() {} + + int GetRank() const { return rank_; } + + int GetSize() const { return size_; } + + virtual const std::string GetBackendName() const = 0; + + virtual std::shared_ptr AllReduce( + std::vector& /* tensors */, + const AllreduceOptions& = AllreduceOptions()) { + PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support allreduce", GetBackendName())); + } + + virtual std::shared_ptr Broadcast( + std::vector& /* tensors */, + const BroadcastOptions& = BroadcastOptions()) { + PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support allreduce", GetBackendName())); + } + + protected: + const int rank_; + const int size_; +}; + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc new file mode 100644 index 0000000000000000000000000000000000000000..fe2325423b460d7b42e08b03cf9b083bc94fc7b6 --- /dev/null +++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc @@ -0,0 +1,321 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
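A minimal caller-side sketch for the ProcessGroup interface above (pg stands for any concrete backend, such as the NCCL implementation below; t is assumed to be a GPU Tensor prepared by the caller):

    std::vector<Tensor> tensors = {t};
    AllreduceOptions opts;
    opts.reduce_op = ReduceOp::SUM;
    auto task = pg->AllReduce(tensors, opts);  // returns a ProcessGroup::Task
    task->Wait();                              // wait for completion (kWaitTimeout is the default)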
+ +#include "paddle/fluid/distributed/collective/ProcessGroupNCCL.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" + +DECLARE_bool(nccl_blocking_wait); +DECLARE_bool(use_stream_safe_cuda_allocator); + +constexpr int64_t kWaitBlockTImeout = 10; + +namespace paddle { +namespace distributed { + +static ncclRedOp_t ToNCCLRedType(ReduceOp reduction) { + static const std::map red_type = { + {ReduceOp::MIN, ncclMin}, + {ReduceOp::MAX, ncclMax}, + {ReduceOp::SUM, ncclSum}, + {ReduceOp::PRODUCT, ncclProd}, + }; + auto it = red_type.find(reduction); + PADDLE_ENFORCE_EQ(it != red_type.end(), true, + platform::errors::InvalidArgument( + "Invalid nccl reduction. Must be ncclMin | ncclMax | " + "ncclProd | ncclSum")); + return it->second; +} + +std::string SerializeNCCLUniqueId(const ncclUniqueId& ncclID) { + const uint8_t* bytes = reinterpret_cast(&ncclID); + std::ostringstream oss; + for (auto i = 0; i < NCCL_UNIQUE_ID_BYTES; ++i) { + oss << std::hex << static_cast(bytes[i]); + } + return oss.str(); +} + +// Get the list of devices from list of tensors +std::vector GetPlaceList(const std::vector& tensors) { + std::vector places; + places.reserve(tensors.size()); + for (auto& tensor : tensors) { + places.push_back(tensor.inner_place()); + } + return places; +} + +// Get the deviceList String from the list of devices +std::string GetKeyFromPlaces(const std::vector& places) { + std::string placeList; + for (auto& place : places) { + std::stringstream tmp; + tmp << place; + if (placeList.empty()) { + placeList += tmp.str(); + } else { + placeList += "," + tmp.str(); + } + } + return placeList; +} + +bool CheckTensorsInCudaPlace(const std::vector& tensors) { + return std::all_of(tensors.cbegin(), tensors.cend(), [&](const Tensor& t) { + return t.place() == PlaceType::kGPU; + }); +} + +void SyncDefaultStream( + const std::vector& places, + std::vector& ncclEvents, // NOLINT + std::vector>& dev_ctx) { // NOLINT + for (size_t i = 0; i < places.size(); ++i) { + auto* default_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(places[i])); + ncclEvents[i].Record(*dev_ctx[i]); + ncclEvents[i].Block(*default_ctx); + } +} + +std::shared_ptr ProcessGroupNCCL::CreateTask( + std::vector places, int rank, CommType comm_type, + const std::vector& inputs) { + return std::make_shared(places, rank, comm_type, + inputs); +} + +ProcessGroupNCCL::NCCLTask::NCCLTask(const std::vector& places, int rank, + CommType CommType, + const std::vector& inputs) + : Task(rank, inputs, CommType), places_(places) { + control_events_.resize(places.size()); + ncclComms_.resize(places.size()); +} + +ProcessGroupNCCL::NCCLTask::~NCCLTask() {} + +void ProcessGroupNCCL::NCCLTask::SetOutputs( + std::vector& outputs) { // NOLINT + outputs_ = std::make_shared>(outputs); +} + +void ProcessGroupNCCL::NCCLTask::SynchronizeStreams() { + for (size_t i = 0; i < places_.size(); ++i) { + auto* default_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(places_[i])); + default_ctx->WaitEvent(control_events_[i].GetRawCudaEvent()); + } +} + +bool ProcessGroupNCCL::NCCLTask::IsCompleted() { + for (size_t i = 0; i < places_.size(); ++i) { + if (!control_events_[i].Query()) { + return false; + } + } + + return true; +} + +// TODO(sheniang03): Add timeout for wait, now timeout unused +bool ProcessGroupNCCL::NCCLTask::Wait(std::chrono::milliseconds timeout) { + SynchronizeStreams(); + if (FLAGS_nccl_blocking_wait) { + // NOTE(shenliang03): It will block host for sync + while (!IsCompleted()) { + 
std::this_thread::sleep_for(std::chrono::milliseconds(kWaitBlockTImeout)); + } + } + return true; +} + +// Same as Wait +void ProcessGroupNCCL::NCCLTask::Synchronize() { Wait(kWaitTimeout); } + +ProcessGroupNCCL::ProcessGroupNCCL(const ProcessGroupStrategy& strategy, + int rank, int size) + : ProcessGroup(rank, size), strategy_(strategy) {} + +void ProcessGroupNCCL::BcastNCCLId( + std::vector& nccl_ids, // NOLINT + int root, int server_fd) { + if (strategy_.local_rank_ == root) { + std::vector other_trainers; + for (auto& ep : strategy_.trainer_endpoints_) { + if (ep != strategy_.current_endpoint_) { + other_trainers.push_back(ep); + } + } + platform::SendBroadCastCommID(other_trainers, &nccl_ids); + } else { + platform::RecvBroadCastCommID(server_fd, strategy_.current_endpoint_, + &nccl_ids); + } +} + +void ProcessGroupNCCL::BroadcastUniqueNCCLID( + std::vector& nccl_ids) { // NOLINT + + int server_fd = -1; + if (rank_ != 0) { + server_fd = platform::SocketServer::GetInstance(strategy_.current_endpoint_) + .socket(); + } + BcastNCCLId(nccl_ids, 0, server_fd); +} + +// create NCCLManager cache for places_key +void ProcessGroupNCCL::CreateNCCLManagerCache( + const std::string& places_key, const std::vector& places) { + PADDLE_ENFORCE_EQ(places_key.empty(), false, + platform::errors::PreconditionNotMet( + "Not able to create/get the NCCL Communicator since " + "the GPU place are not known")); + + std::vector> nccl_comms; + nccl_comms.resize(places.size()); + + // using vector just for broadcast + std::vector nccl_ids; + nccl_ids.resize(1); + auto& nccl_id = nccl_ids.front(); + + if (rank_ == 0) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGetUniqueId(&nccl_id)); + } + BroadcastUniqueNCCLID(nccl_ids); + + VLOG(3) << "init nccl rank: " << strategy_.local_rank_ + << ", nranks: " << strategy_.nranks_ << ", place: " << places_key + << ", nccl uniqueid: " << SerializeNCCLUniqueId(nccl_id); + + std::vector> dev_ctx; + dev_ctx.resize(places.size()); + + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart()); + + for (size_t i = 0; i < places.size(); ++i) { + platform::CUDADeviceGuard guard(places[i]); + nccl_comms[i] = NCCLCommManager::Create(GetSize(), GetRank(), nccl_id); + dev_ctx[i].reset(new CUDADeviceContext(places[i])); + } + + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd()); + + std::vector events; + events.resize(places.size()); + + // These caches will be useful to process sync/wait/communicate + places_to_events_.emplace(places_key, std::move(events)); + places_to_ncclcomm_.emplace(places_key, std::move(nccl_comms)); + places_to_ctx_.emplace(places_key, std::move(dev_ctx)); +} + +template +std::shared_ptr ProcessGroupNCCL::Collective( + std::vector& inputs, std::vector& outputs, Fn fn, + CommType op_type) { + const auto places = GetPlaceList(inputs); + const auto key = GetKeyFromPlaces(places); + + { + std::lock_guard lock(mutex_); + if (places_to_ncclcomm_.find(key) == places_to_ncclcomm_.end()) { + CreateNCCLManagerCache(key, places); + } + } + + auto& nccl_comms = places_to_ncclcomm_[key]; + + SyncDefaultStream(places, places_to_events_[key], places_to_ctx_[key]); + + auto task = CreateTask(places, rank_, op_type, inputs); + task->SetOutputs(outputs); + + // construct uninitialize guard for device + platform::CUDADeviceGuard cuda_guard; + + if (FLAGS_use_stream_safe_cuda_allocator) { + for (size_t i = 0; i < inputs.size(); ++i) { + cuda_guard.SetDevice(places[i]); + auto dense_tensor = + std::dynamic_pointer_cast(inputs[i].impl()); + 
memory::RecordStream(dense_tensor->Holder(), + places_to_ctx_[key][i]->stream()); + } + } + + { + platform::NCCLGroupGuard nccl_guard; + for (size_t i = 0; i < inputs.size(); ++i) { + cuda_guard.SetDevice(places[i]); + const auto& nccl_stream = places_to_ctx_[key][i]->stream(); + fn(inputs[i], outputs[i], nccl_comms[i]->GetNcclComm(), nccl_stream); + } + } + + for (size_t i = 0; i < inputs.size(); ++i) { + cuda_guard.SetDevice(places[i]); + task->control_events_[i].Record(*places_to_ctx_[key][i]); + } + return task; +} + +std::shared_ptr ProcessGroupNCCL::AllReduce( + std::vector& tensors, const AllreduceOptions& opts) { + PADDLE_ENFORCE_EQ( + CheckTensorsInCudaPlace(tensors), true, + platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); + return Collective( + tensors, tensors, + [&](const Tensor& input, Tensor& output, ncclComm_t comm, + const gpuStream_t& stream) { + auto input_tensor = + std::dynamic_pointer_cast(input.impl()); + auto output_tensor = + std::dynamic_pointer_cast(output.impl()); + return platform::dynload::ncclAllReduce( + input_tensor->data(), output_tensor->data(), input_tensor->numel(), + platform::ToNCCLDataType(input.type()), + ToNCCLRedType(opts.reduce_op), comm, stream); + }, + CommType::ALLREDUCE); +} + +std::shared_ptr ProcessGroupNCCL::Broadcast( + std::vector& tensors, const BroadcastOptions& opts) { + PADDLE_ENFORCE_EQ( + CheckTensorsInCudaPlace(tensors), true, + platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); + + return Collective( + tensors, tensors, + [&](Tensor& input, Tensor& output, ncclComm_t comm, + const gpuStream_t& stream) { + const auto root = opts.source_rank * tensors.size() + opts.source_root; + auto input_tensor = + std::dynamic_pointer_cast(input.impl()); + auto output_tensor = + std::dynamic_pointer_cast(output.impl()); + return platform::dynload::ncclBcast( + input_tensor->data(), input_tensor->numel(), + platform::ToNCCLDataType(input.type()), root, comm, stream); + }, + CommType::BROADCAST); +} + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h new file mode 100644 index 0000000000000000000000000000000000000000..9f06566d1c86386acad3758be283e716f46c1951 --- /dev/null +++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h @@ -0,0 +1,126 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
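Putting the pieces together, a two-rank job might construct and drive this backend roughly as follows (the endpoints, rank and tensors variables are illustrative; in practice the strategy is filled in by the launcher):

    ProcessGroupStrategy strategy;
    strategy.nranks_ = 2;
    strategy.local_rank_ = rank;  // 0 or 1
    strategy.trainer_endpoints_ = {"127.0.0.1:6170", "127.0.0.1:6171"};
    strategy.current_endpoint_ = strategy.trainer_endpoints_[rank];

    auto pg = std::make_shared<ProcessGroupNCCL>(strategy, rank, /*size=*/2);
    auto task = pg->AllReduce(tensors);  // NCCL comms and streams are created lazily per place key
    task->Wait();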
+ +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "paddle/fluid/distributed/collective/ProcessGroup.h" +#include "paddle/fluid/platform/cuda_device_guard.h" +#include "paddle/fluid/platform/device_context.h" + +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/gen_comm_id_helper.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/platform/stream/cuda_stream.h" + +#if defined(PADDLE_WITH_NCCL) +#include "paddle/fluid/distributed/collective/NCCLTools.h" +#include "paddle/fluid/platform/dynload/nccl.h" +#endif + +constexpr const char* NCCL_BACKEND_NAME = "NCCL"; + +namespace paddle { +namespace distributed { + +using Place = paddle::platform::Place; +using CUDAStream = platform::stream::CUDAStream; +using CUDADeviceContext = paddle::platform::CUDADeviceContext; + +class ProcessGroupNCCL : public ProcessGroup { + public: + class NCCLTask : public ProcessGroup::Task, + public std::enable_shared_from_this { + public: + NCCLTask(const std::vector& places, int rank, CommType CommType, + const std::vector& inputs); + + bool IsCompleted(); + + void SynchronizeStreams(); + + bool Wait(std::chrono::milliseconds timeout = kWaitTimeout); + + void Synchronize(); + + void SetOutputs(std::vector& outputs); // NOLINT + + virtual ~NCCLTask(); + + std::vector control_events_; + + protected: + std::vector places_; + std::vector> ncclComms_; + std::shared_ptr> outputs_; + + private: + }; + + ProcessGroupNCCL(const ProcessGroupStrategy& strategy, int rank, int size); + + const std::string GetBackendName() const override { + return std::string(NCCL_BACKEND_NAME); + } + + std::shared_ptr AllReduce( + std::vector& tensors, + const AllreduceOptions& = AllreduceOptions()) override; + + std::shared_ptr Broadcast( + std::vector& tensors, + const BroadcastOptions& = BroadcastOptions()) override; + + protected: + virtual std::shared_ptr CreateTask( + std::vector places, int rank, CommType opType, + const std::vector& inputs); + + protected: + ProcessGroupStrategy strategy_; + std::shared_ptr nccl_comm_; + std::mutex mutex_; + std::unordered_map>> + places_to_ncclcomm_; + + std::unordered_map> places_to_events_; + + std::unordered_map>> + places_to_ctx_; + + private: + void BcastNCCLId(std::vector& nccl_ids, int root, // NOLINT + int server_fd); + + void BroadcastUniqueNCCLID(std::vector& nccl_ids); // NOLINT + + template + std::shared_ptr Collective( + std::vector& inputs, // NOLINT + std::vector& outputs, // NOLINT + Fn fn, CommType op_type); + + void CreateNCCLManagerCache(const std::string& places_key, + const std::vector& places); +}; + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/collective/Types.h b/paddle/fluid/distributed/collective/Types.h new file mode 100644 index 0000000000000000000000000000000000000000..654d06686957bd4242fa474c215ccf7c117e5910 --- /dev/null +++ b/paddle/fluid/distributed/collective/Types.h @@ -0,0 +1,36 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include + +namespace paddle { +namespace distributed { + +// TODO(shenliang03): To support AVG for reduce +enum class ReduceOp : std::uint8_t { SUM = 0, AVG, MAX, MIN, PRODUCT }; + +struct AllreduceOptions { + ReduceOp reduce_op = ReduceOp::SUM; +}; + +struct BroadcastOptions { + int source_rank = 0; + int source_root = 0; +}; + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/fleet_executor/dist_model.cc b/paddle/fluid/distributed/fleet_executor/dist_model.cc index e684d75bfb8320df06813bbe4e61fcd7d0c9d934..c1408130b5e577e54a4062316a4868701338864d 100644 --- a/paddle/fluid/distributed/fleet_executor/dist_model.cc +++ b/paddle/fluid/distributed/fleet_executor/dist_model.cc @@ -52,6 +52,8 @@ bool LoadDataFromDistModelTensor(const DistModelTensor &input_data, input_tensor_ptr = input_tensor->mutable_data(dims, place); } else if (input_data.dtype == DistModelDataType::INT32) { input_tensor_ptr = input_tensor->mutable_data(dims, place); + } else if (input_data.dtype == DistModelDataType::FLOAT16) { + input_tensor_ptr = input_tensor->mutable_data(dims, place); } else { LOG(ERROR) << "unsupported feed type " << input_data.dtype; return false; @@ -412,6 +414,8 @@ bool DistModel::PrepareFeedAndFetch() { feeds_to_dtype_.insert({var_name, DistModelDataType::INT32}); } else if (real_var->GetDataType() == framework::proto::VarType::INT64) { feeds_to_dtype_.insert({var_name, DistModelDataType::INT64}); + } else if (real_var->GetDataType() == framework::proto::VarType::FP16) { + feeds_to_dtype_.insert({var_name, DistModelDataType::FLOAT16}); } else { LOG(ERROR) << "Don't support feed var dtype for: " << real_var->GetDataType(); @@ -503,9 +507,13 @@ bool DistModel::FetchResults(std::vector *output_data, } else if (type == framework::proto::VarType::INT32) { rst = FetchResult(fetch, output); output->dtype = DistModelDataType::INT32; + } else if (type == framework::proto::VarType::FP16) { + rst = FetchResult(fetch, output); + output->dtype = DistModelDataType::FLOAT16; } else { LOG(ERROR) << "DistModel meets unknown fetch data type. 
DistModel only " - "supports float32, int64 and int32 fetch type for now."; + "supports float32, float16, int64 and int32 fetch type " + "for now."; } if (!rst) { LOG(ERROR) << "DistModel fails to fetch result " << idx_to_fetches_[idx]; diff --git a/paddle/fluid/distributed/fleet_executor/dist_model_tensor_wrapper.h b/paddle/fluid/distributed/fleet_executor/dist_model_tensor_wrapper.h index 6bdd858d6cf9ed78c1a655c28ed58574374ce3fb..dc8b2596803e074a7ca8cea069bf7d93ef1615e7 100644 --- a/paddle/fluid/distributed/fleet_executor/dist_model_tensor_wrapper.h +++ b/paddle/fluid/distributed/fleet_executor/dist_model_tensor_wrapper.h @@ -15,6 +15,7 @@ #pragma once #include #include +#include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/macros.h" namespace paddle { @@ -40,6 +41,11 @@ constexpr DistModelDataType DistModelGetDtype() { return DistModelDataType::FLOAT32; } +template <> +constexpr DistModelDataType DistModelGetDtype() { + return DistModelDataType::FLOAT16; +} + class DistModelDataBuf { public: explicit DistModelDataBuf(size_t length) diff --git a/paddle/fluid/distributed/ps/service/communicator/communicator_common.h b/paddle/fluid/distributed/ps/service/communicator/communicator_common.h index 66784c53c0026afa988119a506ef065181b0cb4d..27b282a945d1521c0a863bb0bb176c9492296b07 100644 --- a/paddle/fluid/distributed/ps/service/communicator/communicator_common.h +++ b/paddle/fluid/distributed/ps/service/communicator/communicator_common.h @@ -31,7 +31,8 @@ struct CommContext { const std::vector &origin_names, int id, bool merge_add_ = true, bool is_sparse_ = true, bool is_distributed_ = false, int table_id_ = -1, - bool is_tensor_table_ = false) + bool is_tensor_table_ = false, bool is_datanorm_table_ = false, + int64_t program_id_ = -1) : var_name(name), splited_varnames(names), epmap(emap), @@ -42,7 +43,9 @@ struct CommContext { is_sparse(is_sparse_), is_distributed(is_distributed_), table_id(table_id_), - is_tensor_table(is_tensor_table_) {} + program_id(program_id_), + is_tensor_table(is_tensor_table_), + is_datanorm_table(is_datanorm_table_) {} CommContext(const CommContext &ctx) { var_name = ctx.var_name; @@ -55,7 +58,9 @@ struct CommContext { origin_varnames = ctx.origin_varnames; is_distributed = ctx.is_distributed; table_id = ctx.table_id; + program_id = ctx.program_id; is_tensor_table = ctx.is_tensor_table; + is_datanorm_table = ctx.is_datanorm_table; } std::string print() const { @@ -78,7 +83,9 @@ struct CommContext { ss << " is_sparse: " << is_sparse; ss << " is_distributed: " << is_distributed << "\n"; ss << " table_id: " << table_id << "\n"; + ss << " program_id: " << program_id << "\n"; ss << " is_tensor_table: " << is_tensor_table << "\n"; + ss << " is_datanorm_table: " << is_datanorm_table << "\n"; return ss.str(); } @@ -93,7 +100,9 @@ struct CommContext { bool is_sparse; bool is_distributed; int table_id; + int64_t program_id; bool is_tensor_table; + bool is_datanorm_table; }; } // namespace distributed diff --git a/paddle/fluid/distributed/store/CMakeLists.txt b/paddle/fluid/distributed/store/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..1fde447d97dd99783a77a9a2ad89b4457b55ca74 --- /dev/null +++ b/paddle/fluid/distributed/store/CMakeLists.txt @@ -0,0 +1 @@ +cc_library(tcp_store SRCS tcp_store.cc tcp_utils.cc DEPS enforce glog) diff --git a/paddle/fluid/distributed/store/store.h b/paddle/fluid/distributed/store/store.h new file mode 100644 index 
0000000000000000000000000000000000000000..2673314d222d2b32e42c42a3a94df71a1887914a --- /dev/null +++ b/paddle/fluid/distributed/store/store.h @@ -0,0 +1,43 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include + +#include "paddle/fluid/distributed/store/tcp_utils.h" + +namespace paddle { +namespace distributed { + +class Store { + public: + Store() = delete; + explicit Store(const std::chrono::seconds& timeout) : _timeout(timeout) {} + virtual ~Store() = default; + + virtual int64_t add(const std::string& key, int64_t value) = 0; + virtual std::vector get(const std::string& key) = 0; + virtual void wait(const std::string& key) = 0; + + virtual const std::chrono::seconds& timeout() const { return _timeout; } + + private: + std::chrono::seconds _timeout; +}; + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/store/tcp_store.cc b/paddle/fluid/distributed/store/tcp_store.cc new file mode 100644 index 0000000000000000000000000000000000000000..de85ac0d910e93257a308052ca1fcf193680a183 --- /dev/null +++ b/paddle/fluid/distributed/store/tcp_store.cc @@ -0,0 +1,272 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
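A rough sketch of how this Store interface is consumed for rendezvous (TCPStore, declared further below, is the concrete implementation; host, port and rank are illustrative):

    // Rank 0 additionally hosts the master daemon; every rank connects as a client.
    TCPStore store("127.0.0.1", 6170, /*is_master=*/rank == 0, /*num_workers=*/2);
    store.add("barrier", 1);                         // atomically adds to the counter behind the key
    store.wait("barrier");                           // blocks until the key exists on the master
    std::vector<uint8_t> value = store.get("barrier");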
+ +#include +#include +#include + +#include "paddle/fluid/distributed/store/tcp_store.h" +#include "paddle/fluid/distributed/store/tcp_utils.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace distributed { + +namespace detail { + +constexpr int INFTIME = -1; + +std::unique_ptr MasterDaemon::start(SocketType socket) { + return std::make_unique(socket); +} + +MasterDaemon::MasterDaemon(SocketType socket) : _listen_socket(socket) { + _background_thread = std::thread{&MasterDaemon::run, this}; +} + +MasterDaemon::~MasterDaemon() { + _background_thread.join(); + tcputils::close_socket(_listen_socket); + for (SocketType socket : _sockets) { + tcputils::close_socket(socket); + } +} + +void MasterDaemon::_do_add(SocketType socket) { + int64_t new_value{}; + std::string key = tcputils::receive_string(socket); + new_value = tcputils::receive_value(socket); + std::vector old_value; + auto it = _store.find(key); + if (it != _store.end()) { + old_value = it->second; + char* buffer = reinterpret_cast(it->second.data()); + size_t len = old_value.size(); + new_value += std::stoll(std::string(buffer, len)); + } + + std::string new_value_str = std::to_string(new_value); + _store[key] = + std::vector(new_value_str.begin(), new_value_str.end()); + VLOG(3) << "TCPStore: new value (" << new_value << ") for key (" << key + << ")."; + tcputils::send_value(socket, new_value); +} + +void MasterDaemon::_do_get(SocketType socket) { + std::string key = tcputils::receive_string(socket); + auto iter = _store.find(key); + PADDLE_ENFORCE_NE( + iter, _store.end(), + platform::errors::InvalidArgument("Key %s not found in TCPStore.", key)); + std::vector value = iter->second; + VLOG(3) << "TCPStore: value (" + << std::stoll(std::string(reinterpret_cast(value.data()), + value.size())) + << ") for key (" << key << ")."; + tcputils::send_vector(socket, value); +} + +void MasterDaemon::_do_stop(SocketType socket) { + ReplyType value = ReplyType::STOP_WAIT; + _stop = true; + tcputils::send_value(socket, value); +} + +void MasterDaemon::_do_wait(SocketType socket) { + std::string key = tcputils::receive_string(socket); + auto iter = _store.find(key); + auto reply = ReplyType::STOP_WAIT; + if (iter == _store.end()) { + reply = ReplyType::WAITING; + } + VLOG(3) << "TCPStore: wait reply (" << static_cast(reply) + << ") for key (" << key << ")."; + tcputils::send_value(socket, reply); +} + +void MasterDaemon::run() { + std::vector fds; +#ifdef _WIN32 + fds.push_back({_listen_socket, POLLIN}); +#else + fds.push_back({.fd = _listen_socket, .events = POLLIN, .revents = 0}); +#endif + + while (!_stop) { + for (size_t i = 0; i < fds.size(); i++) { + fds[i].revents = 0; + } + +#ifdef _WIN32 + ::WSAPoll(fds.data(), fds.size(), INFTIME); +#else + ::poll(fds.data(), fds.size(), INFTIME); +#endif + + if (fds[0].revents != 0) { + auto socket = tcputils::tcp_accept(_listen_socket); + _sockets.emplace_back(socket); +#ifdef _WIN32 + fds.push_back({socket, POLLIN}); +#else + fds.push_back({.fd = socket, .events = POLLIN, .revents = 0}); +#endif + } + + for (size_t i = 1; i < fds.size(); i++) { + if (fds[i].revents == 0) { + continue; + } + + Command command = tcputils::receive_value(fds[i].fd); + VLOG(3) << "TCPStore: recv command: " << static_cast(command) << "."; + + switch (command) { + case Command::ADD: + _do_add(fds[i].fd); + break; + case Command::GET: + _do_get(fds[i].fd); + break; + case Command::WAIT: + _do_wait(fds[i].fd); + break; + case Command::STOP: + _do_stop(fds[i].fd); + break; + } + } + } +} + 
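For reference, the ADD round-trip served by the daemon above looks roughly like this from the client side (sock is assumed to be an already-connected socket; this mirrors what TCPClient and TCPStore below do rather than introducing a new code path):

    tcputils::send_value<Command>(sock, Command::ADD);      // command goes first
    tcputils::send_string(sock, "/init/");                  // length-prefixed key
    tcputils::send_value<int64_t>(sock, 1);                 // increment to add
    auto total = tcputils::receive_value<int64_t>(sock);    // daemon replies with the new sum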
+std::unique_ptr TCPServer::create(uint16_t port) { + int socket = tcputils::tcp_listen("", std::to_string(port), AF_INET); + auto server = std::make_unique(); + server->_master_daemon = MasterDaemon::start(socket); + return server; +} + +std::unique_ptr TCPClient::connect(const std::string host, + uint16_t port) { + int socket = tcputils::tcp_connect(host, std::to_string(port), AF_INET); + return std::make_unique(socket); +} + +void TCPClient::send_command_for_key(Command type, const std::string& key) { + tcputils::send_value(_socket, type); + if (key.empty()) { + return; + } + tcputils::send_string(_socket, key); +} + +template +void TCPClient::send_value(const T& value) { + tcputils::send_bytes(_socket, &value, 1); +} + +template +T TCPClient::receive_value() { + T res; + tcputils::receive_bytes(_socket, &res, 1); + return res; +} + +template +void TCPClient::send_vector(const std::vector& value) { + tcputils::send_vector(_socket, value); +} + +template +std::vector TCPClient::receive_vector() { + return tcputils::receive_vector(_socket); +} + +} // namespace detail + +TCPStore::TCPStore(std::string host, uint16_t port, bool is_master, + size_t num_workers, std::chrono::seconds timeout) + : Store(timeout), _is_master(is_master), _num_workers(num_workers) { + if (_is_master) { + _server = detail::TCPServer::create(port); + } + + _client = detail::TCPClient::connect(host, port); + waitWorkers(); +} + +void TCPStore::waitWorkers() { + if (_num_workers == 0) { + return; + } + add(_init_key, 1); + + if (_server) { + auto begin = std::chrono::steady_clock::now(); + do { + auto value = get(_init_key); + int completed = std::stoi(std::string(value.begin(), value.end())); + VLOG(3) << completed << " worker ready, total " << _num_workers; + if (completed >= _num_workers) { + break; + } + const auto elapsed = std::chrono::duration_cast( + std::chrono::steady_clock::now() - begin); + + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + if (_timeout != tcputils::kNoTimeout && elapsed > _timeout) { + PADDLE_ENFORCE_EQ( + completed, _num_workers, + platform::errors::InvalidArgument( + "TCPStore timeouted and not all workers got ready.")); + } + } while (true); + } + VLOG(3) << "TCPStore initialized."; +} + +int64_t TCPStore::add(const std::string& key, int64_t value) { + _client->send_command_for_key(Command::ADD, _key_prefix + key); + _client->send_value(value); + return _client->receive_value(); +} + +std::vector TCPStore::get(const std::string& key) { + wait(key); + _client->send_command_for_key(Command::GET, _key_prefix + key); + VLOG(3) << "TCPStore get."; + return _client->receive_vector(); +} + +void TCPStore::wait(const std::string& key) { + ReplyType reply; + do { + _client->send_command_for_key(Command::WAIT, _key_prefix + key); + + reply = _client->receive_value(); + std::this_thread::sleep_for(std::chrono::milliseconds(500)); + } while (reply != ReplyType::STOP_WAIT); +} + +TCPStore::~TCPStore() { + _client->send_command_for_key(Command::STOP, ""); + ReplyType ret = _client->receive_value(); + PADDLE_ENFORCE_EQ(ret, ReplyType::STOP_WAIT, + platform::errors::InvalidArgument( + "The reply for TCPStore destructure must be 0.")); +} + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/store/tcp_store.h b/paddle/fluid/distributed/store/tcp_store.h new file mode 100644 index 0000000000000000000000000000000000000000..cd706dd6640acf5e0b5b3714175dac7a6cecb25a --- /dev/null +++ b/paddle/fluid/distributed/store/tcp_store.h @@ -0,0 +1,114 @@ +// 
Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include + +#include "paddle/fluid/distributed/store/store.h" +#include "paddle/fluid/distributed/store/tcp_utils.h" + +namespace paddle { +namespace distributed { + +enum class ReplyType { WAITING, STOP_WAIT }; +enum class Command { ADD, GET, WAIT, STOP }; + +namespace detail { + +class MasterDaemon { + public: + static std::unique_ptr start(SocketType listen_socket); + MasterDaemon() = delete; + explicit MasterDaemon(SocketType listen_socket); + ~MasterDaemon(); + + private: + void run(); + void _do_add(SocketType socket); + void _do_wait(SocketType socket); + void _do_get(SocketType socket); + void _do_stop(SocketType socket); + SocketType _listen_socket; + std::vector _sockets; + std::unordered_map> _store; + std::thread _background_thread{}; + bool _stop = false; +}; + +class TCPServer { + public: + TCPServer() = default; + static std::unique_ptr create(std::uint16_t port); + + private: + std::unique_ptr _master_daemon; +}; + +class TCPClient { + public: + explicit TCPClient(SocketType socket) : _socket{socket} {} + static std::unique_ptr connect(const std::string host, + uint16_t port); + ~TCPClient() { tcputils::close_socket(_socket); } + void send_command_for_key(Command type, const std::string& key); + + template + void send_value(const T& value); + + template + void send_vector(const std::vector& value); + template + std::vector receive_vector(); + + template + T receive_value(); + + private: + SocketType _socket; +}; + +} // namespace detail + +class TCPStore : public Store { + public: + static constexpr std::uint16_t kDefaultPort = 6170; + explicit TCPStore(std::string host, uint16_t port = kDefaultPort, + bool is_master = false, size_t num_workers = 1, + std::chrono::seconds timeout = tcputils::kDefaultTimeout); + + ~TCPStore(); + + int64_t add(const std::string& key, int64_t value) override; + std::vector get(const std::string& key) override; + void wait(const std::string& key) override; + + private: + void waitWorkers(); + std::unique_ptr _server; + std::unique_ptr _client; + + const std::string _init_key = "init/"; + const std::string _key_prefix = "/"; + std::chrono::seconds _timeout; + bool _is_master; + int _num_workers; +}; + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/store/tcp_utils.cc b/paddle/fluid/distributed/store/tcp_utils.cc new file mode 100644 index 0000000000000000000000000000000000000000..d0561d0b9a9c5b01c32620e72d21ed562e42637e --- /dev/null +++ b/paddle/fluid/distributed/store/tcp_utils.cc @@ -0,0 +1,201 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/distributed/store/tcp_utils.h" +#include +#include +#include +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace distributed { +namespace tcputils { + +std::error_code socket_error() { +#ifdef _WIN32 + return std::error_code{::WSAGetLastError(), std::generic_category()}; +#else + return std::error_code{errno, std::generic_category()}; +#endif +} + +void close_socket(SocketType socket) { +#ifdef _WIN32 + ::closesocket(socket); +#else + ::close(socket); +#endif +} + +::addrinfo* get_addr_info(const std::string host, const std::string port, + int ai_flags, int family) { + ::addrinfo hints{}, *res; + hints.ai_flags = ai_flags; + hints.ai_family = family; + hints.ai_socktype = SOCK_STREAM; + + const char* node = host.empty() ? nullptr : host.c_str(); + + int n; + n = ::getaddrinfo(node, port.c_str(), &hints, &res); + const char* gai_err = ::gai_strerror(n); + const char* proto = + (family == AF_INET ? "IPv4" : family == AF_INET6 ? "IPv6" : ""); + PADDLE_ENFORCE_EQ( + n, 0, platform::errors::InvalidArgument( + "%s network %s:%s cannot be obtained. Details: %s.", proto, + host, port, gai_err)); + + return res; +} + +void free_addr_info(::addrinfo* hint) { + PADDLE_ENFORCE_NOT_NULL( + hint, platform::errors::InvalidArgument( + "The parameter for free_addr_info cannot be null.")); + ::freeaddrinfo(hint); +} + +SocketType tcp_connect(const std::string host, const std::string port, + int family, std::chrono::seconds timeout) { + int ai_flags = AI_NUMERICSERV | AI_V4MAPPED | AI_ALL; + ::addrinfo* res = get_addr_info(host, port, ai_flags, family); + + SocketType sockfd = -1; + bool retry = true; + auto deadline = std::chrono::steady_clock::now() + timeout; + do { + for (::addrinfo* cur = res; cur != nullptr; cur = cur->ai_next) { + sockfd = ::socket(cur->ai_family, cur->ai_socktype, cur->ai_protocol); + PADDLE_ENFORCE_GT(sockfd, 0, platform::errors::InvalidArgument( + "Create socket to connect %s:%s failed. " + "Details: %s. ", + host, port, socket_error().message())); + + if (::connect(sockfd, cur->ai_addr, cur->ai_addrlen) == 0) { + retry = false; + break; + } + VLOG(0) << "Retry to connect to " << host << ":" << port + << " while the server is not yet listening."; + close_socket(sockfd); + sockfd = -1; + std::this_thread::sleep_for(kDelay); + if (timeout != kNoTimeout && + std::chrono::steady_clock::now() >= deadline) { + retry = false; + break; + } + } + + if (timeout != kNoTimeout && std::chrono::steady_clock::now() >= deadline) { + retry = false; + } + } while (retry); + + free_addr_info(res); + + PADDLE_ENFORCE_GT(sockfd, 0, + platform::errors::InvalidArgument( + "Network %s:%s cannot be connected.", host, port)); + VLOG(0) << "Successfully connected to " << host << ":" << port; + + return sockfd; +} + +SocketType tcp_listen(const std::string host, const std::string port, + int family) { + int ai_flags = AI_PASSIVE | AI_NUMERICSERV; + ::addrinfo* res = get_addr_info(host, port, ai_flags, family); + ::addrinfo* cur = res; + SocketType sockfd{}; + + std::string node = host.empty() ? 
"IP_ANY" : host; + while (cur) { + sockfd = ::socket(cur->ai_family, cur->ai_socktype, cur->ai_protocol); + if (sockfd < 0) { + VLOG(0) << "Cannot create socket on " << node << ":" << port + << ". Details: " << socket_error().message(); + cur = cur->ai_next; + continue; + } + + int on = 1; +#ifdef _WIN32 + int ret = ::setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR, + reinterpret_cast(&on), sizeof(on)); +#else + int ret = ::setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)); +#endif + if (ret < 0) { + VLOG(0) << "Set the address reuse option failed on the server."; + } + if (::bind(sockfd, res->ai_addr, res->ai_addrlen) == 0) { + break; + } + close_socket(sockfd); + sockfd = -1; + cur = cur->ai_next; + } + + PADDLE_ENFORCE_GT(sockfd, 0, + platform::errors::InvalidArgument( + "Bind network on %s:%s failedd.", node, port)); + + ::listen(sockfd, LISTENQ); + + VLOG(0) << "The server starts to listen on " << node << ":" << port; + return sockfd; +} + +SocketType tcp_accept(SocketType socket) { + ::sockaddr_storage addr_s{}; + ::socklen_t addr_len = sizeof(addr_s); + SocketType new_socket = + ::accept(socket, reinterpret_cast<::sockaddr*>(&addr_s), &addr_len); + PADDLE_ENFORCE_GT( + new_socket, 0, + platform::errors::InvalidArgument( + "The server failed to accept a new connection. Details: %s.", + socket_error().message())); +#ifndef _WIN32 + ::fcntl(new_socket, F_SETFD, FD_CLOEXEC); +#endif + auto value = 1; +#ifdef _WIN32 + ::setsockopt(new_socket, IPPROTO_TCP, TCP_NODELAY, + reinterpret_cast(&value), sizeof(value)); +#else + ::setsockopt(new_socket, IPPROTO_TCP, TCP_NODELAY, &value, sizeof(value)); +#endif + return new_socket; +} + +void send_string(SocketType socket, const std::string& s) { + std::string::size_type size = s.size(); + send_bytes(socket, &size, 1); + send_bytes(socket, s.data(), size); +} + +std::string receive_string(SocketType socket) { + std::string::size_type size; + receive_bytes(socket, &size, 1); + std::vector v(size); + receive_bytes(socket, v.data(), size); + return std::string(v.data(), v.size()); +} + +} // namespace tcputils +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/store/tcp_utils.h b/paddle/fluid/distributed/store/tcp_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..60cb3de124da3593f3d07ffadcf3b12c2deedf29 --- /dev/null +++ b/paddle/fluid/distributed/store/tcp_utils.h @@ -0,0 +1,133 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#ifdef _WIN32 +#include +#include +#pragma comment(lib, "Ws2_32.lib") +#else +#include +#include +#include +#include +#include +#include +#endif +#include +#include +#include +#include "paddle/fluid/platform/enforce.h" + +// Utility functions for TCP socket. 
+namespace paddle { +namespace distributed { + +#ifdef _WIN32 +using SocketType = SOCKET; +#else +using SocketType = int; +#endif + +namespace tcputils { + +constexpr int LISTENQ = 2048; +constexpr std::chrono::seconds kDelay = std::chrono::seconds(3); +constexpr std::chrono::seconds kNoTimeout = std::chrono::seconds::zero(); +constexpr std::chrono::seconds kDefaultTimeout = std::chrono::seconds(360); + +std::error_code socket_error(); +void close_socket(SocketType socket); +::addrinfo* get_addr_info(const std::string host, const std::string port, + int ai_flags, int family); +void free_addr_info(::addrinfo*); +SocketType tcp_connect(const std::string host, const std::string port, + int family, std::chrono::seconds timeout = kNoTimeout); +SocketType tcp_listen(const std::string host, const std::string port, + int family); +SocketType tcp_accept(SocketType socket); + +void send_string(SocketType socket, const std::string& s); +std::string receive_string(SocketType socket); + +template +void send_bytes(SocketType socket, const T* buffer, size_t len) { + size_t to_send = len * sizeof(T); + if (to_send == 0) { + return; + } + + auto ptr = reinterpret_cast(buffer); + + while (to_send > 0) { + auto byte_sent = ::send(socket, ptr, to_send, 0); + PADDLE_ENFORCE_GT(byte_sent, 0, platform::errors::InvalidArgument( + "TCP send error. Details: %s.", + socket_error().message())); + to_send -= byte_sent; + ptr += byte_sent; + } +} + +template +void receive_bytes(SocketType socket, T* buffer, size_t len) { + size_t to_recv = len * sizeof(T); + if (to_recv == 0) { + return; + } + auto ptr = reinterpret_cast(buffer); + + while (to_recv > 0) { + auto byte_received = ::recv(socket, ptr, to_recv, 0); + PADDLE_ENFORCE_GT(byte_received, 0, platform::errors::InvalidArgument( + "TCP receive error. 
Details: %s.", + socket_error().message())); + + to_recv -= byte_received; + ptr += byte_received; + } +} + +template +void send_vector(SocketType socket, const std::vector& v) { + size_t size = v.size(); + send_bytes(socket, &size, 1); + send_bytes(socket, v.data(), size); +} + +template +std::vector receive_vector(SocketType socket) { + size_t size; + receive_bytes(socket, &size, 1); + std::vector res(size); + receive_bytes(socket, res.data(), size); + return res; +} + +template +void send_value(SocketType socket, const T& v) { + send_bytes(socket, &v, 1); +} + +template +T receive_value(SocketType socket) { + T v; + receive_bytes(socket, &v, 1); + return v; +} + +} // namespace tcputils +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/eager/accumulation/accumulation_node.cc b/paddle/fluid/eager/accumulation/accumulation_node.cc index 8b6752dfec743d774c8656d36421f31e7dab9799..2e377e43ca3ec96c183a7b51830b71210d5d0290 100644 --- a/paddle/fluid/eager/accumulation/accumulation_node.cc +++ b/paddle/fluid/eager/accumulation/accumulation_node.cc @@ -25,6 +25,8 @@ #include "glog/logging.h" +namespace egr { + static void CopyOrAddTensor(paddle::experimental::Tensor* tensor, const paddle::experimental::Tensor& t) { if (!tensor->defined() || !tensor->initialized()) { @@ -36,17 +38,10 @@ static void CopyOrAddTensor(paddle::experimental::Tensor* tensor, } } -namespace egr { - -void GradNodeAccumulation::RetainGrad( - const std::function& hook) { - retain_grad_hook_ = hook; -} - std::vector> GradNodeAccumulation:: operator()( const std::vector>& grads) { + VLOG(3) << "Running Eager Backward Node: GradNodeAccumulation"; PADDLE_ENFORCE(grads.size() == 1, paddle::platform::errors::Fatal( "GradNodeAccumulation should take exactly 1 grad tensor" @@ -58,17 +53,18 @@ operator()( "However received: %d in slot %d .", grads[0].size(), 0)); // Apply Gradient Hooks + paddle::experimental::Tensor grad_out; if (GradientHooksRegistered()) { std::vector> hooked_grads = ApplyGradientHooks(grads); - // TODO(jiabin): It's little weird - CopyOrAddTensor(&accumulated_grad, hooked_grads[0][0]); + grad_out = hooked_grads[0][0]; } else { - CopyOrAddTensor(&accumulated_grad, grads[0][0]); + grad_out = grads[0][0]; } - if (retain_grad_hook_ != nullptr) { - retain_grad_hook_(accumulated_grad); + if (!weak_grad_.expired()) { + auto grad = weak_grad_.lock(); + CopyOrAddTensor(grad.get(), grad_out); } // Apply Reduce Hooks @@ -76,7 +72,7 @@ operator()( ApplyReduceHooks(); } - return {{accumulated_grad}}; + return {{grad_out}}; } void GradNodeAccumulation::RegisterReduceHook( diff --git a/paddle/fluid/eager/accumulation/accumulation_node.h b/paddle/fluid/eager/accumulation/accumulation_node.h index be2ccc263e806d8874b0e18f93376bd62745940c..787149ab305263fdbef2866e901e8af5116bc268 100644 --- a/paddle/fluid/eager/accumulation/accumulation_node.h +++ b/paddle/fluid/eager/accumulation/accumulation_node.h @@ -14,6 +14,7 @@ #pragma once +#include "paddle/fluid/eager/autograd_meta.h" #include "paddle/fluid/eager/grad_node_info.h" namespace egr { @@ -21,7 +22,10 @@ namespace egr { class GradNodeAccumulation : public GradNodeBase { public: // Constructor: configure fwd input tensors to grad node - GradNodeAccumulation() : GradNodeBase(1, 1) { SetDefaultGradInOutMeta(); } + explicit GradNodeAccumulation(AutogradMeta* meta) : GradNodeBase(1, 1) { + weak_grad_ = meta->WeakGrad(); + SetDefaultGradInOutMeta(); + } ~GradNodeAccumulation() override = default; @@ -30,10 +34,7 @@ class GradNodeAccumulation : public 
GradNodeBase { const std::vector>& grads) override; - void RetainGrad(const std::function& hook); - - paddle::experimental::Tensor* Grad() { return &accumulated_grad; } + std::string name() { return "GradNodeAccumulation"; } /** * Register ReduceHook @@ -47,7 +48,7 @@ class GradNodeAccumulation : public GradNodeBase { void ApplyReduceHooks(); private: - paddle::experimental::Tensor accumulated_grad; + std::weak_ptr weak_grad_; std::function diff --git a/paddle/fluid/eager/api/utils/hook_utils.cc b/paddle/fluid/eager/api/utils/hook_utils.cc index ee6a3afc6ffd39e264b039adaf8bae716c9e483e..748afe6d1f313daacbbe276b2a00a9687402e617 100644 --- a/paddle/fluid/eager/api/utils/hook_utils.cc +++ b/paddle/fluid/eager/api/utils/hook_utils.cc @@ -52,9 +52,15 @@ void RegisterReduceHookForTensor(const paddle::experimental::Tensor& tensor, } } -void RetainGradForTensor(const paddle::experimental::Tensor& tensor) { - // TODO(jiabin): Support More Tensor type here +static void RetainGradForRegularNode( + const paddle::experimental::Tensor& tensor) { AutogradMeta* meta = EagerUtils::unsafe_autograd_meta(tensor); + if (meta->RetainGrads()) { + return; + } else { + meta->SetRetainGrads(true); + } + std::weak_ptr weak_grad_tensor = meta->WeakGrad(); @@ -70,12 +76,8 @@ void RetainGradForTensor(const paddle::experimental::Tensor& tensor) { grad_tensor->set_impl(t.impl()); return *grad_tensor.get(); } else { - PADDLE_THROW(paddle::platform::errors::Fatal( - "Detected uninitialized variable, causing segmentation " - "fault " - "inside the hook." - "Tensor has to be initialized while we need to set it." - "please check tensor initialization status.")); + VLOG(7) << "Retain NULL paddle::experimental::Tensor in Grad Hook"; + return paddle::experimental::Tensor(); } } else { VLOG(7) << "Retain NULL paddle::experimental::Tensor in Grad Hook"; @@ -83,21 +85,17 @@ void RetainGradForTensor(const paddle::experimental::Tensor& tensor) { } }; - if (IsLeafTensor(tensor)) { - // Add RetainGrad as PostHook to AccumulationNode - std::shared_ptr grad_node = EagerUtils::grad_node(tensor); - PADDLE_ENFORCE( - grad_node.get() != nullptr, - paddle::platform::errors::Fatal("Detected NULL grad_node" - "Leaf tensor should have had grad_node " - "with type: GradNodeAccumulation")); - auto accumulation_grad_node = - std::dynamic_pointer_cast(grad_node); - accumulation_grad_node->RetainGrad(hook); + // Append to GradientHooks + RegisterGradientHookForTensor(tensor, hook); +} +void RetainGradForTensor(const paddle::experimental::Tensor& tensor) { + if (IsLeafTensor(tensor)) { + // Leaf tensor's grad will always be retained + // Refer to implementation of AccumulationNode for more details + return; } else { - // Append to GradientHooks - RegisterGradientHookForTensor(tensor, hook); + RetainGradForRegularNode(tensor); } } diff --git a/paddle/fluid/eager/api/utils/tensor_utils.cc b/paddle/fluid/eager/api/utils/tensor_utils.cc index c06edef7017be133c1cbb1e92e71390b9ab38e74..628c0c500b3c4ade711f3b7ba6a9fa4b6b69a7c6 100644 --- a/paddle/fluid/eager/api/utils/tensor_utils.cc +++ b/paddle/fluid/eager/api/utils/tensor_utils.cc @@ -47,7 +47,7 @@ paddle::experimental::Tensor CreateTensorWithValue( auto meta = EagerUtils::autograd_meta(&out); if (is_leaf) { - auto accumulation_node = std::make_shared(); + auto accumulation_node = std::make_shared(meta); meta->SetGradNode(accumulation_node); meta->SetStopGradient(false); } diff --git a/paddle/fluid/eager/auto_code_generator/eager_generator.cc b/paddle/fluid/eager/auto_code_generator/eager_generator.cc 
index 63f25f5528100cccdacf7c1c1ca67095b448160c..e1f4d6ee9a129e41b7e01fec7f414d8c8fbc880f 100644 --- a/paddle/fluid/eager/auto_code_generator/eager_generator.cc +++ b/paddle/fluid/eager/auto_code_generator/eager_generator.cc @@ -554,6 +554,21 @@ static bool CheckOpProto(proto::OpProto* op_proto) { return true; } +static bool BeSameAsInput(const std::string& output_name, + const std::set& input_names) { + if (output_name.size() < 4) { + return false; + } + + if (output_name.substr(output_name.size() - 3, 3) == "Out") { + if (input_names.count(output_name.substr(0, output_name.size() - 3))) { + return true; + } + } + + return false; +} + /* --------------------------------------- */ /* --------- Preprocess Ins/Outs --------- */ /* --------------------------------------- */ @@ -1016,33 +1031,20 @@ static std::string GenerateGradNodeCreationContent( const std::string& output_name = output.name(); const std::string& output_autograd_name = "p_autograd_" + output_name; + // Skip Intermediate Tensor + if (output.duplicable()) { const char* GET_MULTI_AUTOGRAD_META_TEMPLATE = " std::vector %s = " "egr::EagerUtils::autograd_meta(&%s);\n"; get_autograd_meta_str += paddle::string::Sprintf( GET_MULTI_AUTOGRAD_META_TEMPLATE, output_autograd_name, output_name); - if (op_passing_outs_map[op_type].count(output_name)) { - const std::string output_var_args_name = output_name + "Var"; - const char* FWD_OUT_SYNC_BACK_TEMPLATE = - " egr::EagerUtils::OverwriteOutputs(%s, %s);\n"; - get_autograd_meta_str += paddle::string::Sprintf( - FWD_OUT_SYNC_BACK_TEMPLATE, output_name, output_var_args_name); - } } else { const char* GET_SINGLE_AUTOGRAD_META_TEMPLATE = " egr::AutogradMeta* %s = " "egr::EagerUtils::autograd_meta(&%s);\n"; get_autograd_meta_str += paddle::string::Sprintf( GET_SINGLE_AUTOGRAD_META_TEMPLATE, output_autograd_name, output_name); - - if (op_passing_outs_map[op_type].count(output_name)) { - const std::string output_var_args_name = output_name + "Var"; - const char* FWD_OUT_SYNC_BACK_TEMPLATE = - " egr::EagerUtils::OverwriteOutputs(%s, %s);\n"; - get_autograd_meta_str += paddle::string::Sprintf( - FWD_OUT_SYNC_BACK_TEMPLATE, output_name, output_var_args_name); - } } } VLOG(6) << "Generated outputs autograd_meta"; @@ -1145,6 +1147,8 @@ static std::string GenerateGradNodeCreationContent( const std::string& output_autograd_name = "p_autograd_" + output_name; size_t output_position = fwd_outputs_name_pos_map.at(output_name); + // Intermediate Tensor does not require SetHistory, nor RetainGrad + if (output.duplicable()) { pass_stop_gradient_args += ", &" + output_autograd_name; const char* SET_OUT_RANK_TEMPLATE = @@ -1180,11 +1184,13 @@ static std::string GenerateGradNodeCreationContent( SET_GRAD_IN_META_TEMPLATE, output_autograd_name, output_position); } - VLOG(6) << "Generated Call RetainGradForTensor"; - const char* RETAIN_GRAD_TEMPLATE = - " egr::EagerUtils::CheckAndRetainGrad(%s);\n"; - grad_node_creation_str += - paddle::string::Sprintf(RETAIN_GRAD_TEMPLATE, output_name); + if (!output.intermediate()) { + VLOG(6) << "Generated Call RetainGradForTensor"; + const char* RETAIN_GRAD_TEMPLATE = + " egr::EagerUtils::CheckAndRetainGrad(%s);\n"; + grad_node_creation_str += + paddle::string::Sprintf(RETAIN_GRAD_TEMPLATE, output_name); + } } VLOG(6) << "Generated SetGradIn/OutMeta"; @@ -1324,19 +1330,21 @@ static std::pair GenerateForwardFunctionContents( generated_function_body += "\n"; // Handle Dispensable Inputs + std::set input_names; for (const proto::OpProto::Var& input : in_vars) { const std::string& 
input_name = input.name(); + input_names.insert(input_name); if (input.dispensable()) { if (input.duplicable()) { const char* FWD_INS_CONTENT_TEMPLATE = " if(%s.size() > 0) " - "ins[\"%s\"] = egr::EagerUtils::TrySyncToVars(%s)\n;"; + "ins[\"%s\"] = egr::EagerUtils::TrySyncToVars(%s);\n"; generated_function_body += paddle::string::Sprintf( FWD_INS_CONTENT_TEMPLATE, input_name, input_name, input_name); } else { const char* FWD_INS_CONTENT_TEMPLATE = " if(%s.initialized()) " - "ins[\"%s\"] = egr::EagerUtils::TrySyncToVars(%s)\n;"; + "ins[\"%s\"] = egr::EagerUtils::TrySyncToVars(%s);\n"; generated_function_body += paddle::string::Sprintf( FWD_INS_CONTENT_TEMPLATE, input_name, input_name, input_name); } @@ -1372,11 +1380,21 @@ static std::pair GenerateForwardFunctionContents( core_ops_args_type_info[op_type].push_back("tensor"); } - const char* FWD_OUTS_CONTENT_TEMPLATE = - "{ \"%s\", egr::EagerUtils::TrySyncToVars(%s) },"; - outs_contents_str += paddle::string::Sprintf( - FWD_OUTS_CONTENT_TEMPLATE, output_name, output_var_name); + if (BeSameAsInput(output_name, input_names)) { + if (!output.dispensable()) { + std::string input_name = + output_name.substr(0, output_name.size() - 3); + const char* FWD_OUTS_CONTENT_TEMPLATE = "{ \"%s\", ins[\"%s\"] },"; + outs_contents_str += paddle::string::Sprintf( + FWD_OUTS_CONTENT_TEMPLATE, output_name, input_name); + } + } else { + const char* FWD_OUTS_CONTENT_TEMPLATE = + "{ \"%s\", egr::EagerUtils::TrySyncToVars(%s) },"; + outs_contents_str += paddle::string::Sprintf( + FWD_OUTS_CONTENT_TEMPLATE, output_name, output_var_name); + } core_ops_args_info[op_type].push_back(output_var_name); } else { @@ -1415,6 +1433,23 @@ static std::pair GenerateForwardFunctionContents( generated_function_body += outs_map_str; generated_function_body += "\n"; + for (const proto::OpProto::Var& output : out_vars) { + const std::string& output_name = output.name(); + if (op_passing_outs_map[op_type].count(output_name)) { + if (BeSameAsInput(output_name, input_names)) { + if (output.dispensable()) { + std::string input_name = + output_name.substr(0, output_name.size() - 3); + const char* FWD_OUTS_CONTENT_TEMPLATE = + " if (ins.count(\"%s\")) outs[\"%s\"] = ins[\"%s\"];\n"; + generated_function_body += paddle::string::Sprintf( + FWD_OUTS_CONTENT_TEMPLATE, input_name, output_name, input_name); + } + } + } + } + generated_function_body += "\n"; + VLOG(6) << "Generated Outs Map"; // [Generation] Get Attrs @@ -1448,33 +1483,61 @@ static std::pair GenerateForwardFunctionContents( std::string output_varname = LegalizeVariableName(output_name); if (output.duplicable()) { - const char* FWD_OUT_TENSORS_TEMPLATE = - " std::vector %s = " - "egr::EagerUtils::GetOutputs(outs[\"%s\"]);\n"; - out_tensor_str = paddle::string::Sprintf(FWD_OUT_TENSORS_TEMPLATE, - output_varname, output_name); + if (op_passing_outs_map[op_type].count(output_name)) { + if (output.dispensable()) { + const char* FWD_OUT_TENSORS_TEMPLATE = + " std::vector %s;\n" + " if (outs.count(\"%s\")) " + "egr::EagerUtils::GetOutputs(outs[\"%s\"], %s);\n" + " egr::EagerUtils::Output2Result(%s, &%s);\n"; + out_tensor_str = paddle::string::Sprintf( + FWD_OUT_TENSORS_TEMPLATE, output_varname, output_name, + output_name, output_var_args_name, output_var_args_name, + output_varname); + } else { + const char* FWD_OUT_TENSORS_TEMPLATE = + " std::vector %s;\n" + " egr::EagerUtils::GetOutputs(outs[\"%s\"], %s);\n" + " egr::EagerUtils::Output2Result(%s, &%s);\n"; + out_tensor_str = paddle::string::Sprintf( + FWD_OUT_TENSORS_TEMPLATE, 
output_varname, output_name, + output_var_args_name, output_var_args_name, output_varname); + } + } else { + const char* FWD_OUT_TENSORS_TEMPLATE = + " std::vector %s;\n" + " egr::EagerUtils::GetOutputs(outs[\"%s\"], &%s);\n"; + out_tensor_str = + paddle::string::Sprintf(FWD_OUT_TENSORS_TEMPLATE, output_varname, + output_name, output_varname); + } return_types[return_position] = "std::vector"; - if (op_passing_outs_map[op_type].count(output_name) && - bwd_info.GenerateForwardOnly()) { - const char* FWD_OUT_SYNC_BACK_TEMPLATE = - " egr::EagerUtils::OverwriteOutputs(outs[\"%s\"], %s);\n"; - out_tensor_str += paddle::string::Sprintf( - FWD_OUT_SYNC_BACK_TEMPLATE, output_name, output_var_args_name); - } } else { - const char* FWD_OUT_TENSOR_TEMPLATE = - " paddle::experimental::Tensor %s = " - "egr::EagerUtils::GetOutput(outs[\"%s\"][0]);\n"; - out_tensor_str = paddle::string::Sprintf(FWD_OUT_TENSOR_TEMPLATE, - output_varname, output_name); - - if (op_passing_outs_map[op_type].count(output_name) && - bwd_info.GenerateForwardOnly()) { - const char* FWD_OUT_SYNC_BACK_TEMPLATE = - " egr::EagerUtils::OverwriteOutputs(outs[\"%s\"][0], %s);\n"; - out_tensor_str += paddle::string::Sprintf( - FWD_OUT_SYNC_BACK_TEMPLATE, output_name, output_var_args_name); + if (op_passing_outs_map[op_type].count(output_name)) { + if (output.dispensable()) { + const char* FWD_OUT_TENSOR_TEMPLATE = + " if (outs.count(\"%s\")) " + "egr::EagerUtils::GetOutput(outs[\"%s\"][0], %s);\n" + " paddle::experimental::Tensor& %s = *%s;\n"; + out_tensor_str = paddle::string::Sprintf( + FWD_OUT_TENSOR_TEMPLATE, output_name, output_name, + output_var_args_name, output_varname, output_var_args_name); + } else { + const char* FWD_OUT_TENSOR_TEMPLATE = + " egr::EagerUtils::GetOutput(outs[\"%s\"][0], %s);\n" + " paddle::experimental::Tensor& %s = *%s;\n"; + out_tensor_str = paddle::string::Sprintf( + FWD_OUT_TENSOR_TEMPLATE, output_name, output_var_args_name, + output_varname, output_var_args_name); + } + } else { + const char* FWD_OUT_TENSOR_TEMPLATE = + " paddle::experimental::Tensor %s;\n" + " egr::EagerUtils::GetOutput(outs[\"%s\"][0], &%s);\n"; + out_tensor_str = + paddle::string::Sprintf(FWD_OUT_TENSOR_TEMPLATE, output_varname, + output_name, output_varname); } return_types[return_position] = "paddle::experimental::Tensor"; } @@ -1494,6 +1557,7 @@ static std::pair GenerateForwardFunctionContents( GenerateGradNodeCreationContent(fwd_info, bwd_info); generated_function_body += grad_node_creation_body_str; generated_function_body += "\n"; + // [Generation] Call RetainGradForTensor VLOG(6) << "Generated GradNode Creation codes"; } @@ -1588,12 +1652,25 @@ static std::string GenerateSingleOpBase( const std::string& attrs_name = "attrs_map" + std::to_string(*outs_size); // [Generation] Get Ins Map + std::unordered_set dispensable_input_name_set; + for (const auto& in : in_vars) { + if (in.dispensable()) dispensable_input_name_set.insert(in.name()); + } + std::unordered_set duplicable_input_name_set; + for (const auto& in : in_vars) { + if (in.duplicable()) duplicable_input_name_set.insert(in.name()); + } std::string ins_contents_str = ""; for (auto iter : grad_ins) { const std::string& grad_input_name = iter.first; if (grad_ins_fwd_slotname_map.count(grad_input_name)) { // Fwd Tensor + const std::string& fwd_name = + grad_ins_fwd_slotname_map.at(grad_input_name); + if (dispensable_input_name_set.count(fwd_name)) { + continue; + } std::string struct_fwd_input_name = grad_ins_fwd_slotname_map.at(grad_input_name) + "_"; const char* 
GRAD_INS_FWD_CONTENT_TEMPLATE = @@ -1634,14 +1711,41 @@ static std::string GenerateSingleOpBase( paddle::string::Sprintf(BWD_INS_MAP_TEMPLATE, ins_name, ins_contents_str); generated_grad_function_body += ins_map_str; - VLOG(6) << "Generated Ins Map"; + for (auto iter : grad_ins) { + const std::string& grad_input_name = iter.first; - // [Generation] Get Outs Map - std::unordered_set duplicable_input_name_set; - for (const auto& in : in_vars) { - if (in.duplicable()) duplicable_input_name_set.insert(in.name()); + if (grad_ins_fwd_slotname_map.count(grad_input_name)) { + // Fwd Tensor + const std::string& fwd_name = + grad_ins_fwd_slotname_map.at(grad_input_name); + if (dispensable_input_name_set.count(fwd_name)) { + std::string struct_fwd_input_name = + grad_ins_fwd_slotname_map.at(grad_input_name) + "_"; + if (duplicable_input_name_set.count(fwd_name)) { + const char* DISPENSABLE_GRAD_INS_FWD_CONTENT_TEMPLATE = + " if(this->%s.size() > 0) %s[\"%s\"] = " + "egr::EagerUtils::TrySyncToVars(egr::EagerUtils::" + "RecoverTensorWrapper(&this->%s, nullptr));\n"; + generated_grad_function_body += paddle::string::Sprintf( + DISPENSABLE_GRAD_INS_FWD_CONTENT_TEMPLATE, struct_fwd_input_name, + ins_name, grad_input_name, struct_fwd_input_name); + } else { + const char* DISPENSABLE_GRAD_INS_FWD_CONTENT_TEMPLATE = + " auto %s = egr::EagerUtils::RecoverTensorWrapper(&this->%s, " + "nullptr);\n if(%s.initialized()) %s[\"%s\"] = " + "egr::EagerUtils::TrySyncToVars(%s);\n"; + generated_grad_function_body += paddle::string::Sprintf( + DISPENSABLE_GRAD_INS_FWD_CONTENT_TEMPLATE, grad_input_name, + struct_fwd_input_name, grad_input_name, ins_name, grad_input_name, + grad_input_name); + } + } + } } + VLOG(6) << "Generated Ins Map"; + + // [Generation] Get Outs Map std::string outs_contents_str = ""; for (auto iter : grad_outs) { const std::string& grad_output_name = iter.first; @@ -1987,6 +2091,7 @@ static std::string GenerateGradNodeHeaderContents( "%s\n" " // SetAttrMap\n" "%s\n" + " std::string name() { return \"GradNode%s\"; }\n" "\n" " private:\n" " // TensorWrappers\n" @@ -2085,8 +2190,8 @@ static std::string GenerateGradNodeHeaderContents( std::string grad_node_str = paddle::string::Sprintf( GRAD_NODE_TEMPLATE, op_type, op_type, op_type, op_type, - set_tensor_wrappers_str, set_attr_map_str, tensor_wrapper_members_str, - attr_members_str); + set_tensor_wrappers_str, set_attr_map_str, op_type, + tensor_wrapper_members_str, attr_members_str); return grad_node_str; } diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py index 786bf21e8c8a13da69b201cf988f291dbee64a73..c6e56e34627a52bc19df7e8d87371811fcec8697 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py @@ -127,6 +127,40 @@ def ReadBwdFile(filepath): ###################### ### Yaml Parsers ### ###################### +def IntermediateValidationCheck(intermediate_outputs, forward_returns_list): + # intermediate_outputs : [name0, name1, ...] + # forward_returns_list : [[ret_name, type, orig_pos], ...] 
+ """ + Check whether intermediate_outputs are positioned + at the very end of forward_returns_list + """ + + intermediate_positions = range( + len(forward_returns_list) - len(intermediate_outputs), + len(forward_returns_list)) + for ret_name, _, pos in forward_returns_list: + if ret_name in intermediate_outputs: + assert pos in intermediate_positions + + +def ParseDispensable(string): + # string: "X, Y" + return [v.strip() for v in string.split(",")] + + +def ParseIntermediate(string): + return [v.strip() for v in string.split(",")] + + +def ParseNoNeedBuffer(string): + # string: "x, y" + no_need_buffer_set = set() + for name in string.split(","): + no_need_buffer_set.add(name.strip()) + + return no_need_buffer_set + + def ParseYamlArgs(string): # Example: const Tensor& x, const Tensor& y, bool transpose_x, bool transpose_y @@ -397,7 +431,7 @@ def SlotNameMatching(backward_inputs_list, backward_returns_list, def GenerateNodeDeclaration(fwd_api_name, backward_fwd_input_map, - backward_attrs_list): + backward_attrs_list, no_need_buffer_set): # Inputs: # fwd_api_name = "" # backward_fwd_input_map = { "name" : [type, is_fwd_input, orig_position] ...} @@ -410,15 +444,20 @@ def GenerateNodeDeclaration(fwd_api_name, backward_fwd_input_map, set_tensor_wrapper_methods_str = "" tensor_wrapper_members_str = "" for tname, (ttype, is_fwd_input, _) in backward_fwd_input_map.items(): + if tname in no_need_buffer_set: + no_need_buffer = "true" + else: + no_need_buffer = "false" + tensor_wrapper_name = GetSavedName(tname) if IsPlainTensorType(ttype): SET_PLAIN_TENSOR_WRAPPER_TEMPLATE = """ void SetTensorWrapper{}(const paddle::experimental::Tensor& {}, bool full_reserved) {{ - {} = egr::TensorWrapper({}, full_reserved); + {} = egr::TensorWrapper({}, full_reserved, {}); }} """ set_tensor_wrapper_methods_str += SET_PLAIN_TENSOR_WRAPPER_TEMPLATE.format( - tname, tname, tensor_wrapper_name, tname) + tname, tname, tensor_wrapper_name, tname, no_need_buffer) PLAIN_TENSOR_MEMBER_TEMPLATE = """ egr::TensorWrapper {}; @@ -430,12 +469,12 @@ def GenerateNodeDeclaration(fwd_api_name, backward_fwd_input_map, SET_VECTOR_TENSOR_WRAPPER_TEMPLATE = """ void SetTensorWrapper{}(const std::vector& {}, bool full_reserved) {{ for(const auto& eager_tensor : {}) {{ - {}.emplace_back( egr::TensorWrapper(eager_tensor, full_reserved) ); + {}.emplace_back( egr::TensorWrapper(eager_tensor, full_reserved, {}) ); }}; }} """ set_tensor_wrapper_methods_str += SET_VECTOR_TENSOR_WRAPPER_TEMPLATE.format( - tname, tname, tname, tensor_wrapper_name) + tname, tname, tname, tensor_wrapper_name, no_need_buffer) VECTOR_TENSOR_MEMBER_TEMPLATE = """ std::vector {}; @@ -562,11 +601,11 @@ std::vector> {}::operator()(const std: return node_definition_str -def GenerateNodeCreationCodes(fwd_api_name, bwd_api_name, - forward_inputs_position_map, - forward_outputs_position_map, forward_attrs_list, - backward_fwd_input_map, backward_grad_input_map, - backward_grad_output_map, backward_attrs_list): +def GenerateNodeCreationCodes( + fwd_api_name, bwd_api_name, forward_inputs_position_map, + forward_outputs_position_map, forward_attrs_list, + backward_fwd_input_map, backward_grad_input_map, + backward_grad_output_map, backward_attrs_list, optional_inputs): # fwd_api_name = "" # forward_inputs_position_map = { "name" : [type, fwd_position] } # forward_outputs_position_map = { "name" : [type, fwd_position] } @@ -640,10 +679,17 @@ def GenerateNodeCreationCodes(fwd_api_name, bwd_api_name, # SetTensorWrappers set_tensor_wrappers_list = [] for name, (_, 
is_fwd_input, _) in backward_fwd_input_map.items(): + is_optional = (name in optional_inputs) if is_fwd_input: - set_tensor_wrappers = f" grad_node->SetTensorWrapper{name}({name}, true);" + if is_optional: + set_tensor_wrappers = f" if({name}.is_initialized()) grad_node->SetTensorWrapper{name}({name}, true);" + else: + set_tensor_wrappers = f" grad_node->SetTensorWrapper{name}({name}, true);" else: - set_tensor_wrappers = f" grad_node->SetTensorWrapper{name}({name}, false);" + if is_optional: + set_tensor_wrappers = f" if({name}.is_initialized()) grad_node->SetTensorWrapper{name}({name}, false);" + else: + set_tensor_wrappers = f" grad_node->SetTensorWrapper{name}({name}, false);" set_tensor_wrappers_list.append(set_tensor_wrappers) set_tensor_wrappers_str = "\n".join(set_tensor_wrappers_list) @@ -732,7 +778,8 @@ def GenerateForwardDefinition(fwd_api_name, bwd_api_name, forward_inputs_position_map, forward_outputs_position_map, forward_attrs_list, backward_fwd_input_map, backward_grad_input_map, - backward_grad_output_map, backward_attrs_list): + backward_grad_output_map, backward_attrs_list, + optional_inputs, intermediate_outputs): # fwd_api_name = "" # forward_inputs_position_map = { "name" : [type, fwd_position] } # forward_outputs_position_map = { "name" : [type, fwd_position] } @@ -741,6 +788,7 @@ def GenerateForwardDefinition(fwd_api_name, bwd_api_name, # backward_grad_input_map = { "name" : [type, fwd_position, orig_position] ...} # backward_grad_output_map = { "name" : [type, fwd_position, orig_position] ...} # backward_attrs_list = [ [attr_name, attr_type, default_value, orig_position], ...] + # optional_inputs = ["name0", ...] # Get Function Args num_inputs = len(forward_attrs_list) + len(forward_inputs_position_map.keys( @@ -750,17 +798,18 @@ def GenerateForwardDefinition(fwd_api_name, bwd_api_name, inputs_call_list = ["" for i in range(num_inputs)] for name, (ttype, pos) in forward_inputs_position_map.items(): inputs_call_list[pos] = f"{name}" + is_optional = (name in optional_inputs) if IsPlainTensorType(ttype): - inputs_args_definition_list[ - pos] = f"const paddle::experimental::Tensor& {name}" - inputs_args_declaration_list[ - pos] = f"const paddle::experimental::Tensor& {name}" + if is_optional: + arg_str = f"const paddle::optional& {name}" + else: + arg_str = f"const paddle::experimental::Tensor& {name}" else: assert IsVectorTensorType(ttype) - inputs_args_definition_list[ - pos] = f"const std::vector& {name}" - inputs_args_declaration_list[ - pos] = f"const std::vector& {name}" + arg_str = f"const std::vector& {name}" + + inputs_args_definition_list[pos] = arg_str + inputs_args_declaration_list[pos] = arg_str for name, atype, default_val, pos in forward_attrs_list: inputs_call_list[pos] = name @@ -776,13 +825,20 @@ def GenerateForwardDefinition(fwd_api_name, bwd_api_name, inputs_call_args_str = ", ".join(inputs_call_list) # Forward Full Logic - forward_call_str = f"auto api_result = paddle::experimental::{fwd_api_name}({inputs_call_args_str});" + if len(intermediate_outputs) == 0: + function_name = fwd_api_name + else: + function_name = fwd_api_name + "_intermediate" + forward_call_str = f"auto api_result = paddle::experimental::{function_name}({inputs_call_args_str});" # Get return type list & outputs - num_outputs = len(forward_outputs_position_map.keys()) + num_outputs = len(forward_outputs_position_map.keys()) - len( + intermediate_outputs) returns_type_list = ["" for i in range(num_outputs)] returns_list = ["" for i in range(num_outputs)] for name, (rtype, pos) in 
forward_outputs_position_map.items(): + if name in intermediate_outputs: + continue if num_outputs == 1: returns_list[0] = f"api_result" else: @@ -808,7 +864,7 @@ def GenerateForwardDefinition(fwd_api_name, bwd_api_name, fwd_api_name, bwd_api_name, forward_inputs_position_map, forward_outputs_position_map, forward_attrs_list, backward_fwd_input_map, backward_grad_input_map, - backward_grad_output_map, backward_attrs_list) + backward_grad_output_map, backward_attrs_list, optional_inputs) FORWARD_FUNCTION_TEMPLATE = """ {} {}({}) {{ @@ -997,6 +1053,10 @@ if __name__ == "__main__": assert 'output' in fwd_api.keys() assert 'backward' in fwd_api.keys() + no_need_buffer_set = set() + if 'no_need_buffer' in fwd_api.keys(): + no_need_buffer_set = ParseNoNeedBuffer(fwd_api['no_need_buffer']) + fwd_api_name = fwd_api['api'] fwd_args_str = fwd_api['args'] fwd_returns_str = fwd_api['output'] @@ -1008,6 +1068,12 @@ if __name__ == "__main__": assert 'args' in bwd_api.keys() assert 'output' in bwd_api.keys() assert 'forward' in bwd_api.keys() + + # Parse Dispensable Inputs + optional_inputs = [] + if 'optional' in fwd_api.keys(): + optional_inputs = ParseDispensable(fwd_api['optional']) + bwd_forward_str = bwd_api['forward'] bwd_args_str = bwd_api['args'] bwd_returns_str = bwd_api['output'] @@ -1019,6 +1085,12 @@ if __name__ == "__main__": print("Prased Forward Attrs List: ", forward_attrs_list) print("Parsed Forward Returns List: ", forward_returns_list) + intermediate_outputs = [] + if 'intermediate' in fwd_api.keys(): + intermediate_outputs = ParseIntermediate(fwd_api['intermediate']) + + IntermediateValidationCheck(intermediate_outputs, forward_returns_list) + # Collect Original Forward Inputs/Outputs and then perform validation checks orig_forward_inputs_list, orig_forward_attrs_list, orig_forward_returns_list = ParseYamlForward( fwd_args_str, fwd_returns_str) @@ -1062,7 +1134,8 @@ if __name__ == "__main__": # Node Declaration Generation node_declaration_str += GenerateNodeDeclaration( - fwd_api_name, backward_fwd_input_map, backward_attrs_list) + fwd_api_name, backward_fwd_input_map, backward_attrs_list, + no_need_buffer_set) print("Generated Node Declaration: ", node_declaration_str) node_definition_str += GenerateNodeDefinition( @@ -1076,7 +1149,8 @@ if __name__ == "__main__": fwd_api_name, bwd_api_name, forward_inputs_position_map, forward_outputs_position_map, forward_attrs_list, backward_fwd_input_map, backward_grad_input_map, - backward_grad_output_map, backward_attrs_list) + backward_grad_output_map, backward_attrs_list, optional_inputs, + intermediate_outputs) print("Generated Forward Definition: ", forward_definition_str) print("Generated Forward Declaration: ", forward_declaration_str) forward_definition_str += definition_declaration_pair[0] diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py index a95d6dce29aad275bd4df220b68f1f5b2302c189..5a536067dbe4955efef136f5e5ba75b84d87f187 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py @@ -14,7 +14,7 @@ import os import argparse -from eager_gen import ReadFwdFile, GetForwardFunctionName, ParseYamlForward, DetermineForwardPositionMap +from eager_gen import ReadFwdFile, ParseDispensable, IsVectorTensorType, GetForwardFunctionName, ParseYamlForward, DetermineForwardPositionMap atype_to_parsing_function = { 
"bool": "CastPyArg2Boolean", @@ -70,10 +70,12 @@ def FindParsingFunctionFromAttributeType(atype): def GeneratePythonCFunction(fwd_api_name, forward_inputs_position_map, - forward_attrs_list, forward_outputs_position_map): + forward_attrs_list, forward_outputs_position_map, + optional_inputs): # forward_inputs_position_map = { "name" : [type, fwd_position] } # forward_outputs_position_map = { "name" : [type, fwd_position] } # forward_attrs_list = [ [attr_name, attr_type, default_value, orig_position], ...] + # optional_inputs = [name0, ...] # Get EagerTensor from args # Get dygraph function call args @@ -82,7 +84,14 @@ def GeneratePythonCFunction(fwd_api_name, forward_inputs_position_map, dygraph_function_call_list = ["" for i in range(num_args)] get_eager_tensor_str = "" for name, (ttype, pos) in forward_inputs_position_map.items(): - get_eager_tensor_str += f" auto& {name} = GetTensorFromArgs(\"{fwd_api_name}\", \"{name}\", args, {pos}, false);\n" + is_optional = (name in optional_inputs) + if IsVectorTensorType(ttype): + get_eager_tensor_str += f" auto {name} = GetTensorListFromArgs(\"{fwd_api_name}\", \"{name}\", args, {pos}, false);\n" + else: + if is_optional: + get_eager_tensor_str += f" auto {name} = GetOptionalTensorFromArgs(\"{fwd_api_name}\", \"{name}\", args, {pos}, false);\n" + else: + get_eager_tensor_str += f" auto {name} = GetTensorFromArgs(\"{fwd_api_name}\", \"{name}\", args, {pos}, false);\n" dygraph_function_call_list[pos] = f"{name}" parse_attributes_str = "" @@ -267,6 +276,11 @@ if __name__ == "__main__": fwd_args_str = fwd_api['args'] fwd_returns_str = fwd_api['output'] + # Parse Dispensable Inputs + optional_inputs = [] + if 'optional' in fwd_api.keys(): + optional_inputs = ParseDispensable(fwd_api['optional']) + # Collect Original Forward Inputs/Outputs and then perform validation checks forward_inputs_list, forward_attrs_list, forward_returns_list = ParseYamlForward( fwd_args_str, fwd_returns_str) @@ -283,7 +297,7 @@ if __name__ == "__main__": python_c_function_str, python_c_function_reg_str = GeneratePythonCFunction( fwd_api_name, forward_inputs_position_map, forward_attrs_list, - forward_outputs_position_map) + forward_outputs_position_map, optional_inputs) python_c_function_list.append(python_c_function_str) python_c_function_reg_list.append(python_c_function_reg_str) print("Generated Python-C Function: ", python_c_function_str) diff --git a/paddle/fluid/eager/autograd_meta.h b/paddle/fluid/eager/autograd_meta.h index f4b2b8e08d4fa465c1c3d868659d69f55c4223ea..9e1dc4f2c8c6ba5c1c7d0c49e5d141d1a6c4c6d3 100644 --- a/paddle/fluid/eager/autograd_meta.h +++ b/paddle/fluid/eager/autograd_meta.h @@ -97,6 +97,7 @@ class AutogradMeta : public AbstractAutogradMeta { "Should Not set NULL as GradNode pointer, since " "our default Edge and autogradMeta has nullptr for " "grad node. 
Set Nullptr will lead error.")); + grad_node_ = grad_node; } @@ -127,6 +128,12 @@ class AutogradMeta : public AbstractAutogradMeta { stop_gradient_ = static_cast(stop_gradient); } + void WeakSetStopGradient(bool stop_gradient) { + if (stop_gradient_ == -1) { + stop_gradient_ = static_cast(stop_gradient); + } + } + bool Persistable() const { return persistable_; } void SetPersistable(bool persistable) { persistable_ = persistable; } diff --git a/paddle/fluid/eager/grad_node_info.cc b/paddle/fluid/eager/grad_node_info.cc index 598b368c6426a0fde9b286fc92d8e5a01660ef0a..27c376b4c80c6b4256d3e34ae98f39545551e19a 100644 --- a/paddle/fluid/eager/grad_node_info.cc +++ b/paddle/fluid/eager/grad_node_info.cc @@ -53,7 +53,7 @@ void GradNodeBase::AddEdges(std::vector* metas, size_t slot_id) { adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(), meta->OutRankInfo()); } else { - meta->SetGradNode(std::make_shared()); + meta->SetGradNode(std::make_shared(meta)); adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(), meta->OutRankInfo()); } @@ -69,13 +69,16 @@ void GradNodeBase::AddEdges(AutogradMeta* meta, size_t slot_id) { "adj_edges is designed to has the same size of grad " "inputs's slot num.")); if (meta && !meta->StopGradient()) { - VLOG(6) << "Add Edges for slot: " << slot_id; auto node = meta->GetMutableGradNode(); if (node) { + VLOG(6) << "Add Edges for slot: " << slot_id << ", the Edge is from " + << this->name() << " to " << meta->GetMutableGradNode()->name(); adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(), meta->OutRankInfo()); } else { - meta->SetGradNode(std::make_shared()); + meta->SetGradNode(std::make_shared(meta)); + VLOG(6) << "Add Edges for slot: " << slot_id << ", the Edge is from " + << this->name() << " to " << meta->GetMutableGradNode()->name(); adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(), meta->OutRankInfo()); } diff --git a/paddle/fluid/eager/grad_node_info.h b/paddle/fluid/eager/grad_node_info.h index 8603d84fe8df597a69f041e2fec41d05dfe16448..f699f9ab28e2d37c893e7a4fdec9acfa6c5a280f 100644 --- a/paddle/fluid/eager/grad_node_info.h +++ b/paddle/fluid/eager/grad_node_info.h @@ -147,6 +147,8 @@ class GradNodeBase { std::vector> ApplyGradientHooks( const std::vector>& tensors); + virtual std::string name() { return "GradNodeBase"; } + private: // TODO(jiabin): Use SmallVector instead after merge PR from develop diff --git a/paddle/fluid/eager/tensor_wrapper.h b/paddle/fluid/eager/tensor_wrapper.h index 6cc17b0a9c5faf9d54a78d21cc4970880140f8fd..31aaa93c41643f565836c536d7001c01d2a0826d 100644 --- a/paddle/fluid/eager/tensor_wrapper.h +++ b/paddle/fluid/eager/tensor_wrapper.h @@ -34,7 +34,8 @@ class TensorWrapper { public: TensorWrapper() = default; explicit TensorWrapper(const paddle::experimental::Tensor& tensor, - bool full_reserved = false) { + bool full_reserved = false, + bool no_need_buffer = false) { /** * Normally, we should fully reserved all non-output or non-leaf fwd tensor * here. 
And for fwd output tensor, we should not reserve its autogradmeta, @@ -48,16 +49,30 @@ class TensorWrapper { } // shallow copy tensor_impl here - intermidiate_tensor_.set_impl(tensor.impl()); + if (no_need_buffer) { + if (phi::DenseTensor::classof(tensor.impl().get())) { + // Only Copy Meta + phi::DenseTensor* dense_tensor = + static_cast(tensor.impl().get()); + auto tw_dense_tensor = std::make_shared(); + tw_dense_tensor->set_meta(dense_tensor->meta()); + intermidiate_tensor_.set_impl(tw_dense_tensor); + } else { + PADDLE_THROW(paddle::platform::errors::Fatal( + "Unrecognized tensor type for no_need_buffer feature")); + } + } else { + intermidiate_tensor_.set_impl(tensor.impl()); + } + intermidiate_tensor_.set_name(tensor.name() + "@Saved"); - PADDLE_ENFORCE_NOT_NULL( - EagerUtils::unsafe_autograd_meta(tensor), - paddle::platform::errors::Fatal( - "Full reserved Tensor should not have null autograd meta, since " - "tensor_wrapper is used to build backward info. There is no way " - "for us to build it with null autograd_meta.")); - // copy output_rank - out_rank_info_ = EagerUtils::OutRankInfo(tensor); + + // If an output is marked "intermedaite", we won't create + // autograd_meta for it. + // In that case, simply skip OutRankInfo Copy + if (EagerUtils::nullable_autograd_meta(tensor)) { + out_rank_info_ = EagerUtils::OutRankInfo(tensor); + } } paddle::experimental::Tensor recover( diff --git a/paddle/fluid/eager/tests/data_structure_tests/accumulation_node_test.cc b/paddle/fluid/eager/tests/data_structure_tests/accumulation_node_test.cc index 682e55e7d92945e13a219c956f373e800c174325..880bd2684102710d9d432c7186d007f7e155badd 100644 --- a/paddle/fluid/eager/tests/data_structure_tests/accumulation_node_test.cc +++ b/paddle/fluid/eager/tests/data_structure_tests/accumulation_node_test.cc @@ -17,11 +17,13 @@ #include "gtest/gtest.h" #include "paddle/fluid/eager/accumulation/accumulation_node.h" +#include "paddle/fluid/eager/api/utils/hook_utils.h" #include "paddle/fluid/eager/eager_tensor.h" #include "paddle/fluid/eager/grad_node_info.h" #include "paddle/fluid/eager/grad_tensor_holder.h" -#include "paddle/phi/api/lib/utils/allocator.h" +#include "paddle/fluid/eager/utils.h" +#include "paddle/phi/api/lib/utils/allocator.h" #include "paddle/phi/core/kernel_registry.h" // TODO(jiabin): remove nolint here!!! 
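The no_need_buffer path added to TensorWrapper above keeps only tensor metadata when the backward kernel never reads the forward value, so the forward allocation can be released early. A compact sketch of that pattern, assuming nothing beyond the phi::DenseTensor calls already used in the patch; MetaOnlyCopy is a hypothetical helper name used for illustration.

// Keep dims/dtype/layout of a DenseTensor without holding on to its allocation.
#include <memory>
#include "paddle/phi/core/dense_tensor.h"

std::shared_ptr<phi::DenseTensor> MetaOnlyCopy(const phi::DenseTensor& src) {
  auto meta_only = std::make_shared<phi::DenseTensor>();
  meta_only->set_meta(src.meta());  // shape and dtype survive for shape inference
  return meta_only;                 // no buffer is shared, so src's memory can be freed
}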
@@ -37,7 +39,7 @@ TEST(AccumulationNode, Tensor) { .get(), meta); dt0->mutable_data( - paddle::platform::CPUPlace())[0] = 10.0; + paddle::platform::CPUPlace())[0] = paddle::platform::float16(10.0f); paddle::experimental::Tensor et0 = paddle::experimental::Tensor(dt0); std::shared_ptr dt1 = std::make_shared( @@ -47,84 +49,100 @@ TEST(AccumulationNode, Tensor) { meta); dt1->mutable_data( - paddle::platform::CPUPlace())[0] = 20.0; + paddle::platform::CPUPlace())[0] = paddle::platform::float16(20.0f); paddle::experimental::Tensor et1 = paddle::experimental::Tensor(dt1); + std::shared_ptr input_dt = + std::make_shared( + std::make_unique( + paddle::platform::CPUPlace()) + .get(), + meta); + paddle::experimental::Tensor input_et = + paddle::experimental::Tensor(input_dt); + auto grad_meta = EagerUtils::autograd_meta(&input_et); + + // Initialize Grad Tensor std::shared_ptr grad_dt = std::make_shared( std::make_unique( paddle::platform::CPUPlace()) .get(), meta); - paddle::experimental::Tensor grad_et = paddle::experimental::Tensor(grad_dt); + grad_dt->mutable_data( + paddle::platform::CPUPlace())[0] = paddle::platform::float16(0.0f); + grad_meta->MutableGrad()->set_impl(grad_dt); // AccumulationNode - GradNodeAccumulation node = GradNodeAccumulation(); - - // Hook, RetainGrad - std::function - hook = [&grad_et](const paddle::experimental::Tensor& t) { - grad_et.set_impl(t.impl()); - return grad_et; - }; - node.RetainGrad(hook); + auto node = std::make_shared(grad_meta); + grad_meta->SetGradNode(node); + grad_meta->SetStopGradient(false); // operator() - paddle::experimental::Tensor ret_et0 = node({{et0}})[0][0]; + paddle::experimental::Tensor ret_et0 = node->operator()({{et0}})[0][0]; auto* ret_et0_ptr = std::dynamic_pointer_cast(ret_et0.impl()) ->data(); CHECK_EQ(ret_et0_ptr[0], paddle::platform::float16(10.0f)); - paddle::experimental::Tensor ret_et1 = node({{et1}})[0][0]; + paddle::experimental::Tensor ret_et1 = node->operator()({{et1}})[0][0]; + auto* ret_et1_ptr = std::dynamic_pointer_cast(ret_et1.impl()) ->data(); - CHECK_EQ(ret_et1_ptr[0], paddle::platform::float16(30.0f)); + CHECK_EQ(ret_et1_ptr[0], paddle::platform::float16(20.0f)); - // Retain Grad - auto* ret_grad_et_ptr = - std::dynamic_pointer_cast(grad_et.impl()) - ->data(); - CHECK_EQ(ret_grad_et_ptr[0], paddle::platform::float16(30.0f)); + // Check Retain Grad + CHECK_EQ(std::dynamic_pointer_cast(et0.impl()) + ->data()[0], + paddle::platform::float16(10.0f)); + paddle::experimental::Tensor* grad = EagerUtils::mutable_grad(input_et); + auto* grad_ptr = std::dynamic_pointer_cast(grad->impl()) + ->data(); + CHECK_EQ(grad_ptr[0], paddle::platform::float16(30.0f)); // Reduce Hook case 1: Call RegisterReduceHook and run operator() VLOG(6) << "Test Reduce Hook"; + CHECK_EQ(std::dynamic_pointer_cast(et0.impl()) + ->data()[0], + paddle::platform::float16(10.0f)); + auto reduce_hook_1 = [&](void) -> void { - auto* grad_et_ptr = - std::dynamic_pointer_cast(grad_et.impl()) - ->data(); - grad_et_ptr[0] = 36.0; + auto* input_et_ptr = + std::dynamic_pointer_cast(input_et.impl()) + ->mutable_data( + paddle::platform::CPUPlace()); + input_et_ptr[0] = 36.0; VLOG(6) << "Running Reduce Hook"; }; - node.RegisterReduceHook(reduce_hook_1); + node->RegisterReduceHook(reduce_hook_1); // operator() - paddle::experimental::Tensor _ret = node({{et0}})[0][0]; + paddle::experimental::Tensor _ret = node->operator()({{et0}})[0][0]; // Check operator() result, should be 36.0 auto* _ret_ptr = std::dynamic_pointer_cast(_ret.impl()) ->data(); - 
CHECK_EQ(_ret_ptr[0], paddle::platform::float16(36.0f)); + CHECK_EQ(_ret_ptr[0], paddle::platform::float16(10.0f)); // Check Retain Grad, should be 36.0 - auto* _ret_grad_et_ptr = - std::dynamic_pointer_cast(grad_et.impl()) + auto* _ret_input_et_ptr = + std::dynamic_pointer_cast(input_et.impl()) ->data(); - CHECK_EQ(_ret_grad_et_ptr[0], paddle::platform::float16(36.0f)); + CHECK_EQ(_ret_input_et_ptr[0], paddle::platform::float16(36.0f)); // Reduce Hook case 2: Call RegisterReduceHook and ApplyReduceHooks directly VLOG(6) << "Test Reduce Hook"; auto reduce_hook_2 = [&](void) -> void { auto* ret_et0_ptr = std::dynamic_pointer_cast(et0.impl()) - ->data(); + ->mutable_data( + paddle::platform::CPUPlace()); ret_et0_ptr[0] = 100.0; // set to 100.0 VLOG(6) << "Running Reduce Hook"; }; - node.RegisterReduceHook(reduce_hook_2); - node.ApplyReduceHooks(); + node->RegisterReduceHook(reduce_hook_2); + node->ApplyReduceHooks(); // Check ApplyReduceHooks result CHECK_EQ(std::dynamic_pointer_cast(et0.impl()) diff --git a/paddle/fluid/eager/tests/task_tests/backward_test.cc b/paddle/fluid/eager/tests/task_tests/backward_test.cc index 771b324a69b5a99e9f4857552cc8c5d8b25b5c90..a4bc56bd606f3fbb0f9152d58acb5c8edeecf905 100644 --- a/paddle/fluid/eager/tests/task_tests/backward_test.cc +++ b/paddle/fluid/eager/tests/task_tests/backward_test.cc @@ -59,22 +59,18 @@ TEST(Backward, SingleNodeEmptyGrad) { auto_grad_meta->SetSingleOutRankWithSlot(0, 0); auto_grad_meta->SetStopGradient(false); + AutogradMeta* auto_grad_meta1 = EagerUtils::autograd_meta(&leaf_tensor); + // Connect Tensor and AccumulationNode via AutoGradMeta - auto acc_node_ptr = std::make_shared(); + auto acc_node_ptr = + std::make_shared(auto_grad_meta1); - AutogradMeta* auto_grad_meta1 = EagerUtils::autograd_meta(&leaf_tensor); auto_grad_meta1->SetGradNode( std::dynamic_pointer_cast(acc_node_ptr)); auto_grad_meta1->SetSingleOutRankWithSlot(0, 0); + auto_grad_meta1->SetStopGradient(false); - egr_utils_api::RetainGradForTensor(leaf_tensor); - - // Connect Node0 -> AccumulationNode via Edge - auto meta = egr::AutogradMeta(); - meta.SetStopGradient(false); - meta.SetSingleOutRankWithSlot(0, 0); - meta.SetGradNode(acc_node_ptr); - std::vector res = {&meta}; + std::vector res = {auto_grad_meta1}; node0_ptr->AddEdges(&res, 0); } std::vector outs = {target_tensor}; @@ -123,22 +119,17 @@ TEST(Backward, SingleNodeCustomGrad) { std::dynamic_pointer_cast(node0_ptr)); auto_grad_meta->SetSingleOutRankWithSlot(0, 0); auto_grad_meta->SetStopGradient(false); - // Connect Tensor and AccumulationNode via AutoGradMeta - auto acc_node_ptr = std::make_shared(); AutogradMeta* auto_grad_meta1 = EagerUtils::autograd_meta(&leaf_tensor); + // Connect Tensor and AccumulationNode via AutoGradMeta + auto acc_node_ptr = + std::make_shared(auto_grad_meta1); + auto_grad_meta1->SetGradNode( std::dynamic_pointer_cast(acc_node_ptr)); auto_grad_meta1->SetSingleOutRankWithSlot(0, 0); - - egr_utils_api::RetainGradForTensor(leaf_tensor); - - // Connect Node0 -> AccumulationNode via Edge - auto meta = egr::AutogradMeta(); - meta.SetStopGradient(false); - meta.SetSingleOutRankWithSlot(0, 0); - meta.SetGradNode(acc_node_ptr); - std::vector res = {&meta}; + auto_grad_meta1->SetStopGradient(false); + std::vector res = {auto_grad_meta1}; node0_ptr->AddEdges(&res, 0); } @@ -201,22 +192,17 @@ TEST(Backward, LinearNodes) { std::vector res0 = {&meta0}; node0_ptr->AddEdges(&res0, 0); + AutogradMeta* auto_grad_meta1 = EagerUtils::autograd_meta(&leaf_tensor); // Connect Tensor and AccumulationNode via 
AutoGradMeta - auto acc_node_ptr = std::make_shared(); + auto acc_node_ptr = + std::make_shared(auto_grad_meta1); - AutogradMeta* auto_grad_meta1 = EagerUtils::autograd_meta(&leaf_tensor); auto_grad_meta1->SetGradNode( std::dynamic_pointer_cast(acc_node_ptr)); auto_grad_meta1->SetSingleOutRankWithSlot(0, 0); - egr_utils_api::RetainGradForTensor(leaf_tensor); - - // Connect Node1 -> AccumulationNode via Edge - auto meta1 = egr::AutogradMeta(); - meta1.SetStopGradient(false); - meta1.SetSingleOutRankWithSlot(0, 0); - meta1.SetGradNode(acc_node_ptr); - std::vector res1 = {&meta1}; + auto_grad_meta1->SetStopGradient(false); + std::vector res1 = {auto_grad_meta1}; node1_ptr->AddEdges(&res1, 0); } @@ -311,22 +297,17 @@ TEST(Backward, WithAccumulation) { std::vector res1 = {&meta1}; node1_ptr->AddEdges(&res1, 0); + AutogradMeta* auto_grad_meta2 = EagerUtils::autograd_meta(&leaf_tensor); // Connect Tensor and AccumulationNode via AutoGradMeta - auto acc_node_ptr = std::make_shared(); + auto acc_node_ptr = + std::make_shared(auto_grad_meta2); - AutogradMeta* auto_grad_meta2 = EagerUtils::autograd_meta(&leaf_tensor); auto_grad_meta2->SetGradNode( std::dynamic_pointer_cast(acc_node_ptr)); auto_grad_meta2->SetSingleOutRankWithSlot(0, 0); - egr_utils_api::RetainGradForTensor(leaf_tensor); - - // Connect Node2 -> AccumulationNode via Edge - auto meta2 = egr::AutogradMeta(); - meta2.SetStopGradient(false); - meta2.SetSingleOutRankWithSlot(0, 0); - meta2.SetGradNode(acc_node_ptr); - std::vector res2 = {&meta2}; + auto_grad_meta2->SetStopGradient(false); + std::vector res2 = {auto_grad_meta2}; node2_ptr->AddEdges(&res2, 0); } diff --git a/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc b/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc index a44ca6fcffbff537dbcb46017f7c4953bd3d984c..524872b2e55638d25697388aa50724f49f6e3818 100644 --- a/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc +++ b/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc @@ -46,34 +46,26 @@ TEST(CrossBatchAccumulation, SingleScaleNode) { paddle::experimental::Tensor& target_tensor = target_tensors[0]; paddle::experimental::Tensor leaf_tensor = paddle::experimental::Tensor(); - { - auto scale_node_ptr = std::make_shared(1, 1); - scale_node_ptr->SetAttributes_scale(5.0 /*scale*/); - - scale_node_ptr->SetDefaultGradInOutMeta(); - - auto acc_node_ptr = std::make_shared(); - - AutogradMeta* auto_grad_meta = EagerUtils::autograd_meta(&target_tensor); - auto_grad_meta->SetGradNode( - std::dynamic_pointer_cast(scale_node_ptr)); - auto_grad_meta->SetSingleOutRankWithSlot(0, 0); - auto_grad_meta->SetStopGradient(false); - egr_utils_api::RetainGradForTensor(target_tensor); // result: 1.0 - - auto meta = AutogradMeta(); - meta.SetSingleOutRankWithSlot(0, 0); - meta.SetStopGradient(false); - meta.SetGradNode(acc_node_ptr); - std::vector res = {&meta}; - scale_node_ptr->AddEdges(&res, 0); - - AutogradMeta* auto_grad_meta1 = EagerUtils::autograd_meta(&leaf_tensor); - auto_grad_meta1->SetGradNode( - std::dynamic_pointer_cast(acc_node_ptr)); - auto_grad_meta1->SetSingleOutRankWithSlot(0, 0); - egr_utils_api::RetainGradForTensor(leaf_tensor); - } + + auto scale_node_ptr = std::make_shared(1, 1); + scale_node_ptr->SetAttributes_scale(5.0 /*scale*/); + + scale_node_ptr->SetDefaultGradInOutMeta(); + + AutogradMeta* auto_grad_meta = EagerUtils::autograd_meta(&target_tensor); + auto_grad_meta->SetGradNode( + std::dynamic_pointer_cast(scale_node_ptr)); + 
auto_grad_meta->SetSingleOutRankWithSlot(0, 0); + auto_grad_meta->SetStopGradient(false); + egr_utils_api::RetainGradForTensor(target_tensor); // result: 1.0 + + AutogradMeta* meta = EagerUtils::autograd_meta(&leaf_tensor); + auto acc_node_ptr = std::make_shared(meta); + meta->SetStopGradient(false); + meta->SetSingleOutRankWithSlot(0, 0); + meta->SetGradNode(acc_node_ptr); + std::vector res = {meta}; + scale_node_ptr->AddEdges(&res, 0); RunBackward(target_tensors, {}); diff --git a/paddle/fluid/eager/tests/task_tests/eager_utils_test.cc b/paddle/fluid/eager/tests/task_tests/eager_utils_test.cc index 990f700056158fabe2314aa2f5bc9946c0e5076c..217055e4e9e4a19e695f42bf57c2331c9b98e2bd 100644 --- a/paddle/fluid/eager/tests/task_tests/eager_utils_test.cc +++ b/paddle/fluid/eager/tests/task_tests/eager_utils_test.cc @@ -159,7 +159,7 @@ TEST(EagerUtils, PassStopGradient) { CHECK(auto_grad0->StopGradient() == false); egr::EagerUtils::PassStopGradient(true, auto_grad0.get(), auto_grad1.get(), auto_grad2.get(), auto_grad3.get()); - CHECK(auto_grad0->StopGradient() == true); + CHECK(auto_grad0->StopGradient() == false); CHECK(auto_grad1->StopGradient() == true); CHECK(auto_grad2->StopGradient() == true); CHECK(auto_grad3->StopGradient() == true); diff --git a/paddle/fluid/eager/tests/task_tests/hook_test.cc b/paddle/fluid/eager/tests/task_tests/hook_test.cc index bf2f620dd19bae44fabcbffedc1dcef6a1b52bf9..fbc71168fe41697aa3175f1541350852a62a3220 100644 --- a/paddle/fluid/eager/tests/task_tests/hook_test.cc +++ b/paddle/fluid/eager/tests/task_tests/hook_test.cc @@ -79,9 +79,6 @@ TEST(RetainGrad, HookBeforeRetainGrad) { // Set grad in/out meta for node0 scale_node_ptr->SetDefaultGradInOutMeta(); - // Create AccumulationNode - auto acc_node_ptr = std::make_shared(); - // Connect Input Tensor and ScaleNode via AutoGradMeta // Apply RetainGrad { @@ -102,16 +99,8 @@ TEST(RetainGrad, HookBeforeRetainGrad) { egr_utils_api::RegisterGradientHookForTensor(target_tensor, hook); egr_utils_api::RetainGradForTensor( target_tensor); // result: 1.0 + 3.0 = 4.0 - } - - // Connect ScaleNode -> AccumulationNode via Edge - { - auto meta = AutogradMeta(); - meta.SetStopGradient(false); - meta.SetSingleOutRankWithSlot(0, 0); - meta.SetGradNode(acc_node_ptr); - std::vector res = {&meta}; - scale_node_ptr->AddEdges(&res, 0); + egr_utils_api::RetainGradForTensor( + target_tensor); // result: 1.0 + 3.0 = 4.0 } // Retain Grad for leaf tensor1 @@ -123,9 +112,16 @@ TEST(RetainGrad, HookBeforeRetainGrad) { hook = &hook_function; auto auto_grad_meta = std::make_shared(); - auto_grad_meta->SetGradNode( - std::dynamic_pointer_cast(acc_node_ptr)); + + auto acc_node_ptr = + std::make_shared(auto_grad_meta.get()); + + auto_grad_meta->SetStopGradient(false); + auto_grad_meta->SetGradNode(acc_node_ptr); auto_grad_meta->SetSingleOutRankWithSlot(0, 0); + std::vector res = {auto_grad_meta.get()}; + scale_node_ptr->AddEdges(&res, 0); + leaf_tensor.set_autograd_meta( std::dynamic_pointer_cast( auto_grad_meta)); @@ -160,8 +156,6 @@ TEST(RetainGrad, HookAfterRetainGrad) { scale_node_ptr->SetAttributes_scale(5.0 /*scale*/); // Set grad in/out meta for node0 scale_node_ptr->SetDefaultGradInOutMeta(); - // Create AccumulationNode - auto acc_node_ptr = std::make_shared(); // Connect Input Tensor and ScaleNode via AutoGradMeta // Apply RetainGrad @@ -184,16 +178,6 @@ TEST(RetainGrad, HookAfterRetainGrad) { egr_utils_api::RegisterGradientHookForTensor(target_tensor, hook); } - // Connect ScaleNode -> AccumulationNode via Edge - { - auto meta = 
AutogradMeta(); - meta.SetStopGradient(false); - meta.SetSingleOutRankWithSlot(0, 0); - meta.SetGradNode(acc_node_ptr); - std::vector res = {&meta}; - scale_node_ptr->AddEdges(&res, 0); - } - // Retain Grad for leaf tensor1 paddle::experimental::Tensor leaf_tensor = paddle::experimental::Tensor(); { @@ -203,17 +187,18 @@ TEST(RetainGrad, HookAfterRetainGrad) { hook = &hook_function; auto auto_grad_meta = std::make_shared(); - auto_grad_meta->SetGradNode( - std::dynamic_pointer_cast(acc_node_ptr)); + auto acc_node_ptr = + std::make_shared(auto_grad_meta.get()); + auto_grad_meta->SetGradNode(acc_node_ptr); + auto_grad_meta->SetStopGradient(false); + std::vector res = {auto_grad_meta.get()}; + scale_node_ptr->AddEdges(&res, 0); + auto_grad_meta->SetSingleOutRankWithSlot(0, 0); leaf_tensor.set_autograd_meta( std::dynamic_pointer_cast( auto_grad_meta)); - egr_utils_api::RetainGradForTensor( - leaf_tensor); // RetainGrad for leaf tensor gets - // postponed, result: 4.0*5.0 + 3.0 = - // 23.0 egr_utils_api::RegisterGradientHookForTensor(leaf_tensor, hook); } diff --git a/paddle/fluid/eager/utils.cc b/paddle/fluid/eager/utils.cc index 9c6c7d4d540c6b4a42b5bb9b266d7175c55b15ad..7464ad74135853a6d5f6b0f6eec3b950f527a599 100644 --- a/paddle/fluid/eager/utils.cc +++ b/paddle/fluid/eager/utils.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/eager/utils.h" +#include "paddle/fluid/eager/accumulation/accumulation_node.h" #include "paddle/fluid/eager/api/utils/global_utils.h" #include "paddle/fluid/eager/api/utils/hook_utils.h" #include "paddle/fluid/eager/tensor_wrapper.h" @@ -21,7 +22,6 @@ #include "paddle/phi/common/layout.h" #include "paddle/phi/core/tensor_meta.h" -#include "paddle/fluid/eager/accumulation/accumulation_node.h" #include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/framework/pten_utils.h" #include "paddle/fluid/framework/variable.h" @@ -109,6 +109,16 @@ std::shared_ptr EagerUtils::grad_node( } } +paddle::experimental::Tensor* EagerUtils::mutable_grad( + const paddle::experimental::Tensor& target) { + auto* meta = nullable_autograd_meta(target); + if (meta) { + return meta->MutableGrad(); + } else { + return nullptr; + } +} + void EagerUtils::SetHistory(std::vector* autograd_metas, const std::shared_ptr& grad_node) { for (const auto& autograd_meta : *autograd_metas) { @@ -220,53 +230,62 @@ paddle::experimental::Tensor EagerUtils::GetOutput( return paddle::experimental::Tensor(out->GetTensorBase(), out->name()); } -void EagerUtils::OverwriteOutputs(const std::shared_ptr& out, - paddle::experimental::Tensor* tensor) { +void EagerUtils::GetOutput(const std::shared_ptr& out, + paddle::experimental::Tensor* out_var) { PADDLE_ENFORCE_NOT_NULL( - tensor, paddle::platform::errors::Fatal( - "Tensor is null and cannot be copied. " - "We are tring to OverwriteOutput from its " - "shared_ptr, this error may indicate some outputs " - "are nullptr")); - tensor->set_impl(out->GetTensorBase()); + out_var, paddle::platform::errors::Fatal( + "Tensor is null and cannot be copied. 
" + "We are tring to OverwriteOutput from its " + "shared_ptr, this error may indicate some outputs " + "are nullptr")); + out_var->set_impl(out->GetTensorBase()); } -void EagerUtils::OverwriteOutputs( +void EagerUtils::GetOutputs( const std::vector>& outs, - const std::vector& tensors) { - PADDLE_ENFORCE_EQ( - outs.size(), tensors.size(), - paddle::platform::errors::Fatal( - "We are tring to OverwriteOutputs which passed in and it expected " - "elements num of outs and origin outputs are equal, but we got outs " - "size of: %d, and tensors passed in size is: %d", - outs.size(), tensors.size())); + std::vector* result) { for (size_t i = 0; i < outs.size(); i++) { - OverwriteOutputs(outs[i], tensors[i]); + result->emplace_back(outs[i]->GetTensorBase()); } } -void EagerUtils::OverwriteOutputs(const paddle::experimental::Tensor& out, - paddle::experimental::Tensor* tensor) { - PADDLE_ENFORCE_NOT_NULL( - tensor, paddle::platform::errors::Fatal( - "Tensor is null and cannot be copied. " - "We are tring to OverwriteOutput from its " - "shared_ptr, this error may indicate some outputs " - "are nullptr")); - *tensor = out; -} -void EagerUtils::OverwriteOutputs( - const std::vector& outs, - const std::vector& tensors) { +void EagerUtils::GetOutputs( + const std::vector>& outs, + const std::vector& out_var) { for (size_t i = 0; i < outs.size(); i++) { PADDLE_ENFORCE_NOT_NULL( - tensors[i], paddle::platform::errors::Fatal( + out_var[i], paddle::platform::errors::Fatal( "Tensor is null and cannot be copied. " "We are tring to OverwriteOutput from its " "shared_ptr, this error may indicate some outputs " "are nullptr")); - *tensors[i] = outs[i]; + out_var[i]->set_impl(outs[i]->GetTensorBase()); + } +} + +void EagerUtils::GetOutputs(const std::shared_ptr& out, + std::vector* result) { + result->emplace_back(out->GetTensorBase()); +} + +void EagerUtils::GetOutputs( + const std::shared_ptr& out, + const std::vector& out_var) { + PADDLE_ENFORCE_NOT_NULL( + out_var[0], paddle::platform::errors::Fatal( + "Tensor is null and cannot be copied. 
" + "We are tring to OverwriteOutput from its " + "shared_ptr, this error may indicate some outputs " + "are nullptr")); + out_var[0]->set_impl(out->GetTensorBase()); +} + +void EagerUtils::Output2Result( + const std::vector& out_var, + std::vector* result) { + result->reserve(out_var.size()); + for (size_t i = 0; i < out_var.size(); i++) { + result->emplace_back(*out_var[i]); } } @@ -333,7 +352,8 @@ std::shared_ptr EagerUtils::GetGradAccumulationNode( } else { if (!autograd_ptr->StopGradient()) { VLOG(6) << "Add GradNodeAccumulation for tensor: " << tensor.name(); - autograd_ptr->SetGradNode(std::make_shared()); + autograd_ptr->SetGradNode( + std::make_shared(autograd_ptr)); return autograd_ptr->GetMutableGradNode(); } else { return nullptr; diff --git a/paddle/fluid/eager/utils.h b/paddle/fluid/eager/utils.h index 00013faa345e213a125a2fe6c70eef1e0b9160ef..fa5735e6f32a0ca7762b9ba94cce26ac8ac567dd 100644 --- a/paddle/fluid/eager/utils.h +++ b/paddle/fluid/eager/utils.h @@ -77,7 +77,7 @@ class PassStopGradientIter : public IterHelper { VLOG(2) << "Tensor is NULL"; return; } - element->SetStopGradient(stop_gradient_); + element->WeakSetStopGradient(stop_gradient_); } bool stop_gradient_ = true; @@ -102,6 +102,8 @@ class EagerUtils { static std::shared_ptr grad_node( const paddle::experimental::Tensor& target); + static paddle::experimental::Tensor* mutable_grad( + const paddle::experimental::Tensor& target); // Set history is used to set backward info during forward process, it will // set forward var's autograd meta's grad node as current backward node. @@ -173,17 +175,24 @@ class EagerUtils { const std::vector>& outs); static paddle::experimental::Tensor GetOutput( const std::shared_ptr& out); - // Sync Back to origin output Tensor - static void OverwriteOutputs(const std::shared_ptr& out, - paddle::experimental::Tensor* tensor); - static void OverwriteOutputs(const paddle::experimental::Tensor& out, - paddle::experimental::Tensor* tensor); - static void OverwriteOutputs( + static void GetOutput(const std::shared_ptr& out, + paddle::experimental::Tensor* out_var); + static void GetOutputs( const std::vector>& outs, - const std::vector& tensors); - static void OverwriteOutputs( - const std::vector& outs, - const std::vector& tensors); + std::vector* result); + static void GetOutputs( + const std::vector>& outs, + const std::vector& out_var); + static void GetOutputs(const std::shared_ptr& out, + std::vector* result); + static void GetOutputs( + const std::shared_ptr& out, + const std::vector& out_var); + + static void Output2Result( + const std::vector& out_var, + std::vector* result); + // end Intermidate needed static void CheckAndRetainGrad(const paddle::experimental::Tensor& tensor); diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 78f5bb077aaf189ff0d21aba853d62aebe46f53e..7d527e24a0079e8e8fc9f591ee35131c25a38a8b 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -437,8 +437,7 @@ message(STATUS "branch: ${PADDLE_BRANCH}") configure_file(commit.h.in commit.h) cc_library(custom_operator SRCS custom_operator.cc DEPS tensor attribute framework_proto op_registry operator dynamic_loader string_helper pten_tensor op_meta_info pten_api) -cc_library(custom_kernel SRCS custom_kernel.cc DEPS - tensor attribute framework_proto op_registry operator dynamic_loader string_helper pten_tensor op_kernel_info pten_api) +cc_library(custom_kernel SRCS custom_kernel.cc DEPS op_registry pten_custom_kernel 
pten_tensor_raw) #cc_binary(test_executor SRCS test_executor.cc DEPS executor op_registry ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} ) #cc_binary(new_executor SRCS new_exec_test.cc DEPS operator op_registry executor ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} profiler) @@ -459,4 +458,3 @@ else() cc_library(fluid_convert_utils SRCS convert_utils.cc DEPS data_type place) endif() cc_test(convert_utils_test SRCS convert_utils_test.cc DEPS fluid_convert_utils) -cc_test(custom_kernel_test SRCS custom_kernel_test.cc DEPS custom_kernel pten_tensor) diff --git a/paddle/fluid/framework/custom_kernel.cc b/paddle/fluid/framework/custom_kernel.cc index 3a00d9424646a5d7caae251edc55c62e5d024105..49a1e0774a6b1a7a1afd154029850ceb52040759 100644 --- a/paddle/fluid/framework/custom_kernel.cc +++ b/paddle/fluid/framework/custom_kernel.cc @@ -18,355 +18,24 @@ limitations under the License. */ #endif #include "paddle/fluid/framework/custom_kernel.h" -#include -#include -#include -#include "paddle/fluid/framework/op_kernel_info_helper.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/phi/api/ext/op_kernel_info.h" -#include "paddle/phi/core/compat/convert_utils.h" -#include "paddle/phi/core/kernel_context.h" -#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/custom_kernel.h" namespace paddle { - namespace framework { -// set phi::Kernel args_def_ from op_kernel_info -// because we can not set directly to phi::Kernel without exposing -// phi::KernelArgsDef when parsing custom user function -static void ParseArgs(const OpKernelInfo& op_kernel_info, - phi::KernelArgsDef* args_def) { - auto& input_defs = OpKernelInfoHelper::GetInputDefs(op_kernel_info); - auto& output_defs = OpKernelInfoHelper::GetOutputDefs(op_kernel_info); - auto& attribute_defs = OpKernelInfoHelper::GetAttributeDefs(op_kernel_info); - - for (auto& input : input_defs) { - auto type_index = - input.is_vector - ? std::type_index(typeid(const std::vector&)) - : std::type_index(typeid(const phi::DenseTensor&)); - args_def->AppendInput(input.backend, input.layout, input.dtype, type_index); - } - for (auto& output : output_defs) { - auto type_index = - output.is_vector - ? 
std::type_index(typeid(const std::vector&)) - : std::type_index(typeid(const phi::DenseTensor&)); - args_def->AppendOutput(output.backend, output.layout, output.dtype, - type_index); - } - for (auto& attr : attribute_defs) { - args_def->AppendAttribute(attr.type_index); - } -} - -// custom pten kernel call function define -static void RunKernelFunc(phi::KernelContext* ctx, - const OpKernelInfo& op_kernel_info) { - VLOG(3) << "[CUSTOM KERNEL] RunKernelFunc begin..."; - - // input and output size is not params' num - // but actual Tensors' size - size_t input_size = ctx->InputsSize(); - size_t output_size = ctx->OutputsSize(); - size_t attr_size = ctx->AttrsSize(); - - // parameters' num of unified user kernel function - auto& input_defs = OpKernelInfoHelper::GetInputDefs(op_kernel_info); - auto& output_defs = OpKernelInfoHelper::GetOutputDefs(op_kernel_info); - auto& attribute_defs = OpKernelInfoHelper::GetAttributeDefs(op_kernel_info); - - PADDLE_ENFORCE_GE(input_size, input_defs.size(), - platform::errors::InvalidArgument( - "the size of ctx inputs size (%d) must be larger than " - "the size of kernel input_defs (%d).", - input_size, input_defs.size())); - - PADDLE_ENFORCE_GE(output_size, output_defs.size(), - platform::errors::InvalidArgument( - "the size of ctx outputs size (%d) must be larger than " - "the size of kernel output_defs (%d).", - output_size, output_defs.size())); - - PADDLE_ENFORCE_EQ(attr_size, attribute_defs.size(), - platform::errors::InvalidArgument( - "the size of ctx attribute size (%d) must be equal to " - "to the size of kernel attribute_defs (%d).", - attr_size, attribute_defs.size())); - - VLOG(3) << "[CUSTOM KERNEL] Input num: " << input_defs.size() - << "[tensor size:" << input_size << "]" - << " Attribute num: " << attribute_defs.size() - << " Output num: " << output_defs.size() - << "[tensor size:" << output_size << "]."; - - // Inputs mapping - std::vector custom_ins; - std::vector> custom_vec_ins; - for (size_t in_idx = 0; in_idx < input_defs.size(); ++in_idx) { - VLOG(3) << "Mapping Input[" << in_idx << "]"; - const std::pair range = ctx->InputRangeAt(in_idx); - - // is_vector tells if this Input is Tensor or std::vector - if (!input_defs.at(in_idx).is_vector) { - paddle::experimental::Tensor custom_t; - auto& ctx_tensor = ctx->InputAt(range.first); - custom_t.set_impl(std::make_shared(ctx_tensor)); - custom_ins.emplace_back(custom_t); - } else { - std::vector custom_vec_in; - auto ctx_tensor_vec = - ctx->MoveInputsBetween(range.first, range.second); - for (auto& ctx_tensor : ctx_tensor_vec) { - paddle::experimental::Tensor custom_t; - custom_t.set_impl(std::make_shared(ctx_tensor)); - custom_vec_in.emplace_back(custom_t); - } - custom_vec_ins.emplace_back(custom_vec_in); - } - VLOG(3) << "Mapped Input[" << in_idx << "] with range[" << range.first - << "," << range.second << ")."; - } - - // Attributes mapping - std::vector custom_attrs; - for (size_t attr_idx = 0; attr_idx < attribute_defs.size(); ++attr_idx) { - VLOG(3) << "Mapping Attribute[" << attr_idx << "]"; - if (attribute_defs[attr_idx].type_index == std::type_index(typeid(bool))) { - bool arg = ctx->AttrAt(attr_idx); - custom_attrs.emplace_back(arg); - } else if (attribute_defs[attr_idx].type_index == - std::type_index(typeid(int))) { - int arg = ctx->AttrAt(attr_idx); - custom_attrs.emplace_back(arg); - } else if (attribute_defs[attr_idx].type_index == - std::type_index(typeid(float))) { - float arg = ctx->AttrAt(attr_idx); - custom_attrs.emplace_back(arg); - } else if 
(attribute_defs[attr_idx].type_index == - std::type_index(typeid(double))) { - double arg = ctx->AttrAt(attr_idx); - custom_attrs.emplace_back(arg); - } else if (attribute_defs[attr_idx].type_index == - std::type_index(typeid(int64_t))) { - int64_t arg = ctx->AttrAt(attr_idx); - custom_attrs.emplace_back(arg); - } else if (attribute_defs[attr_idx].type_index == - std::type_index(typeid(phi::dtype::float16))) { - phi::dtype::float16 arg = ctx->AttrAt(attr_idx); - custom_attrs.emplace_back(arg); - } else if (attribute_defs[attr_idx].type_index == - std::type_index(typeid(DataType))) { - DataType arg = ctx->AttrAt(attr_idx); - custom_attrs.emplace_back(arg); - } else if (attribute_defs[attr_idx].type_index == - std::type_index(typeid(const Scalar&))) { - const Scalar& arg = ctx->AttrAt(attr_idx); - custom_attrs.emplace_back(arg); - } else if (attribute_defs[attr_idx].type_index == - std::type_index(typeid(const std::vector&))) { - const std::vector& arg = - ctx->AttrAt&>(attr_idx); - custom_attrs.emplace_back(arg); - } else if (attribute_defs[attr_idx].type_index == - std::type_index(typeid(const ScalarArray&))) { - const ScalarArray& arg = ctx->AttrAt(attr_idx); - custom_attrs.emplace_back(arg); - } else if (attribute_defs[attr_idx].type_index == - std::type_index(typeid(const std::vector&))) { - const std::vector& arg = - ctx->AttrAt&>(attr_idx); - custom_attrs.emplace_back(arg); - } else { - PADDLE_THROW(platform::errors::Unimplemented( - "Unsupported attribute attribute_defs[%d].type_index", attr_idx)); - } - VLOG(3) << "Mapped Attribute[" << attr_idx << "]"; - } - - // Outputs mapping - std::vector custom_outs; - std::vector> custom_vec_outs; - std::vector> custom_outs_ptr; - std::vector>> - custom_vec_outs_ptr; - - for (size_t out_idx = 0; out_idx < output_defs.size(); ++out_idx) { - VLOG(3) << "Mapping Output[" << out_idx << "]"; - const std::pair range = ctx->OutputRangeAt(out_idx); - - // is_vector tells if this Output is Tensor or std::vector - if (!output_defs.at(out_idx).is_vector) { - auto* ctx_tensor = ctx->MutableOutputAt(range.first); - auto* custom_t = new paddle::experimental::Tensor(); - auto custom_t_ptr = std::make_shared(*ctx_tensor); - custom_t->set_impl(custom_t_ptr); - custom_outs.emplace_back(custom_t); - custom_outs_ptr.emplace_back(custom_t_ptr); - } else { - std::vector custom_vec_out; - std::vector> custom_vec_out_ptr; - auto ctx_tensor_vec = ctx->MutableOutputBetween( - range.first, range.second); - for (auto ctx_tensor : ctx_tensor_vec) { - auto* custom_t = new paddle::experimental::Tensor(); - auto custom_t_ptr = std::make_shared(*ctx_tensor); - custom_t->set_impl(custom_t_ptr); - custom_vec_out.emplace_back(custom_t); - custom_vec_out_ptr.emplace_back(custom_t_ptr); - } - custom_vec_outs.emplace_back(custom_vec_out); - custom_vec_outs_ptr.emplace_back(custom_vec_out_ptr); - } - VLOG(3) << "Mapped Output[" << out_idx << "] with range[" << range.first - << "," << range.second << ")."; - } - - // DeviceContext - // In pten, the first paramter XXContext is decided when registering - // through template param, but custom kernel function use unified - // DeviceContext as first parameter of user_kernel_fn, we use backend - // from OpKernelInfo to decide XXContext. In temporary simple - // DeviceContext, we just set necessary info to dev_ctx(such as stream - // in NPUContext), more related work should be done when - // phi::DeviceContext is exposed to outer. 
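Note: the removed RunKernelFunc block above dispatches every attribute by comparing `std::type_index` values before unpacking it with the matching type. A minimal, self-contained sketch of that dispatch pattern in plain C++17 — not Paddle's `KernelContext`/`AttrAt` API, all names here are illustrative only:

```cpp
#include <any>
#include <iostream>
#include <typeindex>
#include <vector>

// Unpack one type-erased attribute by inspecting its std::type_index,
// mirroring the if/else chain the removed code used per attribute slot.
void EmplaceAttr(const std::any& attr, std::vector<std::any>* out) {
  const std::type_index t(attr.type());
  if (t == std::type_index(typeid(bool))) {
    out->emplace_back(std::any_cast<bool>(attr));
  } else if (t == std::type_index(typeid(int))) {
    out->emplace_back(std::any_cast<int>(attr));
  } else if (t == std::type_index(typeid(float))) {
    out->emplace_back(std::any_cast<float>(attr));
  } else {
    // Paddle raises an enforce error for unsupported attribute types.
    std::cerr << "unsupported attribute type\n";
  }
}

int main() {
  std::vector<std::any> attrs;
  EmplaceAttr(std::any(3), &attrs);     // int attribute
  EmplaceAttr(std::any(0.5f), &attrs);  // float attribute
  std::cout << attrs.size() << " attributes mapped\n";
  return 0;
}
```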
- DeviceContext dev_ctx; - auto& backend = OpKernelInfoHelper::GetBackend(op_kernel_info); - if (backend == phi::Backend::CPU) { - // do nothing - } else { -#ifdef PADDLE_WITH_CUSTOM_DEVICE - size_t device_type_id_ = static_cast(backend) - - static_cast(phi::Backend::ALL_BACKEND); - std::string device_type = phi::GetGlobalDeviceType(device_type_id_); - if (!device_type.empty()) { - auto custom_ctx = - ctx->GetDeviceContext(); - dev_ctx.set_stream(custom_ctx.stream()); - return; - } -#endif - LOG(ERROR) << "[CUSTOM KERNEL] Unsupported kernel backend: " << backend - << " with compiled Paddle."; - return; - } - - auto& user_kernel_fn = OpKernelInfoHelper::GetKernelFn(op_kernel_info); - // call user function - user_kernel_fn(dev_ctx, custom_ins, custom_vec_ins, custom_attrs, - &custom_outs, &custom_vec_outs); - - VLOG(3) << "[CUSTOM KERNEL] finished call user kernel function."; - - // NOTE: Map back the output tensors with stored shared_ptrs. - for (int out_idx = output_defs.size() - 1; out_idx >= 0; --out_idx) { - VLOG(3) << "Mapping Back Output[" << out_idx << "]"; - const std::pair range = ctx->OutputRangeAt(out_idx); - - // is_vector tells if this Output is Tensor or std::vector - if (!output_defs.at(out_idx).is_vector) { - auto* ctx_tensor = ctx->MutableOutputAt(range.first); - *ctx_tensor = *(custom_outs_ptr.back().get()); - custom_outs_ptr.pop_back(); - } else { - auto ctx_tensor_vec = ctx->MutableOutputBetween( - range.first, range.second); - auto custom_vec_ptr_out = custom_vec_outs_ptr.back(); - for (int idx = ctx_tensor_vec.size() - 1; idx >= 0; --idx) { - *(ctx_tensor_vec[idx]) = *(custom_vec_ptr_out.back().get()); - custom_vec_ptr_out.pop_back(); - } - custom_vec_outs_ptr.pop_back(); - } - VLOG(3) << "Mapped Output[" << out_idx << "] with range[" << range.first - << "," << range.second << "]."; - } - - // delete newed paddle::Tensor for outputs while calling user kernel function - for (size_t i = 0; i < custom_outs.size(); ++i) { - delete custom_outs[i]; - } - for (size_t i = 0; i < custom_vec_outs.size(); ++i) { - for (size_t j = 0; j < custom_vec_outs[i].size(); ++j) { - delete custom_vec_outs[i][j]; - } - } -} - -void RegisterKernelWithMetaInfo( - const std::vector& op_kernel_infos) { - for (size_t i = 0; i < op_kernel_infos.size(); ++i) { - auto& kernel_info = op_kernel_infos[i]; - auto op_type = OpKernelInfoHelper::GetOpName(kernel_info); - auto kernel_key = OpKernelInfoHelper::GetKernelKey(kernel_info); - - VLOG(3) << "[CUSTOM KERNEL] registering [" << op_type << "]" << kernel_key; - - // 1.Check whether this kernel is valid for a specific operator - PADDLE_ENFORCE_EQ( - phi::KernelFactory::Instance().HasCompatiblePtenKernel(op_type), true, - platform::errors::InvalidArgument( - "[CUSTOM KERNEL] %s is not ready for custom kernel registering.", - op_type)); - - // 2.Check whether kernel_key has been already registed - PADDLE_ENFORCE_EQ( - phi::KernelFactory::Instance().kernels()[op_type].find(kernel_key), - phi::KernelFactory::Instance().kernels()[op_type].end(), - platform::errors::InvalidArgument( - "[CUSTOM KERNEL] The operator <%s>'s kernel: %s has been " - "already existed in Paddle, please contribute PR if need " - "to optimize the kernel code. 
Custom kernel do NOT support " - "to replace existing kernel in Paddle.", - op_type, kernel_key)); - - // phi::KernelFn - phi::KernelFn kernel_fn = [kernel_info](phi::KernelContext* ctx) { - VLOG(3) << "[CUSTOM KERNEL] run custom PTEN kernel func in lambda."; - RunKernelFunc(ctx, kernel_info); - }; - // variadic_kernel_fn - void* variadic_kernel_fn = - OpKernelInfoHelper::GetVariadicKernelFn(kernel_info); - phi::Kernel kernel(kernel_fn, variadic_kernel_fn); - // args info - ParseArgs(kernel_info, kernel.mutable_args_def()); - // register custom kernel to phi::KernelFactory - phi::KernelFactory::Instance().kernels()[op_type][kernel_key] = kernel; - VLOG(3) << "[CUSTOM KERNEL] Successed in registering operator <" << op_type - << ">'s kernel " << kernel_key << " to Paddle. " - << "It will be used like native ones."; - } -} - -void RegisterKernelWithMetaInfoMap( - const paddle::OpKernelInfoMap& op_kernel_info_map) { - auto& kernel_info_map = op_kernel_info_map.GetMap(); - VLOG(3) << "[CUSTOM KERNEL] size of op_kernel_info_map: " - << kernel_info_map.size(); - - // pair: {op_type, OpKernelInfo} - for (auto& pair : kernel_info_map) { - VLOG(3) << "[CUSTOM KERNEL] pair first -> op name: " << pair.first; - RegisterKernelWithMetaInfo(pair.second); - } -} - void LoadCustomKernelLib(const std::string& dso_lib_path, void* dso_handle) { #ifdef _LINUX - typedef OpKernelInfoMap& get_op_kernel_info_map_t(); - auto* func = reinterpret_cast( - dlsym(dso_handle, "PD_GetOpKernelInfoMap")); + typedef phi::CustomKernelMap& get_custom_kernel_map_t(); + auto* func = reinterpret_cast( + dlsym(dso_handle, "PD_GetCustomKernelMap")); if (func == nullptr) { LOG(WARNING) << "Skipped lib [" << dso_lib_path << "]: fail to find " - << "PD_GetOpKernelInfoMap symbol in this lib."; + << "PD_GetCustomKernelMap symbol in this lib."; return; } - auto& op_kernel_info_map = func(); - RegisterKernelWithMetaInfoMap(op_kernel_info_map); + auto& custom_kernel_map = func(); + phi::RegisterCustomKernels(custom_kernel_map); LOG(INFO) << "Successed in loading custom kernels in lib: " << dso_lib_path; #else VLOG(3) << "Unsupported: Custom kernel is only implemented on Linux."; diff --git a/paddle/fluid/framework/custom_kernel.h b/paddle/fluid/framework/custom_kernel.h index 30bccc97000f8847ddcf7ebddb4eabd6a6992afe..31084a34413ea4324c69062303ef84621a463aaf 100644 --- a/paddle/fluid/framework/custom_kernel.h +++ b/paddle/fluid/framework/custom_kernel.h @@ -14,22 +14,13 @@ limitations under the License. */ #pragma once -#include "paddle/phi/api/ext/op_kernel_info.h" +#include namespace paddle { namespace framework { +// Load custom kernel lib and register void LoadCustomKernelLib(const std::string& dso_lib_path, void* dso_handle); -// Load custom kernel api: register kernel after user compiled -void LoadOpKernelInfoAndRegister(const std::string& dso_name); - -// Register custom kernel api: register kernel directly -void RegisterKernelWithMetaInfoMap( - const paddle::OpKernelInfoMap& op_kernel_info_map); - -// Interface for selective register custom kernel. 
-void RegisterKernelWithMetaInfo( - const std::vector& op_kernel_infos); } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/details/grad_merge_all_reduce_op_handle.cc b/paddle/fluid/framework/details/grad_merge_all_reduce_op_handle.cc index a6232667193438d9ca2346cd573d60e5dc5f802a..44b9ca90fc540b39d5b3ae53f3ddcee2c8d74d6f 100644 --- a/paddle/fluid/framework/details/grad_merge_all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/grad_merge_all_reduce_op_handle.cc @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. #include "paddle/fluid/framework/details/grad_merge_all_reduce_op_handle.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) DECLARE_bool(sync_nccl_allreduce); @@ -47,6 +48,8 @@ GradMergeAllReduceOpHandle::GradMergeAllReduceOpHandle( #endif void GradMergeAllReduceOpHandle::RunImpl() { + platform::RecordEvent record_event( + Name(), platform::TracerEventType::Communication, 1); PADDLE_ENFORCE_GT(local_scopes_.size(), 0, platform::errors::PreconditionNotMet( "The number of local scope should be > 0, but got %zu.", @@ -96,6 +99,8 @@ FusedGradMergeAllReduceOpHandle::FusedGradMergeAllReduceOpHandle( #endif void FusedGradMergeAllReduceOpHandle::RunImpl() { + platform::RecordEvent record_event( + Name(), platform::TracerEventType::Communication, 1); PADDLE_ENFORCE_GT(local_scopes_.size(), 0, platform::errors::PreconditionNotMet( "The number of local scope should be > 0, but got %zu.", diff --git a/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt b/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt index 189724a5455200bdfbd0497aee53bc949df412e9..17346f5fd939324e6c2d709fb09be2cb65669429 100644 --- a/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt +++ b/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt @@ -10,6 +10,8 @@ IF(WITH_GPU) nv_library(heter_comm SRCS heter_comm.h feature_value.h heter_resource.cc heter_resource.h hashtable.h mem_pool.h DEPS ${HETERPS_DEPS}) nv_test(test_heter_comm SRCS feature_value.h DEPS heter_comm) nv_library(heter_ps SRCS heter_ps.cu DEPS heter_comm) + nv_library(graph_gpu_ps SRCS graph_gpu_ps_table.h DEPS heter_comm) + nv_test(test_graph_comm SRCS test_graph.cu DEPS graph_gpu_ps) ENDIF() IF(WITH_ROCM) hip_library(heter_comm SRCS heter_comm.h feature_value.h heter_resource.cc heter_resource.h hashtable.h DEPS cub device_context) diff --git a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h new file mode 100644 index 0000000000000000000000000000000000000000..a6508bf96c00f835da4aee79503f16fa5451e794 --- /dev/null +++ b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h @@ -0,0 +1,144 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
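Note: after the custom_kernel changes above, a custom-kernel shared library only needs to export `PD_GetCustomKernelMap`; `LoadCustomKernelLib` resolves that symbol with `dlsym` and hands the returned map to `phi::RegisterCustomKernels`. A minimal sketch of that loading flow, assuming a hypothetical library name and treating the map type as opaque (only the symbol name comes from this diff):

```cpp
#include <dlfcn.h>
#include <cstdio>

struct CustomKernelMap;  // opaque stand-in for phi::CustomKernelMap
using get_custom_kernel_map_t = CustomKernelMap&();

int main() {
  // Hypothetical library path; Paddle scans a plugin directory instead.
  void* handle = dlopen("libmy_custom_kernels.so", RTLD_NOW);
  if (!handle) {
    std::fprintf(stderr, "dlopen failed: %s\n", dlerror());
    return 1;
  }
  auto* get_map = reinterpret_cast<get_custom_kernel_map_t*>(
      dlsym(handle, "PD_GetCustomKernelMap"));
  if (!get_map) {
    // Paddle logs a warning and skips the library in this case.
    std::fprintf(stderr, "PD_GetCustomKernelMap not found, skipping lib\n");
    return 0;
  }
  CustomKernelMap& kernels = get_map();  // Paddle then registers this map.
  (void)kernels;
  return 0;
}
```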
+ +#pragma once +#include "heter_comm.h" +#include "paddle/fluid/platform/enforce.h" +#ifdef PADDLE_WITH_HETERPS +namespace paddle { +namespace framework { +struct GpuPsGraphNode { + int64_t node_id; + int neighbor_size, neighbor_offset; + // this node's neighbor is stored on [neighbor_offset,neighbor_offset + + // neighbor_size) of int64_t *neighbor_list; +}; + +struct GpuPsCommGraph { + int64_t *neighbor_list; + GpuPsGraphNode *node_list; + int neighbor_size, node_size; + // the size of neighbor array and graph_node_list array + GpuPsCommGraph() + : neighbor_list(NULL), node_list(NULL), neighbor_size(0), node_size(0) {} + GpuPsCommGraph(int64_t *neighbor_list_, GpuPsGraphNode *node_list_, + int neighbor_size_, int node_size_) + : neighbor_list(neighbor_list_), + node_list(node_list_), + neighbor_size(neighbor_size_), + node_size(node_size_) {} +}; + +/* +suppose we have a graph like this + +0----3-----5----7 + \ |\ |\ + 17 8 9 1 2 + +we save the nodes in arbitrary order, +in this example,the order is +[0,5,1,2,7,3,8,9,17] +let us name this array u_id; +we record each node's neighbors: +0:3,17 +5:3,7 +1:7 +2:7 +7:1,2,5 +3:0,5,8,9 +8:3 +9:3 +17:0 + +by concatenating each node's neighbor_list in the order we save the node id. +we get [3,17,3,7,7,7,1,2,5,0,5,8,9,3,3,0] +this is the neighbor_list of GpuPsCommGraph +given this neighbor_list and the order to save node id, +we know, +node 0's neighbors are in the range [0,1] of neighbor_list +node 5's neighbors are in the range [2,3] of neighbor_list +node 1's neighbors are in the range [4,4] of neighbor_list +node 2:[5,5] +node 7:[6,6] +node 3:[9,12] +node 8:[13,13] +node 9:[14,14] +node 17:[15,15] +... +by the above information, +we generate a node_list:GpuPsGraphNode *graph_node_list in GpuPsCommGraph +of size 9, +where node_list[i].id = u_id[i] +then we have: +node_list[0]-> node_id:0, neighbor_size:2, neighbor_offset:0 +node_list[1]-> node_id:5, neighbor_size:2, neighbor_offset:2 +node_list[2]-> node_id:1, neighbor_size:1, neighbor_offset:4 +node_list[3]-> node_id:2, neighbor_size:1, neighbor_offset:5 +node_list[4]-> node_id:7, neighbor_size:3, neighbor_offset:6 +node_list[5]-> node_id:3, neighbor_size:4, neighbor_offset:9 +node_list[6]-> node_id:8, neighbor_size:1, neighbor_offset:13 +node_list[7]-> node_id:9, neighbor_size:1, neighbor_offset:14 +node_list[8]-> node_id:17, neighbor_size:1, neighbor_offset:15 +*/ +struct NeighborSampleResult { + int64_t *val; + int *actual_sample_size, sample_size, key_size; + NeighborSampleResult(int _sample_size, int _key_size) + : sample_size(_sample_size), key_size(_key_size) { + actual_sample_size = NULL; + val = NULL; + }; + ~NeighborSampleResult() { + if (val != NULL) cudaFree(val); + if (actual_sample_size != NULL) cudaFree(actual_sample_size); + } +}; + +struct NodeQueryResult { + int64_t *val; + int actual_sample_size; + NodeQueryResult() { + val = NULL; + actual_sample_size = 0; + }; + ~NodeQueryResult() { + if (val != NULL) cudaFree(val); + } +}; +class GpuPsGraphTable : public HeterComm { + public: + GpuPsGraphTable(std::shared_ptr resource) + : HeterComm(1, resource) { + load_factor_ = 0.25; + } + void build_graph_from_cpu(std::vector &cpu_node_list); + NodeQueryResult *graph_node_sample(int gpu_id, int sample_size); + NeighborSampleResult *graph_neighbor_sample(int gpu_id, int64_t *key, + int sample_size, int len); + NodeQueryResult *query_node_list(int gpu_id, int start, int query_size); + void clear_graph_info(); + void move_neighbor_sample_result_to_source_gpu(int gpu_id, int gpu_num, + 
int sample_size, int *h_left, + int *h_right, + int64_t *src_sample_res, + int *actual_sample_size); + + private: + std::vector gpu_graph_list; +}; +} +}; +#include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.h" +#endif diff --git a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.h b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.h new file mode 100644 index 0000000000000000000000000000000000000000..839c7e5468c6c6938c6b4cda3dd879c7366e7d6e --- /dev/null +++ b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.h @@ -0,0 +1,447 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#ifdef PADDLE_WITH_HETERPS +namespace paddle { +namespace framework { +/* +comment 0 +this kernel just serves as an example of how to sample nodes' neighbors. +feel free to modify it +index[0,len) saves the nodes' index +actual_size[0,len) is to save the sample size of each node. +for ith node in index, actual_size[i] = min(node i's neighbor size, sample size) +sample_result is to save the neighbor sampling result, its size is len * +sample_size; + +*/ + +__global__ void neighbor_sample_example(GpuPsCommGraph graph, int* index, + int* actual_size, + int64_t* sample_result, int sample_size, + int len) { + const size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < len) { + auto node_index = index[i]; + actual_size[i] = graph.node_list[node_index].neighbor_size < sample_size + ? graph.node_list[node_index].neighbor_size + : sample_size; + int offset = graph.node_list[node_index].neighbor_offset; + for (int j = 0; j < actual_size[i]; j++) { + sample_result[sample_size * i + j] = graph.neighbor_list[offset + j]; + } + } +} + +/* + comment 1 + + gpu i triggers a neighbor_sample task, + when this task is done, + this function is called to move the sample result on other gpu back + to gup i and aggragate the result. + the sample_result is saved on src_sample_res and the actual sample size for + each node is saved on actual_sample_size. + the number of actual sample_result for + key[x] (refer to comment 2 for definition of key) + is saved on actual_sample_size[x], since the neighbor size of key[x] might be + smaller than sample_size, + is saved on src_sample_res [x*sample_size, x*sample_size + + actual_sample_size[x]) + + since before each gpu runs the neighbor_sample task,the key array is shuffled, + but we have the idx array to save the original order. + when the gpu i gets all the sample results from other gpus, it relies on + idx array to recover the original order. + that's what fill_dvals does. 
+ +*/ +void GpuPsGraphTable::move_neighbor_sample_result_to_source_gpu( + int gpu_id, int gpu_num, int sample_size, int* h_left, int* h_right, + int64_t* src_sample_res, int* actual_sample_size) { + for (int i = 0; i < gpu_num; i++) { + if (h_left[i] == -1 || h_right[i] == -1) { + continue; + } + auto shard_len = h_right[i] - h_left[i] + 1; + // int cur_step = path_[gpu_id][i].nodes_.size() - 1; + // auto& node = path_[gpu_id][i].nodes_[cur_step]; + auto& node = path_[gpu_id][i].nodes_.front(); + cudaMemcpyAsync( + reinterpret_cast(src_sample_res + h_left[i] * sample_size), + node.val_storage + sizeof(int64_t) * shard_len, + node.val_bytes_len - sizeof(int64_t) * shard_len, cudaMemcpyDefault, + node.out_stream); + cudaMemcpyAsync(reinterpret_cast(actual_sample_size + h_left[i]), + node.val_storage + sizeof(int) * shard_len, + sizeof(int) * shard_len, cudaMemcpyDefault, + node.out_stream); + } + for (int i = 0; i < gpu_num; ++i) { + if (h_left[i] == -1 || h_right[i] == -1) { + continue; + } + auto& node = path_[gpu_id][i].nodes_.front(); + cudaStreamSynchronize(node.out_stream); + } +} + +/* +TODO: +how to optimize it to eliminate the for loop +*/ +__global__ void fill_dvalues(int64_t* d_shard_vals, int64_t* d_vals, + int* d_shard_actual_sample_size, + int* d_actual_sample_size, int* idx, + int sample_size, int len) { + const size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < len) { + d_actual_sample_size[idx[i]] = d_shard_actual_sample_size[i]; + // d_vals[idx[i]] = d_shard_vals[i]; + for (int j = 0; j < sample_size; j++) { + d_vals[idx[i] * sample_size + j] = d_shard_vals[i * sample_size + j]; + } + } +} + +__global__ void node_query_example(GpuPsCommGraph graph, int start, int size, + int64_t* res) { + const size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < size) { + res[i] = graph.node_list[start + i].node_id; + } +} + +void GpuPsGraphTable::clear_graph_info() { + if (tables_.size()) { + for (auto table : tables_) delete table; + } + tables_.clear(); + for (auto graph : gpu_graph_list) { + if (graph.neighbor_list != NULL) { + cudaFree(graph.neighbor_list); + } + if (graph.node_list != NULL) { + cudaFree(graph.node_list); + } + } + gpu_graph_list.clear(); +} +/* +the parameter std::vector cpu_graph_list is generated by cpu. +it saves the graph to be saved on each gpu. 
+ +for the ith GpuPsCommGraph, any the node's key satisfies that key % gpu_number +== i + +In this function, memory is allocated on each gpu to save the graphs, +gpu i saves the ith graph from cpu_graph_list +*/ + +void GpuPsGraphTable::build_graph_from_cpu( + std::vector& cpu_graph_list) { + PADDLE_ENFORCE_EQ( + cpu_graph_list.size(), resource_->total_gpu(), + platform::errors::InvalidArgument("the cpu node list size doesn't match " + "the number of gpu on your machine.")); + clear_graph_info(); + for (int i = 0; i < cpu_graph_list.size(); i++) { + platform::CUDADeviceGuard guard(resource_->dev_id(i)); + gpu_graph_list.push_back(GpuPsCommGraph()); + auto table = + new Table(std::max(1, cpu_graph_list[i].node_size) / load_factor_); + tables_.push_back(table); + if (cpu_graph_list[i].node_size > 0) { + std::vector keys; + std::vector offset; + cudaMalloc((void**)&gpu_graph_list[i].node_list, + cpu_graph_list[i].node_size * sizeof(GpuPsGraphNode)); + cudaMemcpy(gpu_graph_list[i].node_list, cpu_graph_list[i].node_list, + cpu_graph_list[i].node_size * sizeof(GpuPsGraphNode), + cudaMemcpyHostToDevice); + for (int j = 0; j < cpu_graph_list[i].node_size; j++) { + keys.push_back(cpu_graph_list[i].node_list[j].node_id); + offset.push_back(j); + } + build_ps(i, keys.data(), offset.data(), keys.size(), 1024, 8); + gpu_graph_list[i].node_size = cpu_graph_list[i].node_size; + } else { + gpu_graph_list[i].node_list = NULL; + gpu_graph_list[i].node_size = 0; + } + if (cpu_graph_list[i].neighbor_size) { + cudaMalloc((void**)&gpu_graph_list[i].neighbor_list, + cpu_graph_list[i].neighbor_size * sizeof(int64_t)); + cudaMemcpy(gpu_graph_list[i].neighbor_list, + cpu_graph_list[i].neighbor_list, + cpu_graph_list[i].neighbor_size * sizeof(int64_t), + cudaMemcpyHostToDevice); + gpu_graph_list[i].neighbor_size = cpu_graph_list[i].neighbor_size; + } else { + gpu_graph_list[i].neighbor_list = NULL; + gpu_graph_list[i].neighbor_size = 0; + } + } + cudaDeviceSynchronize(); +} +NeighborSampleResult* GpuPsGraphTable::graph_neighbor_sample(int gpu_id, + int64_t* key, + int sample_size, + int len) { + /* + comment 2 + this function shares some kernels with heter_comm_inl.h + arguments definitions: + gpu_id:the id of gpu. + len:how many keys are used,(the length of array key) + sample_size:how many neighbors should be sampled for each node in key. + + the code below shuffle the key array to make the keys + that belong to a gpu-card stay together, + the shuffled result is saved on d_shard_keys, + if ith element in d_shard_keys_ptr is + from jth element in the original key array, then idx[i] = j, + idx could be used to recover the original array. 
+ if keys in range [a,b] belong to ith-gpu, then h_left[i] = a, h_right[i] = + b, + if no keys are allocated for ith-gpu, then h_left[i] == h_right[i] == -1 + + for example, suppose key = [0,1,2,3,4,5,6,7,8], gpu_num = 2 + when we run this neighbor_sample function, + the key is shuffled to [0,2,4,6,8,1,3,5,7] + the first part (0,2,4,6,8) % 2 == 0,thus should be handled by gpu 0, + the rest part should be handled by gpu1, because (1,3,5,7) % 2 == 1, + h_left = [0,5],h_right = [4,8] + + */ + NeighborSampleResult* result = new NeighborSampleResult(sample_size, len); + if (len == 0) { + return result; + } + cudaMalloc((void**)&result->val, len * sample_size * sizeof(int64_t)); + cudaMalloc((void**)&result->actual_sample_size, len * sizeof(int)); + int* actual_sample_size = result->actual_sample_size; + int64_t* val = result->val; + int total_gpu = resource_->total_gpu(); + int dev_id = resource_->dev_id(gpu_id); + platform::CUDAPlace place = platform::CUDAPlace(dev_id); + platform::CUDADeviceGuard guard(dev_id); + auto stream = resource_->local_stream(gpu_id, 0); + + int grid_size = (len - 1) / block_size_ + 1; + + int h_left[total_gpu]; // NOLINT + int h_right[total_gpu]; // NOLINT + + auto d_left = memory::Alloc(place, total_gpu * sizeof(int)); + auto d_right = memory::Alloc(place, total_gpu * sizeof(int)); + int* d_left_ptr = reinterpret_cast(d_left->ptr()); + int* d_right_ptr = reinterpret_cast(d_right->ptr()); + + cudaMemsetAsync(d_left_ptr, -1, total_gpu * sizeof(int), stream); + cudaMemsetAsync(d_right_ptr, -1, total_gpu * sizeof(int), stream); + // + auto d_idx = memory::Alloc(place, len * sizeof(int)); + int* d_idx_ptr = reinterpret_cast(d_idx->ptr()); + + auto d_shard_keys = memory::Alloc(place, len * sizeof(int64_t)); + int64_t* d_shard_keys_ptr = reinterpret_cast(d_shard_keys->ptr()); + auto d_shard_vals = memory::Alloc(place, len * sizeof(int64_t)); + int64_t* d_shard_vals_ptr = reinterpret_cast(d_shard_vals->ptr()); + auto d_shard_actual_sample_size = memory::Alloc(place, len * sizeof(int)); + int* d_shard_actual_sample_size_ptr = + reinterpret_cast(d_shard_actual_sample_size->ptr()); + + split_input_to_shard(key, d_idx_ptr, len, d_left_ptr, d_right_ptr, gpu_id); + + fill_shard_key<<>>(d_shard_keys_ptr, key, + d_idx_ptr, len); + + cudaStreamSynchronize(stream); + + cudaMemcpy(h_left, d_left_ptr, total_gpu * sizeof(int), + cudaMemcpyDeviceToHost); + cudaMemcpy(h_right, d_right_ptr, total_gpu * sizeof(int), + cudaMemcpyDeviceToHost); + + for (int i = 0; i < total_gpu; ++i) { + int shard_len = h_left[i] == -1 ? 0 : h_right[i] - h_left[i] + 1; + if (shard_len == 0) { + continue; + } + /* + comment 3 + shard_len denotes the size of keys on i-th gpu here, + when we sample on i-th gpu, we allocate shard_len * (1 + sample_size) + int64_t units + of memory, we use alloc_mem_i to denote it, the range [0,shard_len) is saved + for the respective nodes' indexes + and acutal sample_size. + with nodes' indexes we could get the nodes to sample. + since size of int64_t is 8 bits, while size of int is 4, + the range of [0,shard_len) contains shard_len * 2 int uinits; + The values of the first half of this range will be updated by + the k-v map on i-th-gpu. + The second half of this range is saved for actual sample size of each node. 
+ For node x, + its sampling result is saved on the range + [shard_len + sample_size * x,shard_len + sample_size * x + + actual_sample_size_of_x) + of alloc_mem_i, actual_sample_size_of_x equals ((int + *)alloc_mem_i)[shard_len + x] + */ + create_storage(gpu_id, i, shard_len * sizeof(int64_t), + shard_len * (1 + sample_size) * sizeof(int64_t)); + } + walk_to_dest(gpu_id, total_gpu, h_left, h_right, d_shard_keys_ptr, NULL); + + for (int i = 0; i < total_gpu; ++i) { + if (h_left[i] == -1) { + continue; + } + // auto& node = path_[gpu_id][i].nodes_.back(); + auto& node = path_[gpu_id][i].nodes_.front(); + cudaStreamSynchronize(node.in_stream); + platform::CUDADeviceGuard guard(resource_->dev_id(i)); + // use the key-value map to update alloc_mem_i[0,shard_len) + tables_[i]->rwlock_->RDLock(); + tables_[i]->get(reinterpret_cast(node.key_storage), + reinterpret_cast(node.val_storage), + h_right[i] - h_left[i] + 1, + resource_->remote_stream(i, gpu_id)); + } + for (int i = 0; i < total_gpu; ++i) { + if (h_left[i] == -1) { + continue; + } + // cudaStreamSynchronize(resource_->remote_stream(i, num)); + // tables_[i]->rwlock_->UNLock(); + platform::CUDADeviceGuard guard(resource_->dev_id(i)); + auto& node = path_[gpu_id][i].nodes_.front(); + auto shard_len = h_right[i] - h_left[i] + 1; + auto graph = gpu_graph_list[i]; + int* res_array = reinterpret_cast(node.val_storage); + int* actual_size_array = res_array + shard_len; + int64_t* sample_array = (int64_t*)(res_array + shard_len * 2); + neighbor_sample_example<<remote_stream(i, gpu_id)>>>( + graph, res_array, actual_size_array, sample_array, sample_size, + shard_len); + } + + for (int i = 0; i < total_gpu; ++i) { + if (h_left[i] == -1) { + continue; + } + cudaStreamSynchronize(resource_->remote_stream(i, gpu_id)); + tables_[i]->rwlock_->UNLock(); + } + // walk_to_src(num, total_gpu, h_left, h_right, d_shard_vals_ptr); + move_neighbor_sample_result_to_source_gpu(gpu_id, total_gpu, sample_size, + h_left, h_right, d_shard_vals_ptr, + d_shard_actual_sample_size_ptr); + + fill_dvalues<<>>( + d_shard_vals_ptr, val, d_shard_actual_sample_size_ptr, actual_sample_size, + d_idx_ptr, sample_size, len); + cudaStreamSynchronize(stream); + for (int i = 0; i < total_gpu; ++i) { + int shard_len = h_left[i] == -1 ? 
0 : h_right[i] - h_left[i] + 1; + if (shard_len == 0) { + continue; + } + destroy_storage(gpu_id, i); + } + return result; +} + +NodeQueryResult* GpuPsGraphTable::graph_node_sample(int gpu_id, + int sample_size) {} + +NodeQueryResult* GpuPsGraphTable::query_node_list(int gpu_id, int start, + int query_size) { + NodeQueryResult* result = new NodeQueryResult(); + if (query_size <= 0) return result; + int& actual_size = result->actual_sample_size; + actual_size = 0; + cudaMalloc((void**)&result->val, query_size * sizeof(int64_t)); + int64_t* val = result->val; + int dev_id = resource_->dev_id(gpu_id); + platform::CUDADeviceGuard guard(dev_id); + std::vector idx, gpu_begin_pos, local_begin_pos, sample_size; + int size = 0; + /* + if idx[i] = a, gpu_begin_pos[i] = p1, + gpu_local_begin_pos[i] = p2; + sample_size[i] = s; + then on gpu a, the nodes of positions [p1,p1 + s) should be returned + and saved from the p2 position on the sample_result array + + for example: + suppose + gpu 0 saves [0,2,4,6,8], gpu1 saves [1,3,5,7] + start = 3, query_size = 5 + we know [6,8,1,3,5] should be returned; + idx = [0,1] + gpu_begin_pos = [3,0] + local_begin_pos = [0,3] + sample_size = [2,3] + + */ + for (int i = 0; i < gpu_graph_list.size() && query_size != 0; i++) { + auto graph = gpu_graph_list[i]; + if (graph.node_size == 0) { + continue; + } + if (graph.node_size + size > start) { + int cur_size = min(query_size, graph.node_size + size - start); + query_size -= cur_size; + idx.emplace_back(i); + gpu_begin_pos.emplace_back(start - size); + local_begin_pos.emplace_back(actual_size); + start += cur_size; + actual_size += cur_size; + sample_size.emplace_back(cur_size); + create_storage(gpu_id, i, 1, cur_size * sizeof(int64_t)); + } + size += graph.node_size; + } + for (int i = 0; i < idx.size(); i++) { + int dev_id_i = resource_->dev_id(idx[i]); + platform::CUDADeviceGuard guard(dev_id_i); + auto& node = path_[gpu_id][idx[i]].nodes_.front(); + int grid_size = (sample_size[i] - 1) / block_size_ + 1; + node_query_example<<remote_stream(idx[i], gpu_id)>>>( + gpu_graph_list[idx[i]], gpu_begin_pos[i], sample_size[i], + (int64_t*)node.val_storage); + } + + for (int i = 0; i < idx.size(); i++) { + cudaStreamSynchronize(resource_->remote_stream(idx[i], gpu_id)); + auto& node = path_[gpu_id][idx[i]].nodes_.front(); + cudaMemcpyAsync(reinterpret_cast(val + local_begin_pos[i]), + node.val_storage, node.val_bytes_len, cudaMemcpyDefault, + node.out_stream); + } + for (int i = 0; i < idx.size(); i++) { + auto& node = path_[gpu_id][idx[i]].nodes_.front(); + cudaStreamSynchronize(node.out_stream); + } + return result; +} +} +}; +#endif diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm.h index 7b43e68ff0151e03d426e11ed8266d25a125140a..1fca8cdf8bb801a57ec36ee957b27236f488a4b3 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm.h @@ -173,16 +173,18 @@ class HeterComm { void walk_to_src(int start_index, int gpu_num, int* h_left, int* h_right, ValType* src_val); - private: + protected: using Table = HashTable; - int block_size_{256}; - float load_factor_{0.75}; std::vector tables_; std::shared_ptr resource_; - CustomGradMerger merger_; - int topo_aware_{0}; std::vector> path_; + float load_factor_{0.75}; + int block_size_{256}; + + private: std::vector storage_; + CustomGradMerger merger_; + int topo_aware_{0}; int feanum_{1800 * 2048}; int multi_node_{0}; std::vector nccl_inner_comms_; diff --git 
a/paddle/fluid/framework/fleet/heter_ps/test_graph.cu b/paddle/fluid/framework/fleet/heter_ps/test_graph.cu new file mode 100644 index 0000000000000000000000000000000000000000..697e0ba2cdf3475d1e7ad48105bc55959461900f --- /dev/null +++ b/paddle/fluid/framework/fleet/heter_ps/test_graph.cu @@ -0,0 +1,112 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include "paddle/fluid/framework/fleet/heter_ps/feature_value.h" +#include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h" +#include "paddle/fluid/framework/fleet/heter_ps/heter_comm.h" +#include "paddle/fluid/framework/fleet/heter_ps/heter_resource.h" +#include "paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h" +#include "paddle/fluid/platform/cuda_device_guard.h" + +using namespace paddle::framework; +TEST(TEST_FLEET, graph_comm) { + int gpu_count = 3; + std::vector dev_ids; + dev_ids.push_back(0); + dev_ids.push_back(1); + dev_ids.push_back(2); + std::shared_ptr resource = + std::make_shared(dev_ids); + resource->enable_p2p(); + GpuPsGraphTable g(resource); + int node_count = 10; + std::vector> neighbors(node_count); + int ind = 0; + int64_t node_id = 0; + std::vector graph_list(gpu_count); + while (ind < node_count) { + int neighbor_size = ind + 1; + graph_list[ind % gpu_count].node_size++; + graph_list[ind % gpu_count].neighbor_size += neighbor_size; + while (neighbor_size--) { + neighbors[ind].push_back(node_id++); + } + ind++; + } + std::vector neighbor_offset(gpu_count, 0), node_index(gpu_count, 0); + for (int i = 0; i < graph_list.size(); i++) { + graph_list[i].node_list = new GpuPsGraphNode[graph_list[i].node_size]; + graph_list[i].neighbor_list = new int64_t[graph_list[i].neighbor_size]; + } + for (int i = 0; i < node_count; i++) { + ind = i % gpu_count; + graph_list[ind].node_list[node_index[ind]].node_id = i; + graph_list[ind].node_list[node_index[ind]].neighbor_offset = + neighbor_offset[ind]; + graph_list[ind].node_list[node_index[ind]].neighbor_size = + neighbors[i].size(); + for (auto x : neighbors[i]) { + graph_list[ind].neighbor_list[neighbor_offset[ind]++] = x; + } + node_index[ind]++; + } + g.build_graph_from_cpu(graph_list); + /* + gpu 0: + 0,3,6,9 + gpu 1: + 1,4,7 + gpu 2: + 2,5,8 + + query(2,6) returns nodes [6,9,1,4,7,2] + */ + int64_t answer[6] = {6, 9, 1, 4, 7, 2}; + int64_t *res = new int64_t[6]; + auto query_res = g.query_node_list(0, 2, 6); + cudaMemcpy(res, query_res->val, 48, cudaMemcpyDeviceToHost); + ASSERT_EQ(query_res->actual_sample_size, 6); + for (int i = 0; i < 6; i++) { + ASSERT_EQ(res[i], answer[i]); + } + delete[] res; + delete query_res; + /* + node x's neighbor list = [(1+x)*x/2,(1+x)*x/2 + 1,.....,(1+x)*x/2 + x] + so node 6's neighbors are [21,22...,27] + node 7's neighbors are [28,29,..35] + node 0's neighbors are [0] + query([7,0,6],sample_size=3) should return [28,29,30,0,x,x,21,22,23] + 6 --index-->2 + 0 --index--->0 + 7 --index-->2 + */ + int64_t cpu_key[3] = {7, 0, 6}; + void *key; + cudaMalloc((void 
**)&key, 3 * sizeof(int64_t)); + cudaMemcpy(key, cpu_key, 3 * sizeof(int64_t), cudaMemcpyHostToDevice); + auto neighbor_sample_res = g.graph_neighbor_sample(0, (int64_t *)key, 3, 3); + res = new int64_t[9]; + cudaMemcpy(res, neighbor_sample_res->val, 72, cudaMemcpyDeviceToHost); + int64_t expected_sample_val[] = {28, 29, 30, 0, -1, -1, 21, 22, 23}; + for (int i = 0; i < 9; i++) { + if (expected_sample_val[i] != -1) { + ASSERT_EQ(res[i], expected_sample_val[i]); + } + } + delete[] res; + delete neighbor_sample_res; +} diff --git a/paddle/fluid/framework/infershape_utils.cc b/paddle/fluid/framework/infershape_utils.cc index aae36cf455dfee028b18050bdf431ee4601c479e..4bec1baeaaee94942be33a86ff2165dd98da5818 100644 --- a/paddle/fluid/framework/infershape_utils.cc +++ b/paddle/fluid/framework/infershape_utils.cc @@ -20,6 +20,7 @@ limitations under the License. */ #include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/pten_utils.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/phi/common/scalar.h" #include "paddle/phi/common/scalar_array.h" #include "paddle/phi/core/compat/arg_map_context.h" #include "paddle/phi/core/compat/convert_utils.h" @@ -376,47 +377,101 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx, attr_name)); } } + } else if (attr_defs[i].type_index == + std::type_index(typeid(phi::Scalar))) { + if (ctx->HasAttr(attr_name)) { + // TODO(chentianyu03): support other attrs later + auto& attr = attr_reader.GetAttr(attr_name); + if (std::type_index(attr.type()) == std::type_index(typeid(float))) { + infer_meta_context.EmplaceBackAttr( + phi::Scalar(BOOST_GET_CONST(float, attr))); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(std::string))) { + infer_meta_context.EmplaceBackAttr( + phi::Scalar(BOOST_GET_CONST(std::string, attr))); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(int))) { + infer_meta_context.EmplaceBackAttr( + phi::Scalar(BOOST_GET_CONST(int, attr))); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported cast op attribute `%s` to Scalar when construct " + "InferMetaContext.", + attr_name)); + } + } else if (ctx->HasInput(attr_name)) { + const auto& infershape_input = ctx->GetInputVarPtrs(attr_name); + if (infershape_input.size() == 1) { + if (ctx->IsRuntime()) { + Variable* var = BOOST_GET_CONST(Variable*, infershape_input[0]); + infer_meta_context.EmplaceBackAttr( + std::move(experimental::MakePtenScalarFromVar(*var))); + } else { + phi::Scalar tensor_scalar(-1); + tensor_scalar.SetFromTensor(true); + infer_meta_context.EmplaceBackAttr(std::move(tensor_scalar)); + } + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Invalid input.size() when cast op attribute `%s` to Scalar, " + "expected 1, but actually is %d .", + attr_name, infershape_input.size())); + } + } } else if (ctx->HasAttr(attr_name)) { // Emplace Back Attr according to the type of attr. 
auto& attr = attr_reader.GetAttr(attr_name); - if (std::type_index(attr.type()) == std::type_index(typeid(bool))) { + if (attr_defs[i].type_index == std::type_index(typeid(bool))) { infer_meta_context.EmplaceBackAttr(BOOST_GET_CONST(bool, attr)); - } else if (std::type_index(attr.type()) == std::type_index(typeid(int))) { + } else if (attr_defs[i].type_index == std::type_index(typeid(int))) { infer_meta_context.EmplaceBackAttr(BOOST_GET_CONST(int, attr)); - } else if (std::type_index(attr.type()) == - std::type_index(typeid(int64_t))) { + } else if (attr_defs[i].type_index == std::type_index(typeid(int64_t))) { infer_meta_context.EmplaceBackAttr(BOOST_GET_CONST(int64_t, attr)); - } else if (std::type_index(attr.type()) == - std::type_index(typeid(float))) { + } else if (attr_defs[i].type_index == std::type_index(typeid(float))) { infer_meta_context.EmplaceBackAttr(BOOST_GET_CONST(float, attr)); - } else if (std::type_index(attr.type()) == + } else if (attr_defs[i].type_index == std::type_index(typeid(std::string))) { infer_meta_context.EmplaceBackAttr(BOOST_GET_CONST(std::string, attr)); - } else if (std::type_index(attr.type()) == + } else if (attr_defs[i].type_index == std::type_index(typeid(std::vector))) { infer_meta_context.EmplaceBackAttr( BOOST_GET_CONST(std::vector, attr)); - } else if (std::type_index(attr.type()) == + } else if (attr_defs[i].type_index == std::type_index(typeid(std::vector))) { infer_meta_context.EmplaceBackAttr( BOOST_GET_CONST(std::vector, attr)); - } else if (std::type_index(attr.type()) == + } else if (attr_defs[i].type_index == std::type_index(typeid(std::vector))) { - infer_meta_context.EmplaceBackAttr( - BOOST_GET_CONST(std::vector, attr)); - } else if (std::type_index(attr.type()) == + if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { + // Emplace Back Attr according to the type of Phi_Kernel args. 
+ const auto& vector_int_attr = BOOST_GET_CONST(std::vector, attr); + const std::vector vector_int64_attr(vector_int_attr.begin(), + vector_int_attr.end()); + infer_meta_context.EmplaceBackAttr(vector_int64_attr); + } else { + infer_meta_context.EmplaceBackAttr( + BOOST_GET_CONST(std::vector, attr)); + } + } else if (attr_defs[i].type_index == std::type_index(typeid(std::vector))) { infer_meta_context.EmplaceBackAttr( BOOST_GET_CONST(std::vector, attr)); - } else if (std::type_index(attr.type()) == + } else if (attr_defs[i].type_index == std::type_index(typeid(std::vector))) { infer_meta_context.EmplaceBackAttr( BOOST_GET_CONST(std::vector, attr)); - } else if (std::type_index(attr.type()) == + } else if (attr_defs[i].type_index == std::type_index(typeid(std::vector))) { infer_meta_context.EmplaceBackAttr( BOOST_GET_CONST(std::vector, attr)); + } else if (attr_defs[i].type_index == + std::type_index(typeid(phi::DataType))) { + auto data_type = paddle::framework::TransToPtenDataType( + static_cast( + BOOST_GET_CONST(int, attr))); + infer_meta_context.EmplaceBackAttr(data_type); } else { PADDLE_THROW(platform::errors::Unimplemented( "Unsupported attribute type is received when call " diff --git a/paddle/fluid/framework/infershape_utils_test.cc b/paddle/fluid/framework/infershape_utils_test.cc index 592e787109d18c45eb872fb720954ed29b073ea4..53dcc19fcbae88ab5ccfcc498037327946029927 100644 --- a/paddle/fluid/framework/infershape_utils_test.cc +++ b/paddle/fluid/framework/infershape_utils_test.cc @@ -118,7 +118,7 @@ REGISTER_OPERATOR(infer_shape_utils_test, paddle::framework::InferShapeUtilsTestOpMaker, InferShapeUtilsTestInferShapeFunctor); -PT_REGISTER_KERNEL(infer_shape_utils_test, CPU, ALL_LAYOUT, +PD_REGISTER_KERNEL(infer_shape_utils_test, CPU, ALL_LAYOUT, paddle::framework::InferShapeUtilsTestKernel, int) {} TEST(InferShapeUtilsTest, ALL) { diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 0e1e572a51f7fcbc84415bab3808dfaed97dfd08..dad5358590cb1497453681ce940898314a1d06eb 100755 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -147,7 +147,7 @@ if(WITH_IPU) pass_library(ipu_runtime_replacer_pass base DIR ipu) pass_library(inference_process_pass base DIR ipu) pass_library(inference_postprocess_pass base DIR ipu) - pass_library(popart_canonicalization_pass base DIR ipu) + pass_library(popart_canonicalization_pass base DIR ipu DEPS paddle_ipu) pass_library(ipu_inplace_pass base DIR ipu) pass_library(infer_shape_pass base DIR ipu) pass_library(delete_scale_op_pass base DIR ipu) diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index 63559e201594a659f698f812086bd6e8b8608827..e4c9dc72128f4850b2e0e4af739fdd381e4a3b1e 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -2516,6 +2516,15 @@ PDNode *patterns::DuplicatedInputs::operator()() { return op; } +PDNode *patterns::DuplicatedOutputs::operator()() { + auto op = pattern->NewNode(op_repr())->assert_is_ops({"split"}); + op->assert_more([&](Node *node) { + return node->Op()->GetAttrIfExists("mkldnn_data_type") == + "bfloat16"; + }); + return op; +} + PDNode *patterns::MKLDNNInPlace::operator()() { const std::unordered_set &supported_op_types = { "abs", "gelu", "leaky_relu", "relu", "softmax", "sqrt", "swish", "tanh"}; diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h 
b/paddle/fluid/framework/ir/graph_pattern_detector.h index 79f1d63a1519018ecf3d3b18690746a35ab1dd95..d6400ed6945bf8a60c1d4f357bf58a11d5b87094 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -1495,6 +1495,15 @@ struct DuplicatedInputs : public PatternBase { PATTERN_DECL_NODE(op); }; +struct DuplicatedOutputs : public PatternBase { + DuplicatedOutputs(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "many_outputs_op") {} + + PDNode* operator()(); + + PATTERN_DECL_NODE(op); +}; + // Pattern used for enforcing inplace computation for in-place computation // supporting DNNL ops. softmax, batch_norm and layer_norm struct MKLDNNInPlace : public PatternBase { diff --git a/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.cc b/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.cc index 3d8d353cbf530ebe9cc9ea90937b9acf5ddd4a0f..9fe50deaf2d72679bc5c41038936d01cad9de498 100644 --- a/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.cc +++ b/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.cc @@ -56,7 +56,7 @@ const bool is_regularization_op(const std::string& op_namescope) { } void IpuOptimizerExtractPass::ApplyImpl(ir::Graph* graph) const { - // 这里构建的 op 符合 popart 的定义, 涉及到的一些值需要在 LowerOptimier 时获得 + // optimizer values will be extracted when lowering optimizer in ipu_backend OpDesc new_op("popart_optimizer", {}, {}, {}); new_op.SetAttr("op_role", 0); new_op.SetAttr("with_lr_sched", false); @@ -86,7 +86,7 @@ void IpuOptimizerExtractPass::ApplyImpl(ir::Graph* graph) const { bool is_regularization = is_regularization_op(op_namescope); VLOG(10) << "found optimizer releated op: " << op_type; - // initial larning_rate will be set in LowerOptimier + // initial larning_rate will be set in ipu_backend set_ops.insert(op_type); if (op_type == "sgd") { auto type = std::string{"sgd"}; diff --git a/paddle/fluid/framework/ir/ipu/popart_canonicalization_pass.cc b/paddle/fluid/framework/ir/ipu/popart_canonicalization_pass.cc index 975a4b62cc708859803a2137741caaf413e50210..6806e44f0950535b059e8e7186541ab90973e6ab 100644 --- a/paddle/fluid/framework/ir/ipu/popart_canonicalization_pass.cc +++ b/paddle/fluid/framework/ir/ipu/popart_canonicalization_pass.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/framework/ir/ipu/popart_canonicalization_pass.h" +#include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/ir/pass_tester_helper.h" #include "paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h" @@ -28,11 +29,8 @@ void PopartCanonicalizationPass::ApplyImpl(ir::Graph* graph) const { auto custom_ops = Get>("custom_ops"); std::vector missing_ops; - auto nodes = graph->Nodes(); - for (auto* node : nodes) { - if (!node->IsOp()) { - continue; - } + auto sorted_ops = TopologySortOperations(*graph); + for (auto* node : sorted_ops) { auto* op = node->Op(); auto op_type = op->Type(); diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.cc index 5f9aefc1e7a0bd372a9155a25d3102ceaf9ee1e1..f1bd34a5ad4f6241585c0b00e9ab65b042388c39 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.cc @@ -52,7 +52,7 @@ bool IsPermittedOutputName(const std::string& output_name) { } void AddQuantize(Graph* g, ir::Node* op, ir::Node* op_in, - int* quantize_counter) { + int& quantize_counter) { std::vector input_names; // Find the name of the 
input linking op to op_in @@ -87,10 +87,10 @@ void AddQuantize(Graph* g, ir::Node* op, ir::Node* op_in, IR_NODE_LINK_TO(op_in, quantize_op); IR_NODE_LINK_TO(quantize_op, quantize_out_node); IR_NODE_LINK_TO(quantize_out_node, op); - (*quantize_counter)++; + quantize_counter++; } -void AddQuantizes(Graph* g, ir::Node* op, int* quantize_counter) { +void AddQuantizes(Graph* g, ir::Node* op, int& quantize_counter) { auto inputs = op->inputs; PADDLE_ENFORCE_GE(inputs.size(), 1, platform::errors::InvalidArgument( @@ -127,7 +127,7 @@ void AddQuantizes(Graph* g, ir::Node* op, int* quantize_counter) { IR_NODE_LINK_TO(inputs[i], quantize_op); IR_NODE_LINK_TO(quantize_op, quantize_out_nodes[i]); IR_NODE_LINK_TO(quantize_out_nodes[i], op); - (*quantize_counter)++; + quantize_counter++; } op->Op()->SetInput("X", quantize_out_node_names); @@ -136,7 +136,7 @@ void AddQuantizes(Graph* g, ir::Node* op, int* quantize_counter) { // Operators like Concat and Sum have a single input name X, which actually // consists of multiple inputs. Such operators require a different way to find // pattern and add quantize ops. -void AddReoderBeforeDuplicatedInputs(ir::Graph* graph, int* quantize_counter) { +void AddReoderBeforeDuplicatedInputs(ir::Graph* graph, int& quantize_counter) { GraphPatternDetector gpd; patterns::DuplicatedInputs duplicated_inputs{gpd.mutable_pattern(), "duplicated_inputs"}; @@ -151,7 +151,7 @@ void AddReoderBeforeDuplicatedInputs(ir::Graph* graph, int* quantize_counter) { // Adding quantize ops before all operators except Concat and Sum, which have // already been handled in AddReoderBeforeDuplicatedInputs -void AddReoderBeforeSingleInputs(ir::Graph* graph, int* quantize_counter) { +void AddReoderBeforeSingleInputs(ir::Graph* graph, int& quantize_counter) { GraphPatternDetector gpd; patterns::FirstBfloat16Ops bfloat16_ops{gpd.mutable_pattern(), "first_bfloat16_ops"}; @@ -169,60 +169,134 @@ void AddReoderBeforeSingleInputs(ir::Graph* graph, int* quantize_counter) { void CPUBFloat16Pass::SetInputDataType(ir::Graph* graph) const { int quantize_counter = 0; - AddReoderBeforeDuplicatedInputs(graph, &quantize_counter); - AddReoderBeforeSingleInputs(graph, &quantize_counter); + AddReoderBeforeDuplicatedInputs(graph, quantize_counter); + AddReoderBeforeSingleInputs(graph, quantize_counter); PrettyLogDetail("--- added %d quantize ops before bfloat16 op", quantize_counter); } -void CPUBFloat16Pass::SetOutputDataType(ir::Graph* graph) const { +void AddDequantize(Graph* g, ir::Node* op, ir::Node* op_out, + int& dequantize_counter) { + if (op->Op()->Type() == "prior_box") return; + + // Find the name of the output linking op to op_out + std::vector output_names; + for (auto name : op->Op()->OutputNames()) + for (auto output_name : op->Op()->Output(name)) + if (output_name == op_out->Name() && IsPermittedOutputName(name)) + output_names.push_back(name); + + if (output_names.empty()) return; + + VarDesc dequantize_in_desc(patterns::PDNodeName("dequantize", "in")); + auto* dequantize_in_node = g->CreateVarNode(&dequantize_in_desc); + + OpDesc deq_desc; + deq_desc.SetType("dequantize"); + deq_desc.SetInput("Input", + std::vector({dequantize_in_node->Name()})); + deq_desc.SetOutput("Output", std::vector({op_out->Name()})); + deq_desc.SetAttr("Scale", 1.0f); + deq_desc.SetAttr("Shift", 0.0f); + auto dequantize_op = g->CreateOpNode(&deq_desc); // OpDesc will be copied. 
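// (Editorial sketch, not part of the original patch: once AddDequantize has
// run, the single edge
//   op --> op_out
// is rewired to
//   op --> dequantize_in (new var) --> dequantize op --> op_out
// so the bfloat16 producer keeps its name-mapped output while float32
// consumers of op_out read dequantized data.)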
+ + for (auto name = output_names.begin(); name < output_names.end(); name++) + op->Op()->SetOutput(*name, + std::vector({dequantize_in_node->Name()})); + + UnlinkNodes(op, op_out); + IR_NODE_LINK_TO(op, dequantize_in_node); + IR_NODE_LINK_TO(dequantize_in_node, dequantize_op); + IR_NODE_LINK_TO(dequantize_op, op_out); + + dequantize_counter++; +} + +void AddDequantizes(Graph* g, ir::Node* op, int& dequantize_counter) { + auto outputs = op->outputs; + PADDLE_ENFORCE_GE(outputs.size(), 1, + platform::errors::InvalidArgument( + "OP(%s)'s outputs(%d) must be equal or greater than 1.", + op->Name(), outputs.size())); + PADDLE_ENFORCE_EQ(op->inputs.size(), 1, + platform::errors::InvalidArgument( + "OP(%s)'s inputs(%d) must be equal to 1.", op->Name(), + op->inputs.size())); + + OpDesc deq_desc; + deq_desc.SetType("dequantize"); + + std::vector dequantize_in_nodes(outputs.size()); + std::vector dequantize_in_node_names(outputs.size()); + + for (size_t i = 0; i < outputs.size(); i++) { + VarDesc dequantize_in_desc(patterns::PDNodeName("dequantize", "in")); + dequantize_in_nodes[i] = g->CreateVarNode(&dequantize_in_desc); + dequantize_in_node_names[i] = dequantize_in_nodes[i]->Name(); + + deq_desc.SetInput("Input", + std::vector({dequantize_in_node_names[i]})); + deq_desc.SetOutput("Output", + std::vector({outputs[i]->Name()})); + + deq_desc.SetAttr("Scale", 1.f); + deq_desc.SetAttr("Shift", 0.0f); + deq_desc.SetAttr("bfloat16", true); + deq_desc.SetAttr("output_format", op->Op()->HasAttr("data_layout") + ? op->Op()->GetAttr("data_layout") + : std::string("NCHW")); + auto dequantize_op = g->CreateOpNode(&deq_desc); // OpDesc will be copied. + + UnlinkNodes(op, outputs[i]); + IR_NODE_LINK_TO(op, dequantize_in_nodes[i]); + IR_NODE_LINK_TO(dequantize_in_nodes[i], dequantize_op); + IR_NODE_LINK_TO(dequantize_op, outputs[i]); + + dequantize_counter++; + } + + op->Op()->SetOutput("Out", dequantize_in_node_names); +} + +// Operators like split have a single output name Out, which actually +// consists of multiple outputs. Such operators require a different way to find +// pattern and add dequantize ops. 
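// (Editorial illustration, not part of the original patch: for the test
// program built later in cpu_bfloat16_pass_tester.cc, a bfloat16 split with
// Out = {"c", "d"} gets one dequantize op per element of Out from
// AddDequantizes:
//   split --> deq_in_0 --> dequantize --> c
//   split --> deq_in_1 --> dequantize --> d
// while operators with a single real output are handled by AddDequantize.)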
+void AddReoderAfterDuplicatedOutputs(ir::Graph* graph, + int& dequantize_counter) { + GraphPatternDetector gpd; + patterns::DuplicatedOutputs duplicated_outputs{gpd.mutable_pattern(), + "duplicated_outputs"}; + duplicated_outputs(); + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + GET_IR_NODE_FROM_SUBGRAPH(op, op, duplicated_outputs); + AddDequantizes(g, op, dequantize_counter); + }; + gpd(graph, handler); +} + +// Adding dequantize ops after all operators except split, which has +// already been handled in AddReoderAfterDuplicatedOutputs +void AddReoderAfterSingleOutputs(ir::Graph* graph, int& dequantize_counter) { GraphPatternDetector gpd; patterns::LastBfloat16Ops bfloat16_ops{gpd.mutable_pattern(), "last_bfloat16_ops"}; bfloat16_ops(); - int dequantize_counter = 0; - auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* g) { - GET_IR_NODE_FROM_SUBGRAPH(op, op, bfloat16_ops); GET_IR_NODE_FROM_SUBGRAPH(op_out, op_out, bfloat16_ops); - - if (op->Op()->Type() != "prior_box") { - // Find the name of the output linking op to op_out - std::vector output_names; - for (auto name : op->Op()->OutputNames()) - for (auto output_name : op->Op()->Output(name)) - if (output_name == op_out->Name() && IsPermittedOutputName(name)) - output_names.push_back(name); - - if (output_names.empty()) return; - - VarDesc dequantize_in_desc(patterns::PDNodeName("dequantize", "in")); - auto* dequantize_in_node = g->CreateVarNode(&dequantize_in_desc); - - OpDesc deq_desc; - deq_desc.SetType("dequantize"); - deq_desc.SetInput("Input", - std::vector({dequantize_in_node->Name()})); - deq_desc.SetOutput("Output", std::vector({op_out->Name()})); - deq_desc.SetAttr("Scale", 1.0f); - deq_desc.SetAttr("Shift", 0.0f); - auto dequantize_op = - g->CreateOpNode(&deq_desc); // OpDesc will be copied. 
- - for (auto name = output_names.begin(); name < output_names.end(); name++) - op->Op()->SetOutput( - *name, std::vector({dequantize_in_node->Name()})); - - UnlinkNodes(op, op_out); - IR_NODE_LINK_TO(op, dequantize_in_node); - IR_NODE_LINK_TO(dequantize_in_node, dequantize_op); - IR_NODE_LINK_TO(dequantize_op, op_out); - - dequantize_counter++; + GET_IR_NODE_FROM_SUBGRAPH(op, op, bfloat16_ops); + if (op->Op()->Type() != "split") { + AddDequantize(g, op, op_out, dequantize_counter); } }; gpd(graph, handler); +} + +void CPUBFloat16Pass::SetOutputDataType(ir::Graph* graph) const { + int dequantize_counter = 0; + AddReoderAfterDuplicatedOutputs(graph, dequantize_counter); + AddReoderAfterSingleOutputs(graph, dequantize_counter); PrettyLogDetail("--- added %d dequantize ops after bfloat16 op", dequantize_counter); } diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass_tester.cc index f620b4c94fe8906ac957aff041137d73832315da..877ee71fc2d85dd6ac7bcf4c2e41cc92e3e2ef2d 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass_tester.cc @@ -45,7 +45,7 @@ void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name, op->SetInput("Input", {inputs[0]}); op->SetOutput("Out", {outputs[0]}); op->SetAttr("mkldnn_data_type", mkldnn_data_type); - } else if (type == "concat" || type == "sum") { + } else if (type == "concat" || type == "sum" || type == "split") { op->SetInput("X", inputs); op->SetOutput("Out", outputs); op->SetAttr("mkldnn_data_type", mkldnn_data_type); @@ -117,6 +117,7 @@ TEST(CpuBfloat16Pass, convolution) { bool use_mkldnn = true; int quant_op = 3; int dequant_op = 3; + // each added op consists of 2 nodes int added_nodes = quant_op * 2 + dequant_op * 2; MainTest(BuildProgramDescConv(use_mkldnn), quant_op, dequant_op, added_nodes); } @@ -140,6 +141,7 @@ TEST(CpuBfloat16Pass, double_input_ops) { bool use_mkldnn = true; int quant_op = 4; int dequant_op = 3; + // each added op consists of 2 nodes int added_nodes = quant_op * 2 + dequant_op * 2; MainTest(BuildProgramDescDoubleInput(use_mkldnn), quant_op, dequant_op, added_nodes); @@ -164,11 +166,35 @@ TEST(CpuBfloat16Pass, duplicated_input_ops) { bool use_mkldnn = true; int quant_op = 5; int dequant_op = 3; + // each added op consists of 2 nodes int added_nodes = quant_op * 2 + dequant_op * 2; MainTest(BuildProgramDescDuplicatedInput(use_mkldnn), quant_op, dequant_op, added_nodes); } +ProgramDesc BuildProgramDescDuplicatedOutput(bool use_mkldnn) { + ProgramDesc prog; + for (auto& v : variable_names) { + prog.MutableBlock(0)->Var(v); + } + SetOp(&prog, "dropout", "Dropout", {"a"}, {"b"}, use_mkldnn, "float32"); + SetOp(&prog, "split", "Split", {"b"}, {"c", "d"}, use_mkldnn, "bfloat16"); + SetOp(&prog, "transpose2", "Transpose", {"c"}, {"e"}, use_mkldnn, "float32"); + SetOp(&prog, "reshape2", "Reshape", {"d"}, {"f"}, use_mkldnn, "bfloat16"); + + return prog; +} + +TEST(CpuBfloat16Pass, duplicated_output_ops) { + bool use_mkldnn = true; + int quant_op = 2; + int dequant_op = 3; + // each added op consists of 2 nodes + int added_nodes = quant_op * 2 + dequant_op * 2; + MainTest(BuildProgramDescDuplicatedOutput(use_mkldnn), quant_op, dequant_op, + added_nodes); +} + ProgramDesc BuildProgramDescDoubleOutputs(bool use_mkldnn) { ProgramDesc prog; for (auto& v : variable_names) { @@ -190,6 +216,7 @@ TEST(CpuBfloat16Pass, double_outputs_ops) { bool use_mkldnn = true; int quant_op = 3; int 
dequant_op = 3; + // each added op consists of 2 nodes int added_nodes = quant_op * 2 + dequant_op * 2; MainTest(BuildProgramDescDoubleOutputs(use_mkldnn), quant_op, dequant_op, added_nodes); diff --git a/paddle/fluid/framework/ir/preln_embedding_eltwise_layernorm_fuse_pass.cc b/paddle/fluid/framework/ir/preln_embedding_eltwise_layernorm_fuse_pass.cc index ca42a613411ba6078b00522d2c178856993fa462..d6761d2e82ef300264d9f2bd35b6441de2e00a67 100644 --- a/paddle/fluid/framework/ir/preln_embedding_eltwise_layernorm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/preln_embedding_eltwise_layernorm_fuse_pass.cc @@ -428,6 +428,19 @@ PrelnEmbeddingEltwiseLayerNormFusePass:: void PrelnEmbeddingEltwiseLayerNormFusePass::ApplyImpl(Graph* graph) const { FusePassBase::Init(name_scope_, graph); + + bool enable_int8 = Get("enable_int8"); + bool use_oss = Get("use_oss"); + bool with_interleaved = Get("with_interleaved"); + bool with_dynamic_shape = Get("with_dynamic_shape"); + if (!(enable_int8 && use_oss && with_interleaved && with_dynamic_shape)) { + VLOG(4) << "preln_embedding_eltwise_layernorm_fuse_pass need: use_trt, " + "enable_int8, " + "use_oss, with_interleaved, with_dynamic_shape. Stop this pass, " + "please reconfig."; + return; + } + int fusion_count = PrelnEmbeddingEltwiseLayerNormFusePass::BuildFusion(graph, name_scope_); if (fusion_count > 0) { diff --git a/paddle/fluid/framework/ir/preln_skip_layernorm_fuse_pass.cc b/paddle/fluid/framework/ir/preln_skip_layernorm_fuse_pass.cc index 1b7b82cbca9e86587467fa0888eca6c6fdc2e162..978360d8f0a95b545b1460620d81eec8642977c2 100644 --- a/paddle/fluid/framework/ir/preln_skip_layernorm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/preln_skip_layernorm_fuse_pass.cc @@ -39,7 +39,6 @@ struct PrelnSkipLayerNorm : public PatternBase { void operator()(PDNode *x, PDNode *y); // declare operator node's name - PATTERN_DECL_NODE(fused_skipe_layernorm); PATTERN_DECL_NODE(elementwise); PATTERN_DECL_NODE(layer_norm); // declare variable node's name @@ -62,8 +61,13 @@ void PrelnSkipLayerNorm::operator()(PDNode *x, PDNode *y) { auto *elementwise_out_var = pattern->NewNode(elementwise_out_repr()) ->assert_is_op_output("elementwise_add") ->assert_is_op_input("layer_norm", "X") - ->assert_is_op_input("elementwise_add", "Y"); - + ->assert_more([](Node *x) { + if (x->outputs.size() == 2) { + return true; + } else { + return false; + } + }); // Add links for elementwise_add op. elementwise->LinksFrom({x, y}).LinksTo({elementwise_out_var}); @@ -104,6 +108,18 @@ void PrelnSkipLayerNormFusePass::ApplyImpl(ir::Graph *graph) const { PADDLE_ENFORCE_NOT_NULL( graph, platform::errors::PreconditionNotMet("graph should not be null.")); FusePassBase::Init("preln_skip_layernorm_fuse", graph); + bool enable_int8 = Get("enable_int8"); + bool use_oss = Get("use_oss"); + bool with_interleaved = Get("with_interleaved"); + bool with_dynamic_shape = Get("with_dynamic_shape"); + if (!(enable_int8 && use_oss && with_interleaved && with_dynamic_shape)) { + VLOG(4) << "preln_skip_layernorm_fuse_pass need: use_trt, enable_int8, " + "use_oss, " + "with_interleaved, with_dynamic_shape. Stop this pass, please " + "reconfig. 
"; + return; + } + int found_subgraph_count = 0; GraphPatternDetector gpd; diff --git a/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc b/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc index db194d59d37bafc78cc7da50a664a6788a657a88..bfa14d9296b26e08f56e8ab2f30542524b786cf9 100644 --- a/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc @@ -39,7 +39,6 @@ struct SkipLayerNorm : public PatternBase { PDNode *operator()(PDNode *x, PDNode *y); // declare operator node's name - PATTERN_DECL_NODE(fused_skipe_layernorm); PATTERN_DECL_NODE(elementwise); PATTERN_DECL_NODE(layer_norm); // declare variable node's name @@ -59,9 +58,10 @@ PDNode *SkipLayerNorm::operator()(PDNode *x, PDNode *y) { y->assert_is_op_input("elementwise_add", "Y"); auto *elementwise = pattern->NewNode(elementwise_repr())->assert_is_op("elementwise_add"); - auto *elementwise_out_var = pattern->NewNode(elementwise_out_repr()) - ->AsOutput() - ->assert_is_op_output("elementwise_add"); + auto *elementwise_out_var = + pattern->NewNode(elementwise_out_repr()) + ->AsOutput() + ->assert_is_only_output_of_op("elementwise_add"); // Add links for elementwise_add op. elementwise->LinksFrom({x, y}).LinksTo({elementwise_out_var}); diff --git a/paddle/fluid/framework/lod_tensor_test.cu b/paddle/fluid/framework/lod_tensor_test.cu index ddda7231887edfc78fa7b1b6adc5cd8324e5b894..006485a698fb3dc93188cd46450ea108e709ff6d 100644 --- a/paddle/fluid/framework/lod_tensor_test.cu +++ b/paddle/fluid/framework/lod_tensor_test.cu @@ -31,15 +31,17 @@ TEST(LoD, data) { lod.push_back(std::vector({0, 1, 6, 8, 10, 11})); auto& v = lod[0]; + paddle::framework::MixVector mix_vector_v(&v); paddle::platform::CUDAPlace gpu(0); #ifdef PADDLE_WITH_HIP - hipLaunchKernelGGL(test, dim3(1), dim3(1), 0, 0, v.CUDAMutableData(gpu), - v.size()); + hipLaunchKernelGGL(test, dim3(1), dim3(1), 0, 0, + mix_vector_v.CUDAMutableData(gpu), v.size()); hipDeviceSynchronize(); #else - test<<<1, 1>>>(v.CUDAMutableData(gpu), v.size()); + test<<<1, 1>>>(mix_vector_v.CUDAMutableData(gpu), v.size()); cudaDeviceSynchronize(); #endif + mix_vector_v.CopyToCPU(); for (size_t i = 0; i < v.size(); ++i) { EXPECT_EQ(v[i], i * 2); } @@ -62,15 +64,17 @@ TEST(LoDTensor, LoDInGPU) { EXPECT_EQ(lod_tensor.lod_element(0, 4).first, 8UL); auto lod = lod_tensor.lod(); + paddle::framework::MixVector mix_vector(&(lod[0])); #ifdef PADDLE_WITH_HIP hipLaunchKernelGGL(test, dim3(1), dim3(8), 0, 0, - lod[0].CUDAMutableData(place), lod[0].size()); + mix_vector.CUDAMutableData(place), lod[0].size()); hipDeviceSynchronize(); #else - test<<<1, 8>>>(lod[0].CUDAMutableData(place), lod[0].size()); + test<<<1, 8>>>(mix_vector.CUDAMutableData(place), lod[0].size()); cudaDeviceSynchronize(); #endif + mix_vector.CopyToCPU(); for (size_t i = 0; i < src_lod[0].size(); ++i) { EXPECT_EQ(lod[0].data()[i], src_lod[0].data()[i] * 2); diff --git a/paddle/fluid/framework/mixed_vector.cc b/paddle/fluid/framework/mixed_vector.cc index b15a66c51c4b6365cb4285894efb1e37a03b7b64..67b2d70f3440c5254abb5ff67995e6758af5c8f1 100644 --- a/paddle/fluid/framework/mixed_vector.cc +++ b/paddle/fluid/framework/mixed_vector.cc @@ -64,19 +64,20 @@ void CopyCPUDataToCUDAHelper(std::vector *cpu_, auto stream = dev_ctx->stream(); paddle::memory::Copy(OptionalCUDAPlace(*gpu_).get(), dst, platform::CPUPlace(), src, *gpu_memory_size_, stream); + dev_ctx->Wait(); #endif } -#define INSTANTIATE_VECTOR_FOR_TYPE(__TYPE__) \ - template <> \ - void Vector<__TYPE__>::VectorData::CopyToCPU() 
const { \ - CopyToCPUHelper<__TYPE__>(&cpu_, &gpu_, &gpu_memory_size_); \ - } \ - \ - template <> \ - void Vector<__TYPE__>::VectorData::CopyCPUDataToCUDA( \ - const platform::Place &place) const { \ - CopyCPUDataToCUDAHelper<__TYPE__>(&cpu_, &gpu_, &gpu_memory_size_, place); \ +#define INSTANTIATE_VECTOR_FOR_TYPE(__TYPE__) \ + template <> \ + void MixVector<__TYPE__>::VectorData::CopyToCPU() const { \ + CopyToCPUHelper<__TYPE__>(cpu_, &gpu_, &gpu_memory_size_); \ + } \ + \ + template <> \ + void MixVector<__TYPE__>::VectorData::CopyCPUDataToCUDA( \ + const platform::Place &place) const { \ + CopyCPUDataToCUDAHelper<__TYPE__>(cpu_, &gpu_, &gpu_memory_size_, place); \ } INSTANTIATE_VECTOR_FOR_TYPE(size_t) diff --git a/paddle/fluid/framework/mixed_vector.h b/paddle/fluid/framework/mixed_vector.h index 0fd67efc177b3d6bd83b1c9d8325d0de81c0d2e5..a589a5b4ea7e15fc24f443e8062635b1e337adfe 100644 --- a/paddle/fluid/framework/mixed_vector.h +++ b/paddle/fluid/framework/mixed_vector.h @@ -22,7 +22,6 @@ limitations under the License. */ #include #include "glog/logging.h" -#include "paddle/fluid/framework/details/cow_ptr.h" #include "paddle/fluid/memory/allocation/allocator.h" #include "paddle/utils/none.h" #include "paddle/utils/optional.h" @@ -30,6 +29,9 @@ limitations under the License. */ namespace paddle { namespace framework { +template +using Vector = std::vector; + inline paddle::optional OptionalCUDAPlace( const paddle::memory::allocation::AllocationPtr &gpu_) { return gpu_ == nullptr ? paddle::none @@ -39,7 +41,7 @@ inline paddle::optional OptionalCUDAPlace( // Vector implements the std::vector interface, and can get Data or // MutableData from any place. The data will be synced implicitly inside. template -class Vector { +class MixVector { public: using value_type = T; using iterator = typename std::vector::iterator; @@ -49,82 +51,68 @@ class Vector { // The actual class to implement vector logic class VectorData { public: - VectorData() : flag_(kDataInCPU) {} - VectorData(size_t count, const T &value) - : cpu_(count, value), flag_(kDataInCPU) {} - VectorData(std::initializer_list init) : cpu_(init), flag_(kDataInCPU) {} template - explicit VectorData(const std::vector &dat) - : cpu_(dat), flag_(kDataInCPU) {} + explicit VectorData(std::vector *dat) : cpu_(dat), flag_(kDataInCPU) {} ~VectorData() {} - VectorData(const VectorData &o) { - o.ImmutableCPU(); - cpu_ = o.cpu_; - flag_ = kDataInCPU; - } + VectorData(const VectorData &o) = delete; - VectorData &operator=(const VectorData &o) { - o.ImmutableCPU(); - cpu_ = o.cpu_; - flag_ = kDataInCPU; - return *this; - } + VectorData &operator=(const VectorData &o) = delete; T &operator[](size_t i) { MutableCPU(); - return cpu_[i]; + return (*cpu_)[i]; } const T &operator[](size_t i) const { ImmutableCPU(); - return cpu_[i]; + return (*cpu_)[i]; } - size_t size() const { return cpu_.size(); } + size_t size() const { return (*cpu_).size(); } iterator begin() { MutableCPU(); - return cpu_.begin(); + return (*cpu_).begin(); } iterator end() { MutableCPU(); - return cpu_.end(); + return (*cpu_).end(); } T &front() { MutableCPU(); - return cpu_.front(); + return (*cpu_).front(); } T &back() { MutableCPU(); - return cpu_.back(); + return (*cpu_).back(); } const_iterator begin() const { ImmutableCPU(); - return cpu_.begin(); + return (*cpu_).begin(); } const_iterator end() const { ImmutableCPU(); - return cpu_.end(); + return (*cpu_).end(); } const T &back() const { ImmutableCPU(); - return cpu_.back(); + return (*cpu_).back(); } - T *data() { return 
&(*this)[0]; } + T *data() { return cpu_->data(); } - const T *data() const { return &(*this)[0]; } + const T *data() const { return cpu_->data(); } const T &front() const { ImmutableCPU(); - return cpu_.front(); + return (*cpu_).front(); } // assign this from iterator. @@ -132,14 +120,14 @@ class Vector { template void assign(Iter begin, Iter end) { MutableCPU(); - cpu_.assign(begin, end); + (*cpu_).assign(begin, end); } // push_back. If the previous capacity is not enough, the memory will // double. void push_back(T elem) { MutableCPU(); - cpu_.push_back(elem); + (*cpu_).push_back(elem); } // extend a vector by iterator. @@ -147,14 +135,14 @@ class Vector { template void Extend(It begin, It end) { MutableCPU(); - auto out_it = std::back_inserter>(this->cpu_); + auto out_it = std::back_inserter>(*(this->cpu_)); std::copy(begin, end, out_it); } // resize the vector void resize(size_t size) { MutableCPU(); - cpu_.resize(size); + (*cpu_).resize(size); } // get cuda ptr. immutable @@ -176,26 +164,16 @@ class Vector { // clear void clear() { - cpu_.clear(); + (*cpu_).clear(); flag_ = kDirty | kDataInCPU; } - size_t capacity() const { return cpu_.capacity(); } - - // reserve data - void reserve(size_t size) const { cpu_.reserve(size); } + std::vector *get_vector() { return cpu_; } - // implicit cast operator. Vector can be cast to std::vector implicitly. - operator std::vector() const { - ImmutableCPU(); - return cpu_; - } + size_t capacity() const { return (*cpu_).capacity(); } - bool operator==(const VectorData &other) const { - ImmutableCPU(); - other.ImmutableCPU(); - return cpu_ == other.cpu_; - } + // reserve data + void reserve(size_t size) const { (*cpu_).reserve(size); } std::mutex &Mutex() const { return mtx_; } @@ -203,6 +181,13 @@ class Vector { return OptionalCUDAPlace(gpu_); } + void MutableCPU() { + if (IsInCUDA() && IsDirty()) { + CopyToCPU(); + } + flag_ = kDirty | kDataInCPU; + } + private: enum DataFlag { kDataInCPU = 0x01, @@ -213,13 +198,6 @@ class Vector { void CopyToCPU() const; - void MutableCPU() { - if (IsInCUDA() && IsDirty()) { - CopyToCPU(); - } - flag_ = kDirty | kDataInCPU; - } - void ImmutableCUDA(platform::Place place) const { if (IsDirty()) { if (IsInCPU()) { @@ -269,7 +247,7 @@ class Vector { bool IsInCPU() const { return flag_ & kDataInCPU; } - mutable std::vector cpu_; + std::vector *cpu_; mutable paddle::memory::allocation::AllocationPtr gpu_; mutable size_t gpu_memory_size_{0}; mutable int flag_; @@ -278,89 +256,77 @@ class Vector { }; public: - // Default ctor. Create empty Vector - Vector() : m_(new VectorData()) {} - - // Fill vector with value. The vector size is `count`. - explicit Vector(size_t count, const T &value = T()) - : m_(new VectorData(count, value)) {} - - // Ctor with init_list - Vector(std::initializer_list init) : m_(new VectorData(init)) {} - // implicit cast from std::vector. template - Vector(const std::vector &dat) : m_(new VectorData(dat)) { // NOLINT + MixVector(const std::vector *dat) { // NOLINT + m_.reset(new VectorData(const_cast *>(dat))); } // Copy ctor - Vector(const Vector &other) { m_ = other.m_; } + MixVector(const MixVector &other) = delete; // Copy operator - Vector &operator=(const Vector &other) { - m_ = other.m_; - return *this; - } + MixVector &operator=(const MixVector &other) = delete; // Move ctor - Vector(Vector &&other) { m_ = std::move(other.m_); } + MixVector(MixVector &&other) = delete; // CPU data access method. Mutable. 
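// (Editorial note, not part of the original patch: MixVector no longer owns
// its CPU storage; it is a non-copyable view over a caller-owned
// std::vector<T> plus a lazily allocated CUDA mirror. A minimal usage sketch,
// mirroring the updated lod_tensor_test.cu above:
//   std::vector<size_t> lod = {0, 1, 6, 8, 10, 11};
//   paddle::framework::MixVector<size_t> mix(&lod);   // wrap, no copy
//   auto* gpu_ptr = mix.CUDAMutableData(gpu_place);   // syncs CPU -> GPU
//   /* launch a kernel that writes through gpu_ptr */
//   mix.CopyToCPU();                                   // syncs GPU -> CPU
// Here gpu_place is assumed to be a platform::CUDAPlace.)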
- T &operator[](size_t i) { return (*m_.MutableData())[i]; } + T &operator[](size_t i) { return (*m_)[i]; } // CPU data access method. Immutable. - const T &operator[](size_t i) const { return m_.Data()[i]; } + const T &operator[](size_t i) const { return (*m_)[i]; } // std::vector iterator methods. Based on CPU data access method - size_t size() const { return m_.Data().size(); } + size_t size() const { return m_->size(); } - iterator begin() { return m_.MutableData()->begin(); } + iterator begin() { return m_->begin(); } - iterator end() { return m_.MutableData()->end(); } + iterator end() { return m_->end(); } - T &front() { return m_.MutableData()->front(); } + T &front() { return m_->front(); } - T &back() { return m_.MutableData()->back(); } + T &back() { return m_->back(); } - const_iterator begin() const { return m_.Data().begin(); } + const_iterator begin() const { return m_->begin(); } - const_iterator end() const { return m_.Data().end(); } + const_iterator end() const { return m_->end(); } const_iterator cbegin() const { return begin(); } const_iterator cend() const { return end(); } - const T &back() const { return m_.Data().back(); } + const T &back() const { return m_->back(); } - T *data() { return m_.MutableData()->data(); } + T *data() { return m_->data(); } - const T *data() const { return m_.Data().data(); } + const T *data() const { return m_->data(); } - const T &front() const { return m_.Data().front(); } + const T &front() const { return m_->front(); } // end of std::vector iterator methods // assign this from iterator. // NOTE: the iterator must support `end-begin` template void assign(Iter begin, Iter end) { - m_.MutableData()->assign(begin, end); + m_->assign(begin, end); } // push_back. If the previous capacity is not enough, the memory will // double. - void push_back(T elem) { m_.MutableData()->push_back(elem); } + void push_back(T elem) { m_->push_back(elem); } // extend a vector by iterator. // NOTE: the iterator must support end-begin template void Extend(It begin, It end) { - m_.MutableData()->Extend(begin, end); + m_->Extend(begin, end); } // resize the vector void resize(size_t size) { - if (m_.Data().size() != size) { - m_.MutableData()->resize(size); + if (m_->size() != size) { + m_->resize(size); } } @@ -368,15 +334,15 @@ class Vector { const T *CUDAData(platform::Place place) const { { platform::CUDAPlace p(place.GetDeviceId()); - auto &mtx = m_.Data().Mutex(); + auto &mtx = m_->Mutex(); std::lock_guard guard(mtx); - auto cuda_place = m_.Data().CUDAPlace(); + auto cuda_place = m_->CUDAPlace(); if (cuda_place == paddle::none || cuda_place == p) { - return m_.Data().CUDAData(place); + return m_->CUDAData(place); } } - // If m_ contains CUDAData in a different place. Detach manually. - m_.Detach(); + m_->MutableCPU(); + m_.reset(new VectorData(m_->get_vector())); return CUDAData(place); } @@ -384,25 +350,25 @@ class Vector { T *CUDAMutableData(platform::Place place) { { platform::CUDAPlace p(place.GetDeviceId()); - auto &mtx = m_.Data().Mutex(); + auto &mtx = m_->Mutex(); std::lock_guard guard(mtx); - auto cuda_place = m_.Data().CUDAPlace(); + auto cuda_place = m_->CUDAPlace(); if (cuda_place == paddle::none || cuda_place == p) { - return m_.MutableData()->CUDAMutableData(place); + return m_->CUDAMutableData(place); } } - // If m_ contains CUDAData in a different place. Detach manually. 
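// (Editorial note, not part of the original patch: with the COW pointer
// removed there is no Detach() to fall back on. When the cached CUDA buffer
// lives on a different device than the one requested, the data is first
// synced back into the caller's std::vector via MutableCPU(), the VectorData
// is rebuilt around that same vector, and the call recurses so a fresh
// buffer is allocated on the requested place.)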
- m_.Detach(); + m_->MutableCPU(); + m_.reset(new VectorData(m_->get_vector())); return CUDAMutableData(place); } // clear - void clear() { m_.MutableData()->clear(); } + void clear() { m_->clear(); } - size_t capacity() const { return m_.Data().capacity(); } + size_t capacity() const { return m_->capacity(); } // reserve data - void reserve(size_t size) { m_.Data().reserve(size); } + void reserve(size_t size) { m_->reserve(size); } // the unify method to access CPU or CUDA data. immutable. const T *Data(platform::Place place) const { @@ -422,26 +388,12 @@ class Vector { } } - // implicit cast operator. Vector can be cast to std::vector implicitly. - operator std::vector() const { return m_.Data(); } - - bool operator==(const Vector &other) const { - if (size() != other.size()) return false; - auto it1 = cbegin(); - auto it2 = other.cbegin(); - for (; it1 < cend(); ++it1, ++it2) { - if (*it1 != *it2) { - return false; - } - } - return true; - } + void CopyToCPU() { m_->MutableCPU(); } - const void *Handle() const { return &m_.Data(); } + const void *Handle() const { return m_.get(); } private: - // Vector is an COW object. - mutable details::COWPtr m_; + mutable std::unique_ptr m_; }; }; // namespace framework diff --git a/paddle/fluid/framework/mixed_vector_test.cu b/paddle/fluid/framework/mixed_vector_test.cu index 011e2729d4adffd49c65f536f2ebb33d9a949e56..4cd9aab2896b6fc5940af38cde35945d007aec64 100644 --- a/paddle/fluid/framework/mixed_vector_test.cu +++ b/paddle/fluid/framework/mixed_vector_test.cu @@ -28,7 +28,7 @@ #include "paddle/fluid/platform/device_context.h" template -using vec = paddle::framework::Vector; +using vec = paddle::framework::MixVector; using gpuStream_t = paddle::gpuStream_t; static __global__ void multiply_10(int* ptr) { @@ -44,10 +44,11 @@ gpuStream_t GetCUDAStream(paddle::platform::CUDAPlace place) { } TEST(mixed_vector, GPU_VECTOR) { - vec tmp; + std::vector x; for (int i = 0; i < 10; ++i) { - tmp.push_back(i); + x.push_back(i); } + vec tmp(&x); ASSERT_EQ(tmp.size(), 10UL); paddle::platform::CUDAPlace gpu(0); @@ -70,10 +71,11 @@ TEST(mixed_vector, MultiGPU) { return; } - vec tmp; + std::vector x; for (int i = 0; i < 10; ++i) { - tmp.push_back(i); + x.push_back(i); } + vec tmp(&x); ASSERT_EQ(tmp.size(), 10UL); paddle::platform::CUDAPlace gpu0(0); paddle::platform::SetDeviceId(0); diff --git a/paddle/fluid/framework/op_kernel_info_helper.h b/paddle/fluid/framework/op_kernel_info_helper.h deleted file mode 100644 index d62711bb882750b93bdd33a5e7d9d1ab44c20c95..0000000000000000000000000000000000000000 --- a/paddle/fluid/framework/op_kernel_info_helper.h +++ /dev/null @@ -1,71 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include "paddle/phi/api/ext/op_kernel_info.h" -#include "paddle/phi/core/kernel_factory.h" - -namespace paddle { -namespace framework { - -class OpKernelInfoHelper { - public: - static const std::string& GetOpName(const paddle::OpKernelInfo& info) { - return info.op_name_; - } - - static const phi::Backend& GetBackend(const paddle::OpKernelInfo& info) { - return info.backend_; - } - - static const phi::DataLayout& GetDataLayout( - const paddle::OpKernelInfo& info) { - return info.layout_; - } - - static const phi::DataType& GetDataType(const paddle::OpKernelInfo& info) { - return info.dtype_; - } - - static phi::KernelKey GetKernelKey(const paddle::OpKernelInfo& info) { - return phi::KernelKey(info.backend_, info.layout_, info.dtype_); - } - - static const CustomKernelFunc& GetKernelFn(const paddle::OpKernelInfo& info) { - return info.kernel_fn_; - } - - static void* GetVariadicKernelFn(const paddle::OpKernelInfo& info) { - return info.variadic_kernel_fn_; - } - - static const paddle::SmallVector& GetInputDefs( - const paddle::OpKernelInfo& info) { - return info.input_defs_; - } - - static const paddle::SmallVector& GetOutputDefs( - const paddle::OpKernelInfo& info) { - return info.output_defs_; - } - - static const paddle::SmallVector& GetAttributeDefs( - const paddle::OpKernelInfo& info) { - return info.attribute_defs_; - } -}; - -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 8e614faa248faccd1385ea29e0cd0950f08c481d..692ebf6f332f15be552a223cab89eabbf5c4a69b 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -1211,7 +1211,17 @@ void OperatorWithKernel::RunImpl(const Scope& scope, << "` not found."; } } - if (pt_kernel_->IsValid()) { +#ifdef PADDLE_WITH_XPU + bool is_xpu_unsupport = + paddle::platform::is_xpu_place(kernel_type_->place_) && + !paddle::platform::is_xpu_support_op(type_, *kernel_type_.get()) || + paddle::platform::is_in_xpu_black_list(type_); +#endif + if (pt_kernel_->IsValid() +#ifdef PADDLE_WITH_XPU + && !is_xpu_unsupport +#endif + ) { run_pten_kernel_ = true; } else { auto& all_op_kernels = AllOpKernels(); @@ -1220,13 +1230,9 @@ void OperatorWithKernel::RunImpl(const Scope& scope, kernels_iter->second.find(*kernel_type_.get()) == kernels_iter->second.end() #ifdef PADDLE_WITH_XPU - || - paddle::platform::is_xpu_place(kernel_type_->place_) && // NOLINT - !paddle::platform::is_xpu_support_op( - type_, *kernel_type_.get()) // NOLINT - || paddle::platform::is_in_xpu_black_list(type_) + || is_xpu_unsupport #endif - ) { + ) { auto pt_cpu_kernel_key = FallBackToCpu(*kernel_type_.get(), pt_kernel_key, *this); pt_kernel_.reset( @@ -1972,6 +1978,9 @@ Scope* OperatorWithKernel::PreparePtenData( continue; } + if (in_def.backend == phi::Backend::ALL_BACKEND) { + continue; + } auto expected_place = phi::TransToPtenPlace(in_def.backend); if (platform::is_same_place(tensor_in->place(), expected_place)) { continue; @@ -2037,7 +2046,7 @@ void OperatorWithKernel::BuildPtenKernelContext( (i == 0 ? 
0 : pt_kernel_context->InputRangeAt(i - 1).second); // deal with optional here - if ((it == ctx.inputs.end()) && + if ((it == ctx.inputs.end() || it->second.size() == 0) && (input_defs[i].type_index == std::type_index(typeid(paddle::optional)))) { pt_kernel_context->EmplaceBackInputWithoutSetRange(nullptr); diff --git a/paddle/fluid/framework/paddle2cinn/build_cinn_pass.cc b/paddle/fluid/framework/paddle2cinn/build_cinn_pass.cc index 3516e71b837917cae2d60193ec5e3798c9d1a211..d55950064a4a2363222929ea8d4f863575dcd6da 100644 --- a/paddle/fluid/framework/paddle2cinn/build_cinn_pass.cc +++ b/paddle/fluid/framework/paddle2cinn/build_cinn_pass.cc @@ -375,7 +375,7 @@ std::unique_ptr CreateNewSubGraph(const GraphNodeSet& cluster, const std::unordered_set& ignore_names) { auto result = std::make_unique>(); for (auto* node : nodes) { - if (ignore_names.count(node->Name())) { + if (!node->Var() || ignore_names.count(node->Name())) { continue; } result->emplace_back(node->Name()); diff --git a/paddle/fluid/framework/ps_gpu_trainer.cc b/paddle/fluid/framework/ps_gpu_trainer.cc index 4d34ba85517e163f966b49d118e5fdce50865419..e0cf860e5bc7b94872e612112a4d5977571db489 100644 --- a/paddle/fluid/framework/ps_gpu_trainer.cc +++ b/paddle/fluid/framework/ps_gpu_trainer.cc @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include #include #include #include @@ -20,6 +21,7 @@ limitations under the License. */ #include "paddle/fluid/framework/data_feed_factory.h" #include "paddle/fluid/framework/data_set.h" #include "paddle/fluid/framework/device_worker_factory.h" +#include "paddle/fluid/framework/fleet/ps_gpu_wrapper.h" #include "paddle/fluid/framework/trainer.h" #if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \ (defined PADDLE_WITH_PSLIB) @@ -44,6 +46,7 @@ void PSGPUTrainer::Initialize(const TrainerDesc& trainer_desc, dense_grad_names_[table_id][j] = table.dense_grad_name(j); } } + InitializeGPUServer(trainer_desc); scale_datanorm_ = trainer_desc.scale_datanorm(); int place_num = trainer_desc.worker_places_size(); const std::vector readers = @@ -84,6 +87,166 @@ void PSGPUTrainer::Initialize(const TrainerDesc& trainer_desc, return; } +void PSGPUTrainer::InitializeGPUServer(const TrainerDesc& trainer_desc) { + // add for hbmps optimizer config + auto fleet_desc_str = trainer_desc.fleet_desc(); + google::protobuf::TextFormat::ParseFromString(fleet_desc_str, &_ps_param); + auto sparse_table = + _ps_param.server_param().downpour_server_param().downpour_table_param(0); + auto sparse_table_accessor = sparse_table.accessor(); + auto sparse_table_accessor_parameter = + sparse_table_accessor.downpour_accessor_param(); + auto accessor_class = sparse_table_accessor.accessor_class(); + // gpups' sparse table optimizer config + // now only support single sparse table + // auto sparse_table = param_.sparse_table(0); + std::unordered_map config; + if (accessor_class == "DownpourFeatureValueAccessor" || + accessor_class == "DownpourCtrAccessor" || + accessor_class == "DownpourCtrDoubleAccessor") { + config["nonclk_coeff"] = sparse_table_accessor_parameter.nonclk_coeff(); + config["clk_coeff"] = sparse_table_accessor_parameter.click_coeff(); + config["learning_rate"] = + sparse_table_accessor.sparse_sgd_param().learning_rate(); + config["initial_g2sum"] = + sparse_table_accessor.sparse_sgd_param().initial_g2sum(); + config["initial_range"] = + 
sparse_table_accessor.sparse_sgd_param().initial_range(); + if (sparse_table_accessor.sparse_sgd_param().weight_bounds_size() == 2) { + config["min_bound"] = + sparse_table_accessor.sparse_sgd_param().weight_bounds()[0]; + config["max_bound"] = + sparse_table_accessor.sparse_sgd_param().weight_bounds()[1]; + } + config["mf_create_thresholds"] = sparse_table_accessor.embedx_threshold(); + } else if (accessor_class == "DownpourSparseValueAccessor") { + auto optimizer_name = sparse_table_accessor.sparse_commonsgd_param().name(); + if (optimizer_name == "naive") { + config["learning_rate"] = sparse_table_accessor.sparse_commonsgd_param() + .naive() + .learning_rate(); + config["initial_range"] = sparse_table_accessor.sparse_commonsgd_param() + .naive() + .initial_range(); + if (sparse_table_accessor.sparse_commonsgd_param() + .naive() + .weight_bounds_size() == 2) { + config["min_bound"] = sparse_table_accessor.sparse_commonsgd_param() + .naive() + .weight_bounds()[0]; + config["max_bound"] = sparse_table_accessor.sparse_commonsgd_param() + .naive() + .weight_bounds()[1]; + } + } else if (optimizer_name == "adagrad") { + config["learning_rate"] = sparse_table_accessor.sparse_commonsgd_param() + .adagrad() + .learning_rate(); + config["initial_range"] = sparse_table_accessor.sparse_commonsgd_param() + .adagrad() + .initial_range(); + config["initial_g2sum"] = sparse_table_accessor.sparse_commonsgd_param() + .adagrad() + .initial_g2sum(); + if (sparse_table_accessor.sparse_commonsgd_param() + .adagrad() + .weight_bounds_size() == 2) { + config["min_bound"] = sparse_table_accessor.sparse_commonsgd_param() + .adagrad() + .weight_bounds()[0]; + config["max_bound"] = sparse_table_accessor.sparse_commonsgd_param() + .adagrad() + .weight_bounds()[1]; + } + } else if (optimizer_name == "adam") { + config["learning_rate"] = + sparse_table_accessor.sparse_commonsgd_param().adam().learning_rate(); + config["initial_range"] = + sparse_table_accessor.sparse_commonsgd_param().adam().initial_range(); + if (sparse_table_accessor.sparse_commonsgd_param() + .adam() + .weight_bounds_size() == 2) { + config["min_bound"] = sparse_table_accessor.sparse_commonsgd_param() + .adam() + .weight_bounds()[0]; + config["max_bound"] = sparse_table_accessor.sparse_commonsgd_param() + .adam() + .weight_bounds()[1]; + } + } + } else if (accessor_class == "DownpourUnitAccessor" || + accessor_class == "DownpourDoubleUnitAccessor") { + config["nonclk_coeff"] = sparse_table_accessor_parameter.nonclk_coeff(); + config["clk_coeff"] = sparse_table_accessor_parameter.click_coeff(); + auto optimizer_name = sparse_table_accessor.embedx_sgd_param().name(); + if (optimizer_name == "naive") { + config["mf_learning_rate"] = + sparse_table_accessor.embedx_sgd_param().naive().learning_rate(); + config["mf_initial_range"] = + sparse_table_accessor.embedx_sgd_param().naive().initial_range(); + if (sparse_table_accessor.embedx_sgd_param() + .naive() + .weight_bounds_size() == 2) { + config["mf_min_bound"] = + sparse_table_accessor.embedx_sgd_param().naive().weight_bounds()[0]; + config["mf_max_bound"] = + sparse_table_accessor.embedx_sgd_param().naive().weight_bounds()[1]; + } + } else if (optimizer_name == "adagrad") { + config["mf_learning_rate"] = + sparse_table_accessor.embedx_sgd_param().adagrad().learning_rate(); + config["mf_initial_range"] = + sparse_table_accessor.embedx_sgd_param().adagrad().initial_range(); + config["mf_initial_g2sum"] = + sparse_table_accessor.embedx_sgd_param().adagrad().initial_g2sum(); + if 
(sparse_table_accessor.embedx_sgd_param() + .adagrad() + .weight_bounds_size() == 2) { + config["mf_min_bound"] = sparse_table_accessor.embedx_sgd_param() + .adagrad() + .weight_bounds()[0]; + config["mf_max_bound"] = sparse_table_accessor.embedx_sgd_param() + .adagrad() + .weight_bounds()[1]; + } + } else if (optimizer_name == "std_adagrad") { + config["mf_learning_rate"] = + sparse_table_accessor.embedx_sgd_param().adagrad().learning_rate(); + config["mf_initial_range"] = + sparse_table_accessor.embedx_sgd_param().adagrad().initial_range(); + config["mf_initial_g2sum"] = + sparse_table_accessor.embedx_sgd_param().adagrad().initial_g2sum(); + if (sparse_table_accessor.embedx_sgd_param() + .adagrad() + .weight_bounds_size() == 2) { + config["mf_min_bound"] = sparse_table_accessor.embedx_sgd_param() + .adagrad() + .weight_bounds()[0]; + config["mf_max_bound"] = sparse_table_accessor.embedx_sgd_param() + .adagrad() + .weight_bounds()[1]; + } + } else if (optimizer_name == "adam") { + config["mf_learning_rate"] = + sparse_table_accessor.embedx_sgd_param().adam().learning_rate(); + config["mf_initial_range"] = + sparse_table_accessor.embedx_sgd_param().adam().initial_range(); + if (sparse_table_accessor.embedx_sgd_param() + .adam() + .weight_bounds_size() == 2) { + config["mf_min_bound"] = + sparse_table_accessor.embedx_sgd_param().adam().weight_bounds()[0]; + config["mf_max_bound"] = + sparse_table_accessor.embedx_sgd_param().adam().weight_bounds()[1]; + } + } + config["mf_create_thresholds"] = sparse_table_accessor.embedx_threshold(); + } + + auto ps_gpu_wrapper = paddle::framework::PSGPUWrapper::GetInstance(); + ps_gpu_wrapper->InitializeGPUServer(config); +} + std::string PSGPUTrainer::GetDumpPath(int tid) { if (user_define_dump_filename_ != "") { return string::format_string("%s/part-%s-%05d", dump_fields_path_.c_str(), diff --git a/paddle/fluid/framework/pten_utils.cc b/paddle/fluid/framework/pten_utils.cc index b96eb848e43a4e8ab6c323aa4361ed401dd9adf7..0ecc04dbd6b8d36a3540178d11d3e8def7449a7f 100644 --- a/paddle/fluid/framework/pten_utils.cc +++ b/paddle/fluid/framework/pten_utils.cc @@ -137,7 +137,7 @@ KernelArgsNameMakerByOpProto::GetInputArgsNames() { continue; } // If contains dispensable input, we should override the - // GetExpectedPtenKernelArgs method self + // OpArgumentMapping method self in phi/ops/compat dir if (in.has_dispensable() && in.dispensable()) { VLOG(6) << "Parse PtenKernel input: skip dispensable input - " << in_name; continue; @@ -153,7 +153,11 @@ KernelArgsNameMakerByOpProto::GetOutputArgsNames() { for (int i = 0; i < op_proto_->outputs_size(); ++i) { auto& out = op_proto_->outputs()[i]; auto& out_name = out.name(); - // TODO(chenweihang): outputs also need skip some cases + if ((out.has_extra() && out.extra()) || (out.has_quant() && out.quant())) { + VLOG(6) << "Parse PtenKernel output: skip extra & quant output - " + << out_name; + continue; + } VLOG(6) << "Parse PtenKernel output: " << out_name; output_names_.emplace_back(out_name); } @@ -165,9 +169,10 @@ KernelArgsNameMakerByOpProto::GetAttrsArgsNames() { for (int i = 0; i < op_proto_->attrs_size(); ++i) { auto& attr = op_proto_->attrs()[i]; auto& attr_name = attr.name(); - if (attr_name == "use_mkldnn" || attr_name == "op_role" || - attr_name == "op_role_var" || attr_name == "op_namescope" || - attr_name == "op_callstack" || attr_name == "op_device") { + if (attr_name == "use_mkldnn" || attr_name == "use_cudnn" || + attr_name == "op_role" || attr_name == "op_role_var" || + attr_name == "op_namescope" || 
attr_name == "op_callstack" || + attr_name == "op_device") { VLOG(6) << "Parse PtenKernel attribute: skip needless attr - " << attr_name; continue; diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h index b9a262105e47479fce8f5ae4f1ab6b852464d745..57eddf782f06bfce1d42c26e68c7789207bcf37f 100644 --- a/paddle/fluid/framework/tensor.h +++ b/paddle/fluid/framework/tensor.h @@ -30,6 +30,7 @@ limitations under the License. */ #include "paddle/phi/core/ddim.h" #include "paddle/phi/core/stream.h" +#include "paddle/fluid/framework/mixed_vector.h" #include "paddle/phi/core/dense_tensor.h" namespace paddle { diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index 1eb5727298c39aba41b4efe832b10d363b6030ea..10eefff093b0e867131c91fb0a8132175a28c6be 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -1455,22 +1455,10 @@ std::ostream& print_tensor>( } std::ostream& operator<<(std::ostream& os, const LoD& lod) { - os << "{"; - for (auto& v : lod) { - os << "{"; - bool is_first = true; - for (auto& i : v) { - if (is_first) { - os << i; - is_first = false; - } else { - os << ", " << i; - } - } - os << "}"; - } - os << "}"; - + // NOTE(xiongkun): + // https://stackoverflow.com/questions/5195512/namespaces-and-operator-resolution + // if we don't redefine, the operator << of pten / framework LoD is not found. + paddle::string::operator<<(os, lod); return os; } @@ -1479,6 +1467,11 @@ std::ostream& operator<<(std::ostream& os, const LoD& lod) { namespace phi { +std::ostream& operator<<(std::ostream& os, const LoD& lod) { + paddle::string::operator<<(os, lod); + return os; +} + std::ostream& operator<<(std::ostream& os, const phi::DenseTensor& t) { if (t.lod().size() > 0) { os << " - lod: " << t.lod() << "\n"; diff --git a/paddle/fluid/framework/trainer.h b/paddle/fluid/framework/trainer.h index c993895a9f0ea1ff5e592366136b4e3bba562bd8..8a11775702e57887015f831fcd4e3a3f91bd9d56 100644 --- a/paddle/fluid/framework/trainer.h +++ b/paddle/fluid/framework/trainer.h @@ -36,6 +36,10 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/reader/blocking_queue.h" #include "paddle/phi/backends/dynload/port.h" +#ifdef PADDLE_WITH_PSLIB +#include +#endif + namespace paddle { namespace framework { @@ -267,6 +271,7 @@ class PSGPUTrainer : public TrainerBase { template void MergeToRootScope(LoDTensor* root_tensor, LoDTensor* thread_tensor); + void InitializeGPUServer(const TrainerDesc& trainer_desc); protected: Dataset* dataset_; @@ -287,6 +292,9 @@ class PSGPUTrainer : public TrainerBase { int mpi_rank_; int mpi_size_; int dump_file_num_; + + // _ps_param for gpups optimizer config + ::paddle::PSParameter _ps_param; }; #endif diff --git a/paddle/fluid/framework/trainer_desc.proto b/paddle/fluid/framework/trainer_desc.proto index 96d312437b34cf1fafc4fbcaeec91201a1fa934a..6fe33545aa22d3f17234dbb1b6cd8ad1bb719409 100644 --- a/paddle/fluid/framework/trainer_desc.proto +++ b/paddle/fluid/framework/trainer_desc.proto @@ -66,6 +66,9 @@ message TrainerDesc { repeated int32 trainers = 35; optional int32 trainer_id = 36; + // add for gpu + optional string fleet_desc = 37; + // device worker parameters optional HogwildWorkerParameter hogwild_param = 101; optional DownpourWorkerParameter downpour_param = 103; diff --git a/paddle/fluid/framework/var_type_inference.h b/paddle/fluid/framework/var_type_inference.h index f649c9388f0f6518dc4f8a587f5c9f9c01451373..945b68438e1e702e7b2e6498a26b0a107c6640da 100644 --- a/paddle/fluid/framework/var_type_inference.h +++ b/paddle/fluid/framework/var_type_inference.h @@ -69,6 +69,12 @@ class InferVarTypeContext { return op_->Inputs().at(name).size(); } + virtual size_t OutputSize(const std::string& name) const { + PADDLE_ENFORCE_NOT_NULL( + op_, platform::errors::PreconditionNotMet("op_ should not be null")); + return op_->Outputs().at(name).size(); + } + virtual const std::string& InputVarName(const std::string& name, const int index = 0) const { PADDLE_ENFORCE_NOT_NULL( diff --git a/paddle/fluid/imperative/CMakeLists.txt b/paddle/fluid/imperative/CMakeLists.txt index 90cf0e76e000736f730121a6fcce841aa38a59ae..72f7e5af9a96eea2a6cd09912d2dbcc5f53bd931 100644 --- a/paddle/fluid/imperative/CMakeLists.txt +++ b/paddle/fluid/imperative/CMakeLists.txt @@ -31,6 +31,9 @@ if(NOT WIN32) cc_library(hccl_context SRCS hccl_context.cc DEPS collective_helper device_context tensor var_type_traits) cc_library(reducer SRCS reducer.cc DEPS layer) endif() + if(WITH_CNCL) + cc_library(cncl_context SRCS cncl_context.cc DEPS collective_helper device_context tensor var_type_traits) + endif() if(WITH_NCCL OR WITH_RCCL OR WITH_XPU_BKCL OR WITH_ASCEND_CL) cc_library(heter_ccl_context SRCS heter_ccl_context.cc DEPS collective_helper device_context tensor var_type_traits) endif() diff --git a/paddle/fluid/imperative/all_reduce.cc b/paddle/fluid/imperative/all_reduce.cc index 24a8ffbabf526ca779511f620648c64fcbb59cca..436e22f00c303d59652db33a723fe727b63657ef 100644 --- a/paddle/fluid/imperative/all_reduce.cc +++ b/paddle/fluid/imperative/all_reduce.cc @@ -90,6 +90,7 @@ static void AllReduce(const phi::SelectedRows &src, phi::SelectedRows *dst, platform::DeviceContextPool::Instance().Get(place)); bool use_calc_stream = (dev_ctx->stream() == stream); + VLOG(4) << "Is use calculate stream: " << use_calc_stream; // 1. Gather rows number from all workers. 
Here use ncclAllGather to do this, // but we can use other ways to implement is in the future @@ -97,7 +98,9 @@ static void AllReduce(const phi::SelectedRows &src, phi::SelectedRows *dst, framework::Vector rows_num_vector(strategy.nranks_); rows_num_vector[strategy.local_rank_] = static_cast(src_rows.size()); // CUDAMutableData use CalStream - auto *gpu_rows_num_ptr = rows_num_vector.CUDAMutableData(place); + paddle::framework::MixVector mixv_rows_num_vector(&rows_num_vector); + auto *gpu_rows_num_ptr = mixv_rows_num_vector.CUDAMutableData(place); + VLOG(4) << "start dev_ctx->wait"; if (!use_calc_stream) { dev_ctx->Wait(); } @@ -109,6 +112,7 @@ static void AllReduce(const phi::SelectedRows &src, phi::SelectedRows *dst, platform::GpuStreamSync(stream); } + mixv_rows_num_vector.CopyToCPU(); const auto *cpu_rows_num_ptr = rows_num_vector.data(); auto rows_num = std::accumulate(cpu_rows_num_ptr, cpu_rows_num_ptr + strategy.nranks_, @@ -121,8 +125,10 @@ static void AllReduce(const phi::SelectedRows &src, phi::SelectedRows *dst, auto *dst_rows = dst->mutable_rows(); dst_rows->resize(rows_num); - auto *dst_rows_ptr = dst_rows->CUDAMutableData(place); - const auto *src_rows_ptr = src_rows.CUDAData(place); + paddle::framework::MixVector mixv_dst_rows(dst_rows); + auto *dst_rows_ptr = mixv_dst_rows.CUDAMutableData(place); + paddle::framework::MixVector mixv_src_rows(&src_rows); + const auto *src_rows_ptr = mixv_src_rows.CUDAData(place); auto *dst_tensor = dst->mutable_value(); auto dims = src_tensor.dims(); @@ -150,24 +156,28 @@ static void AllReduce(const phi::SelectedRows &src, phi::SelectedRows *dst, PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllGather( src_tensor_ptr, dst_tensor_ptr, value_sendcount, nccl_dtype, comm->comm(), stream)); - return; - } - for (int i = 0; i < strategy.nranks_; ++i) { - if (cpu_rows_num_ptr[i] > 0) { - // 2. Broadcast the rows of SelectedRows - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclBroadcast( - src_rows_ptr, dst_rows_ptr + row_offset, cpu_rows_num_ptr[i], - ncclInt64, i, comm->comm(), stream)); - // 3. Broadcast the tensor data of SelectedRows - auto *dst_tensor_ptr_i = reinterpret_cast(dst_tensor_ptr) + - row_offset * feature_size * sizeof_dtype; - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclBroadcast( - src_tensor_ptr, dst_tensor_ptr_i, cpu_rows_num_ptr[i] * feature_size, - nccl_dtype, i, comm->comm(), stream)); - row_offset += cpu_rows_num_ptr[i]; + } else { + for (int i = 0; i < strategy.nranks_; ++i) { + if (cpu_rows_num_ptr[i] > 0) { + // 2. Broadcast the rows of SelectedRows + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclBroadcast( + src_rows_ptr, dst_rows_ptr + row_offset, cpu_rows_num_ptr[i], + ncclInt64, i, comm->comm(), stream)); + // 3. 
Broadcast the tensor data of SelectedRows + auto *dst_tensor_ptr_i = reinterpret_cast(dst_tensor_ptr) + + row_offset * feature_size * sizeof_dtype; + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclBroadcast( + src_tensor_ptr, dst_tensor_ptr_i, + cpu_rows_num_ptr[i] * feature_size, nccl_dtype, i, comm->comm(), + stream)); + row_offset += cpu_rows_num_ptr[i]; + } } } - + if (!use_calc_stream) { + platform::GpuStreamSync(stream); + } + mixv_dst_rows.CopyToCPU(); VLOG(3) << "Original SelectedRows rows: " << string::join_strings(src_rows, ','); VLOG(3) << "Result SelectedRows rows: " diff --git a/paddle/fluid/imperative/cncl_context.cc b/paddle/fluid/imperative/cncl_context.cc new file mode 100644 index 0000000000000000000000000000000000000000..779b748c2d2d43db1019bf60d063a21eb209b6bb --- /dev/null +++ b/paddle/fluid/imperative/cncl_context.cc @@ -0,0 +1,237 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#if defined(PADDLE_WITH_CNCL) +#include "paddle/fluid/imperative/cncl_context.h" + +#include "paddle/fluid/framework/convert_utils.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/variable.h" + +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/gen_comm_id_helper.h" +#include "paddle/fluid/platform/place.h" + +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/device/mlu/cncl_helper.h" +#include "paddle/fluid/platform/device/mlu/mlu_info.h" + +namespace paddle { +namespace framework { +class Variable; +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace imperative { + +static void AllReduce(const framework::Tensor &src, framework::Tensor *dst, + const mluStream stream, const platform::CNCLComm *comm) { + const auto &place = src.place(); + PADDLE_ENFORCE_EQ( + platform::is_mlu_place(place), true, + platform::errors::Unimplemented( + "Imperative mode does not support multi-CPU training yet.")); + + const void *src_ptr = src.data(); + dst->Resize(src.dims()); + auto *dst_ptr = dst->mutable_data(src.place(), src.dtype()); + auto cncl_dtype = + platform::ToCNCLDataType(framework::TransToProtoVarType(src.dtype())); + PADDLE_ENFORCE_MLU_SUCCESS(cnclAllReduce(src_ptr, dst_ptr, src.numel(), + cncl_dtype, cnclSum, comm->comm(), + stream)); +} + +void CNCLParallelContext::BcastCNCLId( + std::vector &cncl_ids, // NOLINT + int root, int server_fd) { + if (strategy_.local_rank_ == root) { + std::vector other_trainers; + for (auto &ep : strategy_.trainer_endpoints_) { + if (ep != strategy_.current_endpoint_) { + other_trainers.push_back(ep); + } + } + platform::SendBroadCastCommID(other_trainers, &cncl_ids); + } else { + platform::RecvBroadCastCommID(server_fd, strategy_.current_endpoint_, + &cncl_ids); + } +} + +void CNCLParallelContext::Init() { + int server_fd = -1; + + std::vector cncl_ids; + cncl_ids.resize(strategy_.nrings_); + + if (strategy_.local_rank_ == 0) { + // generate the unique cnclid on the root worker + for 
(size_t i = 0; i < cncl_ids.size(); ++i) { + PADDLE_ENFORCE_MLU_SUCCESS(cnclGetCliqueId(&cncl_ids[i])); + } + } else { + server_fd = platform::SocketServer::GetInstance(strategy_.current_endpoint_) + .socket(); + } + BcastCNCLId(cncl_ids, 0, server_fd); + + int mlu_id = place_.device; + for (int ring_id = 0; ring_id < strategy_.nrings_; ++ring_id) { + VLOG(0) << "init cncl context nranks: " << strategy_.nranks_ + << " local rank: " << strategy_.local_rank_ << " mlu id: " << mlu_id + << " ring id: " << ring_id; + // it will assign cncl_comm in MLUDeviceContext within ring_id + platform::CNCLCommContext::Instance().CreateComm( + &cncl_ids[ring_id], strategy_.nranks_, strategy_.local_rank_, mlu_id, + ring_id); + + compute_events_.emplace_back( + platform::MluEventResourcePool::Instance().New(place_.device)); + comm_events_.emplace_back( + platform::MluEventResourcePool::Instance().New(place_.device)); + } +} + +void CNCLParallelContext::InitWithRingID(int ring_id) { + int server_fd = -1; + std::vector cncl_ids; + cncl_ids.resize(1); + + if (strategy_.local_rank_ == 0) { + // generate the unique cnclid on the root worker + PADDLE_ENFORCE_MLU_SUCCESS(cnclGetCliqueId(&cncl_ids[0])); + } else { + server_fd = platform::SocketServer::GetInstance(strategy_.current_endpoint_) + .socket(); + } + BcastCNCLId(cncl_ids, 0, server_fd); + + int mlu_id = place_.device; + VLOG(0) << "init cncl context nranks: " << strategy_.nranks_ + << " local rank: " << strategy_.local_rank_ << " mlu id: " << mlu_id + << " ring id: " << ring_id; + // it will assign cncl_comm in MLUDeviceContext within ring_id + platform::CNCLCommContext::Instance().CreateComm( + &cncl_ids[0], strategy_.nranks_, strategy_.local_rank_, mlu_id, ring_id); + + compute_events_.emplace_back( + platform::MluEventResourcePool::Instance().New(place_.device)); + comm_events_.emplace_back( + platform::MluEventResourcePool::Instance().New(place_.device)); +} + +void CNCLParallelContext::AllReduceByStream(const framework::Variable &src, + framework::Variable *dst, + int ring_id, bool use_calc_stream) { + PADDLE_ENFORCE_EQ( + platform::is_mlu_place(place_), true, + platform::errors::Unimplemented( + "Dynamic graph mode does not support multi-CPU training yet.")); + auto *dev_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(place_)); + platform::CNCLComm *comm = + platform::CNCLCommContext::Instance().Get(ring_id, place_); + mluStream stream = (use_calc_stream ? 
dev_ctx->stream() : comm->stream()); + + if (src.IsType()) { + if (!dst->IsType()) { + dst->Clear(); + } + AllReduce(src.Get(), + dst->GetMutable(), stream, comm); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Unsupported variable type %s for imperative allreduce, only " + "LoDTensor is supported.", + platform::demangle(framework::ToTypeName(src.Type())))); + } +} + +void CNCLParallelContext::Broadcast(framework::Variable *src, int ring_id) { + VLOG(3) << "/// DEBUG /// start inter broadcast with ring_id: " << ring_id; + framework::Tensor *src_tensor = src->GetMutable(); + const auto &place = src_tensor->place(); + platform::CNCLComm *comm = + platform::CNCLCommContext::Instance().Get(ring_id, place); + mluStream stream = comm->stream(); + + void *src_ptr = src_tensor->data(); + auto cncl_dtype = platform::ToCNCLDataType( + framework::TransToProtoVarType(src_tensor->dtype())); + PADDLE_ENFORCE_MLU_SUCCESS(cnclBcast(src_ptr, src_tensor->numel(), cncl_dtype, + 0, comm->comm(), stream)); +} + +paddle::platform::DeviceContext *CNCLParallelContext::GetDeviceContext( + int ring_id) { + return static_cast( + platform::CNCLCommContext::Instance() + .Get(ring_id, place_) + ->dev_context()); +} + +void CNCLParallelContext::WaitCompute(int ring_id) { + PADDLE_ENFORCE_GE(ring_id, 0, platform::errors::OutOfRange( + "ring id must >= 0, but got %d", ring_id)); + PADDLE_ENFORCE_LT(ring_id, compute_events_.size(), + platform::errors::OutOfRange( + "ring id must < compute events size," + "but got ring id = %d, compute events size = %d", + ring_id, compute_events_.size())); + + auto compute_stream = static_cast( + platform::DeviceContextPool::Instance().Get(place_)) + ->stream(); + auto comm_stream = + platform::CNCLCommContext::Instance().Get(ring_id, place_)->stream(); + auto event = compute_events_[ring_id].get(); + + // compute_stream-->event-->comm_stream + PADDLE_ENFORCE_MLU_SUCCESS(cnrtPlaceNotifier(event, compute_stream)); + PADDLE_ENFORCE_MLU_SUCCESS(cnrtQueueWaitNotifier(event, comm_stream, 0)); +} + +void CNCLParallelContext::WaitComm(int ring_id) { + PADDLE_ENFORCE_GE(ring_id, 0, platform::errors::OutOfRange( + "ring id must >= 0, but got %d", ring_id)); + PADDLE_ENFORCE_LT(ring_id, comm_events_.size(), + platform::errors::OutOfRange( + "ring id must < comm events size," + "but got ring id = %d, comm events size = %d", + ring_id, comm_events_.size())); + + auto compute_stream = static_cast( + platform::DeviceContextPool::Instance().Get(place_)) + ->stream(); + auto comm_stream = + platform::CNCLCommContext::Instance().Get(ring_id, place_)->stream(); + auto event = comm_events_[ring_id].get(); + + // comm_stream-->event-->compute_stream + PADDLE_ENFORCE_MLU_SUCCESS(cnrtPlaceNotifier(event, comm_stream)); + PADDLE_ENFORCE_MLU_SUCCESS(cnrtQueueWaitNotifier(event, compute_stream, 0)); +} + +void CNCLParallelContext::SynchronizeCompute() { + auto *compute_dev_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(place_)); + compute_dev_ctx->Wait(); +} + +} // namespace imperative +} // namespace paddle + +#endif diff --git a/paddle/fluid/imperative/cncl_context.h b/paddle/fluid/imperative/cncl_context.h new file mode 100644 index 0000000000000000000000000000000000000000..85f53319bfcde909f8ddc42ad1640a6b5269632d --- /dev/null +++ b/paddle/fluid/imperative/cncl_context.h @@ -0,0 +1,75 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#if defined(PADDLE_WITH_CNCL) +#include + +#include +#include +#include + +#include "paddle/fluid/imperative/parallel_context.h" +#include "paddle/fluid/platform/device/mlu/mlu_resource_pool.h" + +namespace paddle { +namespace framework { +class Variable; +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace imperative { + +class CNCLParallelContext : public ParallelContext { + public: + explicit CNCLParallelContext(const ParallelStrategy& strategy, + const platform::Place& place) + : ParallelContext(strategy, place) {} + + ~CNCLParallelContext() override = default; + + void BcastCNCLId(std::vector& cncl_ids, int root, // NOLINT + int server_fd); + + void Init() override; + + void InitWithRingID(int ring_id) override; + + void AllReduceByStream(const framework::Variable& src, + framework::Variable* dst, int ring_id, + bool use_calc_stream) override; + + void Broadcast(framework::Variable* src, int ring_id) override; + + paddle::platform::DeviceContext* GetDeviceContext(int ring_id) override; + + void WaitCompute(int ring_id) override; + + void WaitComm(int ring_id) override; + + void SynchronizeCompute() override; + + private: + // used for comm wait compute, compute_stream-->event-->comm_stream[ring_id] + std::vector> compute_events_; + + // used for compute wait comm, comm_stream[ring_id]-->event-->compute_stream + std::vector> comm_events_; +}; + +} // namespace imperative +} // namespace paddle +#endif diff --git a/paddle/fluid/imperative/gloo_context.cc b/paddle/fluid/imperative/gloo_context.cc index 8997966165769cac1c89ad7c8846cdd13bbc2348..dd34b8b619f80a0e7cb5f122d10850482b1b74ad 100644 --- a/paddle/fluid/imperative/gloo_context.cc +++ b/paddle/fluid/imperative/gloo_context.cc @@ -143,7 +143,7 @@ void GLOOParallelContext::AllReduce(const phi::SelectedRows &src, auto dtype = framework::TransToProtoVarType(src_tensor.dtype()); // 1. Gather rows number from all workers. 
Here use ncclAllGather to do this, // but we can use other ways to implement is in the future - const auto &src_rows = src.rows(); + auto &src_rows = src.rows(); auto gloo_wrapper = framework::GlooWrapper::GetInstance(); size_t local_row_num = src_rows.size(); std::vector rows_num_vector = @@ -157,8 +157,10 @@ void GLOOParallelContext::AllReduce(const phi::SelectedRows &src, << ", height: " << src.height(); auto *dst_rows = dst->mutable_rows(); dst_rows->resize(rows_num); - auto *dst_rows_ptr = dst_rows->MutableData(place); - const int64_t *src_rows_ptr = src_rows.Data(place); + paddle::framework::MixVector mixv_dst_rows(dst_rows); + auto *dst_rows_ptr = mixv_dst_rows.MutableData(place); + paddle::framework::MixVector mixv_src_rows(&src_rows); + const int64_t *src_rows_ptr = mixv_src_rows.Data(place); auto *dst_tensor = dst->mutable_value(); auto dims = src_tensor.dims(); diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc index 05218ba961fdd115bd0d28755ce14e03a1c01003..6d18b0a86f0911f38e1c51d61467bf9a01a6de21 100644 --- a/paddle/fluid/imperative/prepared_operator.cc +++ b/paddle/fluid/imperative/prepared_operator.cc @@ -161,6 +161,13 @@ PreparedOp PrepareImpl(const NameVarMap& ins, framework::KernelSignature pt_kernel_signature; phi::KernelKey pt_kernel_key; std::string pt_kernel_name; +#ifdef PADDLE_WITH_XPU + bool is_xpu_unsupport = + paddle::platform::is_xpu_place(expected_kernel_key.place_) && + !paddle::platform::is_xpu_support_op(op.Type(), + expected_kernel_key) || + paddle::platform::is_in_xpu_black_list(op.Type()); +#endif if (phi::KernelFactory::Instance().HasCompatiblePtenKernel(op.Type())) { pt_kernel_signature = op.GetExpectedPtenKernelArgs(dygraph_exe_ctx); VLOG(6) << pt_kernel_signature; @@ -170,7 +177,11 @@ PreparedOp PrepareImpl(const NameVarMap& ins, auto pt_kernel = phi::KernelFactory::Instance().SelectKernel(pt_kernel_name, pt_kernel_key); - if (pt_kernel.IsValid()) { + if (pt_kernel.IsValid() +#ifdef PADDLE_WITH_XPU + && !is_xpu_unsupport +#endif + ) { VLOG(6) << "Dynamic mode PrepareImpl - kernel name: " << pt_kernel_name << " | kernel key: " << pt_kernel_key << " | kernel: " << pt_kernel; @@ -197,13 +208,9 @@ PreparedOp PrepareImpl(const NameVarMap& ins, kernels_iter->second.find(expected_kernel_key) == kernels_iter->second.end()) #ifdef PADDLE_WITH_XPU - || - paddle::platform::is_xpu_place(expected_kernel_key.place_) && - !paddle::platform::is_xpu_support_op(op.Type(), - expected_kernel_key) || - paddle::platform::is_in_xpu_black_list(op.Type()) + || is_xpu_unsupport #endif - ) { + ) { if (phi::KernelFactory::Instance().HasCompatiblePtenKernel(op.Type())) { auto pt_cpu_kernel_key = FallBackToCpu(expected_kernel_key, pt_kernel_key, op); @@ -230,9 +237,7 @@ PreparedOp PrepareImpl(const NameVarMap& ins, #ifdef PADDLE_WITH_XPU if (paddle::platform::is_xpu_place(expected_kernel_key.place_) && - (kernel_iter == kernels.end() || - !paddle::platform::is_xpu_support_op(op.Type(), expected_kernel_key) || - paddle::platform::is_in_xpu_black_list(op.Type()))) { + (kernel_iter == kernels.end() || is_xpu_unsupport)) { VLOG(3) << "missing XPU kernel: " << op.Type() << ", expected_kernel_key:" << expected_kernel_key << ", fallbacking to CPU one!"; diff --git a/paddle/fluid/imperative/prepared_operator.h b/paddle/fluid/imperative/prepared_operator.h index 589c8edd446bdb8eaf56d43826c7c5305829965b..879b3ec3e68a25141c239d00e25fab92914ef068 100644 --- a/paddle/fluid/imperative/prepared_operator.h +++ 
b/paddle/fluid/imperative/prepared_operator.h @@ -442,7 +442,9 @@ void BuildDygraphPtenKernelContext( vector_int_attr.end()); kernel_ctx->EmplaceBackAttr(vector_int64_attr); } - // TODO(YuanRisheng) Need support vector attr + } else if (attr_defs[i].type_index == + std::type_index(typeid(std::vector))) { + kernel_ctx->EmplaceBackAttr(BOOST_GET_CONST(std::vector, attr)); } else { PADDLE_THROW(platform::errors::Unimplemented( "Unsupported cast op attribute `%s` when construct " @@ -477,6 +479,9 @@ void PreparePtenData(const phi::Kernel& pt_kernel, auto var = ins_vector[offset]; const auto* tensor_in = GetTensorFromVar(var->Var()); if (tensor_in && tensor_in->IsInitialized()) { + if (in_def.backend == phi::Backend::ALL_BACKEND) { + continue; + } auto expected_place = phi::TransToPtenPlace(in_def.backend); if (platform::is_same_place(tensor_in->place(), expected_place)) { continue; diff --git a/paddle/fluid/imperative/tests/CMakeLists.txt b/paddle/fluid/imperative/tests/CMakeLists.txt index 774bb9653e2cba5c27f9037ee905e70175375339..a9c81cb87798b6e7b68de169e7f40ba1c3ccd367 100644 --- a/paddle/fluid/imperative/tests/CMakeLists.txt +++ b/paddle/fluid/imperative/tests/CMakeLists.txt @@ -9,6 +9,9 @@ else() if (WITH_XPU_BKCL) cc_test(bkcl_context_test SRCS bkcl_context_test.cc DEPS bkcl_context) endif() + if (WITH_CNCL) + cc_test(cncl_context_test SRCS cncl_context_test.cc DEPS cncl_context) + endif() endif(WIN32) diff --git a/paddle/fluid/imperative/tests/cncl_context_test.cc b/paddle/fluid/imperative/tests/cncl_context_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..1d5ee8e7fc899f2b5496fd808b39c1bf4be69e73 --- /dev/null +++ b/paddle/fluid/imperative/tests/cncl_context_test.cc @@ -0,0 +1,141 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include // NOLINT + +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/imperative/cncl_context.h" +#include "paddle/fluid/platform/gen_comm_id_helper.h" + +#include "gtest/gtest.h" + +namespace imperative = paddle::imperative; +namespace platform = paddle::platform; +namespace framework = paddle::framework; + +// Node1: FLAGS_selected_mlus=0 PADDLE_TRAINER_ID=0 ./cncl_context_test +// Node2: FLAGS_selected_mlus=1 PADDLE_TRAINER_ID=1 ./cncl_context_test + +int nrings = 1; +imperative::ParallelStrategy GetStrategy(int local_rank) { + std::vector eps = {"127.0.0.1:9866", "localhost:9867"}; + imperative::ParallelStrategy strategy; + strategy.trainer_endpoints_ = eps; + strategy.current_endpoint_ = eps[local_rank]; + strategy.nranks_ = 2; + strategy.local_rank_ = local_rank; + strategy.nrings_ = nrings; + return strategy; +} + +#if defined(PADDLE_WITH_CNCL) +void Broadcast(int local_rank, int device_id) { + int data_size = 4; + float test_data = 7; + const auto& place = platform::MLUPlace(device_id); + platform::MLUDeviceContext ctx(place); + + imperative::CNCLParallelContext cpc(GetStrategy(local_rank), place); + + // init + cpc.Init(); + + framework::Variable* src_dev_var(new framework::Variable()); + auto* src_dev_tensor = src_dev_var->GetMutable(); + src_dev_tensor->mutable_data(phi::make_ddim({data_size}), place); + + // fill data for rank 0 only + std::vector src_vec; + if (local_rank == 0) { + for (int i = 0; i < data_size; ++i) { + src_vec.push_back(test_data); + } + framework::TensorFromVector(src_vec, ctx, src_dev_tensor); + } + ctx.Wait(); + + // call broadcast + cpc.Broadcast(src_dev_var, 0); + std::this_thread::sleep_for(std::chrono::milliseconds(1000)); + + // check result + std::vector dst_vec; + framework::TensorToVector(*src_dev_tensor, ctx, &dst_vec); + ctx.Wait(); + + for (int i = 0; i < data_size; ++i) { + EXPECT_EQ(dst_vec[i], test_data); + } +} + +TEST(Broadcast, Run) { + if (platform::GetMLUDeviceCount() >= 2) { + int local_rank = atoi(getenv("PADDLE_TRAINER_ID")); + int device_id = atoi(getenv("FLAGS_selected_mlus")); + Broadcast(local_rank, device_id); + } +} + +void AllReduceByStream(int local_rank, int device_id) { + int data_size = 32; + const auto& place = platform::MLUPlace(device_id); + platform::MLUDeviceContext ctx(place); + + imperative::CNCLParallelContext cpc(GetStrategy(local_rank), place); + + // init + cpc.Init(); + + // input data + framework::Variable* src_dev_var(new framework::Variable()); + auto* src_dev_tensor = src_dev_var->GetMutable(); + src_dev_tensor->mutable_data(phi::make_ddim({data_size}), place); + + // fill input data + std::vector src_vec; + for (int i = 0; i < data_size; ++i) { + src_vec.push_back(1.0 + local_rank); + } + framework::TensorFromVector(src_vec, ctx, src_dev_tensor); + ctx.Wait(); + + // output data + framework::Variable* dst_dev_var(new framework::Variable()); + auto* dst_dev_tensor = dst_dev_var->GetMutable(); + dst_dev_tensor->mutable_data(phi::make_ddim({data_size}), place); + + // call allreduce + cpc.AllReduceByStream(*src_dev_var, dst_dev_var, 0, false); + std::this_thread::sleep_for(std::chrono::milliseconds(1000)); + + // check result + std::vector dst_vec; + framework::TensorToVector(*dst_dev_tensor, ctx, &dst_vec); + ctx.Wait(); + + EXPECT_EQ(dst_vec.size(), src_vec.size()); + for (int i = 0; i < data_size; ++i) { + EXPECT_EQ(dst_vec[i], 3.0); + } +} + +TEST(AllReduceByStream, Run) { + if (platform::GetMLUDeviceCount() >= 2) { + int 
local_rank = atoi(getenv("PADDLE_TRAINER_ID")); + int device_id = atoi(getenv("FLAGS_selected_mlus")); + AllReduceByStream(local_rank, device_id); + } +} +#endif diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index 1c9cc538ffece6778084075b01d565050e00d71e..03811ac778779c24beb765de118f2d7d00af515b 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -32,6 +32,8 @@ DECLARE_string(tracer_mkldnn_ops_off); namespace paddle { namespace imperative { +thread_local bool Tracer::enable_program_desc_tracing_ = false; + thread_local bool Tracer::has_grad_ = true; thread_local AmpLevel Tracer::amp_level_ = AmpLevel::O0; diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h index b508126c367960a842eb9562d42af1de9defade1..73ecbbe6143ca8e68049c2d2886e9eee93b741f1 100644 --- a/paddle/fluid/imperative/tracer.h +++ b/paddle/fluid/imperative/tracer.h @@ -160,10 +160,11 @@ class Tracer { private: std::unique_ptr basic_engine_; std::unique_ptr program_desc_tracer_; - bool enable_program_desc_tracing_{false}; std::unique_ptr generator_; platform::Place expected_place_; GarbageCollectorMap gcs_; + + static thread_local bool enable_program_desc_tracing_; static thread_local bool has_grad_; static thread_local AmpLevel amp_level_; static thread_local phi::DataType amp_dtype_; diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index d731bfe139bac58050fdf79b420744551bfd17e8..887bd52bae54770c3637e83c86d098a39f9a2e04 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -48,11 +48,10 @@ set(STATIC_INFERENCE_API paddle_inference_api analysis_predictor #TODO(wilber, T8T9): Do we still need to support windows gpu static library? 
if(WIN32 AND WITH_GPU) cc_library(paddle_inference DEPS ${fluid_modules} ${pten_modules} ${STATIC_INFERENCE_API} ${utils_modules}) +elseif(WITH_IPU) + cc_library(paddle_inference DEPS ${fluid_modules} ${pten_modules} ${STATIC_INFERENCE_API} ${utils_modules} paddle_ipu) else() create_static_lib(paddle_inference ${fluid_modules} ${pten_modules} ${STATIC_INFERENCE_API} ${utils_modules}) - if(WITH_IPU) - target_link_libraries(paddle_inference -Wl,--allow-multiple-definition popart_canonicalization_utils) - endif() endif() if(NOT APPLE) diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index f474ccd260e808c2b852eb0443818e8265bb7f7a..a5c32164bf1a28687ea6f8cc53427db67560c307 100644 --- a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -278,10 +278,14 @@ struct Argument { // ipu related DECL_ARGUMENT_FIELD(use_ipu, UseIpu, bool); DECL_ARGUMENT_FIELD(ipu_device_num, IpuDeviceNum, int); + DECL_ARGUMENT_FIELD(ipu_micro_batch_size, IpuMicroBatchSize, int); DECL_ARGUMENT_FIELD(ipu_enable_pipelining, IpuEnablePipelining, bool); DECL_ARGUMENT_FIELD(ipu_batches_per_step, IpuBatchesPerStep, int); - DECL_ARGUMENT_FIELD(ipu_batch_size, IpuBatchSize, int); - DECL_ARGUMENT_FIELD(ipu_need_avg_shard, IpuNeedAvgShard, bool); + DECL_ARGUMENT_FIELD(ipu_enable_fp16, IpuEnableFp16, bool); + DECL_ARGUMENT_FIELD(ipu_replica_num, IpuReplicaNum, int); + DECL_ARGUMENT_FIELD(ipu_available_memory_proportion, + IpuAvailableMemoryProportion, float); + DECL_ARGUMENT_FIELD(ipu_enable_half_partial, IpuEnableHalfPartial, bool); // npu related DECL_ARGUMENT_FIELD(use_npu, UseNpu, bool); diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index 837b83004de84e6839935835e5b7d4d1e2bc3f45..796c86a3ad1efe45dd8a00139b92c2642676a811 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -54,6 +54,27 @@ void IRPassManager::CreatePasses(Argument *argument, int pass_num = 0; for (const std::string &pass_name : passes) { auto pass = framework::ir::PassRegistry::Instance().Get(pass_name); + pass->Set("use_oss", new bool(argument->tensorrt_use_oss())); + pass->Set("with_interleaved", + new bool(argument->tensorrt_with_interleaved())); + pass->Set("disable_logs", new bool(argument->disable_logs())); + auto precision_mode = argument->tensorrt_precision_mode(); + bool enable_int8 = precision_mode == AnalysisConfig::Precision::kInt8; + pass->Set("enable_int8", new bool(enable_int8)); + pass->Set("max_input_shape", new std::map>( + argument->max_input_shape())); + pass->Set("min_input_shape", new std::map>( + argument->min_input_shape())); + pass->Set("optim_input_shape", new std::map>( + argument->optim_input_shape())); + // tuned trt dynamic_shape + pass->Set("trt_tuned_dynamic_shape", + new bool(argument->tensorrt_tuned_dynamic_shape())); + bool with_dynamic_shape = (argument->max_input_shape().size() > 0 && + argument->min_input_shape().size() > 0 && + argument->optim_input_shape().size() > 0) || + argument->tensorrt_tuned_dynamic_shape(); + pass->Set("with_dynamic_shape", new bool(with_dynamic_shape)); if (pass_name == "graph_viz_pass") { std::string optim_cache_dir = argument->optim_cache_dir(); @@ -99,17 +120,9 @@ void IRPassManager::CreatePasses(Argument *argument, new int(argument->tensorrt_min_subgraph_size())); pass->Set("program", new framework::ProgramDesc *(&argument->main_program())); - - auto 
precision_mode = argument->tensorrt_precision_mode(); - bool enable_int8 = precision_mode == AnalysisConfig::Precision::kInt8; - pass->Set("predictor_id", new int(argument->predictor_id())); bool use_calib_mode = argument->tensorrt_use_calib_mode(); - pass->Set("enable_int8", new bool(enable_int8)); pass->Set("use_calib_mode", new bool(use_calib_mode)); - pass->Set("use_oss", new bool(argument->tensorrt_use_oss())); - pass->Set("with_interleaved", - new bool(argument->tensorrt_with_interleaved())); pass->Set("precision_mode", new AnalysisConfig::Precision(precision_mode)); @@ -161,22 +174,8 @@ void IRPassManager::CreatePasses(Argument *argument, // tuned trt dynamic_shape pass->Set("trt_shape_range_info_path", new std::string(argument->tensorrt_shape_range_info_path())); - pass->Set("trt_tuned_dynamic_shape", - new bool(argument->tensorrt_tuned_dynamic_shape())); pass->Set("trt_allow_build_at_runtime", new bool(argument->tensorrt_allow_build_at_runtime())); - pass->Set("max_input_shape", new std::map>( - argument->max_input_shape())); - pass->Set("min_input_shape", new std::map>( - argument->min_input_shape())); - pass->Set("optim_input_shape", - new std::map>( - argument->optim_input_shape())); - bool with_dynamic_shape = (argument->max_input_shape().size() > 0 && - argument->min_input_shape().size() > 0 && - argument->optim_input_shape().size() > 0) || - argument->tensorrt_tuned_dynamic_shape(); - pass->Set("with_dynamic_shape", new bool(with_dynamic_shape)); pass->Set("trt_disabled_ops", new std::vector( argument->tensorrt_disabled_ops())); pass->Set("trt_use_dla", new bool(argument->tensorrt_use_dla())); @@ -192,14 +191,15 @@ void IRPassManager::CreatePasses(Argument *argument, new framework::ProgramDesc *(&argument->main_program())); } if (pass_name == "lite_subgraph_pass") { - bool enable_int8 = + bool lite_enable_int8 = argument->lite_precision_mode() == AnalysisConfig::Precision::kInt8; pass->Set("program", new framework::ProgramDesc *(&argument->main_program())); pass->Set("lite_ops_filter", new std::vector(argument->lite_ops_filter())); pass->Set("predictor_id", new int(argument->predictor_id())); - pass->Set("enable_int8", new bool(enable_int8)); + pass->Erase("enable_int8"); + pass->Set("enable_int8", new bool(lite_enable_int8)); pass->Set("use_gpu", new bool(argument->use_gpu())); pass->Set("zero_copy", new bool(argument->lite_zero_copy())); pass->Set("use_xpu", new bool(argument->use_xpu())); @@ -236,7 +236,6 @@ void IRPassManager::CreatePasses(Argument *argument, new std::vector( argument->nnadapter_model_cache_token())); } - disable_logs_ = argument->disable_logs(); if (pass_name == "fc_fuse_pass") { pass->Set("use_gpu", new bool(argument->use_gpu())); bool fc_mkldnn_pass = 0; @@ -248,9 +247,6 @@ void IRPassManager::CreatePasses(Argument *argument, bool use_fc_padding = !fc_mkldnn_pass && argument->use_fc_padding(); pass->Set("use_fc_padding", new bool(use_fc_padding)); } - - pass->Set("disable_logs", new bool(disable_logs_)); - pre_pass = pass_name; passes_.emplace_back(std::move(pass)); diff --git a/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc index fe6a27f80725f8e6520c0988f195419eb8a0cc1d..321716b1c8a1849c394850a874cd5d20e88c4a9a 100644 --- a/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc +++ b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc @@ -72,17 +72,21 @@ void IrGraphBuildPass::RunImpl(Argument *argument) { if (argument->use_ipu()) { 
argument->main_graph().SetNotOwned("num_ipus", &argument->ipu_device_num()); - argument->main_graph().SetNotOwned("need_avg_shard", - &argument->ipu_need_avg_shard()); + argument->main_graph().SetNotOwned("micro_batch_size", + &argument->ipu_micro_batch_size()); argument->main_graph().SetNotOwned("enable_pipelining", &argument->ipu_enable_pipelining()); argument->main_graph().SetNotOwned("batches_per_step", &argument->ipu_batches_per_step()); - argument->main_graph().SetNotOwned("batch_size", - &argument->ipu_batch_size()); - } else { - PADDLE_THROW( - platform::errors::Unimplemented("Please compile with WITH_IPU")); + argument->main_graph().SetNotOwned("enable_fp16", + &argument->ipu_enable_fp16()); + argument->main_graph().SetNotOwned("replica_num", + &argument->ipu_replica_num()); + argument->main_graph().SetNotOwned( + "available_memory_proportion", + &argument->ipu_available_memory_proportion()); + argument->main_graph().SetNotOwned("enable_half_partial", + &argument->ipu_enable_half_partial()); } } #endif diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index 57e49733b329aab8d73ab5e39c594711d5a416a9..fd2ccffae3b4af3280f622722d6080d7c68bfbad 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -142,17 +142,28 @@ void AnalysisConfig::EnableNpu(int device_id) { Update(); } -void AnalysisConfig::EnableIpu(int device_num, bool ipu_enable_pipelining, - int ipu_batches_per_step, int ipu_batch_size, - bool ipu_need_avg_shard) { + +void AnalysisConfig::EnableIpu(int ipu_device_num, int ipu_micro_batch_size, + bool ipu_enable_pipelining, + int ipu_batches_per_step) { enable_ir_optim_ = true; use_ipu_ = true; - ipu_device_num_ = device_num; + ipu_device_num_ = ipu_device_num; + ipu_micro_batch_size_ = ipu_micro_batch_size; ipu_enable_pipelining_ = ipu_enable_pipelining; ipu_batches_per_step_ = ipu_batches_per_step; - ipu_batch_size_ = ipu_batch_size; - ipu_need_avg_shard_ = ipu_need_avg_shard; + + Update(); +} + +void AnalysisConfig::SetIpuConfig(bool ipu_enable_fp16, int ipu_replica_num, + float ipu_available_memory_proportion, + bool ipu_enable_half_partial) { + ipu_enable_fp16_ = ipu_enable_fp16; + ipu_replica_num_ = ipu_replica_num; + ipu_available_memory_proportion_ = ipu_available_memory_proportion; + ipu_enable_half_partial_ = ipu_enable_half_partial; Update(); } @@ -255,10 +266,13 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { // ipu related CP_MEMBER(use_ipu_); CP_MEMBER(ipu_device_num_); + CP_MEMBER(ipu_micro_batch_size_); CP_MEMBER(ipu_enable_pipelining_); CP_MEMBER(ipu_batches_per_step_); - CP_MEMBER(ipu_batch_size_); - CP_MEMBER(ipu_need_avg_shard_); + CP_MEMBER(ipu_enable_fp16_); + CP_MEMBER(ipu_replica_num_); + CP_MEMBER(ipu_available_memory_proportion_); + CP_MEMBER(ipu_enable_half_partial_); if (use_gpu_) { PADDLE_ENFORCE_EQ(use_xpu_, false, @@ -684,10 +698,13 @@ std::string AnalysisConfig::SerializeInfoCache() { ss << use_ipu_; ss << ipu_device_num_; + ss << ipu_micro_batch_size_; ss << ipu_enable_pipelining_; ss << ipu_batches_per_step_; - ss << ipu_batch_size_; - ss << ipu_need_avg_shard_; + ss << ipu_enable_fp16_; + ss << ipu_replica_num_; + ss << ipu_available_memory_proportion_; + ss << ipu_enable_half_partial_; return ss.str(); } diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index a3812244fbe224982063e6000924cb670e67b85b..cd6e3a3c759c05bda34978dd78d07358aacd53fe 100644 
--- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -93,6 +93,8 @@ bool PaddleTensorToLoDTensor(const PaddleTensor &pt, framework::LoDTensor *t, input_ptr = t->mutable_data(ddim, place); } else if (pt.dtype == PaddleDType::INT32) { input_ptr = t->mutable_data(ddim, place); + } else if (pt.dtype == PaddleDType::FLOAT16) { + input_ptr = t->mutable_data(ddim, place); } else { LOG(ERROR) << "unsupported feed type " << pt.dtype; return false; @@ -563,8 +565,12 @@ bool AnalysisPredictor::GetFetch(std::vector *outputs, } else if (type == framework::proto::VarType::INT32) { GetFetchOne(fetch, output); output->dtype = PaddleDType::INT32; + } else if (type == framework::proto::VarType::FP16) { + GetFetchOne(fetch, output); + output->dtype = PaddleDType::FLOAT16; } else { - LOG(ERROR) << "unknown type, only support float32, int64 and int32 now."; + LOG(ERROR) << "unknown type, only support float32, float16, int64 and " + "int32 now."; } } return true; @@ -592,6 +598,14 @@ void AnalysisPredictor::PrepareArgument() { argument_.SetModelParamsPath(config_.params_file()); } + argument_.SetTensorRtPrecisionMode(config_.tensorrt_precision_mode_); + argument_.SetTensorRtUseOSS(config_.trt_use_oss_); + argument_.SetTensorRtWithInterleaved(config_.trt_with_interleaved_); + argument_.SetMinInputShape(config_.min_input_shape_); + argument_.SetMaxInputShape(config_.max_input_shape_); + argument_.SetOptimInputShape(config_.optim_input_shape_); + argument_.SetTensorRtTunedDynamicShape( + config_.tuned_tensorrt_dynamic_shape()); if (config_.use_gpu() && config_.tensorrt_engine_enabled()) { LOG(INFO) << "TensorRT subgraph engine is enabled"; argument_.SetUseTensorRT(true); @@ -601,18 +615,10 @@ void AnalysisPredictor::PrepareArgument() { argument_.SetTensorRtDisabledOPs(config_.trt_disabled_ops_); argument_.SetTensorRtUseDLA(config_.trt_use_dla_); argument_.SetTensorRtDLACore(config_.trt_dla_core_); - argument_.SetTensorRtPrecisionMode(config_.tensorrt_precision_mode_); argument_.SetTensorRtUseStaticEngine(config_.trt_use_static_engine_); argument_.SetTensorRtUseCalibMode(config_.trt_use_calib_mode_); - argument_.SetTensorRtUseOSS(config_.trt_use_oss_); - argument_.SetTensorRtWithInterleaved(config_.trt_with_interleaved_); - argument_.SetMinInputShape(config_.min_input_shape_); - argument_.SetMaxInputShape(config_.max_input_shape_); - argument_.SetOptimInputShape(config_.optim_input_shape_); argument_.SetCloseTrtPluginFp16(config_.disable_trt_plugin_fp16_); argument_.SetTensorRtShapeRangeInfoPath(config_.shape_range_info_path()); - argument_.SetTensorRtTunedDynamicShape( - config_.tuned_tensorrt_dynamic_shape()); argument_.SetTensorRtAllowBuildAtRuntime( config_.trt_allow_build_at_runtime()); argument_.SetTensorRtUseInspector(config_.trt_use_inspector_); @@ -662,12 +668,18 @@ void AnalysisPredictor::PrepareArgument() { LOG(INFO) << "Lite subgraph engine is enabled"; } +#ifdef PADDLE_WITH_IPU argument_.SetUseIpu(config_.use_ipu_); argument_.SetIpuDeviceNum(config_.ipu_device_num()); + argument_.SetIpuMicroBatchSize(config_.ipu_micro_batch_size_); argument_.SetIpuEnablePipelining(config_.ipu_enable_pipelining_); argument_.SetIpuBatchesPerStep(config_.ipu_batches_per_step_); - argument_.SetIpuBatchSize(config_.ipu_batch_size_); - argument_.SetIpuNeedAvgShard(config_.ipu_need_avg_shard_); + argument_.SetIpuEnableFp16(config_.ipu_enable_fp16_); + argument_.SetIpuReplicaNum(config_.ipu_replica_num_); + argument_.SetIpuAvailableMemoryProportion( + 
config_.ipu_available_memory_proportion_); + argument_.SetIpuEnableHalfPartial(config_.ipu_enable_half_partial_); +#endif argument_.SetUseNpu(config_.use_npu_); argument_.SetNPUDeviceId(config_.npu_device_id()); diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index 4b13ca073bc4f77756112322700ad5ad6d9d7fa4..180c028c6a61088edeb8723891d4de1ba2272b80 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -234,20 +234,30 @@ struct PD_INFER_DECL AnalysisConfig { /// /// \brief Turn on IPU. /// - /// \param device_num The number of IPUs. - /// \param ipu_enable_pipelining Enable data pipelining between subgraphs, - /// each subgraph is settled on an IPU. (This feature requires the number of - /// IPUs > 1.) - /// \param ipu_batches_per_step The number of micro_batch_size per run. (This - /// feature requires to enable pipelining.) - /// \param ipu_batch_size The micro_batch_size which is the batch_size in the - /// graph. - /// \param ipu_need_avg_shard Enable the auto graph sharding. (This feature - /// requires the number of IPUs > 1.) - /// - void EnableIpu(int device_num = 1, bool ipu_enable_pipelining = false, - int ipu_batches_per_step = 1, int ipu_batch_size = 1, - bool ipu_need_avg_shard = false); + /// \param ipu_device_num the number of IPUs. + /// \param ipu_micro_batch_size the batch size in the graph, only work with + /// mutable input shapes. + /// \param ipu_enable_pipelining enable pipelining. + /// \param ipu_batches_per_step the number of batches per run in pipelining. + /// + void EnableIpu(int ipu_device_num = 1, int ipu_micro_batch_size = 1, + bool ipu_enable_pipelining = false, + int ipu_batches_per_step = 1); + + /// + /// \brief Set IPU config. + /// + /// \param ipu_enable_fp16 enable fp16. + /// \param ipu_replica_num the number of graph replication. + /// \param ipu_available_memory_proportion the available memory proportion for + /// matmul/conv. + /// \param ipu_enable_half_partial enable fp16 partial for matmul, only work + /// with fp16. + /// + void SetIpuConfig(bool ipu_enable_fp16 = false, int ipu_replica_num = 1, + float ipu_available_memory_proportion = 1.0, + bool ipu_enable_half_partial = false); + /// /// \brief Set XPU device id. /// @@ -876,11 +886,14 @@ struct PD_INFER_DECL AnalysisConfig { // ipu related. bool use_ipu_{false}; int ipu_device_num_{1}; - + int ipu_micro_batch_size_{1}; bool ipu_enable_pipelining_{false}; int ipu_batches_per_step_{1}; - int ipu_batch_size_{1}; - bool ipu_need_avg_shard_{false}; + + bool ipu_enable_fp16_{false}; + int ipu_replica_num_{1}; + float ipu_available_memory_proportion_{1.0}; + bool ipu_enable_half_partial_{false}; // If the config is already used on a predictor, it becomes invalid. // Any config can only be used with one predictor. diff --git a/paddle/fluid/inference/api/paddle_tensor.h b/paddle/fluid/inference/api/paddle_tensor.h index 24a72a0b9dadbd8123876cd8a91dccb22e1c8de2..81eecbb2c1480499b81556c48d021a8ff8929899 100644 --- a/paddle/fluid/inference/api/paddle_tensor.h +++ b/paddle/fluid/inference/api/paddle_tensor.h @@ -45,7 +45,7 @@ enum DataType { // TODO(Superjomn) support more data types if needed. }; -enum class PlaceType { kUNK = -1, kCPU, kGPU, kXPU, kNPU }; +enum class PlaceType { kUNK = -1, kCPU, kGPU, kXPU, kNPU, kIPU }; /// \brief Represents an n-dimensional array of values. /// The Tensor is used to store the input or output of the network. 
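Note (not part of the patch): a minimal usage sketch of how the reworked IPU options above fit together — EnableIpu now carries the device count, micro batch size, pipelining flag and batches-per-step, while the FP16, replication and available-memory-proportion knobs move to the new SetIpuConfig call. The model and params paths below are placeholders; the two calls mirror the signatures declared in paddle_analysis_config.h above.

// Illustrative only: configuring an IPU predictor with the new two-step API.
// The model/params paths are hypothetical, not files from this patch.
#include "paddle/fluid/inference/api/paddle_analysis_config.h"

void ConfigureIpuPredictor(paddle::AnalysisConfig *config) {
  config->SetModel("/path/to/model", "/path/to/params");
  // ipu_device_num, ipu_micro_batch_size, ipu_enable_pipelining,
  // ipu_batches_per_step
  config->EnableIpu(1, /*ipu_micro_batch_size=*/1,
                    /*ipu_enable_pipelining=*/false,
                    /*ipu_batches_per_step=*/1);
  // ipu_enable_fp16, ipu_replica_num, ipu_available_memory_proportion,
  // ipu_enable_half_partial
  config->SetIpuConfig(/*ipu_enable_fp16=*/true, /*ipu_replica_num=*/1,
                       /*ipu_available_memory_proportion=*/1.0f,
                       /*ipu_enable_half_partial=*/true);
}

The new ipu_resnet50_fp16_test.cc and ipu_ernie_fp16_test.cc added below follow this same two-step pattern, additionally converting float32 inputs to float16 before Run() and converting the outputs back for comparison.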
diff --git a/paddle/fluid/inference/lite/tensor_utils.cc b/paddle/fluid/inference/lite/tensor_utils.cc index 04ae3b9afe32c1762399e987ac5be8bc312d4d59..0e4fb3335f3d76eecea85417ac83c205d63ac9c4 100644 --- a/paddle/fluid/inference/lite/tensor_utils.cc +++ b/paddle/fluid/inference/lite/tensor_utils.cc @@ -38,8 +38,6 @@ void SetLoD(DstLoD* dst, const SrcLoD& src) { dst->emplace_back(v); } } -template void SetLoD( - paddle::lite::LoD* dst, const framework::LoD& src); template void SetLoD( framework::LoD* dst, const paddle::lite::LoD& src); diff --git a/paddle/fluid/inference/tensorrt/convert/preln_emb_eltwise_layernorm.cc b/paddle/fluid/inference/tensorrt/convert/preln_emb_eltwise_layernorm.cc index a58de101053b3847db063bef5b5870992676b124..daa3b186ab4c4ca95d17d1bbd26a8cf32b4f4416 100644 --- a/paddle/fluid/inference/tensorrt/convert/preln_emb_eltwise_layernorm.cc +++ b/paddle/fluid/inference/tensorrt/convert/preln_emb_eltwise_layernorm.cc @@ -51,21 +51,11 @@ class PrelnEmbEltwiseLayerNormOpConverter : public OpConverter { auto pos_emb_name = op_desc.Input("PosEmbedding").front(); auto sent_emb_name = op_desc.Input("SentEmbedding").front(); - std::vector id_names; std::vector emb_names; - - id_names = - std::vector{word_id_name, pos_id_name, sent_id_name}; emb_names = std::vector{word_emb_name, pos_emb_name, sent_emb_name}; - int input_num = id_names.size(); - - // Declare inputs - std::vector input_ids; - for (int i = 0; i < input_num; i++) { - input_ids.push_back(engine_->GetITensor(id_names[i])); - } + int input_num = emb_names.size(); // input_embs[0]: word_embedding // input_embs[1]: pos_embedding @@ -126,7 +116,7 @@ class PrelnEmbEltwiseLayerNormOpConverter : public OpConverter { {"bert_embeddings_position_embeddings", input_embs[1], nvinfer1::PluginFieldType::kFLOAT32, static_cast(emb_sizes[1])}, - {"output_int8", &output_int8, nvinfer1::PluginFieldType::kINT32, 1}, + {"output_fp16", &output_int8, nvinfer1::PluginFieldType::kINT32, 1}, }; nvinfer1::PluginFieldCollection* plugin_ptr = @@ -156,7 +146,7 @@ class PrelnEmbEltwiseLayerNormOpConverter : public OpConverter { shuffle_layer->setReshapeDimensions(shape_dim); shuffle_layer->setName( ("PrelnEmbeltwise_Shuffle_reshape (Output: max_seqlen " + - op_desc.Output("Out")[0] + ")") + op_desc.Output("Out_0")[0] + ")") .c_str()); engine_->SetTensorDynamicRange(shuffle_layer->getOutput(0), 1.0f); plugin_inputs.emplace_back( @@ -170,7 +160,7 @@ class PrelnEmbEltwiseLayerNormOpConverter : public OpConverter { auto plugin_layer = engine_->network()->addPluginV2( plugin_inputs.data(), plugin_inputs.size(), *plugin_obj); plugin_layer->setName(("CustomPrelnEmbLayerNormPluginDynamic_V3(Output: " + - op_desc.Output("Out")[0] + ")") + op_desc.Output("Out_0")[0] + ")") .c_str()); free(plugin_ptr); float out_0_scale = diff --git a/paddle/fluid/inference/tensorrt/convert/preln_skip_layernorm.cc b/paddle/fluid/inference/tensorrt/convert/preln_skip_layernorm.cc index 521e04b8974fd5a761b337ecc618cf061b90a79a..d9eca65fc45dcd44725c79aaa07e1d618a15a539 100644 --- a/paddle/fluid/inference/tensorrt/convert/preln_skip_layernorm.cc +++ b/paddle/fluid/inference/tensorrt/convert/preln_skip_layernorm.cc @@ -92,8 +92,10 @@ class PrelnSkipLayerNormOpConverter : public OpConverter { "fail to add CustomPrelnSkipLayerNormPluginDynamic layer")); layer = plugin_layer; - auto output_name = op_desc.Output("Out")[0]; - RreplenishLayerAndOutput(layer, "preln_skip_layernorm", {output_name}, + std::vector output_names; + output_names.push_back(op_desc.Output("Out_0")[0]); + 
output_names.push_back(op_desc.Output("Out_1")[0]); + RreplenishLayerAndOutput(layer, "preln_skip_layernorm", {output_names}, test_mode); #else PADDLE_THROW(platform::errors::Fatal( diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index 436c80d9a6bcf27ad00451642119c54760029669..7ddd4b558228b8577554352089aab1a9b62e16b0 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -560,12 +560,14 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, "the pass."; return false; } +#if !IS_TRT_VERSION_GE(7000) auto* x_var_desc = block->FindVar(desc.Input("X")[0]); const auto x_shape = x_var_desc->GetShape(); if (x_shape.size() == 1) { VLOG(3) << "Gather does not support 1-dimensional input in tensorrt"; return false; } +#endif } } diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index 9dafd0d17c7157c0e351b67d0a01fccccbdbc47a..85fe931cf93f85d3b25334bdb5ec2d0a62e37b30 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -758,11 +758,30 @@ if(ON_INFER OR WITH_GPU) set_tests_properties(test_analyzer_transformer_profile PROPERTIES TIMEOUT 120) endif() -# IPU if (WITH_IPU) - #resnet50 + #word2vec sample + set(WORD2VEC_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/word2vec/word2vec.inference.model") + inference_analysis_test(ipu_word2vec_sample SRCS ipu_word2vec_sample.cc + EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} + ARGS --infer_model=${WORD2VEC_INSTALL_DIR}) + + # ERNIE + set(ERNIE_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/Ernie") + inference_analysis_api_test(ipu_ernie_test ${ERNIE_INSTALL_DIR} ipu_ernie_test.cc + ARGS --warmup=true --repeat=10) + inference_analysis_api_test(ipu_ernie_fp16_test ${ERNIE_INSTALL_DIR} ipu_ernie_fp16_test.cc + ARGS --warmup=true --repeat=10) + + # Resnet50 set(RESNET50_MODEL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/resnet50") inference_analysis_test(ipu_resnet50_test SRCS ipu_resnet50_test.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} - ARGS --infer_model=${RESNET50_MODEL_DIR} --warmup=true --repeat=1000) + ARGS --infer_model=${RESNET50_MODEL_DIR} --warmup=true --repeat=10) + inference_analysis_test(ipu_resnet50_fp16_test SRCS ipu_resnet50_fp16_test.cc + EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} + ARGS --infer_model=${RESNET50_MODEL_DIR} --warmup=true --repeat=10) + + # Only support Resnet50 and Ernie currently + inference_analysis_api_test(ipu_multi_model_profile SRCS ipu_multi_model_profile.cc + ARGS --model_name="Resnet50" --infer_model=${RESNET50_MODEL_DIR} --warmup=true --repeat=10) endif() diff --git a/paddle/fluid/inference/tests/api/analyzer_ernie_tester.h b/paddle/fluid/inference/tests/api/analyzer_ernie_tester.h index 2582a1cb09eef02272f441376cec73b196142f10..fffcd38d95a0c06ed375438c2fb9d201ce7b2a7f 100644 --- a/paddle/fluid/inference/tests/api/analyzer_ernie_tester.h +++ b/paddle/fluid/inference/tests/api/analyzer_ernie_tester.h @@ -150,8 +150,7 @@ void SetConfig(AnalysisConfig *cfg, bool use_mkldnn = false, void SetIpuConfig(AnalysisConfig *cfg, int batch_size = 1) { cfg->SetModel(FLAGS_infer_model); - // num_ipu, enable_pipelining, batches_per_step, batch_size, need_avg_shard - cfg->EnableIpu(4, false, 1, batch_size, true); + cfg->EnableIpu(4, batch_size, false, 1); } } // namespace inference diff --git a/paddle/fluid/inference/tests/api/ipu_ernie_fp16_test.cc b/paddle/fluid/inference/tests/api/ipu_ernie_fp16_test.cc new file mode 
100644 index 0000000000000000000000000000000000000000..fa775bd9a9cb99c2566133f474a8bc529336477e --- /dev/null +++ b/paddle/fluid/inference/tests/api/ipu_ernie_fp16_test.cc @@ -0,0 +1,184 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { + +using paddle::PaddleTensor; + +template +void GetValueFromStream(std::stringstream *ss, T *t) { + (*ss) >> (*t); +} + +template <> +void GetValueFromStream(std::stringstream *ss, std::string *t) { + *t = ss->str(); +} + +// Split string to vector +template +void Split(const std::string &line, char sep, std::vector *v) { + std::stringstream ss; + T t; + for (auto c : line) { + if (c != sep) { + ss << c; + } else { + GetValueFromStream(&ss, &t); + v->push_back(std::move(t)); + ss.str({}); + ss.clear(); + } + } + + if (!ss.str().empty()) { + GetValueFromStream(&ss, &t); + v->push_back(std::move(t)); + ss.str({}); + ss.clear(); + } +} + +// Parse tensor from string +template +bool ParseTensor(const std::string &field, paddle::PaddleTensor *tensor) { + std::vector data; + Split(field, ':', &data); + if (data.size() < 2) return false; + + std::string shape_str = data[0]; + + std::vector shape; + Split(shape_str, ' ', &shape); + + std::string mat_str = data[1]; + + std::vector mat; + Split(mat_str, ' ', &mat); + + tensor->shape = shape; + auto size = + std::accumulate(shape.begin(), shape.end(), 1, std::multiplies()) * + sizeof(T); + tensor->data.Resize(size); + std::copy(mat.begin(), mat.end(), static_cast(tensor->data.data())); + tensor->dtype = GetPaddleDType(); + + return true; +} + +// Parse input tensors from string +bool ParseLine(const std::string &line, + std::vector *tensors) { + std::vector fields; + Split(line, ';', &fields); + + tensors->clear(); + tensors->reserve(4); + + int i = 0; + auto input_name = FLAGS_ernie_large ? "eval_placeholder_" : "placeholder_"; + for (; i < 3; i++) { + paddle::PaddleTensor temp; + ParseTensor(fields[i], &temp); + temp.name = input_name + std::to_string(i); + tensors->push_back(temp); + } + + // input_mask + paddle::PaddleTensor input_mask; + ParseTensor(fields[i], &input_mask); + // fp32 to fp16 + ConvertFP32toFP16(input_mask); + input_mask.name = input_name + std::to_string(i); + tensors->push_back(input_mask); + + return true; +} + +bool LoadInputData(std::vector> *inputs, + int batch_size = 1) { + if (FLAGS_infer_data.empty()) { + LOG(ERROR) << "please set input data path"; + return false; + } + + std::ifstream fin(FLAGS_infer_data); + std::string line; + int sample = 0; + + // The unit-test dataset only have 10 samples, each sample have 5 feeds. 
+ while (std::getline(fin, line)) { + std::vector feed_data; + ParseLine(line, &feed_data); + inputs->push_back(std::move(feed_data)); + sample++; + if (!FLAGS_test_all_data && sample == batch_size) break; + } + LOG(INFO) << "number of samples: " << sample; + return true; +} + +void SetConfig(AnalysisConfig *cfg, int batch_size = 1) { + cfg->SetModel(FLAGS_infer_model); + // ipu_device_num, ipu_micro_batch_size, ipu_enable_pipelining + cfg->EnableIpu(1, batch_size, false); + // ipu_enable_fp16, ipu_replica_num, ipu_available_memory_proportion, + // ipu_enable_half_partial + cfg->SetIpuConfig(true, 1, 1.0, true); +} + +// Compare results +TEST(Analyzer_Ernie_ipu, compare_results) { + AnalysisConfig cfg; + SetConfig(&cfg); + + std::vector> input_slots_all; + LoadInputData(&input_slots_all); + + std::ifstream fin(FLAGS_refer_result); + std::string line; + std::vector ref; + + while (std::getline(fin, line)) { + Split(line, ' ', &ref); + } + + auto predictor = CreateTestPredictor( + reinterpret_cast(&cfg), + FLAGS_use_analysis); + + std::vector outputs; + for (size_t i = 0; i < input_slots_all.size(); i++) { + outputs.clear(); + predictor->Run(input_slots_all[i], &outputs); + + auto output = outputs.front(); + ConvertFP16toFP32(output); + auto outputs_size = 1; + for (auto dim : output.shape) { + outputs_size *= dim; + } + float *fp32_data = reinterpret_cast(output.data.data()); + for (size_t j = 0; j < outputs_size; ++j) { + EXPECT_NEAR(ref[i * outputs_size + j], fp32_data[j], 5e-3); + } + } +} + +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/ipu_ernie_test.cc b/paddle/fluid/inference/tests/api/ipu_ernie_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..e36917c9acd3eb56f6a5004d092c3d6839ceb101 --- /dev/null +++ b/paddle/fluid/inference/tests/api/ipu_ernie_test.cc @@ -0,0 +1,196 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { + +using paddle::PaddleTensor; + +template +void GetValueFromStream(std::stringstream *ss, T *t) { + (*ss) >> (*t); +} + +template <> +void GetValueFromStream(std::stringstream *ss, std::string *t) { + *t = ss->str(); +} + +// Split string to vector +template +void Split(const std::string &line, char sep, std::vector *v) { + std::stringstream ss; + T t; + for (auto c : line) { + if (c != sep) { + ss << c; + } else { + GetValueFromStream(&ss, &t); + v->push_back(std::move(t)); + ss.str({}); + ss.clear(); + } + } + + if (!ss.str().empty()) { + GetValueFromStream(&ss, &t); + v->push_back(std::move(t)); + ss.str({}); + ss.clear(); + } +} + +// Parse tensor from string +template +bool ParseTensor(const std::string &field, paddle::PaddleTensor *tensor) { + std::vector data; + Split(field, ':', &data); + if (data.size() < 2) return false; + + std::string shape_str = data[0]; + + std::vector shape; + Split(shape_str, ' ', &shape); + + std::string mat_str = data[1]; + + std::vector mat; + Split(mat_str, ' ', &mat); + + tensor->shape = shape; + auto size = + std::accumulate(shape.begin(), shape.end(), 1, std::multiplies()) * + sizeof(T); + tensor->data.Resize(size); + std::copy(mat.begin(), mat.end(), static_cast(tensor->data.data())); + tensor->dtype = GetPaddleDType(); + + return true; +} + +// Parse input tensors from string +bool ParseLine(const std::string &line, + std::vector *tensors) { + std::vector fields; + Split(line, ';', &fields); + + tensors->clear(); + tensors->reserve(4); + + int i = 0; + auto input_name = FLAGS_ernie_large ? "eval_placeholder_" : "placeholder_"; + for (; i < 3; i++) { + paddle::PaddleTensor temp; + ParseTensor(fields[i], &temp); + temp.name = input_name + std::to_string(i); + tensors->push_back(temp); + } + + // input_mask + paddle::PaddleTensor input_mask; + ParseTensor(fields[i], &input_mask); + input_mask.name = input_name + std::to_string(i); + tensors->push_back(input_mask); + + return true; +} + +bool LoadInputData(std::vector> *inputs, + int batch_size = 1) { + if (FLAGS_infer_data.empty()) { + LOG(ERROR) << "please set input data path"; + return false; + } + + std::ifstream fin(FLAGS_infer_data); + std::string line; + int sample = 0; + + // The unit-test dataset only have 10 samples, each sample have 5 feeds. 
+ while (std::getline(fin, line)) { + std::vector feed_data; + ParseLine(line, &feed_data); + inputs->push_back(std::move(feed_data)); + sample++; + if (!FLAGS_test_all_data && sample == batch_size) break; + } + LOG(INFO) << "number of samples: " << sample; + return true; +} + +void SetConfig(AnalysisConfig *cfg, int batch_size = 1) { + cfg->SetModel(FLAGS_infer_model); + // ipu_device_num, ipu_micro_batch_size, ipu_enable_pipelining + cfg->EnableIpu(1, batch_size, false); +} + +void profile() { + AnalysisConfig config; + SetConfig(&config); + + std::vector> outputs; + std::vector> inputs; + LoadInputData(&inputs); + TestPrediction(reinterpret_cast(&config), + inputs, &outputs, FLAGS_num_threads); +} + +// Compare Deterministic result +TEST(Analyzer_Ernie_ipu, compare_determine) { + AnalysisConfig cfg; + SetConfig(&cfg); + + std::vector> input_slots_all; + LoadInputData(&input_slots_all); + CompareDeterministic(reinterpret_cast(&cfg), + input_slots_all); +} + +// Compare results +TEST(Analyzer_Ernie_ipu, compare_results) { + AnalysisConfig cfg; + SetConfig(&cfg); + + std::vector> input_slots_all; + LoadInputData(&input_slots_all); + + std::ifstream fin(FLAGS_refer_result); + std::string line; + std::vector ref; + + while (std::getline(fin, line)) { + Split(line, ' ', &ref); + } + + auto predictor = CreateTestPredictor( + reinterpret_cast(&cfg), + FLAGS_use_analysis); + + std::vector outputs; + for (size_t i = 0; i < input_slots_all.size(); i++) { + outputs.clear(); + predictor->Run(input_slots_all[i], &outputs); + auto outputs_size = outputs.front().data.length() / (sizeof(float)); + for (size_t j = 0; j < outputs_size; ++j) { + EXPECT_NEAR(ref[i * outputs_size + j], + static_cast(outputs[0].data.data())[j], + FLAGS_accuracy); + } + } +} + +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/ipu_multi_model_profile.cc b/paddle/fluid/inference/tests/api/ipu_multi_model_profile.cc new file mode 100644 index 0000000000000000000000000000000000000000..a225feae4a2619a7c9e26e5dd0ab4bfa584b1938 --- /dev/null +++ b/paddle/fluid/inference/tests/api/ipu_multi_model_profile.cc @@ -0,0 +1,105 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include + +#include "gflags/gflags.h" +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { + +void ErnieInputData(const int &total_batch_size, const bool enable_fp16, + std::vector *inputs) { + const int input_num = total_batch_size * 128 * 1; + std::vector placeholder_012(input_num, 1); + std::vector placeholder_3(input_num, 1); + + for (int i = 0; i < 4; i++) { + PaddleTensor in; + in.name = "placeholder_" + std::to_string(i); + in.shape = {total_batch_size, 128, 1}; + if (i < 3) { + in.data = PaddleBuf(static_cast(placeholder_012.data()), + input_num * sizeof(int64_t)); + in.dtype = PaddleDType::INT64; + } else { + in.data = PaddleBuf(static_cast(placeholder_3.data()), + input_num * sizeof(float)); + in.dtype = PaddleDType::FLOAT32; + if (enable_fp16) { + ConvertFP32toFP16(in); + } + } + inputs->push_back(std::move(in)); + } +} + +void Resnet50InputData(const int &total_batch_size, const bool enable_fp16, + std::vector *inputs) { + const int input_num = total_batch_size * 3 * 318 * 318; + std::vector input(input_num, 1); + PaddleTensor in; + in.shape = {total_batch_size, 3, 318, 318}; + in.data = + PaddleBuf(static_cast(input.data()), input_num * sizeof(float)); + in.dtype = PaddleDType::FLOAT32; + if (enable_fp16) { + ConvertFP32toFP16(in); + } + inputs->push_back(std::move(in)); +} + +// performance profile +TEST(Analyzer_ipu_fp16, performance_profile) { + AnalysisConfig config; + std::vector inputs; + std::vector> outputs; + + int total_batch_size = FLAGS_ipu_micro_batch_size * FLAGS_ipu_replica_num; + if (FLAGS_ipu_enable_pipelining) { + // if device_num > 1 and pipelining is enabled, the total batch size = + // micro_batch_size * device_num(batches_per_step) * replica_num + total_batch_size = FLAGS_ipu_micro_batch_size * FLAGS_ipu_batches_per_step * + FLAGS_ipu_replica_num; + } + + if (FLAGS_model_name == "Resnet50") { + config.SetModel(FLAGS_infer_model + "/model/model", + FLAGS_infer_model + "/model/params"); + Resnet50InputData(total_batch_size, FLAGS_ipu_enable_fp16, &inputs); + } else if (FLAGS_model_name == "Ernie") { + config.SetModel(FLAGS_infer_model + "/model/"); + ErnieInputData(total_batch_size, FLAGS_ipu_enable_fp16, &inputs); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Only support Resnet50 and Ernie Currently")); + } + // ipu_device_num, ipu_micro_batch_size, ipu_enable_pipelining, + // ipu_batches_per_step + config.EnableIpu(FLAGS_ipu_device_num, FLAGS_ipu_micro_batch_size, + FLAGS_ipu_enable_pipelining, FLAGS_ipu_batches_per_step); + // ipu_enable_fp16, ipu_replica_num, ipu_available_memory_proportion, + // ipu_enable_half_partial + config.SetIpuConfig(FLAGS_ipu_enable_fp16, FLAGS_ipu_replica_num, + FLAGS_ipu_available_memory_proportion, + FLAGS_ipu_enable_half_partial); + + TestPrediction(reinterpret_cast(&config), + {inputs}, &outputs, 1); +} + +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/ipu_resnet50_fp16_test.cc b/paddle/fluid/inference/tests/api/ipu_resnet50_fp16_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..1d69069da0716017a8dd4ce62fbe2a083516a40c --- /dev/null +++ b/paddle/fluid/inference/tests/api/ipu_resnet50_fp16_test.cc @@ -0,0 +1,86 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include + +#include "gflags/gflags.h" +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { + +// Compare results with 1 batch +TEST(Analyzer_Resnet50_ipu, compare_results_1_batch) { + std::string model_dir = FLAGS_infer_model + "/" + "model"; + AnalysisConfig config; + // ipu_device_num, ipu_micro_batch_size, ipu_enable_pipelining + config.EnableIpu(1, 1, false); + // ipu_enable_fp16, ipu_replica_num, ipu_available_memory_proportion, + // ipu_enable_half_partial + config.SetIpuConfig(true, 1, 1.0, true); + config.SetModel(model_dir + "/model", model_dir + "/params"); + + std::vector inputs; + auto predictor = CreatePaddlePredictor(config); + const int batch = 1; + const int channel = 3; + const int height = 318; + const int width = 318; + const int input_num = batch * channel * height * width; + std::vector input(input_num, 1); + + PaddleTensor in; + in.shape = {batch, channel, height, width}; + in.data = + PaddleBuf(static_cast(input.data()), input_num * sizeof(float)); + in.dtype = PaddleDType::FLOAT32; + ConvertFP32toFP16(in); + inputs.emplace_back(in); + + std::vector outputs; + + ASSERT_TRUE(predictor->Run(inputs, &outputs)); + + const std::vector truth_values = { + 127.779f, 738.165f, 1013.22f, -438.17f, 366.401f, 927.659f, + 736.222f, -633.684f, -329.927f, -430.155f, -633.062f, -146.548f, + -1324.28f, -1349.36f, -242.675f, 117.448f, -801.723f, -391.514f, + -404.818f, 454.16f, 515.48f, -133.031f, 69.293f, 590.096f, + -1434.69f, -1070.89f, 307.074f, 400.525f, -316.12f, -587.125f, + -161.056f, 800.363f, -96.4708f, 748.706f, 868.174f, -447.938f, + 112.737f, 1127.2f, 47.4355f, 677.72f, 593.186f, -336.4f, + 551.362f, 397.823f, 78.3979f, -715.398f, 405.969f, 404.256f, + 246.019f, -8.42969f, 131.365f, -648.051f}; + + const size_t expected_size = 1; + EXPECT_EQ(outputs.size(), expected_size); + + auto output = outputs.front(); + ConvertFP16toFP32(output); + auto outputs_size = 1; + for (auto dim : output.shape) { + outputs_size *= dim; + } + float* fp32_data = reinterpret_cast(output.data.data()); + + for (size_t j = 0; j < outputs_size; j += 10) { + EXPECT_NEAR((fp32_data[j] - truth_values[j / 10]) / truth_values[j / 10], + 0., 9e-2); + } +} + +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/ipu_resnet50_test.cc b/paddle/fluid/inference/tests/api/ipu_resnet50_test.cc index f5e755ab466915d03d799e565a14107ff2f62f23..5fde8e6a5e1e676d5dacfb9c4c0c1d876130844b 100644 --- a/paddle/fluid/inference/tests/api/ipu_resnet50_test.cc +++ b/paddle/fluid/inference/tests/api/ipu_resnet50_test.cc @@ -33,9 +33,8 @@ static std::vector truth_values = { TEST(Analyzer_Resnet50_ipu, compare_results_1_batch) { std::string model_dir = FLAGS_infer_model + "/" + "model"; AnalysisConfig config; - // num_ipu, enable_pipelining, batches_per_step, batch_size, - // need_avg_shard - config.EnableIpu(1, false); + // ipu_device_num, ipu_micro_batch_size, ipu_enable_pipelining + config.EnableIpu(1, 1, false); config.SetModel(model_dir + "/model", model_dir + "/params"); std::vector inputs; @@ -72,9 +71,8 @@ 
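The FP16 Resnet50 test above accepts outputs within a 9e-2 relative error of the FP32 truth values, sampling every tenth element. The same tolerance check as a standalone helper (a sketch; the function name is illustrative and a non-zero reference value is assumed, as holds for the truth values above):

#include <cmath>
#include <cstdio>

// Relative-error check: |actual - expected| / |expected| must stay within tol.
bool WithinRelTol(float actual, float expected, float tol = 9e-2f) {
  return std::fabs((actual - expected) / expected) <= tol;
}

int main() {
  std::printf("%d\n", WithinRelTol(130.0f, 127.779f));  // prints 1 (about 1.7% off)
  return 0;
}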
TEST(Analyzer_Resnet50_ipu, compare_results_1_batch) { TEST(Analyzer_Resnet50_ipu, compare_results_2_batch) { std::string model_dir = FLAGS_infer_model + "/" + "model"; AnalysisConfig config; - // num_ipu, enable_pipelining, batches_per_step, batch_size, - // need_avg_shard - config.EnableIpu(2, false, 1, 2, 1); + // ipu_device_num, ipu_micro_batch_size, ipu_enable_pipelining + config.EnableIpu(1, 2, false); config.SetModel(model_dir + "/model", model_dir + "/params"); std::vector inputs; diff --git a/paddle/fluid/inference/tests/api/ipu_word2vec_sample.cc b/paddle/fluid/inference/tests/api/ipu_word2vec_sample.cc new file mode 100644 index 0000000000000000000000000000000000000000..d38c5c3416351ae6b55d3e5ea8632290e8e202a7 --- /dev/null +++ b/paddle/fluid/inference/tests/api/ipu_word2vec_sample.cc @@ -0,0 +1,81 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +/* + * This file contains a simple demo for how to take a model for inference with + * IPUs. + * Model: wget -q + * http://paddle-inference-dist.bj.bcebos.com/word2vec.inference.model.tar.gz + */ + +#include +#include +#include +#include + +#include "gflags/gflags.h" +#include "glog/logging.h" +#include "paddle/fluid/inference/api/paddle_inference_api.h" + +DEFINE_string(infer_model, "", "Directory of the inference model."); + +using paddle_infer::Config; +using paddle_infer::Predictor; +using paddle_infer::CreatePredictor; + +void inference(std::string model_path, bool use_ipu, + std::vector *out_data) { + //# 1. Create Predictor with a config. + Config config; + config.SetModel(FLAGS_infer_model); + if (use_ipu) { + // ipu_device_num, ipu_micro_batch_size + config.EnableIpu(1, 4); + } + auto predictor = CreatePredictor(config); + + //# 2. Prepare input/output tensor. + auto input_names = predictor->GetInputNames(); + std::vector data{1, 2, 3, 4}; + // For simplicity, we set all the slots with the same data. + for (auto input_name : input_names) { + auto input_tensor = predictor->GetInputHandle(input_name); + input_tensor->Reshape({4, 1}); + input_tensor->CopyFromCpu(data.data()); + } + + //# 3. Run + predictor->Run(); + + //# 4. Get output. 
+ auto output_names = predictor->GetOutputNames(); + auto output_tensor = predictor->GetOutputHandle(output_names[0]); + std::vector output_shape = output_tensor->shape(); + int out_num = std::accumulate(output_shape.begin(), output_shape.end(), 1, + std::multiplies()); + out_data->resize(out_num); + output_tensor->CopyToCpu(out_data->data()); +} + +int main(int argc, char *argv[]) { + ::GFLAGS_NAMESPACE::ParseCommandLineFlags(&argc, &argv, true); + std::vector ipu_result; + std::vector cpu_result; + inference(FLAGS_infer_model, true, &ipu_result); + inference(FLAGS_infer_model, false, &cpu_result); + for (size_t i = 0; i < ipu_result.size(); i++) { + CHECK_NEAR(ipu_result[i], cpu_result[i], 1e-6); + } + LOG(INFO) << "Finished"; +} diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h index 77fab0a86f83300b89d75ae0cd1ce7fa5bf03a5a..637fa16e31ba7996713a6971c3a1802627811e7f 100644 --- a/paddle/fluid/inference/tests/api/tester_helper.h +++ b/paddle/fluid/inference/tests/api/tester_helper.h @@ -76,10 +76,23 @@ DEFINE_int32(cpu_num_threads, 1, "Number of threads for each paddle instance."); DEFINE_bool(fuse_multi_gru, false, "Running the inference program with multi_gru_fuse_pass"); +// ipu related +DEFINE_int32(ipu_micro_batch_size, 1, "micro batch size"); +DEFINE_int32(ipu_device_num, 1, "device num"); +DEFINE_bool(ipu_enable_pipelining, false, "enable pipelining"); +DEFINE_int32(ipu_batches_per_step, 1, + "the number of batches per run in pipelining"); +DEFINE_bool(ipu_enable_fp16, false, "enable fp16"); +DEFINE_int32(ipu_replica_num, 1, "replica num"); +DEFINE_double(ipu_available_memory_proportion, 1.0, + "available memory proportion"); +DEFINE_bool(ipu_enable_half_partial, false, "enable half partial"); + namespace paddle { namespace inference { using paddle::framework::proto::VarType; +using float16 = paddle::platform::float16; template constexpr paddle::PaddleDType GetPaddleDType(); @@ -1060,5 +1073,44 @@ static bool CompareTensor(const framework::LoDTensor &a, return true; } +void ConvertFP32toFP16(paddle::PaddleTensor &tensor // NOLINT + ) { + int num = 1; + for (auto dim : tensor.shape) { + num *= dim; + } + PADDLE_ENFORCE_EQ( + tensor.dtype, PaddleDType::FLOAT32, + platform::errors::InvalidArgument( + "The tensor dtype is not float32, only support float32 as input")); + float *fp32_data = reinterpret_cast(tensor.data.data()); + float16 *fp16_data = new float16[num]; + for (int i = 0; i < num; i++) { + fp16_data[i] = float16(fp32_data[i]); + } + tensor.data = + PaddleBuf(static_cast(fp16_data), num * sizeof(float16)); + tensor.dtype = PaddleDType::FLOAT16; +} + +void ConvertFP16toFP32(paddle::PaddleTensor &tensor // NOLINT + ) { + int num = 1; + for (auto dim : tensor.shape) { + num *= dim; + } + PADDLE_ENFORCE_EQ( + tensor.dtype, PaddleDType::FLOAT16, + platform::errors::InvalidArgument( + "The tensor dtype is not float16, only support float16 as input")); + float16 *fp16_data = reinterpret_cast(tensor.data.data()); + float *fp32_data = new float[num]; + for (int i = 0; i < num; i++) { + fp32_data[i] = static_cast(fp16_data[i]); + } + tensor.data = PaddleBuf(static_cast(fp32_data), num * sizeof(float)); + tensor.dtype = PaddleDType::FLOAT32; +} + } // namespace inference } // namespace paddle diff --git a/paddle/fluid/memory/memcpy.cc b/paddle/fluid/memory/memcpy.cc index a71e5fe9877c5b9bf886cb6afedb2ac7c4aab155..166cdd0b5d6b6a523cfe470662951184ebbfabc5 100644 --- a/paddle/fluid/memory/memcpy.cc +++ 
b/paddle/fluid/memory/memcpy.cc @@ -246,7 +246,8 @@ void Copy(platform::NPUPlace dst_place, << dst_place << " by thream(" << stream << ")"; if (stream) { - platform::RecordEvent record_event("NpuMemcpyAsync:CPU->NPU"); + platform::RecordEvent record_event( + "NpuMemcpyAsync:CPU->NPU", platform::TracerEventType::UserDefined, 1); platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE, reinterpret_cast(stream)); } else { @@ -256,7 +257,8 @@ void Copy(platform::NPUPlace dst_place, platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); static_cast(pool.Get(dst_place))->Wait(); - platform::RecordEvent record_event("NpuMemcpySync:CPU->NPU"); + platform::RecordEvent record_event( + "NpuMemcpySync:CPU->NPU", platform::TracerEventType::UserDefined, 1); platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE); } } @@ -275,14 +277,16 @@ void Copy(platform::CPUPlace dst_place, << dst_place << " by thream(" << stream << ")"; if (stream) { - platform::RecordEvent record_event("NpuMemcpyAsync:NPU->CPU"); + platform::RecordEvent record_event( + "NpuMemcpyAsync:NPU->CPU", platform::TracerEventType::UserDefined, 1); platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST, reinterpret_cast(stream)); } else { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); static_cast(pool.Get(src_place))->Wait(); - platform::RecordEvent record_event("NpuMemcpySync:NPU->CPU"); + platform::RecordEvent record_event( + "NpuMemcpySync:NPU->CPU", platform::TracerEventType::UserDefined, 1); platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST); } } @@ -300,7 +304,9 @@ void Copy(platform::NPUPlace dst_place, if (dst_place == src_place) { platform::SetNPUDeviceId(src_place.device); if (stream) { - platform::RecordEvent record_event("NpuMemcpyAsync(same_npu):NPU->NPU"); + platform::RecordEvent record_event("NpuMemcpyAsync(same_npu):NPU->NPU", + platform::TracerEventType::UserDefined, + 1); platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE, reinterpret_cast(stream)); } else { @@ -308,7 +314,9 @@ void Copy(platform::NPUPlace dst_place, platform::DeviceContextPool::Instance(); static_cast(pool.Get(dst_place))->Wait(); - platform::RecordEvent record_event("NpuMemcpySync(same_npu):NPU->NPU"); + platform::RecordEvent record_event("NpuMemcpySync(same_npu):NPU->NPU", + platform::TracerEventType::UserDefined, + 1); platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE); } } else { @@ -318,7 +326,9 @@ void Copy(platform::NPUPlace dst_place, } if (stream) { // TODO(zhiqiu): support peer access? 
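Each host/device copy path in memcpy.cc now opens its profiler record with an explicit tracer event type and level instead of only a name. The pattern in isolation, mirroring the calls added in this diff; the surrounding function is illustrative only:

#include "paddle/fluid/platform/profiler/event_tracing.h"

// Scoped profiler record: event name, TracerEventType::UserDefined, level 1.
// The record closes when it goes out of scope, bracketing the copy.
void CopyWithTrace() {
  paddle::platform::RecordEvent record_event(
      "GpuMemcpyAsync:CPU->GPU",
      paddle::platform::TracerEventType::UserDefined, 1);
  // ... issue the actual memcpy here ...
}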
- platform::RecordEvent record_event("NpuMemcpyPeerAsync:NPU->NPU"); + platform::RecordEvent record_event("NpuMemcpyPeerAsync:NPU->NPU", + platform::TracerEventType::UserDefined, + 1); platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE, reinterpret_cast(stream)); } else { @@ -326,7 +336,9 @@ void Copy(platform::NPUPlace dst_place, platform::DeviceContextPool::Instance(); static_cast(pool.Get(dst_place))->Wait(); - platform::RecordEvent record_event("NpuMemcpyPeerSync:NPU->NPU"); + platform::RecordEvent record_event("NpuMemcpyPeerSync:NPU->NPU", + platform::TracerEventType::UserDefined, + 1); platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE); } } @@ -374,14 +386,18 @@ void Copy( << dst_place << " by thream(" << stream << ")"; if (stream) { - platform::RecordEvent record_event("NpuMemcpyAsync:NPU->NPUPinned"); + platform::RecordEvent record_event("NpuMemcpyAsync:NPU->NPUPinned", + platform::TracerEventType::UserDefined, + 1); platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST, reinterpret_cast(stream)); } else { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); static_cast(pool.Get(src_place))->Wait(); - platform::RecordEvent record_event("NpuMemcpySync:NPU->NPUPinned"); + platform::RecordEvent record_event("NpuMemcpySync:NPU->NPUPinned", + platform::TracerEventType::UserDefined, + 1); platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST); } } @@ -398,7 +414,9 @@ void Copy( << dst_place << " by thream(" << stream << ")"; if (stream) { - platform::RecordEvent record_event("NpuMemcpyAsync:NPUPinned->NPU"); + platform::RecordEvent record_event("NpuMemcpyAsync:NPUPinned->NPU", + platform::TracerEventType::UserDefined, + 1); platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE, reinterpret_cast(stream)); } else { @@ -408,7 +426,9 @@ void Copy( platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); static_cast(pool.Get(dst_place))->Wait(); - platform::RecordEvent record_event("NpuMemcpySync:NPUPinned->NPU"); + platform::RecordEvent record_event("NpuMemcpySync:NPUPinned->NPU", + platform::TracerEventType::UserDefined, + 1); platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE); } } @@ -596,7 +616,8 @@ void Copy( VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place << " by stream(" << stream << ")"; if (stream) { - platform::RecordEvent record_event("GpuMemcpyAsync:GPU->CPU"); + platform::RecordEvent record_event( + "GpuMemcpyAsync:GPU->CPU", platform::TracerEventType::UserDefined, 1); #ifdef PADDLE_WITH_HIP platform::GpuMemcpyAsync(dst, src, num, hipMemcpyDeviceToHost, reinterpret_cast(stream)); @@ -605,7 +626,8 @@ void Copy( reinterpret_cast(stream)); #endif } else { - platform::RecordEvent record_event("GpuMemcpySync:GPU->CPU"); + platform::RecordEvent record_event( + "GpuMemcpySync:GPU->CPU", platform::TracerEventType::UserDefined, 1); #ifdef PADDLE_WITH_HIP platform::GpuMemcpySync(dst, src, num, hipMemcpyDeviceToHost); #else @@ -628,7 +650,8 @@ void Copy( VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place << " by thream(" << stream << ")"; if (stream) { - platform::RecordEvent record_event("GpuMemcpyAsync:CPU->GPU"); + platform::RecordEvent record_event( + "GpuMemcpyAsync:CPU->GPU", platform::TracerEventType::UserDefined, 1); #ifdef PADDLE_WITH_HIP platform::GpuMemcpyAsync(dst, src, num, hipMemcpyHostToDevice, reinterpret_cast(stream)); @@ -637,7 +660,8 @@ void Copy( 
reinterpret_cast(stream)); #endif } else { - platform::RecordEvent record_event("GpuMemcpySync:CPU->GPU"); + platform::RecordEvent record_event( + "GpuMemcpySync:CPU->GPU", platform::TracerEventType::UserDefined, 1); #ifdef PADDLE_WITH_HIP platform::GpuMemcpySync(dst, src, num, hipMemcpyHostToDevice); #else @@ -661,7 +685,9 @@ void Copy( if (dst_place == src_place) { platform::SetDeviceId(src_place.device); if (stream) { - platform::RecordEvent record_event("GpuMemcpyAsync(same_gpu):GPU->GPU"); + platform::RecordEvent record_event("GpuMemcpyAsync(same_gpu):GPU->GPU", + platform::TracerEventType::UserDefined, + 1); #ifdef PADDLE_WITH_HIP platform::GpuMemcpyAsync(dst, src, num, hipMemcpyDeviceToDevice, reinterpret_cast(stream)); @@ -670,7 +696,9 @@ void Copy( reinterpret_cast(stream)); #endif } else { - platform::RecordEvent record_event("GpuMemcpySync(same_gpu):GPU->GPU"); + platform::RecordEvent record_event("GpuMemcpySync(same_gpu):GPU->GPU", + platform::TracerEventType::UserDefined, + 1); #ifdef PADDLE_WITH_HIP platform::GpuMemcpySync(dst, src, num, hipMemcpyDeviceToDevice); #else @@ -679,11 +707,15 @@ void Copy( } } else { if (stream) { - platform::RecordEvent record_event("GpuMemcpyPeerAsync:GPU->GPU"); + platform::RecordEvent record_event("GpuMemcpyPeerAsync:GPU->GPU", + platform::TracerEventType::UserDefined, + 1); platform::GpuMemcpyPeerAsync(dst, dst_place.device, src, src_place.device, num, reinterpret_cast(stream)); } else { - platform::RecordEvent record_event("GpuMemcpyPeerSync:GPU->GPU"); + platform::RecordEvent record_event("GpuMemcpyPeerSync:GPU->GPU", + platform::TracerEventType::UserDefined, + 1); platform::GpuMemcpyPeerSync(dst, dst_place.device, src, src_place.device, num); } @@ -729,7 +761,9 @@ void Copy( VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place << " by thream(" << stream << ")"; if (stream) { - platform::RecordEvent record_event("GpuMemcpyAsync:GPU->CUDAPinned"); + platform::RecordEvent record_event("GpuMemcpyAsync:GPU->CUDAPinned", + platform::TracerEventType::UserDefined, + 1); #ifdef PADDLE_WITH_HIP platform::GpuMemcpyAsync(dst, src, num, hipMemcpyDeviceToHost, reinterpret_cast(stream)); @@ -738,7 +772,9 @@ void Copy( reinterpret_cast(stream)); #endif } else { - platform::RecordEvent record_event("GpuMemcpySync:GPU->CUDAPinned"); + platform::RecordEvent record_event("GpuMemcpySync:GPU->CUDAPinned", + platform::TracerEventType::UserDefined, + 1); #ifdef PADDLE_WITH_HIP platform::GpuMemcpySync(dst, src, num, hipMemcpyDeviceToHost); #else @@ -758,7 +794,9 @@ void Copy( VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place << " by thream(" << stream << ")"; if (stream) { - platform::RecordEvent record_event("GpuMemcpyAsync:CUDAPinned->GPU"); + platform::RecordEvent record_event("GpuMemcpyAsync:CUDAPinned->GPU", + platform::TracerEventType::UserDefined, + 1); #ifdef PADDLE_WITH_HIP platform::GpuMemcpyAsync(dst, src, num, hipMemcpyHostToDevice, reinterpret_cast(stream)); @@ -767,7 +805,9 @@ void Copy( reinterpret_cast(stream)); #endif } else { - platform::RecordEvent record_event("GpuMemcpySync:CUDAPinned->GPU"); + platform::RecordEvent record_event("GpuMemcpySync:CUDAPinned->GPU", + platform::TracerEventType::UserDefined, + 1); #ifdef PADDLE_WITH_HIP platform::GpuMemcpySync(dst, src, num, hipMemcpyHostToDevice); #else @@ -927,7 +967,9 @@ void Copy(platform::CPUPlace dst_place, if (stream) { VLOG(4) << "Async memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place << " by 
mlu stream(" << stream << ")"; - platform::RecordEvent record_event("MLUMemcpyD2HAsync:MLU->CPU"); + platform::RecordEvent record_event("MLUMemcpyD2HAsync:MLU->CPU", + platform::TracerEventType::UserDefined, + 1); platform::MLUMemcpyD2HAsync(dst, src, num, reinterpret_cast(stream)); } else { @@ -936,7 +978,8 @@ void Copy(platform::CPUPlace dst_place, VLOG(4) << "Sync memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place; - platform::RecordEvent record_event("MLUMemcpyD2HSync:MLU->CPU"); + platform::RecordEvent record_event( + "MLUMemcpyD2HSync:MLU->CPU", platform::TracerEventType::UserDefined, 1); platform::MLUMemcpyD2HSync(dst, src, num); } } @@ -953,7 +996,9 @@ void Copy(platform::MLUPlace dst_place, if (stream) { VLOG(4) << "Async memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place << " by mlu stream(" << stream << ")"; - platform::RecordEvent record_event("MLUMemcpyH2DAsync:CPU->MLU"); + platform::RecordEvent record_event("MLUMemcpyH2DAsync:CPU->MLU", + platform::TracerEventType::UserDefined, + 1); platform::MLUMemcpyH2DAsync(dst, src, num, reinterpret_cast(stream)); } else { @@ -962,7 +1007,8 @@ void Copy(platform::MLUPlace dst_place, VLOG(4) << "Sync memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place; - platform::RecordEvent record_event("MLUMemcpyH2DSync:CPU->MLU"); + platform::RecordEvent record_event( + "MLUMemcpyH2DSync:CPU->MLU", platform::TracerEventType::UserDefined, 1); platform::MLUMemcpyH2DSync(dst, src, num); } } @@ -980,8 +1026,9 @@ void Copy(platform::MLUPlace dst_place, if (stream) { VLOG(4) << "Async memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place << " by mlu stream(" << stream << ")"; - platform::RecordEvent record_event( - "MLUMemcpyD2DAsync(same_mlu):MLU->MLU"); + platform::RecordEvent record_event("MLUMemcpyD2DAsync(same_mlu):MLU->MLU", + platform::TracerEventType::UserDefined, + 1); platform::MLUMemcpyD2DAsync(dst, src, num, reinterpret_cast(stream)); } else { @@ -991,20 +1038,26 @@ void Copy(platform::MLUPlace dst_place, VLOG(4) << "Sync memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place; - platform::RecordEvent record_event("MLUMemcpyD2DSync(same_mlu):MLU->MLU"); + platform::RecordEvent record_event("MLUMemcpyD2DSync(same_mlu):MLU->MLU", + platform::TracerEventType::UserDefined, + 1); platform::MLUMemcpyD2DSync(dst, src, num); } } else { if (stream) { VLOG(4) << "Async memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place << " by mlu stream(" << stream << ")"; - platform::RecordEvent record_event("MLUMemcpyPeerAsync:MLU->MLU"); + platform::RecordEvent record_event("MLUMemcpyPeerAsync:MLU->MLU", + platform::TracerEventType::UserDefined, + 1); platform::MLUMemcpyPeerAsync(dst, dst_place.device, src, src_place.device, num, reinterpret_cast(stream)); } else { VLOG(4) << "Sync memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place; - platform::RecordEvent record_event("MLUMemcpyPeerSync:MLU->MLU"); + platform::RecordEvent record_event("MLUMemcpyPeerSync:MLU->MLU", + platform::TracerEventType::UserDefined, + 1); platform::MLUMemcpyPeerSync(dst, dst_place.device, src, src_place.device, num); } diff --git a/paddle/fluid/operators/abs_op.cc b/paddle/fluid/operators/abs_op.cc index 149a87fe32da16e850d5d64fb519c9bde7afef62..c28026a4bd43aac5b0c447e24a164e27233076e8 100644 --- a/paddle/fluid/operators/abs_op.cc +++ b/paddle/fluid/operators/abs_op.cc @@ -16,7 +16,10 @@ #include #include #include +#include 
"paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" #endif @@ -27,16 +30,6 @@ namespace operators { class AbsOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "abs"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "abs"); - - auto in_dims = ctx->GetInputDim("X"); - - ctx->SetOutputDim("Out", in_dims); - ctx->ShareLoD("X", /*->*/ "Out"); - } }; class AbsOpMaker : public framework::OpProtoAndCheckerMaker { @@ -148,11 +141,15 @@ class AbsDoubleGradOp : public framework::OperatorWithKernel { } // namespace operators } // namespace paddle +DELCARE_INFER_SHAPE_FUNCTOR(abs, AbsInferShapeFunctor, + PT_INFER_META(phi::UnchangedInferMeta)); + namespace ops = paddle::operators; REGISTER_OPERATOR(abs, ops::AbsOp, ops::AbsOpMaker, ops::AbsGradMaker, - ops::AbsGradMaker); + ops::AbsGradMaker, + AbsInferShapeFunctor); REGISTER_OPERATOR(abs_grad, ops::AbsGradOp, ops::AbsDoubleGradMaker, diff --git a/paddle/fluid/operators/atan2_op.cc b/paddle/fluid/operators/atan2_op.cc index 8ee6540bfa5f0c413f759f58ab506ac181c19c49..71a895c244c54f62c0af1745635c08fea35436c4 100644 --- a/paddle/fluid/operators/atan2_op.cc +++ b/paddle/fluid/operators/atan2_op.cc @@ -12,12 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/atan2_op.h" - -#include -#include -#include -#include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/backward.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { @@ -25,16 +25,6 @@ namespace operators { class Atan2Op : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X1"), "Input", "X1", "atan2"); - OP_INOUT_CHECK(ctx->HasInput("X2"), "Input", "X2", "atan2"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "atan2"); - - auto in_dims = ctx->GetInputDim("X1"); - - ctx->SetOutputDim("Out", in_dims); - } }; class Atan2OpMaker : public framework::OpProtoAndCheckerMaker { @@ -115,24 +105,11 @@ class Atan2OpVarTypeInference : public framework::VarTypeInference { } // namespace paddle namespace ops = paddle::operators; - +DELCARE_INFER_SHAPE_FUNCTOR(atan2, Atan2InferShapeFunctor, + PT_INFER_META(phi::Atan2InferMeta)); REGISTER_OPERATOR(atan2, ops::Atan2Op, ops::Atan2OpMaker, ops::Atan2GradMaker, ops::Atan2GradMaker, - ops::Atan2OpVarTypeInference); + ops::Atan2OpVarTypeInference, Atan2InferShapeFunctor); REGISTER_OPERATOR(atan2_grad, ops::Atan2GradOp); - -REGISTER_OP_CPU_KERNEL( - atan2, ops::Atan2Kernel, - ops::Atan2Kernel, - ops::Atan2Kernel, - ops::Atan2Kernel, - ops::Atan2Kernel); - -REGISTER_OP_CPU_KERNEL( - atan2_grad, ops::Atan2GradKernel, - ops::Atan2GradKernel, - ops::Atan2GradKernel); diff --git a/paddle/fluid/operators/atan2_op.cu b/paddle/fluid/operators/atan2_op.cu deleted file mode 100644 index 
faf1fde47e4c45a00836eee1d81ed1233170ecbe..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/atan2_op.cu +++ /dev/null @@ -1,31 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/atan2_op.h" - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - atan2, ops::Atan2Kernel, - ops::Atan2Kernel, - ops::Atan2Kernel, - ops::Atan2Kernel, - ops::Atan2Kernel); - -REGISTER_OP_CUDA_KERNEL( - atan2_grad, - ops::Atan2GradKernel, - ops::Atan2GradKernel, - ops::Atan2GradKernel); diff --git a/paddle/fluid/operators/atan2_op.h b/paddle/fluid/operators/atan2_op.h deleted file mode 100644 index a0e64c301524e2051abf8d2fc1641e0bcfafe69d..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/atan2_op.h +++ /dev/null @@ -1,168 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
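Both abs and atan2 above drop their hand-written InferShape methods; shape inference is instead generated from a phi InferMeta function and attached to the operator at registration time. The pattern, extracted from the hunks above (the macro spelling matches the framework as used in this diff):

// Declare a functor that forwards shape inference to phi::UnchangedInferMeta...
DELCARE_INFER_SHAPE_FUNCTOR(abs, AbsInferShapeFunctor,
                            PT_INFER_META(phi::UnchangedInferMeta));
// ...and pass it as the final argument of REGISTER_OPERATOR(abs, ...), as the
// hunk above does, so the op no longer overrides InferShape itself.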
- -#pragma once - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/float16.h" -#include "paddle/fluid/platform/for_range.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" - -namespace paddle { -namespace operators { -using Tensor = framework::Tensor; -using framework::To32BitIndex; - -template -struct Atan2Out { - using type = T; -}; - -template <> -struct Atan2Out { - using type = double; -}; - -template <> -struct Atan2Out { - using type = double; -}; - -template -struct Atan2Functor { - Atan2Functor(const T* x1, const T* x2, typename Atan2Out::type* out, - int64_t numel) - : x1_(x1), x2_(x2), out_(out), numel_(numel) {} - - HOSTDEVICE void operator()(int64_t idx) const { - out_[idx] = static_cast::type>( - ::atan2f(static_cast(x1_[idx]), static_cast(x2_[idx]))); - } - - const T* x1_; - const T* x2_; - typename Atan2Out::type* out_; - int64_t numel_; -}; - -template <> -struct Atan2Functor { - Atan2Functor(const double* x1, const double* x2, double* out, int64_t numel) - : x1_(x1), x2_(x2), out_(out), numel_(numel) {} - - HOSTDEVICE void operator()(int64_t idx) const { - out_[idx] = ::atan2(x1_[idx], x2_[idx]); - } - - const double* x1_; - const double* x2_; - double* out_; - int64_t numel_; -}; - -// dx1 = dout * x2 / ((x1)^2 + (x2)^2) -// dx2 = - dout * x1 / ((x1)^2 + (x2)^2) -template -struct Atan2GradFunctor { - Atan2GradFunctor(const T* x1, const T* x2, const T* dout, T* dx1, T* dx2, - int64_t numel) - : x1_(x1), x2_(x2), dout_(dout), dx1_(dx1), dx2_(dx2), numel_(numel) {} - - HOSTDEVICE void operator()(int64_t idx) const { - float x1 = static_cast(x1_[idx]); - float x2 = static_cast(x2_[idx]); - float x = x1 * x1 + x2 * x2; - dx1_[idx] = static_cast(static_cast(dout_[idx]) * x2 / x); - dx2_[idx] = static_cast(-static_cast(dout_[idx]) * x1 / x); - } - - const T* x1_; - const T* x2_; - const T* dout_; - T* dx1_; - T* dx2_; - int64_t numel_; -}; - -template <> -struct Atan2GradFunctor { - Atan2GradFunctor(const double* x1, const double* x2, const double* dout, - double* dx1, double* dx2, int64_t numel) - : x1_(x1), x2_(x2), dout_(dout), dx1_(dx1), dx2_(dx2), numel_(numel) {} - - HOSTDEVICE void operator()(int64_t idx) const { - auto x = x1_[idx] * x1_[idx] + x2_[idx] * x2_[idx]; - dx1_[idx] = dout_[idx] * x2_[idx] / x; - dx2_[idx] = -dout_[idx] * x1_[idx] / x; - } - - const double* x1_; - const double* x2_; - const double* dout_; - double* dx1_; - double* dx2_; - int64_t numel_; -}; - -template -class Atan2Kernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* X1 = context.Input("X1"); - const Tensor* X2 = context.Input("X2"); - Tensor* Out = context.Output("Out"); - - auto numel = X1->numel(); - auto x1 = X1->data(); - auto x2 = X2->data(); - auto out = Out->mutable_data::type>( - context.GetPlace(), size_t(numel * sizeof(typename Atan2Out::type))); - auto& dev_ctx = context.template device_context(); - - platform::ForRange for_range(dev_ctx, numel); - Atan2Functor functor(x1, x2, out, numel); - for_range(functor); - } -}; - -template -class Atan2GradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const { - const Tensor* X1 = context.Input("X1"); - const Tensor* X2 = context.Input("X2"); - const Tensor* dOut = context.Input(framework::GradVarName("Out")); - Tensor* dX1 = 
context.Output(framework::GradVarName("X1")); - Tensor* dX2 = context.Output(framework::GradVarName("X2")); - - auto numel = X1->numel(); - auto x1 = X1->data(); - auto x2 = X2->data(); - auto dout = dOut->data(); - auto dx1 = - dX1->mutable_data(context.GetPlace(), size_t(numel * sizeof(T))); - auto dx2 = - dX2->mutable_data(context.GetPlace(), size_t(numel * sizeof(T))); - auto& dev_ctx = context.template device_context(); - - platform::ForRange for_range(dev_ctx, numel); - Atan2GradFunctor functor(x1, x2, dout, dx1, dx2, numel); - for_range(functor); - } -}; -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/batch_norm_op_xpu.cc b/paddle/fluid/operators/batch_norm_op_xpu.cc index 505acbbdbde1b0f5842ebbd272f4bfb930e812b7..6699df0c8dc59cbd4ce14a1f6d2b6523f21b590d 100644 --- a/paddle/fluid/operators/batch_norm_op_xpu.cc +++ b/paddle/fluid/operators/batch_norm_op_xpu.cc @@ -38,23 +38,25 @@ class BatchNormXPUKernel : public framework::OpKernel { bool global_stats = test_mode || use_global_stats; const auto &data_layout_str = ctx.Attr("data_layout"); const auto data_layout = framework::StringToDataLayout(data_layout_str); - PADDLE_ENFORCE_EQ(data_layout, DataLayout::kNCHW, + PADDLE_ENFORCE_EQ(data_layout_str == "NCHW" || data_layout_str == "NHWC", + true, platform::errors::InvalidArgument( - "The 'data_layout' attribute must be NCHW. But " - "recevived 'data_layout' is [%s].", + "The 'data_layout' attribute must be NCHW or NHWC. " + "But recevived 'data_layout' is [%s].", data_layout_str)); const auto *x = ctx.Input("X"); const auto &x_dims = x->dims(); - PADDLE_ENFORCE_EQ(x_dims.size(), 4, - platform::errors::InvalidArgument( - "The input tensor X's dimension must equal to 4. But " - "received X's shape = [%s], X's dimension = [%d].", - x_dims, x_dims.size())); - const int N = x_dims[0]; - const int C = x_dims[1]; - const int H = x_dims[2]; - const int W = x_dims[3]; + PADDLE_ENFORCE_EQ( + x_dims.size() >= 2 && x_dims.size() <= 5, true, + platform::errors::InvalidArgument( + "The size of input's dimensions should be between 2 and 5" + "But received: the size of input's dimensions is [%d]", + x_dims.size())); + + int N, C, H, W, D; + ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D); + const auto *scale = ctx.Input("Scale"); const auto *bias = ctx.Input("Bias"); const auto *x_data = x->data(); @@ -75,6 +77,7 @@ class BatchNormXPUKernel : public framework::OpKernel { saved_variance->mutable_data(ctx.GetPlace()); auto &dev_ctx = ctx.template device_context(); + bool is_nchw = data_layout_str == "NCHW"; if (!global_stats) { auto *mean_out_data = mean_out->data(); @@ -95,7 +98,7 @@ class BatchNormXPUKernel : public framework::OpKernel { int r = xpu::batch_norm(dev_ctx.x_context(), x_data, y_data, N, C, H, W, epsilon, momentum, scale_data, bias_data, saved_mean_data, saved_variance_data, - mean_out_data, variance_out_data, true); + mean_out_data, variance_out_data, is_nchw); PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, platform::errors::External( "The batch_norm XPU API return wrong value[%d %s]", @@ -107,7 +110,7 @@ class BatchNormXPUKernel : public framework::OpKernel { const auto *variance_data = variance->data(); int r = xpu::batch_norm_infer(dev_ctx.x_context(), x_data, y_data, N, C, H, W, epsilon, scale_data, bias_data, - mean_data, variance_data, true); + mean_data, variance_data, is_nchw); PADDLE_ENFORCE_EQ( r, xpu::Error_t::SUCCESS, platform::errors::External( @@ -168,11 +171,11 @@ class BatchNormGradXPUKernel : public framework::OpKernel { const float 
epsilon = ctx.Attr("epsilon"); const auto data_layout = framework::StringToDataLayout(data_layout_str); - // TODO(guozbin): Transform input tensor from NHWC to NCHW - PADDLE_ENFORCE_EQ(data_layout, DataLayout::kNCHW, + PADDLE_ENFORCE_EQ(data_layout_str == "NCHW" || data_layout_str == "NHWC", + true, platform::errors::InvalidArgument( - "The 'data_layout' attribute must be NCHW. But " - "recevived 'data_layout' is [%s].", + "The 'data_layout' attribute must be NCHW or NHWC. " + "But recevived 'data_layout' is [%s].", data_layout_str)); auto *d_x = ctx.Output(framework::GradVarName("X")); @@ -207,15 +210,15 @@ class BatchNormGradXPUKernel : public framework::OpKernel { } const auto &x_dims = x->dims(); - PADDLE_ENFORCE_EQ(x_dims.size(), 4, - platform::errors::InvalidArgument( - "The input tensor X's dimension must equal to 4. But " - "received X's shape = [%s], X's dimension = [%d].", - x_dims, x_dims.size())); - const int N = x_dims[0]; - const int C = x_dims[1]; - const int H = x_dims[2]; - const int W = x_dims[3]; + PADDLE_ENFORCE_EQ( + x_dims.size() >= 2 && x_dims.size() <= 5, true, + platform::errors::InvalidArgument( + "The size of input's dimensions should be between 2 and 5" + "But received: the size of input's dimensions is [%d]", + x_dims.size())); + + int N, C, H, W, D; + ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D); const auto *x_data = x->data(); const auto *d_y_data = d_y->data(); @@ -250,38 +253,35 @@ class BatchNormGradXPUKernel : public framework::OpKernel { auto &dev_ctx = ctx.template device_context(); xpu::ctx_guard RAII_GUARD(dev_ctx.x_context()); - const T *mean_data = nullptr; - const T *inv_var_data = nullptr; + const auto *batch_mean = ctx.Input("SavedMean"); + const auto *batch_inv_std = ctx.Input("SavedVariance"); + const auto *global_mean = ctx.Input("Mean"); + const auto *global_var = ctx.Input("Variance"); // TODO(guozibin): hadle the situation case of N * H * W = 1 - if (!use_global_stats) { - const auto *saved_mean = ctx.Input("SavedMean"); - // SavedVariance have been reverted in forward operator - const auto *saved_inv_variance = ctx.Input("SavedVariance"); - mean_data = saved_mean->data(); - inv_var_data = saved_inv_variance->data(); - } else { - const auto *running_mean = ctx.Input("Mean"); - const auto *running_variance = ctx.Input("Variance"); - mean_data = running_mean->data(); - inv_var_data = running_variance->data(); - float *running_inv_var_data = - RAII_GUARD.alloc_l3_or_gm(running_variance->numel()); - float *epsilon_data = RAII_GUARD.alloc_l3_or_gm(1); - int r1 = calculate_inv_var(dev_ctx.x_context(), inv_var_data, epsilon, C, - epsilon_data, running_inv_var_data); - PADDLE_ENFORCE_EQ(r1, XPU_SUCCESS, platform::errors::External( - "XPU API(batch_norm_grad " - "calculate_inv_var function) " - "return wrong value[%d %s]", - r1, XPUAPIErrorMsg[r1])); - inv_var_data = running_inv_var_data; - } if (is_inplace) { + float *global_inv_std_data; + if (use_global_stats) { + global_inv_std_data = + RAII_GUARD.alloc_l3_or_gm(global_var->numel()); + float *epsilon_data = RAII_GUARD.alloc_l3_or_gm(1); + int r1 = + calculate_inv_var(dev_ctx.x_context(), global_var->data(), + epsilon, C, epsilon_data, global_inv_std_data); + PADDLE_ENFORCE_EQ(r1, XPU_SUCCESS, platform::errors::External( + "XPU API(batch_norm_grad " + "calculate_inv_var function) " + "return wrong value[%d %s]", + r1, XPUAPIErrorMsg[r1])); + } auto px = *x; + auto *inv_std_data = + use_global_stats ? global_inv_std_data : batch_inv_std->data(); + auto mean_data = use_global_stats ? 
global_mean->data() + : batch_mean->data(); int r2 = calculate_inv_BN_Y( dev_ctx.x_context(), px.mutable_data(ctx.GetPlace()), - scale->data(), bias->data(), mean_data, inv_var_data, N, + scale->data(), bias->data(), mean_data, inv_std_data, N, C, H * W, x->data()); PADDLE_ENFORCE_EQ(r2, XPU_SUCCESS, platform::errors::External( "XPU API(batch_norm_grad " @@ -289,19 +289,29 @@ class BatchNormGradXPUKernel : public framework::OpKernel { "return wrong value[%d %s]", r2, XPUAPIErrorMsg[r2])); } - if (!d_x) { - d_x_data = RAII_GUARD.alloc_l3_or_gm(x->numel()); - } - if (!d_scale) { - d_scale_data = RAII_GUARD.alloc_l3_or_gm(C); - } - if (!d_bias_data) { - d_bias_data = RAII_GUARD.alloc_l3_or_gm(C); - } - int r3 = xpu::batch_norm_grad( - dev_ctx.x_context(), x_data, d_y_data, d_x_data, N, C, H, W, scale_data, - mean_data, inv_var_data, d_scale_data, d_bias_data, true); + int r3; + bool is_nchw = data_layout_str == "NCHW"; + if (use_global_stats) { + r3 = xpu::batch_norm_grad( + dev_ctx.x_context(), x_data, d_y_data, d_x_data, N, C, H, W, + scale_data, nullptr, nullptr, d_scale_data, d_bias_data, is_nchw, + global_mean->data(), global_var->data(), epsilon); + } else { + if (!d_x) { + d_x_data = RAII_GUARD.alloc_l3_or_gm(x->numel()); + } + if (!d_scale) { + d_scale_data = RAII_GUARD.alloc_l3_or_gm(C); + } + if (!d_bias_data) { + d_bias_data = RAII_GUARD.alloc_l3_or_gm(C); + } + r3 = xpu::batch_norm_grad( + dev_ctx.x_context(), x_data, d_y_data, d_x_data, N, C, H, W, + scale_data, batch_mean->data(), batch_inv_std->data(), + d_scale_data, d_bias_data, is_nchw); + } PADDLE_ENFORCE_EQ(r3, XPU_SUCCESS, platform::errors::External( "XPU API(batch_norm_grad) return " "wrong value[%d %s]", diff --git a/paddle/fluid/operators/bce_loss_op.cc b/paddle/fluid/operators/bce_loss_op.cc index 1c390923d0b0ad92f73eced96a79771db7ad4010..55bb57466c7b5ec4f4ac3c51b1cf84ab5098a0e9 100644 --- a/paddle/fluid/operators/bce_loss_op.cc +++ b/paddle/fluid/operators/bce_loss_op.cc @@ -12,11 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/bce_loss_op.h" #include #include #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/infermeta/binary.h" + namespace paddle { namespace operators { @@ -26,41 +29,6 @@ class BCELossOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "BCELoss"); - OP_INOUT_CHECK(ctx->HasInput("Label"), "Input", "Label", "BCELoss"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "BCELoss"); - - auto x_dims = ctx->GetInputDim("X"); - auto labels_dims = ctx->GetInputDim("Label"); - - int rank = x_dims.size(); - PADDLE_ENFORCE_EQ(rank, labels_dims.size(), - platform::errors::InvalidArgument( - "Input(X) and Input(Label) shall have the same rank." - "But received: the rank of Input(X) is [%d], " - "the rank of Input(Label) is [%d].", - rank, labels_dims.size())); - - bool check = true; - if ((!ctx->IsRuntime()) && - (phi::product(x_dims) <= 0 || phi::product(labels_dims) <= 0)) { - check = false; - } - - if (check) { - PADDLE_ENFORCE_EQ(x_dims, labels_dims, - platform::errors::InvalidArgument( - "Input(X) and Input(Label) shall have the same " - "shape. 
But received: the shape of Input(X) is " - "[%s], the shape of Input(Label) is [%s].", - x_dims, labels_dims)); - } - - ctx->ShareDim("X", "Out"); - ctx->ShareLoD("X", "Out"); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -170,16 +138,12 @@ DECLARE_INPLACE_OP_INFERER(BCELossGradInplaceInferer, } // namespace paddle namespace ops = paddle::operators; +DELCARE_INFER_SHAPE_FUNCTOR(bce_loss, BCELossInferShapeFunctor, + PT_INFER_META(phi::BCELossInferMeta)); + REGISTER_OPERATOR(bce_loss, ops::BCELossOp, ops::BCELossOpMaker, ops::BCELossGradOpMaker, ops::BCELossGradOpMaker, - ops::BCELossInplaceInferer); + ops::BCELossInplaceInferer, BCELossInferShapeFunctor); REGISTER_OPERATOR(bce_loss_grad, ops::BCELossGradOp, ops::BCELossGradInplaceInferer); -REGISTER_OP_CPU_KERNEL( - bce_loss, ops::BCELossOpKernel, - ops::BCELossOpKernel); -REGISTER_OP_CPU_KERNEL( - bce_loss_grad, - ops::BCELossGradOpKernel, - ops::BCELossGradOpKernel); diff --git a/paddle/fluid/operators/bce_loss_op.cu b/paddle/fluid/operators/bce_loss_op.cu deleted file mode 100644 index f71fbbdc6b19e9a2c71b5194e8f2343d2398d62a..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/bce_loss_op.cu +++ /dev/null @@ -1,109 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ -#include -#include "paddle/fluid/operators/bce_loss_op.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h" -#include "paddle/fluid/operators/math.h" -#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" -#include "paddle/phi/core/hostdevice.h" - -namespace paddle { -namespace operators { -template -struct BCELossFunctor { - T one; - T neg_100; - - HOSTDEVICE inline BCELossFunctor() { - one = static_cast(1.0f); - neg_100 = static_cast(-100.); - } - - HOSTDEVICE inline T operator()(const T x, const T label) const { - PADDLE_ENFORCE( - (x >= static_cast(0)) && (x <= one), - "Input is expected to be within the interval [0, 1], but recieved %f.", - x); - T term1 = max(real_log(x), neg_100); - T term2 = max(real_log(one - x), neg_100); - return (((label - one) * term2) - (label * term1)); - } -}; - -template -struct BCELossGradFunctor { - T one; - T eps; - - HOSTDEVICE inline BCELossGradFunctor() { - one = static_cast(1.0f); - eps = static_cast(1e-12); - } - - HOSTDEVICE inline T operator()(const T x, const T label, const T dout) const { - T term1 = max((one - x) * x, eps); - return (dout * (x - label) / term1); - } -}; - -using Tensor = framework::Tensor; - -template -class BCELossCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* labels = ctx.Input("Label"); - auto* out = ctx.Output("Out"); - out->mutable_data(ctx.GetPlace()); - std::vector ins = {x, labels}; - std::vector outs = {out}; - auto& dev_ctx = ctx.template device_context(); - auto functor = BCELossFunctor(); - paddle::operators::LaunchSameDimsElementwiseCudaKernel(dev_ctx, ins, - &outs, functor); - } -}; - -template -class BCELossGradCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* labels = ctx.Input("Label"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - dx->mutable_data(ctx.GetPlace()); - std::vector ins = {x, labels, dout}; - std::vector outs = {dx}; - auto& dev_ctx = ctx.template device_context(); - auto functor = BCELossGradFunctor(); - paddle::operators::LaunchSameDimsElementwiseCudaKernel(dev_ctx, ins, - &outs, functor); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - bce_loss, - ops::BCELossCUDAKernel, - ops::BCELossCUDAKernel); -REGISTER_OP_CUDA_KERNEL( - bce_loss_grad, - ops::BCELossGradCUDAKernel, - ops::BCELossGradCUDAKernel); diff --git a/paddle/fluid/operators/bce_loss_op.h b/paddle/fluid/operators/bce_loss_op.h deleted file mode 100644 index dd87b69efe2869727f2db778cec44612efbcff6b..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/bce_loss_op.h +++ /dev/null @@ -1,85 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
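The BCE loss functors being deleted here (their replacements live in phi kernels outside this diff) encode out = -(label*ln(x) + (1-label)*ln(1-x)) with each log term clamped at -100, and dx = dout*(x-label)/max((1-x)*x, eps). A standalone restatement of that math with a quick numeric check (a sketch for reference, not the phi kernels):

#include <algorithm>
#include <cassert>
#include <cmath>

// Forward: -(label*ln(x) + (1-label)*ln(1-x)), log terms clamped at -100.
double BceLoss(double x, double label) {
  double term1 = std::max(std::log(x), -100.0);
  double term2 = std::max(std::log(1.0 - x), -100.0);
  return (label - 1.0) * term2 - label * term1;
}

// Backward: dout * (x - label) / max((1 - x) * x, 1e-12).
double BceLossGrad(double x, double label, double dout) {
  return dout * (x - label) / std::max((1.0 - x) * x, 1e-12);
}

int main() {
  assert(std::fabs(BceLoss(0.5, 1.0) - std::log(2.0)) < 1e-12);  // -ln(0.5)
  assert(std::fabs(BceLossGrad(0.5, 1.0, 1.0) + 2.0) < 1e-12);   // (0.5-1)/0.25
  return 0;
}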
-See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include // for max -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class BCELossOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* labels = ctx.Input("Label"); - auto* out = ctx.Output("Out"); - - auto x_data = x->data(); - auto label_data = labels->data(); - auto out_data = out->mutable_data(ctx.GetPlace()); - auto x_numel = x->numel(); - - // out = -(label * ln(x) + (1 - label) * ln(1 - x)) = (label - 1) * ln(1 - - // x) - label * ln(x) - for (int64_t i = 0; i < x_numel; ++i) { - PADDLE_ENFORCE_GE( - x_data[i], static_cast(0), - platform::errors::InvalidArgument( - "Illegal input, input must be greater than or equal to 0")); - PADDLE_ENFORCE_LE( - x_data[i], static_cast(1), - platform::errors::InvalidArgument( - "Illegal input, input must be less than or equal to 1")); - out_data[i] = - (label_data[i] - static_cast(1)) * - std::max(real_log(static_cast(1) - x_data[i]), (T)(-100)) - - label_data[i] * std::max(real_log(x_data[i]), (T)(-100)); - } - } -}; - -template -class BCELossGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* labels = ctx.Input("Label"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - - auto dx_data = dx->mutable_data(ctx.GetPlace()); - auto dout_data = dout->data(); - auto x_data = x->data(); - auto label_data = labels->data(); - - int x_numel = x->numel(); - - // dx = dout * ((x - label)/(x - x^2)) - for (int i = 0; i < x_numel; ++i) { - dx_data[i] = - dout_data[i] * ((x_data[i] - label_data[i]) / - std::max((static_cast(1) - x_data[i]) * x_data[i], - static_cast(1e-12))); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/bce_loss_op_npu.cc b/paddle/fluid/operators/bce_loss_op_npu.cc index 46e8a36d2eef73e59bfc22308e5c0b593bd2832d..c3cee6a7b0d5bf4b2e41bfc020e6c9fcd34677d9 100644 --- a/paddle/fluid/operators/bce_loss_op_npu.cc +++ b/paddle/fluid/operators/bce_loss_op_npu.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/bce_loss_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/controlflow/conditional_block_op.cc b/paddle/fluid/operators/controlflow/conditional_block_op.cc index 31ed10a71201c666c72e23853fdf925a42a80fb3..6bf419c47a5669b87c0b47d48259362a66a23239 100644 --- a/paddle/fluid/operators/controlflow/conditional_block_op.cc +++ b/paddle/fluid/operators/controlflow/conditional_block_op.cc @@ -272,8 +272,18 @@ class ConditionalBlockGradInferVarType : public framework::VarTypeInference { // Input is {Tensor, LoDTensorArray}, we need synchronous the Input's // VarType into Input@GRAD to avoid generating {Tensor, Tensor} as // Input@GRAD. 
- ctx->SyncTypeAndDataType(ConditionalOp::kInputs, - framework::GradVarName(ConditionalOp::kInputs)); + auto input_size = ctx->InputSize(ConditionalOp::kInputs); + auto output_size = + ctx->OutputSize(framework::GradVarName(ConditionalOp::kInputs)); + PADDLE_ENFORCE_EQ(input_size, output_size, + platform::errors::InvalidArgument( + "input_size and output_size should be equal for " + "conditional_block_grad_op.")); + for (size_t i = 0; i < output_size; ++i) { + ctx->SyncTypeAndDataType(ConditionalOp::kInputs, + framework::GradVarName(ConditionalOp::kInputs), + i); + } } }; diff --git a/paddle/fluid/operators/controlflow/fetch_op.cc b/paddle/fluid/operators/controlflow/fetch_op.cc index ed4995d4fbeda208bfdd09be52c98195b52786db..de3d8bd996149f92ed24be63fadacfc51c2764b0 100644 --- a/paddle/fluid/operators/controlflow/fetch_op.cc +++ b/paddle/fluid/operators/controlflow/fetch_op.cc @@ -16,7 +16,7 @@ limitations under the License. */ #include "paddle/fluid/framework/feed_fetch_type.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/conv_cudnn_op.cu b/paddle/fluid/operators/conv_cudnn_op.cu index bda5ac42da8e3845b76880209438bfed1cacc6e0..dff60afd74c02f458b5b3c7428c2703197b61af0 100644 --- a/paddle/fluid/operators/conv_cudnn_op.cu +++ b/paddle/fluid/operators/conv_cudnn_op.cu @@ -28,7 +28,7 @@ limitations under the License. */ #include "paddle/fluid/operators/math/padding.h" #include "paddle/fluid/platform/cudnn_workspace_helper.h" #include "paddle/fluid/platform/float16.h" -#include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" DECLARE_bool(cudnn_deterministic); DECLARE_uint64(conv_workspace_size_limit); diff --git a/paddle/fluid/operators/cross_op.cc b/paddle/fluid/operators/cross_op.cc index e6b30ba42fc2664404ecc51bb68c8e3c06a26dc1..fe00ee06603f0ecf2e3fa6ac367303a70702508f 100644 --- a/paddle/fluid/operators/cross_op.cc +++ b/paddle/fluid/operators/cross_op.cc @@ -12,67 +12,23 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/operators/cross_op.h" #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { using framework::Tensor; using framework::DDim; +const int kDefaultDim = framework::DDim::kMaxRank; class CrossOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, - platform::errors::InvalidArgument( - "Input(X) of CrossOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasInput("Y"), true, - platform::errors::InvalidArgument( - "Input(Index) of CrossOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - platform::errors::InvalidArgument( - "Output(Out) of CrossOp should not be null.")); - - auto x_dim = ctx->GetInputDim("X"); - auto y_dim = ctx->GetInputDim("Y"); - auto dim = ctx->Attrs().Get("dim"); - - bool dims_match = CheckDims(x_dim, y_dim); - PADDLE_ENFORCE_EQ(dims_match, true, - platform::errors::InvalidArgument( - "The 'shape' of Input(X) should be equal to " - "the 'shape' of Input(Y). But received " - "Input(X).dimensions = [%s], " - "Input(Y).dimensions = [%s]", - x_dim, y_dim)); - - if (dim != kDefaultDim) { - PADDLE_ENFORCE_EQ( - dim < x_dim.size() && dim >= (0 - x_dim.size()), true, - platform::errors::OutOfRange( - "Attr(dim) is out of range, It's expected " - "to be in range of [-%d, %d]. But received Attr(dim) = %d.", - x_dim.size(), x_dim.size() - 1, dim)); - if (dim < 0) { - dim += x_dim.size(); - } - PADDLE_ENFORCE_EQ(x_dim[dim] == 3 && y_dim[dim] == 3, true, - platform::errors::InvalidArgument( - "Input(X/Y).dims()[dim] should be equal to 3." - "But received Input(X/Y).dims()[dim] = %d.", - x_dim[dim])); - } - - ctx->SetOutputDim("Out", x_dim); - auto type = ctx->GetInputsVarType("X")[0]; - if (type == framework::proto::VarType::LOD_TENSOR) { - ctx->ShareLoD("X", /*->*/ "Out"); - } - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -153,17 +109,10 @@ class CrossGradMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; +DELCARE_INFER_SHAPE_FUNCTOR(cross, CrossInferShapeFunctor, + PT_INFER_META(phi::CrossInferMeta)); REGISTER_OPERATOR(cross, ops::CrossOp, ops::CrossOpMaker, ops::CrossGradMaker, - ops::CrossGradMaker); + ops::CrossGradMaker, + CrossInferShapeFunctor); REGISTER_OPERATOR(cross_grad, ops::CrossGradOp); -REGISTER_OP_CPU_KERNEL( - cross, ops::CrossKernel, - ops::CrossKernel, - ops::CrossKernel, - ops::CrossKernel); -REGISTER_OP_CPU_KERNEL( - cross_grad, ops::CrossGradKernel, - ops::CrossGradKernel, - ops::CrossGradKernel, - ops::CrossGradKernel); diff --git a/paddle/fluid/operators/cross_op.cu b/paddle/fluid/operators/cross_op.cu deleted file mode 100644 index 78bbb3ea564544a46e19723e9a83e90194b50597..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/cross_op.cu +++ /dev/null @@ -1,28 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/cross_op.h" - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - cross, ops::CrossKernel, - ops::CrossKernel, - ops::CrossKernel, - ops::CrossKernel); -REGISTER_OP_CUDA_KERNEL( - cross_grad, - ops::CrossGradKernel, - ops::CrossGradKernel, - ops::CrossGradKernel, - ops::CrossGradKernel); diff --git a/paddle/fluid/operators/cross_op.h b/paddle/fluid/operators/cross_op.h deleted file mode 100644 index b1c5eb62fdce57640e4b6c1a9bf1f55d59d1c6d6..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/cross_op.h +++ /dev/null @@ -1,222 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using LoDTensor = framework::LoDTensor; -using DDim = framework::DDim; -const int kDefaultDim = framework::DDim::kMaxRank; - -inline bool CheckDims(const DDim& dims_x, const DDim& dims_y) { - if (dims_x.size() != dims_y.size()) { - return false; - } - for (int i = 0; i < dims_x.size(); i++) { - if (dims_x[i] != dims_y[i]) { - return false; - } - } - return true; -} - -template -class CrossKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* input_x_var = context.InputVar("X"); - auto* input_y_var = context.InputVar("Y"); - auto* output_var = context.OutputVar("Out"); - - auto& input_x = input_x_var->Get(); - auto& input_y = input_y_var->Get(); - auto* output = output_var->GetMutable(); - int dim = context.Attr("dim"); - - auto input_x_dims = input_x.dims(); - auto input_y_dims = input_y.dims(); - bool dims_match = CheckDims(input_x_dims, input_y_dims); - PADDLE_ENFORCE_EQ(dims_match, true, - platform::errors::InvalidArgument( - "The 'shape' of Input(X) should be equal to " - "the 'shape' of Input(Y). But received " - "Input(X).dimensions = [%s], " - "Input(Y).dimensions = [%s]", - input_x_dims, input_x_dims)); - - if (dim != kDefaultDim) { - PADDLE_ENFORCE_EQ( - dim < input_x_dims.size() && dim >= (0 - input_x_dims.size()), true, - platform::errors::OutOfRange( - "Attr(dim) is out of range, It's expected " - "to be in range of [-%d, %d]. But received Attr(dim) = %d.", - input_x_dims.size(), input_x_dims.size() - 1, dim)); - if (dim < 0) { - dim += input_x_dims.size(); - } - - PADDLE_ENFORCE_EQ( - input_x_dims[dim] == 3, true, - platform::errors::InvalidArgument( - "Input(X/Y).dims[dim] must be equal to 3. 
But received: " - "Input(X/Y).dims[dim] = [%d].", - input_x_dims[dim])); - } else { - for (auto i = 0; i < input_x_dims.size(); i++) { - if (input_x_dims[i] == 3) { - dim = i; - break; - } - } - PADDLE_ENFORCE_EQ(dim == kDefaultDim, false, - platform::errors::InvalidArgument( - "There must be at least one dimension 'd' so that " - "Input(X/Y).dims()[d] is equal to 3. " - "But received: Input(X/Y).dims() == [%s].", - input_x_dims)); - } - auto outer_loops = 1; - for (auto i = 0; i < dim; i++) { - outer_loops *= input_x_dims[i]; - } - auto slice_size = 1; - for (auto i = dim + 1; i < input_x_dims.size(); i++) { - slice_size *= input_x_dims[i]; - } - - std::vector input_x_vec, input_y_vec; - framework::TensorToVector(input_x, context.device_context(), &input_x_vec); - framework::TensorToVector(input_y, context.device_context(), &input_y_vec); - std::vector out_vec(output->numel()); - - output->mutable_data(context.GetPlace()); - - for (auto i = 0; i < outer_loops; i++) { - for (auto j = 0; j < 3; j++) { - auto dst_pos = (3 * i + j) * slice_size; - auto in_pos1 = (3 * i + ((j + 1) % 3)) * slice_size; - auto in_pos2 = (3 * i + ((j + 2) % 3)) * slice_size; - - for (auto k = 0; k < slice_size; k++) { - out_vec[dst_pos + k] = - input_x_vec[in_pos1 + k] * input_y_vec[in_pos2 + k] - - input_x_vec[in_pos2 + k] * input_y_vec[in_pos1 + k]; - } - } - } - framework::TensorFromVector(out_vec, context.device_context(), output); - output->Resize(input_x_dims); - } -}; - -template -class CrossGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* input_x_var = context.InputVar("X"); - auto* input_y_var = context.InputVar("Y"); - auto* input_out_grad_var = context.InputVar(framework::GradVarName("Out")); - auto* output_x_grad_var = context.OutputVar(framework::GradVarName("X")); - auto* output_y_grad_var = context.OutputVar(framework::GradVarName("Y")); - - auto& input_x = input_x_var->Get(); - auto& input_y = input_y_var->Get(); - auto& input_out_grad = input_out_grad_var->Get(); - auto* output_x_grad = output_x_grad_var->GetMutable(); - auto* output_y_grad = output_y_grad_var->GetMutable(); - - int dim = context.Attr("dim"); - auto input_x_dims = input_x.dims(); - if (dim != kDefaultDim) { - PADDLE_ENFORCE_EQ( - dim < input_x_dims.size() && dim >= (0 - input_x_dims.size()), true, - platform::errors::OutOfRange( - "Attr(dim) is out of range, It's expected " - "to be in range of [-%d, %d]. But received Attr(dim) = %d.", - input_x_dims.size(), input_x_dims.size() - 1, dim)); - if (dim < 0) { - dim += input_x_dims.size(); - } - - PADDLE_ENFORCE_EQ( - input_x_dims[dim] == 3, true, - platform::errors::InvalidArgument( - "Input(X/Y).dims[dim] must be equal to 3. But received: " - "Input(X/Y).dims[dim] = [%d].", - input_x_dims[dim])); - } else { - for (auto i = 0; i < input_x_dims.size(); i++) { - if (input_x_dims[i] == 3) { - dim = i; - break; - } - } - PADDLE_ENFORCE_EQ(dim == kDefaultDim, false, - platform::errors::InvalidArgument( - "There must be at least one dimension 'd' " - "so that Input(X/Y).dims()[d] is equal to 3. 
" - "But received: Input(X/Y).dims() == [%s].", - input_x_dims)); - } - auto outer_loops = 1; - for (auto i = 0; i < dim; i++) { - outer_loops *= input_x_dims[i]; - } - auto slice_size = 1; - for (auto i = dim + 1; i < input_x_dims.size(); i++) { - slice_size *= input_x_dims[i]; - } - - std::vector input_x_vec, input_y_vec, input_dout_vec; - framework::TensorToVector(input_x, context.device_context(), &input_x_vec); - framework::TensorToVector(input_y, context.device_context(), &input_y_vec); - framework::TensorToVector(input_out_grad, context.device_context(), - &input_dout_vec); - std::vector out_dx_vec(output_x_grad->numel()); - std::vector out_dy_vec(output_y_grad->numel()); - - output_x_grad->mutable_data(context.GetPlace()); - output_y_grad->mutable_data(context.GetPlace()); - - for (auto i = 0; i < outer_loops; i++) { - for (auto j = 0; j < 3; j++) { - auto dst_pos = (3 * i + j) * slice_size; - auto in_pos1 = (3 * i + ((j + 1) % 3)) * slice_size; - auto in_pos2 = (3 * i + ((j + 2) % 3)) * slice_size; - for (auto k = 0; k < slice_size; k++) { - out_dx_vec[dst_pos + k] = - input_dout_vec[in_pos2 + k] * input_y_vec[in_pos1 + k] - - input_dout_vec[in_pos1 + k] * input_y_vec[in_pos2 + k]; - out_dy_vec[dst_pos + k] = - input_dout_vec[in_pos1 + k] * input_x_vec[in_pos2 + k] - - input_dout_vec[in_pos2 + k] * input_x_vec[in_pos1 + k]; - } - } - } - framework::TensorFromVector(out_dx_vec, context.device_context(), - output_x_grad); - framework::TensorFromVector(out_dy_vec, context.device_context(), - output_y_grad); - output_x_grad->Resize(input_x_dims); - output_y_grad->Resize(input_x_dims); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/ctc_align_op.cu b/paddle/fluid/operators/ctc_align_op.cu index 8a44c1327b9e6fbb1f8767a9ecdf40faf95993eb..b1f2e61ef3930d81aa56794c0d232930452b03d9 100644 --- a/paddle/fluid/operators/ctc_align_op.cu +++ b/paddle/fluid/operators/ctc_align_op.cu @@ -110,10 +110,12 @@ class CTCAlignOpCUDAKernel : public framework::OpKernel { // merge elements and delete blank T* output_data = output->mutable_data({num_tokens, 1}, ctx.GetPlace()); + paddle::framework::MixVector mixv_input_lod(&input_lod[level]); MergeAndDelCudaKernel<<<1, 1, 0, stream>>>( num_tokens, tokens, num_seq, - input_lod[level].CUDAMutableData(ctx.GetPlace()), blank, - merge_repeated, dev_out_lod0_ptr, output_data); + mixv_input_lod.CUDAMutableData(ctx.GetPlace()), blank, merge_repeated, + dev_out_lod0_ptr, output_data); + mixv_input_lod.CopyToCPU(); // set output lod std::vector host_out_lod0(dev_out_lod0.begin(), diff --git a/paddle/fluid/operators/cvm_op.cu b/paddle/fluid/operators/cvm_op.cu index ad96dc24b9206c0e7c6bc172180cec829230dde1..1a3bdee53e9bd31b410093446280a18e2f75d7a2 100644 --- a/paddle/fluid/operators/cvm_op.cu +++ b/paddle/fluid/operators/cvm_op.cu @@ -149,11 +149,12 @@ class CVMGradCUDAKernel : public framework::OpKernel { batch_size, lod[lod.size() - 1], platform::errors::PreconditionNotMet( "Output(X@GRAD)'s dim[0] must be equal to last element of lod")); + paddle::framework::MixVector mixv_lod(&lod); CvmGradComputeKernel<<<(dx_numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, PADDLE_CUDA_NUM_THREADS, 0, stream>>>( use_cvm, item_size, cvm_data, dout_data, dx_data, true, - lod.CUDAData(context.GetPlace()), lod.size(), dx_numel); + mixv_lod.CUDAData(context.GetPlace()), lod.size(), dx_numel); } } }; diff --git a/paddle/fluid/operators/detection/box_clip_op.cu b/paddle/fluid/operators/detection/box_clip_op.cu index 
bda22dd0155cce6cec767dfe1c3b282788a5f160..65f2a5590716d42649dbf766575c72571c23eb4d 100644 --- a/paddle/fluid/operators/detection/box_clip_op.cu +++ b/paddle/fluid/operators/detection/box_clip_op.cu @@ -57,9 +57,11 @@ class GPUBoxClipKernel : public framework::OpKernel { auto stream = dev_ctx.stream(); const size_t batch_size = lod.back().size() - 1; T *output_data = output->mutable_data(dev_ctx.GetPlace()); + paddle::framework::MixVector mix_vector(&abs_offset_lod[0]); GPUBoxClip<<>>( - input->data(), abs_offset_lod[0].CUDAMutableData(dev_ctx.GetPlace()), + input->data(), mix_vector.CUDAMutableData(dev_ctx.GetPlace()), bbox_width, im_info->data(), output_data); + mix_vector.CopyToCPU(); } }; diff --git a/paddle/fluid/operators/detection/target_assign_op.h b/paddle/fluid/operators/detection/target_assign_op.h index 01b15865e93b6035598b382b506504e9fcc22698..c4506f04e083e0a1e7671605ef6e39a06aa68eed 100644 --- a/paddle/fluid/operators/detection/target_assign_op.h +++ b/paddle/fluid/operators/detection/target_assign_op.h @@ -108,7 +108,8 @@ class TargetAssignKernel : public framework::OpKernel { auto x_lod = x->lod().back(); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - size_t* x_lod_data = x_lod.MutableData(ctx.GetPlace()); + paddle::framework::MixVector mixv_x_lod(&x_lod); + size_t* x_lod_data = mixv_x_lod.MutableData(ctx.GetPlace()); #else size_t* x_lod_data = x_lod.data(); #endif @@ -116,6 +117,9 @@ class TargetAssignKernel : public framework::OpKernel { TargetAssignFunctor functor(x_data, match_idx_data, x_lod_data, mismatch_value, n, m, p, k, out_data, out_wt_data); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + mixv_x_lod.CopyToCPU(); +#endif auto& device_ctx = ctx.template device_context(); platform::ForRange for_range(device_ctx, n * m); @@ -130,13 +134,17 @@ class TargetAssignKernel : public framework::OpKernel { const int* neg_idx_data = neg_indices->data(); auto neg_lod = neg_indices->lod().back(); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - size_t* neg_lod_data = neg_lod.MutableData(ctx.GetPlace()); + paddle::framework::MixVector mixv_neg_lod(&neg_lod); + size_t* neg_lod_data = mixv_neg_lod.MutableData(ctx.GetPlace()); #else size_t* neg_lod_data = neg_lod.data(); #endif NegTargetAssignFunctor neg_trg_functor; neg_trg_functor(device_ctx, neg_idx_data, neg_lod_data, n, m, k, mismatch_value, out_data, out_wt_data); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + mixv_neg_lod.CopyToCPU(); +#endif } } }; diff --git a/paddle/fluid/operators/distribution_helper.h b/paddle/fluid/operators/distribution_helper.h index ca6bcb1147a2fb78031227f0bb3a9f7e01326fcb..c13bf687af23470d4595def6fb6fabf7385c999f 100644 --- a/paddle/fluid/operators/distribution_helper.h +++ b/paddle/fluid/operators/distribution_helper.h @@ -180,8 +180,8 @@ struct normal_distribution { /******** Launch GPU function of distribution and transformation *********/ template __global__ void DistributionKernel(size_t size, uint64_t seed, uint64_t offset, - DistOp dist, TransformOp trans, - T *out_data) { + DistOp dist, TransformOp trans, T *out_data, + size_t stride) { size_t idx = static_cast(BLOCK_ID_X * BLOCK_NUM_X); static constexpr int kCount = DistOp::kReturnsCount; #if defined(__NVCC__) @@ -201,7 +201,8 @@ __global__ void DistributionKernel(size_t size, uint64_t seed, uint64_t offset, kps::ElementwiseUnary(&result[0], &args[0], trans); kps::WriteData(out_data + i, &result[0], size - i, - 1, total_thread, 1); + 1, stride, 1); + __syncthreads(); } } @@ -234,7 +235,7 @@ 
void distribution_and_transform(const platform::CUDADeviceContext &dev_ctx, DistributionKernel< T, DistOp, TransformOp><<>>( - size, seed, offset, dist, trans, out_data); + size, seed, offset, dist, trans, out_data, total_thread); } #endif diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.kps b/paddle/fluid/operators/elementwise/elementwise_add_op.kps new file mode 100644 index 0000000000000000000000000000000000000000..a3fea0d7b3dbf91cbe19c299edea3ffee77d3cbe --- /dev/null +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.kps @@ -0,0 +1,188 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +// Please do not modify the following code +#if defined(__CUDA_ARCH__) +#undef __CUDA_ARCH__ +#endif + +#if defined(__CUDACC__) +#undef __CUDACC__ +#endif + +#if defined(__CUDA__) +#undef __CUDA__ +#endif + +#if defined(__NVCC__) +#undef __NVCC__ +#endif + +#ifdef PADDLE_WITH_XPU_KP +#include // NOLINT +#include "xpu/kernel/cluster_header.h" // NOLINT +#include "xpu/kernel/debug.h" // NOLINT +#include "xpu/kernel/math.h" // NOLINT + +#include +#include +#include "paddle/fluid/operators/elementwise/elementwise_add_op.h" +#include "paddle/fluid/operators/elementwise/elementwise_op.h" +#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" +#include "paddle/fluid/operators/elementwise/elementwise_xpu.h" +#include "paddle/fluid/platform/device/device_wrapper.h" + +namespace paddle { +namespace operators { + +template +class ElementwiseAddXPUKPKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + std::vector ins; + std::vector outs; + int axis = PackTensorsIntoVector(ctx, &ins, &outs); + const auto& xpu_ctx = + ctx.template device_context(); + paddle::operators::LaunchElementwiseCudaKernel, 1>( + xpu_ctx, ins, &outs, axis, kps::AddFunctor()); + } +}; + +static std::vector get_rdims(const std::vector& xdims, + const std::vector& ydims) { + std::vector rdims; + for (size_t i = 0; i < xdims.size(); i++) { + if (xdims[i] != ydims[i]) { + rdims.push_back(i); + } + } + return rdims; +} + +template +class ElementwiseAddGradXPUKPKernel : public ElemwiseGradKernel { + using XPUType = typename XPUTypeTrait::Type; + + public: + void Compute(const framework::ExecutionContext& ctx) const override { + ElemwiseGradKernel::Compute(ctx); + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* dz = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dy = ctx.Output(framework::GradVarName("Y")); + const framework::DDim& x_dims = x->dims(); + const framework::DDim& y_dims = y->dims(); + const framework::DDim& dz_dims = dz->dims(); + int axis = ctx.Attr("axis"); + axis = (axis == -1 ? 
std::abs(x_dims.size() - y_dims.size()) : axis); + int max_dim = std::max(x_dims.size(), y_dims.size()); + PADDLE_ENFORCE_GE( + axis, 0, + platform::errors::InvalidArgument( + "Axis should be great than or equal to 0, but received axis is %d.", + axis)); + PADDLE_ENFORCE_LT( + axis, max_dim, + platform::errors::InvalidArgument( + "Axis should be less than %d, but received axis is %d.", max_dim, + axis)); + + std::vector x_dims_vec(max_dim, 1); + std::vector y_dims_vec(max_dim, 1); + std::vector z_dims_vec(max_dim, 1); + if (x_dims.size() == max_dim) { + for (int i = 0; i < max_dim; i++) { + x_dims_vec[i] = x_dims[i]; + } + } else { + for (int i = 0; i < x_dims.size(); i++) { + x_dims_vec[i + axis] = x_dims[i]; + } + } + + if (y_dims.size() == max_dim) { + for (int i = 0; i < max_dim; i++) { + y_dims_vec[i] = y_dims[i]; + } + } else { + for (int i = 0; i < y_dims.size(); i++) { + y_dims_vec[i + axis] = y_dims[i]; + } + } + + for (int i = 0; i < max_dim; i++) { + z_dims_vec[i] = dz_dims[i]; + } + std::vector rdims_for_x; + std::vector rdims_for_y; + rdims_for_x = get_rdims(x_dims_vec, z_dims_vec); + rdims_for_y = get_rdims(y_dims_vec, z_dims_vec); + const T* dz_data = dz->data(); + auto& dev_ctx = + ctx.template device_context(); + + if (dx != nullptr) { + T* dx_data = dx->mutable_data(ctx.GetPlace()); + if (rdims_for_x.size() == 0) { + if (dx_data != dz_data) { + framework::TensorCopy( + *dz, ctx.GetPlace(), + ctx.template device_context(), dx); + } + } else { + // For inplace strategy, dx will be stored in addr of dz, which makes + // the result of dy wrong. + if (dx->IsSharedBufferWith(*dz)) { + dx->clear(); + dx->mutable_data(x->dims(), ctx.GetPlace()); + } + + int ret = xpu::reduce_sum( + dev_ctx.x_context(), reinterpret_cast(dz_data), + reinterpret_cast(dx_data), z_dims_vec, rdims_for_x); + PADDLE_ENFORCE_XDNN_SUCCESS(ret, "reduce_sum "); + } + } + + if (dy != nullptr) { + T* dy_data = dy->mutable_data(ctx.GetPlace()); + if (rdims_for_y.size() == 0) { + if (dy_data != dz_data) { + framework::TensorCopy( + *dz, ctx.GetPlace(), + ctx.template device_context(), dy); + } + } else { + int ret = xpu::reduce_sum( + dev_ctx.x_context(), reinterpret_cast(dz_data), + reinterpret_cast(dy_data), z_dims_vec, rdims_for_y); + PADDLE_ENFORCE_XDNN_SUCCESS(ret, "reduce_sum "); + } + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_KERNEL(elementwise_add, KP, plat::XPUPlace, + ops::ElementwiseAddXPUKPKernel); + +REGISTER_OP_KERNEL(elementwise_add_grad, KP, plat::XPUPlace, + ops::ElementwiseAddGradXPUKPKernel); + +#endif // PADDLE_WITH_XPU_KP diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op.cu b/paddle/fluid/operators/elementwise/elementwise_div_op.cu index 06f9107db27b4f2cce54bbcabe3c53e81e4167d1..9eb4b0352e5337e3fdd758d2e95cfa61d1d62724 100644 --- a/paddle/fluid/operators/elementwise/elementwise_div_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_div_op.cu @@ -53,6 +53,8 @@ REGISTER_OP_CUDA_KERNEL( ops::ElementwiseDivKernel, ops::ElementwiseDivKernel, + ops::ElementwiseDivKernel, ops::ElementwiseDivKernel, ops::ElementwiseDivKernel, ops::ElementwiseDivKernel, @@ -65,6 +67,8 @@ REGISTER_OP_CUDA_KERNEL( ops::ElementwiseDivGradKernel, ops::ElementwiseDivGradKernel, + ops::ElementwiseDivGradKernel, ops::ElementwiseDivGradKernel, ops::ElementwiseDivGradKernel, ops::ElementwiseDivGradKernel, @@ -78,6 +82,8 @@ REGISTER_OP_CUDA_KERNEL( float>, ops::ElementwiseDivDoubleGradKernel, + 
ops::ElementwiseDivDoubleGradKernel, ops::ElementwiseDivDoubleGradKernel, ops::ElementwiseDivDoubleGradKernel +#include "paddle/fluid/framework/op_registry.h" + namespace paddle { namespace operators { @@ -91,14 +92,3 @@ REGISTER_OPERATOR( ::paddle::framework::EmptyGradOpMaker, ::paddle::framework::EmptyGradOpMaker, ops::FillAnyLikeVarTypeInference) - -REGISTER_OP_CPU_KERNEL( - fill_any_like, - ops::FillAnyLikeKernel, - ops::FillAnyLikeKernel, - ops::FillAnyLikeKernel, - ops::FillAnyLikeKernel, - ops::FillAnyLikeKernel, - ops::FillAnyLikeKernel, - ops::FillAnyLikeKernel); diff --git a/paddle/fluid/operators/fill_any_like_op.cu b/paddle/fluid/operators/fill_any_like_op.cu deleted file mode 100644 index 3ebc0ad7c8ec53b5c3de68823d9ba943e49bd364..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/fill_any_like_op.cu +++ /dev/null @@ -1,29 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/fill_any_like_op.h" -#include "paddle/fluid/platform/float16.h" - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - fill_any_like, - ops::FillAnyLikeKernel, - ops::FillAnyLikeKernel, - ops::FillAnyLikeKernel, - ops::FillAnyLikeKernel, - ops::FillAnyLikeKernel, - ops::FillAnyLikeKernel, - ops::FillAnyLikeKernel); diff --git a/paddle/fluid/operators/fill_any_like_op.h b/paddle/fluid/operators/fill_any_like_op.h deleted file mode 100644 index 36b56394b6f1b1198c65cb7a6a6046d223b31922..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/fill_any_like_op.h +++ /dev/null @@ -1,74 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/pten_utils.h" - -#include "paddle/phi/kernels/full_kernel.h" - -namespace paddle { -namespace operators { - -template -class FillAnyLikeKernel : public framework::OpKernel { - public: - using CommonType = typename std::common_type< - float, - typename std::conditional::value, - float, T>::type>::type; - - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* out = context.Output("Out"); - out->mutable_data(context.GetPlace()); - - // TODO(fangzeyang): Once context.Attribute supports double dtype, this - // kernel should be updated to support double dtype, too. 
- float value = context.Attr("value"); - - auto common_type_value = static_cast(value); - - PADDLE_ENFORCE_EQ( - (common_type_value >= - static_cast(std::numeric_limits::lowest())) && - (common_type_value <= - static_cast(std::numeric_limits::max())), - true, - platform::errors::InvalidArgument( - "The filled value is out of range for target type, " - "current kernel type is %s, the range should between %f " - "and %f, but now value is %f.", - typeid(T).name(), - static_cast(std::numeric_limits::lowest()), - static_cast(std::numeric_limits::max()), value)); - - PADDLE_ENFORCE_EQ( - std::isnan(value), false, - platform::errors::InvalidArgument("The filled value is NaN.")); - - const auto& dev_ctx = context.template device_context(); - // call new kernel - phi::FullLikeKernel( - static_cast::TYPE&>(dev_ctx), - *x, value, phi::DataType::UNDEFINED, out); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/fill_any_like_op_npu.cc b/paddle/fluid/operators/fill_any_like_op_npu.cc index a584c1341dc0f280d483d5677ef2276b43c003d2..2a914ff2ebd33024d80f8d88fde97f70a2f203a7 100644 --- a/paddle/fluid/operators/fill_any_like_op_npu.cc +++ b/paddle/fluid/operators/fill_any_like_op_npu.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/fill_any_like_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/fill_any_like_op_xpu.cc b/paddle/fluid/operators/fill_any_like_op_xpu.cc index 693d4431b2ec8e0546dfe125d3d7bd00f70993c9..896310cd0918b118db003d784daca87c49c5ab32 100644 --- a/paddle/fluid/operators/fill_any_like_op_xpu.cc +++ b/paddle/fluid/operators/fill_any_like_op_xpu.cc @@ -14,7 +14,7 @@ limitations under the License. */ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/fill_any_like_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/kernels/full_kernel.h" diff --git a/paddle/fluid/operators/flip_op.cc b/paddle/fluid/operators/flip_op.cc index 3f6171b8a07b023b547caaaff31f04a988885bd9..fc03ef0afae51ec2c55ebf6f5a36c57b089093a9 100644 --- a/paddle/fluid/operators/flip_op.cc +++ b/paddle/fluid/operators/flip_op.cc @@ -12,12 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/flip_op.h" #include #include #include + +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" -#include "paddle/fluid/platform/complex.h" namespace paddle { namespace operators { @@ -29,6 +29,7 @@ class FlipOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; + // TODO move to phi kernel void InferShape(framework::InferShapeContext* ctx) const override { PADDLE_ENFORCE_EQ( ctx->HasInput("X"), true, @@ -150,14 +151,6 @@ namespace plat = paddle::platform; REGISTER_OPERATOR(flip, ops::FlipOp, ops::FlipOpMaker, ops::FlipOpInferVarType, ops::FlipOpGradMaker, ops::FlipOpGradMaker); -REGISTER_OP_CPU_KERNEL( - flip, ops::FlipKernel, - ops::FlipKernel, - ops::FlipKernel, - ops::FlipKernel, - ops::FlipKernel, - ops::FlipKernel>, - ops::FlipKernel>); /* ========================== register checkpoint ===========================*/ REGISTER_OP_VERSION(flip) diff --git a/paddle/fluid/operators/flip_op.cu b/paddle/fluid/operators/flip_op.cu deleted file mode 100644 index b9f8b16214fe476622263f914c7e818bef91ba92..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/flip_op.cu +++ /dev/null @@ -1,129 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/flip_op.h" - -#include -#include "paddle/fluid/memory/malloc.h" -#include "paddle/fluid/platform/complex.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using CUDADeviceContext = paddle::platform::CUDADeviceContext; - -template -__global__ void flip_cuda_kernel(const int N, const T* in_data, T* out_data, - int64_t* x_shape, int64_t* x_stride, - int* flip_dims, int flip_dims_size, - int total_dims) { - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx >= N) { - return; - } - - int cur_indices = idx, rem = 0, dst_offset = 0; - for (int i = 0; i < total_dims; ++i) { - int64_t temp = cur_indices; - cur_indices = cur_indices / x_stride[i]; - rem = temp - cur_indices * x_stride[i]; - // flip the indices if it is in flip_dims - for (int j = 0; j < flip_dims_size; ++j) { - if (i == flip_dims[j]) { - cur_indices = x_shape[i] - 1 - cur_indices; - } - } - dst_offset += cur_indices * x_stride[i]; - cur_indices = rem; - } - out_data[idx] = in_data[dst_offset]; -} - -template -class FlipKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const auto gplace = ctx.GetPlace(); - auto cplace = platform::CPUPlace(); - auto& dev_ctx = ctx.template device_context(); - - const Tensor* x = ctx.Input("X"); - Tensor* out = ctx.Output("Out"); - auto* in_data = x->data(); - auto* out_data = out->mutable_data(ctx.GetPlace()); - auto flip_dims = ctx.template Attr>("axis"); - - const int flip_dims_size = static_cast(flip_dims.size()); - auto x_dims = x->dims(); - const int total_dims = x_dims.size(); - const int N = x->numel(); - - int block_size = 512; - dim3 dim_block(block_size); - dim3 dim_grid((N + block_size - 1) / block_size); - - for (size_t i = 0; i < flip_dims.size(); ++i) { - if (flip_dims[i] < 0) { - flip_dims[i] += total_dims; - } - } - - auto x_stride = phi::stride(x_dims); - std::vector x_dims_v = phi::vectorize(x_dims); - std::vector x_stride_v = phi::vectorize(x_stride); - - int bytes = total_dims * sizeof(int64_t); - auto x_strides_array_tmp = memory::Alloc(dev_ctx, bytes); - int64_t* x_strides_array_gpu = - reinterpret_cast(x_strides_array_tmp->ptr()); - memory::Copy(gplace, x_strides_array_gpu, cplace, x_stride_v.data(), bytes, - dev_ctx.stream()); - - auto x_shape_array_tmp = memory::Alloc(dev_ctx, bytes); - int64_t* x_shape_array_gpu = - reinterpret_cast(x_shape_array_tmp->ptr()); - memory::Copy(gplace, x_shape_array_gpu, cplace, x_dims_v.data(), bytes, - dev_ctx.stream()); - - bytes = flip_dims_size * sizeof(int); - auto flip_dims_array_tmp = memory::Alloc(dev_ctx, bytes); - int* flip_dims_array_gpu = - reinterpret_cast(flip_dims_array_tmp->ptr()); - memory::Copy(gplace, flip_dims_array_gpu, cplace, flip_dims.data(), bytes, - dev_ctx.stream()); - - flip_cuda_kernel< - T><<>>( - N, in_data, out_data, x_shape_array_gpu, x_strides_array_gpu, - flip_dims_array_gpu, flip_dims_size, total_dims); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL( - flip, ops::FlipKernel, - ops::FlipKernel, - ops::FlipKernel, - ops::FlipKernel, - ops::FlipKernel, - ops::FlipKernel, - ops::FlipKernel>, - ops::FlipKernel>); diff --git a/paddle/fluid/operators/flip_op.h b/paddle/fluid/operators/flip_op.h deleted file mode 100644 index 3c00df5f67d19a9a58a3fe2f4ed2f64f34128063..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/flip_op.h +++ 
/dev/null @@ -1,83 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -constexpr size_t dim_bitset_size = 64; - -template -class FlipKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override; -}; - -template -class FlipKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const Tensor* x = ctx.Input("X"); - Tensor* out = ctx.Output("Out"); - auto flip_dims = ctx.template Attr>("axis"); - - auto x_dims = x->dims(); - const int total_dims = x_dims.size(); - std::bitset dim_bitset; - for (size_t i = 0; i < flip_dims.size(); ++i) { - int dim = flip_dims[i]; - if (flip_dims[i] < 0) { - dim += total_dims; - } - dim_bitset[dim] = true; - } - auto x_strides = phi::stride(x_dims); - auto numel = x->numel(); - const T* x_data = x->data(); - T* out_data = out->mutable_data(ctx.GetPlace()); -#ifdef PADDLE_WITH_MKLML -#pragma omp parallel for -#endif - for (int64_t i = 0; i < numel; ++i) { - int64_t cur_indices = i; - int64_t rem = 0; - int64_t dst_offset = 0; - - for (int d = 0; d < total_dims; ++d) { - int64_t temp = cur_indices; - cur_indices = cur_indices / x_strides[d]; - rem = temp - cur_indices * x_strides[d]; - dst_offset += dim_bitset[d] - ? (x_dims[d] - 1 - cur_indices) * x_strides[d] - : cur_indices * x_strides[d]; - cur_indices = rem; - } - out_data[i] = x_data[dst_offset]; - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/fold_op.cc b/paddle/fluid/operators/fold_op.cc index 7b97663c387ca0636e989f4ccb0d9223fb969f44..40ec9aef190ff4bacd52b19a1c0b12300a35b61e 100644 --- a/paddle/fluid/operators/fold_op.cc +++ b/paddle/fluid/operators/fold_op.cc @@ -13,7 +13,6 @@ * limitations under the License. */ #include "paddle/fluid/operators/fold_op.h" -#include "paddle/fluid/operators/unfold_op.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/fused/mkldnn/multi_gru_mkldnn_op.cc b/paddle/fluid/operators/fused/mkldnn/multi_gru_mkldnn_op.cc index 177e8f5bcb7bdd1af907c397bfb75db8dd014d88..0ffc4c91b851c12a5329ae5b27bd3300753896a9 100644 --- a/paddle/fluid/operators/fused/mkldnn/multi_gru_mkldnn_op.cc +++ b/paddle/fluid/operators/fused/mkldnn/multi_gru_mkldnn_op.cc @@ -16,6 +16,7 @@ limitations under the License. 
*/ #include #include #include "dnnl.hpp" +#include "paddle/fluid/framework/mixed_vector.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/operators/fused/multi_gru_op.h" #include "paddle/fluid/platform/errors.h" diff --git a/paddle/fluid/operators/gaussian_random_op.cu b/paddle/fluid/operators/gaussian_random_op.cu index fa9fe9d8602012f71ca6829e58561d03b7bfb2f1..21d827c79200c4a368ce7677b01b18ee4ddedb8d 100644 --- a/paddle/fluid/operators/gaussian_random_op.cu +++ b/paddle/fluid/operators/gaussian_random_op.cu @@ -21,6 +21,7 @@ limitations under the License. */ #include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/distribution_helper.h" #include "paddle/fluid/operators/fill_constant_op.h" +#include "paddle/fluid/operators/index_impl.cu.h" DECLARE_bool(use_curand); @@ -65,7 +66,6 @@ class GPUGaussianRandomKernel : public framework::OpKernel { } T mean = static_cast(context.Attr("mean")); T std = static_cast(context.Attr("std")); - thrust::counting_iterator index_sequence_begin(0); auto shape = GetShape(context); tensor->Resize(shape); @@ -88,15 +88,13 @@ class GPUGaussianRandomKernel : public framework::OpKernel { } else { auto seed_offset = gen_cuda->IncrementOffset(1); int64_t gen_offset = size * seed_offset.second; - thrust::transform( - index_sequence_begin, index_sequence_begin + size, - thrust::device_ptr(data), - GaussianGenerator(mean, std, seed_offset.first, gen_offset)); + auto func = + GaussianGenerator(mean, std, seed_offset.first, gen_offset); + IndexKernel>(dev_cxt, tensor, func); } } else { - thrust::transform(index_sequence_begin, index_sequence_begin + size, - thrust::device_ptr(data), - GaussianGenerator(mean, std, seed)); + auto func = GaussianGenerator(mean, std, seed); + IndexKernel>(dev_cxt, tensor, func); } } }; @@ -116,23 +114,22 @@ class GPUGaussianRandomBatchSizeLikeKernel : public framework::OpKernel { } T mean = static_cast(context.Attr("mean")); T std = static_cast(context.Attr("std")); - thrust::counting_iterator index_sequence_begin(0); int64_t size = tensor->numel(); int device_id = context.GetPlace().GetDeviceId(); auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id); + auto& dev_cxt = + context.template device_context(); if (gen_cuda->GetIsInitPy() && seed_flag) { auto seed_offset = gen_cuda->IncrementOffset(1); int64_t gen_offset = size * seed_offset.second; - thrust::transform(index_sequence_begin, index_sequence_begin + size, - thrust::device_ptr(data), - GaussianGenerator(mean, std, seed_offset.first, - seed_offset.second)); + auto func = GaussianGenerator(mean, std, seed_offset.first, + seed_offset.second); + IndexKernel>(dev_cxt, tensor, func); } else { - thrust::transform(index_sequence_begin, index_sequence_begin + size, - thrust::device_ptr(data), - GaussianGenerator(mean, std, seed)); + auto func = GaussianGenerator(mean, std, seed); + IndexKernel>(dev_cxt, tensor, func); } } }; diff --git a/paddle/fluid/operators/huber_loss_op.cc b/paddle/fluid/operators/huber_loss_op.cc index 041f7487fd2575faa2407ea90c064a2cfdea96c5..3915ce5809c394738c58e80accccac531c268c23 100644 --- a/paddle/fluid/operators/huber_loss_op.cc +++ b/paddle/fluid/operators/huber_loss_op.cc @@ -12,47 +12,20 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/huber_loss_op.h" #include #include #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/infermeta/binary.h" + namespace paddle { namespace operators { class HuberLossOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "HuberLoss"); - OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "HuberLoss"); - - auto x_dims = ctx->GetInputDim("X"); - auto y_dims = ctx->GetInputDim("Y"); - - PADDLE_ENFORCE_EQ(x_dims.size(), y_dims.size(), - platform::errors::InvalidArgument( - "Input(input) rank and Input(label) rank should be " - "same, but received input rank(%d) != label rank(%d)", - x_dims.size(), y_dims.size())); - - bool contain_unknown_dim = - phi::contain_unknown_dim(x_dims) || phi::contain_unknown_dim(y_dims); - if (ctx->IsRuntime() || !contain_unknown_dim) { - PADDLE_ENFORCE_EQ( - x_dims, y_dims, - platform::errors::InvalidArgument( - "The Input(input) and Input(label) should have the same " - "shape, but received input shape [%s] != label shape [%s]", - x_dims, y_dims)); - } - - auto out_dims = y_dims; - ctx->SetOutputDim("Residual", out_dims); - ctx->SetOutputDim("Out", out_dims); - ctx->ShareLoD("X", "Out"); - } }; template @@ -139,14 +112,11 @@ class HuberLossGradOpMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; +DELCARE_INFER_SHAPE_FUNCTOR(huber_loss, HuberLossInferShapeFunctor, + PT_INFER_META(phi::HuberLossInferMeta)); + REGISTER_OPERATOR(huber_loss, ops::HuberLossOp, ops::HuberLossOpMaker, ops::HuberLossGradOpMaker, - ops::HuberLossGradOpMaker); + ops::HuberLossGradOpMaker, + HuberLossInferShapeFunctor); REGISTER_OPERATOR(huber_loss_grad, ops::HuberLossGradOp); -REGISTER_OP_CPU_KERNEL( - huber_loss, ops::HuberLossKernel, - ops::HuberLossKernel); -REGISTER_OP_CPU_KERNEL( - huber_loss_grad, - ops::HuberLossGradKernel, - ops::HuberLossGradKernel); diff --git a/paddle/fluid/operators/huber_loss_op.h b/paddle/fluid/operators/huber_loss_op.h deleted file mode 100644 index ebe26f05ab3e47245176614fb2ce57c264ebf5f5..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/huber_loss_op.h +++ /dev/null @@ -1,123 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/core/hostdevice.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -template -using EigenVector = framework::EigenVector; - -template -struct HuberLossForward { - HOSTDEVICE HuberLossForward(const T& delta) : delta(delta) {} - - HOSTDEVICE T operator()(const T& val) const { - T abs_val = std::abs(val); - if (abs_val <= delta) { - return static_cast(0.5) * val * val; - } else { - return delta * (abs_val - static_cast(0.5) * delta); - } - } - - T delta; -}; - -template -class HuberLossKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* in0 = context.Input("X"); - auto* in1 = context.Input("Y"); - auto* out0 = context.Output("Residual"); - auto* out1 = context.Output("Out"); - auto delta = static_cast(context.Attr("delta")); - auto& place = - *context.template device_context().eigen_device(); - - auto x = EigenVector::Flatten(*in0); - auto y = EigenVector::Flatten(*in1); - out0->mutable_data(context.GetPlace()); - auto residual = EigenVector::Flatten(*out0); - residual.device(place) = y - x; - out1->mutable_data(context.GetPlace()); - auto loss = EigenVector::Flatten(*out1); - loss.device(place) = residual.unaryExpr(HuberLossForward(delta)); - } -}; - -template -struct HuberLossBackward { - HOSTDEVICE HuberLossBackward(const T& delta, T sign) - : sign(sign), delta(delta) {} - - HOSTDEVICE T operator()(const T& val) const { - T abs_val = std::abs(val); - if (abs_val <= delta) { - return sign * val; - } else { - if (val > 0) { - return sign * delta; - } else { - return -1 * sign * delta; - } - } - } - - T sign; - T delta; -}; - -template -class HuberLossGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* in0 = context.Input("Residual"); - auto* in1 = context.Input(framework::GradVarName("Out")); - auto* out0 = context.Output(framework::GradVarName("X")); - auto* out1 = context.Output(framework::GradVarName("Y")); - auto delta = static_cast(context.Attr("delta")); - auto& place = - *context.template device_context().eigen_device(); - - auto residual = EigenVector::Flatten(*in0); - auto out_grad = EigenVector::Flatten(*in1); - - if (out0) { - out0->mutable_data(context.GetPlace()); - auto x_grad = EigenVector::Flatten(*out0); - x_grad.device(place) = - residual.unaryExpr(HuberLossBackward(delta, -1.0)); - x_grad.device(place) = out_grad * x_grad; - } - - if (out1) { - out1->mutable_data(context.GetPlace()); - auto y_grad = EigenVector::Flatten(*out1); - y_grad.device(place) = - residual.unaryExpr(HuberLossBackward(delta, 1.0)); - y_grad.device(place) = out_grad * y_grad; - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/huber_loss_op_npu.cc b/paddle/fluid/operators/huber_loss_op_npu.cc index 19ced131c00a2a861a5140697b8a199f013ad5bf..6fc6960d3db565d698b252347e5734f949e16211 100644 --- a/paddle/fluid/operators/huber_loss_op_npu.cc +++ b/paddle/fluid/operators/huber_loss_op_npu.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/huber_loss_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/huber_loss_op_xpu.cc b/paddle/fluid/operators/huber_loss_op_xpu.cc index 767ce542736e831e2ea587fc765ed6c0baf96589..ccddec2779515f26db10440633ab9d9894537182 100644 --- a/paddle/fluid/operators/huber_loss_op_xpu.cc +++ b/paddle/fluid/operators/huber_loss_op_xpu.cc @@ -13,8 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #ifdef PADDLE_WITH_XPU - -#include "paddle/fluid/operators/huber_loss_op.h" +#include "paddle/fluid/framework/op_registry.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/imag_op.cc b/paddle/fluid/operators/imag_op.cc index 6a195bb9400e89ef09bc7ca2c08637eeb505dda2..33b68d68992dd819f74c2ae67153ecc6b050b16b 100644 --- a/paddle/fluid/operators/imag_op.cc +++ b/paddle/fluid/operators/imag_op.cc @@ -12,7 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/imag_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -20,15 +23,6 @@ namespace operators { class ImagOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Imag"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Imag"); - - auto x_dims = ctx->GetInputDim("X"); - ctx->SetOutputDim("Out", x_dims); - ctx->ShareLoD("X", "Out"); - } }; class ImagOpMaker : public framework::OpProtoAndCheckerMaker { @@ -88,19 +82,13 @@ DECLARE_INPLACE_OP_INFERER(ImagGradOpInplaceInferer, } // namespace operators } // namespace paddle +DELCARE_INFER_SHAPE_FUNCTOR(imag, ImagInferShapeFunctor, + PT_INFER_META(phi::UnchangedInferMeta)); + namespace ops = paddle::operators; REGISTER_OPERATOR(imag, ops::ImagOp, ops::ImagOpMaker, ops::ImagGradOpMaker, - ops::ImagGradOpMaker); + ops::ImagGradOpMaker, + ImagInferShapeFunctor); REGISTER_OPERATOR(imag_grad, ops::ImagGradOp); - -REGISTER_OP_CPU_KERNEL(imag, ops::ImagKernel>, - ops::ImagKernel>); -REGISTER_OP_CPU_KERNEL(imag_grad, - ops::ImagGradKernel>, - ops::ImagGradKernel>); diff --git a/paddle/fluid/operators/imag_op.cu b/paddle/fluid/operators/imag_op.cu deleted file mode 100644 index 9cfb2ef7f2fef6b25322ba76bedadae3c6ca8d87..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/imag_op.cu +++ /dev/null @@ -1,28 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/imag_op.h" - -namespace ops = paddle::operators; - -REGISTER_OP_CUDA_KERNEL(imag, - ops::ImagKernel>, - ops::ImagKernel>); -REGISTER_OP_CUDA_KERNEL(imag_grad, - ops::ImagGradKernel>, - ops::ImagGradKernel>); diff --git a/paddle/fluid/operators/imag_op.h b/paddle/fluid/operators/imag_op.h deleted file mode 100644 index 33eab2abb74e177eb863989cd6a1e8132ad09e8c..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/imag_op.h +++ /dev/null @@ -1,67 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/for_range.h" -#include "paddle/phi/kernels/funcs/complex_functors.h" - -namespace paddle { -namespace operators { - -template -class ImagKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const { - const framework::Tensor* x = ctx.Input("X"); - framework::Tensor* out = ctx.Output("Out"); - - auto numel = x->numel(); - auto* x_data = x->data(); - auto* out_data = out->mutable_data>( - ctx.GetPlace(), - static_cast(numel * sizeof(phi::funcs::Real))); - - auto& dev_ctx = ctx.template device_context(); - platform::ForRange for_range(dev_ctx, numel); - phi::funcs::ImagFunctor functor(x_data, out_data, numel); - for_range(functor); - } -}; - -template -class ImagGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const { - const framework::Tensor* d_out = - ctx.Input(framework::GradVarName("Out")); - framework::Tensor* d_x = - ctx.Output(framework::GradVarName("X")); - - auto numel = d_out->numel(); - auto* dout_data = d_out->data>(); - auto* dx_data = d_x->mutable_data( - ctx.GetPlace(), static_cast(numel * sizeof(T))); - - auto& dev_ctx = ctx.template device_context(); - platform::ForRange for_range(dev_ctx, numel); - phi::funcs::ImagToComplexFunctor functor(dout_data, dx_data, numel); - for_range(functor); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/index_impl.cu.h b/paddle/fluid/operators/index_impl.cu.h new file mode 100644 index 0000000000000000000000000000000000000000..3d6a5e0ea88a28addaf09d90cae9659cbea85305 --- /dev/null +++ b/paddle/fluid/operators/index_impl.cu.h @@ -0,0 +1,97 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#pragma once +#include +#include +#include +#include "paddle/fluid/framework/generator.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/operators/amp/fp16_type_traits.h" +#include "paddle/fluid/operators/distribution_helper.h" +#include "paddle/fluid/operators/fill_constant_op.h" +#include "paddle/fluid/platform/aligned_vector.h" +#include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/core/hostdevice.h" +#include "paddle/phi/kernels/primitive/kernel_primitives.h" + +namespace paddle { +namespace operators { + +namespace kps = phi::kps; +template +__global__ void VectorizedIndexKernel(T *out, int numel, int main_offset, + Functor func) { + int data_offset = BLOCK_ID_X * BLOCK_NUM_X * VecSize; + int stride = BLOCK_NUM_X * GRID_NUM_X * VecSize; + int args[VecSize]; + T result[VecSize]; + for (; data_offset < main_offset; data_offset += stride) { + kps::InitWithDataIndex(&args[0], data_offset); + kps::ElementwiseUnary(&result[0], &args[0], + func); + kps::WriteData(out + data_offset, &result[0], + BLOCK_NUM_X * VecSize); + } + int num = numel - data_offset; + if (num > 0) { + kps::InitWithDataIndex(&args[0], data_offset); + kps::ElementwiseUnary(&result[0], &args[0], + func); + kps::WriteData(out + data_offset, &result[0], num); + } +} + +template +void IndexKernel(const KPDevice &dev_ctx, Tensor *out, Functor func) { + int numel = out->numel(); + T *out_data = out->mutable_data(dev_ctx.GetPlace()); + if (numel <= 0) return; + int vec_size = paddle::platform::GetVectorizedSize((out->data())); +#ifdef PADDLE_WITH_XPU_KP + int block = 64; + int grid = 8; + auto stream = dev_ctx.x_context()->xpu_stream; +#else + auto config = + phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, numel, vec_size); + int grid = config.block_per_grid.x; + int block = config.thread_per_block.x; + auto stream = dev_ctx.stream(); +#endif + + int main_offset = (numel / (vec_size * block)) * vec_size * block; + switch (vec_size) { + case 4: + VectorizedIndexKernel<<>>( + out_data, numel, main_offset, func); + break; + case 2: + VectorizedIndexKernel<<>>( + out_data, numel, main_offset, func); + break; + case 1: + VectorizedIndexKernel<<>>( + out_data, numel, main_offset, func); + break; + default: { + PADDLE_THROW(paddle::platform::errors::Unimplemented( + "Unsupported vectorized size: %d !", vec_size)); + break; + } + } +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/interpolate_op.cc b/paddle/fluid/operators/interpolate_op.cc index 8fac84176d97fd371ddfac25dab2aee8c098607a..fda168c94e1e064c65e3b5fcf56b606772345b9d 100644 --- a/paddle/fluid/operators/interpolate_op.cc +++ b/paddle/fluid/operators/interpolate_op.cc @@ -328,7 +328,7 @@ class InterpolateOp : public framework::OperatorWithKernel { auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); #ifdef PADDLE_WITH_MKLDNN - auto interp_method = ctx.Attr("interp_method"); + const auto& interp_method = ctx.Attr("interp_method"); // TODO(danqing): support other interp_method if (this->CanMKLDNNBeUsed(ctx, data_type) && (interp_method == "nearest" || interp_method == "bilinear")) { diff --git a/paddle/fluid/operators/interpolate_v2_op.cc b/paddle/fluid/operators/interpolate_v2_op.cc index 7783303785998e9db05a5f5117a047e2729de848..4b5a18141d5aa9ac5d1f5354fafbad0e38bb8474 100644 --- a/paddle/fluid/operators/interpolate_v2_op.cc +++ b/paddle/fluid/operators/interpolate_v2_op.cc @@ -414,7 +414,7 @@ class InterpolateV2Op : public 
framework::OperatorWithKernel { auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); #ifdef PADDLE_WITH_MKLDNN - auto interp_method = ctx.Attr("interp_method"); + const auto& interp_method = ctx.Attr("interp_method"); // TODO(danqing): support other interp_method if (this->CanMKLDNNBeUsed(ctx, data_type) && (interp_method == "nearest" || interp_method == "bilinear")) { diff --git a/paddle/fluid/operators/interpolate_v2_op.cu b/paddle/fluid/operators/interpolate_v2_op.cu index 6e9d6a1995474812abe333137cc75ba90a2b4fac..d61eb46d97e98972963f5871a4c6e7b06468337c 100644 --- a/paddle/fluid/operators/interpolate_v2_op.cu +++ b/paddle/fluid/operators/interpolate_v2_op.cu @@ -210,32 +210,66 @@ __global__ void KeNearestNeighbor3DInterpFw( } } +template +__global__ void KeNearestNeighborInterpNCHWBw( + T* in, const size_t in_img_h, const size_t in_img_w, const T* out, + const size_t out_img_h, const size_t out_img_w, const size_t nc, + const float ratio_h, const float ratio_w, const bool align_corners) { + int out_img_idx = threadIdx.x + blockIdx.x * blockDim.x; + int out_img_idy = threadIdx.y + blockIdx.y * blockDim.y; + int nc_id = threadIdx.z + blockIdx.z * blockDim.z; + int nc_stride = blockDim.z * gridDim.z; + + // nearest_sampling by multiple read in_addr and write to out_addr + int in_img_idx = (align_corners) + ? static_cast(ratio_w * out_img_idx + 0.5) + : static_cast(ratio_w * out_img_idx); + int in_img_idy = (align_corners) + ? static_cast(ratio_h * out_img_idy + 0.5) + : static_cast(ratio_h * out_img_idy); + + int in_index = (nc_id * in_img_h + in_img_idy) * in_img_w + in_img_idx; + int in_index_stride = nc_stride * in_img_h * in_img_w; + + int out_index = (nc_id * out_img_h + out_img_idy) * out_img_w + out_img_idx; + int out_index_stride = nc_stride * out_img_h * out_img_w; + + // prevent from multiple threads writing + if (out_img_idx < out_img_w && out_img_idy < out_img_h) { + while (nc_id < nc) { + T* in_pos = &in[in_index]; + const T out_pos = out[out_index]; + platform::CudaAtomicAdd(in_pos, out_pos); + in_index += in_index_stride; + out_index += out_index_stride; + nc_id += nc_stride; + } + } +} + template __global__ void KeNearestNeighborInterpBw( T* in, const size_t in_img_h, const size_t in_img_w, const size_t input_h, const size_t input_w, const T* out, const size_t out_img_h, const size_t out_img_w, const size_t output_h, const size_t output_w, const size_t num_channels, const float ratio_h, const float ratio_w, - const bool align_corners, const DataLayout data_layout) { + const bool align_corners, FastDivModForInterpolate divmods) { int nthreads = output_h * output_w; int tid = blockIdx.x * blockDim.x + threadIdx.x; int stride = blockDim.x * gridDim.x; + int in_img_size = in_img_h * in_img_w; + int out_img_size = out_img_h * out_img_w; + for (; tid < nthreads; tid += stride) { - int out_id_h = tid / output_w; - int out_id_w = tid % output_w; - int in_img_size = input_w / num_channels; - int out_img_size = output_w / num_channels; + auto out_id_divmod = divmods.output_w_div.Divmod(tid); + int out_id_h = out_id_divmod.val[0]; + int out_id_w = out_id_divmod.val[1]; - int channel_id, out_img_idy, out_img_idx; - if (data_layout == DataLayout::kNCHW) { - channel_id = out_id_w / out_img_size; - out_img_idy = (out_id_w % out_img_size) / out_img_w; - out_img_idx = tid % out_img_w; - } else { - out_img_idy = out_id_w / (out_img_w * num_channels); - out_img_idx = out_id_w % (out_img_w * num_channels) / num_channels; - channel_id = tid % num_channels; - } + int channel_id = 
divmods.channels_div.Divmod(tid).val[1]; + auto outimg_id_divmod = divmods.output_wc_div.Divmod(out_id_w); + int out_img_idy = outimg_id_divmod.val[0]; + int out_img_idx = + divmods.channels_div.Divmod(outimg_id_divmod.val[1]).val[0]; int in_img_idy = (align_corners) ? static_cast(ratio_h * out_img_idy + 0.5) @@ -244,15 +278,10 @@ __global__ void KeNearestNeighborInterpBw( ? static_cast(ratio_w * out_img_idx + 0.5) : static_cast(ratio_w * out_img_idx); - T* in_pos; - if (data_layout == DataLayout::kNCHW) { - in_pos = &in[out_id_h * input_w + channel_id * in_img_size + - in_img_idy * in_img_w + in_img_idx]; - } else { - in_pos = &in[out_id_h * input_w + in_img_idy * in_img_w * num_channels + - in_img_idx * num_channels + channel_id]; - } - const T out_pos = out[out_id_h * output_w + out_id_w]; + T* in_pos = &in[out_id_h * input_w + in_img_idy * in_img_w * num_channels + + in_img_idx * num_channels + channel_id]; + + const T out_pos = out[tid]; platform::CudaAtomicAdd(in_pos, out_pos); } } @@ -1842,11 +1871,26 @@ static void Interpolate2DCUDABwd(const framework::ExecutionContext& ctx, platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), pixelNum); if ("nearest" == interp_method) { - KeNearestNeighborInterpBw< - T><<>>( - input_grad_data, in_h, in_w, n, in_chw, output_grad_data, out_h, out_w, - n, out_chw, c, ratio_h, ratio_w, align_corners, data_layout); + if (data_layout == DataLayout::kNCHW) { + // get launch 3D config + int nc = n * c; + platform::GpuLaunchConfig config_3d = + GetGpuLaunchConfig3D(ctx.cuda_device_context(), nc, out_h, out_w); + KeNearestNeighborInterpNCHWBw< + T><<>>( + input_grad_data, in_h, in_w, output_grad_data, out_h, out_w, nc, + ratio_h, ratio_w, align_corners); + } else { + int64_t cw = c * out_w; + auto interp_divmods = FastDivModForInterpolate(c, out_chw, cw); + KeNearestNeighborInterpBw< + T><<>>( + input_grad_data, in_h, in_w, n, in_chw, output_grad_data, out_h, + out_w, n, out_chw, c, ratio_h, ratio_w, align_corners, + interp_divmods); + } } else if ("bilinear" == interp_method) { const T align_type_value = (align_mode == 0 && !align_corners) ? 0.5f : 0; bool is_nchw = (data_layout == DataLayout::kNCHW) ? 
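
Editor's aside: the rewritten `KeNearestNeighborInterpBw` above replaces per-thread `/` and `%` on the flat output index with precomputed divmod helpers (`FastDivModForInterpolate`), since integer division by a runtime value is expensive on GPU, and splits the NCHW case into its own 3D-launched kernel. Below is a minimal, standalone sketch of the NHWC index decomposition only; `NaiveDivMod` is an illustrative stand-in for Paddle's fast-divmod type, not its actual implementation.

```cpp
#include <cstdio>

// Illustrative stand-in for a fast divmod helper. The real type precomputes a
// multiplier/shift so the kernel avoids hardware integer division; the
// decomposition logic below is the same either way.
struct NaiveDivMod {
  unsigned int d;
  explicit NaiveDivMod(unsigned int divisor) : d(divisor) {}
  void Divmod(unsigned int x, unsigned int* q, unsigned int* r) const {
    *q = x / d;
    *r = x % d;
  }
};

// Decompose a flat NHWC output index the way the rewritten kernel does:
// tid -> (batch, out_y, out_x, channel), given divisors for C, H*W*C and W*C.
void DecodeNHWC(unsigned int tid, unsigned int c, unsigned int out_w,
                unsigned int out_h) {
  NaiveDivMod channels_div(c);
  NaiveDivMod output_w_div(out_h * out_w * c);  // elements per batch
  NaiveDivMod output_wc_div(out_w * c);         // elements per output row

  unsigned int out_id_h, out_id_w;
  output_w_div.Divmod(tid, &out_id_h, &out_id_w);  // batch, offset in batch

  unsigned int unused, channel_id;
  channels_div.Divmod(tid, &unused, &channel_id);  // innermost dim is C

  unsigned int out_img_idy, rem, out_img_idx, rem2;
  output_wc_div.Divmod(out_id_w, &out_img_idy, &rem);
  channels_div.Divmod(rem, &out_img_idx, &rem2);

  std::printf("tid=%u -> n=%u y=%u x=%u c=%u\n", tid, out_id_h, out_img_idy,
              out_img_idx, channel_id);
}

int main() {
  DecodeNHWC(/*tid=*/123, /*c=*/3, /*out_w=*/8, /*out_h=*/4);  // n=1 y=1 x=1 c=0
  return 0;
}
```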
true : false; diff --git a/paddle/fluid/operators/interpolate_v2_op.h b/paddle/fluid/operators/interpolate_v2_op.h index 66ab1e14390b33e99b80393d3bddaf9126bca325..f99d3f6c324421534dd51c74c840a8dca5dcedd9 100644 --- a/paddle/fluid/operators/interpolate_v2_op.h +++ b/paddle/fluid/operators/interpolate_v2_op.h @@ -65,6 +65,13 @@ inline std::vector get_new_data_from_tensor(const Tensor* new_data_tensor) { &cpu_starts_tensor); new_data = cpu_starts_tensor.data(); } +#endif +#ifdef PADDLE_WITH_XPU + if (platform::is_xpu_place(new_data_tensor->place())) { + paddle::framework::TensorCopySync(*new_data_tensor, platform::CPUPlace(), + &cpu_starts_tensor); + new_data = cpu_starts_tensor.data(); + } #endif vec_new_data = std::vector(new_data, new_data + new_data_tensor->numel()); return vec_new_data; diff --git a/paddle/fluid/operators/interpolate_v2_op_xpu.cc b/paddle/fluid/operators/interpolate_v2_op_xpu.cc index 66314cb74456d66522a83abd4eb37873ab8bf9f2..850dbe025b9cb5f13db58eaab86ce777ec7b97ae 100644 --- a/paddle/fluid/operators/interpolate_v2_op_xpu.cc +++ b/paddle/fluid/operators/interpolate_v2_op_xpu.cc @@ -14,7 +14,7 @@ #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/interpolate_op.h" +#include "paddle/fluid/operators/interpolate_v2_op.h" #ifdef PADDLE_WITH_XPU @@ -41,18 +41,6 @@ inline std::vector get_new_shape_xpu( return vec_new_shape; } -template -inline std::vector get_new_data_from_tensor_xpu( - const Tensor* new_data_tensor) { - std::vector vec_new_data; - framework::Tensor cpu_starts_tensor; - paddle::framework::TensorCopySync(*new_data_tensor, platform::CPUPlace(), - &cpu_starts_tensor); - auto* new_data = cpu_starts_tensor.data(); - vec_new_data = std::vector(new_data, new_data + new_data_tensor->numel()); - return vec_new_data; -} - template class InterpolateV2XPUKernel : public framework::OpKernel { public: @@ -90,7 +78,7 @@ class InterpolateV2XPUKernel : public framework::OpKernel { auto scale_tensor = ctx.Input("Scale"); auto scale = ctx.Attr>("scale"); if (scale_tensor != nullptr) { - auto scale_data = get_new_data_from_tensor_xpu(scale_tensor); + auto scale_data = get_new_data_from_tensor(scale_tensor); if (scale_data.size() > 1) { scale_h = scale_data[0]; scale_w = scale_data[1]; @@ -202,7 +190,7 @@ class InterpolateV2GradXPUKernel : public framework::OpKernel { auto scale_tensor = ctx.Input("Scale"); auto scale = ctx.Attr>("scale"); if (scale_tensor != nullptr) { - auto scale_data = get_new_data_from_tensor_xpu(scale_tensor); + auto scale_data = get_new_data_from_tensor(scale_tensor); if (scale_data.size() > 1) { scale_h = scale_data[0]; scale_w = scale_data[1]; diff --git a/paddle/fluid/operators/label_smooth_op.cc b/paddle/fluid/operators/label_smooth_op.cc index 5ae9fd7a6102893d6e4a16c451c6d017ad70de5f..7e07610db2875d45aa250ab084e0eaf493dc7034 100644 --- a/paddle/fluid/operators/label_smooth_op.cc +++ b/paddle/fluid/operators/label_smooth_op.cc @@ -12,9 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/label_smooth_op.h" - #include +#include "paddle/fluid/framework/op_registry.h" namespace paddle { namespace framework { @@ -152,11 +151,3 @@ REGISTER_OPERATOR(label_smooth, ops::LabelSmoothOp, ops::LabelSmoothOpMaker, ops::LabelSmoothGradMaker, ops::LabelSmoothGradMaker); REGISTER_OPERATOR(label_smooth_grad, ops::LabelSmoothGradOp); -REGISTER_OP_CPU_KERNEL( - label_smooth, - ops::LabelSmoothKernel, - ops::LabelSmoothKernel); -REGISTER_OP_CPU_KERNEL( - label_smooth_grad, - ops::LabelSmoothGradKernel, - ops::LabelSmoothGradKernel); diff --git a/paddle/fluid/operators/label_smooth_op.cu b/paddle/fluid/operators/label_smooth_op.cu deleted file mode 100644 index f149e104eff624fd6145926aec60350b41de3cdf..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/label_smooth_op.cu +++ /dev/null @@ -1,125 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h" -#include "paddle/fluid/operators/label_smooth_op.h" -namespace paddle { -namespace operators { - -template -struct LabelSmoothFunctor { - T epsilon; - T label_dim; - - __forceinline__ LabelSmoothFunctor(float epsilon_data, int label_dim_data) { - epsilon = static_cast(epsilon_data); - label_dim = static_cast(label_dim_data); - } - - __device__ __forceinline__ T operator()(const T x) const { - return (static_cast(1 - epsilon) * x + - static_cast(epsilon / label_dim)); - } -}; - -template -struct LabelSmoothGradFunctor { - T epsilon; - - __forceinline__ LabelSmoothGradFunctor(float epsilon_data) { - epsilon = static_cast(epsilon_data); - } - - __device__ __forceinline__ T operator()(const T x) const { - return static_cast(1 - epsilon) * x; - } -}; - -template -__global__ void LabelSmoothRunDistKernel(const int N, const float epsilon, - const int dist_numel, const T* src, - const T* dist_data, T* dst) { - CUDA_KERNEL_LOOP(idx, N) { - int dist_idx = idx % dist_numel; - dst[idx] = static_cast(1 - epsilon) * src[idx] + - static_cast(epsilon) * dist_data[dist_idx]; - } -} - -template -class LabelSmoothGPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const { - auto* out_t = ctx.Output("Out"); - auto* in_t = ctx.Input("X"); - auto* dist_t = ctx.Input("PriorDist"); - auto label_dim = in_t->dims()[in_t->dims().size() - 1]; - auto epsilon = ctx.Attr("epsilon"); - auto& dev = *ctx.template device_context().eigen_device(); - auto size_prob = in_t->numel(); - const T* in_data = in_t->data(); - T* out_data = out_t->mutable_data(ctx.GetPlace()); - int threads = 512; - int grid = (size_prob + threads - 1) / threads; - auto stream = ctx.cuda_device_context().stream(); - if (dist_t) { - auto dist_numel = dist_t->numel(); - const T* dist_data = dist_t->data(); - LabelSmoothRunDistKernel<<>>( - size_prob, epsilon, dist_numel, in_data, dist_data, out_data); - - } else { - auto& dev_ctx = - 
ctx.template device_context(); - - std::vector ins = {in_t}; - std::vector outs = {out_t}; - auto functor = LabelSmoothFunctor(epsilon, label_dim); - paddle::operators::LaunchSameDimsElementwiseCudaKernel(dev_ctx, ins, - &outs, functor); - } - } -}; - -template -class LabelSmoothGradGPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const { - auto* d_out_t = ctx.Input(framework::GradVarName("Out")); - auto* d_in_t = ctx.Output(framework::GradVarName("X")); - d_in_t->mutable_data(ctx.GetPlace()); - - auto epsilon = ctx.Attr("epsilon"); - auto& dev_ctx = ctx.template device_context(); - - std::vector ins = {d_out_t}; - std::vector outs = {d_in_t}; - auto functor = LabelSmoothGradFunctor(epsilon); - paddle::operators::LaunchSameDimsElementwiseCudaKernel(dev_ctx, ins, - &outs, functor); - } -}; -} // namespace operators -} // namespace paddle -namespace ops = paddle::operators; - -REGISTER_OP_CUDA_KERNEL( - label_smooth, - ops::LabelSmoothGPUKernel, - ops::LabelSmoothGPUKernel); -REGISTER_OP_CUDA_KERNEL( - label_smooth_grad, - ops::LabelSmoothGradGPUKernel, - ops::LabelSmoothGradGPUKernel); diff --git a/paddle/fluid/operators/label_smooth_op.h b/paddle/fluid/operators/label_smooth_op.h deleted file mode 100644 index 6b509eb64cce6d289032d366552f6bb5e6712388..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/label_smooth_op.h +++ /dev/null @@ -1,70 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
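
Editor's aside: the deleted `label_smooth_op.cu` dispatched small element-wise functors through `LaunchSameDimsElementwiseCudaKernel` (the flattening above dropped the `<typename T>` template heads). The arithmetic itself is just y = (1 - eps) * x + eps / D for the forward pass and dx = (1 - eps) * dy for the gradient. A standalone host sketch of that formula, with illustrative names:

```cpp
#include <cstdio>
#include <vector>

// Label smoothing: y = (1 - eps) * x + eps / D, where D is the label dimension.
// The gradient w.r.t. x is simply (1 - eps) * dy.
std::vector<float> LabelSmooth(const std::vector<float>& x, float eps,
                               int label_dim) {
  std::vector<float> y(x.size());
  for (size_t i = 0; i < x.size(); ++i)
    y[i] = (1.0f - eps) * x[i] + eps / static_cast<float>(label_dim);
  return y;
}

int main() {
  std::vector<float> onehot = {0.f, 0.f, 1.f, 0.f};
  auto smoothed = LabelSmooth(onehot, /*eps=*/0.1f, /*label_dim=*/4);
  for (float v : smoothed) std::printf("%.3f ", v);  // 0.025 0.025 0.925 0.025
  std::printf("\n");
  return 0;
}
```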
*/ - -#pragma once - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class LabelSmoothKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const { - auto* out_t = ctx.Output("Out"); - auto* in_t = ctx.Input("X"); - auto* dist_t = ctx.Input("PriorDist"); - auto label_dim = in_t->dims()[in_t->dims().size() - 1]; - out_t->mutable_data(ctx.GetPlace()); - if (label_dim != 0) { - auto epsilon = ctx.Attr("epsilon"); - auto out = framework::EigenVector::Flatten(*out_t); - auto in = framework::EigenVector::Flatten(*in_t); - auto& dev = *ctx.template device_context().eigen_device(); - if (dist_t) { - auto dist = framework::EigenVector::Flatten(*dist_t); - out.device(dev) = static_cast(1 - epsilon) * in + - static_cast(epsilon) * - dist.broadcast(Eigen::DSizes( - in_t->numel() / label_dim)); - } else { - out.device(dev) = static_cast(1 - epsilon) * in + - static_cast(epsilon / label_dim); - } - } - } -}; - -template -class LabelSmoothGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const { - auto* d_out_t = ctx.Input(framework::GradVarName("Out")); - auto* d_in_t = ctx.Output(framework::GradVarName("X")); - d_in_t->mutable_data(ctx.GetPlace()); - auto d_out_dim = d_out_t->dims()[d_out_t->dims().size() - 1]; - if (d_out_dim != 0) { - auto d_out = framework::EigenVector::Flatten(*d_out_t); - auto d_in = framework::EigenVector::Flatten(*d_in_t); - - auto epsilon = ctx.Attr("epsilon"); - auto& dev = *ctx.template device_context().eigen_device(); - d_in.device(dev) = static_cast(1 - epsilon) * d_out; - } - } -}; -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/label_smooth_op_npu.cc b/paddle/fluid/operators/label_smooth_op_npu.cc index af519cc9090b06c789d88d2ef3a2e2d6ba61495b..c24b896e0a49ae5b5c7717a9173d862633fb7cca 100644 --- a/paddle/fluid/operators/label_smooth_op_npu.cc +++ b/paddle/fluid/operators/label_smooth_op_npu.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/label_smooth_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/label_smooth_op_xpu.cc b/paddle/fluid/operators/label_smooth_op_xpu.cc index 6b6350753909f0dc319d07904b4d81327262684e..dd8d0c721c9c29242ba06d3bc57b51da04ff69f5 100644 --- a/paddle/fluid/operators/label_smooth_op_xpu.cc +++ b/paddle/fluid/operators/label_smooth_op_xpu.cc @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. */ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/label_smooth_op.h" #include "paddle/fluid/framework/op_registry.h" namespace paddle { diff --git a/paddle/fluid/operators/load_op.h b/paddle/fluid/operators/load_op.h index 521a35646c45a257b56783c500b239ce74a5de0a..7a161fb9dd38352ce4f0f0b6d1fc92b725cfcc52 100644 --- a/paddle/fluid/operators/load_op.h +++ b/paddle/fluid/operators/load_op.h @@ -22,7 +22,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/data_type_transform.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/lookup_table_op.cu b/paddle/fluid/operators/lookup_table_op.cu index e36c8b1c1b2531f726cc0e9ec1cde6a7aaac6bb5..29079b8b1385dee3a28c42a178a046fab77e6200 100644 --- a/paddle/fluid/operators/lookup_table_op.cu +++ b/paddle/fluid/operators/lookup_table_op.cu @@ -164,8 +164,10 @@ class LookupTableGradCUDAKernel : public framework::OpKernel { auto gpu_place = context.GetPlace(); // TODO(yuyang18): Strange code here. - memory::Copy(gpu_place, new_rows.CUDAMutableData(context.GetPlace()), + paddle::framework::MixVector mixv_new_rows(&new_rows); + memory::Copy(gpu_place, mixv_new_rows.CUDAMutableData(context.GetPlace()), gpu_place, ids_data, ids_num * sizeof(int64_t), stream); + mixv_new_rows.CopyToCPU(); d_table->set_rows(new_rows); auto *d_table_value = d_table->mutable_value(); diff --git a/paddle/fluid/operators/lookup_table_v2_op.cu b/paddle/fluid/operators/lookup_table_v2_op.cu index 42318ca6a8d3e06a8a6560cdf6eef2d67e6116b0..d40b2643785706e843dbd9812e74ca0aa134f7b5 100644 --- a/paddle/fluid/operators/lookup_table_v2_op.cu +++ b/paddle/fluid/operators/lookup_table_v2_op.cu @@ -21,19 +21,18 @@ limitations under the License. */ namespace paddle { namespace operators { -template +template __global__ void LookupTableV2(T *output, const T *table, const IdT *ids, const int64_t N, const int64_t K, const int64_t D, const int64_t padding_idx) { int idx = threadIdx.x; - int idy = blockIdx.x + threadIdx.y * GridDimX; + int idy = blockIdx.x + threadIdx.y * gridDim.x; while (idy < K) { auto id = static_cast(ids[idy]); T *out = output + idy * D; const T *tab = table + id * D; - for (int i = idx; i < D; i += BlockDimX) { + for (int i = idx; i < D; i += blockDim.x) { if (PaddingFlag) { if (id == padding_idx) out[i] = static_cast(0); @@ -43,25 +42,29 @@ __global__ void LookupTableV2(T *output, const T *table, const IdT *ids, out[i] = tab[i]; } } - idy += BlockDimY * GridDimX; + idy += blockDim.y * gridDim.x; } } -template +template __global__ void LookupTableV2Grad(T *table, const T *output, const IdT *ids, const int64_t N, const int64_t K, const int64_t D) { int idx = threadIdx.x; - int idy = blockIdx.x + threadIdx.y * GridDimX; + int idy = blockIdx.x + threadIdx.y * gridDim.x; while (idy < K) { auto id = static_cast(ids[idy]); const T *out = output + idy * D; T *tab = table + id * D; - for (int i = idx; i < D; i += BlockDimX) { +#ifdef PADDLE_WITH_CUDA + paddle::platform::VectorizedAtomicAddPerBlock(D, idx, blockDim.x, out, tab); +#else + for (int i = idx; i < D; i += blockDim.x) { paddle::platform::CudaAtomicAdd(&tab[i], out[i]); } - idy += BlockDimY * GridDimX; +#endif + idy += blockDim.y * gridDim.x; } } @@ -81,8 +84,9 @@ struct LookupTableV2CUDAFunctor { size_t D = table_t->dims()[1]; size_t K = ids_t_->numel(); + const int gridx = 2 * context_.cuda_device_context().GetSMCount(); dim3 threads(256, 4); - dim3 grids(80, 1); + dim3 grids(gridx, 1); const auto *table = table_t->template data(); const auto *ids = ids_t_->template data(); @@ -90,10 +94,10 @@ struct LookupTableV2CUDAFunctor { auto stream = context_.cuda_device_context().stream(); if (padding_idx == -1) { - LookupTableV2<<>>( + LookupTableV2<<>>( output, table, ids, N, K, D, padding_idx); } else { - 
LookupTableV2<<>>( + LookupTableV2<<>>( output, table, ids, N, K, D, padding_idx); } } @@ -152,14 +156,16 @@ struct LookupTableV2GradCUDAFunctor { new_rows.resize(ids_num); auto gpu_place = context_.GetPlace(); + paddle::framework::MixVector mixv_new_rows(&new_rows); if (!std::is_same::value) { InputTypeConvert<<>>( - ids_data, ids_num, new_rows.MutableData(gpu_place)); + ids_data, ids_num, mixv_new_rows.MutableData(gpu_place)); } else { - memory::Copy(gpu_place, new_rows.CUDAMutableData(gpu_place), gpu_place, - ids_data, ids_num * sizeof(int64_t), stream); + memory::Copy(gpu_place, mixv_new_rows.CUDAMutableData(gpu_place), + gpu_place, ids_data, ids_num * sizeof(int64_t), stream); } + mixv_new_rows.CopyToCPU(); d_table->set_rows(new_rows); auto *d_table_value = d_table->mutable_value(); @@ -191,17 +197,22 @@ struct LookupTableV2GradCUDAFunctor { int D = d_table_t->dims()[1]; int K = ids_t_->numel(); - dim3 threads(128, 8); - dim3 grids(8, 1); const T *d_output = d_output_t->template data(); const auto *ids = ids_t_->template data(); T *d_table = d_table_t->mutable_data(context_.GetPlace()); - auto t = framework::EigenVector::Flatten(*d_table_t); - t.device(*dev_ctx.eigen_device()) = t.constant(static_cast(0)); +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS( + hipMemsetAsync(d_table, 0, N * D * sizeof(T), dev_ctx.stream())); +#else + PADDLE_ENFORCE_GPU_SUCCESS( + cudaMemsetAsync(d_table, 0, N * D * sizeof(T), dev_ctx.stream())); +#endif - LookupTableV2Grad<<>>( + const int gridx = 2 * dev_ctx.GetSMCount(); + dim3 threads(128, 8); + dim3 grids(gridx, 1); + LookupTableV2Grad<<>>( d_table, d_output, ids, N, K, D); } } diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index d5336af8f05ef7fce1d5b1a2153cb8928772e232..d7d1093b9b3bf2f9f605c7c45c6d5f8a4e52bb6a 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ b/paddle/fluid/operators/math/CMakeLists.txt @@ -6,9 +6,9 @@ endif() # please add new math_library in alphabetical order if (WITH_ASCEND_CL) -math_library(concat_and_split DEPS npu_op_runner) +math_library(concat_and_split DEPS concat_and_split_functor npu_op_runner) else() -math_library(concat_and_split) +math_library(concat_and_split DEPS concat_and_split_functor) endif() math_library(context_project DEPS im2col math_function) math_library(cross_entropy) diff --git a/paddle/fluid/operators/math/beam_search.cu b/paddle/fluid/operators/math/beam_search.cu index c954bdf81d30d13abc8383544e17709ee249cc99..486979aa0a8b3009d09f73de54f9b7b3ac8a77ad 100644 --- a/paddle/fluid/operators/math/beam_search.cu +++ b/paddle/fluid/operators/math/beam_search.cu @@ -357,8 +357,9 @@ class BeamSearchFunctor { framework::LoD selected_lod(2); selected_lod[0].assign(abs_lod[level].begin(), abs_lod[level].end()); selected_lod[1].resize(scores->dims()[0] + 1); - size_t* selected_offsets = - selected_lod[1].CUDAMutableData(context.GetPlace()); + paddle::framework::MixVector mix_vector(&selected_lod[1]); + paddle::framework::MixVector mixv_abs(&abs_lod[level]); + size_t* selected_offsets = mix_vector.CUDAMutableData(context.GetPlace()); if (num_seqs == 1) { const int seq_length = static_cast(abs_lod[level][1]); @@ -377,7 +378,7 @@ class BeamSearchFunctor { is_accumulated, num_used_threads)); } } else if (num_seqs <= 4) { - const size_t* seq_offsets = abs_lod[level].CUDAData(context.GetPlace()); + const size_t* seq_offsets = mixv_abs.CUDAData(context.GetPlace()); // Use only 1 block const int kMaxThreadsPerSeq = 32; const int kMaxSeqs = 4; @@ -400,6 
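
Editor's aside: the `lookup_table_v2_op.cu` hunks above drop the compile-time block/grid template parameters in favour of reading `blockDim`/`gridDim` at runtime, size the grid from the device's SM count instead of a hard-coded 80 blocks, and zero the gradient table with `cudaMemsetAsync`/`hipMemsetAsync` before the atomic accumulation. A standalone CUDA sketch of the forward lookup and the SM-count-based launch (sizes and names are illustrative, not Paddle's):

```cuda
#include <cstdio>
#include <cuda_runtime.h>

// Grid-stride embedding lookup: extents come from blockDim/gridDim at runtime
// rather than template parameters, matching the rewritten kernel's loop shape.
__global__ void Lookup(float* out, const float* table, const int64_t* ids,
                       int64_t K, int64_t D) {
  int idx = threadIdx.x;
  int idy = blockIdx.x + threadIdx.y * gridDim.x;
  while (idy < K) {
    const float* src = table + ids[idy] * D;
    float* dst = out + idy * D;
    for (int i = idx; i < D; i += blockDim.x) dst[i] = src[i];
    idy += blockDim.y * gridDim.x;
  }
}

int main() {
  int sm_count = 0;
  cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, /*dev=*/0);
  const int64_t K = 1000, D = 64, N = 5000;

  float *table, *out;
  int64_t* ids;
  cudaMallocManaged(&table, N * D * sizeof(float));
  cudaMallocManaged(&out, K * D * sizeof(float));
  cudaMallocManaged(&ids, K * sizeof(int64_t));
  cudaMemset(table, 0, N * D * sizeof(float));
  for (int64_t i = 0; i < K; ++i) ids[i] = i % N;

  dim3 threads(256, 4);
  dim3 grids(2 * sm_count, 1);  // ~2 blocks per SM, as in the patch
  Lookup<<<grids, threads>>>(out, table, ids, K, D);
  cudaDeviceSynchronize();
  std::printf("launched %d blocks on %d SMs\n", grids.x, sm_count);
  cudaFree(table); cudaFree(out); cudaFree(ids);
  return 0;
}
```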
+401,7 @@ class BeamSearchFunctor { } context.Wait(); + mix_vector.CopyToCPU(); if (!framework::CheckLoD(selected_lod)) { PADDLE_THROW(platform::errors::InvalidArgument( "lod %s is not right in" diff --git a/paddle/fluid/operators/math/concat_and_split.cc b/paddle/fluid/operators/math/concat_and_split.cc index 8ec89f1b60acebdb0d1da8b6a07113b1f4c23ef0..46126ac59c892787d2f63956983404843e518ae7 100644 --- a/paddle/fluid/operators/math/concat_and_split.cc +++ b/paddle/fluid/operators/math/concat_and_split.cc @@ -14,7 +14,7 @@ limitations under the License. */ #include "paddle/fluid/operators/math/concat_and_split.h" -#include "paddle/phi/kernels/cpu/concat_and_split.h" +#include "paddle/phi/kernels/funcs/concat_and_split_functor.h" #ifdef PADDLE_WITH_ASCEND_CL #include "paddle/fluid/platform/device/npu/npu_op_runner.h" #endif @@ -46,9 +46,8 @@ class ConcatFunctor { void operator()(const platform::CPUDeviceContext& context, const std::vector& input, int axis, framework::Tensor* output) { - std::vector pt_input{input.begin(), input.end()}; - phi::ConcatImpl(context, pt_input, axis, - output); + phi::funcs::ConcatFunctor functor; + functor(context, input, axis, output); } }; @@ -63,11 +62,8 @@ class SplitFunctor { const framework::Tensor& input, const std::vector& ref_inputs, const int axis, std::vector* outputs) { - std::vector pt_ref_inputs{ref_inputs.begin(), - ref_inputs.end()}; - std::vector pt_outputs{outputs->begin(), outputs->end()}; - phi::SplitImpl(context, input, pt_ref_inputs, - axis, &pt_outputs); + phi::funcs::SplitFunctor functor; + functor(context, input, ref_inputs, axis, outputs); } }; diff --git a/paddle/fluid/operators/math/concat_and_split.cu b/paddle/fluid/operators/math/concat_and_split.cu index 51f94afcfc1b99755d5f9dca8460a56fc76cf543..e51631385eb75a63083e0cbbd2a8632d689be8f1 100644 --- a/paddle/fluid/operators/math/concat_and_split.cu +++ b/paddle/fluid/operators/math/concat_and_split.cu @@ -14,7 +14,7 @@ limitations under the License. 
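
Editor's aside: a change repeated across this patch (beam_search, selected_rows_functor, sequence2batch, sequence_padding, sequence_pooling, sequence_scale, and the optimizer kernels further down) is that `framework::Vector` members are no longer asked for `CUDAData`/`CUDAMutableData` directly; the code wraps them in a `paddle::framework::MixVector` for the duration of the kernel launch and calls `CopyToCPU()` afterwards when the device side mutated the data. A condensed sketch of that usage pattern, assembled only from calls visible in the hunks (it compiles only inside the Paddle tree; `lod` and `place` are placeholders):

```cpp
// Pattern used throughout this patch (sketch, not a standalone program).
paddle::framework::MixVector<size_t> mix_lod(&lod);    // wrap the framework::Vector
const size_t* d_lod = mix_lod.CUDAData(place);         // read-only device pointer
size_t* d_lod_mut = mix_lod.CUDAMutableData(place);    // writable device pointer
// ... launch kernels that consume d_lod / write through d_lod_mut ...
mix_lod.CopyToCPU();  // sync device-side writes back before the host reads `lod`
```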
*/ #include "paddle/fluid/operators/math/concat_and_split.h" -#include "paddle/phi/kernels/gpu/concat_and_split.h" +#include "paddle/phi/kernels/funcs/concat_and_split_functor.h" namespace paddle { namespace operators { namespace math { @@ -29,10 +29,8 @@ class ConcatFunctor { void operator()(const platform::CUDADeviceContext& context, const std::vector& input, int axis, framework::Tensor* output) { - std::vector pt_input{input.begin(), input.end()}; - - phi::ConcatImpl(context, pt_input, axis, - output); + phi::funcs::ConcatFunctor functor; + functor(context, input, axis, output); } }; @@ -43,16 +41,12 @@ class ConcatFunctor { template class SplitFunctor { public: - SplitFunctor(); void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& input, const std::vector& ref_inputs, int axis, std::vector* outputs) { - std::vector pt_ref_inputs{ref_inputs.begin(), - ref_inputs.end()}; - std::vector pt_outputs{outputs->begin(), outputs->end()}; - phi::SplitImpl( - context, input, pt_ref_inputs, axis, &pt_outputs); + phi::funcs::SplitFunctor functor; + functor(context, input, ref_inputs, axis, outputs); } }; diff --git a/paddle/fluid/operators/math/concat_and_split.h b/paddle/fluid/operators/math/concat_and_split.h index 65d2ca79e60c2ec90d879ce9818c398adc93c73c..b5b0aae23ac875c7afeb4148309138aae49e5b4a 100644 --- a/paddle/fluid/operators/math/concat_and_split.h +++ b/paddle/fluid/operators/math/concat_and_split.h @@ -64,17 +64,3 @@ class SplitFunctor { } // namespace math } // namespace operators } // namespace paddle - -#define FOR_ALL_TYPES(macro) \ - macro(int); \ - macro(float); \ - macro(double); \ - macro(bool); \ - macro(int64_t); \ - macro(int16_t); \ - macro(uint8_t); \ - macro(int8_t); \ - macro(::paddle::platform::float16); \ - macro(::paddle::platform::bfloat16); \ - macro(::paddle::platform::complex); \ - macro(::paddle::platform::complex); diff --git a/paddle/fluid/operators/math/im2col.cc b/paddle/fluid/operators/math/im2col.cc index 8efd35ca108100e4d224890846433433702c57a9..8fc6c52122abfe48d87a14ae274849a18c020546 100644 --- a/paddle/fluid/operators/math/im2col.cc +++ b/paddle/fluid/operators/math/im2col.cc @@ -22,6 +22,10 @@ class CPUDeviceContext; } // namespace platform } // namespace paddle +namespace phi { +class CPUContext; +} // namespace phi + namespace paddle { namespace operators { namespace math { @@ -31,12 +35,12 @@ namespace math { * col = * [input_channels, filter_height, filter_width, output_height, output_width] */ -template -class Im2ColFunctor { +template +class Im2ColFunctor { public: - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& im, const std::vector& dilation, + void operator()(const DeviceContext& context, const framework::Tensor& im, + const std::vector& dilation, const std::vector& stride, const std::vector& padding, framework::Tensor* col, const DataLayout data_layout) { @@ -73,12 +77,11 @@ class Im2ColFunctor -class Col2ImFunctor { +template +class Col2ImFunctor { public: - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& col, + void operator()(const DeviceContext& context, const framework::Tensor& col, const std::vector& dilation, const std::vector& stride, const std::vector& padding, framework::Tensor* im, @@ -155,22 +158,30 @@ template class Im2ColFunctor; template class Im2ColFunctor; +template class Im2ColFunctor; +template class Im2ColFunctor; template class Col2ImFunctor; template class Col2ImFunctor; +template class Col2ImFunctor; +template 
class Col2ImFunctor; /* * im = [input_channels, input_height, input_width] * col = * [output_height, output_width, input_channels, filter_height, filter_width] */ -template -class Im2ColFunctor { +template +class Im2ColFunctor { public: - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& im, const std::vector& dilation, + void operator()(const DeviceContext& context, const framework::Tensor& im, + const std::vector& dilation, const std::vector& stride, const std::vector& padding, framework::Tensor* col, const DataLayout data_layout) { @@ -235,12 +246,11 @@ class Im2ColFunctor -class Col2ImFunctor { +template +class Col2ImFunctor { public: - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& col, + void operator()(const DeviceContext& context, const framework::Tensor& col, const std::vector& dilation, const std::vector& stride, const std::vector& padding, framework::Tensor* im, @@ -316,11 +326,18 @@ template class Im2ColFunctor; template class Im2ColFunctor; +template class Im2ColFunctor; +template class Im2ColFunctor; template class Col2ImFunctor; template class Col2ImFunctor; - +template class Col2ImFunctor; +template class Col2ImFunctor; } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/math/selected_rows_functor.cc b/paddle/fluid/operators/math/selected_rows_functor.cc index 67165ff2219891e3518673845ce224a30b117ff8..fcd5c06a6f310f8a23608a77f2d6b9098e99b33a 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.cc +++ b/paddle/fluid/operators/math/selected_rows_functor.cc @@ -170,7 +170,8 @@ struct SelectedRowsAddTo { auto* in2_value = input2->mutable_value(); // concat rows - in2_rows.Extend(in1_rows.begin(), in1_rows.end()); + paddle::framework::MixVector mixv_in2_rows(&in2_rows); + mixv_in2_rows.Extend(in1_rows.begin(), in1_rows.end()); auto in1_place = input1.place(); PADDLE_ENFORCE_EQ(platform::is_cpu_place(in1_place), true, diff --git a/paddle/fluid/operators/math/selected_rows_functor.cu b/paddle/fluid/operators/math/selected_rows_functor.cu index ea0b0bb29548bef0792d00f177d6789daf211ad6..8563d8b05b186c025ecc4c970a400765adeb0c5d 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.cu +++ b/paddle/fluid/operators/math/selected_rows_functor.cu @@ -161,9 +161,10 @@ struct SelectedRowsAddTensor { const int block_size = 256; dim3 threads(block_size, 1); dim3 grid(in1_rows.size(), 1); + paddle::framework::MixVector mixv_in1_rows(&in1_rows); SelectedRowsAddTensorKernel< T, block_size><<>>( - in1_data, in1_rows.CUDAData(context.GetPlace()), out_data, + in1_data, mixv_in1_rows.CUDAData(context.GetPlace()), out_data, in1_row_numel); auto out_eigen = framework::EigenVector::Flatten(*output); @@ -198,8 +199,9 @@ struct SelectedRowsAddTo { auto* in2_value = input2->mutable_value(); // concat rows + paddle::framework::MixVector mixv_in2_rows(&in2_rows); if (in1_rows.size()) { - in2_rows.Extend(in1_rows.begin(), in1_rows.end()); + mixv_in2_rows.Extend(in1_rows.begin(), in1_rows.end()); } auto in1_place = input1.place(); @@ -274,9 +276,10 @@ struct SelectedRowsAddToTensor { const int block_size = 256; dim3 threads(block_size, 1); dim3 grid(in1_rows.size(), 1); + paddle::framework::MixVector mixv_in1_rows(&in1_rows); SelectedRowsAddToTensorKernel< T, block_size><<>>( - in1_data, in1_rows.CUDAData(context.GetPlace()), in2_data, + in1_data, mixv_in1_rows.CUDAData(context.GetPlace()), in2_data, in1_row_numel); } }; @@ -356,10 +359,13 @@ struct MergeAdd { dim3 
threads(block_size, 1); dim3 grid1(input_rows.size(), 1); + paddle::framework::MixVector mix_vector_input(&input_rows); + paddle::framework::MixVector mix_vector_out(out.mutable_rows()); MergeAddKernel<<>>( - input_data, input_rows.CUDAData(context.GetPlace()), out_data, - out.mutable_rows()->CUDAMutableData(context.GetPlace()), - out.rows().size(), input_width); + input_data, mix_vector_input.CUDAData(context.GetPlace()), out_data, + mix_vector_out.CUDAMutableData(context.GetPlace()), out.rows().size(), + input_width); + mix_vector_out.CopyToCPU(); } void operator()(const platform::CUDADeviceContext& context, @@ -423,10 +429,13 @@ struct MergeAdd { auto& input_rows = input->rows(); dim3 grid1(input_rows.size(), 1); + paddle::framework::MixVector mix_vector_input(&input_rows); + paddle::framework::MixVector mix_vector_out(out.mutable_rows()); MergeAddKernel<<>>( - input_data, input_rows.CUDAData(context.GetPlace()), out_data, - out.mutable_rows()->CUDAMutableData(context.GetPlace()), - out.rows().size(), input_width); + input_data, mix_vector_input.CUDAData(context.GetPlace()), out_data, + mix_vector_out.CUDAMutableData(context.GetPlace()), out.rows().size(), + input_width); + mix_vector_out.CopyToCPU(); } } }; diff --git a/paddle/fluid/operators/math/sequence2batch.cu b/paddle/fluid/operators/math/sequence2batch.cu index cd1ca572689bc701da801384e5ed08fe6dc10749..f56c5293971bce3b43e86686e828fad4c90639f5 100644 --- a/paddle/fluid/operators/math/sequence2batch.cu +++ b/paddle/fluid/operators/math/sequence2batch.cu @@ -72,8 +72,9 @@ class CopyMatrixRowsFunctor { dim3 threads(128, 8); dim3 grid(8, 1); auto stream = context.stream(); + paddle::framework::MixVector mix_index_lod(&index_lod); CopyMatrixRowsKernel<<>>( - src_data, dst_data, index_lod.CUDAData(context.GetPlace()), height, + src_data, dst_data, mix_index_lod.CUDAData(context.GetPlace()), height, width, is_src_index); } }; diff --git a/paddle/fluid/operators/math/sequence_padding.cu b/paddle/fluid/operators/math/sequence_padding.cu index 65bf77f0d152b99059eea2ba98b5d2f0945dc273..01fd2d403c4564ba022e3ab9633fa04d998dd662 100644 --- a/paddle/fluid/operators/math/sequence_padding.cu +++ b/paddle/fluid/operators/math/sequence_padding.cu @@ -59,7 +59,7 @@ class PaddingLoDTensorFunctor { int lod_level = 0, bool norm_by_times = false, const PadLayout layout = kBatchLengthWidth) { auto seq_lod = seq_tensor.lod(); - const auto seq_offsets = framework::ToAbsOffset(seq_lod)[lod_level]; + auto seq_offsets = framework::ToAbsOffset(seq_lod)[lod_level]; const auto& seq_tensor_dims = seq_tensor.dims(); const auto& pad_tensor_dims = pad_tensor->dims(); int max_seq_len = MaximumSequenceLength(seq_offsets); @@ -104,10 +104,11 @@ class PaddingLoDTensorFunctor { T* pad_data = pad_tensor->data(); const T* pad_value_data = pad_value.data(); + paddle::framework::MixVector mix_vector_seq_offsets(&seq_offsets); SequencePaddingKernel<<>>( pad_data, seq_data, pad_value_data, pad_value.numel() == 1, - seq_offsets.CUDAData(context.GetPlace()), seq_num, pad_seq_len, - step_width, norm_by_times, layout); + mix_vector_seq_offsets.CUDAData(context.GetPlace()), seq_num, + pad_seq_len, step_width, norm_by_times, layout); } }; @@ -157,9 +158,10 @@ class UnpaddingLoDTensorFunctor { const T* pad_data = pad_tensor.data(); T* seq_data = seq_tensor->data(); + paddle::framework::MixVector mixv_seq_offsets(&seq_offsets); SequencePaddingKernel<<>>( seq_data, pad_data, nullptr, false, - seq_offsets.CUDAData(context.GetPlace()), seq_num, pad_seq_len, + 
mixv_seq_offsets.CUDAData(context.GetPlace()), seq_num, pad_seq_len, step_width, norm_by_times, layout); } }; diff --git a/paddle/fluid/operators/math/sequence_pooling.cu b/paddle/fluid/operators/math/sequence_pooling.cu index 1c09acf52fae3f911b3c5e46855c9343a88ffae8..fa7b043153851460c9c8d5586ddce88872b7e3c7 100644 --- a/paddle/fluid/operators/math/sequence_pooling.cu +++ b/paddle/fluid/operators/math/sequence_pooling.cu @@ -168,41 +168,42 @@ class SequencePoolFunctor { const size_t item_dim = output->numel() / output->dims()[0]; dim3 threads(1024, 1); dim3 grid(std::max(static_cast(lod.size()) - 1, 1), 1); + paddle::framework::MixVector mix_vector(&lod); if (pooltype == "MAX") { sequence_pool_kernel< T, MaxPoolFunctor><<>>( MaxPoolFunctor(), input.data(), pad_value, - lod.CUDAData(context.GetPlace()), lod.size(), item_dim, + mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim, output->mutable_data(context.GetPlace()), index->data()); } else if (pooltype == "AVERAGE") { sequence_pool_kernel< T, AvgPoolFunctor><<>>( AvgPoolFunctor(), input.data(), pad_value, - lod.CUDAData(context.GetPlace()), lod.size(), item_dim, + mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim, output->mutable_data(context.GetPlace()), nullptr); } else if (pooltype == "SUM") { sequence_pool_kernel< T, SumPoolFunctor><<>>( SumPoolFunctor(), input.data(), pad_value, - lod.CUDAData(context.GetPlace()), lod.size(), item_dim, + mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim, output->mutable_data(context.GetPlace()), nullptr); } else if (pooltype == "SQRT") { sequence_pool_kernel< T, SqrtPoolFunctor><<>>( SqrtPoolFunctor(), input.data(), pad_value, - lod.CUDAData(context.GetPlace()), lod.size(), item_dim, + mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim, output->mutable_data(context.GetPlace()), nullptr); } else if (pooltype == "LAST") { sequence_pool_kernel< T, LastPoolFunctor><<>>( LastPoolFunctor(), input.data(), pad_value, - lod.CUDAData(context.GetPlace()), lod.size(), item_dim, + mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim, output->mutable_data(context.GetPlace()), nullptr); } else if (pooltype == "FIRST") { sequence_pool_kernel< T, FirstPoolFunctor><<>>( FirstPoolFunctor(), input.data(), pad_value, - lod.CUDAData(context.GetPlace()), lod.size(), item_dim, + mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim, output->mutable_data(context.GetPlace()), nullptr); } else { PADDLE_THROW(platform::errors::InvalidArgument( @@ -335,41 +336,42 @@ class SequencePoolGradFunctor { const size_t item_dim = in_grad->numel() / in_grad->dims()[0]; dim3 threads(1024, 1); dim3 grid(std::max(static_cast(lod.size()) - 1, 1), 1); + paddle::framework::MixVector mix_vector(&lod); if (pooltype == "MAX") { sequence_pool_grad_kernel< T, MaxPoolGradFunctor><<>>( MaxPoolGradFunctor(), out_grad.data(), - lod.CUDAData(context.GetPlace()), lod.size(), item_dim, + mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim, in_grad->mutable_data(context.GetPlace()), index->data()); } else if (pooltype == "AVERAGE") { sequence_pool_grad_kernel< T, AvgPoolGradFunctor><<>>( AvgPoolGradFunctor(), out_grad.data(), - lod.CUDAData(context.GetPlace()), lod.size(), item_dim, + mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim, in_grad->mutable_data(context.GetPlace()), nullptr); } else if (pooltype == "SUM") { sequence_pool_grad_kernel< T, SumPoolGradFunctor><<>>( SumPoolGradFunctor(), out_grad.data(), - lod.CUDAData(context.GetPlace()), lod.size(), item_dim, + 
mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim, in_grad->mutable_data(context.GetPlace()), nullptr); } else if (pooltype == "SQRT") { sequence_pool_grad_kernel< T, SqrtPoolGradFunctor><<>>( SqrtPoolGradFunctor(), out_grad.data(), - lod.CUDAData(context.GetPlace()), lod.size(), item_dim, + mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim, in_grad->mutable_data(context.GetPlace()), nullptr); } else if (pooltype == "LAST") { sequence_pool_grad_kernel< T, LastPoolGradFunctor><<>>( LastPoolGradFunctor(), out_grad.data(), - lod.CUDAData(context.GetPlace()), lod.size(), item_dim, + mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim, in_grad->mutable_data(context.GetPlace()), nullptr); } else if (pooltype == "FIRST") { sequence_pool_grad_kernel< T, FirstPoolGradFunctor><<>>( FirstPoolGradFunctor(), out_grad.data(), - lod.CUDAData(context.GetPlace()), lod.size(), item_dim, + mix_vector.CUDAData(context.GetPlace()), lod.size(), item_dim, in_grad->mutable_data(context.GetPlace()), nullptr); } else { diff --git a/paddle/fluid/operators/math/sequence_scale.cu b/paddle/fluid/operators/math/sequence_scale.cu index 1807c77e37ca16967d24c423a1bebac779f59ce5..8e02d1b70ff83b3641d498567a236ffcb41bb988 100644 --- a/paddle/fluid/operators/math/sequence_scale.cu +++ b/paddle/fluid/operators/math/sequence_scale.cu @@ -41,21 +41,23 @@ class ScaleLoDTensorFunctor { auto lod = seq->lod(); const size_t num_seq = lod[level].size() - 1; const size_t seq_width = seq->numel() / seq->dims()[0]; - framework::LoD abs_offset_lod = framework::ToAbsOffset(lod); + auto abs_offset_lod = framework::ToAbsOffset(lod); T* seq_data = seq->mutable_data(context.GetPlace()); + paddle::framework::MixVector mix_vector(&(abs_offset_lod[level])); #ifdef PADDLE_WITH_HIP hipLaunchKernelGGL( HIP_KERNEL_NAME(SequenceScaleKernel), dim3(num_seq), dim3(PADDLE_CUDA_NUM_THREADS), 0, context.stream(), - seq_data, abs_offset_lod[level].CUDAMutableData(context.GetPlace()), - scales, seq_width); + seq_data, mix_vector.CUDAMutableData(context.GetPlace()), scales, + seq_width); #else SequenceScaleKernel<<< num_seq, PADDLE_CUDA_NUM_THREADS, 0, context.stream()>>>( - seq_data, abs_offset_lod[level].CUDAMutableData(context.GetPlace()), - scales, seq_width); + seq_data, mix_vector.CUDAMutableData(context.GetPlace()), scales, + seq_width); #endif + mix_vector.CopyToCPU(); } }; diff --git a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc index bc13321473b88fd89a635259e6c4e8c4c113cc1b..e8c80096dd88bf9542794a850f08be931b221e81 100644 --- a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc @@ -262,6 +262,10 @@ using EluMKLDNNFunctor = MKLDNNActivationFunc; template using ExpMKLDNNFunctor = MKLDNNActivationFunc; +template +using RoundMKLDNNFunctor = + MKLDNNActivationFunc; + template using ReluMKLDNNGradFunctor = MKLDNNActivationGradFunc; @@ -330,6 +334,10 @@ namespace ops = paddle::operators; ops::MKLDNNActivationGradKernel< \ ops::grad_functor>); +#define REGISTER_ACTIVATION_MKLDNN_KERNEL_FWD_ONLY(act_type, functor) \ + REGISTER_OP_KERNEL(act_type, MKLDNN, ::paddle::platform::CPUPlace, \ + ops::MKLDNNActivationKernel>); + #define FOR_EACH_MKLDNN_KERNEL_FUNCTOR(__macro) \ __macro(relu6, Relu6MKLDNNFunctor, Relu6MKLDNNGradFunctor); \ __macro(leaky_relu, ReluMKLDNNFunctor, ReluMKLDNNGradFunctor); \ @@ -341,6 +349,8 @@ namespace ops = paddle::operators; __macro(exp, ExpMKLDNNFunctor, 
ExpMKLDNNGradUseOutFunctor); FOR_EACH_MKLDNN_KERNEL_FUNCTOR(REGISTER_ACTIVATION_MKLDNN_KERNEL); +REGISTER_ACTIVATION_MKLDNN_KERNEL_FWD_ONLY(round, RoundMKLDNNFunctor); + REGISTER_ACTIVATION_MKLDNN_BF16_KERNEL(relu, ReluMKLDNNFunctor, ReluMKLDNNGradFunctor); REGISTER_ACTIVATION_MKLDNN_BF16_KERNEL(gelu, GeluMKLDNNFunctor, diff --git a/paddle/fluid/operators/mkldnn/interpolate_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/interpolate_mkldnn_op.cc index 33ea36d24b8aef833890277fd69ed02e4859802f..04b90d2f1f380a72dd076774f2b68c2d1bc7e55b 100644 --- a/paddle/fluid/operators/mkldnn/interpolate_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/interpolate_mkldnn_op.cc @@ -53,17 +53,13 @@ class InterpolateMKLDNNKernel : public framework::OpKernel { std::vector ComputeOutputShape( const framework::ExecutionContext& ctx) const { const auto* x = ctx.Input("X"); - auto in_dims = x->dims(); - const bool is_channel_last = false; // In mkldnn kernel, always use NCHW - - framework::DDim in_dhw_dims; - if (is_channel_last) { // NDHWC, NHWC, NWC - in_dhw_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); - } else { // NCDHW, NCHW, NCW - in_dhw_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - } + const auto& in_dims = x->dims(); + + const framework::DDim in_dhw_dims = + phi::slice_ddim(in_dims, 2, in_dims.size()); std::vector out_dims; + out_dims.reserve(5); if (in_dhw_dims.size() == 1) { out_dims.push_back(ctx.Attr("out_w")); } else if (in_dhw_dims.size() == 2) { @@ -125,12 +121,8 @@ class InterpolateMKLDNNKernel : public framework::OpKernel { "out_d, out_h, out_w of Op(interpolate) " "should be greater than 0.")); - out_dims.insert(out_dims.begin(), in_dims[0]); - if (is_channel_last) { - out_dims.push_back(in_dims[in_dims.size() - 1]); - } else { - out_dims.insert(out_dims.begin() + 1, in_dims[1]); - } + const std::vector nc_dims = {in_dims[0], in_dims[1]}; + out_dims.insert(out_dims.begin(), nc_dims.begin(), nc_dims.end()); return out_dims; } @@ -143,12 +135,12 @@ class InterpolateMKLDNNKernel : public framework::OpKernel { const auto* x = ctx.Input("X"); auto* z = ctx.Output("Out"); - auto interp_method = ctx.Attr("interp_method"); - dnnl::algorithm algo = (interp_method == "nearest") - ? dnnl::algorithm::resampling_nearest - : dnnl::algorithm::resampling_linear; + const auto interp_method = ctx.Attr("interp_method"); + const dnnl::algorithm algo = (interp_method == "nearest") + ? 
dnnl::algorithm::resampling_nearest + : dnnl::algorithm::resampling_linear; - auto out_dims_vec = ComputeOutputShape(ctx); + const auto out_dims_vec = ComputeOutputShape(ctx); framework::DDim dim_out = phi::make_ddim(out_dims_vec); z->Resize(dim_out); @@ -162,6 +154,7 @@ class InterpolateMKLDNNKernel : public framework::OpKernel { const std::unordered_map args = { {DNNL_ARG_SRC, *src_memory_p}, {DNNL_ARG_DST, *dst_memory_p}}; auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); + resampling_prim->execute(astream, args); astream.wait(); @@ -184,6 +177,7 @@ REGISTER_OP_KERNEL(bilinear_interp, MKLDNN, ::paddle::platform::CPUPlace, REGISTER_OP_KERNEL(nearest_interp_v2, MKLDNN, ::paddle::platform::CPUPlace, ops::InterpolateMKLDNNKernel, + ops::InterpolateMKLDNNKernel, ops::InterpolateMKLDNNKernel, ops::InterpolateMKLDNNKernel); REGISTER_OP_KERNEL(bilinear_interp_v2, MKLDNN, ::paddle::platform::CPUPlace, diff --git a/paddle/fluid/operators/optimizers/adagrad_op.cu b/paddle/fluid/operators/optimizers/adagrad_op.cu index 5bfbc3fd681b8a677e5d512750c69706cc68b2d1..3b8ef9056946a1f84d98621442394dbf3e806576 100644 --- a/paddle/fluid/operators/optimizers/adagrad_op.cu +++ b/paddle/fluid/operators/optimizers/adagrad_op.cu @@ -96,12 +96,14 @@ struct SparseAdagradFunctor { const int block_size = 256; dim3 threads(block_size, 1); dim3 grid2(1, merge_rows.size()); + paddle::framework::MixVector mixv_merge_rows(&merge_rows); SparseAdagradFunctorKernel< T, 256><<(context) .stream()>>>( - grad_merge_data, merge_rows.CUDAMutableData(context.GetPlace()), lr, - param_data, moment_data, grad_width, epsilon); + grad_merge_data, mixv_merge_rows.CUDAMutableData(context.GetPlace()), + lr, param_data, moment_data, grad_width, epsilon); + mixv_merge_rows.CopyToCPU(); } }; diff --git a/paddle/fluid/operators/optimizers/adam_op.cu b/paddle/fluid/operators/optimizers/adam_op.cu index 668dd41fa257f28ab819dd811c1002b024372fab..c1aa392d8a528d248d07fb9654e45e3006e79139 100644 --- a/paddle/fluid/operators/optimizers/adam_op.cu +++ b/paddle/fluid/operators/optimizers/adam_op.cu @@ -345,7 +345,10 @@ class AdamOpCUDAKernel : public framework::OpKernel { auto& grad_merge = *grad_merge_ptr; auto& grad_tensor = grad_merge.value(); const T* grad_data = grad_tensor.template data(); - const int64_t* rows = grad_merge.rows().Data(ctx.GetPlace()); + auto* grad_merge_rows = &grad_merge.rows(); + paddle::framework::MixVector mixv_grad_merge_rows( + grad_merge_rows); + const int64_t* rows = mixv_grad_merge_rows.Data(ctx.GetPlace()); auto row_numel = grad_tensor.numel() / grad_merge.rows().size(); if (beta1_pow->place() == platform::CPUPlace() && diff --git a/paddle/fluid/operators/optimizers/adam_op.h b/paddle/fluid/operators/optimizers/adam_op.h index 7a04b0bd75a4950c926e7db21e13c70ea20d2bb1..decab04f1ca261a828dd749cefbdbaf9f5cfac79 100644 --- a/paddle/fluid/operators/optimizers/adam_op.h +++ b/paddle/fluid/operators/optimizers/adam_op.h @@ -592,7 +592,10 @@ class AdamOpKernel : public framework::OpKernel { auto& grad_merge = *grad_merge_ptr; auto& grad_tensor = grad_merge.value(); const T* grad_data = grad_tensor.template data(); - const int64_t* rows = grad_merge.rows().Data(ctx.GetPlace()); + auto* grad_merge_rows = &grad_merge.rows(); + paddle::framework::MixVector mixv_grad_merge_rows( + grad_merge_rows); + const int64_t* rows = mixv_grad_merge_rows.Data(ctx.GetPlace()); auto row_numel = grad_tensor.numel() / grad_merge.rows().size(); SparseAdamFunctor functor( diff --git a/paddle/fluid/operators/optimizers/adamw_op.cu 
b/paddle/fluid/operators/optimizers/adamw_op.cu index abdc61e7fcb46655e3741c1bd7b37a0ec3fd2c7f..1d61bdec26d581278758f39293e600598624435f 100644 --- a/paddle/fluid/operators/optimizers/adamw_op.cu +++ b/paddle/fluid/operators/optimizers/adamw_op.cu @@ -368,7 +368,10 @@ class AdamWOpCUDAKernel : public framework::OpKernel { auto& grad_merge = *grad_merge_ptr; auto& grad_tensor = grad_merge.value(); const T* grad_data = grad_tensor.template data(); - const int64_t* rows = grad_merge.rows().Data(ctx.GetPlace()); + auto* grad_merge_rows = &grad_merge.rows(); + paddle::framework::MixVector mixv_grad_merge_rows( + grad_merge_rows); + const int64_t* rows = mixv_grad_merge_rows.Data(ctx.GetPlace()); auto row_numel = grad_tensor.numel() / grad_merge.rows().size(); if (beta1_pow->place() == platform::CPUPlace() && diff --git a/paddle/fluid/operators/optimizers/ftrl_op.h b/paddle/fluid/operators/optimizers/ftrl_op.h index b74009120abc48feb8b4da0256eac96b1e9b1698..596ed05df3ffd740958bc123582139464722ac23 100644 --- a/paddle/fluid/operators/optimizers/ftrl_op.h +++ b/paddle/fluid/operators/optimizers/ftrl_op.h @@ -189,7 +189,9 @@ class FTRLOpKernel : public framework::OpKernel { merge_func(ctx.template device_context(), *grad, merged_grad); - const int64_t* rows = merged_grad->rows().Data(ctx.GetPlace()); + auto* merged_rows = merged_grad->mutable_rows(); + paddle::framework::MixVector mixv_merged_rows(merged_rows); + const int64_t* rows = mixv_merged_rows.Data(ctx.GetPlace()); auto row_numel = static_cast(merged_grad->value().dims()[1]); auto row_height = static_cast(merged_grad->rows().size()); diff --git a/paddle/fluid/operators/optimizers/lamb_op.h b/paddle/fluid/operators/optimizers/lamb_op.h index a2189d2a7ca0eda833e926604affc9d9075b1e75..45acf2b3e48345c6a17c75f8409744776a03b243 100644 --- a/paddle/fluid/operators/optimizers/lamb_op.h +++ b/paddle/fluid/operators/optimizers/lamb_op.h @@ -594,7 +594,10 @@ class LambOpKernel : public framework::OpKernel { auto& grad_merge = *grad_merge_ptr; auto& grad_tensor = grad_merge.value(); const T* grad_data = grad_tensor.template data(); - const int64_t* rows = grad_merge.rows().Data(ctx.GetPlace()); + auto* grad_merge_rows = &grad_merge.rows(); + paddle::framework::MixVector mixv_grad_merge_rows( + grad_merge_rows); + const int64_t* rows = mixv_grad_merge_rows.Data(ctx.GetPlace()); auto row_numel = grad_tensor.numel() / grad_merge.rows().size(); if (platform::is_gpu_place(ctx.GetPlace()) && beta1_pow.place() == platform::CPUPlace() && diff --git a/paddle/fluid/operators/optimizers/momentum_op.h b/paddle/fluid/operators/optimizers/momentum_op.h index 0561c18580a3f6098ef3471d1cfaa328e5b31026..e271755b740ce33369348ca6f415af958a43616d 100644 --- a/paddle/fluid/operators/optimizers/momentum_op.h +++ b/paddle/fluid/operators/optimizers/momentum_op.h @@ -561,7 +561,10 @@ class MomentumOpKernel : public framework::OpKernel { merge_func(ctx.template device_context(), *grad, merged_grad); - const int64_t* rows = merged_grad->rows().Data(ctx.GetPlace()); + auto* grad_merge_rows = merged_grad->mutable_rows(); + paddle::framework::MixVector mixv_grad_merge_rows( + grad_merge_rows); + const int64_t* rows = mixv_grad_merge_rows.Data(ctx.GetPlace()); int64_t row_numel = merged_grad->value().numel() / merged_grad->rows().size(); platform::ForRange for_range( diff --git a/paddle/fluid/operators/optimizers/rmsprop_op.h b/paddle/fluid/operators/optimizers/rmsprop_op.h index 66c16d8015806982a5cf5b321e3ff019fe14831a..71decd27d0d7822c67ba4a2782c1ec2461e67911 100644 --- 
a/paddle/fluid/operators/optimizers/rmsprop_op.h +++ b/paddle/fluid/operators/optimizers/rmsprop_op.h @@ -227,7 +227,10 @@ class RmspropOpKernel : public framework::OpKernel { merge_func(dev_ctx, grad, merged_grad); platform::ForRange for_range(dev_ctx, limit); - const int64_t *rows = merged_grad->rows().Data(ctx.GetPlace()); + auto &grad_merge_rows = merged_grad->rows(); + paddle::framework::MixVector mixv_grad_merge_rows( + &grad_merge_rows); + const int64_t *rows = mixv_grad_merge_rows.Data(ctx.GetPlace()); auto &merged_tensor = merged_grad->value(); int64_t row_count = merged_grad->rows().size(); diff --git a/paddle/fluid/operators/optimizers/sgd_op.cu b/paddle/fluid/operators/optimizers/sgd_op.cu index a255f0fed3ce0c7b143de6d75beabe36b08b6d60..3149f5f56ed4964a750f61a354c6cd31a29fc526 100644 --- a/paddle/fluid/operators/optimizers/sgd_op.cu +++ b/paddle/fluid/operators/optimizers/sgd_op.cu @@ -148,11 +148,11 @@ class SGDOpKernel int thread_x = kThreadsPerBlock; int max_threads = ctx.cuda_device_context().GetMaxPhysicalThreadCount(); int max_blocks = std::max(max_threads / kThreadsPerBlock, 1); - + paddle::framework::MixVector mixv_in_rows(&in_rows); SparseSGDFunctorKernel<<>>( - in_data, in_rows.CUDAData(ctx.GetPlace()), learning_rate->data(), - out_data, in_row_numel, in_rows.size()); + in_data, mixv_in_rows.CUDAData(ctx.GetPlace()), + learning_rate->data(), out_data, in_row_numel, in_rows.size()); } else { PADDLE_ENFORCE_EQ(false, true, diff --git a/paddle/fluid/operators/poisson_op.cc b/paddle/fluid/operators/poisson_op.cc index cc4b6e5e0756a0a50dd3f28d6c7056e748c80a87..0cecbf0b9cb027f7032b7b20fb10ef06a79503df 100644 --- a/paddle/fluid/operators/poisson_op.cc +++ b/paddle/fluid/operators/poisson_op.cc @@ -13,8 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. 
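
Editor's aside: the optimizer hunks above (adagrad, adam, adamw, ftrl, lamb, momentum, rmsprop, sgd) all fetch the merged rows of a SelectedRows gradient through a MixVector before indexing into the dense parameter. The underlying update is a row scatter: merged gradient row r updates parameter row rows[r], with row_numel values per row. A standalone host sketch of that update with illustrative names:

```cpp
#include <cstdio>
#include <vector>

// Sparse SGD step over a SelectedRows-style gradient: grad holds only the
// touched rows, and rows[r] gives the parameter row each one scatters into.
void SparseSgd(std::vector<float>* param, const std::vector<float>& grad,
               const std::vector<int64_t>& rows, int64_t row_numel, float lr) {
  for (size_t r = 0; r < rows.size(); ++r)
    for (int64_t j = 0; j < row_numel; ++j)
      (*param)[rows[r] * row_numel + j] -= lr * grad[r * row_numel + j];
}

int main() {
  const int64_t height = 4, width = 3;
  std::vector<float> param(height * width, 1.0f);
  std::vector<float> grad = {0.5f, 0.5f, 0.5f,   // updates parameter row 2
                             1.0f, 1.0f, 1.0f};  // updates parameter row 0
  std::vector<int64_t> rows = {2, 0};
  SparseSgd(&param, grad, rows, width, /*lr=*/0.1f);
  std::printf("param[0]=%.2f param[6]=%.2f\n", param[0], param[2 * width]);
  return 0;
}
```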
*/ #include - -#include "paddle/fluid/operators/poisson_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -23,14 +25,6 @@ class PoissonOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "PoissonOp"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "PoissonOp"); - - auto dim = ctx->GetInputDim("X"); - ctx->SetOutputDim("Out", dim); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { @@ -61,29 +55,6 @@ class PoissonOpInferVarType : public framework::PassInDtypeAndVarTypeToOutput { } }; -template -class PoissonKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - const auto *x = ctx.Input("X"); - auto *out = ctx.Output("Out"); - - const T *x_data = x->data(); - T *out_data = out->mutable_data(ctx.GetPlace()); - - int64_t size = x->numel(); - - auto gen = framework::DefaultCPUGenerator(); - auto engine = gen->GetCPUEngine(); - - for (int64_t i = 0; i < size; ++i) { - std::poisson_distribution<> dist(x_data[i]); - out_data[i] = static_cast(dist(*engine)); - } - } -}; - class PoissonGradOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -116,17 +87,13 @@ class PoissonGradOpMaker : public framework::SingleGradOpMaker { namespace ops = paddle::operators; namespace plat = paddle::platform; +DELCARE_INFER_SHAPE_FUNCTOR(poisson, PoissonInferShapeFunctor, + PT_INFER_META(phi::UnchangedInferMeta)); + REGISTER_OPERATOR(poisson, ops::PoissonOp, ops::PoissonOpMaker, ops::PoissonOpInferVarType, ops::PoissonGradOpMaker, - ops::PoissonGradOpMaker); + ops::PoissonGradOpMaker, + PoissonInferShapeFunctor); REGISTER_OPERATOR(poisson_grad, ops::PoissonGradOp); - -REGISTER_OP_CPU_KERNEL(poisson, - ops::PoissonKernel, - ops::PoissonKernel); - -REGISTER_OP_CPU_KERNEL(poisson_grad, - ops::PoissonGradKernel, - ops::PoissonGradKernel); diff --git a/paddle/fluid/operators/poisson_op.cu b/paddle/fluid/operators/poisson_op.cu deleted file mode 100644 index ef2f6d4665554024066f4e843707d6612290340f..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/poisson_op.cu +++ /dev/null @@ -1,91 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
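
Editor's aside: `poisson_op.cc` above drops both the hand-written `InferShape` (now bound to `phi::UnchangedInferMeta` through an infershape functor) and the CPU kernel, whose body the flattened text garbled. The removed CPU path simply drew one sample per element with `std::poisson_distribution`, using the element's value as the rate. A standalone reconstruction of that loop; the engine setup is simplified relative to Paddle's `DefaultCPUGenerator`:

```cpp
#include <cstdio>
#include <random>
#include <vector>

// Element-wise Poisson sampling: out[i] ~ Poisson(lambda = x[i]).
std::vector<float> PoissonSample(const std::vector<float>& x,
                                 std::mt19937_64* engine) {
  std::vector<float> out(x.size());
  for (size_t i = 0; i < x.size(); ++i) {
    std::poisson_distribution<> dist(x[i]);
    out[i] = static_cast<float>(dist(*engine));
  }
  return out;
}

int main() {
  std::mt19937_64 engine(42);
  auto out = PoissonSample({0.5f, 2.0f, 10.0f}, &engine);
  for (float v : out) std::printf("%.0f ", v);
  std::printf("\n");
  return 0;
}
```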
*/ - -#ifdef __NVCC__ -#include -#endif -#ifdef __HIPCC__ -#include -#endif -#include "paddle/fluid/operators/poisson_op.h" -#include "paddle/fluid/platform/for_range.h" - -namespace paddle { -namespace operators { - -template -struct PoissonCudaFunctor { - public: - PoissonCudaFunctor(const T* in, T* out, unsigned int seed, - unsigned int offset) - : in_(in), out_(out), seed_(seed), offset_(offset) {} - - __device__ void operator()(int64_t idx) { -#ifdef __NVCC__ - curandStatePhilox4_32_10_t state; - curand_init(seed_, idx, offset_, &state); - out_[idx] = static_cast(curand_poisson(&state, in_[idx])); -#elif __HIPCC__ - hiprandStatePhilox4_32_10_t state; - hiprand_init(seed_, idx, offset_, &state); - out_[idx] = static_cast(hiprand_poisson(&state, in_[idx])); -#endif - } - - private: - const T* in_; - T* out_; - const unsigned int seed_; - const unsigned int offset_; -}; - -template -class PoissonKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - - const T* x_data = x->data(); - T* out_data = out->mutable_data(ctx.GetPlace()); - auto size = x->numel(); - int64_t device_id = ctx.GetPlace().GetDeviceId(); - - auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id); - auto seed_offset = gen_cuda->IncrementOffset(20); - uint64_t seed = seed_offset.first; - uint64_t offset = seed_offset.second; - - auto& dev_ctx = ctx.template device_context(); - platform::ForRange for_range(dev_ctx, size); - - PoissonCudaFunctor functor(x_data, out_data, seed, offset); - for_range(functor); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_CUDA_KERNEL(poisson, - ops::PoissonKernel, - ops::PoissonKernel); - -REGISTER_OP_CUDA_KERNEL( - poisson_grad, ops::PoissonGradKernel, - ops::PoissonGradKernel); diff --git a/paddle/fluid/operators/poisson_op.h b/paddle/fluid/operators/poisson_op.h deleted file mode 100644 index 2bcb5244012c7663c413fceaa63a9dbbd78147b3..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/poisson_op.h +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
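
Editor's aside: the deleted `poisson_op.cu` (its template heads also lost in the extraction) used a per-element functor that seeds a Philox state from a (seed, offset) pair handed out by the CUDA generator and calls `curand_poisson` (or `hiprand_poisson` on ROCm). A standalone sketch of the CUDA path, with the seed and offset hard-coded instead of coming from `IncrementOffset`:

```cuda
#include <cstdio>
#include <cuda_runtime.h>
#include <curand_kernel.h>

// One thread per element: out[i] ~ Poisson(lambda = in[i]), Philox counter RNG.
__global__ void PoissonKernel(const float* in, float* out, int64_t n,
                              uint64_t seed, uint64_t offset) {
  int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx >= n) return;
  curandStatePhilox4_32_10_t state;
  curand_init(seed, idx, offset, &state);
  out[idx] = static_cast<float>(curand_poisson(&state, in[idx]));
}

int main() {
  const int64_t n = 8;
  float *in, *out;
  cudaMallocManaged(&in, n * sizeof(float));
  cudaMallocManaged(&out, n * sizeof(float));
  for (int64_t i = 0; i < n; ++i) in[i] = 4.0f;  // rate lambda = 4
  PoissonKernel<<<1, 64>>>(in, out, n, /*seed=*/42, /*offset=*/0);
  cudaDeviceSynchronize();
  for (int64_t i = 0; i < n; ++i) std::printf("%.0f ", out[i]);
  std::printf("\n");
  cudaFree(in); cudaFree(out);
  return 0;
}
```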
- -#pragma once - -#include "paddle/fluid/framework/generator.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -template -class PoissonKernel; - -template -class PoissonGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* dx = ctx.Output(framework::GradVarName("X")); - dx->mutable_data(ctx.GetPlace()); - phi::funcs::SetConstant functor; - auto& dev_ctx = ctx.template device_context(); - functor(dev_ctx, dx, static_cast(0)); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/pscore/send_and_recv_op.cc b/paddle/fluid/operators/pscore/send_and_recv_op.cc index 980351e12a030760b6793ab665d80db737bfa9d5..c5971632b03ef3811d0e836a306f26f7e9a51eb8 100644 --- a/paddle/fluid/operators/pscore/send_and_recv_op.cc +++ b/paddle/fluid/operators/pscore/send_and_recv_op.cc @@ -21,7 +21,7 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" -#include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/randperm_op.cc b/paddle/fluid/operators/randperm_op.cc index bdc2ea0b5bfbbfc45f02d4df3a7cf1dbae25bacf..1b28ab3c133f7d57250e3357b0d732603719ef99 100644 --- a/paddle/fluid/operators/randperm_op.cc +++ b/paddle/fluid/operators/randperm_op.cc @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/randperm_op.h" #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" @@ -89,10 +88,3 @@ REGISTER_OPERATOR( paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker, paddle::operators::RandpermOpVarTypeInference); - -template -using kernel = - paddle::operators::RandpermKernel; - -REGISTER_OP_CPU_KERNEL(randperm, kernel, kernel, kernel, - kernel); diff --git a/paddle/fluid/operators/reader/buffered_reader.cc b/paddle/fluid/operators/reader/buffered_reader.cc index 6393ff2135d1dcae37b2b9e60775460668bf295a..21c23a7f602a35acf676e97a9134c2c43a73126c 100644 --- a/paddle/fluid/operators/reader/buffered_reader.cc +++ b/paddle/fluid/operators/reader/buffered_reader.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/operators/reader/buffered_reader.h" #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" namespace paddle { namespace operators { @@ -115,7 +116,9 @@ void BufferedReader::ReadAsync(size_t i) { platform::CUDAPinnedPlace cuda_pinned_place; std::vector cuda_pinned_ptrs; cuda_pinned_ptrs.reserve(cpu.size()); - platform::RecordEvent record_event("BufferedReader:MemoryCopy"); + platform::RecordEvent record_event( + "BufferedReader:MemoryCopy", platform::TracerEventType::UserDefined, + 1); // NODE(chenweihang): When we use CUDAPinned Memory, we need call // cudaHostAlloc, that is a CUDA API, calling CUDA API need load // cuda lib into device, it will cost hundreds of MB of GPU memory. 
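Note on the profiler migration visible in the hunk above (and repeated in read_op.cc and send_and_recv_op.cc later in this patch): RecordEvent construction now goes through paddle/fluid/platform/profiler/event_tracing.h and takes an explicit tracer event type plus a level, instead of only an event name. A minimal sketch of that pattern, using only names that appear in this diff; the wrapper function below is hypothetical and is not part of the patch:

#include "paddle/fluid/platform/profiler/event_tracing.h"

namespace paddle {
namespace operators {

// Illustrative only: scope a user-defined trace event around a copy,
// mirroring the three-argument RecordEvent calls added in buffered_reader.cc.
void CopyWithTracing() {
  platform::RecordEvent record_event(
      "BufferedReader:MemoryCopy", platform::TracerEventType::UserDefined, 1);
  // ... the memory copy being profiled runs while record_event is in scope ...
}

}  // namespace operators
}  // namespace paddle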
@@ -170,7 +173,9 @@ void BufferedReader::ReadAsync(size_t i) { cudaStreamWaitEvent(stream_.get(), events_[i].get(), 0)); #endif - platform::RecordEvent record_event("BufferedReader:MemoryCopy"); + platform::RecordEvent record_event( + "BufferedReader:MemoryCopy", platform::TracerEventType::UserDefined, + 1); for (size_t i = 0; i < cpu.size(); ++i) { auto cpu_place = cpu[i].place(); auto cpu_ptr = cpu[i].data(); @@ -229,7 +234,9 @@ void BufferedReader::ReadAsync(size_t i) { platform::NPUEventRecord(events_[i].get(), compute_stream_); platform::NPUStreamWaitEvent(stream_.get(), events_[i].get()); - platform::RecordEvent record_event("BufferedReader:MemoryCopy"); + platform::RecordEvent record_event("BufferedReader:MemoryCopy", + platform::TracerEventType::UserDefined, + 1); for (size_t i = 0; i < cpu.size(); ++i) { auto cpu_place = cpu[i].place(); auto cpu_ptr = cpu[i].data(); diff --git a/paddle/fluid/operators/reader/read_op.cc b/paddle/fluid/operators/reader/read_op.cc index 73bc67287c2780d541d93df620776f2936c6ec86..d406640bff240cc24400e858d5c8b274897e1f98 100644 --- a/paddle/fluid/operators/reader/read_op.cc +++ b/paddle/fluid/operators/reader/read_op.cc @@ -15,7 +15,7 @@ #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/reader.h" -#include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" namespace paddle { namespace operators { @@ -106,7 +106,8 @@ class ReadOp : public framework::OperatorBase { std::vector ins; // For profiling - platform::RecordEvent record_event(Type()); + platform::RecordEvent record_event( + Type().c_str(), platform::TracerEventType::UserDefined, 1); reader->ReadNext(&ins); if (ins.empty()) { diff --git a/paddle/fluid/operators/real_op.cc b/paddle/fluid/operators/real_op.cc index 1174e72a76b1bb5aa744b964e289f0ac9c66596c..1f3691978b577e2023eb4f784f2327752855b9b7 100644 --- a/paddle/fluid/operators/real_op.cc +++ b/paddle/fluid/operators/real_op.cc @@ -12,7 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/real_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -20,14 +23,6 @@ namespace operators { class RealOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Real"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Real"); - - auto x_dims = ctx->GetInputDim("X"); - ctx->SetOutputDim("Out", x_dims); - ctx->ShareLoD("X", "Out"); - } }; class RealOpMaker : public framework::OpProtoAndCheckerMaker { @@ -87,19 +82,13 @@ DECLARE_INPLACE_OP_INFERER(RealGradOpInplaceInferer, } // namespace operators } // namespace paddle +DELCARE_INFER_SHAPE_FUNCTOR(real, RealInferShapeFunctor, + PT_INFER_META(phi::UnchangedInferMeta)); + namespace ops = paddle::operators; REGISTER_OPERATOR(real, ops::RealOp, ops::RealOpMaker, ops::RealGradOpMaker<::paddle::framework::OpDesc>, - ops::RealGradOpMaker<::paddle::imperative::OpBase>); + ops::RealGradOpMaker<::paddle::imperative::OpBase>, + RealInferShapeFunctor); REGISTER_OPERATOR(real_grad, ops::RealGradOp); - -REGISTER_OP_CPU_KERNEL(real, ops::RealKernel>, - ops::RealKernel>); -REGISTER_OP_CPU_KERNEL(real_grad, - ops::RealGradKernel>, - ops::RealGradKernel>); diff --git a/paddle/fluid/operators/real_op.cu b/paddle/fluid/operators/real_op.cu deleted file mode 100644 index 9bfb2878a6261bb5c69a1fb543e5aa15a87c5a8f..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/real_op.cu +++ /dev/null @@ -1,28 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/real_op.h" - -namespace ops = paddle::operators; - -REGISTER_OP_CUDA_KERNEL(real, - ops::RealKernel>, - ops::RealKernel>); -REGISTER_OP_CUDA_KERNEL(real_grad, - ops::RealGradKernel>, - ops::RealGradKernel>); diff --git a/paddle/fluid/operators/real_op.h b/paddle/fluid/operators/real_op.h deleted file mode 100644 index c5a9724e8a3048a27aaadfc5e0c42be4816004bd..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/real_op.h +++ /dev/null @@ -1,67 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/for_range.h" -#include "paddle/phi/kernels/funcs/complex_functors.h" - -namespace paddle { -namespace operators { - -template -class RealKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const { - const framework::Tensor* x = ctx.Input("X"); - framework::Tensor* out = ctx.Output("Out"); - - auto numel = x->numel(); - auto* x_data = x->data(); - auto* out_data = out->mutable_data>( - ctx.GetPlace(), - static_cast(numel * sizeof(phi::funcs::Real))); - - auto& dev_ctx = ctx.template device_context(); - platform::ForRange for_range(dev_ctx, numel); - phi::funcs::RealFunctor functor(x_data, out_data, numel); - for_range(functor); - } -}; - -template -class RealGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const { - const framework::Tensor* d_out = - ctx.Input(framework::GradVarName("Out")); - framework::Tensor* d_x = - ctx.Output(framework::GradVarName("X")); - - auto numel = d_out->numel(); - auto* dout_data = d_out->data>(); - auto* dx_data = d_x->mutable_data( - ctx.GetPlace(), static_cast(numel * sizeof(T))); - - auto& dev_ctx = ctx.template device_context(); - platform::ForRange for_range(dev_ctx, numel); - phi::funcs::RealToComplexFunctor functor(dout_data, dx_data, numel); - for_range(functor); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/roi_align_op_xpu.cc b/paddle/fluid/operators/roi_align_op_xpu.cc index 7764e52c2f6da1b401b01292969c4d3d04555933..09d2d906653e8c71ddeca7fa606cf5adac8cc596 100644 --- a/paddle/fluid/operators/roi_align_op_xpu.cc +++ b/paddle/fluid/operators/roi_align_op_xpu.cc @@ -32,6 +32,7 @@ class XPUROIAlignOpKernel : public framework::OpKernel { auto pooled_width = ctx.Attr("pooled_width"); auto spatial_scale = ctx.Attr("spatial_scale"); auto sampling_ratio = ctx.Attr("sampling_ratio"); + auto aligned = ctx.Attr("aligned"); auto in_dims = in->dims(); int batch_size = in_dims[0]; @@ -117,7 +118,7 @@ class XPUROIAlignOpKernel : public framework::OpKernel { dev_ctx.x_context(), in->data(), out->mutable_data(ctx.GetPlace()), rois->data(), roi_id_data, batch_size, channels, height, width, out->dims()[0], pooled_height, - pooled_width, spatial_scale, sampling_ratio, true); + pooled_width, spatial_scale, sampling_ratio, true, aligned); PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, platform::errors::External( "The roi_align XPU OP return wrong value[%d %s]", r, @@ -143,6 +144,7 @@ class XPUROIAlignGradOpKernel : public framework::OpKernel { auto pooled_width = ctx.Attr("pooled_width"); auto spatial_scale = ctx.Attr("spatial_scale"); auto sampling_ratio = ctx.Attr("sampling_ratio"); + auto aligned = ctx.Attr("aligned"); int rois_num = rois->dims()[0]; int channels = in->dims()[1]; @@ -197,7 +199,7 @@ class XPUROIAlignGradOpKernel : public framework::OpKernel { dev_ctx.x_context(), out_grad->data(), in_grad->data(), rois->data(), roi_id_data, in->dims()[0], channels, height, width, out_grad->dims()[0], pooled_height, pooled_width, spatial_scale, - sampling_ratio, true); + sampling_ratio, true, aligned); PADDLE_ENFORCE_EQ( r, xpu::Error_t::SUCCESS, platform::errors::External( diff --git a/paddle/fluid/operators/row_conv_op.cu b/paddle/fluid/operators/row_conv_op.cu index 3def7875232e814b817a7957ab9db65ea611dcf6..c5794948aaec6b47396cbae66a962058812aba11 100644 --- 
a/paddle/fluid/operators/row_conv_op.cu +++ b/paddle/fluid/operators/row_conv_op.cu @@ -336,7 +336,8 @@ class RowConvKernel int num_sequence = batch_indices.size() - 1; int future_context = Filter->dims()[0]; - size_t *idx = batch_indices.CUDAMutableData(context.GetPlace()); + paddle::framework::MixVector mix_vector(&batch_indices); + size_t *idx = mix_vector.CUDAMutableData(context.GetPlace()); auto stream = context.cuda_device_context().stream(); if (future_context <= 32) { @@ -352,6 +353,7 @@ class RowConvKernel RowConvForward<<>>( in, weight, num_sequence, input_dim, future_context, idx, out); } + mix_vector.CopyToCPU(); } }; @@ -392,7 +394,8 @@ class RowConvGradKernel // int input_dim = X->dims()[1]; int num_sequence = batch_indices.size() - 1; int future_context = Filter->dims()[0]; - size_t *idx = batch_indices.CUDAMutableData(context.GetPlace()); + paddle::framework::MixVector mixv_batch_indices(&batch_indices); + size_t *idx = mixv_batch_indices.CUDAMutableData(context.GetPlace()); auto &device_ctx = context.cuda_device_context(); phi::funcs::SetConstant zero; @@ -444,6 +447,7 @@ class RowConvGradKernel dout, weights, num_sequence, input_dim, future_context, idx, din); } } + mixv_batch_indices.CopyToCPU(); } }; } // namespace operators diff --git a/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cu b/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cu index 8092a40d19b195828c3742854e9b3656424feee7..9591f3e8b5bbfe70cb059b621eaca0ae1fff993e 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cu +++ b/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cu @@ -71,7 +71,8 @@ class SequenceEnumerateOpCUDAKernel : public framework::OpKernel { out->Resize({in_dims[0], win_size}); auto out_data = out->mutable_data(context.GetPlace()); // Copy LoD to GPU - const size_t* dev_in_lod_ptr = lod0.CUDAData(context.GetPlace()); + paddle::framework::MixVector mixv_lod0(&lod0); + const size_t* dev_in_lod_ptr = mixv_lod0.CUDAData(context.GetPlace()); // Calc output tensor CalcOutPut<<<(in_len - 1) / PADDLE_CUDA_NUM_THREADS + 1, PADDLE_CUDA_NUM_THREADS, 0, stream>>>( diff --git a/paddle/fluid/operators/sequence_ops/sequence_erase_op.cu b/paddle/fluid/operators/sequence_ops/sequence_erase_op.cu index bb928cf401c3307b76160387e5108264cd5dbb89..12d3eee65da70edd3f360d448360bb59d2f1069f 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_erase_op.cu +++ b/paddle/fluid/operators/sequence_ops/sequence_erase_op.cu @@ -88,7 +88,8 @@ class SequenceEraseOpCUDAKernel : public framework::OpKernel { // Copy LoD to GPU auto last_lod = lod[lod.size() - 1]; auto lod_len = last_lod.size(); - const size_t* dev_in_lod_ptr = last_lod.CUDAData(ctx.GetPlace()); + paddle::framework::MixVector mixv_last_lod(&last_lod); + const size_t* dev_in_lod_ptr = mixv_last_lod.CUDAData(ctx.GetPlace()); // Calc output LoD thrust::device_vector dev_out_lod(lod_len); size_t* dev_out_lod_ptr = thrust::raw_pointer_cast(dev_out_lod.data()); diff --git a/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cu b/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cu index f13849fda41769af12aabf93be748e3ce2ad806b..7e1a06b9eca5b9046d2b772edee0efdb1a69437f 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cu +++ b/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cu @@ -81,8 +81,9 @@ struct SequenceExpandAsFunctor { dim3 block_size(thread_x); dim3 grid_size(block_x); + paddle::framework::MixVector mixv_ref_lod(&ref_lod); sequence_expand_as_kernel<<>>( - 
x.data(), ref_lod.CUDAData(context.GetPlace()), height, width, + x.data(), mixv_ref_lod.CUDAData(context.GetPlace()), height, width, out->mutable_data(context.GetPlace())); } }; @@ -107,10 +108,11 @@ struct SequenceExpandAsGradFunctor { dim3 block_size(thread_x); dim3 grid_size(block_x); + paddle::framework::MixVector mixv_ref_lod(&ref_lod); sequence_expand_as_grad_kernel<<>>( - dout.data(), ref_lod.CUDAData(context.GetPlace()), height, width, - dx->mutable_data(context.GetPlace())); + dout.data(), mixv_ref_lod.CUDAData(context.GetPlace()), height, + width, dx->mutable_data(context.GetPlace())); } }; diff --git a/paddle/fluid/operators/sequence_ops/sequence_expand_op.cu b/paddle/fluid/operators/sequence_ops/sequence_expand_op.cu index cbf5df001707592e03b315b357e3a5d484068011..7b7bc5183bf1f6c98ef386150fcfa4d048e73f01 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_expand_op.cu +++ b/paddle/fluid/operators/sequence_ops/sequence_expand_op.cu @@ -157,7 +157,9 @@ struct SequenceExpandFunctor { out_offset[2 * x_lod_size + i] = ref_lod[i]; } - const size_t* out_offset_data = out_offset.CUDAData(context.GetPlace()); + paddle::framework::MixVector mixv_out_offset(&out_offset); + const size_t* out_offset_data = + mixv_out_offset.CUDAData(context.GetPlace()); const size_t* x_lod_data = out_offset_data + x_lod_size; const size_t* ref_lod_data = out_offset_data + 2 * x_lod_size; @@ -193,11 +195,14 @@ struct SequenceExpandGradFunctor { int block_x = static_cast(ref_lod.size()); dim3 block_size(thread_x, thread_y, thread_z); dim3 grid_size(block_x, 1); + paddle::framework::MixVector mixv_ref_lod(&ref_lod); + paddle::framework::MixVector mixv_x_lod(&x_lod); + paddle::framework::MixVector mixv_out_offset(&out_offset); sequence_expand_grad_kernel<<>>( - dout.data(), ref_lod.CUDAData(context.GetPlace()), - x_lod.CUDAData(context.GetPlace()), - out_offset.CUDAData(context.GetPlace()), ref_lod.size(), x_item_length, - dx->mutable_data(context.GetPlace())); + dout.data(), mixv_ref_lod.CUDAData(context.GetPlace()), + mixv_x_lod.CUDAData(context.GetPlace()), + mixv_out_offset.CUDAData(context.GetPlace()), ref_lod.size(), + x_item_length, dx->mutable_data(context.GetPlace())); } }; diff --git a/paddle/fluid/operators/sequence_ops/sequence_reverse_op.h b/paddle/fluid/operators/sequence_ops/sequence_reverse_op.h index c42df836de15f5c51caf32e5d0b7b7d8123ff201..90a17d713cf299a3a61169cfc6f16fce7bb5901c 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_reverse_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_reverse_op.h @@ -132,7 +132,9 @@ class SequenceReverseOpKernel : public framework::OpKernel { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::is_gpu_place(ctx.GetPlace())) { - lod = x.lod()[0].CUDAData(ctx.GetPlace()); + auto xlod = x.lod()[0]; + paddle::framework::MixVector mixv_xlod(&xlod); + lod = mixv_xlod.CUDAData(ctx.GetPlace()); } else { #endif lod = x.lod()[0].data(); diff --git a/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cu b/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cu index 220165ac1bd4f6a80a2f3c0b21f5423352982588..c91c59dbfee9993711e777668063bec73a3746d8 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cu +++ b/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cu @@ -133,9 +133,10 @@ struct SequenceSoftmaxFunctor { dim3 block_size(thread_x); dim3 grid_size(max_blocks); + paddle::framework::MixVector mixv_ref_lod(&ref_lod); sequence_softmax_kernel< T, kThreadsPerBlock><<>>( - x.data(), 
ref_lod.CUDAData(context.GetPlace()), height, + x.data(), mixv_ref_lod.CUDAData(context.GetPlace()), height, out->mutable_data(context.GetPlace())); } }; @@ -156,10 +157,12 @@ struct SequenceSoftmaxGradFunctor { dim3 block_size(thread_x); dim3 grid_size(max_blocks); + paddle::framework::MixVector mixv_ref_lod(&ref_lod); sequence_softmax_grad_kernel< T, kThreadsPerBlock><<>>( - dout.data(), out.data(), ref_lod.CUDAData(context.GetPlace()), - height, dx->mutable_data(context.GetPlace())); + dout.data(), out.data(), + mixv_ref_lod.CUDAData(context.GetPlace()), height, + dx->mutable_data(context.GetPlace())); } }; diff --git a/paddle/fluid/operators/trunc_op.cc b/paddle/fluid/operators/trunc_op.cc index bd3dc002990a7cf3af738eb2d914b3fc3dd9e79a..54f4deac80a74e2e471036c2e25d08a582e29a9d 100644 --- a/paddle/fluid/operators/trunc_op.cc +++ b/paddle/fluid/operators/trunc_op.cc @@ -12,8 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -21,14 +23,6 @@ namespace operators { class TruncOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "trunc"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "trunc"); - auto input_dims = ctx->GetInputDim("X"); - ctx->SetOutputDim("Out", input_dims); - ctx->ShareLoD("X", /*->*/ "Out"); - } }; class TruncOpMaker : public framework::OpProtoAndCheckerMaker { @@ -75,9 +69,13 @@ class TruncGradOpMaker : public framework::SingleGradOpMaker { } // namespace operators } // namespace paddle +DELCARE_INFER_SHAPE_FUNCTOR(trunc, TruncInferShapeFunctor, + PT_INFER_META(phi::UnchangedInferMeta)); + namespace ops = paddle::operators; REGISTER_OPERATOR(trunc, ops::TruncOp, ops::TruncOpMaker, ops::TruncGradOpMaker, - ops::TruncGradOpMaker); + ops::TruncGradOpMaker, + TruncInferShapeFunctor); REGISTER_OPERATOR(trunc_grad, ops::TruncGradOp); diff --git a/paddle/fluid/operators/unbind_op.cc b/paddle/fluid/operators/unbind_op.cc index 3fce0f8f47d32a602d56e88b43ddb9bf3d4b15f8..f2fc08308c6b32868adc8057c9bc2a92c4247c60 100644 --- a/paddle/fluid/operators/unbind_op.cc +++ b/paddle/fluid/operators/unbind_op.cc @@ -14,6 +14,9 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/unbind_op.h" #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -79,11 +82,3 @@ namespace ops = paddle::operators; REGISTER_OPERATOR(unbind, ops::UnbindOp, ops::UnbindOpMaker, ops::UnbindGradMaker, ops::UnbindGradMaker); -namespace plat = paddle::platform; -REGISTER_OP_CPU_KERNEL( - unbind, ops::UnbindOpKernel, - ops::UnbindOpKernel, - ops::UnbindOpKernel, - ops::UnbindOpKernel, - ops::UnbindOpKernel, - ops::UnbindOpKernel); diff --git a/paddle/fluid/operators/unbind_op.cu.cc b/paddle/fluid/operators/unbind_op.cu.cc deleted file mode 100644 index cec7058d3cf52eff55eb88afaa217204a72e4566..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/unbind_op.cu.cc +++ /dev/null @@ -1,24 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/unbind_op.h" -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL( - unbind, ops::UnbindOpKernel, - ops::UnbindOpKernel, - ops::UnbindOpKernel, - ops::UnbindOpKernel, - ops::UnbindOpKernel, - ops::UnbindOpKernel); diff --git a/paddle/fluid/operators/unbind_op.h b/paddle/fluid/operators/unbind_op.h index 69808e3f9fe9ed4a92152fc89532a7470bf85f6f..6e35f262de420744b5299fbf1ab540e34c711d92 100644 --- a/paddle/fluid/operators/unbind_op.h +++ b/paddle/fluid/operators/unbind_op.h @@ -34,27 +34,6 @@ static inline framework::DDim UnbindOutsDims(const framework::DDim in_dims, } return phi::make_ddim(out_dims); } -template -class UnbindOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto outs = ctx.MultiOutput("Out"); - int axis = ctx.Attr("axis"); - - auto in_dims = in->dims(); - axis = axis < 0 ? in_dims.size() + axis : axis; - std::vector shape_refer; - for (size_t j = 0; j < outs.size(); ++j) { - outs[j]->mutable_data(ctx.GetPlace()); - shape_refer.emplace_back(outs[j]); - } - - auto& dev_ctx = ctx.template device_context(); - math::SplitFunctor functor; - functor(dev_ctx, *in, shape_refer, axis, &outs); - } -}; template class UnbindGradMaker : public framework::SingleGradOpMaker { diff --git a/paddle/fluid/operators/unfold_op.cc b/paddle/fluid/operators/unfold_op.cc index 0a8cd6e65f93e080797a17eb110b10e53b8ddc69..c45b839d5b40bd1d0db25743406bb8cc319f1280 100644 --- a/paddle/fluid/operators/unfold_op.cc +++ b/paddle/fluid/operators/unfold_op.cc @@ -12,7 +12,9 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#include "paddle/fluid/operators/unfold_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -60,126 +62,6 @@ feature map, a series of such columns will be formed. class UnfoldOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ( - ctx->HasInput("X"), true, - platform::errors::NotFound("Input(X) of UnfoldOp should not be null")); - PADDLE_ENFORCE_EQ( - ctx->HasOutput("Y"), true, - platform::errors::NotFound("Output(Y) of UnfoldOp should not be null")); - auto in_dims = ctx->GetInputDim("X"); - std::vector kernel_sizes = - ctx->Attrs().Get>("kernel_sizes"); - std::vector strides = ctx->Attrs().Get>("strides"); - std::vector paddings = ctx->Attrs().Get>("paddings"); - std::vector dilations = - ctx->Attrs().Get>("dilations"); - - // Only [N, C, H, W] input supported now - PADDLE_ENFORCE_EQ( - in_dims.size(), 4, - platform::errors::InvalidArgument( - "Input should be 4-D tensor of format [N, C, H, W], but get %u", - in_dims.size())); - PADDLE_ENFORCE_EQ( - in_dims.size() - kernel_sizes.size(), 2U, - platform::errors::InvalidArgument( - "The dims of X should be larger than that of kernel_sizes " - "by a number of 2, due to the batch size and input channel dim. " - "But recieved dims(X:%u) - dims(kernel_sizes:%u) != 2", - in_dims.size(), kernel_sizes.size())); - PADDLE_ENFORCE_EQ( - strides.size(), kernel_sizes.size(), - platform::errors::InvalidArgument( - "The dims of strides should be the same with that of kernel_sizes. " - "But recieved dims(strides: %u) != dims(kernel_sizes: %u).", - strides.size(), kernel_sizes.size())); - PADDLE_ENFORCE_EQ( - paddings.size(), 2 * strides.size(), - platform::errors::InvalidArgument( - "The dims of paddings should be 2 times of that of strides. " - "But recieved dims(paddings: %u) != 2*dims(strides: %u).", - paddings.size(), strides.size())); - PADDLE_ENFORCE_EQ( - strides.size(), dilations.size(), - platform::errors::InvalidArgument( - "The dims of strides should be the same with that of dilations. 
" - "But recieved dims(strides: %u) != dims(dilations: %u).", - strides.size(), dilations.size())); - - // check kernel_sizes - PADDLE_ENFORCE_GT(kernel_sizes[0], 0, - platform::errors::InvalidArgument( - "The `kernel_sizes` should be greater than zero, " - "but recieved kernel_height: %d kernel_width: %d.", - kernel_sizes[0], kernel_sizes[1])); - PADDLE_ENFORCE_GT(kernel_sizes[1], 0, - platform::errors::InvalidArgument( - "The `kernel_sizes` should be greater than zero, " - "but recieved kernel_height: %d kernel_width: %d.", - kernel_sizes[0], kernel_sizes[1])); - // check strides - PADDLE_ENFORCE_GT(strides[0], 0, - platform::errors::InvalidArgument( - "The `strides` should be greater than zero, " - "but recieved strides_height: %d strides_width: %d.", - strides[0], strides[1])); - PADDLE_ENFORCE_GT(strides[1], 0, - platform::errors::InvalidArgument( - "The `strides` should be greater than zero, " - "but recieved strides_height: %d strides_width: %d.", - strides[0], strides[1])); - // check dilations - PADDLE_ENFORCE_GT( - dilations[0], 0, - platform::errors::InvalidArgument( - "The `dilations` should be greater than zero, " - "but recieved dilations_height: %d dilations_width: %d.", - dilations[0], dilations[1])); - PADDLE_ENFORCE_GT( - dilations[1], 0, - platform::errors::InvalidArgument( - "The `dilations` should be greater than zero, " - "but recieved dilations_height: %d dilations_width: %d.", - dilations[0], dilations[1])); - - std::vector out_dims; - out_dims.push_back(in_dims[0]); - int output_channels = in_dims[1] * kernel_sizes[0] * kernel_sizes[1]; - out_dims.push_back(output_channels); - - int output_height = - CalcOutputSize(in_dims[2], kernel_sizes[0], dilations[0], paddings[0], - paddings[2], strides[0]); - int output_width = CalcOutputSize(in_dims[3], kernel_sizes[1], dilations[1], - paddings[1], paddings[3], strides[1]); - if (ctx->IsRuntime()) { - // only check output height and width in runtime - PADDLE_ENFORCE_GT( - output_height, 0, - platform::errors::InvalidArgument( - "The sliding blocks calculated from input spatial size " - "(%d, %d), kernel_sizes (%d, %d), strides (%d, %d), " - "dilations (%d, %d), is (%d, %d), which should be a " - "positive integer.", - in_dims[2], in_dims[3], kernel_sizes[0], kernel_sizes[1], - strides[0], strides[1], dilations[0], dilations[1], output_height, - output_width)); - PADDLE_ENFORCE_GT( - output_width, 0, - platform::errors::InvalidArgument( - "The sliding blocks calculated from input spatial size " - "(%d, %d), kernel_sizes (%d, %d), strides (%d, %d), " - "dilations (%d, %d), is (%d, %d), which should be a " - "positive integer.", - in_dims[2], in_dims[3], kernel_sizes[0], kernel_sizes[1], - strides[0], strides[1], dilations[0], dilations[1], output_height, - output_width)); - } - int output_col_length = output_height * output_width; - out_dims.push_back(output_col_length); - ctx->SetOutputDim("Y", phi::make_ddim(out_dims)); - } protected: framework::OpKernelType GetExpectedKernelType( @@ -237,16 +119,11 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(UnfoldGradOpNoNeedBufferVarsInferer, "X"); } // namespace paddle namespace ops = paddle::operators; +DELCARE_INFER_SHAPE_FUNCTOR(unfold, UnfoldInferShapeFunctor, + PT_INFER_META(phi::UnfoldInferMeta)); REGISTER_OPERATOR(unfold, ops::UnfoldOp, ops::UnfoldOpMaker, ops::UnfoldGradMaker, - ops::UnfoldGradMaker); + ops::UnfoldGradMaker, + UnfoldInferShapeFunctor); REGISTER_OPERATOR(unfold_grad, ops::UnfoldGradOp, ops::UnfoldGradOpNoNeedBufferVarsInferer); - -REGISTER_OP_CPU_KERNEL( - unfold, 
ops::UnfoldOpKernel, - ops::UnfoldOpKernel); -REGISTER_OP_CPU_KERNEL( - unfold_grad, - ops::UnfoldGradOpKernel, - ops::UnfoldGradOpKernel); diff --git a/paddle/fluid/operators/unfold_op.cu b/paddle/fluid/operators/unfold_op.cu deleted file mode 100644 index 46584506d431564cfc7af11072eee6c544f03564..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/unfold_op.cu +++ /dev/null @@ -1,26 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -Indicesou may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/unfold_op.h" - -namespace ops = paddle::operators; - -REGISTER_OP_CUDA_KERNEL( - unfold, ops::UnfoldOpKernel, - ops::UnfoldOpKernel); - -REGISTER_OP_CUDA_KERNEL( - unfold_grad, - ops::UnfoldGradOpKernel, - ops::UnfoldGradOpKernel); diff --git a/paddle/fluid/operators/unfold_op.h b/paddle/fluid/operators/unfold_op.h deleted file mode 100644 index f35bce3abff2b272d589067d27d31b4d3c6191a2..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/unfold_op.h +++ /dev/null @@ -1,121 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
*/ - -#pragma once - -#include -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/im2col.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -inline int CalcOutputSize(int input_size, int filter_size, int dilation, - int padding1, int padding2, int stride) { - const int dkernel = dilation * (filter_size - 1) + 1; - int output_size = (input_size + padding1 + padding2 - dkernel) / stride + 1; - return output_size; -} - -template -class UnfoldOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const Tensor* input = ctx.Input("X"); - const int batch_size = static_cast(input->dims()[0]); - Tensor* output = ctx.Output("Y"); - output->mutable_data(ctx.GetPlace()); - - std::vector kernel_sizes = ctx.Attr>("kernel_sizes"); - std::vector strides = ctx.Attr>("strides"); - std::vector paddings = ctx.Attr>("paddings"); - std::vector dilations = ctx.Attr>("dilations"); - - math::Im2ColFunctor im2col; - auto& dev_ctx = ctx.template device_context(); - - auto input_dims = input->dims(); - - int output_height = - CalcOutputSize(input_dims[2], kernel_sizes[0], dilations[0], - paddings[0], paddings[2], strides[0]); - int output_width = - CalcOutputSize(input_dims[3], kernel_sizes[1], dilations[1], - paddings[1], paddings[3], strides[1]); - - framework::DDim input_shape({input_dims[1], input_dims[2], input_dims[3]}); - framework::DDim output_matrix_shape({input_dims[1], kernel_sizes[0], - kernel_sizes[1], output_height, - output_width}); - - for (int i = 0; i < batch_size; i++) { - Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape); - Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape); - im2col(dev_ctx, in_batch, dilations, strides, paddings, &out_batch); - } - } -}; - -template -class UnfoldGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const Tensor* output_grad = ctx.Input(framework::GradVarName("Y")); - Tensor* input_grad = ctx.Output(framework::GradVarName("X")); - input_grad->mutable_data(ctx.GetPlace()); - - if ((!output_grad) || (!input_grad)) return; - - std::vector kernel_sizes = ctx.Attr>("kernel_sizes"); - std::vector strides = ctx.Attr>("strides"); - std::vector paddings = ctx.Attr>("paddings"); - std::vector dilations = ctx.Attr>("dilations"); - - const int batch_size = static_cast(input_grad->dims()[0]); - - auto input_dims = input_grad->dims(); - - int output_height = - CalcOutputSize(input_dims[2], kernel_sizes[0], dilations[0], - paddings[0], paddings[2], strides[0]); - int output_width = - CalcOutputSize(input_dims[3], kernel_sizes[1], dilations[1], - paddings[1], paddings[3], strides[1]); - - framework::DDim input_shape({input_dims[1], input_dims[2], input_dims[3]}); - framework::DDim output_matrix_shape({input_dims[1], kernel_sizes[0], - kernel_sizes[1], output_height, - output_width}); - - math::Col2ImFunctor col2im; - auto& dev_ctx = ctx.template device_context(); - - phi::funcs::SetConstant set_zero; - set_zero(dev_ctx, input_grad, static_cast(0)); - for (int i = 0; i < batch_size; i++) { - Tensor out_grad_batch = - output_grad->Slice(i, i + 1).Resize(output_matrix_shape); - Tensor in_grad_batch = input_grad->Slice(i, i + 1).Resize(input_shape); - col2im(dev_ctx, out_grad_batch, dilations, strides, paddings, - &in_grad_batch); - } - } -}; -} // namespace operators -} // 
namespace paddle diff --git a/paddle/fluid/operators/uniform_random_inplace_op.cu b/paddle/fluid/operators/uniform_random_inplace_op.cu index a5231354eb47ea3b5cdd802a9b77f7ba7e313c1e..1c7b9a27f868821ceb20c720548b4df0ee6bcd40 100644 --- a/paddle/fluid/operators/uniform_random_inplace_op.cu +++ b/paddle/fluid/operators/uniform_random_inplace_op.cu @@ -12,130 +12,17 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include -#include -#include -#include -#include "paddle/fluid/framework/generator.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/operators/uniform_random_op.h" +#include "paddle/phi/kernels/full_kernel.h" namespace paddle { namespace operators { - -template -struct UniformGenerator { - T min_, max_; - unsigned int seed_; - T diag_val_; - unsigned int diag_num_; - unsigned int diag_step_; - __host__ __device__ UniformGenerator(T min, T max, int seed, int diag_num, - int diag_step, T diag_val) - : min_(min), - max_(max), - seed_(seed), - diag_num_(diag_num), - diag_step_(diag_step), - diag_val_(diag_val) {} - - __host__ __device__ T operator()(const unsigned int n) const { - thrust::minstd_rand rng; - rng.seed(seed_); - thrust::uniform_real_distribution dist(min_, max_); - rng.discard(n); - T out = dist(rng); - unsigned int remainder = n % (diag_step_ + 1); - if (remainder == 0 && diag_num_ > n / (diag_step_ + 1)) { - out = diag_val_; - } - return out; - } -}; - -template -struct UniformGeneratorOffset { - T min_, max_; - unsigned int seed_; - T diag_val_; - unsigned int diag_num_; - unsigned int diag_step_; - int offset_; - __host__ __device__ UniformGeneratorOffset(T min, T max, int seed, - int diag_num, int diag_step, - T diag_val, int offset) - : min_(min), - max_(max), - seed_(seed), - diag_num_(diag_num), - diag_step_(diag_step), - diag_val_(diag_val), - offset_(offset) {} - - __host__ __device__ T operator()(const unsigned int n) const { - thrust::minstd_rand rng; - rng.seed(seed_); - thrust::uniform_real_distribution dist(min_, max_); - rng.discard(n + offset_); - T out = dist(rng); - unsigned int remainder = n % (diag_step_ + 1); - if (remainder == 0 && diag_num_ > n / (diag_step_ + 1)) { - out = diag_val_; - } - return out; - } -}; - -template -__global__ void fill_value(int64_t size, T* data, float value) { - for (int idx = threadIdx.x; idx < size; idx += blockDim.x) { - data[idx] = static_cast(value); - } -} - -// It seems that Eigen::Tensor::random in GPU will SEGFAULT. -// Use std::random and thrust::random(thrust is a std library in CUDA) to -// implement uniform random as uniform_random_op.cu. 
template class GPUUniformRandomInplaceKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto out_var = ctx.OutputVar("Out"); - auto* tensor = out_var->GetMutable(); - T* data = tensor->mutable_data(ctx.GetPlace()); - unsigned int seed = static_cast(ctx.Attr("seed")); - bool seed_flag = false; - if (seed == 0) { - std::random_device rd; - seed = rd(); - seed_flag = true; - } - - T min = static_cast(ctx.Attr("min")); - T max = static_cast(ctx.Attr("max")); - unsigned int diag_num = - static_cast(ctx.Attr("diag_num")); - unsigned int diag_step = - static_cast(ctx.Attr("diag_step")); - T diag_val = static_cast(ctx.Attr("diag_val")); - thrust::counting_iterator index_sequence_begin(0); - int64_t size = tensor->numel(); - int device_id = ctx.GetPlace().GetDeviceId(); - auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id); - if (gen_cuda->GetIsInitPy() && seed_flag) { - auto seed_offset = gen_cuda->IncrementOffset(1); - int64_t gen_offset = size * seed_offset.second; - thrust::transform( - index_sequence_begin, index_sequence_begin + size, - thrust::device_ptr(data), - UniformGeneratorOffset(min, max, seed_offset.first, diag_num, - diag_step, diag_val, gen_offset)); - } else { - thrust::transform( - index_sequence_begin, index_sequence_begin + size, - thrust::device_ptr(data), - UniformGenerator(min, max, seed, diag_num, diag_step, diag_val)); - } + void Compute(const framework::ExecutionContext& context) const override { + auto* tensor = context.Output("Out"); + UniformRandom(context, tensor); } }; @@ -143,17 +30,15 @@ template class GPUUniformRandomInplaceGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { -#ifdef __HIPCC__ - const int64_t kMaxBlockDim = 256; -#else - const int64_t kMaxBlockDim = 512; -#endif auto* dx = ctx.Output(framework::GradVarName("X")); - auto* data = dx->mutable_data(ctx.GetPlace()); - - auto size = dx->numel(); - int64_t kBlockDim = std::min(size, kMaxBlockDim); - fill_value<<<1, kBlockDim, 0>>>(size, data, static_cast(0)); + auto dims = vectorize(dx->dims()); + const auto& dev_cxt = + ctx.template device_context(); + float value = static_cast(0.0f); + phi::FullKernel( + static_cast::TYPE&>(dev_cxt), + dims, value, phi::DataType::UNDEFINED, dx); } }; diff --git a/paddle/fluid/operators/uniform_random_op.cu b/paddle/fluid/operators/uniform_random_op.cu index 086c57527b48ffc940c029fb462afd6c22d86f98..fb38a6aded4cf173bb4c0dd96d131ff520b6701e 100644 --- a/paddle/fluid/operators/uniform_random_op.cu +++ b/paddle/fluid/operators/uniform_random_op.cu @@ -11,88 +11,11 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include -#include -#include -#include -#include "paddle/fluid/framework/generator.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/operators/amp/fp16_type_traits.h" -#include "paddle/fluid/operators/distribution_helper.h" #include "paddle/fluid/operators/uniform_random_op.h" -DECLARE_bool(use_curand); - namespace paddle { namespace operators { -template -struct UniformGenerator { - T min_, max_; - unsigned int seed_; - T diag_val_; - unsigned int diag_num_; - unsigned int diag_step_; - __host__ __device__ UniformGenerator(T min, T max, int seed, int diag_num, - int diag_step, T diag_val) - : min_(min), - max_(max), - seed_(seed), - diag_num_(diag_num), - diag_step_(diag_step), - diag_val_(diag_val) {} - - __host__ __device__ T operator()(const unsigned int n) const { - thrust::minstd_rand rng; - rng.seed(seed_); - thrust::uniform_real_distribution dist(min_, max_); - rng.discard(n); - T out = dist(rng); - unsigned int remainder = n % (diag_step_ + 1); - if (remainder == 0 && diag_num_ > n / (diag_step_ + 1)) { - out = diag_val_; - } - return out; - } -}; - -template -struct UniformGeneratorOffset { - T min_, max_; - unsigned int seed_; - T diag_val_; - unsigned int diag_num_; - unsigned int diag_step_; - int offset_; - __host__ __device__ UniformGeneratorOffset(T min, T max, int seed, - int diag_num, int diag_step, - T diag_val, int offset) - : min_(min), - max_(max), - seed_(seed), - diag_num_(diag_num), - diag_step_(diag_step), - diag_val_(diag_val), - offset_(offset) {} - - __host__ __device__ T operator()(const unsigned int n) const { - thrust::minstd_rand rng; - rng.seed(seed_); - thrust::uniform_real_distribution dist(min_, max_); - rng.discard(n + offset_); - T out = dist(rng); - unsigned int remainder = n % (diag_step_ + 1); - if (remainder == 0 && diag_num_ > n / (diag_step_ + 1)) { - out = diag_val_; - } - return out; - } -}; - -// It seems that Eigen::Tensor::random in GPU will SEGFAULT. -// Use std::random and thrust::random(thrust is a std library in CUDA) to -// implement uniform random. 
template class GPUUniformRandomKernel : public framework::OpKernel { public: @@ -128,50 +51,7 @@ class GPUUniformRandomKernel : public framework::OpKernel { "unsupport type: %s.", framework::ToTypeName(out_var->Type()))); } - auto& dev_cxt = - context.template device_context(); - T* data = tensor->mutable_data(dev_cxt.GetPlace()); - unsigned int seed = static_cast(context.Attr("seed")); - bool seed_flag = false; - if (seed == 0) { - std::random_device rd; - seed = rd(); - seed_flag = true; - } - - T min = static_cast(context.Attr("min")); - T max = static_cast(context.Attr("max")); - unsigned int diag_num = - static_cast(context.Attr("diag_num")); - unsigned int diag_step = - static_cast(context.Attr("diag_step")); - T diag_val = static_cast(context.Attr("diag_val")); - thrust::counting_iterator index_sequence_begin(0); - int64_t size = tensor->numel(); - int device_id = context.GetPlace().GetDeviceId(); - auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id); - if (gen_cuda->GetIsInitPy() && seed_flag) { - if (FLAGS_use_curand) { - using MT = typename details::MPTypeTrait::Type; - distribution::uniform_distribution dist; - distribution::uniform_transform trans(min, max); - distribution::distribution_and_transform(dev_cxt, tensor, dist, - trans); - } else { - auto seed_offset = gen_cuda->IncrementOffset(1); - int64_t gen_offset = size * seed_offset.second; - thrust::transform( - index_sequence_begin, index_sequence_begin + size, - thrust::device_ptr(data), - UniformGeneratorOffset(min, max, seed_offset.first, diag_num, - diag_step, diag_val, gen_offset)); - } - } else { - thrust::transform( - index_sequence_begin, index_sequence_begin + size, - thrust::device_ptr(data), - UniformGenerator(min, max, seed, diag_num, diag_step, diag_val)); - } + UniformRandom(context, tensor); } }; diff --git a/paddle/fluid/operators/uniform_random_op.h b/paddle/fluid/operators/uniform_random_op.h index be6c3c740e692c17504fb36bd807c06768da2ee9..a864c48ad757411861b6d2b3be40361c347601f8 100644 --- a/paddle/fluid/operators/uniform_random_op.h +++ b/paddle/fluid/operators/uniform_random_op.h @@ -18,6 +18,16 @@ #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" +#if defined(__NVCC__) || defined(__HIPCC__) +DECLARE_bool(use_curand); +#include +#include +#include +#include +#include "paddle/fluid/framework/generator.h" +#include "paddle/fluid/operators/index_impl.cu.h" +#include "paddle/phi/kernels/full_kernel.h" +#endif namespace paddle { namespace operators { @@ -102,5 +112,117 @@ inline std::vector GetNewDataFromShapeTensorList( return vec_new_shape; } + +#if defined(__NVCC__) || defined(__HIPCC__) + +template +struct UniformGenerator { + T min_, max_; + unsigned int seed_; + T diag_val_; + unsigned int diag_num_; + unsigned int diag_step_; + __host__ __device__ UniformGenerator(T min, T max, int seed, int diag_num, + int diag_step, T diag_val) + : min_(min), + max_(max), + seed_(seed), + diag_num_(diag_num), + diag_step_(diag_step), + diag_val_(diag_val) {} + + __host__ __device__ T operator()(const unsigned int n) const { + thrust::minstd_rand rng; + rng.seed(seed_); + thrust::uniform_real_distribution dist(min_, max_); + rng.discard(n); + T out = dist(rng); + unsigned int remainder = n % (diag_step_ + 1); + if (remainder == 0 && diag_num_ > n / (diag_step_ + 1)) { + out = diag_val_; + } + return out; + } +}; + +template +struct UniformGeneratorOffset { + T min_, max_; + unsigned int seed_; + T diag_val_; + unsigned int diag_num_; + unsigned int 
diag_step_; + int offset_; + __host__ __device__ UniformGeneratorOffset(T min, T max, int seed, + int diag_num, int diag_step, + T diag_val, int offset) + : min_(min), + max_(max), + seed_(seed), + diag_num_(diag_num), + diag_step_(diag_step), + diag_val_(diag_val), + offset_(offset) {} + + __host__ __device__ T operator()(const unsigned int n) const { + thrust::minstd_rand rng; + rng.seed(seed_); + thrust::uniform_real_distribution dist(min_, max_); + rng.discard(n + offset_); + T out = dist(rng); + unsigned int remainder = n % (diag_step_ + 1); + if (remainder == 0 && diag_num_ > n / (diag_step_ + 1)) { + out = diag_val_; + } + return out; + } +}; + +template +void UniformRandom(const framework::ExecutionContext& context, + framework::Tensor* tensor) { + int64_t size = tensor->numel(); + auto& dev_cxt = + context.template device_context(); + T* data = tensor->mutable_data(dev_cxt.GetPlace()); + if (size <= 0) return; + unsigned int seed = static_cast(context.Attr("seed")); + bool seed_flag = false; + if (seed == 0) { + std::random_device rd; + seed = rd(); + seed_flag = true; + } + + T min = static_cast(context.Attr("min")); + T max = static_cast(context.Attr("max")); + unsigned int diag_num = + static_cast(context.Attr("diag_num")); + unsigned int diag_step = + static_cast(context.Attr("diag_step")); + T diag_val = static_cast(context.Attr("diag_val")); + int device_id = context.GetPlace().GetDeviceId(); + auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id); + if (gen_cuda->GetIsInitPy() && seed_flag) { + if (FLAGS_use_curand) { + using MT = typename details::MPTypeTrait::Type; + distribution::uniform_distribution dist; + distribution::uniform_transform trans(min, max); + distribution::distribution_and_transform(dev_cxt, tensor, dist, trans); + } else { + auto seed_offset = gen_cuda->IncrementOffset(1); + int64_t gen_offset = size * seed_offset.second; + auto func = + UniformGeneratorOffset(min, max, seed_offset.first, diag_num, + diag_step, diag_val, gen_offset); + IndexKernel>(dev_cxt, tensor, func); + } + } else { + auto func = + UniformGenerator(min, max, seed, diag_num, diag_step, diag_val); + IndexKernel>(dev_cxt, tensor, func); + } +} +#endif } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/where_op.cu b/paddle/fluid/operators/where_op.cu index 54b0d5b69086cda3ebdefa76636aff734d1a150c..61a1691e4fe265035917ed2407d5e3e24aa6bd88 100644 --- a/paddle/fluid/operators/where_op.cu +++ b/paddle/fluid/operators/where_op.cu @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h" #include "paddle/fluid/operators/where_op.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" @@ -20,6 +21,15 @@ namespace platform = paddle::platform; namespace paddle { namespace operators { +template +struct CondFunctor { + HOSTDEVICE inline CondFunctor() {} + + HOSTDEVICE inline T operator()(const bool cond, const T x, const T y) const { + return cond ? 
x : y; + } +}; + template __global__ void WhereCUDAKernel(const int N, const bool* cond, const T* x, const T* y, T* out) { @@ -63,10 +73,11 @@ class WhereKernel auto stream = context.cuda_device_context().stream(); auto& dev_ctx = context.template device_context(); - auto config = GetGpuLaunchConfig1D(dev_ctx, numel); - WhereCUDAKernel< - T><<>>( - numel, cond_data, x_data, y_data, out_data); + auto functor = CondFunctor(); + std::vector ins = {condition, X, Y}; + std::vector outs = {out}; + paddle::operators::LaunchSameDimsElementwiseCudaKernel(dev_ctx, ins, + &outs, functor); } }; diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index be02bac1aa0ef7462e15f9471a84f79a6007cfb5..37709c953e13b07a9cead3684275a521333fa92a 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -141,11 +141,21 @@ if(WITH_GPU OR WITH_ROCM) target_link_libraries(device_context gpu_info gpu_context pten_gpu_info) target_link_libraries(device_context gpu_resource_pool) endif() - +if (WITH_CUSTOM_DEVICE) + target_link_libraries(device_context custom_context) +endif() if(WITH_ASCEND_CL) target_link_libraries(device_context npu_resource_pool) endif() +if(WITH_MLU) + target_link_libraries(device_context mlu_resource_pool) +endif() + +if(WITH_CUSTOM_DEVICE) + target_link_libraries(device_context custom_context) +endif() + cc_test(init_test SRCS init_test.cc DEPS device_context) # Manage all device event library diff --git a/paddle/fluid/platform/cuda_device_guard.h b/paddle/fluid/platform/cuda_device_guard.h index 40204c0ed83f94da9de11378cf49c652e4f63962..08beed532a7ec1bbc9cd866c90c938493a15f5c1 100644 --- a/paddle/fluid/platform/cuda_device_guard.h +++ b/paddle/fluid/platform/cuda_device_guard.h @@ -14,13 +14,28 @@ #pragma once #include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/place.h" namespace paddle { namespace platform { class CUDADeviceGuard { public: - explicit inline CUDADeviceGuard(int dev_id) { + explicit CUDADeviceGuard(int dev_id) { SetDeviceIndex(dev_id); } + + explicit CUDADeviceGuard(const CUDAPlace& place) + : CUDADeviceGuard(place.device) {} + + // create uninitialized CUDADeviceGuard + CUDADeviceGuard() {} + + ~CUDADeviceGuard() { + if (prev_id_ != -1) { + platform::SetDeviceId(prev_id_); + } + } + + inline void SetDeviceIndex(const int dev_id) { int prev_id = platform::GetCurrentDeviceId(); if (prev_id != dev_id) { prev_id_ = prev_id; @@ -28,10 +43,9 @@ class CUDADeviceGuard { } } - inline ~CUDADeviceGuard() { - if (prev_id_ != -1) { - platform::SetDeviceId(prev_id_); - } + void SetDevice(const CUDAPlace& place) { + int dev_id = place.device; + SetDeviceIndex(dev_id); } CUDADeviceGuard(const CUDADeviceGuard& o) = delete; diff --git a/paddle/fluid/platform/device/gpu/cuda/cuda_device_function.h b/paddle/fluid/platform/device/gpu/cuda/cuda_device_function.h index 367fb3de47c781b47f1b7794e6e873d1f784d697..f17a814175fa0748475099d5cc033d274134357f 100644 --- a/paddle/fluid/platform/device/gpu/cuda/cuda_device_function.h +++ b/paddle/fluid/platform/device/gpu/cuda/cuda_device_function.h @@ -105,6 +105,18 @@ __forceinline__ __device__ float16 CudaShuffleXorSync(unsigned mask, return float16(__shfl_xor_sync(mask, val.to_half(), width)); } +template <> +__forceinline__ __device__ bfloat16 CudaShuffleXorSync(unsigned mask, + bfloat16 val, + int width) { +#if defined(PADDLE_CUDA_BF16) + return bfloat16(__shfl_xor_sync(mask, static_cast(val), width)); +#else + PADDLE_ENFORCE( + false, 
"__shfl_xor_sync with bfloat16 is not supported on cuda <= 11."); +#endif +} + template <> __forceinline__ __device__ paddle::platform::complex CudaShuffleXorSync( unsigned mask, paddle::platform::complex val, int width) { diff --git a/paddle/fluid/platform/device/gpu/gpu_primitives.h b/paddle/fluid/platform/device/gpu/gpu_primitives.h index 3e070da546b2ae85c40bb0e9cae05cc30d6d22c1..8616e969f69dfd469fec0372d40f6365e5038425 100644 --- a/paddle/fluid/platform/device/gpu/gpu_primitives.h +++ b/paddle/fluid/platform/device/gpu/gpu_primitives.h @@ -147,6 +147,94 @@ CUDA_ATOMIC_WRAPPER(Add, float16) { } } #endif + +// The performance of "atomicAdd(half* )" is bad, but for "atomicAdd(half2* )" +// is good. So for fp16 type, we can use "atomicAdd(half2* )" to speed up. +template ::value>::type * = nullptr> +__device__ __forceinline__ void fastAtomicAdd(T *tensor, size_t index, + const size_t numel, T value) { +#if ((CUDA_VERSION < 10000) || \ + (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 700))) + CudaAtomicAdd(reinterpret_cast(tensor) + index, + static_cast(value)); +#else + // whether the address is 32-byte aligned. + __half *target_addr = reinterpret_cast<__half *>(tensor + index); + bool aligned_half2 = + (reinterpret_cast(target_addr) % sizeof(__half2) == 0); + + if (aligned_half2 && index < (numel - 1)) { + __half2 value2; + value2.x = *reinterpret_cast<__half *>(&value); + value2.y = __int2half_rz(0); + atomicAdd(reinterpret_cast<__half2 *>(target_addr), value2); + + } else if (!aligned_half2 && index > 0) { + __half2 value2; + value2.x = __int2half_rz(0); + value2.y = *reinterpret_cast<__half *>(&value); + atomicAdd(reinterpret_cast<__half2 *>(target_addr - 1), value2); + + } else { + atomicAdd(reinterpret_cast<__half *>(tensor) + index, + *reinterpret_cast<__half *>(&value)); + } +#endif +} + +template ::value>::type * = nullptr> +__device__ __forceinline__ void fastAtomicAdd(T *arr, size_t index, + const size_t numel, T value) { + CudaAtomicAdd(arr + index, value); +} + +#ifdef PADDLE_WITH_CUDA +/* + * One thead block deals with elementwise atomicAdd for vector of len. + * @in: [x1, x2, x3, ...] + * @out:[y1+x1, y2+x2, y3+x3, ...] + * */ +template ::value>::type * = nullptr> +__device__ __forceinline__ void VectorizedAtomicAddPerBlock( + const int64_t len, int tid, int threads_per_block, const T *in, T *out) { + for (int i = tid; i < len; i += threads_per_block) { + CudaAtomicAdd(&out[i], in[i]); + } +} + +// Note: assume that len is even. If len is odd, call fastAtomicAdd directly. 
+template ::value>::type * = nullptr> +__device__ __forceinline__ void VectorizedAtomicAddPerBlock( + const int64_t len, int tid, int threads_per_block, const T *in, T *out) { + int i = 0; + int loops = len / 2 * 2; + + bool aligned_half2 = + (reinterpret_cast(out) % sizeof(__half2) == 0); + + if (aligned_half2) { + for (i = tid * 2; i < loops; i += threads_per_block * 2) { + __half2 value2; + T value_1 = in[i]; + T value_2 = in[i + 1]; + value2.x = *reinterpret_cast<__half *>(&value_1); + value2.y = *reinterpret_cast<__half *>(&value_2); + atomicAdd(reinterpret_cast<__half2 *>(&out[i]), value2); + } + for (; i < len; i += threads_per_block) { + fastAtomicAdd(out, i, len, in[i]); + } + } else { + for (int i = tid; i < len; i += threads_per_block) { + fastAtomicAdd(out, i, len, in[i]); + } + } +} +#endif #endif CUDA_ATOMIC_WRAPPER(Add, complex) { diff --git a/paddle/fluid/platform/device/gpu/nccl_helper.h b/paddle/fluid/platform/device/gpu/nccl_helper.h index 1d6ccdc1280a9f9575c048c37700ecc7c8cd6892..1919f59f8c07f2a0a15393fe14f2055f8d0c19bf 100644 --- a/paddle/fluid/platform/device/gpu/nccl_helper.h +++ b/paddle/fluid/platform/device/gpu/nccl_helper.h @@ -56,6 +56,23 @@ inline ncclDataType_t ToNCCLDataType(framework::proto::VarType::Type type) { } } +inline ncclDataType_t ToNCCLDataType(experimental::DataType type) { + if (type == experimental::DataType::FLOAT32) { + return ncclFloat; + } else if (type == experimental::DataType::FLOAT64) { + return ncclDouble; + } else if (type == experimental::DataType::INT32) { + return ncclInt; + } else if (type == experimental::DataType::INT64) { + return ncclInt64; + } else if (type == experimental::DataType::FLOAT16) { + return ncclFloat16; + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "This datatype in nccl is not supported.")); + } +} + // NOTE(minqiyang): according to the ncclGroupEnd documentations: // https://docs.nvidia.com/deeplearning/sdk/nccl-api/ncclapidoc.html, // ncclGroupEnd will wait for all communicators to be initialized, which will diff --git a/paddle/fluid/platform/device/gpu/rocm/rocm_device_function.h b/paddle/fluid/platform/device/gpu/rocm/rocm_device_function.h index 63897bd6717408bff4bd4db5e739b3ba64316350..61bf1905fdb74f084a60688094269b89c2a11c28 100644 --- a/paddle/fluid/platform/device/gpu/rocm/rocm_device_function.h +++ b/paddle/fluid/platform/device/gpu/rocm/rocm_device_function.h @@ -91,6 +91,13 @@ __forceinline__ __device__ float16 CudaShuffleXorSync(unsigned mask, return float16(__shfl_xor(static_cast(val), width)); } +template <> +__forceinline__ __device__ bfloat16 CudaShuffleXorSync(unsigned mask, + bfloat16 val, + int width) { + return bfloat16(__shfl_xor(static_cast(val), width)); +} + template <> __forceinline__ __device__ paddle::platform::complex CudaShuffleXorSync( unsigned mask, paddle::platform::complex val, int width) { diff --git a/paddle/fluid/platform/device/ipu/CMakeLists.txt b/paddle/fluid/platform/device/ipu/CMakeLists.txt index d54c6a33ecbf53071956aaf4b9d342efa5746f65..acf914c5087d0ff11cda2d663a490e84a8c33216 100644 --- a/paddle/fluid/platform/device/ipu/CMakeLists.txt +++ b/paddle/fluid/platform/device/ipu/CMakeLists.txt @@ -13,9 +13,9 @@ IF(WITH_IPU) "ipu_device.cc" ) - cc_library(ipu_backend SRCS ${IPU_BACKEND_SRC} DEPS popart graph graph_helper) - cc_library(ipu_info SRCS ${IPU_INFO_SRC} DEPS popart enforce) - cc_library(paddle_ipu SHARED SRCS ${PADDLE_IPU_SRC} DEPS popart graph_helper) + cc_library(ipu_backend SRCS ${IPU_BACKEND_SRC} DEPS popart-only graph graph_helper) + 
cc_library(ipu_info SRCS ${IPU_INFO_SRC} DEPS popart-only enforce) + add_library(paddle_ipu SHARED ${PADDLE_IPU_SRC}) add_dependencies(paddle_ipu ipu_backend) set(PADDLE_IPU_LIB "${CMAKE_CURRENT_BINARY_DIR}/libpaddle_ipu.so" CACHE STRING "") set(PADDLE_IPU_LIB_DIR "${CMAKE_CURRENT_BINARY_DIR}" CACHE STRING "") diff --git a/paddle/fluid/platform/device/ipu/ipu_backend.cc b/paddle/fluid/platform/device/ipu/ipu_backend.cc index 8f2a7ef78c9824d7706be48f117a86b19c334b8a..e0b3b08a2313d0ba80e807494eb74612caf81fd5 100644 --- a/paddle/fluid/platform/device/ipu/ipu_backend.cc +++ b/paddle/fluid/platform/device/ipu/ipu_backend.cc @@ -43,17 +43,17 @@ void IpuBackend::Compile(Graph* graph, const std::vector& feed_list, const std::vector& fetch_list) { VLOG(10) << "enter IpuBackend::Compile"; - compiler_->Prepare(); - executor_->SetCompilerResources(compiler_->GetResources()); - - compiler_->InitInputs(graph, feed_list); - compiler_->LowerConstants(graph, scope_); - compiler_->LowerWeights(graph, scope_); - compiler_->LowerBody(graph); + compiler_->Prepare(graph); + compiler_->InitInputs(feed_list); + compiler_->LowerConstants(scope_); + compiler_->LowerWeights(scope_); + compiler_->LowerBody(); compiler_->InitOutputs(fetch_list); if (ipu_strategy_->is_training) { - compiler_->LowerOptimier(graph, scope_); + compiler_->LowerOptimizer(scope_); } + executor_->SetCompilerResources(compiler_->GetResources()); + is_compiled_ = true; // when call compile, means a new graph is_prepared_ = false; @@ -95,11 +95,9 @@ void IpuBackend::SetIpuStrategy(const IpuStrategy& strategy) { ipu_strategy_ = &strategy; compiler_->SetIpuStrategy(strategy); executor_->SetIpuStrategy(strategy); -} - -void IpuBackend::SetCustomOps( - const std::vector& custom_ops) { - compiler_->SetCustomOps(custom_ops); + if (!strategy.custom_ops.empty()) { + compiler_->SetCustomOps(strategy.custom_ops); + } } void IpuBackend::SaveModelProto(const std::string& path) { diff --git a/paddle/fluid/platform/device/ipu/ipu_backend.h b/paddle/fluid/platform/device/ipu/ipu_backend.h index b12e2539258dfefe93e0828fa1a7341e21d62e70..1244192490c16c4cfb01ac1c5f195cc123c4ba16 100644 --- a/paddle/fluid/platform/device/ipu/ipu_backend.h +++ b/paddle/fluid/platform/device/ipu/ipu_backend.h @@ -71,7 +71,6 @@ class IpuBackend { const Scope *GetScope() { return scope_; } void SetIpuStrategy(const IpuStrategy &strategy); const IpuStrategy *GetIpuStrategy() { return ipu_strategy_; } - void SetCustomOps(const std::vector &custom_ops); // save compiled model to onnx void SaveModelProto(const std::string &path); diff --git a/paddle/fluid/platform/device/ipu/ipu_compiler.cc b/paddle/fluid/platform/device/ipu/ipu_compiler.cc index df2e456383e1754956810f254cd98651e3139bcf..cdb3f6f9b3e285728d5c372b51492e42027aadba 100644 --- a/paddle/fluid/platform/device/ipu/ipu_compiler.cc +++ b/paddle/fluid/platform/device/ipu/ipu_compiler.cc @@ -98,6 +98,19 @@ TO GetCastSigAttrAllowNull(std::string attr, OpDesc* op_desc) { } } +GraphHelper::GraphHelper(const Graph* g) { + graph = g; + sorted_ops = framework::ir::TopologySortOperations(*g); + for (auto* node : g->Nodes()) { + nodes_id_map[node->id()] = node; + if (node->IsVar()) { + vars_name_map[node->Name()] = node; + sorted_vars_id.push_back(node->id()); + } + } + std::sort(sorted_vars_id.begin(), sorted_vars_id.end()); +} + Compiler::Compiler() { RegisterOpFunc(); } Compiler::~Compiler() { @@ -105,9 +118,10 @@ Compiler::~Compiler() { resources_.reset(); } -void Compiler::Prepare() { +void Compiler::Prepare(const Graph* graph) { 
builder_ = popart::Builder::create(); resources_ = std::make_unique(); + graph_helper_ = std::make_unique(graph); } void Compiler::RegisterOpFunc() { @@ -171,93 +185,24 @@ void Compiler::RegisterOpFunc() { #undef INT_VEC } -void Compiler::LowerBody(const Graph* graph) { - VLOG(10) << "enter Compiler::LowerBody"; - auto nodes = framework::ir::TopologySortOperations(*graph); - for (auto* node : nodes) { - auto* op_desc = node->Op(); - auto op_type = op_desc->Type(); - VLOG(10) << "lowering op: " << op_type; - - if (op_type == "popart_constant") { - // pass - } else if (op_type == "popart_optimizer") { - // pass - } else if (op_type == "popart_checkpointoutput") { - auto inputs = GetOpInputs(op_desc); - auto outputs = GetOpOutputs(op_desc); - auto output_ids = builder_->checkpointOutput(inputs); - InsertTensors(outputs, output_ids); - } else if (op_type == "popart_custom_op") { - auto inputs = GetOpInputs(op_desc); - auto outputs = GetOpOutputs(op_desc); - auto debug_context = BuildDebugContext(op_desc); - auto attributes = std::map{}; - for (auto& attr : op_desc->GetAttrMap()) { - CustomOpAttrVisitor visitor(&attributes, attr.first); - boost::apply_visitor(visitor, attr.second); - } - auto __op_type = - BOOST_GET_CONST(std::string, op_desc->GetAttr("__op_type")); - VLOG(10) << "Build graph from custom op: " << __op_type; - auto it = custom_ops_.find(__op_type); - auto output_ids = - builder_->customOp(it->second.popart_op, it->second.popart_op.version, - inputs, outputs.size(), attributes, debug_context); - SetIpuIndexStage(output_ids, op_desc); - InsertTensors(outputs, output_ids); - } else if (op_type == "popart_printtensor") { - auto inputs = GetOpInputs(op_desc); - auto outputs = GetOpOutputs(op_desc); - auto debug_context = BuildDebugContext(op_desc); - auto print_gradient = - BOOST_GET_CONST(int64_t, op_desc->GetAttr("print_gradient")); - auto title = BOOST_GET_CONST(std::string, op_desc->GetAttr("title")); - auto output_ids = builder_->aiGraphcoreOpset1().printtensor( - inputs, print_gradient, debug_context, title); - SetIpuIndexStage(output_ids, op_desc); - InsertTensors(outputs, output_ids); - } else { - auto itr = name_function_.find(op_type); - if (itr != name_function_.end()) { - itr->second(node->Op()); - } else { - PADDLE_THROW(platform::errors::NotFound( - "%s is not registered, please check for unsupported operators for " - "running on IPU", - op_type)); - } - } - } - VLOG(10) << "leave Compiler::LowerBody"; -} - -void Compiler::InitInputs(Graph* graph, - const std::vector& feed_list) { +void Compiler::InitInputs(const std::vector& feed_list) { for (const auto& feed_name : feed_list) { - feed_list_.push_back(feed_name); - for (const Node* n : graph->Nodes()) { - if (n->IsVar()) { - auto* var_desc = n->Var(); - if (feed_name == var_desc->Name()) { - VLOG(10) << "feed_name= " << var_desc->Name(); - auto data_type = VarType2PopartType(var_desc->GetDataType()); - popart::TensorInfo input_info{data_type, var_desc->GetShape()}; - VLOG(10) << "popart input_info = " << input_info; - popart::TensorId tensor_id = - builder_->addInputTensor(input_info, feed_name); - VLOG(10) << "popart input tensor id = " << tensor_id; - resources_->inputs.push_back(tensor_id); - resources_->tensors.emplace(var_desc->Name(), tensor_id); - } - } - } + auto* node = graph_helper_->vars_name_map[feed_name]; + auto* var_desc = node->Var(); + VLOG(10) << "feed_name= " << var_desc->Name(); + auto data_type = VarType2PopartType(var_desc->GetDataType()); + popart::TensorInfo input_info{data_type, 
var_desc->GetShape()}; + VLOG(10) << "popart input_info = " << input_info; + popart::TensorId tensor_id = + builder_->addInputTensor(input_info, feed_name); + VLOG(10) << "popart input tensor id = " << tensor_id; + resources_->inputs.push_back(tensor_id); + resources_->tensors.emplace(var_desc->Name(), tensor_id); } } void Compiler::InitOutputs(const std::vector& fetch_list) { for (const auto& fetch_name : fetch_list) { - fetch_list_.push_back(fetch_name); auto tensor = resources_->tensors.find(fetch_name); PADDLE_ENFORCE_NE( tensor, resources_->tensors.end(), @@ -271,14 +216,10 @@ void Compiler::InitOutputs(const std::vector& fetch_list) { } } -void Compiler::LowerConstants(const Graph* graph, const Scope* scope) { +void Compiler::LowerConstants(const Scope* scope) { auto& kid_scope = scope->NewScope(); VLOG(10) << "enter Compiler::LowerConstants"; - for (auto* node : graph->Nodes()) { - if (!node->IsOp()) { - continue; - } - + for (auto* node : graph_helper_->sorted_ops) { auto* op_desc = node->Op(); auto op_type = op_desc->Type(); if (op_type == "popart_constant") { @@ -308,17 +249,16 @@ void Compiler::LowerConstants(const Graph* graph, const Scope* scope) { VLOG(10) << "leave Compiler::LowerConstants"; } -void Compiler::LowerWeights(const Graph* graph, const Scope* scope) { +void Compiler::LowerWeights(const Scope* scope) { VLOG(10) << "enter Compiler::LowerWeights"; - PADDLE_ENFORCE_NOT_NULL(scope, - platform::errors::PreconditionNotMet( - "You should call set_scope before LowerWeights")); // at this step, the graph doesn't contains optimizer related states - for (const auto* node : graph->Nodes()) { + for (auto id : graph_helper_->sorted_vars_id) { + auto* node = graph_helper_->nodes_id_map[id]; if (node->IsVar() && !node->IsCtrlVar() && node->Var()) { if (node->Var()->Persistable() && node->inputs.empty()) { auto var_name = node->Var()->Name(); if (resources_->tensors.count(var_name) != 0) { + VLOG(10) << "found existed one, skip lowering Weight: " << var_name; continue; } VLOG(10) << "lowering weight: " << var_name; @@ -344,12 +284,68 @@ void Compiler::LowerWeights(const Graph* graph, const Scope* scope) { VLOG(10) << "leave Compiler::LowerWeights"; } -void Compiler::LowerOptimier(const Graph* graph, const Scope* scope) { - for (auto* node : graph->Nodes()) { - if (!node->IsOp()) { - continue; +void Compiler::LowerBody() { + VLOG(10) << "enter Compiler::LowerBody"; + for (auto* node : graph_helper_->sorted_ops) { + auto* op_desc = node->Op(); + auto op_type = op_desc->Type(); + VLOG(10) << "lowering op: " << op_type; + + if (op_type == "popart_constant") { + // pass + } else if (op_type == "popart_optimizer") { + // pass + } else if (op_type == "popart_checkpointoutput") { + auto inputs = GetOpInputs(op_desc); + auto outputs = GetOpOutputs(op_desc); + auto output_ids = builder_->checkpointOutput(inputs); + InsertTensors(outputs, output_ids); + } else if (op_type == "popart_custom_op") { + auto inputs = GetOpInputs(op_desc); + auto outputs = GetOpOutputs(op_desc); + auto debug_context = BuildDebugContext(op_desc); + auto attributes = std::map{}; + for (auto& attr : op_desc->GetAttrMap()) { + CustomOpAttrVisitor visitor(&attributes, attr.first); + boost::apply_visitor(visitor, attr.second); + } + auto __op_type = + BOOST_GET_CONST(std::string, op_desc->GetAttr("__op_type")); + VLOG(10) << "Build graph from custom op: " << __op_type; + auto it = custom_ops_.find(__op_type); + auto output_ids = + builder_->customOp(it->second.popart_op, it->second.popart_op.version, + inputs, 
outputs.size(), attributes, debug_context); + SetIpuIndexStage(output_ids, op_desc); + InsertTensors(outputs, output_ids); + } else if (op_type == "popart_printtensor") { + auto inputs = GetOpInputs(op_desc); + auto outputs = GetOpOutputs(op_desc); + auto debug_context = BuildDebugContext(op_desc); + auto print_gradient = + BOOST_GET_CONST(int64_t, op_desc->GetAttr("print_gradient")); + auto title = BOOST_GET_CONST(std::string, op_desc->GetAttr("title")); + auto output_ids = builder_->aiGraphcoreOpset1().printtensor( + inputs, print_gradient, debug_context, title); + SetIpuIndexStage(output_ids, op_desc); + InsertTensors(outputs, output_ids); + } else { + auto itr = name_function_.find(op_type); + if (itr != name_function_.end()) { + itr->second(node->Op()); + } else { + PADDLE_THROW(platform::errors::NotFound( + "%s is not registered, please check for unsupported operators for " + "running on IPU", + op_type)); + } } + } + VLOG(10) << "leave Compiler::LowerBody"; +} +void Compiler::LowerOptimizer(const Scope* scope) { + for (auto* node : graph_helper_->sorted_ops) { auto* op_desc = node->Op(); auto op_type = op_desc->Type(); if (op_type == "popart_optimizer") { diff --git a/paddle/fluid/platform/device/ipu/ipu_compiler.h b/paddle/fluid/platform/device/ipu/ipu_compiler.h index 5576266b1a771682ef949c9825309b64c08c0531..5d1e8c2727d8f9ca36c9380584505dbfcabfb064 100644 --- a/paddle/fluid/platform/device/ipu/ipu_compiler.h +++ b/paddle/fluid/platform/device/ipu/ipu_compiler.h @@ -68,34 +68,29 @@ struct CompilerResources { std::unique_ptr optimizer; }; +// helper for lowering graph +struct GraphHelper { + explicit GraphHelper(const Graph *); + + const Graph *graph; + std::map vars_name_map; + std::map nodes_id_map; + std::vector sorted_ops; + std::vector sorted_vars_id; +}; + class Compiler { public: Compiler(); ~Compiler(); - void RegisterOpFunc(); - void Prepare(); - void LowerBody(const Graph *graph); - void InitInputs(Graph *graph, const std::vector &feed_list); + void Prepare(const Graph *graph); + void InitInputs(const std::vector &feed_list); void InitOutputs(const std::vector &fetch_list); - void LowerConstants(const Graph *graph, const Scope *scope); - void LowerWeights(const Graph *graph, const Scope *scope); - void LowerOptimier(const Graph *graph, const Scope *scope); - - void InsertTensors(const std::vector &output_names, - const std::vector &tensor_ids); - void InsertTensors(const std::vector &output_names, - const std::string &tensor_id); - void SetIpuIndexStage(const std::vector &tensor_ids, - const OpDesc *op_desc); - void SetIpuIndexStage(const std::string &tensor_id, const OpDesc *op_desc); - void SetAMPAttributes(const std::vector &tensor_ids, - const OpDesc *op_desc); - void SetAMPAttributes(const std::string &tensor_id, const OpDesc *op_desc); - void SetSerializeAttributes(const std::vector &tensor_ids, - const OpDesc *op_desc); - void SetSerializeAttributes(const std::string &tensor_id, - const OpDesc *op_desc); + void LowerConstants(const Scope *scope); + void LowerWeights(const Scope *scope); + void LowerBody(); + void LowerOptimizer(const Scope *scope); void SetIpuStrategy(const IpuStrategy &strategy) { ipu_strategy_ = &strategy; @@ -112,21 +107,34 @@ class Compiler { void SaveModelProtoNoCheck(const std::string &path); private: + void RegisterOpFunc(); std::vector GetOpInputs(const OpDesc *op); const std::vector &GetOpOutputs(const OpDesc *op); popart::DebugContext BuildDebugContext(const OpDesc *op); + void InsertTensors(const std::vector &output_names, + const 
std::vector &tensor_ids); + void InsertTensors(const std::vector &output_names, + const std::string &tensor_id); + void SetIpuIndexStage(const std::vector &tensor_ids, + const OpDesc *op_desc); + void SetIpuIndexStage(const std::string &tensor_id, const OpDesc *op_desc); + void SetAMPAttributes(const std::vector &tensor_ids, + const OpDesc *op_desc); + void SetAMPAttributes(const std::string &tensor_id, const OpDesc *op_desc); + void SetSerializeAttributes(const std::vector &tensor_ids, + const OpDesc *op_desc); + void SetSerializeAttributes(const std::string &tensor_id, + const OpDesc *op_desc); + private: std::unique_ptr builder_; std::unique_ptr resources_; + std::unique_ptr graph_helper_; using OpFunc = std::function; std::unordered_map name_function_; - // feed_list_ & fetch_list save paddle tensor id - std::vector feed_list_; - std::vector fetch_list_; - const IpuStrategy *ipu_strategy_ = nullptr; std::map custom_ops_; }; diff --git a/paddle/fluid/platform/device/ipu/ipu_strategy.cc b/paddle/fluid/platform/device/ipu/ipu_strategy.cc index 4a9b9c00cb75cd042bab527532de3314075e6dcd..943dfcc6cffb875fc3cebfc88e35adeaba47fd63 100644 --- a/paddle/fluid/platform/device/ipu/ipu_strategy.cc +++ b/paddle/fluid/platform/device/ipu/ipu_strategy.cc @@ -241,6 +241,15 @@ IpuStrategy::IpuStrategy() { #undef ADD_POPART_BOOL_OPTION_ALIAS #undef ADD_POPART_ENUM_OPTION_ALIAS + RegisterGetter(vector_options_getter, options_type, "custom_ops", "vector", + [&]() { + std::vector res; + for (auto x : custom_ops) { + res.push_back(x.repr()); + } + return res; + }); + RegisterSetter(bool_options, "enable_manual_shard", [&](bool value) { if (value) { popart_options.virtualGraphMode = popart::VirtualGraphMode::Manual; @@ -429,6 +438,14 @@ void IpuStrategy::SetTensorLocation(const std::string& tensor, } } +void IpuStrategy::AddCustomOp(const std::string& paddle_op, + const std::string& popart_op, + const std::string& domain, int version) { + LOG(INFO) << "IpuStrategy add custom op: " << paddle_op; + custom_ops.push_back( + IpuCustomOpIdentifier(paddle_op, popart_op, domain, version)); +} + std::string IpuStrategy::GetOption(const std::string& option) { return get(option, options_getter); } diff --git a/paddle/fluid/platform/device/ipu/ipu_strategy.h b/paddle/fluid/platform/device/ipu/ipu_strategy.h index 0e2af26454c401960773de20744f285aecec6bed..64436dc14fec3393b0a2a4473ad436d7d08f5217 100644 --- a/paddle/fluid/platform/device/ipu/ipu_strategy.h +++ b/paddle/fluid/platform/device/ipu/ipu_strategy.h @@ -17,6 +17,7 @@ limitations under the License. 
*/ #include #include #include +#include "paddle/fluid/platform/device/ipu/ipu_utils.h" #include "paddle/fluid/platform/enforce.h" namespace paddle { @@ -71,6 +72,9 @@ struct IpuStrategy { // popart pattern manager popart::Patterns popart_patterns; + // custom ops + std::vector custom_ops; + private: std::map> bool_options; std::map> uint64_options; @@ -123,6 +127,8 @@ struct IpuStrategy { const std::string &value); void SetTensorLocation(const std::string &tensor, const std::string &option, std::uint64_t value); + void AddCustomOp(const std::string &paddle_op, const std::string &popart_op, + const std::string &domain, int version); std::string GetOption(const std::string &); std::vector GetVectorOption(const std::string &); diff --git a/paddle/fluid/platform/device/mlu/CMakeLists.txt b/paddle/fluid/platform/device/mlu/CMakeLists.txt index 724776bfad2339a1cc58cbca30768311ce0cbd3f..1f3a7670849c2c8a0e8eb87bcd5ef63709fe6ec4 100644 --- a/paddle/fluid/platform/device/mlu/CMakeLists.txt +++ b/paddle/fluid/platform/device/mlu/CMakeLists.txt @@ -9,3 +9,4 @@ cc_library(mlu_stream SRCS mlu_stream.cc DEPS boost mlu_info stream_callback_man cc_library(mlu_device_context SRCS device_context.cc DEPS mlu_stream) cc_test(mlu_device_context_test SRCS device_context_test.cc DEPS mlu_device_context) cc_library(mlu_collective_helper SRCS mlu_collective_helper.cc DEPS mlu_stream mlu_info) +cc_library(mlu_resource_pool SRCS mlu_resource_pool.cc DEPS mlu_info) diff --git a/paddle/fluid/platform/device/mlu/mlu_resource_pool.cc b/paddle/fluid/platform/device/mlu/mlu_resource_pool.cc new file mode 100644 index 0000000000000000000000000000000000000000..fbe3eca1c4d23fc07bec30b6b7ed22c731944ad2 --- /dev/null +++ b/paddle/fluid/platform/device/mlu/mlu_resource_pool.cc @@ -0,0 +1,99 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
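The mlu_resource_pool files added here mirror the existing CUDA stream/event pools: each device index gets a creator/deleter pair bound to that device, and callers borrow handles through the singleton's New(dev_idx). A hypothetical call site, assuming a build with PADDLE_WITH_MLU, could look like the sketch below; the actual pool implementation follows in the new files.

#include "paddle/fluid/platform/device/mlu/mlu_resource_pool.h"

namespace plat = paddle::platform;

void UseBorrowedMluHandles(int dev_idx) {
  // Borrows a queue created on `dev_idx`; the per-device deleter shown in the
  // new .cc file (cnrtQueueDestroy) handles cleanup once the handle is no
  // longer referenced.
  auto stream = plat::MluStreamResourcePool::Instance().New(dev_idx);

  // ... enqueue work on *stream ...

  // Events are borrowed from the sibling pool in the same way.
  auto event = plat::MluEventResourcePool::Instance().New(dev_idx);
  (void)stream;
  (void)event;
}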
+ +#if defined(PADDLE_WITH_MLU) +#include "paddle/fluid/platform/device/mlu/mlu_resource_pool.h" + +namespace paddle { +namespace platform { + +MluStreamResourcePool::MluStreamResourcePool() { + int dev_cnt = platform::GetMLUDeviceCount(); + pool_.reserve(dev_cnt); + for (int dev_idx = 0; dev_idx < dev_cnt; ++dev_idx) { + auto creator = [dev_idx] { + platform::SetMLUDeviceId(dev_idx); + mluStream stream; + cnrtQueueCreate(&stream); + return stream; + }; + + auto deleter = [dev_idx](mluStream stream) { + platform::SetMLUDeviceId(dev_idx); + cnrtQueueDestroy(stream); + }; + + pool_.emplace_back(ResourcePool::Create(creator, deleter)); + } +} + +MluStreamResourcePool& MluStreamResourcePool::Instance() { + static MluStreamResourcePool pool; + return pool; +} + +std::shared_ptr MluStreamResourcePool::New(int dev_idx) { + PADDLE_ENFORCE_GE( + dev_idx, 0, + platform::errors::InvalidArgument( + "The dev_idx should be not less than 0, but got %d.", dev_idx)); + PADDLE_ENFORCE_LT( + dev_idx, pool_.size(), + platform::errors::OutOfRange( + "The dev_idx should be less than device count %d, but got %d.", + pool_.size(), dev_idx)); + return pool_[dev_idx]->New(); +} + +MluEventResourcePool::MluEventResourcePool() { + int dev_cnt = platform::GetMLUDeviceCount(); + pool_.reserve(dev_cnt); + for (int dev_idx = 0; dev_idx < dev_cnt; ++dev_idx) { + auto creator = [dev_idx] { + platform::SetMLUDeviceId(dev_idx); + mluEventHandle event; + cnrtNotifierCreate(&event); + return event; + }; + + auto deleter = [dev_idx](mluEventHandle event) { + platform::SetMLUDeviceId(dev_idx); + cnrtNotifierDestroy(event); + }; + + pool_.emplace_back(ResourcePool::Create(creator, deleter)); + } +} + +MluEventResourcePool& MluEventResourcePool::Instance() { + static MluEventResourcePool pool; + return pool; +} + +std::shared_ptr MluEventResourcePool::New(int dev_idx) { + PADDLE_ENFORCE_GE( + dev_idx, 0, + platform::errors::InvalidArgument( + "The dev_idx should be not less than 0, but got %d.", dev_idx)); + PADDLE_ENFORCE_LT( + dev_idx, pool_.size(), + platform::errors::OutOfRange( + "The dev_idx should be less than device count %d, but got %d.", + pool_.size(), dev_idx)); + return pool_[dev_idx]->New(); +} + +} // namespace platform +} // namespace paddle +#endif diff --git a/paddle/fluid/platform/device/mlu/mlu_resource_pool.h b/paddle/fluid/platform/device/mlu/mlu_resource_pool.h new file mode 100644 index 0000000000000000000000000000000000000000..b0e2af7f024cb88a06f7e7bfa13c61d1a825a2a6 --- /dev/null +++ b/paddle/fluid/platform/device/mlu/mlu_resource_pool.h @@ -0,0 +1,64 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#if defined(PADDLE_WITH_MLU) +#include +#include +#include + +#include "paddle/fluid/platform/device/mlu/mlu_info.h" +#include "paddle/fluid/platform/resource_pool.h" + +namespace paddle { +namespace platform { + +using MluStreamObject = std::remove_pointer::type; +using MluEventObject = std::remove_pointer::type; + +class MluStreamResourcePool { + public: + std::shared_ptr New(int dev_idx); + + static MluStreamResourcePool &Instance(); + + private: + MluStreamResourcePool(); + + DISABLE_COPY_AND_ASSIGN(MluStreamResourcePool); + + private: + std::vector>> pool_; +}; + +class MluEventResourcePool { + public: + std::shared_ptr New(int dev_idx); + + static MluEventResourcePool &Instance(); + + private: + MluEventResourcePool(); + + DISABLE_COPY_AND_ASSIGN(MluEventResourcePool); + + private: + std::vector>> pool_; +}; + +} // namespace platform +} // namespace paddle + +#endif diff --git a/paddle/fluid/platform/device/xpu/xpu2_op_list.h b/paddle/fluid/platform/device/xpu/xpu2_op_list.h index e27d56642efde7c9b5b11901e57a938050672bf3..e6b08ed7bc340b5150078fe0deb6a3187fb8e17b 100644 --- a/paddle/fluid/platform/device/xpu/xpu2_op_list.h +++ b/paddle/fluid/platform/device/xpu/xpu2_op_list.h @@ -196,6 +196,7 @@ XPUOpMap& get_kl2_ops() { {"hard_swish_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace())})}, + {"hard_swish", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"huber_loss_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"huber_loss", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index 6452f6f7984e376ab686c7a417d2431af1045410..6a7956628f80464740e3cd812b0b663cc36d6fc6 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -32,6 +32,7 @@ limitations under the License. 
*/ #include "paddle/fluid/memory/allocation/allocator_facade.h" #include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" namespace paddle { namespace memory { @@ -171,6 +172,7 @@ inline void EmplaceDeviceContext( .get()); dev_ctx->SetGenerator(framework::DefaultCPUGenerator().get()); } + dev_ctx->SetHostGenerator(framework::DefaultCPUGenerator().get()); dev_ctx->SetHostAllocator( memory::allocation::AllocatorFacade::Instance() .GetAllocator(platform::CPUPlace()) @@ -322,7 +324,8 @@ NPUDeviceContext::~NPUDeviceContext() { } void NPUDeviceContext::Wait() const { - platform::RecordEvent record_event("NPUDeviceContext/wait"); + platform::RecordEvent record_event("NPUDeviceContext/wait", + platform::TracerEventType::UserDefined, 2); VLOG(4) << "NPU context(" << this << ") Wait"; stream_->Wait(); } @@ -897,21 +900,13 @@ MKLDNNDeviceContext::BlobPtr_t MKLDNNDeviceContext::GetBlob( #endif #ifdef PADDLE_WITH_CUSTOM_DEVICE -CustomDeviceContext::CustomDeviceContext(CustomPlace place) : place_(place) { - DeviceGuard guard(place_); - stream_.reset(new stream::Stream()); - stream_->Init(place_); +CustomDeviceContext::CustomDeviceContext(CustomPlace place) + : phi::CustomContext(place) { + Init(); + stream_.reset(new platform::stream::Stream(place, stream())); } CustomDeviceContext::~CustomDeviceContext() {} - -const Place& CustomDeviceContext::GetPlace() const { return place_; } - -void CustomDeviceContext::Wait() const { - // platform::RecordEvent record_event("NPUDeviceContext/wait"); - VLOG(4) << "CustomDevice context(" << this << ") Wait"; - stream_->Wait(); -} #endif } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index 0101286f0dfa87f3bc3b9ff0aae1e6f7342bace7..e9124dfc1f8a7ad3a88c843c1a1573ba3503d80b 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -21,6 +21,7 @@ limitations under the License. */ #include "paddle/fluid/platform/device/gpu/gpu_types.h" #include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/backends/custom/custom_context.h" #include "paddle/phi/backends/gpu/gpu_decls.h" #include "paddle/phi/core/device_context.h" @@ -73,7 +74,10 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/device/device_ext.h" #include "paddle/fluid/platform/device/stream.h" + +#if !defined(PADDLE_WITH_XPU_KP) || defined(__xpu_on_host__) #include "unsupported/Eigen/CXX11/Tensor" +#endif namespace Eigen { struct DefaultDevice; @@ -819,17 +823,12 @@ class MKLDNNDeviceContext : public CPUDeviceContext { #endif #ifdef PADDLE_WITH_CUSTOM_DEVICE -class CustomDeviceContext : public DeviceContext { +class CustomDeviceContext : public phi::CustomContext { public: explicit CustomDeviceContext(CustomPlace place); virtual ~CustomDeviceContext(); - const Place& GetPlace() const override; - void Wait() const override; Eigen::DefaultDevice* eigen_device() const { return nullptr; } - C_Stream stream() const { - return reinterpret_cast(stream_->raw_stream()); - } template void AddStreamCallback(Callback&& callback) const { @@ -839,13 +838,7 @@ class CustomDeviceContext : public DeviceContext { void WaitStreamCallback() const { return stream_->WaitCallback(); } private: - std::string device_type_; - - CustomPlace place_; - std::shared_ptr stream_; - - CustomDeviceContext(); }; template <> struct DefaultDeviceContextType { diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc index 39f95a9295661b2b3432d7ca062b2bdb1fe5c40a..baf043e860be4fd6b0f3b82a43bc5594a083e6eb 100644 --- a/paddle/fluid/platform/flags.cc +++ b/paddle/fluid/platform/flags.cc @@ -761,3 +761,15 @@ DEFINE_bool(enable_slotrecord_reset_shrink, false, "enable slotrecord obejct reset shrink memory, default false"); DEFINE_bool(enable_ins_parser_file, false, "enable parser ins file , default false"); + +/** + * ProcessGroupNCCL related FLAG + * Name: nccl_blocking_wait + * Since Version: + * Value Range: bool, default=false + * Example: + * Note: nccl blocking wait. + */ +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PADDLE_DEFINE_EXPORTED_bool(nccl_blocking_wait, false, "nccl blocking wait"); +#endif diff --git a/paddle/fluid/platform/profiler/cupti_data_process.cc b/paddle/fluid/platform/profiler/cupti_data_process.cc index 4d3b807aba82ea91770dddfcf655ec2431cdb197..da12dccb74924fd27dee3047d29636341f7c47a2 100644 --- a/paddle/fluid/platform/profiler/cupti_data_process.cc +++ b/paddle/fluid/platform/profiler/cupti_data_process.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/platform/profiler/cupti_data_process.h" #include +#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/os_info.h" namespace paddle { @@ -26,7 +27,7 @@ void AddKernelRecord(const CUpti_ActivityKernel4* kernel, uint64_t start_ns, return; } DeviceTraceEvent event; - event.name = kernel->name; + event.name = demangle(kernel->name); event.type = TracerEventType::Kernel; event.start_ns = kernel->start; event.end_ns = kernel->end; diff --git a/paddle/fluid/platform/profiler/event_python.h b/paddle/fluid/platform/profiler/event_python.h old mode 100755 new mode 100644 diff --git a/paddle/fluid/platform/profiler/event_tracing.h b/paddle/fluid/platform/profiler/event_tracing.h index 54c5b219310a9c64214e721f2f6b310e20c5d733..fcaba9a43ca9385ab38e440f7b8659298a02ef05 100644 --- a/paddle/fluid/platform/profiler/event_tracing.h +++ b/paddle/fluid/platform/profiler/event_tracing.h @@ -21,26 +21,55 @@ limitations under the License. */ namespace paddle { namespace platform { +// Default tracing level. +// It is Recommended to set the level explicitly. static constexpr uint32_t kDefaultTraceLevel = 4; -// CPU event tracing. A trace marks something that happens but has no duration + +// Host event tracing. 
A trace marks something that happens but has no duration // associated with it. For example, thread starts working. // Chrome Trace Viewer Format: Instant Event struct RecordInstantEvent { + /** + * @param name: It is the caller's reponsibility to manage the underlying + * storage. RecordInstantEvent stores the pointer. + * @param type: Classification which is used to instruct the profiling + * data statistics. + * @param level: Used to filter events, works like glog VLOG(level). + * RecordEvent will works if HostTraceLevel >= level. + */ explicit RecordInstantEvent(const char* name, TracerEventType type, uint32_t level = kDefaultTraceLevel); }; -// CPU event tracing. A trace starts when an object of this clas is created and +// Host event tracing. A trace starts when an object of this clas is created and // stops when the object is destroyed. // Chrome Trace Viewer Format: Duration Event/Complte Event class RecordEvent { public: + /** + * @param name: If your string argument has a longer lifetime (e.g.: string + * literal, static variables, etc) than the event, use 'const char* name'. + * Do your best to avoid using 'std::string' as the argument type. It will + * cause deep-copy to harm performance. + * @param type: Classification which is used to instruct the profiling + * data statistics. + * @param level: Used to filter events, works like glog VLOG(level). + * RecordEvent will works if HostTraceLevel >= level. + */ explicit RecordEvent( const std::string& name, const TracerEventType type = TracerEventType::UserDefined, uint32_t level = kDefaultTraceLevel, const EventRole role = EventRole::kOrdinary); + /** + * @param name: It is the caller's reponsibility to manage the underlying + * storage. RecordEvent stores the pointer. + * @param type: Classification which is used to instruct the profiling + * data statistics. + * @param level: Used to filter events, works like glog VLOG(level). + * RecordEvent will works if HostTraceLevel >= level. 
+ */ explicit RecordEvent(const char* name, const TracerEventType type = TracerEventType::UserDefined, uint32_t level = kDefaultTraceLevel, diff --git a/paddle/fluid/platform/profiler/host_event_recorder.h b/paddle/fluid/platform/profiler/host_event_recorder.h index 3bcd68c55963082bfc0ce12bbcdc0b07a05bbe97..49f9362527591744dd0685375e0244673a7b3081 100644 --- a/paddle/fluid/platform/profiler/host_event_recorder.h +++ b/paddle/fluid/platform/profiler/host_event_recorder.h @@ -202,7 +202,7 @@ class ThreadEventRecorder { ThreadEventSection GatherEvents() { ThreadEventSection thr_sec; - thr_sec.thread_name = thread_name_; + thr_sec.thread_name = GetCurrentThreadName(); thr_sec.thread_id = thread_id_; thr_sec.events = std::move(base_evt_cntr_.Reduce()); return thr_sec; @@ -210,7 +210,6 @@ class ThreadEventRecorder { private: uint64_t thread_id_; - std::string thread_name_; EventContainer base_evt_cntr_; }; diff --git a/paddle/fluid/platform/profiler/output_logger.h b/paddle/fluid/platform/profiler/output_logger.h old mode 100755 new mode 100644 diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index 3453cff30f5ad2d1016dcd786733a7024ed0ae4a..e76183192bcee517279afe7ba5832af3b2e3d84b 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -2,7 +2,7 @@ set(PYBIND_DEPS init pybind python proto_desc memory executor fleet_wrapper box_ feed_fetch_method pass generate_pass pass_builder parallel_executor profiler layer tracer engine scope_pool analysis_predictor imperative_profiler imperative_flag save_load_util dlpack_tensor device_context gloo_wrapper infer_io_utils heter_wrapper generator op_version_registry ps_gpu_wrapper custom_operator - cost_model cuda_graph_with_memory_pool fleet_executor global_utils pten_utils) + cost_model cuda_graph_with_memory_pool fleet_executor global_utils pten_utils tcp_store) if (WITH_PSCORE) set(PYBIND_DEPS ${PYBIND_DEPS} ps_service) @@ -37,6 +37,10 @@ if (WITH_ASCEND_CL) set(PYBIND_DEPS ${PYBIND_DEPS} heter_ccl_context) endif() +if (WITH_CNCL) + set(PYBIND_DEPS ${PYBIND_DEPS} cncl_context) +endif() + if(NOT WIN32) set(PYBIND_DEPS ${PYBIND_DEPS} data_loader) set(PYBIND_DEPS ${PYBIND_DEPS} mmap_allocator) @@ -73,8 +77,17 @@ set(PYBIND_SRCS compatible.cc io.cc generator_py.cc + communication.cc cuda_streams_py.cc) +if(NOT ON_INFER) + set (PYBIND_DEPS ${PYBIND_DEPS} processgroup) + if (WITH_NCCL) + set (PYBIND_DEPS ${PYBIND_DEPS} processgroup_nccl) + endif() + set(PYBIND_SRCS ${PYBIND_SRCS} distributed_py.cc) +endif() + if(WITH_ASCEND) set(PYBIND_DEPS ${PYBIND_DEPS} ascend_wrapper) set(PYBIND_SRCS ${PYBIND_SRCS} ascend_wrapper_py.cc) @@ -133,6 +146,10 @@ if(WITH_PYTHON) list(APPEND OP_FUNCTION_GENERETOR_DEPS hccl_context) endif(WITH_ASCEND_CL) + if(WITH_CNCL) + list(APPEND OP_FUNCTION_GENERETOR_DEPS cncl_context) + endif(WITH_CNCL) + add_executable(op_function_generator op_function_generator.cc) target_link_libraries(op_function_generator ${OP_FUNCTION_GENERETOR_DEPS}) add_executable(eager_op_function_generator eager_op_function_generator.cc) diff --git a/paddle/fluid/pybind/bind_fleet_executor.cc b/paddle/fluid/pybind/bind_fleet_executor.cc index 7bb7f03983eb9e8c88f46174a40664f1110682d1..b29cc10e8f56f5698874db8b357621aa4a88b238 100644 --- a/paddle/fluid/pybind/bind_fleet_executor.cc +++ b/paddle/fluid/pybind/bind_fleet_executor.cc @@ -24,10 +24,41 @@ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" +#include 
"paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/place.h" +#include "pybind11/pybind11.h" namespace py = pybind11; +namespace pybind11 { +namespace detail { + +// Note: use same enum number of float16 in numpy. +// import numpy as np +// print np.dtype(np.float16).num # 23 +constexpr int NPY_FLOAT16_ = 23; + +// Note: Since float16 is not a builtin type in C++, we register +// paddle::platform::float16 as numpy.float16. +// Ref: https://github.com/pybind/pybind11/issues/1776 +template <> +struct npy_format_descriptor { + static py::dtype dtype() { + handle ptr = npy_api::get().PyArray_DescrFromType_(NPY_FLOAT16_); + return reinterpret_borrow(ptr); + } + static std::string format() { + // Note: "e" represents float16. + // Details at: + // https://docs.python.org/3/library/struct.html#format-characters. + return "e"; + } + static constexpr auto name = _("float16"); +}; + +} // namespace detail +} // namespace pybind11 + namespace paddle { namespace pybind { @@ -175,6 +206,7 @@ void BindFleetExecutor(py::module* m) { .def(py::init(&DistModelDataBufCreate)) .def(py::init(&DistModelDataBufCreate)) .def(py::init(&DistModelDataBufCreate)) + .def(py::init(&DistModelDataBufCreate)) .def("reset", [](DistModelDataBuf& self, std::vector& data) { self.Resize(data.size() * sizeof(float)); @@ -183,29 +215,35 @@ void BindFleetExecutor(py::module* m) { .def("reset", &DistModelDataBufReset) .def("reset", &DistModelDataBufReset) .def("reset", &DistModelDataBufReset) + .def("reset", &DistModelDataBufReset) .def("length", &DistModelDataBuf::length) - .def("tolist", - [](DistModelDataBuf& self, const std::string& dtype) -> py::list { - py::list l; - if (dtype == "int32") { - auto* data = static_cast(self.data()); - auto size = self.length() / sizeof(int32_t); - l = py::cast(std::vector(data, data + size)); - } else if (dtype == "int64") { - auto* data = static_cast(self.data()); - auto size = self.length() / sizeof(int64_t); - l = py::cast(std::vector(data, data + size)); - } else if (dtype == "float32") { - auto* data = static_cast(self.data()); - auto size = self.length() / sizeof(float); - l = py::cast(std::vector(data, data + size)); - } else { - PADDLE_THROW(platform::errors::Unimplemented( - "Unsupported data type. Now only supports INT32, INT64 and " - "FLOAT32.")); - } - return l; - }); + .def("tolist", [](DistModelDataBuf& self, + const std::string& dtype) -> py::list { + py::list l; + if (dtype == "int32") { + auto* data = static_cast(self.data()); + auto size = self.length() / sizeof(int32_t); + l = py::cast(std::vector(data, data + size)); + } else if (dtype == "int64") { + auto* data = static_cast(self.data()); + auto size = self.length() / sizeof(int64_t); + l = py::cast(std::vector(data, data + size)); + } else if (dtype == "float32") { + auto* data = static_cast(self.data()); + auto size = self.length() / sizeof(float); + l = py::cast(std::vector(data, data + size)); + } else if (dtype == "float16") { + auto* data = static_cast(self.data()); + auto size = self.length() / sizeof(paddle::platform::float16); + l = py::cast( + std::vector(data, data + size)); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported data type. 
Now only supports INT32, INT64, " + "FLOAT16 and FLOAT32.")); + } + return l; + }); py::class_(*m, "DistModelTensor") .def(py::init<>()) @@ -221,6 +259,10 @@ void BindFleetExecutor(py::module* m) { py::arg("name") = "", py::arg("lod") = std::vector>(), py::arg("copy") = true) + .def(py::init(&DistModelTensorCreate), + py::arg("data"), py::arg("name") = "", + py::arg("lod") = std::vector>(), + py::arg("copy") = true) .def_readwrite("name", &DistModelTensor::name) .def_readwrite("shape", &DistModelTensor::shape) .def_readwrite("data", &DistModelTensor::data) @@ -231,7 +273,8 @@ void BindFleetExecutor(py::module* m) { py::enum_(*m, "DistModelDataType") .value("FLOAT32", DistModelDataType::FLOAT32) .value("INT64", DistModelDataType::INT64) - .value("INT32", DistModelDataType::INT32); + .value("INT32", DistModelDataType::INT32) + .value("FLOAT16", DistModelDataType::FLOAT16); } } // namespace pybind } // namespace paddle diff --git a/paddle/fluid/pybind/communication.cc b/paddle/fluid/pybind/communication.cc new file mode 100644 index 0000000000000000000000000000000000000000..a0d2777f825dc592e19230bc2ba4412f943d0c2b --- /dev/null +++ b/paddle/fluid/pybind/communication.cc @@ -0,0 +1,42 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include +#include +#include +#include + +#include "paddle/fluid/distributed/store/tcp_store.h" +#include "paddle/fluid/pybind/communication.h" + +namespace py = pybind11; + +namespace paddle { +namespace pybind { + +using TCPStore = paddle::distributed::TCPStore; + +void BindTCPStore(py::module* m) { + py::class_(*m, "TCPStore") + .def( + py::init()) + .def("add", &TCPStore::add) + .def("get", &TCPStore::get); +} + +} // namespace pybind +} // namespace paddle diff --git a/paddle/fluid/operators/randperm_op.cu b/paddle/fluid/pybind/communication.h similarity index 54% rename from paddle/fluid/operators/randperm_op.cu rename to paddle/fluid/pybind/communication.h index 7ed52a8fd25b104f50446082ff3a040e90bf44ea..17045ccfe65cae25471ceff3abf0129b2a21acb0 100644 --- a/paddle/fluid/operators/randperm_op.cu +++ b/paddle/fluid/pybind/communication.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -12,13 +12,20 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/operators/randperm_op.h" +#pragma once -template -using kernel = - paddle::operators::RandpermKernel; +#include -REGISTER_OP_CUDA_KERNEL(randperm, kernel, kernel, kernel, - kernel); +#include "pybind11/chrono.h" +#include "pybind11/complex.h" +#include "pybind11/functional.h" +#include "pybind11/pybind11.h" +#include "pybind11/stl.h" + +namespace paddle { +namespace pybind { + +void BindTCPStore(pybind11::module* m); + +} // namespace pybind +} // namespace paddle diff --git a/paddle/fluid/pybind/distributed_py.cc b/paddle/fluid/pybind/distributed_py.cc new file mode 100644 index 0000000000000000000000000000000000000000..e057fb53ccecc7193fd52b8beda2c4f2880560e8 --- /dev/null +++ b/paddle/fluid/pybind/distributed_py.cc @@ -0,0 +1,149 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#ifdef _POSIX_C_SOURCE +#undef _POSIX_C_SOURCE +#endif + +#ifdef _XOPEN_SOURCE +#undef _XOPEN_SOURCE +#endif + +#include "paddle/fluid/distributed/collective/ProcessGroup.h" +#include "paddle/fluid/distributed/collective/Types.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/imperative/layer.h" +#include "paddle/fluid/pybind/distributed_py.h" +#include "paddle/fluid/pybind/eager_utils.h" +#include "paddle/phi/api/all.h" + +#if defined(PADDLE_WITH_NCCL) +#include "paddle/fluid/distributed/collective/ProcessGroupNCCL.h" +#endif + +namespace py = pybind11; + +namespace paddle { +namespace pybind { + +using Tensor = paddle::experimental::Tensor; + +void BindDistributed(py::module *m) { + py::enum_(*m, "ReduceOp") + .value("SUM", distributed::ReduceOp::SUM) + .value("AVG", distributed::ReduceOp::AVG) + .value("MAX", distributed::ReduceOp::MAX) + .value("MIN", distributed::ReduceOp::MIN) + .value("PRODUCT", distributed::ReduceOp::PRODUCT); + + py::class_(*m, "AllreduceOptions") + .def(py::init<>()) + .def_readwrite("reduce_op", &distributed::AllreduceOptions::reduce_op); + + py::class_(*m, "BroadcastOptions") + .def(py::init<>()) + .def_readwrite("source_rank", &distributed::BroadcastOptions::source_rank) + .def_readwrite("source_root", + &distributed::BroadcastOptions::source_root); + + auto ProcessGroup = + py::class_>(*m, "ProcessGroup") + .def("rank", &distributed::ProcessGroup::GetRank) + .def("size", &distributed::ProcessGroup::GetSize) + .def("name", &distributed::ProcessGroup::GetBackendName) + .def("allreduce", + [](distributed::ProcessGroup &self, py::handle py_tensor, + distributed::ReduceOp op) { + auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0); + distributed::AllreduceOptions opts; + opts.reduce_op = op; + std::vector tensors = {tensor}; + return self.AllReduce(tensors, opts); + }, + py::arg("tensor"), py::arg("op") = distributed::ReduceOp::SUM, + py::call_guard()) + + .def("broadcast", + [](distributed::ProcessGroup &self, py::handle py_tensor, + int 
source_rank) { + auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0); + distributed::BroadcastOptions opts; + opts.source_rank = source_rank; + std::vector tensors = {tensor}; + return self.Broadcast(tensors, opts); + }, + py::arg("tensor"), py::arg("source_rank"), + py::call_guard()); + +#if defined(PADDLE_WITH_NCCL) + py::class_>( + *m, "ProcessGroupNCCL", ProcessGroup) + .def(py::init(), + py::call_guard()); + + py::class_>(*m, "task") + .def("is_completed", &distributed::ProcessGroup::Task::IsCompleted) + .def("wait", &distributed::ProcessGroup::Task::Wait, + py::arg("timeout") = kWaitTimeout, + py::call_guard()) + .def("synchronize", &distributed::ProcessGroup::Task::Synchronize, + py::call_guard()); +#endif + + // define parallel strategy, it will be removed + py::class_ pg_strategy( + *m, "ProcessGroupStrategy", ""); + pg_strategy.def(py::init()) + .def_property("nranks", + [](const distributed::ProcessGroupStrategy &self) { + return self.nranks_; + }, + [](distributed::ProcessGroupStrategy &self, int nranks) { + self.nranks_ = nranks; + }) + .def_property("local_rank", + [](const distributed::ProcessGroupStrategy &self) { + return self.local_rank_; + }, + [](distributed::ProcessGroupStrategy &self, + int local_rank) { self.local_rank_ = local_rank; }) + .def_property( + "trainer_endpoints", + [](const distributed::ProcessGroupStrategy &self) { + return self.trainer_endpoints_; + }, + [](distributed::ProcessGroupStrategy &self, + std::vector eps) { self.trainer_endpoints_ = eps; }) + .def_property("current_endpoint", + [](const distributed::ProcessGroupStrategy &self) { + return self.current_endpoint_; + }, + [](distributed::ProcessGroupStrategy &self, + const std::string &ep) { self.current_endpoint_ = ep; }) + .def_property("nrings", + [](const distributed::ProcessGroupStrategy &self) { + return self.nrings_; + }, + [](distributed::ProcessGroupStrategy &self, int nrings) { + self.nrings_ = nrings; + }); +} + +} // end namespace pybind +} // namespace paddle diff --git a/paddle/fluid/pybind/distributed_py.h b/paddle/fluid/pybind/distributed_py.h new file mode 100644 index 0000000000000000000000000000000000000000..be5c7549b8e8d8f5d3ad91e90ed43112a664d339 --- /dev/null +++ b/paddle/fluid/pybind/distributed_py.h @@ -0,0 +1,29 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
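The ProcessGroupStrategy binding above exposes plain struct members through paired getter/setter lambdas on def_property. As a stripped-down standalone illustration of that pybind11 pattern (the ToyStrategy struct and toy_dist module name are made up for the example, not part of this change):

#include <pybind11/pybind11.h>
#include <string>

namespace py = pybind11;

struct ToyStrategy {
  int nranks_ = 1;
  std::string current_endpoint_;
};

PYBIND11_MODULE(toy_dist, m) {
  py::class_<ToyStrategy>(m, "ToyStrategy")
      .def(py::init<>())
      // Each property pairs a const getter lambda with a setter lambda,
      // mirroring the ProcessGroupStrategy bindings above.
      .def_property("nranks",
                    [](const ToyStrategy& self) { return self.nranks_; },
                    [](ToyStrategy& self, int v) { self.nranks_ = v; })
      .def_property(
          "current_endpoint",
          [](const ToyStrategy& self) { return self.current_endpoint_; },
          [](ToyStrategy& self, const std::string& ep) {
            self.current_endpoint_ = ep;
          });
}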
+ +#pragma once + +#include "pybind11/chrono.h" +#include "pybind11/pybind11.h" +#include "pybind11/stl.h" + +namespace py = pybind11; + +namespace paddle { +namespace pybind { + +void BindDistributed(py::module* m); + +} // namespace pybind +} // namespace paddle diff --git a/paddle/fluid/pybind/eager.cc b/paddle/fluid/pybind/eager.cc index 241e9f9058dfe35fa36df12515e3ffc1a2f38a6b..d9a2dcb6869096a5f08675bb6dc7994cc8c9889b 100644 --- a/paddle/fluid/pybind/eager.cc +++ b/paddle/fluid/pybind/eager.cc @@ -50,7 +50,6 @@ PyObject* TensorNew(PyTypeObject* type, PyObject* args, PyObject* kwargs) { if (obj) { auto v = reinterpret_cast(obj); new (&(v->tensor)) paddle::experimental::Tensor(); - Py_INCREF(obj); } return obj; } @@ -58,23 +57,19 @@ PyObject* TensorNew(PyTypeObject* type, PyObject* args, PyObject* kwargs) { // TODO(jiabin): Overload this once we need more constructor in Python void EmptyTensorInitializer(TensorObject* self, const std::string& name, const paddle::platform::Place& place, - bool persistable = false, bool stop_gradient = true, + bool persistable = false, int stop_gradient = -1, framework::proto::VarType::Type dtype = paddle::framework::proto::VarType::FP32, const std::vector& dims = {}, framework::proto::VarType::Type var_type = paddle::framework::proto::VarType::LOD_TENSOR) { auto ddims = phi::make_ddim(dims); - PADDLE_ENFORCE_GE( - phi::product(ddims), 0, - paddle::platform::errors::InvalidArgument( - "Create Eager Tensor with dims contain minus num is ilegal" - "Please check your code and make sure you new a " - "eager tensor with fixed shape instead of using -1.")); self->tensor.set_name(name); auto autograd_meta = egr::EagerUtils::autograd_meta(&(self->tensor)); autograd_meta->SetPersistable(persistable); - autograd_meta->SetStopGradient(stop_gradient); + if (stop_gradient != -1) { + autograd_meta->SetStopGradient(static_cast(stop_gradient)); + } if (var_type == paddle::framework::proto::VarType::LOD_TENSOR) { // TODO(jiabin): Maybe support LOD later std::shared_ptr dense_tensor = @@ -82,19 +77,17 @@ void EmptyTensorInitializer(TensorObject* self, const std::string& name, phi::make_intrusive(place), phi::DenseTensorMeta(paddle::framework::TransToPtenDataType(dtype), ddims)); - dense_tensor->mutable_data(place); + if (phi::product(ddims) > 0) { + dense_tensor->mutable_data(place); + } self->tensor.set_impl(dense_tensor); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "We only support LoDTensor to be constructed by this initializer, " - "please check your var type first and make sure you are going to " - "construct LoDTensor.")); } if (!autograd_meta->GetMutableGradNode()) { VLOG(3) << "Tensor(" << name << ") have not GradNode, add GradNodeAccumulation for it."; - autograd_meta->SetGradNode(std::make_shared()); + autograd_meta->SetGradNode( + std::make_shared(autograd_meta)); } } @@ -143,13 +136,12 @@ void InitTensorWithTensor(TensorObject* self, src.copy_to(phi::TransToPtenBackend(place), true).impl()); VLOG(4) << "Different place, do TensorCopy"; } - egr::EagerUtils::autograd_meta(&(self->tensor))->SetStopGradient(true); if (src.get_autograd_meta()) { - egr::EagerUtils::unsafe_autograd_meta(self->tensor) + egr::EagerUtils::autograd_meta(&(self->tensor)) ->SetPersistable( egr::EagerUtils::unsafe_autograd_meta(src)->Persistable()); } else { - egr::EagerUtils::unsafe_autograd_meta(self->tensor)->SetPersistable(false); + egr::EagerUtils::autograd_meta(&(self->tensor))->SetPersistable(false); } } @@ -168,8 +160,7 @@ void 
InitTensorWithFrameworkTensor(TensorObject* self, temp.copy_to(phi::TransToPtenBackend(place), true).impl()); VLOG(4) << "Different place, do TensorCopy"; } - egr::EagerUtils::autograd_meta(&(self->tensor))->SetStopGradient(true); - egr::EagerUtils::unsafe_autograd_meta(self->tensor)->SetPersistable(false); + egr::EagerUtils::autograd_meta(&(self->tensor))->SetPersistable(false); } py::object ParsePyArray( @@ -218,21 +209,18 @@ paddle::platform::Place ParsePlace( } // boolean arguments: zero_copy, stop_gradient, persistable -bool ParseBooleanArgs(std::string key, - std::unordered_map kws_map, - std::unordered_map kw_order_map, - PyObject* args, bool flag_kwargs, Py_ssize_t args_num) { - bool res = false; - if (key == "stop_gradient") res = true; +int ParseBooleanArgs(std::string key, + std::unordered_map kws_map, + std::unordered_map kw_order_map, + PyObject* args, bool flag_kwargs, Py_ssize_t args_num) { + int res = -1; if (kw_order_map[key] <= args_num) { - res = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, kw_order_map[key] - 1), - kw_order_map[key] - 1); + res = static_cast(CastPyArg2AttrBoolean( + PyTuple_GET_ITEM(args, kw_order_map[key] - 1), kw_order_map[key] - 1)); } else { if (flag_kwargs && kws_map[key] != NULL) { - res = CastPyArg2AttrBoolean(kws_map[key], 0); - } else { - return res; + res = static_cast(CastPyArg2AttrBoolean(kws_map[key], 0)); } } return res; @@ -288,15 +276,15 @@ void AutoInitTensorByPyArray(TensorObject* py_tensor_ptr, bool persistable = false; bool zero_copy = false; std::string act_name = ""; - bool stop_gradient = true; + int stop_gradient = -1; numpy_value = ParsePyArray(kws_map, kw_order_map, args, flag_kwargs, args_num); place = ParsePlace(kws_map, kw_order_map, args, flag_kwargs, args_num); - persistable = ParseBooleanArgs("persistable", kws_map, kw_order_map, args, - flag_kwargs, args_num); - zero_copy = ParseBooleanArgs("zero_copy", kws_map, kw_order_map, args, - flag_kwargs, args_num); + persistable = (1 == ParseBooleanArgs("persistable", kws_map, kw_order_map, + args, flag_kwargs, args_num)); + zero_copy = (1 == ParseBooleanArgs("zero_copy", kws_map, kw_order_map, args, + flag_kwargs, args_num)); act_name = ParseName(kws_map, kw_order_map, args, flag_kwargs, args_num); stop_gradient = ParseBooleanArgs("stop_gradient", kws_map, kw_order_map, args, flag_kwargs, args_num); @@ -571,7 +559,7 @@ int TensorInit(PyObject* self, PyObject* args, PyObject* kwargs) { EmptyTensorInitializer(py_tensor_ptr, act_name, egr::Controller::Instance().GetExpectedPlace(), persistable, - /* stop_gradient */ true, dtype, dims, var_type); + /* stop_gradient */ -1, dtype, dims, var_type); return 0; } else { @@ -655,7 +643,7 @@ int TensorInit(PyObject* self, PyObject* args, PyObject* kwargs) { bool persistable = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 4), 4); EmptyTensorInitializer(py_tensor_ptr, act_name, egr::Controller::Instance().GetExpectedPlace(), - persistable, true, dtype, dims, var_type); + persistable, -1, dtype, dims, var_type); return 0; } else if (pybind11::detail::npy_api::get().PyArray_Check_(arg0_ptr)) { VLOG(6) << "Calling case3's initializer."; @@ -726,9 +714,8 @@ PyMappingMethods mapping_methods; void BindEager(pybind11::module* module) { auto m = module->def_submodule("eager"); - auto& internals = pybind11::detail::get_internals(); auto heap_type = reinterpret_cast( - internals.default_metaclass->tp_alloc(internals.default_metaclass, 0)); + PyType_Type.tp_alloc(&PyType_Type, 0)); heap_type->ht_name = ToPyObject("Tensor"); heap_type->ht_qualname = 
ToPyObject("Tensor"); auto type = &heap_type->ht_type; @@ -742,8 +729,8 @@ void BindEager(pybind11::module* module) { type->tp_getset = variable_properties; type->tp_init = TensorInit; type->tp_new = TensorNew; - Py_INCREF(internals.instance_base); - type->tp_base = reinterpret_cast(internals.instance_base); + Py_INCREF(&PyBaseObject_Type); + type->tp_base = reinterpret_cast(&PyBaseObject_Type); type->tp_flags |= Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | Py_TPFLAGS_HEAPTYPE; #if PY_VERSION_HEX >= 0x03050000 diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc index b1aef8fc08fea818045a97e29e5b4f2d0e30e222..4e900ae2ffbc11c4c0859ff65cf2b21048b3a649 100644 --- a/paddle/fluid/pybind/eager_method.cc +++ b/paddle/fluid/pybind/eager_method.cc @@ -177,7 +177,7 @@ static PyObject* tensor_retain_grads(TensorObject* self, PyObject* args, if (!meta->GetMutableGradNode()) { VLOG(6) << "Make grad node of tensor: " << self->tensor.name() << "become accumulation node"; - meta->SetGradNode(std::make_shared()); + meta->SetGradNode(std::make_shared(meta)); } egr::egr_utils_api::RetainGradForTensor(self->tensor); } @@ -186,36 +186,51 @@ static PyObject* tensor_retain_grads(TensorObject* self, PyObject* args, EAGER_CATCH_AND_THROW_RETURN_NULL } -static PyObject* tensor__clear_gradient(TensorObject* self, PyObject* args, - PyObject* kwargs) { +static PyObject* tensor_clear_gradient(TensorObject* self, PyObject* args, + PyObject* kwargs) { EAGER_TRY VLOG(4) << "ClearGradient " << self->tensor.name(); + Py_ssize_t args_num = PyTuple_Size(args); + bool set_to_zero = true; + if (args_num == (Py_ssize_t)1) { + CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 0), 0); + } + paddle::experimental::Tensor* grad; if (egr::egr_utils_api::IsLeafTensor(self->tensor)) { - // Add RetainGrad as PostHook to AccumulationNode - std::shared_ptr grad_node = - egr::EagerUtils::grad_node(self->tensor); - PADDLE_ENFORCE( - grad_node.get() != nullptr, - paddle::platform::errors::Fatal("Detected NULL grad_node" - "Leaf tensor should have had grad_node " - "with type: GradNodeAccumulation")); - auto accumulation_grad_node = - std::dynamic_pointer_cast(grad_node); - grad = accumulation_grad_node->Grad(); + grad = egr::EagerUtils::mutable_grad(self->tensor); + PADDLE_ENFORCE(grad != nullptr, + paddle::platform::errors::Fatal( + "Detected NULL grad" + "Please check if you have manually cleared" + "the grad inside autograd_meta")); } else { auto meta = egr::EagerUtils::unsafe_autograd_meta(self->tensor); grad = meta->MutableGrad(); } - if (grad->initialized()) { - VLOG(4) << "Gradient of " << self->tensor.name() - << " is initialized, will be released."; - auto dense_tensor = - std::dynamic_pointer_cast(grad->impl()); - dense_tensor->MoveMemoryHolder(); + if (grad->is_selected_rows()) { + auto selected_rows = + std::dynamic_pointer_cast(grad->impl()); + if (selected_rows->mutable_value()->IsInitialized()) { + selected_rows->mutable_rows()->clear(); + selected_rows->mutable_value()->clear(); + } + } else if (grad->is_dense_tensor()) { + if (grad->initialized()) { + if (set_to_zero) { + grad->set_impl(paddle::experimental::zeros_like(*grad).impl()); + } else { + VLOG(4) << "Gradient of " << self->tensor.name() + << " is initialized, will be released."; + auto dense_tensor = + std::dynamic_pointer_cast(grad->impl()); + dense_tensor->MoveMemoryHolder(); + } + } } + Py_INCREF(Py_None); return Py_None; EAGER_CATCH_AND_THROW_RETURN_NULL @@ -228,19 +243,15 @@ static PyObject* tensor__zero_grads(TensorObject* self, 
PyObject* args, if (egr::egr_utils_api::IsLeafTensor(self->tensor)) { // Add RetainGrad as PostHook to AccumulationNode - std::shared_ptr grad_node = - egr::EagerUtils::grad_node(self->tensor); - PADDLE_ENFORCE( - grad_node.get() != nullptr, - paddle::platform::errors::Fatal("Detected NULL grad_node" - "Leaf tensor should have had grad_node " - "with type: GradNodeAccumulation")); - auto accumulation_grad_node = - std::dynamic_pointer_cast(grad_node); - if (accumulation_grad_node->Grad()->initialized()) { - accumulation_grad_node->Grad()->set_impl( - paddle::experimental::zeros_like(*(accumulation_grad_node->Grad())) - .impl()); + paddle::experimental::Tensor* grad = + egr::EagerUtils::mutable_grad(self->tensor); + PADDLE_ENFORCE(grad != nullptr, + paddle::platform::errors::Fatal( + "Detected NULL grad" + "Please check if you have manually cleared" + "the grad inside autograd_meta")); + if (grad->initialized()) { + grad->set_impl(paddle::experimental::zeros_like(*(grad)).impl()); } } else { auto meta = egr::EagerUtils::unsafe_autograd_meta(self->tensor); @@ -407,7 +418,7 @@ PyMethodDef variable_methods[] = { METH_VARARGS | METH_KEYWORDS, NULL}, {"retain_grads", (PyCFunction)(void (*)(void))tensor_retain_grads, METH_VARARGS | METH_KEYWORDS, NULL}, - {"_clear_gradient", (PyCFunction)(void (*)(void))tensor__clear_gradient, + {"clear_gradient", (PyCFunction)(void (*)(void))tensor_clear_gradient, METH_VARARGS | METH_KEYWORDS, NULL}, {"_zero_grads", (PyCFunction)(void (*)(void))tensor__zero_grads, METH_VARARGS | METH_KEYWORDS, NULL}, diff --git a/paddle/fluid/pybind/eager_properties.cc b/paddle/fluid/pybind/eager_properties.cc index 43cfb50f2afe11a131e4bd71862b9efa84c841a9..2e1390cb96155c4832a8ceace889e331039ed43f 100644 --- a/paddle/fluid/pybind/eager_properties.cc +++ b/paddle/fluid/pybind/eager_properties.cc @@ -70,26 +70,13 @@ PyObject* tensor_properties_get_stop_gradient(TensorObject* self, PyObject* tensor_properties_get_grad(TensorObject* self, void* closure) { EAGER_TRY - if (egr::egr_utils_api::IsLeafTensor(self->tensor)) { - std::shared_ptr grad_node = - egr::EagerUtils::grad_node(self->tensor); - PADDLE_ENFORCE( - grad_node.get() != nullptr, - paddle::platform::errors::Fatal("Detected NULL grad_node" - "Leaf tensor should have had grad_node " - "with type: GradNodeAccumulation")); - auto accumulation_grad_node = - std::dynamic_pointer_cast(grad_node); - return ToPyObject(*accumulation_grad_node->Grad()); + VLOG(6) << "Get grad for tensor: " << self->tensor.name(); + auto meta = egr::EagerUtils::nullable_autograd_meta(self->tensor); + if (meta) { + return ToPyObject(meta->Grad()); } else { - VLOG(6) << "Get grad for tensor: " << self->tensor.name(); - auto meta = egr::EagerUtils::nullable_autograd_meta(self->tensor); - if (meta) { - return ToPyObject(meta->Grad()); - } else { - Py_INCREF(Py_None); - return Py_None; - } + Py_INCREF(Py_None); + return Py_None; } EAGER_CATCH_AND_THROW_RETURN_NULL } @@ -101,16 +88,15 @@ int tensor_properties_set_grad(TensorObject* self, PyObject* value, PADDLE_ENFORCE( egr::egr_utils_api::IsLeafTensor(self->tensor), paddle::platform::errors::Fatal("Only leaf Tensor can be set grad.")); - std::shared_ptr grad_node = - egr::EagerUtils::grad_node(self->tensor); - PADDLE_ENFORCE( - grad_node.get() != nullptr, - paddle::platform::errors::Fatal("Detected NULL grad_node" - "Leaf tensor should have had grad_node " - "with type: GradNodeAccumulation")); - auto accumulation_grad_node = - std::dynamic_pointer_cast(grad_node); - 
accumulation_grad_node->Grad()->copy_(src, true); + + paddle::experimental::Tensor* grad = + egr::EagerUtils::mutable_grad(self->tensor); + PADDLE_ENFORCE(grad != nullptr, + paddle::platform::errors::Fatal( + "Detected NULL grad" + "Please check if you have manually cleared" + "the grad inside autograd_meta")); + grad->copy_(src, true); return 0; EAGER_CATCH_AND_THROW_RETURN_ZERO } diff --git a/paddle/fluid/pybind/eager_utils.cc b/paddle/fluid/pybind/eager_utils.cc index 9c033376d6c439f8a89a13fecf7bb968706504ef..c1e8822eec22179266d69d3b97890aebe678b187 100644 --- a/paddle/fluid/pybind/eager_utils.cc +++ b/paddle/fluid/pybind/eager_utils.cc @@ -555,6 +555,32 @@ PyObject* ToPyObject( return dict; } +// For Final State Dygraph, +// We directly use paddle::optional(Tensor) as dispensable Tensor +paddle::optional GetOptionalTensorFromArgs( + const std::string& op_type, const std::string& arg_name, PyObject* args, + ssize_t arg_idx, bool dispensable) { + PyObject* obj = PyTuple_GET_ITEM(args, arg_idx); + + if (PyTuple_Check(obj)) { + obj = PyTuple_GET_ITEM(obj, 0); + } + + if (obj == nullptr || obj == Py_None) { + if (!dispensable) { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument '%s' (position %d) must be Tensor, but got None", + op_type, arg_name, arg_idx)); + } + return {}; + } + + return paddle::make_optional( + reinterpret_cast(obj)->tensor); +} + +// For Intermediate State Dygraph, +// we use an uninitialized Tensor to represent dispensable Tensor paddle::experimental::Tensor& GetTensorFromArgs(const std::string& op_type, const std::string& arg_name, PyObject* args, ssize_t arg_idx, diff --git a/paddle/fluid/pybind/eager_utils.h b/paddle/fluid/pybind/eager_utils.h index fb19e108aeb7035a52b708358539fa198643f9db..0c721d6124791edda7f41d46dcbbbfcccc80fb95 100644 --- a/paddle/fluid/pybind/eager_utils.h +++ b/paddle/fluid/pybind/eager_utils.h @@ -89,10 +89,15 @@ PyObject* ToPyObject(const std::tuple& out) { return result; } +paddle::optional GetOptionalTensorFromArgs( + const std::string& op_type, const std::string& arg_name, PyObject* args, + ssize_t arg_idx, bool dispensable = false); + paddle::experimental::Tensor& GetTensorFromArgs(const std::string& op_type, const std::string& arg_name, PyObject* args, ssize_t arg_idx, bool dispensable = false); + std::vector GetTensorListFromArgs( const std::string& op_type, const std::string& arg_name, PyObject* args, ssize_t arg_idx, bool dispensable = false); @@ -102,6 +107,7 @@ paddle::experimental::Tensor* GetTensorPtrFromArgs(const std::string& op_type, PyObject* args, ssize_t arg_idx, bool dispensable = false); + std::vector GetTensorPtrListFromArgs( const std::string& op_type, const std::string& arg_name, PyObject* args, ssize_t arg_idx, bool dispensable = false); diff --git a/paddle/fluid/pybind/fleet_py.cc b/paddle/fluid/pybind/fleet_py.cc index 73c8f362d145db078ac4c84c91372dcdd61c47af..3145a9cf7655c053c269990e00982226eae49c7a 100644 --- a/paddle/fluid/pybind/fleet_py.cc +++ b/paddle/fluid/pybind/fleet_py.cc @@ -103,11 +103,13 @@ void BindCommunicatorContext(py::module* m) { py::init&, const std::vector&, const std::vector&, const std::vector&, int, bool, bool, bool, int, - bool>()) + bool, bool, int64_t>()) .def("var_name", [](const CommContext& self) { return self.var_name; }) .def("trainer_id", [](const CommContext& self) { return self.trainer_id; }) .def("table_id", [](const CommContext& self) { return self.table_id; }) + .def("program_id", + [](const CommContext& self) { return self.program_id; }) .def("split_varnames", 
[](const CommContext& self) { return self.splited_varnames; }) .def("split_endpoints", @@ -122,6 +124,8 @@ void BindCommunicatorContext(py::module* m) { [](const CommContext& self) { return self.origin_varnames; }) .def("is_tensor_table", [](const CommContext& self) { return self.is_tensor_table; }) + .def("is_datanorm_table", + [](const CommContext& self) { return self.is_datanorm_table; }) .def("__str__", [](const CommContext& self) { return self.print(); }); } diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index 387addda9edd1fd011281545f423527cac6d8bd6..8c5ed2d11830195a6fb70c54d12c9ef3eb3fc8b2 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -36,6 +36,7 @@ limitations under the License. */ #include "paddle/fluid/imperative/amp_auto_cast.h" #include "paddle/fluid/imperative/basic_engine.h" #include "paddle/fluid/imperative/bkcl_context.h" +#include "paddle/fluid/imperative/cncl_context.h" #include "paddle/fluid/imperative/data_loader.h" #include "paddle/fluid/imperative/gloo_context.h" #include "paddle/fluid/imperative/hccl_context.h" @@ -2559,6 +2560,18 @@ void BindImperative(py::module *m_ptr) { py::arg("ring_id")); #endif +#if defined(PADDLE_WITH_CNCL) + py::class_>( + m, "CNCLParallelContext") + .def(py::init()) + .def("init", [](imperative::CNCLParallelContext &self) { self.Init(); }) + .def("init_with_ring_id", + &imperative::CNCLParallelContext::InitWithRingID, + py::arg("ring_id")); +#endif + #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_ASCEND_CL) py::class_(m, "LodRankTable") .def("items", [](framework::LoDRankTable &table) { @@ -3893,6 +3896,9 @@ All parameter, weight, gradient are variables in Paddle. 
BindCompatible(&m); BindDataset(&m); BindGenerator(&m); +#ifndef PADDLE_ON_INFERENCE + BindDistributed(&m); +#endif #ifdef PADDLE_WITH_ASCEND BindAscendWrapper(&m); BindAscendGraph(&m); diff --git a/paddle/infrt/CMakeLists.txt b/paddle/infrt/CMakeLists.txt index 2486c54d5addc40fae2c019ab6b0db4d6121a290..0f6dfb9d8f44e8be8fd41405ce635dff85ab2044 100644 --- a/paddle/infrt/CMakeLists.txt +++ b/paddle/infrt/CMakeLists.txt @@ -2,13 +2,13 @@ if (NOT WITH_INFRT) return() endif() -option(INFRT_WITH_PTEN "Compile INFRT with PTEN" ON) +option(INFRT_WITH_PHI "Compile INFRT with PHI" ON) #TODO(xiaowei) remove fluid include_directories(${PADDLE_SOURCE_DIR}/paddle/fluid/platform) -if (INFRT_WITH_PTEN) - add_definitions("-DINFRT_WITH_PTEN") +if (INFRT_WITH_PHI) + add_definitions("-DINFRT_WITH_PHI") endif() # compile flags @@ -97,16 +97,16 @@ set(infrt_mlir_incs rewrite_inc trt_ops_inc ) -if (INFRT_WITH_PTEN) - set(pten_libs pten) +if (INFRT_WITH_PHI) + set(phi_libs pten) set(infrt_mlir_incs ${infrt_mlir_incs} - MLIRinfrt_pten_tensorIncGen - MLIRinfrt_pten_baseIncGen + MLIRinfrt_phi_tensorIncGen + MLIRinfrt_phi_baseIncGen ) endif() -cc_library(infrt SHARED SRCS ${infrt_src} DEPS glog boost ${mlir_libs} ${pten_libs} paddle_framework_proto infrt_naive) -cc_library(infrt_static SRCS ${infrt_src} DEPS glog boost ${mlir_libs} ${pten_libs} paddle_framework_proto) +cc_library(infrt SHARED SRCS ${infrt_src} DEPS glog boost ${mlir_libs} ${phi_libs} paddle_framework_proto infrt_naive) +cc_library(infrt_static SRCS ${infrt_src} DEPS glog boost ${mlir_libs} ${phi_libs} paddle_framework_proto) add_dependencies(infrt ${infrt_mlir_incs} mlir-headers) add_custom_target(test_infrt_exec DEPENDS ${INFRT_TEST_TARGETS}) diff --git a/paddle/infrt/api/infrt_api.cc b/paddle/infrt/api/infrt_api.cc index c2a4e0aff7a08e6b66fb2b2ce6f3165e1adcfd0a..28f63db49f4baec12bb43afa9034d5578d9f6cb1 100644 --- a/paddle/infrt/api/infrt_api.cc +++ b/paddle/infrt/api/infrt_api.cc @@ -42,7 +42,6 @@ using namespace infrt::host_context; // NOLINT using namespace infrt::tensor; // NOLINT using namespace infrt::tensor; // NOLINT using infrt::dt::TensorMapType; // NOLINT -using infrt::dt::TensorType; // NOLINT namespace infrt { @@ -145,7 +144,7 @@ class PredictExecutor : public MlirToRuntimeTranslator { // process results auto& last_op = predict_func.front().back(); - if (last_op.getName().getStringRef() == "infrt.return") { + if (last_op.getName().getStringRef() == "Infrt.return") { for (size_t i = 0; i < last_op.getNumOperands(); ++i) { auto* value = AddValue(mlir::Value(last_op.getOperand(i))); results_.push_back(ValueRef(value)); diff --git a/paddle/infrt/backends/host/pten_allocator.h b/paddle/infrt/backends/host/phi_allocator.h similarity index 95% rename from paddle/infrt/backends/host/pten_allocator.h rename to paddle/infrt/backends/host/phi_allocator.h index fa61e04fb670741c959c427d8d12c42fb1217251..c8f97e04a1b8376efbac749fffa70d77c7b95e72 100644 --- a/paddle/infrt/backends/host/pten_allocator.h +++ b/paddle/infrt/backends/host/phi_allocator.h @@ -16,7 +16,7 @@ limitations under the License. 
*/ namespace infrt { namespace backends { -class CpuPtenAllocator : public phi::Allocator { +class CpuPhiAllocator : public phi::Allocator { public: static void deleter(phi::Allocation* ptr) { ::operator delete(ptr); } diff --git a/paddle/infrt/backends/host/pten_context.h b/paddle/infrt/backends/host/phi_context.h similarity index 94% rename from paddle/infrt/backends/host/pten_context.h rename to paddle/infrt/backends/host/phi_context.h index 961c93529aeb44200f320d5804c561887257a4d6..9d0e3bc4fbb3158147283c1992cf1fee70c9b90d 100644 --- a/paddle/infrt/backends/host/pten_context.h +++ b/paddle/infrt/backends/host/phi_context.h @@ -16,7 +16,7 @@ limitations under the License. */ namespace infrt { namespace backends { -class CpuPtenContext : public phi::CPUContext { +class CpuPhiContext : public phi::CPUContext { public: using Base = phi::CPUContext; using phi::CPUContext::SetEigenDevice; diff --git a/paddle/infrt/dialect/CMakeLists.txt b/paddle/infrt/dialect/CMakeLists.txt index 757d47a8de43e2a394ad5296e617ed6ed94078f3..e35989da2085b21f4dbfaadea05793fc9dcb8753 100644 --- a/paddle/infrt/dialect/CMakeLists.txt +++ b/paddle/infrt/dialect/CMakeLists.txt @@ -16,7 +16,7 @@ gather_srcs(infrt_src SRCS mlir_tablegen_on(basic_kernels) mlir_tablegen_on(test_kernels) -mlir_tablegen_on(infrt_base DIALECT infrt) +mlir_tablegen_on(infrt_base DIALECT Infrt) mlir_tablegen_on(tensor_shape DIALECT ts) mlir_tablegen_on(dense_tensor DIALECT dt) mlir_tablegen_on(pd_op_base DIALECT pd) @@ -36,6 +36,6 @@ cc_test_tiny(test_infrt_mlir_loader SRCS mlir_loader_test.cc DEPS infrt ${MLIR_I add_subdirectory(infrt) add_subdirectory(tensorrt) -if (INFRT_WITH_PTEN) - add_subdirectory(pten) +if (INFRT_WITH_PHI) + add_subdirectory(phi) endif() diff --git a/paddle/infrt/dialect/basic_kernels.cc b/paddle/infrt/dialect/basic_kernels.cc index bad7e73ec5ae5c3216a912729637664bba17d3b0..c1aa75fb24650b99ea8371c0ecbe7e572df2f0ce 100644 --- a/paddle/infrt/dialect/basic_kernels.cc +++ b/paddle/infrt/dialect/basic_kernels.cc @@ -90,7 +90,7 @@ static ParseResult parseReturnOp(OpAsmParser &parser, // NOLINT } static void print(OpAsmPrinter &p, CallOp op) { // NOLINT - p << "infrt.call " << op->getAttr("callee") << "("; + p << op->getAttr("callee") << "("; p.printOperands(op.getOperands()); p << ")"; p.printOptionalAttrDict(op->getAttrs(), {"callee"}); @@ -98,7 +98,7 @@ static void print(OpAsmPrinter &p, CallOp op) { // NOLINT } static void printConstant(OpAsmPrinter &p, mlir::Operation *op) { // NOLINT - p << op->getName() << " "; + p << " "; p.printOptionalAttrDict(op->getAttrs(), /*elidedAttrs=*/{"value"}); if (op->getAttrs().size() > 1) p << ' '; @@ -128,7 +128,6 @@ static void print(OpAsmPrinter &p, ConstantI64Op op) { // NOLINT } static void print(OpAsmPrinter &p, ReturnOp op) { // NOLINT - p << "infrt.return"; if (op.getNumOperands() > 0) { p << ' '; p.printOperands(op.getOperands()); diff --git a/paddle/infrt/dialect/basic_kernels.td b/paddle/infrt/dialect/basic_kernels.td index 32845a09351f70fe1acd7659b8c5e3a579ff83e0..aadc146e36280f79902f3b9ed90f3203fb9e5384 100644 --- a/paddle/infrt/dialect/basic_kernels.td +++ b/paddle/infrt/dialect/basic_kernels.td @@ -48,10 +48,10 @@ def ConstantF64Op : ConstantOp<"f64", F64, F64Attr>; def ReturnOp : INFRT_Op<"return", [Terminator]> { let summary = "host executor return operation"; let description = [{ - The "infrt.return" operation represents a return operation within a function. + The "Infrt.return" operation represents a return operation within a function. 
func @foo() : (i32, f8) { - infrt.return %0, %1 : i32, f8 + Infrt.return %0, %1 : i32, f8 } }]; @@ -112,7 +112,7 @@ def PrintF32Op : PrintOp<"f32", F32>; def PrintF64Op : PrintOp<"f64", F64>; def GetStringOp : INFRT_Op<"get_string"> { - let summary = "infrt.get_string"; + let summary = "Infrt.get_string"; let description = [{ Get a !infrt.string value from the given string attribute. }]; @@ -124,7 +124,7 @@ def GetStringOp : INFRT_Op<"get_string"> { } def PrintStringOp : INFRT_Op<"print_string"> { - let summary = "infrt.print_string"; + let summary = "Infrt.print_string"; let description = [{ An operation that prints a string. }]; diff --git a/paddle/infrt/dialect/dense_tensor.cc b/paddle/infrt/dialect/dense_tensor.cc index fde265765c6d2251019403a1a7bc861206d3fe0c..49d6887ada0322065946f95c9e39d932f268375e 100644 --- a/paddle/infrt/dialect/dense_tensor.cc +++ b/paddle/infrt/dialect/dense_tensor.cc @@ -39,52 +39,6 @@ void DTDialect::initialize() { >(); } -llvm::Optional GetTargetType(mlir::StringRef key) { - if (key.equals_insensitive("x86")) - return TargetType::X86; - else if (key.equals_insensitive("cuda")) - return TargetType::CUDA; - else - return llvm::None; -} - -llvm::Optional GetLayoutType(mlir::StringRef key) { - if (key.equals_insensitive("nchw")) - return LayoutType::NCHW; - else if (key.equals_insensitive("nhwc")) - return LayoutType::NHWC; - else - return llvm::None; -} - -llvm::Optional GetPrecisionType(mlir::StringRef key) { - if (key.equals_insensitive("i32")) - return PrecisionType::I32; - else if (key.equals_insensitive("f32")) - return PrecisionType::F32; - else - return llvm::None; -} - -TensorType TensorType::get(mlir::MLIRContext *ctx, - TargetType target, - LayoutType layout, - PrecisionType precision) { - return Base::get(ctx, target, layout, precision); -} - -TargetType TensorType::target() { return getImpl()->target_; } - -LayoutType TensorType::layout() { return getImpl()->layout_; } - -PrecisionType TensorType::precision() { return getImpl()->precision_; } - -mlir::raw_ostream &operator<<(mlir::raw_ostream &os, TensorType tensorType) { - os << "TensorType<" << tensorType.target() << ", " << tensorType.layout() - << ", " << tensorType.precision() << ">"; - return os; -} - TensorMapType TensorMapType::get() { return Base::get(::infrt::Global::getMLIRContext()); } @@ -101,48 +55,6 @@ StringType StringType::get(mlir::MLIRContext *context) { return Base::get(context); } -mlir::raw_ostream &operator<<(mlir::raw_ostream &os, TargetType type) { - switch (type) { - case (TargetType::X86): - os << "X86"; - break; - case (TargetType::CUDA): - os << "CUDA"; - break; - default: - os << "Unsupported"; - } - return os; -} - -mlir::raw_ostream &operator<<(mlir::raw_ostream &os, LayoutType type) { - switch (type) { - case (LayoutType::NCHW): - os << "NCHW"; - break; - case (LayoutType::NHWC): - os << "NHWC"; - break; - default: - os << "Unsupported"; - } - return os; -} - -mlir::raw_ostream &operator<<(mlir::raw_ostream &os, PrecisionType type) { - switch (type) { - case (PrecisionType::I32): - os << "I32"; - break; - case (PrecisionType::F32): - os << "F32"; - break; - default: - os << "Unsupported"; - } - return os; -} - static mlir::Type getTensorType(mlir::MLIRContext *context) { auto t_dialect = mlir::Identifier::get("t", context); return mlir::OpaqueType::get(t_dialect, "tensor"); @@ -165,7 +77,7 @@ static mlir::ParseResult parseCreateUninitTensorOp( if (parser.parseArrow()) return mlir::failure(); if (parser.parseType(outputRawTypes[0])) return mlir::failure(); - if 
(!outputRawTypes[0].isa()) + if (!outputRawTypes[0].isa()) return parser.emitError(loc, "invalid kind of type specified"); result.addTypes(outputTypes); return mlir::success(); diff --git a/paddle/infrt/dialect/dense_tensor.h b/paddle/infrt/dialect/dense_tensor.h index 08ba8d720662b8c7ac4f224d8fe6366d4acc7d3e..b0a1ea412c53eb677fed1a1b76e704f3f3da11e5 100644 --- a/paddle/infrt/dialect/dense_tensor.h +++ b/paddle/infrt/dialect/dense_tensor.h @@ -19,68 +19,10 @@ #include +#include "paddle/infrt/dialect/infrt/infrt_dialect.h" + namespace infrt { namespace dt { -enum class TargetType : uint8_t { X86, CUDA }; -enum class LayoutType : uint8_t { NCHW, NHWC }; -enum class PrecisionType : uint8_t { I32, F32 }; - -llvm::Optional GetTargetType(mlir::StringRef key); -llvm::Optional GetLayoutType(mlir::StringRef key); -llvm::Optional GetPrecisionType(mlir::StringRef key); - -mlir::raw_ostream &operator<<(mlir::raw_ostream &os, TargetType type); -mlir::raw_ostream &operator<<(mlir::raw_ostream &os, LayoutType type); -mlir::raw_ostream &operator<<(mlir::raw_ostream &os, PrecisionType type); - -namespace detail { -struct TensorTypeStorage : public mlir::TypeStorage { - TensorTypeStorage(TargetType target, - LayoutType layout, - PrecisionType precision) - : target_(target), layout_(layout), precision_(precision) {} - - using KeyTy = std::tuple; - - bool operator==(const KeyTy &key) const { - return key == KeyTy(target_, layout_, precision_); - } - - static llvm::hash_code hashKey(const KeyTy &key) { - return llvm::hash_value(key); - } - - static TensorTypeStorage *construct( - mlir::TypeStorageAllocator &allocator, // NOLINT - const KeyTy &key) { - return new (allocator.allocate()) - TensorTypeStorage(std::get<0>(key), std::get<1>(key), std::get<2>(key)); - } - - TargetType target_; - LayoutType layout_; - PrecisionType precision_; -}; -} // namespace detail - -class TensorType : public mlir::Type::TypeBase { - public: - using Base::Base; - - static TensorType get(mlir::MLIRContext *ctx, - TargetType target, - LayoutType layout, - PrecisionType precision); - - TargetType target(); - LayoutType layout(); - PrecisionType precision(); -}; - -mlir::raw_ostream &operator<<(mlir::raw_ostream &os, TensorType tensorType); - class TensorMapType : public mlir::Type::TypeBase { diff --git a/paddle/infrt/dialect/dense_tensor.td b/paddle/infrt/dialect/dense_tensor.td index 75c8a0d88e4c11f5e27d7b6d38062e118475274b..7e6e838a72372d2f850d4fb37f6b2218577ba0ed 100644 --- a/paddle/infrt/dialect/dense_tensor.td +++ b/paddle/infrt/dialect/dense_tensor.td @@ -28,7 +28,7 @@ class CreateUninitTensorOp }]; let arguments = (ins I64ArrayAttr:$shape); - let results = (outs TensorType:$output); + let results = (outs DenseTensor:$output); let parser = [{ return infrt::dt::parseCreateUninitTensorOp(parser, result); }]; let printer = [{ return infrt::dt::printCreateUninitTensorOp(p, *this); }]; @@ -43,8 +43,8 @@ def ShallowCopyTensorOp An operation that copy a tensor shallowly. }]; - let arguments = (ins TensorType:$input); - let results = (outs TensorType:$output); + let arguments = (ins DenseTensor:$input); + let results = (outs DenseTensor:$output); let assemblyFormat = "$input attr-dict `:` type($input) `->` type($output)"; } @@ -59,7 +59,7 @@ class FillTensorWithConstantOp : }]; let arguments = (ins - TensorType:$input, + DenseTensor:$input, AnyAttr:$value ); let results = (outs); @@ -77,7 +77,7 @@ def PrintTensorOp : DT_Op<"print_tensor"> { An operation that prints a tensor. 
}]; - let arguments = (ins TensorType:$input); + let arguments = (ins DenseTensor:$input); let results = (outs); let assemblyFormat = "`(` $input `:` type($input) `)` attr-dict"; } @@ -90,7 +90,7 @@ class SetTensorOp : An operation that sets an input tensor with given values. }]; - let arguments = (ins TensorType); + let arguments = (ins DenseTensor); let results = (outs); let parser = [{ return infrt::dt::parseSetTensorOp(parser, result); }]; @@ -125,7 +125,7 @@ def TensorMapGetTensorOp : DT_Op<"tensor_map_get_tensor", [NoSideEffect]> { TensorMapType:$map, StrAttr:$name ); - let results = (outs TensorType:$output); + let results = (outs DenseTensor:$output); let assemblyFormat = "`(` operands `)` attr-dict `->` type($output)"; let verifier = ?; } @@ -149,7 +149,7 @@ def GetTensorShapeOp : DT_Op<"get_tensor_shape", [NoSideEffect]> { An operation that returns the shape of the input tensor. }]; - let arguments = (ins TensorType:$input); + let arguments = (ins DenseTensor:$input); let results = (outs TS_Shape:$output); let assemblyFormat = "$input attr-dict `:` type($input) `->` type($output)"; } @@ -162,8 +162,8 @@ class NaiveElementwiseAddOp : Naive elementwise_add operation. Just for testing. }]; - let arguments = (ins TensorType:$a, TensorType:$b); - let results = (outs TensorType:$output); + let arguments = (ins DenseTensor:$a, DenseTensor:$b); + let results = (outs DenseTensor:$output); let assemblyFormat = "`(` $a `,` $b `)` attr-dict `:` `(` type($a) `,` type($b) `)` `->` type($output)"; } @@ -175,8 +175,8 @@ class NaiveMatmulOp : Naive matmul operation. Just for testing. }]; - let arguments = (ins TensorType:$x, TensorType:$w); - let results = (outs TensorType:$output); + let arguments = (ins DenseTensor:$x, DenseTensor:$w); + let results = (outs DenseTensor:$output); let assemblyFormat = "`(` $x `,` $w `)` attr-dict `:` `(` type($x) `,` type($w) `)` `->` type($output)"; } diff --git a/paddle/infrt/dialect/infrt/CMakeLists.txt b/paddle/infrt/dialect/infrt/CMakeLists.txt index 98910d8d0ecf0b99bd1eb8b860ed573ae88ef203..daf710e0baf54549a2cc3e7a6e87c7b76a169f29 100644 --- a/paddle/infrt/dialect/infrt/CMakeLists.txt +++ b/paddle/infrt/dialect/infrt/CMakeLists.txt @@ -1,7 +1,15 @@ core_gather_headers() gather_srcs(infrt_src SRCS + common_type.cc infrt_dialect.cc ) -add_mlir_dialect(infrt_ops Infrt) + +add_mlir_dialect(infrt_ops infrt) + +set(LLVM_TARGET_DEFINITIONS infrt_ops.td) +mlir_tablegen(infrt_opsAttributes.h.inc -gen-attrdef-decls -dialect=infrt) +mlir_tablegen(infrt_opsAttributes.cpp.inc -gen-attrdef-defs -dialect=infrt) +add_public_tablegen_target(MLIRinfrt_opsAttributesIncGen) +add_dependencies(mlir-headers MLIRinfrt_opsAttributesIncGen) diff --git a/paddle/infrt/dialect/infrt/common_type.cc b/paddle/infrt/dialect/infrt/common_type.cc new file mode 100644 index 0000000000000000000000000000000000000000..5cbd7b2cd6153f3724bc357811bdb0894eeb64ba --- /dev/null +++ b/paddle/infrt/dialect/infrt/common_type.cc @@ -0,0 +1,88 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/dialect/infrt/common_type.h" + +namespace infrt { + +llvm::Optional GetTargetType(llvm::StringRef key) { + if (key.equals_insensitive("CPU")) + return TargetType::CPU; + else if (key.equals_insensitive("GPU")) + return TargetType::GPU; + else + return llvm::None; +} + +llvm::Optional GetLayoutType(llvm::StringRef key) { + if (key.equals_insensitive("NCHW")) + return LayoutType::NCHW; + else if (key.equals_insensitive("NHWC")) + return LayoutType::NHWC; + else + return llvm::None; +} + +llvm::Optional GetPrecisionType(llvm::StringRef key) { + if (key.equals_insensitive("FP32")) + return PrecisionType::FLOAT32; + else if (key.equals_insensitive("FP16")) + return PrecisionType::FLOAT16; + else + return llvm::None; +} + +llvm::raw_ostream &operator<<(llvm::raw_ostream &os, TargetType type) { + switch (type) { + case (TargetType::CPU): + os << "CPU"; + break; + case (TargetType::GPU): + os << "GPU"; + break; + default: + os << "Unsupported"; + } + return os; +} + +llvm::raw_ostream &operator<<(llvm::raw_ostream &os, LayoutType type) { + switch (type) { + case (LayoutType::NCHW): + os << "NCHW"; + break; + case (LayoutType::NHWC): + os << "NHWC"; + break; + default: + os << "Unsupported"; + } + return os; +} + +llvm::raw_ostream &operator<<(llvm::raw_ostream &os, PrecisionType type) { + switch (type) { + case (PrecisionType::FLOAT32): + os << "FP32"; + break; + case (PrecisionType::FLOAT16): + os << "FP16"; + break; + default: + os << "Unsupported"; + } + return os; +} + +} // namespace infrt diff --git a/paddle/infrt/dialect/infrt/common_type.h b/paddle/infrt/dialect/infrt/common_type.h new file mode 100644 index 0000000000000000000000000000000000000000..d6d6503c03be5722cf398c8abac4485aae5d9a8c --- /dev/null +++ b/paddle/infrt/dialect/infrt/common_type.h @@ -0,0 +1,47 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include +#include + +namespace infrt { + +enum class TargetType : uint8_t { CPU, GPU, UNK }; +enum class PrecisionType : uint8_t { FLOAT32, FLOAT16, UNK }; +enum class LayoutType : uint8_t { NCHW, NHWC, UNK }; + +struct Place { + TargetType target; + PrecisionType precision; + LayoutType layout; + Place(TargetType tar, PrecisionType pre, LayoutType lay) + : target(tar), precision(pre), layout(lay) {} + Place() + : target(TargetType::UNK), + precision(PrecisionType::UNK), + layout(LayoutType::UNK) {} +}; + +llvm::Optional GetTargetType(llvm::StringRef key); +llvm::Optional GetLayoutType(llvm::StringRef key); +llvm::Optional GetPrecisionType(llvm::StringRef key); + +llvm::raw_ostream &operator<<(llvm::raw_ostream &os, TargetType type); +llvm::raw_ostream &operator<<(llvm::raw_ostream &os, LayoutType type); +llvm::raw_ostream &operator<<(llvm::raw_ostream &os, PrecisionType type); + +} // end namespace infrt diff --git a/paddle/infrt/dialect/infrt/infrt_dialect.cc b/paddle/infrt/dialect/infrt/infrt_dialect.cc index 388de858b6572ea5900851b170d09589387c0b05..abb60016f90233cae68dc99e95885042517e9212 100644 --- a/paddle/infrt/dialect/infrt/infrt_dialect.cc +++ b/paddle/infrt/dialect/infrt/infrt_dialect.cc @@ -23,6 +23,9 @@ #define GET_TYPEDEF_CLASSES #include "paddle/infrt/dialect/infrt/infrt_opsTypes.cpp.inc" +#define GET_ATTRDEF_CLASSES +#include "paddle/infrt/dialect/infrt/infrt_opsAttributes.cpp.inc" + #define GET_OP_CLASSES #include "paddle/infrt/dialect/infrt/infrt_ops.cpp.inc" @@ -33,6 +36,12 @@ void InfrtDialect::initialize() { #define GET_TYPEDEF_LIST #include "paddle/infrt/dialect/infrt/infrt_opsTypes.cpp.inc" // NOLINT >(); + + addAttributes< +#define GET_ATTRDEF_LIST +#include "paddle/infrt/dialect/infrt/infrt_opsAttributes.cpp.inc" // NOLINT + >(); + addOperations< #define GET_OP_LIST #include "paddle/infrt/dialect/infrt/infrt_ops.cpp.inc" // NOLINT @@ -57,36 +66,104 @@ mlir::Type InfrtDialect::parseType(::mlir::DialectAsmParser &parser) const { // Parse the element type. 
if (parser.parseType(elementType)) return nullptr; - // parse "," - if (parser.parseComma()) return nullptr; - - // llvm::APInt lod_level; - if (parser.parseInteger(lod_level)) return nullptr; - + // parse optional lod_level + if (parser.parseOptionalComma().succeeded()) { + // llvm::APInt lod_level; + if (parser.parseInteger(lod_level)) return nullptr; + } // parse ">" if (parser.parseGreater()) return nullptr; return LoDTensorType::get( parser.getContext(), shape, elementType, lod_level); } + if (keyword == "dense_tensor") { + // parse DenseTensor, for example: !i=Infrt.tensor + llvm::StringRef target; + llvm::StringRef layout; + llvm::StringRef precision; + + // parse "<" + if (parser.parseLess()) return mlir::Type(); + // parse target + if (parser.parseKeyword(&target)) return mlir::Type(); + auto targetType = GetTargetType(target); + if (!targetType) { + parser.emitError(parser.getCurrentLocation(), "unknown target type: ") + << target; + return mlir::Type(); + } + + // parse "," + if (parser.parseComma()) return mlir::Type(); + // parse precision + if (parser.parseKeyword(&precision)) return mlir::Type(); + auto precisionType = GetPrecisionType(precision); + if (!precisionType) { + parser.emitError(parser.getCurrentLocation(), "unknown precision type: ") + << precision; + return mlir::Type(); + } + + // parse "," + if (parser.parseComma()) return mlir::Type(); + + // parse layout + if (parser.parseKeyword(&layout)) return mlir::Type(); + auto layoutType = GetLayoutType(layout); + if (!layoutType) { + parser.emitError(parser.getCurrentLocation(), "unknown layout type: ") + << layout; + return mlir::Type(); + } + // parse ">" + if (parser.parseGreater()) return mlir::Type(); + return DenseTensorType::get( + parser.getContext(), *targetType, *precisionType, *layoutType); + } // Todo: parse other type return mlir::Type(); } void InfrtDialect::printType(::mlir::Type type, ::mlir::DialectAsmPrinter &os) const { - // print TensorType, for example: !infrt.tensor + // print LoDTensorType, for example: !Infrt.lod_tensor<3x64x3x3xf32,5> if (type.isa()) { - auto lodTensorType = type.cast(); + auto lod_tensor_type = type.cast(); os << "lod_tensor<"; - auto shape = lodTensorType.getShape(); - for (auto dim = shape.begin(), e = shape.end() - 1; dim != e; ++dim) - os << *dim << 'x'; - os << shape.back() << 'x' << lodTensorType.getElementType() << ", " - << lodTensorType.getLod_level() << ">"; + auto shape = lod_tensor_type.getShape(); + for (auto dim = shape.begin(), e = shape.end() - 1; dim != e; ++dim) { + *dim < 0 ? os << '?' : os << *dim; + os << 'x'; + } + shape.back() < 0 ? os << '?' : os << shape.back(); + os << 'x' << lod_tensor_type.getElementType() << ", " + << lod_tensor_type.getLod_level() << ">"; return; } + + // print DenseTensorType, for example: !infrt.dense_tensor + if (type.isa()) { + auto dense_tensor_type = type.cast(); + os << "dense_tensor<" << dense_tensor_type.getTarget() << ", " + << dense_tensor_type.getPrecision() << ", " + << dense_tensor_type.getLayout() << ">"; + return; + } + llvm_unreachable("unknown infrt type."); } +// /// Parse an attribute registered to this dialect. +// ::mlir::Attribute InfrtDialect::parseAttribute(::mlir::DialectAsmParser +// &parser, +// ::mlir::Type type) const { +// return mlir::Attribute(); +// } +// /// Print an attribute registered to this dialect. 
+// void InfrtDialect::printAttribute(::mlir::Attribute attr, +// ::mlir::DialectAsmPrinter &os) const { + +// } + } // namespace infrt diff --git a/paddle/infrt/dialect/infrt/infrt_dialect.h b/paddle/infrt/dialect/infrt/infrt_dialect.h index 21a1f6b34f6a5f33bd82c4e78669ee24221a08f1..ed5b36e556149dbc3026e732cf953c5562841921 100644 --- a/paddle/infrt/dialect/infrt/infrt_dialect.h +++ b/paddle/infrt/dialect/infrt/infrt_dialect.h @@ -17,13 +17,19 @@ //===----------------------------------------------------------------------===// // Dialect //===----------------------------------------------------------------------===// +#include #include #include #include #include +#include "paddle/infrt/dialect/infrt/common_type.h" #include "paddle/infrt/dialect/infrt/infrt_opsDialect.h.inc" #define GET_TYPEDEF_CLASSES #include "paddle/infrt/dialect/infrt/infrt_opsTypes.h.inc" + +#define GET_ATTRDEF_CLASSES +#include "paddle/infrt/dialect/infrt/infrt_opsAttributes.h.inc" + #define GET_OP_CLASSES #include "paddle/infrt/dialect/infrt/infrt_ops.h.inc" diff --git a/paddle/infrt/dialect/infrt/infrt_ops.td b/paddle/infrt/dialect/infrt/infrt_ops.td index 319760973cd90c667793e29761c030141990c242..00f94805c7db22e170c7395598bfe647174339c1 100644 --- a/paddle/infrt/dialect/infrt/infrt_ops.td +++ b/paddle/infrt/dialect/infrt/infrt_ops.td @@ -1,34 +1,4 @@ -#ifndef Infrt_OpS -#define Infrt_OpS - -include "mlir/IR/OpBase.td" -include "mlir/Interfaces/SideEffectInterfaces.td" - -def Infrt_Dialect : Dialect { - let summary = - "A dialect containing the Infrt Attributes, Operations, and Types"; - - let name = "Infrt"; - let cppNamespace = "::infrt"; -} - -// Type definitions - -// Base class for Infrt dialect types. -class Infrt_Type traits = [], - string baseCppClass = "::mlir::Type"> - : TypeDef { -} - -def LoDTensor : Infrt_Type<"LoDTensor"> { - let summary = "infrt lod tensor"; - let description = [{lod_tensor<3x64x3x3xf32, 3>}]; - let parameters = (ins - ArrayRefParameter<"int64_t">:$shape, - "mlir::Type":$elementType, - "int32_t":$lod_level - ); -} +include "paddle/infrt/dialect/infrt/infrt_ops_base.td" // Op definition class Infrt_Op traits = []> : Op { @@ -39,14 +9,11 @@ class Infrt_Op traits = []> : Op { -// let summary = "kernel op"; -// let description = [{ -// kernel op! -// }]; -// let arguments = (ins StrAttr:$name, PD_Tensor:$X, PD_Tensor:$Y, DefaultValuedAttr:$Alpha, DefaultValuedAttr:$Beta); -// -// let results = (outs PD_Tensor:$Out); -// } - -#endif // Infrt_OpS +def Infrt_KernelOp : Infrt_Op<"kernel", [NoSideEffect]> { + let summary = "kernel op"; + let description = [{kernel op!}]; + let arguments = (ins Variadic:$operands, + StrAttr:$name, + OptionalAttr:$attrs); + let results = (outs Variadic); +} diff --git a/paddle/infrt/dialect/infrt/infrt_ops_base.td b/paddle/infrt/dialect/infrt/infrt_ops_base.td new file mode 100644 index 0000000000000000000000000000000000000000..81d3d028a66bea29dd9a373e1905ac02468251fd --- /dev/null +++ b/paddle/infrt/dialect/infrt/infrt_ops_base.td @@ -0,0 +1,49 @@ +#ifndef INFRT_OPS_BASE +#define INFRT_OPS_BASE + +include "mlir/IR/OpBase.td" +include "mlir/Interfaces/SideEffectInterfaces.td" + +def Infrt_Dialect : Dialect { + let summary = + "A dialect containing the Infrt Attributes, Operations, and Types"; + + let name = "infrt"; + let cppNamespace = "::infrt"; +} + +// Type definitions + +// Base class for Infrt dialect types. 
+class Infrt_Type traits = [], + string baseCppClass = "::mlir::Type"> + : TypeDef { +} + +def LoDTensor : Infrt_Type<"LoDTensor"> { + let summary = "infrt lod tensor"; + let description = [{lod_tensor<3x64x3x3xf32, 3>}]; + let parameters = (ins + ArrayRefParameter<"int64_t">:$shape, + "mlir::Type":$elementType, + "int32_t":$lod_level + ); +} + +def DenseTensor : Infrt_Type<"DenseTensor"> { + let summary = "infrt dense tensor"; + let description = [{dense_tensor<, 3>}]; + let parameters = (ins + "TargetType":$target, + "PrecisionType":$precision, + "LayoutType":$layout + ); +} + +// Base class for infrt dialect attributes. +class Infrt_Attr traits = [], + string baseCppClass = "::mlir::Attribute"> + : AttrDef { + let mnemonic = ?; +} +#endif // INFRT_OPS_BASE diff --git a/paddle/infrt/dialect/infrt_base.cc b/paddle/infrt/dialect/infrt_base.cc index c0101a8c16608bc732f7b786c62ed4ea90ab2628..8c595c06745f1be8453c4d1f08ba00f4d9ceaf90 100644 --- a/paddle/infrt/dialect/infrt_base.cc +++ b/paddle/infrt/dialect/infrt_base.cc @@ -27,7 +27,6 @@ void INFRTDialect::initialize() { allowUnknownOperations(); addTypes(); - addTypes(); addTypes(); addOperations< @@ -43,51 +42,6 @@ void INFRTDialect::initialize() { mlir::Type INFRTDialect::parseType(mlir::DialectAsmParser &parser) const { llvm::StringRef keyword; if (parser.parseKeyword(&keyword)) return mlir::Type(); - // parse TensorType, for example: !infrt.tensor - if (keyword == "tensor") { - llvm::StringRef target; - llvm::StringRef layout; - llvm::StringRef precision; - - // parse "<" - if (parser.parseLess()) return mlir::Type(); - // parse target - if (parser.parseKeyword(&target)) return mlir::Type(); - auto targetType = infrt::dt::GetTargetType(target); - if (!targetType) { - parser.emitError(parser.getCurrentLocation(), "unknown target type: ") - << target; - return mlir::Type(); - } - - // parse "," - if (parser.parseComma()) return mlir::Type(); - // parse layout - if (parser.parseKeyword(&layout)) return mlir::Type(); - auto layoutType = infrt::dt::GetLayoutType(layout); - if (!layoutType) { - parser.emitError(parser.getCurrentLocation(), "unknown layout type: ") - << layout; - return mlir::Type(); - } - - // parse "," - if (parser.parseComma()) return mlir::Type(); - // parse precision - if (parser.parseKeyword(&precision)) return mlir::Type(); - auto precisionType = infrt::dt::GetPrecisionType(precision); - if (!precisionType) { - parser.emitError(parser.getCurrentLocation(), "unknown precision type: ") - << precision; - return mlir::Type(); - } - - // parse ">" - if (parser.parseGreater()) return mlir::Type(); - - return infrt::dt::TensorType::get( - parser.getContext(), *targetType, *layoutType, *precisionType); - } // parse TensorMapType, for example: !infrt.tensor_map if (keyword == "tensor_map") { return infrt::dt::TensorMapType::get(); @@ -104,13 +58,6 @@ mlir::Type INFRTDialect::parseType(mlir::DialectAsmParser &parser) const { void INFRTDialect::printType(mlir::Type type, mlir::DialectAsmPrinter &printer) const { - // print TensorType, for example: !infrt.tensor - if (type.isa()) { - auto tensorType = type.cast(); - printer << "tensor<" << tensorType.target() << ", " << tensorType.layout() - << ", " << tensorType.precision() << ">"; - return; - } // print TensorMapType, for example: !infrt.tensor_map if (type.isa()) { printer << "tensor_map"; diff --git a/paddle/infrt/dialect/infrt_base.h b/paddle/infrt/dialect/infrt_base.h index 4021a5a6d3cd2b6d7ca272b69c6cc477ea25cad2..a8e7e13a681caa4891c42ac01d2a759d878594d1 100644 --- 
a/paddle/infrt/dialect/infrt_base.h +++ b/paddle/infrt/dialect/infrt_base.h @@ -43,7 +43,7 @@ class INFRTDialect : public mlir::Dialect { friend class mlir::MLIRContext; public: - static ::llvm::StringRef getDialectNamespace() { return "infrt"; } + static ::llvm::StringRef getDialectNamespace() { return "Infrt"; } }; } // namespace dialect diff --git a/paddle/infrt/dialect/infrt_base.td b/paddle/infrt/dialect/infrt_base.td index 1abd294236d93cfb0aa7ce70db25f2acfb57a06a..4d4727ee8e185032c6530cd293b0545283660e46 100644 --- a/paddle/infrt/dialect/infrt_base.td +++ b/paddle/infrt/dialect/infrt_base.td @@ -2,9 +2,10 @@ #define INFRT_BASE include "mlir/IR/OpBase.td" +include "paddle/infrt/dialect/infrt/infrt_ops_base.td" def INFRT_Dialect : Dialect { - let name = "infrt"; + let name = "Infrt"; let description = [{ The INFRT host dialect. @@ -18,9 +19,6 @@ def StringType : Type()">, "!infrt.string type">, BuildableType<"$_builder.getType<::infrt::dt::StringType>()">; -def TensorType : - Type()">, "!infrt.tensor type">; - def TensorMapType : Type()">, "!infrt.tensor_map type">, BuildableType<"$_builder.getType<::infrt::dt::TensorMapType>()">; diff --git a/paddle/infrt/dialect/init_infrt_dialects.cc b/paddle/infrt/dialect/init_infrt_dialects.cc index 090f1aea289109feda54b12131daf2993ea4e5e0..b5b8de7a20d0866802b8ce72e12dd7ed35dccbd1 100644 --- a/paddle/infrt/dialect/init_infrt_dialects.cc +++ b/paddle/infrt/dialect/init_infrt_dialects.cc @@ -21,8 +21,8 @@ #include "paddle/infrt/dialect/infrt/infrt_dialect.h" #include "paddle/infrt/dialect/infrt_base.h" #include "paddle/infrt/dialect/pd_ops.h" -#include "paddle/infrt/dialect/pten/infrt_pten_tensor.h" -#include "paddle/infrt/dialect/pten/pten_base.h" +#include "paddle/infrt/dialect/phi/infrt_phi_tensor.h" +#include "paddle/infrt/dialect/phi/phi_base.h" #include "paddle/infrt/dialect/tensor_shape.h" namespace infrt { @@ -32,9 +32,9 @@ void registerCinnDialects(mlir::DialectRegistry ®istry) { // NOLINT infrt::InfrtDialect, dt::DTDialect, mlir::pd::PaddleDialect, -#ifdef INFRT_WITH_PTEN - pten::PTENDenseTensorDialect, - pten::PTENDialect +#ifdef INFRT_WITH_PHI + phi::PHIDenseTensorDialect, + phi::PHIDialect #endif >(); } diff --git a/paddle/infrt/dialect/mlir_loader_test.cc b/paddle/infrt/dialect/mlir_loader_test.cc index 11150530730444ed74f547b9bb8abef5473c61b0..2f721e49a63096d1c3168805d373cbc8809542da 100644 --- a/paddle/infrt/dialect/mlir_loader_test.cc +++ b/paddle/infrt/dialect/mlir_loader_test.cc @@ -32,13 +32,13 @@ TEST(MlirLoader, basic) { auto source = R"ROC( func @main() -> f32 { - %v0 = infrt.constant.f32 1.0 - %v1 = infrt.constant.f32 2.0 - %value = "infrt.add.f32"(%v0, %v1) : (f32, f32) -> f32 + %v0 = Infrt.constant.f32 1.0 + %v1 = Infrt.constant.f32 2.0 + %value = "Infrt.add.f32"(%v0, %v1) : (f32, f32) -> f32 - "infrt.print.f32"(%v0) : (f32) -> () + "Infrt.print.f32"(%v0) : (f32) -> () - infrt.return %value : f32 + Infrt.return %value : f32 } )ROC"; diff --git a/paddle/infrt/dialect/pd_op_base.td b/paddle/infrt/dialect/pd_op_base.td index a61a4645eff76cc1fdcbf5176bf4d3e9a606f89e..266bdf60de788df0507a5bf0ef679945cb7c2abc 100644 --- a/paddle/infrt/dialect/pd_op_base.td +++ b/paddle/infrt/dialect/pd_op_base.td @@ -6,7 +6,7 @@ include "mlir/IR/OpBase.td" include "mlir/Interfaces/SideEffectInterfaces.td" -include "paddle/infrt/dialect/infrt/infrt_ops.td" +include "paddle/infrt/dialect/infrt/infrt_ops_base.td" def PD_Dialect : Dialect { let name = "pd"; diff --git a/paddle/infrt/dialect/phi/CMakeLists.txt b/paddle/infrt/dialect/phi/CMakeLists.txt 
new file mode 100644 index 0000000000000000000000000000000000000000..626b02c1f790d0a7f38887be33dace1c773a2cb1 --- /dev/null +++ b/paddle/infrt/dialect/phi/CMakeLists.txt @@ -0,0 +1,18 @@ +if (NOT INFRT_WITH_PHI) + return() +endif() + +#mlir_tablegen_on(infrt_phi_base DIALECT phi) +add_mlir_dialect(infrt_phi_base phi) +add_mlir_dialect(infrt_phi_tensor phi_dt) +add_mlir_dialect(infrt_phi_kernel phi_kernel) +#mlir_tablegen_on(infrt_phi_tensor) + +gather_srcs(infrt_src SRCS + phi_base.cc infrt_phi_tensor.cc + infrt_phi_tensor.cc) + +add_subdirectory(pass) + +add_executable(phi-exec phi_exec.cc) +target_link_libraries(phi-exec infrt) diff --git a/paddle/infrt/dialect/pten/infrt_pten_base.td b/paddle/infrt/dialect/phi/infrt_phi_base.td similarity index 54% rename from paddle/infrt/dialect/pten/infrt_pten_base.td rename to paddle/infrt/dialect/phi/infrt_phi_base.td index 20a43f9a92620debd4cf382222de5f9dfe93b9a2..907f912d9e638ba76e5010d5442381d1aa053bc2 100644 --- a/paddle/infrt/dialect/pten/infrt_pten_base.td +++ b/paddle/infrt/dialect/phi/infrt_phi_base.td @@ -1,28 +1,30 @@ -#ifndef PTEN_BASE -#define PTEN_BASE +#ifndef PHI_BASE +#define PHI_BASE include "mlir/IR/OpBase.td" -def PTEN_Dialect : Dialect { - let name = "pten"; +def PHI_Dialect : Dialect { + let name = "phi"; let description = [{ - The PTEN host dialect. + The PHI host dialect. }]; - let cppNamespace = "::infrt::pten"; + let cppNamespace = "::infrt::phi"; } class AllocatorTypeOf traits=[]>: - TypeDef { - let summary = !strconcat("!pten.allocator_", place, " type"); + TypeDef { + let summary = !strconcat("!phi.allocator_", place, " type"); } class ContextTypeOf traits=[]>: - TypeDef { - let summary = !strconcat("!pten.context_", place, " type"); + TypeDef { + let summary = !strconcat("!phi.context_", place, " type"); } +def PhiOpTrait : NativeOpTrait<"PhiOpTrait">; + def CPU_Allocator : AllocatorTypeOf<"CPU">; def GPU_Allocator : AllocatorTypeOf<"GPU">; diff --git a/paddle/infrt/dialect/phi/infrt_phi_kernel.td b/paddle/infrt/dialect/phi/infrt_phi_kernel.td new file mode 100644 index 0000000000000000000000000000000000000000..879994907cc0d951bde838b23fd129e865a360f2 --- /dev/null +++ b/paddle/infrt/dialect/phi/infrt_phi_kernel.td @@ -0,0 +1,29 @@ +#ifndef PHI_KERNEL +#define PHI_KERNEL + +include "mlir/Interfaces/SideEffectInterfaces.td" +include "mlir/IR/OpBase.td" +include "paddle/infrt/dialect/infrt_base.td" +include "paddle/infrt/dialect/phi/infrt_phi_base.td" + +def PHI_KernelDialect : Dialect { + let name = "phi_kernel"; + + let description = [{ + The PHI Kernel dialect. + }]; + + let cppNamespace = "::infrt::phi"; +} + +// PHI Kernel related ops. +class PDT_Kernel traits = []> : Op { +} + +def PDCK_AbsOp : PDT_Kernel<"phi.abs.host.fp32"> { + let arguments = (ins CPU_Context:$dev_ctx, DenseTensor:$x); + let results = (outs DenseTensor:$output); +} + +#endif + diff --git a/paddle/infrt/dialect/pten/infrt_pten_tensor.cc b/paddle/infrt/dialect/phi/infrt_phi_tensor.cc similarity index 65% rename from paddle/infrt/dialect/pten/infrt_pten_tensor.cc rename to paddle/infrt/dialect/phi/infrt_phi_tensor.cc index b3e99da8750fb9691833256b2d7d1f09aae2e27c..9df1a47031b1f726578291f628cda7d12900bcb7 100644 --- a/paddle/infrt/dialect/pten/infrt_pten_tensor.cc +++ b/paddle/infrt/dialect/phi/infrt_phi_tensor.cc @@ -12,25 +12,25 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/infrt/dialect/pten/infrt_pten_tensor.h" +#include "paddle/infrt/dialect/phi/infrt_phi_tensor.h" #include -#include "paddle/infrt/dialect/pten/infrt_pten_tensorDialect.cpp.inc" -#include "paddle/infrt/dialect/pten/infrt_pten_tensorTypes.cpp.inc" +#include "paddle/infrt/dialect/phi/infrt_phi_tensorDialect.cpp.inc" +#include "paddle/infrt/dialect/phi/infrt_phi_tensorTypes.cpp.inc" namespace infrt { -namespace pten { +namespace phi { -void PTENDenseTensorDialect::initialize() { +void PHIDenseTensorDialect::initialize() { #define GET_OP_LIST addOperations< -#include "paddle/infrt/dialect/pten/infrt_pten_tensor.cpp.inc" +#include "paddle/infrt/dialect/phi/infrt_phi_tensor.cpp.inc" >(); } -} // namespace pten +} // namespace phi } // namespace infrt #define GET_OP_CLASSES -#include "paddle/infrt/dialect/pten/infrt_pten_tensor.cpp.inc" // NOLINT +#include "paddle/infrt/dialect/phi/infrt_phi_tensor.cpp.inc" // NOLINT diff --git a/paddle/infrt/dialect/pten/infrt_pten_tensor.h b/paddle/infrt/dialect/phi/infrt_phi_tensor.h similarity index 83% rename from paddle/infrt/dialect/pten/infrt_pten_tensor.h rename to paddle/infrt/dialect/phi/infrt_phi_tensor.h index 5fe259300d2aec32fade1141de2dbf8cef314687..2780f9759185ef45bc19f43fc621f46eabbe7a66 100644 --- a/paddle/infrt/dialect/pten/infrt_pten_tensor.h +++ b/paddle/infrt/dialect/phi/infrt_phi_tensor.h @@ -29,11 +29,11 @@ #include #include -#include "paddle/infrt/dialect/pten/infrt_pten_tensorDialect.h.inc" -#include "paddle/infrt/dialect/pten/infrt_pten_tensorTypes.h.inc" +#include "paddle/infrt/dialect/phi/infrt_phi_tensorDialect.h.inc" +#include "paddle/infrt/dialect/phi/infrt_phi_tensorTypes.h.inc" #include "paddle/infrt/dialect/dense_tensor.h" -#include "paddle/infrt/dialect/pten/pten_base.h" +#include "paddle/infrt/dialect/phi/phi_base.h" // NOLINT #define GET_OP_CLASSES -#include "paddle/infrt/dialect/pten/infrt_pten_tensor.h.inc" +#include "paddle/infrt/dialect/phi/infrt_phi_tensor.h.inc" diff --git a/paddle/infrt/dialect/pten/infrt_pten_tensor.td b/paddle/infrt/dialect/phi/infrt_phi_tensor.td similarity index 63% rename from paddle/infrt/dialect/pten/infrt_pten_tensor.td rename to paddle/infrt/dialect/phi/infrt_phi_tensor.td index 528f0f919680d65dd9636b96686838b427459eff..b7b3b061fdbe42909ac503d9d387cb8aed6bdc1a 100644 --- a/paddle/infrt/dialect/pten/infrt_pten_tensor.td +++ b/paddle/infrt/dialect/phi/infrt_phi_tensor.td @@ -1,36 +1,36 @@ -#ifdef PTEN_TENSOR +#ifdef PHI_TENSOR #else -#define PTEN_TENSOR +#define PHI_TENSOR -include "paddle/infrt/dialect/pten/infrt_pten_base.td" +include "paddle/infrt/dialect/phi/infrt_phi_base.td" include "mlir/Interfaces/SideEffectInterfaces.td" include "mlir/IR/OpBase.td" include "paddle/infrt/dialect/infrt_base.td" -def PTEN_DenseTensorDialect : Dialect { - let name = "pten_dt"; +def PHI_DenseTensorDialect : Dialect { + let name = "phi_dt"; let description = [{ - The PTEN DenseTensor dialect. + The PHI DenseTensor dialect. }]; - let cppNamespace = "::infrt::pten"; + let cppNamespace = "::infrt::phi"; } -// PTEN DenseTensor related Op. -class PDT_Op traits = []> : Op { +// PHI DenseTensor related Op. +class PDT_Op traits = []> : Op { } class CreateDenseTensorOp : PDT_Op<"create_dense_tensor." # place # "." # dtype # "." # layout, [NoSideEffect]> { let arguments = (ins CPU_Allocator:$allocator, I64ArrayAttr:$dims, I64ArrayAttr:$lod); - let results = (outs TensorType:$output); + let results = (outs DenseTensor:$output); } class FillDenseTensorOp : PDT_Op<"fill_dense_tensor." 
# dtype> { let arguments = (ins - TensorType:$input, + DenseTensor:$input, attr_type:$value ); let results = (outs); @@ -53,4 +53,9 @@ def PDT_FillDenseTensorOp_f32 : FillDenseTensorOp; def PDT_CreateAllocatorOp_cpu : CreateCPUAllocatorOp; def PDT_CreateContextOp_cpu : CreateCPUContextOp; +def FakeKernelOp : PDT_Op<"fake_phi_kernel"> { + let arguments = (ins CPU_Context:$dev_ctx, DenseTensor:$x, DenseTensor:$y, BoolAttr:$transpose_x, BoolAttr:$transpose_y); + let results = (outs DenseTensor:$output); +} + #endif diff --git a/paddle/infrt/dialect/phi/pass/CMakeLists.txt b/paddle/infrt/dialect/phi/pass/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..5c55a6b0acaed7be9ee86b4662d895d08ca05bdc --- /dev/null +++ b/paddle/infrt/dialect/phi/pass/CMakeLists.txt @@ -0,0 +1,7 @@ +core_gather_headers() + +gather_srcs(infrt_src SRCS + proto_arg_map_context.cc + phi_op_cvt_pass.cc + kernel_op_desc.cc + ) diff --git a/paddle/infrt/dialect/phi/pass/kernel_op_desc.cc b/paddle/infrt/dialect/phi/pass/kernel_op_desc.cc new file mode 100644 index 0000000000000000000000000000000000000000..63869b7d7b9ea4fd7841dfe352a3b79e9cd18725 --- /dev/null +++ b/paddle/infrt/dialect/phi/pass/kernel_op_desc.cc @@ -0,0 +1,133 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/infrt/dialect/phi/pass/kernel_op_desc.h" +#include +#include "paddle/phi/core/kernel_factory.h" +#include "paddle/phi/core/kernel_registry.h" +namespace infrt { + +phi::Backend cvtTarget2Phi(TargetType target) { + switch (target) { + case TargetType::CPU: + return phi::Backend::CPU; + case TargetType::GPU: + return phi::Backend::GPU; + default: + return phi::Backend::UNDEFINED; + } +} + +TargetType cvtTargetFromPhi(phi::Backend backend) { + switch (backend) { + case phi::Backend::CPU: + return TargetType::CPU; + case phi::Backend::GPU: + return TargetType::GPU; + default: + return TargetType::UNK; + } +} + +phi::DataType cvtPrecision2Phi(PrecisionType precision) { + switch (precision) { + case PrecisionType::FLOAT32: + return phi::DataType::FLOAT32; + break; + case PrecisionType::FLOAT16: + return phi::DataType::FLOAT16; + default: + return phi::DataType::UNDEFINED; + } +} + +PrecisionType cvtPrecisionFromPhi(phi::DataType datatype) { + switch (datatype) { + case phi::DataType::FLOAT32: + return PrecisionType::FLOAT32; + case phi::DataType::FLOAT16: + return PrecisionType::FLOAT16; + default: + return PrecisionType::UNK; + } +} + +phi::DataLayout cvtLayout2Phi(LayoutType layout) { + switch (layout) { + case LayoutType::NCHW: + return phi::DataLayout::NCHW; + case LayoutType::NHWC: + return phi::DataLayout::NHWC; + default: + return phi::DataLayout::UNDEFINED; + } +} + +LayoutType cvtLayoutFromPhi(phi::DataLayout layout) { + switch (layout) { + case phi::DataLayout::NCHW: + return LayoutType::NCHW; + case phi::DataLayout::NHWC: + return LayoutType::NHWC; + default: + return LayoutType::UNK; + } +} + +phi::KernelKey cvtPlace2Phi(const Place& place) { + return phi::KernelKey(cvtTarget2Phi(place.target), + cvtLayout2Phi(place.layout), + cvtPrecision2Phi(place.precision)); +} + +Place cvtPlaceFromPhi(phi::TensorArgDef tensor_arg) { + return Place(cvtTargetFromPhi(tensor_arg.backend), + cvtPrecisionFromPhi(tensor_arg.dtype), + cvtLayoutFromPhi(tensor_arg.layout)); +} + +std::vector getCandidateKernels( + std::string name, const std::vector& valid_palces) { + std::vector candidate_kernels; + PhiKernelDesc phi_kernel_desc; + phi::KernelKeyMap kernel_key_map = + phi::KernelFactory::Instance().SelectKernelMap(name); + for (const Place& place : valid_palces) { + phi::KernelKey kernel_key = cvtPlace2Phi(place); + if (kernel_key_map.find(kernel_key) == kernel_key_map.end()) { + kernel_key = phi::KernelKey(kernel_key.backend(), + phi::DataLayout::ALL_LAYOUT, + kernel_key.dtype()); + if (kernel_key_map.find(kernel_key) == kernel_key_map.end()) continue; + } + phi_kernel_desc.kernelType = place; + phi_kernel_desc.inputsType.clear(); + phi_kernel_desc.outputsType.clear(); + phi::KernelArgsDef args_def = kernel_key_map.at(kernel_key).args_def(); + const paddle::SmallVector& input_arg = + args_def.input_defs(); + const paddle::SmallVector& output_arg = + args_def.output_defs(); + for (auto tensor_arg : input_arg) { + phi_kernel_desc.inputsType.emplace_back(cvtPlaceFromPhi(tensor_arg)); + } + for (auto tensor_arg : output_arg) { + phi_kernel_desc.outputsType.emplace_back(cvtPlaceFromPhi(tensor_arg)); + } + candidate_kernels.emplace_back(phi_kernel_desc); + } + return candidate_kernels; +} + +} // namespace infrt diff --git a/paddle/infrt/dialect/pten/pten_base.h b/paddle/infrt/dialect/phi/pass/kernel_op_desc.h similarity index 65% rename from paddle/infrt/dialect/pten/pten_base.h rename to paddle/infrt/dialect/phi/pass/kernel_op_desc.h index 
c3be6ef4e8bf407ad31ed6318fa249b8e3e55ca5..b74107f674e51f6ca09c864d197d9334a08666ac 100644 --- a/paddle/infrt/dialect/pten/pten_base.h +++ b/paddle/infrt/dialect/phi/pass/kernel_op_desc.h @@ -13,18 +13,20 @@ // limitations under the License. #pragma once -#include -#include -#include #include +#include +#include "paddle/infrt/dialect/infrt/common_type.h" -#include "paddle/infrt/dialect/pten/infrt_pten_base.h.inc" -#include "paddle/infrt/dialect/pten/infrt_pten_baseDialect.h.inc" +namespace infrt { -#define GET_TYPEDEF_CLASSES -#include "paddle/infrt/dialect/pten/infrt_pten_baseTypes.h.inc" +struct PhiKernelDesc { + std::vector inputsType; // kernel input place + std::vector outputsType; // kernel output place + Place kernelType; // kernel place +}; + +std::vector getCandidateKernels( + std::string name, const std::vector& valid_palces); -namespace infrt { -namespace pten {} // namespace pten } // namespace infrt diff --git a/paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.cc b/paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..df3472aa01dfb8bfa0e7f6122410c1b4788cd359 --- /dev/null +++ b/paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.cc @@ -0,0 +1,116 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.h" + +#include +#include +#include +#include +#include +#include +#include + +#include "paddle/infrt/dialect/infrt/infrt_dialect.h" +#include "paddle/infrt/dialect/phi/pass/kernel_op_desc.h" +#include "paddle/infrt/dialect/phi/pass/proto_arg_map_context.h" +#include "paddle/phi/core/compat/op_utils.h" +#include "paddle/phi/ops/compat/signatures.h" +namespace infrt { +// Implementation of the phiOpCvtPass. +void phiOpCvtPass::runOnFunction() { + convertStage(); + diapatchStage(); +} +void phiOpCvtPass::convertStage() { + mlir::Block &body = getFunction().front(); + std::vector worklist; + for (auto &op : body.without_terminator()) { + worklist.push_back(&op); + } + mlir::OpBuilder builder(&body, body.begin()); + while (!worklist.empty()) { + auto *op = worklist.back(); + worklist.pop_back(); + if (op == nullptr) continue; + + std::string op_name = op->getName().getIdentifier().str(); + + // only convert op in pd dialect. 
+ if (op_name.substr(0, 3) != "pd.") continue; + op_name = op_name.substr(3); + if (pd_dialect_inputs_info_map_.find(op_name) == + pd_dialect_inputs_info_map_.end() || + pd_dialect_outputs_info_map_.find(op_name) == + pd_dialect_outputs_info_map_.end()) { + // Todo: print log + continue; + } + + phi::KernelSignature kernel_sign = + phi::OpUtilsMap::Instance().GetArgumentMappingFn(op_name)( + ProtoArgumentMappingContext(op)); + // resort input&output according to kernel_sign + ::llvm::SmallVector inputs, ori_output; + ::llvm::SmallVector output_types; + for (const std::string &str : std::get<0>(kernel_sign.args)) { + if (pd_dialect_inputs_info_map_.at(op_name).count(str) == 0) { + // Todo: print error log + return; + } + uint8_t index = pd_dialect_inputs_info_map_.at(op_name).at(str); + inputs.push_back(op->getOperands()[index]); + } + + for (const std::string &str : std::get<2>(kernel_sign.args)) { + if (pd_dialect_outputs_info_map_.at(op_name).count(str) == 0) { + // Todo: print error log + return; + } + uint8_t index = pd_dialect_outputs_info_map_.at(op_name).at(str); + output_types.push_back(op->getResultTypes()[index]); + ori_output.push_back(op->getResult(index)); + } + + auto loc = getFunction().getLoc(); + builder.setInsertionPoint(op); + auto kernel_op = builder.create( + loc, output_types, inputs, kernel_sign.name, op->getAttrDictionary()); + for (size_t index = 0; index < ori_output.size(); ++index) { + ori_output[index].replaceAllUsesWith(kernel_op.getResult(index)); + } + if (!op->use_empty()) { + // Todo: print error log + return; + } + op->erase(); + } +} +void phiOpCvtPass::diapatchStage() { + std::vector worklist; + mlir::Block &block = getFunction().front(); + for (auto &op : block) { + infrt::KernelOp kernel_op = ::llvm::dyn_cast_or_null(&op); + if (nullptr != kernel_op) worklist.push_back(kernel_op); + } + // ToDo: implementation in the next PR + while (!worklist.empty()) { + // infrt::KernelOp kernel_op = worklist.back(); + worklist.pop_back(); + // std::string kernel_name = kernel_op.name().str(); + // std::vector candidates = + // getCandidateKernels(kernel_name, valid_places_); + } +} +} // namespace infrt diff --git a/paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.h b/paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..051fee9b61a24772ff2295280fa1b0a1588d7bae --- /dev/null +++ b/paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.h @@ -0,0 +1,57 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include "paddle/infrt/dialect/infrt/common_type.h" + +namespace infrt { +/* + * phiOpCvtPass. + * + * Convert the general operators in pd Dialect to a infrt.kernelOp. + * + * source func: + * + * func @main() -> tensor { + * %a = "pd.feed"()... + * %c = "pd.conv2d"(%a) ... + * %d = "pd.conv3d"(%c) ... + * %f = "pd.conv2d"(%a) ... 
+ * "pd.fetch" (%d, %f) + * } + * + * destination func: + * func @main() -> tensor { + * %a = "pd.feed"()... + * %c = "infrt.kernel"(%a){name = "conv2d"} ... + * %d = "infrt.kernel"(%c){name = "conv3d"}... + * %f = "infrt.kernel"(%a){name = "conv2d"}... + * "pd.fetch" (%d, %f) + * } + */ +class phiOpCvtPass + : public mlir::PassWrapper { + public: + ::llvm::StringRef getName() const override { return "phiOpCvtPass"; } + void runOnFunction() override; + explicit phiOpCvtPass(std::vector valid_places = std::vector()) + : valid_places_(valid_places) {} + + private: + void convertStage(); + void diapatchStage(); + std::vector valid_places_; +}; +} // namespace infrt diff --git a/paddle/infrt/dialect/phi/pass/proto_arg_map_context.cc b/paddle/infrt/dialect/phi/pass/proto_arg_map_context.cc new file mode 100644 index 0000000000000000000000000000000000000000..64b184359700ee2625e3c61d21617619a50771e3 --- /dev/null +++ b/paddle/infrt/dialect/phi/pass/proto_arg_map_context.cc @@ -0,0 +1,73 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/dialect/phi/pass/proto_arg_map_context.h" + +namespace infrt { + +bool ProtoArgumentMappingContext::HasInput(const std::string& name) const { + if (input_map_.find(name) == input_map_.end()) { + return false; + } + uint8_t index = input_map_.at(name); + return static_cast(op_->getOperand(index)); +} + +bool ProtoArgumentMappingContext::HasOutput(const std::string& name) const { + if (output_map_.find(name) == output_map_.end()) { + return false; + } + return true; +} + +bool ProtoArgumentMappingContext::HasAttr(const std::string& name) const { + return op_->hasAttr(name); +} + +paddle::any ProtoArgumentMappingContext::Attr(const std::string& name) const { + mlir::Attribute attrs = op_->getAttr(name); + if (mlir::StringAttr str_attr = attrs.dyn_cast_or_null()) { + return paddle::any(str_attr.str()); + } else { + // ToDO: implementation in the ext PR. 
+ return paddle::any(0); + } +} + +size_t ProtoArgumentMappingContext::InputSize(const std::string& name) const { + return op_->getNumOperands(); +} +size_t ProtoArgumentMappingContext::OutputSize(const std::string& name) const { + return op_->getNumResults(); +} + +bool ProtoArgumentMappingContext::IsDenseTensorInput( + const std::string& name) const { + return true; +} +bool ProtoArgumentMappingContext::IsSelectedRowsInput( + const std::string& name) const { + return false; +} + +bool ProtoArgumentMappingContext::IsDenseTensorOutput( + const std::string& name) const { + return true; +} +bool ProtoArgumentMappingContext::IsSelectedRowsOutput( + const std::string& name) const { + return false; +} + +} // namespace infrt diff --git a/paddle/infrt/dialect/phi/pass/proto_arg_map_context.h b/paddle/infrt/dialect/phi/pass/proto_arg_map_context.h new file mode 100644 index 0000000000000000000000000000000000000000..843b19d217feb332a278c80378aaeb856442de9a --- /dev/null +++ b/paddle/infrt/dialect/phi/pass/proto_arg_map_context.h @@ -0,0 +1,55 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include "paddle/infrt/dialect/pd_ops_info.h" +#include "paddle/phi/core/compat/arg_map_context.h" + +namespace infrt { +class ProtoArgumentMappingContext : public phi::ArgumentMappingContext { + public: + // only support op in pd dialect + explicit ProtoArgumentMappingContext(mlir::Operation* op) + : op_(op), + input_map_(pd_dialect_inputs_info_map_.at( + op->getName().getIdentifier().str().substr(3))), + output_map_(pd_dialect_outputs_info_map_.at( + op->getName().getIdentifier().str().substr(3))) {} + bool HasInput(const std::string& name) const override; + bool HasOutput(const std::string& name) const override; + bool HasAttr(const std::string& name) const override; + + // now we can't use Attribute here, it will cause phi relay on + // boost::variant and BlockDesc + paddle::any Attr(const std::string& name) const override; + + size_t InputSize(const std::string& name) const override; + size_t OutputSize(const std::string& name) const override; + + bool IsDenseTensorInput(const std::string& name) const override; + bool IsSelectedRowsInput(const std::string& name) const override; + + bool IsDenseTensorOutput(const std::string& name) const override; + bool IsSelectedRowsOutput(const std::string& name) const override; + + private: + mlir::Operation* op_; + const std::unordered_map& input_map_; + const std::unordered_map& output_map_; +}; + +} // namespace infrt diff --git a/paddle/infrt/dialect/pten/pten_base.cc b/paddle/infrt/dialect/phi/phi_base.cc similarity index 75% rename from paddle/infrt/dialect/pten/pten_base.cc rename to paddle/infrt/dialect/phi/phi_base.cc index ba87787dd7f7caa73a1387c687a96c44c52d26d0..a1caa40f6383b5016a9e237733a0b3ef016cbc97 100644 --- a/paddle/infrt/dialect/pten/pten_base.cc +++ b/paddle/infrt/dialect/phi/phi_base.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing 
permissions and // limitations under the License. -#include "paddle/infrt/dialect/pten/pten_base.h" +#include "paddle/infrt/dialect/phi/phi_base.h" #include #include @@ -21,14 +21,14 @@ #include #include #include "paddle/infrt/common/global.h" -#include "paddle/infrt/dialect/pten/infrt_pten_base.cpp.inc" -#include "paddle/infrt/dialect/pten/infrt_pten_baseDialect.cpp.inc" +#include "paddle/infrt/dialect/phi/infrt_phi_base.cpp.inc" +#include "paddle/infrt/dialect/phi/infrt_phi_baseDialect.cpp.inc" namespace infrt { -namespace pten { +namespace phi { -void PTENDialect::printType(::mlir::Type type, - mlir::DialectAsmPrinter& os) const { +void PHIDialect::printType(::mlir::Type type, + mlir::DialectAsmPrinter& os) const { if (type.isa()) { os << "CPU_Allocator"; return; @@ -48,18 +48,18 @@ void PTENDialect::printType(::mlir::Type type, llvm_unreachable("unexpected 'allocator/context' type kind"); } -void PTENDialect::initialize() { +void PHIDialect::initialize() { addOperations< #define GET_OP_LIST -#include "paddle/infrt/dialect/pten/infrt_pten_base.cpp.inc" // NOLINT +#include "paddle/infrt/dialect/phi/infrt_phi_base.cpp.inc" // NOLINT >(); addTypes< #define GET_TYPEDEF_LIST -#include "paddle/infrt/dialect/pten/infrt_pten_baseTypes.cpp.inc" // NOLINT +#include "paddle/infrt/dialect/phi/infrt_phi_baseTypes.cpp.inc" // NOLINT >(); } -mlir::Type PTENDialect::parseType(mlir::DialectAsmParser& parser) const { +mlir::Type PHIDialect::parseType(mlir::DialectAsmParser& parser) const { llvm::StringRef keyword; if (parser.parseKeyword(&keyword)) return mlir::Type(); if (keyword == "CPU_allocator") { @@ -77,8 +77,8 @@ mlir::Type PTENDialect::parseType(mlir::DialectAsmParser& parser) const { return mlir::Type(); } -} // namespace pten +} // namespace phi } // namespace infrt #define GET_TYPEDEF_CLASSES -#include "paddle/infrt/dialect/pten/infrt_pten_baseTypes.cpp.inc" // NOLINT +#include "paddle/infrt/dialect/phi/infrt_phi_baseTypes.cpp.inc" // NOLINT diff --git a/paddle/infrt/dialect/phi/phi_base.h b/paddle/infrt/dialect/phi/phi_base.h new file mode 100644 index 0000000000000000000000000000000000000000..11174290f92bd18fdc91588d7eba89f61bb05413 --- /dev/null +++ b/paddle/infrt/dialect/phi/phi_base.h @@ -0,0 +1,44 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include +#include +#include + +#include + +#include "paddle/infrt/dialect/phi/infrt_phi_base.h.inc" +#include "paddle/infrt/dialect/phi/infrt_phi_baseDialect.h.inc" + +#define GET_TYPEDEF_CLASSES +#include "paddle/infrt/dialect/phi/infrt_phi_baseTypes.h.inc" + +namespace mlir { +namespace OpTrait { + +template +class PhiOpTrait : public OpTrait::TraitBase { + public: + static LogicalResult verifyTrait(Operation *op) { + return LogicalResult::success(); + } +}; + +} // namespace OpTrait +} // namespace mlir + +namespace infrt { +namespace phi {} // namespace phi +} // namespace infrt diff --git a/paddle/infrt/dialect/phi/phi_exec.cc b/paddle/infrt/dialect/phi/phi_exec.cc new file mode 100644 index 0000000000000000000000000000000000000000..4e99661a6a20590e7d36c1cf3a0e1e5d334b2464 --- /dev/null +++ b/paddle/infrt/dialect/phi/phi_exec.cc @@ -0,0 +1,47 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include +#include +#include +#include +#include "paddle/infrt/common/global.h" +#include "paddle/infrt/dialect/mlir_loader.h" +#include "paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.h" + +int main(int argc, char** argv) { + static llvm::cl::opt input_file( + llvm::cl::Positional, + llvm::cl::desc("Specify input filename"), + llvm::cl::init("-")); + + llvm::cl::ParseCommandLineOptions(argc, argv); + + mlir::MLIRContext* context = infrt::Global::getMLIRContext(); + auto module = infrt::dialect::LoadMlirFile(input_file.c_str(), context); + + module->dump(); + mlir::PassManager pm(context); + + mlir::OpPassManager& phi_pass_manager = pm.nest(); + std::vector valid_places = {{infrt::TargetType::CPU, + infrt::PrecisionType::FLOAT32, + infrt::LayoutType::NCHW}}; + phi_pass_manager.addPass(std::make_unique(valid_places)); + if (mlir::failed(pm.run(*module))) { + std::cout << "\npass failed!\n" << std::endl; + return 4; + } + module->dump(); + return 0; +} diff --git a/paddle/infrt/dialect/pten/CMakeLists.txt b/paddle/infrt/dialect/pten/CMakeLists.txt deleted file mode 100644 index b4ed5cdc1d82fd4a32f8594dc41b6e32c3e52459..0000000000000000000000000000000000000000 --- a/paddle/infrt/dialect/pten/CMakeLists.txt +++ /dev/null @@ -1,13 +0,0 @@ -if (NOT INFRT_WITH_PTEN) - return() -endif() - -#mlir_tablegen_on(infrt_pten_base DIALECT pten) -add_mlir_dialect(infrt_pten_base pten) -add_mlir_dialect(infrt_pten_tensor pten_dt) -add_mlir_dialect(infrt_pten_kernel pten_kernel) -#mlir_tablegen_on(infrt_pten_tensor) - -gather_srcs(infrt_src SRCS - pten_base.cc infrt_pten_tensor.cc - infrt_pten_tensor.cc) diff --git a/paddle/infrt/dialect/pten/infrt_pten_kernel.td b/paddle/infrt/dialect/pten/infrt_pten_kernel.td deleted file mode 100644 index a3a1609d9918aea754666b8ec0bcc467fad4d756..0000000000000000000000000000000000000000 --- a/paddle/infrt/dialect/pten/infrt_pten_kernel.td +++ /dev/null @@ -1,26 +0,0 @@ -#ifndef PTEN_KERNEL -#define PTEN_KERNEL - -include "paddle/infrt/dialect/pten/infrt_pten_tensor.td" - -def 
PTEN_KernelDialect : Dialect { - let name = "pten_kernel"; - - let description = [{ - The PTEN Kernel dialect. - }]; - - let cppNamespace = "::infrt::pten"; -} - -// PTEN Kernel related ops. -class PDT_Kernel traits = []> : Op { -} - -def FakeKernelOp : PDT_Kernel<"pten.matmul.host.fp32"> { - let arguments = (ins CPU_Context:$dev_ctx, TensorType:$x, TensorType:$y, BoolAttr:$transpose_x, BoolAttr:$transpose_y); - let results = (outs TensorType:$output); -} - -#endif - diff --git a/paddle/infrt/dialect/test_kernels.cc b/paddle/infrt/dialect/test_kernels.cc index c4588d7cf8bab748832865fc3aaab1913f33d11b..f0c4723b49a7906cf5327771e26eb87e8b1248c0 100644 --- a/paddle/infrt/dialect/test_kernels.cc +++ b/paddle/infrt/dialect/test_kernels.cc @@ -147,7 +147,7 @@ static mlir::LogicalResult verify(BenchmarkOp op) { // Verify that the target benchmark region has exactly one return value. auto &region = op.region(); auto &last_op = region.front().back(); - if (last_op.getName().getStringRef() != "infrt.return") { + if (last_op.getName().getStringRef() != "Infrt.return") { return op.emitOpError("missing return statement"); } if (last_op.getNumOperands() != 1) { diff --git a/paddle/infrt/dialect/test_kernels.td b/paddle/infrt/dialect/test_kernels.td index 6aa12f252d0144c814e70e57c336a64df47de95b..6e4bc26aa1496dcb4caed83f98fc42dab9e3cce0 100644 --- a/paddle/infrt/dialect/test_kernels.td +++ b/paddle/infrt/dialect/test_kernels.td @@ -45,7 +45,7 @@ def BenchmarkOp : Test_Op<"benchmark"> { // The following code benchmarks the infrt.add.i32 kernel. %x = infrt.add.i32 %c, %c // The benchmarked function needs to return exactly one value. - infrt.return %x : i32 + Infrt.return %x : i32 } }]; diff --git a/paddle/infrt/external_kernels/basic.mlir b/paddle/infrt/external_kernels/basic.mlir index 843b12ced21a982b18b5a63f7bbef1d4d24eea16..1a7ea854c9ce469ee5719743287b4ee1b5de9286 100644 --- a/paddle/infrt/external_kernels/basic.mlir +++ b/paddle/infrt/external_kernels/basic.mlir @@ -1,7 +1,7 @@ // CHECK: basic func @basic() -> f32 { - %v0 = infrt.constant.f32 1.0 - %v1 = infrt.constant.f32 2.0 + %v0 = Infrt.constant.f32 1.0 + %v1 = Infrt.constant.f32 2.0 %v2 = "external.add.f32"(%v0, %v1) : (f32, f32) -> f32 // CHECK: 1 @@ -17,5 +17,5 @@ func @basic() -> f32 { // CHECK: 6 "external.print.f32"(%v3) : (f32) -> () - infrt.return %v3 : f32 + Infrt.return %v3 : f32 } diff --git a/paddle/infrt/external_kernels/fc.mlir b/paddle/infrt/external_kernels/fc.mlir index bdac9ded2ef65dd4a09830b69838cb67863f1823..b0cabddc3ebc4a9ede73d506ac58acaa140f03d5 100644 --- a/paddle/infrt/external_kernels/fc.mlir +++ b/paddle/infrt/external_kernels/fc.mlir @@ -1,43 +1,43 @@ // CHECK-LABEL: @fc -func @fc(%input : !infrt.tensor, - %w : !infrt.tensor, - %bias : !infrt.tensor) -> !infrt.tensor +func @fc(%input : !Infrt.tensor, + %w : !Infrt.tensor, + %bias : !Infrt.tensor) -> !Infrt.tensor { - %out = dt.create_uninit_tensor.f32 [30, 50] -> !infrt.tensor - // dt.fill_tensor_with_constant.f32 (%out : !infrt.tensor) {value=0.0:f32} + %out = dt.create_uninit_tensor.f32 [30, 50] -> !Infrt.tensor + // dt.fill_tensor_with_constant.f32 (%out : !Infrt.tensor) {value=0.0:f32} // fc1 - "external.matmul"(%input, %w, %out) {}: (!infrt.tensor, !infrt.tensor, !infrt.tensor) -> () - "external.elementwise_add"(%out, %bias, %out) {axis = -1}: (!infrt.tensor, !infrt.tensor, !infrt.tensor) -> () - "external.sigmoid"(%out, %out) {}: (!infrt.tensor, !infrt.tensor) -> () + "external.matmul"(%input, %w, %out) {}: (!Infrt.tensor, !Infrt.tensor, !Infrt.tensor) -> () +
"external.elementwise_add"(%out, %bias, %out) {axis = -1}: (!Infrt.tensor, !Infrt.tensor, !Infrt.tensor) -> () + "external.sigmoid"(%out, %out) {}: (!Infrt.tensor, !Infrt.tensor) -> () // fc2 - "external.matmul"(%out, %w, %out) {}: (!infrt.tensor, !infrt.tensor, !infrt.tensor) -> () - "external.elementwise_add"(%out, %bias, %out) {axis = -1}: (!infrt.tensor, !infrt.tensor, !infrt.tensor) -> () - "external.sigmoid"(%out, %out) {}: (!infrt.tensor, !infrt.tensor) -> () + "external.matmul"(%out, %w, %out) {}: (!Infrt.tensor, !Infrt.tensor, !Infrt.tensor) -> () + "external.elementwise_add"(%out, %bias, %out) {axis = -1}: (!Infrt.tensor, !Infrt.tensor, !Infrt.tensor) -> () + "external.sigmoid"(%out, %out) {}: (!Infrt.tensor, !Infrt.tensor) -> () - infrt.return %out : !infrt.tensor + Infrt.return %out : !Infrt.tensor } // CHECK-LABEL: @benchmark func @benchmark() { - %input = dt.create_uninit_tensor.f32 [30, 50] -> !infrt.tensor - dt.fill_tensor_with_constant.f32 (%input : !infrt.tensor) {value=1.0:f32} + %input = dt.create_uninit_tensor.f32 [30, 50] -> !Infrt.tensor + dt.fill_tensor_with_constant.f32 (%input : !Infrt.tensor) {value=1.0:f32} - %w = dt.create_uninit_tensor.f32 [50, 50] -> !infrt.tensor - dt.fill_tensor_with_constant.f32 (%w : !infrt.tensor) {value=2.0:f32} + %w = dt.create_uninit_tensor.f32 [50, 50] -> !Infrt.tensor + dt.fill_tensor_with_constant.f32 (%w : !Infrt.tensor) {value=2.0:f32} - %bias = dt.create_uninit_tensor.f32 [30, 50] -> !infrt.tensor - dt.fill_tensor_with_constant.f32 (%bias : !infrt.tensor) {value=3.0:f32} + %bias = dt.create_uninit_tensor.f32 [30, 50] -> !Infrt.tensor + dt.fill_tensor_with_constant.f32 (%bias : !Infrt.tensor) {value=3.0:f32} - infrt.benchmark "add.f32"( - %input:!infrt.tensor, - %w:!infrt.tensor, - %bias:!infrt.tensor) + Infrt.benchmark "add.f32"( + %input:!Infrt.tensor, + %w:!Infrt.tensor, + %bias:!Infrt.tensor) duration_secs = 100, max_count = 300000, num_warmup_runs = 3 { - %res = infrt.call @fc(%input, %w, %bias) : (!infrt.tensor, !infrt.tensor, !infrt.tensor) -> (!infrt.tensor) - infrt.return %res : !infrt.tensor + %res = Infrt.call @fc(%input, %w, %bias) : (!Infrt.tensor, !Infrt.tensor, !Infrt.tensor) -> (!Infrt.tensor) + Infrt.return %res : !Infrt.tensor } - infrt.return + Infrt.return } diff --git a/paddle/infrt/external_kernels/paddle.mlir b/paddle/infrt/external_kernels/paddle.mlir index e7b8e9efba838bded2fe86d901422fca7005e507..d55d9904b5bc4e43388abacf9e4b62bf06db458b 100644 --- a/paddle/infrt/external_kernels/paddle.mlir +++ b/paddle/infrt/external_kernels/paddle.mlir @@ -1,50 +1,50 @@ // CHECK: paddle_func func @paddle_func() -> () { - %input = dt.create_uninit_tensor.f32 [3, 5] -> !infrt.tensor - dt.fill_tensor_with_constant.f32 (%input : !infrt.tensor) {value=1.0:f32} + %input = dt.create_uninit_tensor.f32 [3, 5] -> !Infrt.tensor + dt.fill_tensor_with_constant.f32 (%input : !Infrt.tensor) {value=1.0:f32} - %w = dt.create_uninit_tensor.f32 [5, 4] -> !infrt.tensor - dt.fill_tensor_with_constant.f32 (%w : !infrt.tensor) {value=2.0:f32} + %w = dt.create_uninit_tensor.f32 [5, 4] -> !Infrt.tensor + dt.fill_tensor_with_constant.f32 (%w : !Infrt.tensor) {value=2.0:f32} - %bias = dt.create_uninit_tensor.f32 [4] -> !infrt.tensor - dt.fill_tensor_with_constant.f32 (%bias : !infrt.tensor) {value=3.0:f32} + %bias = dt.create_uninit_tensor.f32 [4] -> !Infrt.tensor + dt.fill_tensor_with_constant.f32 (%bias : !Infrt.tensor) {value=3.0:f32} - %out = dt.create_uninit_tensor.f32 [3, 4] -> !infrt.tensor - dt.fill_tensor_with_constant.f32 (%out : 
!infrt.tensor) {value=0.0:f32} + %out = dt.create_uninit_tensor.f32 [3, 4] -> !Infrt.tensor + dt.fill_tensor_with_constant.f32 (%out : !Infrt.tensor) {value=0.0:f32} - "external.fc2"(%input, %w, %bias, %out) {in_num_col_dims=3:i32, test_attr=5:i32}: (!infrt.tensor, !infrt.tensor, !infrt.tensor, !infrt.tensor) -> () + "external.fc2"(%input, %w, %bias, %out) {in_num_col_dims=3:i32, test_attr=5:i32}: (!Infrt.tensor, !Infrt.tensor, !Infrt.tensor, !Infrt.tensor) -> () // CHECK-LABEL: tensor: shape=shape[3,5], values=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] - dt.print_tensor (%input : !infrt.tensor) + dt.print_tensor (%input : !Infrt.tensor) // CHECK-LABEL: tensor: shape=shape[5,4], values=[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2] - dt.print_tensor (%w : !infrt.tensor) - dt.print_tensor (%bias : !infrt.tensor) - dt.print_tensor (%out : !infrt.tensor) + dt.print_tensor (%w : !Infrt.tensor) + dt.print_tensor (%bias : !Infrt.tensor) + dt.print_tensor (%out : !Infrt.tensor) // test external.matmul - %out1 = dt.create_uninit_tensor.f32 [3, 4] -> !infrt.tensor - dt.fill_tensor_with_constant.f32 (%out1 : !infrt.tensor) {value=0.0:f32} - "external.matmul"(%input, %w, %out1) {}: (!infrt.tensor, !infrt.tensor, !infrt.tensor) -> () - dt.print_tensor (%out1 : !infrt.tensor) + %out1 = dt.create_uninit_tensor.f32 [3, 4] -> !Infrt.tensor + dt.fill_tensor_with_constant.f32 (%out1 : !Infrt.tensor) {value=0.0:f32} + "external.matmul"(%input, %w, %out1) {}: (!Infrt.tensor, !Infrt.tensor, !Infrt.tensor) -> () + dt.print_tensor (%out1 : !Infrt.tensor) // test external.elementwise_add - %out2 = dt.create_uninit_tensor.f32 [3, 4] -> !infrt.tensor - dt.fill_tensor_with_constant.f32 (%out2 : !infrt.tensor) {value=0.0:f32} - %bias1 = dt.create_uninit_tensor.f32 [3, 4] -> !infrt.tensor - dt.fill_tensor_with_constant.f32 (%bias1 : !infrt.tensor) {value=3.0:f32} - "external.elementwise_add"(%out1, %bias1, %out2) {axis=-1}: (!infrt.tensor, !infrt.tensor, !infrt.tensor) -> () - dt.print_tensor (%out2 : !infrt.tensor) + %out2 = dt.create_uninit_tensor.f32 [3, 4] -> !Infrt.tensor + dt.fill_tensor_with_constant.f32 (%out2 : !Infrt.tensor) {value=0.0:f32} + %bias1 = dt.create_uninit_tensor.f32 [3, 4] -> !Infrt.tensor + dt.fill_tensor_with_constant.f32 (%bias1 : !Infrt.tensor) {value=3.0:f32} + "external.elementwise_add"(%out1, %bias1, %out2) {axis=-1}: (!Infrt.tensor, !Infrt.tensor, !Infrt.tensor) -> () + dt.print_tensor (%out2 : !Infrt.tensor) // test external.relu - %out3 = dt.create_uninit_tensor.f32 [3, 4] -> !infrt.tensor - dt.fill_tensor_with_constant.f32 (%out3 : !infrt.tensor) {value=0.0:f32} - "external.relu"(%out1, %out3) {}: (!infrt.tensor, !infrt.tensor) -> () - dt.print_tensor (%out3 : !infrt.tensor) + %out3 = dt.create_uninit_tensor.f32 [3, 4] -> !Infrt.tensor + dt.fill_tensor_with_constant.f32 (%out3 : !Infrt.tensor) {value=0.0:f32} + "external.relu"(%out1, %out3) {}: (!Infrt.tensor, !Infrt.tensor) -> () + dt.print_tensor (%out3 : !Infrt.tensor) // test external.sigmoid - %out4 = dt.create_uninit_tensor.f32 [3, 4] -> !infrt.tensor - dt.fill_tensor_with_constant.f32 (%out4 : !infrt.tensor) {value=0.0:f32} - "external.sigmoid"(%out1, %out4) {}: (!infrt.tensor, !infrt.tensor) -> () - dt.print_tensor (%out4 : !infrt.tensor) + %out4 = dt.create_uninit_tensor.f32 [3, 4] -> !Infrt.tensor + dt.fill_tensor_with_constant.f32 (%out4 : !Infrt.tensor) {value=0.0:f32} + "external.sigmoid"(%out1, %out4) {}: (!Infrt.tensor, !Infrt.tensor) -> () + dt.print_tensor (%out4 : !Infrt.tensor) - infrt.return + 
Infrt.return } diff --git a/paddle/infrt/host_context/kernel_frame.cc b/paddle/infrt/host_context/kernel_frame.cc index 1acb35e898308a96fa53bc39c484f93887d70668..14e88be4b96bb58df87db3191db8bae444c4cc3d 100644 --- a/paddle/infrt/host_context/kernel_frame.cc +++ b/paddle/infrt/host_context/kernel_frame.cc @@ -15,6 +15,7 @@ #include "paddle/infrt/host_context/kernel_frame.h" #include +#include namespace infrt { namespace host_context { @@ -25,5 +26,36 @@ std::ostream& operator<<(std::ostream& os, const KernelFrame& frame) { return os; } +#ifndef NDEBUG +std::string KernelFrame::DumpArgTypes() const { + std::stringstream ss; + for (auto* value : GetValues(0, GetNumElements())) { + if (value->is_type()) { + ss << "bool (" << &value->get() << "), "; + } else if (value->is_type()) { + ss << "DenseHostTensor(" << &value->get() + << "), "; + } else if (value->is_type()) { + ss << "float(" << &value->get() << "), "; + } else if (value->is_type()) { + ss << "int(" << &value->get() << "), "; + } else if (value->is_type()) { + ss << "phi::DenseTensor(" << &value->get() << "), "; + } else if (value->is_type()) { + ss << "phi::MetaTensor(" << &value->get() << "), "; + } else if (value->is_type<::phi::CPUContext>()) { + ss << "phi::CPUContext(" << &value->get<::phi::CPUContext>() << "), "; + } else if (value->is_type()) { + ss << "none(" << &value->get() << "), "; + } else if (value->is_type()) { + ss << "CpuPhiContext(" << &value->get() << "), "; + } else { + ss << "typeid: " << value->index() << ", "; + } + } + return ss.str(); +} +#endif + } // namespace host_context } // namespace infrt diff --git a/paddle/infrt/host_context/kernel_frame.h b/paddle/infrt/host_context/kernel_frame.h index 35527872e624f74209e470de24653faa7bd778c3..90887edb991660083e9a6649658d40e96f1642af 100644 --- a/paddle/infrt/host_context/kernel_frame.h +++ b/paddle/infrt/host_context/kernel_frame.h @@ -31,20 +31,24 @@ namespace host_context { class KernelFrame { public: int GetNumArgs() const { return num_arguments_; } - int GetNumResults() const { return num_results_ == -1 ? 0 : num_results_; } - int GetNumAttributes() const { - return value_or_attrs_.size() - num_arguments_ - - (num_results_ == -1 ? 0 : num_results_); + int GetNumResults() const { + return value_or_attrs_.size() - num_arguments_ - GetNumAttributes(); } + int GetNumAttributes() const { return num_attrs_ == -1 ? 0 : num_attrs_; } //! Get something at a specific position \p index. The element might be an //! argument, an attribute or a result. template T& GetElementAt(int index) { - CHECK_LT(index, GetNumArgs() + GetNumAttributes() + GetNumResults()); + CHECK_LT(static_cast(index), GetNumElements()); return value_or_attrs_[index]->template get_or_default(); } + Value* GetElementAt(int index) { + CHECK_LT(static_cast(index), GetNumElements()); + return value_or_attrs_[index]; + } + // Get number of elements, either input, attributes or results. 
size_t GetNumElements() const { return value_or_attrs_.size(); } @@ -70,18 +74,21 @@ class KernelFrame { } Value* GetAttributeAt(int idx) { - CHECK_NE(num_results_, -1) - << "Must call SetNumResults before GetAttributeAt"; - CHECK_LT(idx, - static_cast(value_or_attrs_.size() - num_arguments_ - - num_results_)); - return value_or_attrs_[num_arguments_ + num_results_ + idx]; + // CHECK_NE(num_results_, -1) + //<< "Must call SetNumResults before GetAttributeAt"; + CHECK_LT(idx, GetNumAttributes()); + return value_or_attrs_[num_arguments_ + idx]; } void AddAttribute(Value* v) { - CHECK_NE(num_results_, -1) - << "Must call SetNumResults before calling AddAttribute"; + CHECK_LE(num_results_, 0) + << "Must call SetNumResults after calling AddAttribute"; value_or_attrs_.emplace_back(v); + if (num_attrs_ == -1) num_attrs_ = 0; + num_attrs_++; + + CHECK_EQ(value_or_attrs_.size(), + static_cast(num_arguments_ + num_attrs_)); } template @@ -96,35 +103,43 @@ class KernelFrame { template void SetResultAt(int index, T&& value) { - CHECK_LT(index, num_results_) << "Invalid result index"; - CHECK(value_or_attrs_[num_arguments_ + index]); - value_or_attrs_[num_arguments_ + index]->set(std::move(value)); + CHECK_LT(index, GetNumResults()) << "Invalid result index"; + CHECK(value_or_attrs_[num_arguments_ + GetNumAttributes() + index]); + value_or_attrs_[num_arguments_ + GetNumAttributes() + index]->set( + std::move(value)); } llvm::ArrayRef GetResults() const { - return GetValues(num_arguments_, num_results_); + CHECK_GE(num_results_, 0) << "Invalid results num"; + return GetValues(num_arguments_ + GetNumAttributes(), num_results_); } llvm::MutableArrayRef GetResults() { - return GetMutableValues(num_arguments_, num_results_); + CHECK_GE(num_results_, 0) << "Invalid results num"; + return GetMutableValues(num_arguments_ + GetNumAttributes(), num_results_); } llvm::ArrayRef GetValues(size_t from, size_t length) const { - CHECK_LE(static_cast(from + length), num_arguments_ + num_results_); + CHECK_LE(from + length, GetNumElements()); if (length == 0) return {}; return llvm::makeArrayRef(&value_or_attrs_[from], length); } llvm::MutableArrayRef GetMutableValues(size_t from, size_t length) { - CHECK_LE(static_cast(from + length), num_arguments_ + num_results_); + CHECK_LE(from + length, GetNumElements()); if (length == 0) return {}; return llvm::makeMutableArrayRef(&value_or_attrs_[from], length); } +#ifndef NDEBUG + std::string DumpArgTypes() const; +#endif + bool IsEmpty() const { return value_or_attrs_.empty(); } protected: int num_arguments_{}; + int num_attrs_{-1}; int num_results_{-1}; llvm::SmallVector value_or_attrs_; @@ -136,15 +151,15 @@ class KernelFrameBuilder : public KernelFrame { public: void AddArgument(Value* value) { CHECK(value); - CHECK_EQ(num_results_, -1) - << "Should call AddArgument before calling SetNumResults"; + CHECK_EQ(num_attrs_, -1) + << "Should call AddArgument before calling SetAttributes"; value_or_attrs_.push_back(value); ++num_arguments_; } void SetResults(llvm::ArrayRef values) { - CHECK_EQ(num_arguments_, static_cast(value_or_attrs_.size())); - CHECK_EQ(num_results_, -1); + CHECK_EQ(num_arguments_ + GetNumAttributes(), + static_cast(value_or_attrs_.size())); for (Value* x : values) { value_or_attrs_.push_back(x); } @@ -152,28 +167,30 @@ class KernelFrameBuilder : public KernelFrame { } void SetNumResults(size_t n) { - CHECK_EQ(num_arguments_, static_cast(value_or_attrs_.size())); - CHECK_EQ(num_results_, -1); - num_results_ = n; + CHECK_EQ(num_arguments_ + GetNumAttributes(), 
+ static_cast(value_or_attrs_.size())); for (size_t i = 0; i < n; i++) { value_or_attrs_.emplace_back(new Value); } + num_results_ = n; } void SetResultAt(int result_id, Value* value) { CHECK_EQ(static_cast(value_or_attrs_.size()), - num_arguments_ + num_results_) + num_arguments_ + GetNumAttributes() + num_results_) << "Call SetNumResults first"; - CHECK_LT(result_id + num_arguments_, + CHECK_LT(result_id + num_arguments_ + GetNumAttributes(), static_cast(value_or_attrs_.size())); CHECK(value); - value_or_attrs_[num_arguments_ + result_id]->set(value); + value_or_attrs_[num_arguments_ + GetNumAttributes() + result_id]->set( + value); } void Reset() { value_or_attrs_.clear(); num_arguments_ = 0; num_results_ = -1; + num_attrs_ = -1; } }; diff --git a/paddle/infrt/host_context/kernel_utils.h b/paddle/infrt/host_context/kernel_utils.h index 31d411006d2378eb77d254c76baf25809c79bb42..2f630dcc213cb6f46b7e48c5210124c3324a874a 100644 --- a/paddle/infrt/host_context/kernel_utils.h +++ b/paddle/infrt/host_context/kernel_utils.h @@ -209,9 +209,11 @@ struct KernelImpl { static void Invoke(KernelFrame* frame, const PreviousArgs&... pargs) { static_assert(out_idx != -1, "Do not place Results after RemainingResults"); - static_assert(const_idx == 0, - "Arguments and results should appear before attributes"); - Result arg(&frame->GetResults()[out_idx]); + // static_assert(const_idx == 0, + // "Arguments and results should appear before attributes"); + + // Result arg(&frame->GetResults()[out_idx]); + Result arg(new ValueRef()); KernelCallHelper< Tail...>::template Invoke(frame, pargs..., @@ -224,8 +226,8 @@ struct KernelImpl { struct KernelCallHelper, Tail...> { template static void Invoke(KernelFrame* frame, const PreviousArgs&... pargs) { - static_assert(const_idx != -1, - "Do not place Attributes after RemainingAttributes"); + // static_assert(const_idx != -1, + // "Do not place Attributes after RemainingAttributes"); Attribute arg(frame->GetAttributeAt(const_idx)); KernelCallHelper< Tail...>::template Invoke(frame, @@ -242,8 +244,8 @@ struct KernelImpl { static_assert(in_idx != -1, "Do not place Arguments after RemainingArguments"); static_assert(out_idx == 0, "Arguments should appear before results"); - static_assert(const_idx == 0, - "Arguments and results should appear before attributes."); + // static_assert(const_idx == 0, + // "Arguments and results should appear before attributes."); auto* arg = &frame->template GetElementAt(in_idx); KernelCallHelper< Tail...>::template Invoke(frame, @@ -265,7 +267,7 @@ struct KernelImpl { static_assert(const_idx == 0, "Arguments and results should appear before attributes."); - auto* value = frame->GetArgAt(in_idx); + auto* value = frame->GetElementAt(in_idx); auto&& arg = value->get(); KernelCallHelper< diff --git a/paddle/infrt/host_context/kernel_utils_test.cc b/paddle/infrt/host_context/kernel_utils_test.cc index bebd8d86e50bbd6a2d80325f9fbd8254718c8d0a..71d8904eb798fbe638ea5a5e1af3824db94c4357 100644 --- a/paddle/infrt/host_context/kernel_utils_test.cc +++ b/paddle/infrt/host_context/kernel_utils_test.cc @@ -67,5 +67,45 @@ TEST(KernelImpl, pair) { ASSERT_EQ(results[1]->get(), 3.f); } +void TestFunc(const std::string& arg_0, + const std::string& arg_1, + const std::string& arg_2, + Attribute attr_0, + Result res_0, + Result res_1) { + CHECK_EQ(arg_0, "arg_0"); + CHECK_EQ(arg_1, "arg_1"); + CHECK_EQ(arg_2, "arg_2"); + CHECK_EQ(attr_0.get(), "attr_0"); + + // res_0.Set(Argument(ValueRef(new Value()))); + // res_1.Set(Argument(ValueRef(new Value()))); +} + 
+TEST(KernelRegistry, basic) { + KernelFrameBuilder kernel_frame; + + Value arg_0(std::string{"arg_0"}); + Value arg_1(std::string{"arg_1"}); + Value arg_2(std::string{"arg_2"}); + Value attr_0(std::string{"attr_0"}); + + kernel_frame.AddArgument(&arg_0); + kernel_frame.AddArgument(&arg_1); + kernel_frame.AddArgument(&arg_2); + kernel_frame.AddAttribute(&attr_0); + kernel_frame.SetNumResults(2); + + CHECK_EQ(kernel_frame.GetNumArgs(), 3); + CHECK_EQ(kernel_frame.GetNumResults(), 2); + CHECK_EQ(kernel_frame.GetNumAttributes(), 1); + CHECK_EQ(kernel_frame.GetNumElements(), 6UL); + + CHECK_EQ(kernel_frame.GetArgAt(2), "arg_2"); + CHECK_EQ(kernel_frame.GetAttributeAt(0)->get(), "attr_0"); + + KernelImpl::Invoke(&kernel_frame); +} + } // namespace host_context } // namespace infrt diff --git a/paddle/infrt/host_context/mlir_exec.cc b/paddle/infrt/host_context/mlir_exec.cc index 62c907bc9159f4b3ee8e03878736fb30106c4616..79717ba2cc034650726f9e88c9dc31f1f1349c66 100644 --- a/paddle/infrt/host_context/mlir_exec.cc +++ b/paddle/infrt/host_context/mlir_exec.cc @@ -28,8 +28,8 @@ #include "paddle/infrt/kernel/tensor_kernels.h" #include "paddle/infrt/kernel/tensor_shape_kernels.h" #include "paddle/infrt/kernel/test_kernels.h" -#ifdef INFRT_WITH_PTEN -#include "paddle/infrt/kernel/pten/registry.h" +#ifdef INFRT_WITH_PHI +#include "paddle/infrt/kernel/phi/registry.h" #endif static llvm::cl::list cl_shared_libs( // NOLINT @@ -56,8 +56,8 @@ int main(int argc, char** argv) { kernel::RegisterTensorShapeKernels(&registry); kernel::RegisterTensorKernels(&registry); kernel::RegisterControlFlowKernels(&registry); -#ifdef INFRT_WITH_PTEN - kernel::RegisterPtenKernels(&registry); +#ifdef INFRT_WITH_PHI + kernel::RegisterPhiKernels(&registry); #endif // load extra shared library diff --git a/paddle/infrt/host_context/mlir_tests/basic.mlir b/paddle/infrt/host_context/mlir_tests/basic.mlir index 263d5884134b143aa8d3403c5cd05672df39636f..1b55b408f2b082c09d06d51037e8c9d967a171f4 100644 --- a/paddle/infrt/host_context/mlir_tests/basic.mlir +++ b/paddle/infrt/host_context/mlir_tests/basic.mlir @@ -1,30 +1,30 @@ // CHECK-LABEL: basic func @basic() -> f32 { - %v0 = infrt.constant.f32 1.0 - %v1 = infrt.constant.f32 2.0 - %v2 = "infrt.add.f32"(%v0, %v1) : (f32, f32) -> f32 + %v0 = Infrt.constant.f32 1.0 + %v1 = Infrt.constant.f32 2.0 + %v2 = "Infrt.add.f32"(%v0, %v1) : (f32, f32) -> f32 // CHECK: 1 - "infrt.print.f32"(%v0) : (f32) -> () + "Infrt.print.f32"(%v0) : (f32) -> () // CHECK: 2 - "infrt.print.f32"(%v1) : (f32) -> () + "Infrt.print.f32"(%v1) : (f32) -> () // CHECK: 3 - "infrt.print.f32"(%v2) : (f32) -> () + "Infrt.print.f32"(%v2) : (f32) -> () - %v3 = "infrt.mul.f32"(%v2, %v1) : (f32, f32) -> f32 + %v3 = "Infrt.mul.f32"(%v2, %v1) : (f32, f32) -> f32 // CHECK: 6 - "infrt.print.f32"(%v3) : (f32) -> () + "Infrt.print.f32"(%v3) : (f32) -> () - infrt.return %v3 : f32 + Infrt.return %v3 : f32 } // CHECK-LABEL: basic1 // Check the mlir executor can work with more than one function in a file.
func @basic1() -> () { - %v0 = infrt.constant.f32 1.0 - "infrt.print.f32"(%v0) : (f32) -> () + %v0 = Infrt.constant.f32 1.0 + "Infrt.print.f32"(%v0) : (f32) -> () // CHECK: 1 - infrt.return + Infrt.return } \ No newline at end of file diff --git a/paddle/infrt/host_context/mlir_tests/dense_tensor.mlir b/paddle/infrt/host_context/mlir_tests/dense_tensor.mlir index 83afa1db8a91c03f1b22c5b6728e398ed361b472..5a973a3eb23e6015ede2d69d83ab8c26de669908 100644 --- a/paddle/infrt/host_context/mlir_tests/dense_tensor.mlir +++ b/paddle/infrt/host_context/mlir_tests/dense_tensor.mlir @@ -1,9 +1,9 @@ // CHECK-LABEL: build_tensor1 func @build_tensor1() { - %a = dt.create_uninit_tensor.f32 [3, 4] -> !infrt.tensor - dt.fill_tensor_with_constant.f32 (%a : !infrt.tensor) {value=1.0:f32} + %a = dt.create_uninit_tensor.f32 [3, 4] -> !Infrt.tensor + dt.fill_tensor_with_constant.f32 (%a : !Infrt.tensor) {value=1.0:f32} // CHECK: tensor: shape=shape[3,4], values=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] - dt.print_tensor (%a : !infrt.tensor) + dt.print_tensor (%a : !Infrt.tensor) - infrt.return + Infrt.return } diff --git a/paddle/infrt/host_context/mlir_tests/shape.mlir b/paddle/infrt/host_context/mlir_tests/shape.mlir index a3130857b0ef7d50821a20cfbc9138aaecc74ff7..22df1c8010d8dbd6a4b8e332e01602b4421ebcdd 100644 --- a/paddle/infrt/host_context/mlir_tests/shape.mlir +++ b/paddle/infrt/host_context/mlir_tests/shape.mlir @@ -3,5 +3,5 @@ func @build_tensor1() { %a = ts.build_shape [1:i64, 57:i64, 92:i64] // CHECK: shape[1,57,92] ts.print_shape %a - infrt.return + Infrt.return } \ No newline at end of file diff --git a/paddle/infrt/host_context/mlir_to_runtime_translate.cc b/paddle/infrt/host_context/mlir_to_runtime_translate.cc index 3dbc7a702be38d986b6f77b345abe85f939370e6..17e6f7cb563d25186f9a76de8fe67af2ddb90e7b 100644 --- a/paddle/infrt/host_context/mlir_to_runtime_translate.cc +++ b/paddle/infrt/host_context/mlir_to_runtime_translate.cc @@ -31,6 +31,7 @@ #include "boost/optional.hpp" #include "paddle/infrt/common/string.h" +#include "paddle/infrt/dialect/dense_tensor.h" #include "paddle/infrt/dialect/mlir_loader.h" #include "paddle/infrt/dialect/tensor_shape.h" #include "paddle/infrt/host_context/core_runtime.h" @@ -74,7 +75,7 @@ struct MlirToRuntimeTranslator::Impl { }; bool MlirToRuntimeTranslator::EmitConstantOp(mlir::Operation* op) { - if (!infrt::Startswith(op->getName().getStringRef().str(), "infrt.constant")) + if (!infrt::Startswith(op->getName().getStringRef().str(), "Infrt.constant")) return false; VLOG(3) << "Emitting constant op [" << op->getName().getStringRef().str() << "]"; @@ -150,6 +151,17 @@ boost::optional MlirToRuntimeTranslator::EmitAttribute( return boost::none; } +template <> +boost::optional MlirToRuntimeTranslator::EmitAttribute( + const mlir::Attribute& attr) { + if (!attr.isa()) return boost::none; + if (attr.isa()) { + auto val = attr.cast(); + return val.getValue(); + } + return boost::none; +} + template <> boost::optional MlirToRuntimeTranslator::EmitAttribute( const mlir::Attribute& attr) { @@ -187,6 +199,7 @@ boost::optional MlirToRuntimeTranslator::EmitAttribute( return res; \ } +PROCESS_ARRAY_INT(bool, 1); PROCESS_ARRAY_INT(int16_t, 16); PROCESS_ARRAY_INT(int32_t, 32); PROCESS_ARRAY_INT(int64_t, 64); @@ -224,7 +237,7 @@ boost::optional> MlirToRuntimeTranslator::EmitAttribute( } static bool IsReturn(mlir::Operation* op) { - return op->getName().getStringRef() == "infrt.return"; + return op->getName().getStringRef() == "Infrt.return"; } bool 
MlirToRuntimeTranslator::EmitGeneralOp(mlir::Operation* op) { @@ -262,25 +275,6 @@ bool MlirToRuntimeTranslator::EmitGeneralOp(mlir::Operation* op) { << GetValue(operand) << " vs " << arg_value; } - // process results - llvm::SmallVector res_values; - for (int i = 0, e = op->getNumResults(); i < e; i++) { - auto res = op->getResult(i); - res_values.push_back(AddValue(res)); - - VLOG(3) << "* op mlir res: " << DumpToString(res) << " " << GetValue(res); - } - impl_->cur_op->SetResults(res_values); - -#ifdef INFRT_DEBUG - { - VLOG(3) << "check result"; - for (int i = 0; i < impl_->cur_op->frame().GetNumResults(); i++) { - VLOG(3) << "+ res value: " << impl_->cur_op->frame().GetResults()[i]; - } - } -#endif - // process attributes auto attrs = op->getAttrs(); @@ -296,6 +290,8 @@ bool MlirToRuntimeTranslator::EmitGeneralOp(mlir::Operation* op) { impl_->cur_op->AppendAttribute(new Value(*v)); } else if (auto v = EmitAttribute(attr.getValue())) { impl_->cur_op->AppendAttribute(new Value(std::move(*v))); + } else if (auto v = EmitAttribute(attr.getValue())) { + impl_->cur_op->AppendAttribute(new Value(*v)); } else if (auto v = EmitAttribute>(attr.getValue())) { impl_->cur_op->AppendAttribute(new Value(std::move(*v))); } else if (auto v = EmitAttribute>(attr.getValue())) { @@ -311,6 +307,33 @@ bool MlirToRuntimeTranslator::EmitGeneralOp(mlir::Operation* op) { } } + // process results + llvm::SmallVector res_values; + for (int i = 0, e = op->getNumResults(); i < e; i++) { + auto res = op->getResult(i); + if (res.getType().isa<::infrt::DenseTensorType>()) { + auto r = impl_->value_map.try_emplace( + res, ValueRef(new Value{::phi::DenseTensor()})); + CHECK(r.second) << "Duplicate add mlir value [" << DumpToString(res) + << "]"; + res_values.push_back(r.first->second.get()); + } else { + res_values.push_back(AddValue(res)); + } + + VLOG(3) << "* op mlir res: " << DumpToString(res) << " " << GetValue(res); + } + impl_->cur_op->SetResults(res_values); + +#ifdef INFRT_DEBUG + { + VLOG(3) << "check result"; + for (int i = 0; i < impl_->cur_op->frame().GetNumResults(); i++) { + VLOG(3) << "+ res value: " << impl_->cur_op->frame().GetResults()[i]; + } + } +#endif + // process regions, we treat regions as attribute. auto num_regions = op->getNumRegions(); if (num_regions > 0) { @@ -345,7 +368,7 @@ bool MlirToRuntimeTranslator::EmitGeneralOp(mlir::Operation* op) { bool MlirToRuntimeTranslator::EmitReturnOp( mlir::Operation* op, llvm::SmallVectorImpl* results) { CHECK(results); - if (op->getName().getStringRef() == "infrt.return") { + if (op->getName().getStringRef() == "Infrt.return") { for (size_t i = 0; i < op->getNumOperands(); i++) { results->push_back(op->getOperand(i)); } @@ -418,7 +441,7 @@ bool MlirToRuntimeTranslator::EmitCallOp(mlir::Operation* op, function_defs_t* function_table) { CHECK(op); CHECK(function_table); - if (op->getName().getStringRef() != "infrt.call") return false; + if (op->getName().getStringRef() != "Infrt.call") return false; impl_->cur_op = impl_->runtime->NewOpExecutable(op->getName().getStringRef().str()); @@ -440,14 +463,6 @@ bool MlirToRuntimeTranslator::EmitCallOp(mlir::Operation* op, impl_->cur_op->AppendArgument(arg_value); } - // process results - llvm::SmallVector res_values; - for (int i = 0, e = op->getNumResults(); i < e; i++) { - auto res = op->getResult(i); - res_values.push_back(AddValue(res)); - } - impl_->cur_op->SetResults(res_values); - // process attribute auto& table = function_table ? 
*function_table : impl_->func_defs; { @@ -460,6 +475,14 @@ bool MlirToRuntimeTranslator::EmitCallOp(mlir::Operation* op, impl_->cur_op->AppendAttribute(new Value(function)); } + // process results + llvm::SmallVector res_values; + for (int i = 0, e = op->getNumResults(); i < e; i++) { + auto res = op->getResult(i); + res_values.push_back(AddValue(res)); + } + impl_->cur_op->SetResults(res_values); + VLOG(3) << "Emit call " << callee_name.getValue().str() << " " << impl_->cur_op->frame(); return true; diff --git a/paddle/infrt/host_context/mlir_to_runtime_translate.h b/paddle/infrt/host_context/mlir_to_runtime_translate.h index fcd79eaf386eed5a6a8eaa31712e344bab56dbd4..0c453651d9e6dc44adaf108ec6a1b0df984fe8be 100644 --- a/paddle/infrt/host_context/mlir_to_runtime_translate.h +++ b/paddle/infrt/host_context/mlir_to_runtime_translate.h @@ -57,7 +57,7 @@ class MlirToRuntimeTranslator { protected: //! Emit a "infrt.constant.*" operation, return true if succeed. bool EmitConstantOp(mlir::Operation* op); - //! Emit a "infrt.return" operation. + //! Emit a "Infrt.return" operation. bool EmitReturnOp(mlir::Operation* op, llvm::SmallVectorImpl* results); //! Emit a "ts.build_shape" operation. diff --git a/paddle/infrt/host_context/mlir_to_runtime_translate_test.cc b/paddle/infrt/host_context/mlir_to_runtime_translate_test.cc index 375daa4515e17fe1618c71d642825d112a3f788f..5824e40abf97a4d63543948d056e815bbeebce3a 100644 --- a/paddle/infrt/host_context/mlir_to_runtime_translate_test.cc +++ b/paddle/infrt/host_context/mlir_to_runtime_translate_test.cc @@ -37,14 +37,14 @@ TEST(MlirToRuntimeTranslate, basic) { auto source = R"ROC( func @main() -> () { - %v0 = infrt.constant.f32 1.0 - %v1 = infrt.constant.f32 2.0 - %v2 = "infrt.add.f32"(%v0, %v1) : (f32, f32) -> f32 - %v3 = "infrt.mul.f32"(%v2, %v1) : (f32, f32) -> f32 + %v0 = Infrt.constant.f32 1.0 + %v1 = Infrt.constant.f32 2.0 + %v2 = "Infrt.add.f32"(%v0, %v1) : (f32, f32) -> f32 + %v3 = "Infrt.mul.f32"(%v2, %v1) : (f32, f32) -> f32 - "infrt.print.f32"(%v1) : (f32) -> () + "Infrt.print.f32"(%v1) : (f32) -> () - infrt.return + Infrt.return } )ROC"; @@ -63,14 +63,14 @@ TEST(TestMlir, basic) { auto source = R"ROC( func @main() -> () { - %v0 = infrt.constant.f32 1.0 - %v1 = infrt.constant.f32 2.0 - %v2 = "infrt.add.f32"(%v0, %v1) : (f32, f32) -> f32 - %v3 = "infrt.mul.f32"(%v2, %v1) : (f32, f32) -> f32 + %v0 = Infrt.constant.f32 1.0 + %v1 = Infrt.constant.f32 2.0 + %v2 = "Infrt.add.f32"(%v0, %v1) : (f32, f32) -> f32 + %v3 = "Infrt.mul.f32"(%v2, %v1) : (f32, f32) -> f32 - "infrt.print.f32"(%v1) : (f32) -> () + "Infrt.print.f32"(%v1) : (f32) -> () - infrt.return + Infrt.return } )ROC"; @@ -88,18 +88,20 @@ TEST(TestMlir, shadow_copy_tensor_profile) { mlir::MLIRContext* context = infrt::Global::getMLIRContext(); auto head = R"ROC( -func @predict(%a: !infrt.tensor, %b: !infrt.tensor) -> (!infrt.tensor, !infrt.tensor) { +func @predict(%a: !infrt.dense_tensor, %b: !infrt.dense_tensor) -> (!infrt.dense_tensor, !infrt.dense_tensor) { )ROC"; auto tpl0 = - "%a{0} = dt.shallow_copy_tensor %a : !infrt.tensor -> " - "!infrt.tensor"; + "%a{0} = dt.shallow_copy_tensor %a : !infrt.dense_tensor -> " + "!infrt.dense_tensor"; auto tpl1 = - "%b{0} = dt.shallow_copy_tensor %b : !infrt.tensor -> " - "!infrt.tensor"; + "%b{0} = dt.shallow_copy_tensor %b : !infrt.dense_tensor -> " + "!infrt.dense_tensor"; auto end = R"ROC( -infrt.return %a0, %b0: !infrt.tensor, !infrt.tensor +Infrt.return %a0, %b0: !infrt.dense_tensor, !infrt.dense_tensor } )ROC"; diff --git 
a/paddle/infrt/host_context/op_executable.cc b/paddle/infrt/host_context/op_executable.cc index cf40d7315c6a58e4c9cca5e2be4fe2a24922d0ac..59a73e71083286b81f2bbdfa20a4ed96a8353a2f 100644 --- a/paddle/infrt/host_context/op_executable.cc +++ b/paddle/infrt/host_context/op_executable.cc @@ -133,7 +133,8 @@ void OpExecutable::Execute() { VLOG(3) << "execute " << name() << " --- frame args: " << impl_->frame.GetNumArgs() << " results " << impl_->frame.GetNumResults() << " attributes " - << impl_->frame.GetNumAttributes(); + << impl_->frame.GetNumAttributes() << "\n" + << frame().DumpArgTypes(); for (int i = 0; i < impl_->frame.GetNumArgs(); i++) { VLOG(3) << "function arg: " << impl_->frame.GetArgAt(i); } diff --git a/paddle/infrt/host_context/value.cc b/paddle/infrt/host_context/value.cc index 222c5dcd6c57550b273bb4d29fa5290c46ec1cf9..3f40490557290fcc34a188882c4d4d251f4ba16e 100644 --- a/paddle/infrt/host_context/value.cc +++ b/paddle/infrt/host_context/value.cc @@ -24,7 +24,7 @@ ValueRef::ValueRef(int64_t val) : Shared(new Value(val)) {} ValueRef::ValueRef(float val) : Shared(new Value(val)) {} ValueRef::ValueRef(double val) : Shared(new Value(val)) {} ValueRef::ValueRef(bool val) : Shared(new Value(val)) {} -ValueRef::ValueRef(backends::CpuPtenContext&& val) +ValueRef::ValueRef(backends::CpuPhiContext&& val) : Shared(new Value(std::move(val))) {} ValueRef::ValueRef(::phi::CPUContext&& val) : Shared(new Value(std::move(val))) {} diff --git a/paddle/infrt/host_context/value.h b/paddle/infrt/host_context/value.h index c39ddf69a90e2735db2081bdf0b49bfa1ec50b2e..eb9a2092657aa079ee6a4007d7ded9f8896e93aa 100644 --- a/paddle/infrt/host_context/value.h +++ b/paddle/infrt/host_context/value.h @@ -29,9 +29,9 @@ #include "paddle/infrt/tensor/tensor_map.h" #include "paddle/infrt/tensor/tensor_shape.h" -#ifdef INFRT_WITH_PTEN -#include "paddle/infrt/backends/host/pten_allocator.h" -#include "paddle/infrt/backends/host/pten_context.h" +#ifdef INFRT_WITH_PHI +#include "paddle/infrt/backends/host/phi_allocator.h" +#include "paddle/infrt/backends/host/phi_context.h" #include "paddle/phi/backends/all_context.h" #include "paddle/phi/common/backend.h" #include "paddle/phi/common/data_type.h" @@ -45,10 +45,13 @@ namespace infrt { namespace host_context { +struct None {}; + struct MlirFunctionExecutable; using ValueVariantType = - Variant, paddle::experimental::ScalarBase, @@ -108,23 +111,25 @@ class Value : public common::Object { explicit Value(tensor::TensorShape&& x) : data(std::move(x)) {} explicit Value(tensor::DenseHostTensor&& x) : data(std::move(x)) {} explicit Value(MlirFunctionExecutable* x) : data(x) {} -#ifdef INFRT_WITH_PTEN - explicit Value(backends::CpuPtenContext&& x) : data(std::move(x)) {} +#ifdef INFRT_WITH_PHI + explicit Value(backends::CpuPhiContext&& x) : data(std::move(x)) {} explicit Value(::phi::CPUContext&& x) : data(std::move(x)) {} explicit Value(::phi::DenseTensor&& x) : data(std::move(x)) {} explicit Value(::phi::MetaTensor&& x) : data(std::move(x)) {} - explicit Value(backends::CpuPtenAllocator&& x) : data(std::move(x)) {} + explicit Value(backends::CpuPhiAllocator&& x) : data(std::move(x)) {} #endif template const T& get() const { - CHECK(data.template is()); + CHECK(data.template is()) << "typeid: " << data.index() + << " != " << ValueVariantType::IndexOf; return data.get(); } template T& get() { - CHECK(data.template is()); + CHECK(data.template is()) << "typeid: " << data.index() + << " != " << ValueVariantType::IndexOf; return data.get(); } @@ -153,6 +158,8 @@ class Value : 
public common::Object { const char* type_info() const override; + ValueVariantType::IndexT index() const { return data.index(); } + friend void CopyTo(const Value& from, Value* to); private: @@ -173,7 +180,7 @@ class ValueRef : common::Shared { explicit ValueRef(double val); explicit ValueRef(bool val); explicit ValueRef(::phi::MetaTensor&& val); - explicit ValueRef(backends::CpuPtenContext&& x); + explicit ValueRef(backends::CpuPhiContext&& x); explicit ValueRef(::phi::CPUContext&& x); explicit ValueRef(::phi::DenseTensor&& x); diff --git a/paddle/infrt/kernel/CMakeLists.txt b/paddle/infrt/kernel/CMakeLists.txt index 402665119ac2dd93214b5b9733352846004c75b3..f1cbfba1c46b33e461a7c9f08cf646625fbafb24 100644 --- a/paddle/infrt/kernel/CMakeLists.txt +++ b/paddle/infrt/kernel/CMakeLists.txt @@ -1,10 +1,10 @@ -add_subdirectory(pten) +add_subdirectory(phi) core_gather_headers() gather_srcs(infrt_src SRCS basic_kernels.cc - # pten_kernels.cc + # phi_kernels.cc test_kernels.cc tensor_shape_kernels.cc tensor_kernels.cc diff --git a/paddle/infrt/kernel/basic_kernels.cc b/paddle/infrt/kernel/basic_kernels.cc index b186cfcfd2b355f97711ecc916e497c2916d4060..23e50a5ddc87427bbf0f49c559f185084e42c8ec 100644 --- a/paddle/infrt/kernel/basic_kernels.cc +++ b/paddle/infrt/kernel/basic_kernels.cc @@ -63,24 +63,24 @@ static void PrintString(const std::string &str) { void RegisterBasicKernels(host_context::KernelRegistry *registry) { RegisterIntBasicKernels(registry); RegisterFloatBasicKernels(registry); - registry->AddKernel("infrt.get_string", INFRT_KERNEL(GetString)); - registry->AddKernel("infrt.print_string", INFRT_KERNEL(PrintString)); + registry->AddKernel("Infrt.get_string", INFRT_KERNEL(GetString)); + registry->AddKernel("Infrt.print_string", INFRT_KERNEL(PrintString)); } void RegisterIntBasicKernels(host_context::KernelRegistry *registry) { - registry->AddKernel("infrt.add.i32", INFRT_KERNEL(add)); - registry->AddKernel("infrt.sub.i32", INFRT_KERNEL(sub)); - registry->AddKernel("infrt.mul.i32", INFRT_KERNEL(mul)); - registry->AddKernel("infrt.div.i32", INFRT_KERNEL(div)); - registry->AddKernel("infrt.print.i32", INFRT_KERNEL(print)); + registry->AddKernel("Infrt.add.i32", INFRT_KERNEL(add)); + registry->AddKernel("Infrt.sub.i32", INFRT_KERNEL(sub)); + registry->AddKernel("Infrt.mul.i32", INFRT_KERNEL(mul)); + registry->AddKernel("Infrt.div.i32", INFRT_KERNEL(div)); + registry->AddKernel("Infrt.print.i32", INFRT_KERNEL(print)); } void RegisterFloatBasicKernels(host_context::KernelRegistry *registry) { - registry->AddKernel("infrt.add.f32", INFRT_KERNEL(add)); - registry->AddKernel("infrt.sub.f32", INFRT_KERNEL(sub)); - registry->AddKernel("infrt.mul.f32", INFRT_KERNEL(mul)); - registry->AddKernel("infrt.div.f32", INFRT_KERNEL(div)); - registry->AddKernel("infrt.print.f32", INFRT_KERNEL(print)); + registry->AddKernel("Infrt.add.f32", INFRT_KERNEL(add)); + registry->AddKernel("Infrt.sub.f32", INFRT_KERNEL(sub)); + registry->AddKernel("Infrt.mul.f32", INFRT_KERNEL(mul)); + registry->AddKernel("Infrt.div.f32", INFRT_KERNEL(div)); + registry->AddKernel("Infrt.print.f32", INFRT_KERNEL(print)); } } // namespace kernel diff --git a/paddle/infrt/kernel/control_flow_kernels.cc b/paddle/infrt/kernel/control_flow_kernels.cc index 6cc94dbcce0775cb6b74f993bfdd262fd6a47e6f..8b18aca0210860f4ae688f2133ffa022fda3195d 100644 --- a/paddle/infrt/kernel/control_flow_kernels.cc +++ b/paddle/infrt/kernel/control_flow_kernels.cc @@ -37,7 +37,7 @@ static void INFRTCall( } void 
RegisterControlFlowKernels(host_context::KernelRegistry* registry) { - registry->AddKernel("infrt.call", INFRT_KERNEL(INFRTCall)); + registry->AddKernel("Infrt.call", INFRT_KERNEL(INFRTCall)); } } // namespace kernel diff --git a/paddle/infrt/kernel/pten/CMakeLists.txt b/paddle/infrt/kernel/phi/CMakeLists.txt similarity index 61% rename from paddle/infrt/kernel/pten/CMakeLists.txt rename to paddle/infrt/kernel/phi/CMakeLists.txt index fbb205e2af011e32057349dff3be08409cef68b9..e21cacfbc10b3eaa13004f3aa71a3cb6c9c6f5e8 100644 --- a/paddle/infrt/kernel/pten/CMakeLists.txt +++ b/paddle/infrt/kernel/phi/CMakeLists.txt @@ -1,4 +1,4 @@ -if (NOT INFRT_WITH_PTEN) +if (NOT INFRT_WITH_PHI) return() endif() @@ -11,16 +11,16 @@ gather_srcs(infrt_src SRCS allocator_kernels.cc ) -set(infrt_register_pten_kernels_gen_source_file ${CMAKE_SOURCE_DIR}/paddle/infrt/kernel/pten/infershaped/infershaped_kernel_launchers.cc) -set(infrt_register_pten_kernels_gen_file ${CMAKE_SOURCE_DIR}/tools/infrt/get_pten_kernel_function.sh) +set(infrt_register_phi_kernels_gen_source_file ${CMAKE_SOURCE_DIR}/paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launchers.cc) +set(infrt_register_phi_kernels_gen_file ${CMAKE_SOURCE_DIR}/tools/infrt/get_phi_kernel_function.sh) set(wrapped_infermeta_header_file ${CMAKE_SOURCE_DIR}/paddle/phi/infermeta/generated.h) set(wrapped_infermeta_source_file ${CMAKE_SOURCE_DIR}/paddle/phi/infermeta/generated.cc) add_custom_command( - OUTPUT ${infrt_register_pten_kernels_gen_source_file} - COMMAND sh ${infrt_register_pten_kernels_gen_file} + OUTPUT ${infrt_register_phi_kernels_gen_source_file} + COMMAND sh ${infrt_register_phi_kernels_gen_file} DEPENDS ${wrapped_infermeta_header_file} ${wrapped_infermeta_source_file} - COMMENT "infrt generate ${infrt_register_pten_kernels_gen_source_file}" + COMMENT "infrt generate ${infrt_register_phi_kernels_gen_source_file}" VERBATIM) cc_library(infrt_naive SRCS infershaped/infershaped_kernel_launcher.cc diff --git a/paddle/infrt/kernel/pten/allocator_kernels.cc b/paddle/infrt/kernel/phi/allocator_kernels.cc similarity index 81% rename from paddle/infrt/kernel/pten/allocator_kernels.cc rename to paddle/infrt/kernel/phi/allocator_kernels.cc index d3ecbed15da9691514b3688006d547ae54c42db0..eba12e688b4ae2cf9bdd4fa46bb479be882b02fc 100644 --- a/paddle/infrt/kernel/pten/allocator_kernels.cc +++ b/paddle/infrt/kernel/phi/allocator_kernels.cc @@ -12,14 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. 
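Note on the renames above: infrt kernels are looked up purely by the string name they are registered under, and the .mlir tests elsewhere in this patch invoke ops by those same names, so the dialect-prefix change from "infrt." to "Infrt." has to land in every AddKernel call and every test in the same patch, otherwise lookup fails only at runtime. A minimal standalone sketch of that string-keyed registry pattern (standard C++ only; KernelRegistry and the fixed float signature below are illustrative stand-ins, not infrt's real classes):

#include <functional>
#include <iostream>
#include <stdexcept>
#include <string>
#include <unordered_map>
#include <utility>

class KernelRegistry {
 public:
  using Kernel = std::function<float(float, float)>;

  void AddKernel(const std::string& name, Kernel fn) {
    kernels_[name] = std::move(fn);
  }

  // Lookup uses the exact op name the MLIR translator emits, which is why a
  // dialect rename must update registrations and tests together.
  float Run(const std::string& name, float a, float b) const {
    auto it = kernels_.find(name);
    if (it == kernels_.end()) throw std::runtime_error("unknown kernel: " + name);
    return it->second(a, b);
  }

 private:
  std::unordered_map<std::string, Kernel> kernels_;
};

int main() {
  KernelRegistry registry;
  registry.AddKernel("Infrt.add.f32", [](float x, float y) { return x + y; });
  registry.AddKernel("Infrt.mul.f32", [](float x, float y) { return x * y; });
  std::cout << registry.Run("Infrt.add.f32", 1.0f, 2.0f) << "\n";  // prints 3
  return 0;
}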
-#include "paddle/infrt/kernel/pten/allocator_kernels.h" +#include "paddle/infrt/kernel/phi/allocator_kernels.h" namespace infrt { namespace kernel { -namespace pten { +namespace phi { -backends::CpuPtenAllocator CreateCpuAllocator() { return {}; } +backends::CpuPhiAllocator CreateCpuAllocator() { return {}; } -} // namespace pten +} // namespace phi } // namespace kernel } // namespace infrt diff --git a/paddle/infrt/kernel/pten/allocator_kernels.h b/paddle/infrt/kernel/phi/allocator_kernels.h similarity index 84% rename from paddle/infrt/kernel/pten/allocator_kernels.h rename to paddle/infrt/kernel/phi/allocator_kernels.h index ddc316c269923e3fc302523e86f64d6233d0c0cf..d10382f5e6014c2b04dab65c8439d99e4563aaef 100644 --- a/paddle/infrt/kernel/pten/allocator_kernels.h +++ b/paddle/infrt/kernel/phi/allocator_kernels.h @@ -14,15 +14,15 @@ #pragma once -#include "paddle/infrt/backends/host/pten_allocator.h" +#include "paddle/infrt/backends/host/phi_allocator.h" #include "paddle/phi/core/dense_tensor.h" namespace infrt { namespace kernel { -namespace pten { +namespace phi { -backends::CpuPtenAllocator CreateCpuAllocator(); +backends::CpuPhiAllocator CreateCpuAllocator(); -} // namespace pten +} // namespace phi } // namespace kernel } // namespace infrt diff --git a/paddle/infrt/kernel/pten/context_kernels.cc b/paddle/infrt/kernel/phi/context_kernels.cc similarity index 82% rename from paddle/infrt/kernel/pten/context_kernels.cc rename to paddle/infrt/kernel/phi/context_kernels.cc index 0c5e53212113be02e3d57471be80bc1564f8f51f..5284f499916c309c03cbada25ab0de44d5549eec 100644 --- a/paddle/infrt/kernel/pten/context_kernels.cc +++ b/paddle/infrt/kernel/phi/context_kernels.cc @@ -12,14 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/infrt/kernel/pten/context_kernels.h" +#include "paddle/infrt/kernel/phi/context_kernels.h" namespace infrt { namespace kernel { -namespace pten { +namespace phi { -backends::CpuPtenContext CreateCpuContext() { return {}; } +::phi::CPUContext CreateCpuContext() { return {}; } -} // namespace pten +} // namespace phi } // namespace kernel } // namespace infrt diff --git a/paddle/infrt/kernel/pten/context_kernels.h b/paddle/infrt/kernel/phi/context_kernels.h similarity index 84% rename from paddle/infrt/kernel/pten/context_kernels.h rename to paddle/infrt/kernel/phi/context_kernels.h index 95a20f912efbf1662cf0c1f474bf5f9295ba5861..8082dc6c2ff2950bdcbc8a99e602b7caab2b6ad7 100644 --- a/paddle/infrt/kernel/pten/context_kernels.h +++ b/paddle/infrt/kernel/phi/context_kernels.h @@ -14,15 +14,15 @@ #pragma once -#include "paddle/infrt/backends/host/pten_context.h" +#include "paddle/infrt/backends/host/phi_context.h" #include "paddle/phi/core/dense_tensor.h" namespace infrt { namespace kernel { -namespace pten { +namespace phi { -backends::CpuPtenContext CreateCpuContext(); +::phi::CPUContext CreateCpuContext(); -} // namespace pten +} // namespace phi } // namespace kernel } // namespace infrt diff --git a/paddle/infrt/kernel/pten/dense_tensor_kernels.cc b/paddle/infrt/kernel/phi/dense_tensor_kernels.cc similarity index 90% rename from paddle/infrt/kernel/pten/dense_tensor_kernels.cc rename to paddle/infrt/kernel/phi/dense_tensor_kernels.cc index b21e418789663e506cf08307528e693ebfb72e7b..ce9200b9918c0a2cfe2ff80312562375bc3dc23f 100644 --- a/paddle/infrt/kernel/pten/dense_tensor_kernels.cc +++ b/paddle/infrt/kernel/phi/dense_tensor_kernels.cc @@ -12,14 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/infrt/kernel/pten/dense_tensor_kernels.h" +#include "paddle/infrt/kernel/phi/dense_tensor_kernels.h" namespace infrt { namespace kernel { -namespace pten { +namespace phi { ::phi::DenseTensor CreateDenseTensorCpuF32Nchw( - backends::CpuPtenAllocator* allocator, + backends::CpuPhiAllocator* allocator, host_context::Attribute> dims, host_context::Attribute> lod) { return ::phi::DenseTensor(allocator, @@ -32,6 +32,6 @@ namespace pten { void FillDenseTensorF32(::phi::DenseTensor* dense_tensor, host_context::Attribute> values) {} -} // namespace pten +} // namespace phi } // namespace kernel } // namespace infrt diff --git a/paddle/infrt/kernel/pten/dense_tensor_kernels.h b/paddle/infrt/kernel/phi/dense_tensor_kernels.h similarity index 89% rename from paddle/infrt/kernel/pten/dense_tensor_kernels.h rename to paddle/infrt/kernel/phi/dense_tensor_kernels.h index 41f701b01032acb415852ac03b147cda47bd015a..25daf7027e8cb1371ae40cec7e45b6ef285ef9e5 100644 --- a/paddle/infrt/kernel/pten/dense_tensor_kernels.h +++ b/paddle/infrt/kernel/phi/dense_tensor_kernels.h @@ -14,22 +14,22 @@ #pragma once -#include "paddle/infrt/backends/host/pten_allocator.h" +#include "paddle/infrt/backends/host/phi_allocator.h" #include "paddle/infrt/host_context/kernel_utils.h" #include "paddle/phi/core/dense_tensor.h" namespace infrt { namespace kernel { -namespace pten { +namespace phi { ::phi::DenseTensor CreateDenseTensorCpuF32Nchw( - backends::CpuPtenAllocator* allocator, + backends::CpuPhiAllocator* allocator, host_context::Attribute> dims, host_context::Attribute> lod); void FillDenseTensorF32(::phi::DenseTensor* dense_tensor, host_context::Attribute> values); -} // namespace pten +} // namespace phi } // namespace kernel } // namespace infrt diff --git a/paddle/infrt/kernel/pten/infershaped/infershape_launchers_test.cc b/paddle/infrt/kernel/phi/infershaped/infershape_launchers_test.cc similarity index 93% rename from paddle/infrt/kernel/pten/infershaped/infershape_launchers_test.cc rename to paddle/infrt/kernel/phi/infershaped/infershape_launchers_test.cc index c781ca908fdf0d4dec9281f72bdee154611b0c26..331ebcfb4a5d2b1444f1ed475c5f6467f6fb0361 100644 --- a/paddle/infrt/kernel/pten/infershaped/infershape_launchers_test.cc +++ b/paddle/infrt/kernel/phi/infershaped/infershape_launchers_test.cc @@ -14,9 +14,9 @@ #include -#include "paddle/infrt/kernel/pten/infershaped/infershaped_kernel_launcher.h" -#include "paddle/infrt/kernel/pten/infershaped/infershaped_kernel_launchers.h" -#include "paddle/infrt/kernel/pten/infershaped/infershaped_utils.h" +#include "paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launcher.h" +#include "paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launchers.h" +#include "paddle/infrt/kernel/phi/infershaped/infershaped_utils.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/dense_tensor.h" diff --git a/paddle/infrt/kernel/pten/infershaped/infershaped_kernel_launcher.cc b/paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launcher.cc similarity index 88% rename from paddle/infrt/kernel/pten/infershaped/infershaped_kernel_launcher.cc rename to paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launcher.cc index c21339bed38727fd5f7eeb124de7959489893bb6..165f7f7c94377f8b9c1f9c240ee1418cab922cdc 100644 --- a/paddle/infrt/kernel/pten/infershaped/infershaped_kernel_launcher.cc +++ b/paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launcher.cc @@ -12,7 +12,7 @@ // See the License for the specific 
language governing permissions and // limitations under the License. -#include "paddle/infrt/kernel/pten/infershaped/infershaped_kernel_launcher.h" +#include "paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launcher.h" #include "paddle/phi/core/dense_tensor.h" namespace infrt { @@ -26,9 +26,6 @@ void InferShapedKernelLauncher::CreateKernelFrameForInferShape( if (value->is_type<::phi::DenseTensor>()) { values.emplace_back(::phi::MetaTensor{&value->get<::phi::DenseTensor>()}); infershape_kernel_frame_builder.AddArgument(values.back().get()); - } else if (value->is_type()) { - values.emplace_back(phi::MetaTensor{&value->get()}); - infershape_kernel_frame_builder.AddArgument(values.back().get()); } else { infershape_kernel_frame_builder.AddArgument(value); } diff --git a/paddle/infrt/kernel/pten/infershaped/infershaped_kernel_launcher.h b/paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launcher.h similarity index 100% rename from paddle/infrt/kernel/pten/infershaped/infershaped_kernel_launcher.h rename to paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launcher.h diff --git a/paddle/infrt/kernel/pten/infershaped/infershaped_kernel_launchers.h b/paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launchers.h similarity index 100% rename from paddle/infrt/kernel/pten/infershaped/infershaped_kernel_launchers.h rename to paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launchers.h diff --git a/paddle/infrt/kernel/pten/infershaped/infershaped_utils.h b/paddle/infrt/kernel/phi/infershaped/infershaped_utils.h similarity index 100% rename from paddle/infrt/kernel/pten/infershaped/infershaped_utils.h rename to paddle/infrt/kernel/phi/infershaped/infershaped_utils.h diff --git a/paddle/infrt/kernel/pten/infershaped/pten_kernel_launcher.h b/paddle/infrt/kernel/phi/infershaped/phi_kernel_launcher.h similarity index 63% rename from paddle/infrt/kernel/pten/infershaped/pten_kernel_launcher.h rename to paddle/infrt/kernel/phi/infershaped/phi_kernel_launcher.h index 9a3e978e966b0702ef29623da6578a3858f8cc64..a0a5b391ea669b1358b14098e32750d709e52fe2 100644 --- a/paddle/infrt/kernel/pten/infershaped/pten_kernel_launcher.h +++ b/paddle/infrt/kernel/phi/infershaped/phi_kernel_launcher.h @@ -14,14 +14,36 @@ #pragma once #include +#include +#include "paddle/infrt/backends/host/phi_context.h" #include "paddle/infrt/host_context/kernel_utils.h" -#include "paddle/infrt/kernel/pten/infershaped/infershaped_kernel_launcher.h" -#include "paddle/infrt/kernel/pten/infershaped/infershaped_utils.h" +#include "paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launcher.h" +#include "paddle/infrt/kernel/phi/infershaped/infershaped_utils.h" namespace infrt { namespace kernel { +static void FakePhiInferShape(const ::phi::MetaTensor& a, + const ::phi::MetaTensor& b, + bool arg_0, + bool arg_1, + ::phi::MetaTensor* c) { + LOG(INFO) << "the ptr of c: " << c; + LOG(INFO) << "c->numel(): " << c->numel(); +} + +static void FakePhiKernel(const ::phi::CPUContext& /*Context*/, + const ::phi::DenseTensor& a, + const ::phi::DenseTensor& b, + bool arg_0, + bool arg_1, + ::phi::DenseTensor* c) { + std::cout << "@FakePhiKernel@" << std::endl; + LOG(INFO) << "the ptr of c: " << c; + LOG(INFO) << "c->numel(): " << c->numel(); +} + template ::count}; static const bool turn_on_infer_shape_cache{true}; void Invoke(host_context::KernelFrame* frame) override { +#ifndef NDEBUG + LOG(INFO) << "Kernel.frame: " << frame->DumpArgTypes(); +#endif // Build the infershape KernelFrame if needed. 
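The surrounding Invoke() body shows the launcher's caching strategy: the infer-shape frame is built once, and when turn_on_infer_shape_cache is set, shape inference is re-run only if IsShapeChanged reports different input shapes. A standalone sketch of that idea, assuming a cache keyed on the input dims (InferShapeCache below is a stand-in, not infrt's class):

#include <cstdint>
#include <iostream>
#include <vector>

class InferShapeCache {
 public:
  // Returns true if inference had to be (re)run for these input shapes.
  bool InferIfNeeded(const std::vector<int64_t>& input_shape) {
    if (have_cached_ && input_shape == cached_shape_) return false;  // cache hit
    cached_shape_ = input_shape;   // remember the shapes for the next call
    have_cached_ = true;
    // ... real code would invoke the InferMeta/InferShape function here ...
    return true;
  }

 private:
  bool have_cached_ = false;
  std::vector<int64_t> cached_shape_;
};

int main() {
  InferShapeCache cache;
  std::cout << cache.InferIfNeeded({2, 8}) << "\n";  // 1: first run
  std::cout << cache.InferIfNeeded({2, 8}) << "\n";  // 0: same shape, skipped
  std::cout << cache.InferIfNeeded({4, 8}) << "\n";  // 1: shape changed, re-run
  return 0;
}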
// TODO(Superjomn) add unlikely here. if (infershape_kernel_frame_builder.IsEmpty()) { CreateKernelFrameForInferShape(frame); +#ifndef NDEBUG + LOG(INFO) << "infershape.frame: " + << infershape_kernel_frame_builder.DumpArgTypes(); +#endif } if (turn_on_infer_shape_cache) { if (!turn_on_infer_shape_cache || IsShapeChanged(num_input_tensors)) { diff --git a/paddle/infrt/kernel/phi/registry.cc b/paddle/infrt/kernel/phi/registry.cc new file mode 100644 index 0000000000000000000000000000000000000000..5d79814d4bec7fd5a80913f3f3c470e956526c1f --- /dev/null +++ b/paddle/infrt/kernel/phi/registry.cc @@ -0,0 +1,59 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/kernel/phi/registry.h" + +#include +#include + +#include "paddle/infrt/host_context/kernel_registry.h" +#include "paddle/infrt/host_context/kernel_utils.h" +#include "paddle/infrt/kernel/phi/allocator_kernels.h" +#include "paddle/infrt/kernel/phi/context_kernels.h" +#include "paddle/infrt/kernel/phi/dense_tensor_kernels.h" +#include "paddle/infrt/kernel/phi/infershaped/phi_kernel_launcher.h" +#include "paddle/phi/include/infermeta.h" +#include "paddle/phi/include/kernels.h" +#include "paddle/phi/kernels/matmul_kernel.h" + +using infrt::host_context::Attribute; + +namespace infrt { +namespace kernel { + +void RegisterPhiKernels(host_context::KernelRegistry* registry) { + registry->AddKernel("phi_dt.create_allocator.cpu", + INFRT_KERNEL(infrt::kernel::phi::CreateCpuAllocator)); + registry->AddKernel("phi_dt.create_context.cpu", + INFRT_KERNEL(infrt::kernel::phi::CreateCpuContext)); + registry->AddKernel( + "phi_dt.create_dense_tensor.cpu.f32.nchw", + INFRT_KERNEL(infrt::kernel::phi::CreateDenseTensorCpuF32Nchw)); + registry->AddKernel("phi_dt.fill_dense_tensor.f32", + INFRT_KERNEL(infrt::kernel::phi::FillDenseTensorF32)); + registry->AddKernel( + "phi_dt.fake_phi_kernel", + std::bind(&KernelLauncherFunc, + KernelLauncher(), + std::placeholders::_1)); +} + +} // namespace kernel +} // namespace infrt diff --git a/paddle/infrt/kernel/pten/registry.h b/paddle/infrt/kernel/phi/registry.h similarity index 88% rename from paddle/infrt/kernel/pten/registry.h rename to paddle/infrt/kernel/phi/registry.h index c290f8ea524fb5d5305445ada409bd03844820c5..c72085a50c1e721543c85d1fa40065502dda0091 100644 --- a/paddle/infrt/kernel/pten/registry.h +++ b/paddle/infrt/kernel/phi/registry.h @@ -27,9 +27,9 @@ namespace infrt { namespace kernel { /** - * Register all the pten kernels to registry. + * Register all the phi kernels to registry. 
*/ -void RegisterPtenKernels(host_context::KernelRegistry* registry); +void RegisterPhiKernels(host_context::KernelRegistry* registry); } // namespace kernel } // namespace infrt diff --git a/paddle/infrt/kernel/pten/registry.cc b/paddle/infrt/kernel/pten/registry.cc deleted file mode 100644 index d70f5deca6aeafa439ce5b19bee78edc46cae368..0000000000000000000000000000000000000000 --- a/paddle/infrt/kernel/pten/registry.cc +++ /dev/null @@ -1,61 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/infrt/kernel/pten/registry.h" - -#include -#include - -#include "paddle/infrt/host_context/kernel_registry.h" -#include "paddle/infrt/host_context/kernel_utils.h" -#include "paddle/infrt/kernel/pten/allocator_kernels.h" -#include "paddle/infrt/kernel/pten/context_kernels.h" -#include "paddle/infrt/kernel/pten/dense_tensor_kernels.h" -#include "paddle/infrt/kernel/pten/infershaped/pten_kernel_launcher.h" -#include "paddle/phi/include/infermeta.h" -#include "paddle/phi/include/kernels.h" -#include "paddle/phi/kernels/matmul_kernel.h" - -using infrt::host_context::Attribute; - -namespace infrt { -namespace kernel { - -void RegisterPtenKernels(host_context::KernelRegistry* registry) { - registry->AddKernel("pten_dt.create_allocator.cpu", - INFRT_KERNEL(infrt::kernel::pten::CreateCpuAllocator)); - registry->AddKernel("pten_dt.create_context.cpu", - INFRT_KERNEL(infrt::kernel::pten::CreateCpuContext)); - registry->AddKernel( - "pten_dt.create_dense_tensor.cpu.f32.nchw", - INFRT_KERNEL(infrt::kernel::pten::CreateDenseTensorCpuF32Nchw)); - registry->AddKernel("pten_dt.fill_dense_tensor.f32", - INFRT_KERNEL(infrt::kernel::pten::FillDenseTensorF32)); - registry->AddKernel( - "pten.matmul.host.fp32", - std::bind(&kernel::KernelLauncherFunc< - decltype(&::phi::MatmulKernel), - &::phi::MatmulKernel, - decltype(&::phi::MatmulInferMeta), - &::phi::MatmulInferMeta>, - kernel::KernelLauncher< - decltype(&::phi::MatmulKernel), - &::phi::MatmulKernel, - decltype(&::phi::MatmulInferMeta), - &::phi::MatmulInferMeta>(), - std::placeholders::_1)); -} - -} // namespace kernel -} // namespace infrt diff --git a/paddle/infrt/kernel/tensor_kernels.cc b/paddle/infrt/kernel/tensor_kernels.cc index 1e55bcd07ae8009cd5ca26ccf565ac3036ad8d19..9de1350e97d1af31dc18a116ed7cb38bf0d2f4ef 100644 --- a/paddle/infrt/kernel/tensor_kernels.cc +++ b/paddle/infrt/kernel/tensor_kernels.cc @@ -45,7 +45,7 @@ void PrintTensor(const DenseHostTensor &tensor) { } template -void FillTensorWithConstant(DenseHostTensor *tensor, Attribute v) { +void FillTensorWithConstant(Attribute v, DenseHostTensor *tensor) { MutableDTArrayView(tensor).Fill(v.get()); } @@ -53,13 +53,11 @@ TensorMap LoadParams(const std::string &path) { return *(infrt::tensor::LoadParams(path)); } -void TensorMapGetTensor(TensorMap map, - DenseHostTensor *out, - Attribute name) { +DenseHostTensor TensorMapGetTensor(TensorMap map, Attribute name) { auto it = map.find(name.get()); CHECK(it != 
map.end()) << "No tensor called " << name.get() << " in the TensorMap"; - *out = *it->second; + return *it->second; } int32_t TensorMapGetSize(TensorMap map) { return map.size(); } diff --git a/paddle/infrt/kernel/test_kernels.cc b/paddle/infrt/kernel/test_kernels.cc index ccfb3356a855f418f14e42ed8a368f31d2fe8b27..d15bbe221f91a87b047863121f32699175183c54 100644 --- a/paddle/infrt/kernel/test_kernels.cc +++ b/paddle/infrt/kernel/test_kernels.cc @@ -193,8 +193,8 @@ tensor::DenseHostTensor ShadowCopyTensor(tensor::DenseHostTensor src) { } void RegisterTestKernels(host_context::KernelRegistry *registry) { - registry->AddKernel("infrt.benchmark", INFRT_KERNEL(benchmark)); - registry->AddKernel("infrt.test.shadow_copy_tensor", + registry->AddKernel("Infrt.benchmark", INFRT_KERNEL(benchmark)); + registry->AddKernel("Infrt.test.shadow_copy_tensor", INFRT_KERNEL(ShadowCopyTensor)); } diff --git a/paddle/infrt/pass/CMakeLists.txt b/paddle/infrt/pass/CMakeLists.txt new file mode 100755 index 0000000000000000000000000000000000000000..51fecdf907798eb7280a17b294a263fe40993fe2 --- /dev/null +++ b/paddle/infrt/pass/CMakeLists.txt @@ -0,0 +1 @@ +add_subdirectory(phi) diff --git a/paddle/infrt/support/variant.h b/paddle/infrt/support/variant.h index 2f415b21c80109f92193db155130a43f3f95557a..b8dcd21ae27fef48811a8e12fda995f687dd828c 100644 --- a/paddle/infrt/support/variant.h +++ b/paddle/infrt/support/variant.h @@ -136,12 +136,12 @@ class Variant { return nullptr; } - IndexT index() { return index_; } + IndexT index() const { return index_; } - private: template static constexpr size_t IndexOf = TupleIndexOf::value; + private: static constexpr size_t kStorageSize = std::max({sizeof(Ts)...}); static constexpr size_t kAlignment = std::max({alignof(Ts)...}); diff --git a/paddle/infrt/tests/dialect/basic.mlir b/paddle/infrt/tests/dialect/basic.mlir index 3c76b438a0ebaf253d4971c71dc82749a05c3083..2d4d6f2629ec7df989499f0a2e9649c01ae8428a 100644 --- a/paddle/infrt/tests/dialect/basic.mlir +++ b/paddle/infrt/tests/dialect/basic.mlir @@ -1,41 +1,33 @@ // RUN: infrtexec -i %s | FileCheck %s // CHECK-LABEL: @basic_f32 func @basic_f32() -> f32 { - %v0 = infrt.constant.f32 1.0 - %v1 = infrt.constant.f32 2.0 - %value = "infrt.add.f32"(%v0, %v1) : (f32, f32) -> f32 + %v0 = Infrt.constant.f32 1.0 + %v1 = Infrt.constant.f32 2.0 + %value = "Infrt.add.f32"(%v0, %v1) : (f32, f32) -> f32 // CHECK-NEXT: 3 - "infrt.print.f32"(%value) : (f32) -> () + "Infrt.print.f32"(%value) : (f32) -> () - infrt.return %value : f32 + Infrt.return %value : f32 } /// ================================================================ /// @caller call the other function @callee func @callee.add.f32(%x : f32, %y : f32, %y1 : f32) -> f32 { - %z = "infrt.add.f32"(%x, %y) : (f32, f32) -> f32 - %z1 = "infrt.add.f32"(%z, %y1) : (f32, f32) -> f32 - infrt.return %z1 : f32 + %z = "Infrt.add.f32"(%x, %y) : (f32, f32) -> f32 + %z1 = "Infrt.add.f32"(%z, %y1) : (f32, f32) -> f32 + Infrt.return %z1 : f32 } // CHECK-LABEL: @caller.add.f32 func @caller.add.f32() -> f32 { - %x = infrt.constant.f32 1.0 - %y = infrt.constant.f32 2.0 - %y1 = infrt.constant.f32 3.0 - %z = infrt.call @callee.add.f32(%x, %y, %y1) : (f32, f32, f32) -> f32 + %x = Infrt.constant.f32 1.0 + %y = Infrt.constant.f32 2.0 + %y1 = Infrt.constant.f32 3.0 + %z = Infrt.call @callee.add.f32(%x, %y, %y1) : (f32, f32, f32) -> f32 // CHECK-NEXT: 6 - "infrt.print.f32"(%z) : (f32) -> () - infrt.return %z : f32 + "Infrt.print.f32"(%z) : (f32) -> () + Infrt.return %z : f32 } /// 
<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - -// CHECK-LABEL: @string_test -func @string_test() { - %path = infrt.get_string("this is get_string op.") - // CHECK-LABEL: string = this is get_string op. - infrt.print_string(%path) - infrt.return -} diff --git a/paddle/infrt/tests/dialect/benchmark.mlir b/paddle/infrt/tests/dialect/benchmark.mlir index 1a57b43499062410b346b38412a533d3edd6fbcc..381fd534f6a5a09e3091203de88ebf00101074af 100644 --- a/paddle/infrt/tests/dialect/benchmark.mlir +++ b/paddle/infrt/tests/dialect/benchmark.mlir @@ -12,13 +12,13 @@ func @benchmark() { // CHECK-LABEL: BM:add.f32:CPU 95%(ns) // CHECK-LABEL: BM:add.f32:CPU 99%(ns) // CHECK-LABEL: BM:add.f32:CPU utilization(percent) - infrt.benchmark "add.f32"() duration_secs = 1, max_count = 3, num_warmup_runs = 3 + Infrt.benchmark "add.f32"() duration_secs = 1, max_count = 3, num_warmup_runs = 3 { - %0 = infrt.constant.f32 1.0 - %1 = infrt.constant.f32 2.0 - %res = "infrt.add.f32"(%0, %1) : (f32, f32) -> f32 - "infrt.print.f32"(%res) : (f32) -> () - infrt.return %res : f32 + %0 = Infrt.constant.f32 1.0 + %1 = Infrt.constant.f32 2.0 + %res = "Infrt.add.f32"(%0, %1) : (f32, f32) -> f32 + "Infrt.print.f32"(%res) : (f32) -> () + Infrt.return %res : f32 } - infrt.return + Infrt.return } diff --git a/paddle/infrt/tests/dialect/dense_tensor.mlir b/paddle/infrt/tests/dialect/dense_tensor.mlir index f1def17aa87961d70322ec20b4a86a018250e58d..faade62d35063b1d85c4c1d3ddad98b085a7726c 100644 --- a/paddle/infrt/tests/dialect/dense_tensor.mlir +++ b/paddle/infrt/tests/dialect/dense_tensor.mlir @@ -2,23 +2,23 @@ // CHECK-LABEL: dense_shape0 func @dense_shape0() { %shape = ts.build_shape [1:i64, 57:i64] - %a = dt.create_uninit_tensor.f32 [12:i64, 23:i64] -> !infrt.tensor + %a = dt.create_uninit_tensor.f32 [12:i64, 23:i64] -> !infrt.dense_tensor - infrt.return + Infrt.return } -func @predict(%a: !infrt.tensor, %b: !infrt.tensor) -> (!infrt.tensor, !infrt.tensor) { - %a0 = dt.shallow_copy_tensor %a : !infrt.tensor -> !infrt.tensor - %b0 = dt.shallow_copy_tensor %b : !infrt.tensor -> !infrt.tensor +func @predict(%a: !infrt.dense_tensor, %b: !infrt.dense_tensor) -> (!infrt.dense_tensor, !infrt.dense_tensor) { + %a0 = dt.shallow_copy_tensor %a : !infrt.dense_tensor -> !infrt.dense_tensor + %b0 = dt.shallow_copy_tensor %b : !infrt.dense_tensor -> !infrt.dense_tensor - infrt.return %a0, %b0: !infrt.tensor, !infrt.tensor + Infrt.return %a0, %b0: !infrt.dense_tensor, !infrt.dense_tensor } func @main() { %shape = ts.build_shape [1:i64, 57:i64] - %a = dt.create_uninit_tensor.f32 [12:i64, 23:i64] -> !infrt.tensor + %a = dt.create_uninit_tensor.f32 [12:i64, 23:i64] -> !infrt.dense_tensor - %b, %c = infrt.call @predict(%a, %a) : (!infrt.tensor, !infrt.tensor) -> (!infrt.tensor, !infrt.tensor) - infrt.return + %b, %c = Infrt.call @predict(%a, %a) : (!infrt.dense_tensor, !infrt.dense_tensor) -> (!infrt.dense_tensor, !infrt.dense_tensor) + Infrt.return } diff --git a/paddle/infrt/tests/dialect/disabled_tensor_map.mlir b/paddle/infrt/tests/dialect/disabled_tensor_map.mlir index 111c01c9a108bacb0a72ed5e6ff2044487552642..8e2d3bc49b96c645fc72e33af6300307d855e5a4 100644 --- a/paddle/infrt/tests/dialect/disabled_tensor_map.mlir +++ b/paddle/infrt/tests/dialect/disabled_tensor_map.mlir @@ -1,31 +1,31 @@ // CHECK-LABEL: @predict -func @predict(%input:!infrt.tensor, %map: !infrt.tensor_map) -> (!infrt.tensor) { - %w = dt.get_param(%map, "create_parameter_0.w_0") -> !infrt.tensor - %bias = dt.get_param(%map, "create_parameter_1.w_0") -> 
!infrt.tensor +func @predict(%input:!Infrt.tensor, %map: !Infrt.tensor_map) -> (!Infrt.tensor) { + %w = dt.get_param(%map, "create_parameter_0.w_0") -> !Infrt.tensor + %bias = dt.get_param(%map, "create_parameter_1.w_0") -> !Infrt.tensor - %out = dt.create_uninit_tensor.f32 [3, 3] -> !infrt.tensor + %out = dt.create_uninit_tensor.f32 [3, 3] -> !Infrt.tensor // fc - "external.matmul"(%input, %w, %out) {}: (!infrt.tensor, !infrt.tensor, !infrt.tensor) -> () - "external.elementwise_add"(%out, %bias, %out) {axis = -1}: (!infrt.tensor, !infrt.tensor, !infrt.tensor) -> () - "external.sigmoid"(%out, %out) {}: (!infrt.tensor, !infrt.tensor) -> () - //dt.print_tensor (%out : !infrt.tensor) + "external.matmul"(%input, %w, %out) {}: (!Infrt.tensor, !Infrt.tensor, !Infrt.tensor) -> () + "external.elementwise_add"(%out, %bias, %out) {axis = -1}: (!Infrt.tensor, !Infrt.tensor, !Infrt.tensor) -> () + "external.sigmoid"(%out, %out) {}: (!Infrt.tensor, !Infrt.tensor) -> () + //dt.print_tensor (%out : !Infrt.tensor) - infrt.return %out : !infrt.tensor + Infrt.return %out : !Infrt.tensor } // CHECK-LABEL: @main func @main() { - %input = dt.create_uninit_tensor.f32 [3, 3] -> !infrt.tensor - dt.fill_tensor_with_constant.f32 (%input : !infrt.tensor) {value=1.0:f32} + %input = dt.create_uninit_tensor.f32 [3, 3] -> !Infrt.tensor + dt.fill_tensor_with_constant.f32 (%input : !Infrt.tensor) {value=1.0:f32} - %path = infrt.get_string("/infrt/build/paddle/paddle_1.8_fc_model") + %path = Infrt.get_string("/Infrt/build/paddle/paddle_1.8_fc_model") // CHECK-LABEL: loading params %map = dt.load_params(%path) - %out = infrt.call @predict(%input, %map): (!infrt.tensor, !infrt.tensor_map) -> (!infrt.tensor) - dt.print_tensor (%out : !infrt.tensor) + %out = Infrt.call @predict(%input, %map): (!Infrt.tensor, !Infrt.tensor_map) -> (!Infrt.tensor) + dt.print_tensor (%out : !Infrt.tensor) - infrt.return + Infrt.return } diff --git a/paddle/infrt/tests/dialect/disabled_trt_ops.mlir b/paddle/infrt/tests/dialect/disabled_trt_ops.mlir index d98f107bab41e959d82acfd681d762d7981eab51..75ec98f04661a7d8cfe55c5fbea9dbc87933ad18 100644 --- a/paddle/infrt/tests/dialect/disabled_trt_ops.mlir +++ b/paddle/infrt/tests/dialect/disabled_trt_ops.mlir @@ -7,15 +7,15 @@ func @main() -> tensor { %bias1 = "pd.feed"() {name="input4"} : () -> tensor %bias2 = "pd.feed"() {name="input5"} : () -> tensor - %d = "pd.elementwise_add"(%c, %bias) {axis=1:i32} : (tensor, tensor) -> tensor + %d = "pd.elementwise_add"(%c, %bias) {axis=1:si32} : (tensor, tensor) -> tensor %e = "pd.relu6"(%d) {} : (tensor) -> tensor %c1 = "pd.matmul"(%e, %b1) {transpose_x=false, transpose_y=false} : (tensor, tensor) -> tensor - %d1 = "pd.elementwise_add"(%c1, %bias1) {axis=1:i32} : (tensor, tensor) -> tensor + %d1 = "pd.elementwise_add"(%c1, %bias1) {axis=1:si32} : (tensor, tensor) -> tensor %e1 = "pd.relu"(%d1) {} : (tensor) -> tensor %c2 = "pd.matmul"(%e1, %b2) {transpose_x=true, transpose_y=false} : (tensor, tensor) -> tensor - %d2 = "pd.elementwise_add"(%c2, %bias2) {axis=1:i32} : (tensor, tensor) -> tensor + %d2 = "pd.elementwise_add"(%c2, %bias2) {axis=1:si32} : (tensor, tensor) -> tensor %e2 = "pd.relu"(%d2) {} : (tensor) -> tensor "pd.fetch"(%e2) {name="output"} :(tensor)->() diff --git a/paddle/infrt/tests/dialect/paddle_ops.mlir b/paddle/infrt/tests/dialect/paddle_ops.mlir index 02511b21e4792bb37c416093a7c272090eae44c1..48ee4b9d725c0aa36d4849c2842c99997de5c8ee 100644 --- a/paddle/infrt/tests/dialect/paddle_ops.mlir +++ b/paddle/infrt/tests/dialect/paddle_ops.mlir @@ -3,8 
+3,7 @@ func @ops() { %a = pd.feed() {name="input0"} : tensor %b = pd.feed() {name="input1"}: tensor - %d = pd.feed() {name="input3"}: !Infrt.lod_tensor<3x4x9xf32, 0> + %d = pd.feed() {name="input3"}: !infrt.lod_tensor<3x4x9xf32, 0> %c = "pd.matmul"(%a, %b) {transpose_x=true, transpose_y=false} : (tensor, tensor) -> tensor - - infrt.return + Infrt.return } diff --git a/paddle/infrt/tests/dialect/pten/dense_tensor.mlir b/paddle/infrt/tests/dialect/pten/dense_tensor.mlir index 88f5b289fd9f843803fddf0cd98859839ef271de..f0b0b849b93cb1d42ce172c2cff90a41741c1d3d 100644 --- a/paddle/infrt/tests/dialect/pten/dense_tensor.mlir +++ b/paddle/infrt/tests/dialect/pten/dense_tensor.mlir @@ -1,11 +1,13 @@ -// RUN: infrtopt %s | FileCheck %s +// RUN: infrtexec -i %s | FileCheck %s -// CHECK-LABEL: @basic_tensor -func @basic_tensor() { - %a = "pten_dt.create_allocator.cpu" (): () -> !pten.CPU_allocator - %b = "pten_dt.create_context.cpu" (): () -> !pten.CPU_context - %c = "pten_dt.create_dense_tensor.cpu.f32.nchw" (%a) {dims=[1:i64], lod=[1:i64]}: (!pten.CPU_allocator) -> (!infrt.tensor) - // "pten_dt.fill_dense_tensor.f32" (%c) {value=[1.0:f32]} : (!infrt.tensor) -> () +// CHECK-LABEL: @fake_phi_kernel_execute +func @fake_phi_kernel_execute() { + %allocator = "phi_dt.create_allocator.cpu" (): () -> !phi.CPU_allocator + %ctx = "phi_dt.create_context.cpu" (): () -> !phi.CPU_context + %t = "phi_dt.create_dense_tensor.cpu.f32.nchw" (%allocator) {dims=[1:i64], lod=[1:i64]}: (!phi.CPU_allocator) -> (!infrt.dense_tensor) - infrt.return + // CHECK: @FakePhiKernel@ + %d = "phi_dt.fake_phi_kernel" (%ctx, %t, %t) {transpose_x=false, transpose_y=false} : (!phi.CPU_context, !infrt.dense_tensor, !infrt.dense_tensor) -> (!infrt.dense_tensor) + Infrt.return } + diff --git a/paddle/infrt/tests/dialect/pten/pten_pass.mlir b/paddle/infrt/tests/dialect/pten/pten_pass.mlir new file mode 100644 index 0000000000000000000000000000000000000000..30ff2636ae5a41674883e63ff931629a0d140b84 --- /dev/null +++ b/paddle/infrt/tests/dialect/pten/pten_pass.mlir @@ -0,0 +1,10 @@ +// RUN: infrtopt %s | FileCheck %s +// CHECK-LABEL: @ops +func @ops() { + %a = pd.feed() {name="input0"} : !infrt.lod_tensor + %b = pd.feed() {name="input1"} : !infrt.lod_tensor + %d = pd.feed() {name="input3"} : !infrt.lod_tensor<3x4x9xf32, 0> + %g = "pd.elementwise_add"(%a, %b) {axis=1:si32} : (!infrt.lod_tensor, !infrt.lod_tensor) -> tensor + %h = "pd.abs"(%g):(tensor) -> tensor + "pd.fetch"(%h) {name="output"} :(tensor)->() +} diff --git a/paddle/infrt/tests/dialect/tensor/dense_tensor.mlir b/paddle/infrt/tests/dialect/tensor/dense_tensor.mlir index ff7f36f5078d62d7e8713bba226f7271a7a2664b..76ae140dd6cbd741f992315ee35d3e94058d4674 100644 --- a/paddle/infrt/tests/dialect/tensor/dense_tensor.mlir +++ b/paddle/infrt/tests/dialect/tensor/dense_tensor.mlir @@ -1,23 +1,23 @@ // RUN: infrtexec -i %s | FileCheck %s // CHECK-LABEL: dense_shape0 func @dense_shape0() { - %a = dt.create_uninit_tensor.f32 [12:i64, 23:i64] -> !infrt.tensor + %a = dt.create_uninit_tensor.f32 [12:i64, 23:i64] -> !infrt.dense_tensor - infrt.return + Infrt.return } -func @predict(%a: !infrt.tensor, %b: !infrt.tensor) -> (!infrt.tensor, !infrt.tensor) { - %a0 = dt.shallow_copy_tensor %a : !infrt.tensor -> !infrt.tensor - %b0 = dt.shallow_copy_tensor %b : !infrt.tensor -> !infrt.tensor +func @predict(%a: !infrt.dense_tensor, %b: !infrt.dense_tensor) -> (!infrt.dense_tensor, !infrt.dense_tensor) { + %a0 = dt.shallow_copy_tensor %a : !infrt.dense_tensor -> !infrt.dense_tensor + %b0 = 
dt.shallow_copy_tensor %b : !infrt.dense_tensor -> !infrt.dense_tensor - infrt.return %a0, %b0: !infrt.tensor, !infrt.tensor + Infrt.return %a0, %b0: !infrt.dense_tensor, !infrt.dense_tensor } func @main() { %shape = ts.build_shape [1:i64, 57:i64] - %a = dt.create_uninit_tensor.f32 [12:i64, 23:i64] -> !infrt.tensor + %a = dt.create_uninit_tensor.f32 [12:i64, 23:i64] -> !infrt.dense_tensor - %b, %c = infrt.call @predict(%a, %a) : (!infrt.tensor, !infrt.tensor) -> (!infrt.tensor, !infrt.tensor) - infrt.return + %b, %c = Infrt.call @predict(%a, %a) : (!infrt.dense_tensor, !infrt.dense_tensor) -> (!infrt.dense_tensor, !infrt.dense_tensor) + Infrt.return } diff --git a/paddle/infrt/tests/dialect/tensor/naive_kernels.mlir b/paddle/infrt/tests/dialect/tensor/naive_kernels.mlir index 914e863db49cca3320c74b11b624e3d7dfe3b6f8..52b296e06cd365fbaa1249108f877dc9f7480ff0 100644 --- a/paddle/infrt/tests/dialect/tensor/naive_kernels.mlir +++ b/paddle/infrt/tests/dialect/tensor/naive_kernels.mlir @@ -2,34 +2,34 @@ // CHECK-LABEL: naive_elementwise_add func @naive_elementwise_add() { // create a - %a = dt.create_uninit_tensor.f32 [2:i64, 8:i64] -> !infrt.tensor - dt.fill_tensor_with_constant.f32 (%a : !infrt.tensor) {value=1.0:f32} + %a = dt.create_uninit_tensor.f32 [2:i64, 8:i64] -> !infrt.dense_tensor + dt.fill_tensor_with_constant.f32 (%a : !infrt.dense_tensor) {value=1.0:f32} // create b - %b = dt.create_uninit_tensor.f32 [2:i64, 8:i64] -> !infrt.tensor - dt.fill_tensor_with_constant.f32 (%b : !infrt.tensor) {value=2.0:f32} + %b = dt.create_uninit_tensor.f32 [2:i64, 8:i64] -> !infrt.dense_tensor + dt.fill_tensor_with_constant.f32 (%b : !infrt.dense_tensor) {value=2.0:f32} // get c - %c = dt.naive_elementwise_add.f32(%a, %b) {} : (!infrt.tensor, !infrt.tensor) -> !infrt.tensor + %c = dt.naive_elementwise_add.f32(%a, %b) {} : (!infrt.dense_tensor, !infrt.dense_tensor) -> !infrt.dense_tensor // CHECK: tensor: shape=shape[2,8], values=[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3] - dt.print_tensor (%c : !infrt.tensor) + dt.print_tensor (%c : !infrt.dense_tensor) - infrt.return + Infrt.return } // RUN: infrtexec -i %s | FileCheck %s // CHECK-LABEL: naive_matmul func @naive_matmul() { // create a - %a = dt.create_uninit_tensor.f32 [2:i64, 8:i64] -> !infrt.tensor - dt.fill_tensor_with_constant.f32 (%a : !infrt.tensor) {value=1.0:f32} + %a = dt.create_uninit_tensor.f32 [2:i64, 8:i64] -> !infrt.dense_tensor + dt.fill_tensor_with_constant.f32 (%a : !infrt.dense_tensor) {value=1.0:f32} // create b - %b = dt.create_uninit_tensor.f32 [8:i64, 4:i64] -> !infrt.tensor - dt.fill_tensor_with_constant.f32 (%b : !infrt.tensor) {value=2.0:f32} + %b = dt.create_uninit_tensor.f32 [8:i64, 4:i64] -> !infrt.dense_tensor + dt.fill_tensor_with_constant.f32 (%b : !infrt.dense_tensor) {value=2.0:f32} // get c - %c = dt.naive_matmul.f32(%a, %b) {} : (!infrt.tensor, !infrt.tensor) -> !infrt.tensor + %c = dt.naive_matmul.f32(%a, %b) {} : (!infrt.dense_tensor, !infrt.dense_tensor) -> !infrt.dense_tensor // CHECK: tensor: shape=shape[2,4], values=[16, 16, 16, 16, 16, 16, 16, 16] - dt.print_tensor (%c : !infrt.tensor) + dt.print_tensor (%c : !infrt.dense_tensor) - infrt.return + Infrt.return } diff --git a/paddle/infrt/tests/dialect/tensor/tensor_map.mlir.in b/paddle/infrt/tests/dialect/tensor/tensor_map.mlir.in index 4edb918b5a28fdfed2b68b647167f41c90d27d9a..5c1396d47f551618bcdf95ef55c875aa2cb0d684 100644 --- a/paddle/infrt/tests/dialect/tensor/tensor_map.mlir.in +++ b/paddle/infrt/tests/dialect/tensor/tensor_map.mlir.in @@ -1,15 
+1,15 @@ // RUN: infrtexec -i %s | FileCheck %s func @load_tensor_map() { - %path = infrt.get_string("@CMAKE_BINARY_DIR@/multi_fc_model") + %path = Infrt.get_string("@CMAKE_BINARY_DIR@/multi_fc_model") %map = dt.load_params(%path) %size = dt.tensor_map_get_size(%map) -> i32 - infrt.print.i32 %size + Infrt.print.i32 %size - %a = dt.tensor_map_get_tensor(%map) {name="fc_bias"} -> !infrt.tensor + %a = dt.tensor_map_get_tensor(%map) {name="fc_bias"} -> !infrt.dense_tensor // CHECK: tensor: shape=shape[2], values=[0, 0] - dt.print_tensor (%a : !infrt.tensor) + dt.print_tensor (%a : !infrt.dense_tensor) - infrt.return + Infrt.return } diff --git a/paddle/infrt/tests/dialect/tensor/tensor_shape.mlir b/paddle/infrt/tests/dialect/tensor/tensor_shape.mlir index 09210078b9d7d139f2bc2534acf07e83aa1146bb..5623aef71aa2c33ff0bd3524855c56e9dcab5e9b 100644 --- a/paddle/infrt/tests/dialect/tensor/tensor_shape.mlir +++ b/paddle/infrt/tests/dialect/tensor/tensor_shape.mlir @@ -4,5 +4,5 @@ func @build_tensor1() { %a = ts.build_shape [1:i64, 57:i64, 92:i64] // CHECK: shape[1,57,92] ts.print_shape %a - infrt.return + Infrt.return } diff --git a/paddle/infrt/tests/dialect/tensor/tensor_type.mlir b/paddle/infrt/tests/dialect/tensor/tensor_type.mlir index 01a2f7df32608ad64d2929b4b24f96cf4e5062c4..e580634055a72eae66196f67c8321c308599a1af 100644 --- a/paddle/infrt/tests/dialect/tensor/tensor_type.mlir +++ b/paddle/infrt/tests/dialect/tensor/tensor_type.mlir @@ -1,10 +1,10 @@ // RUN: infrtexec -i %s | FileCheck %s // CHECK-LABEL: test_tensor_type func @test_tensor_type() { - %a = dt.create_uninit_tensor.f32 [3, 4] -> !infrt.tensor - dt.fill_tensor_with_constant.f32 (%a : !infrt.tensor) {value=1.0:f32} + %a = dt.create_uninit_tensor.f32 [3, 4] -> !infrt.dense_tensor + dt.fill_tensor_with_constant.f32 (%a : !infrt.dense_tensor) {value=1.0:f32} // CHECK: tensor: shape=shape[3,4], values=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] - dt.print_tensor (%a : !infrt.tensor) + dt.print_tensor (%a : !infrt.dense_tensor) - infrt.return + Infrt.return } diff --git a/paddle/infrt/tests/dialect/tensor_shape.mlir b/paddle/infrt/tests/dialect/tensor_shape.mlir index 09210078b9d7d139f2bc2534acf07e83aa1146bb..5623aef71aa2c33ff0bd3524855c56e9dcab5e9b 100644 --- a/paddle/infrt/tests/dialect/tensor_shape.mlir +++ b/paddle/infrt/tests/dialect/tensor_shape.mlir @@ -4,5 +4,5 @@ func @build_tensor1() { %a = ts.build_shape [1:i64, 57:i64, 92:i64] // CHECK: shape[1,57,92] ts.print_shape %a - infrt.return + Infrt.return } diff --git a/paddle/infrt/tests/dialect/tensor_type.mlir b/paddle/infrt/tests/dialect/tensor_type.mlir index 01a2f7df32608ad64d2929b4b24f96cf4e5062c4..e580634055a72eae66196f67c8321c308599a1af 100644 --- a/paddle/infrt/tests/dialect/tensor_type.mlir +++ b/paddle/infrt/tests/dialect/tensor_type.mlir @@ -1,10 +1,10 @@ // RUN: infrtexec -i %s | FileCheck %s // CHECK-LABEL: test_tensor_type func @test_tensor_type() { - %a = dt.create_uninit_tensor.f32 [3, 4] -> !infrt.tensor - dt.fill_tensor_with_constant.f32 (%a : !infrt.tensor) {value=1.0:f32} + %a = dt.create_uninit_tensor.f32 [3, 4] -> !infrt.dense_tensor + dt.fill_tensor_with_constant.f32 (%a : !infrt.dense_tensor) {value=1.0:f32} // CHECK: tensor: shape=shape[3,4], values=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] - dt.print_tensor (%a : !infrt.tensor) + dt.print_tensor (%a : !infrt.dense_tensor) - infrt.return + Infrt.return } diff --git a/paddle/phi/api/all.h b/paddle/phi/api/all.h index 8d840214092ba9b1d7e6cc351cee1abfc816e7f8..06f3cd844760616b44a1bece9a889a1a2a5f61e9 100644 --- 
a/paddle/phi/api/all.h +++ b/paddle/phi/api/all.h @@ -41,7 +41,6 @@ limitations under the License. */ #include "paddle/phi/api/ext/dispatch.h" #include "paddle/phi/api/ext/dll_decl.h" #include "paddle/phi/api/ext/exception.h" -#include "paddle/phi/api/ext/op_kernel_info.h" #include "paddle/phi/api/ext/op_meta_info.h" #include "paddle/phi/api/ext/place.h" #include "paddle/phi/api/ext/tensor_compat.h" diff --git a/paddle/phi/api/ext/dispatch.h b/paddle/phi/api/ext/dispatch.h index 4e5fa879a2cfc759cea753be8db19e116d91669e..6b6d0ae7fe7230263454d0bf08da40e4a793549b 100644 --- a/paddle/phi/api/ext/dispatch.h +++ b/paddle/phi/api/ext/dispatch.h @@ -292,7 +292,7 @@ namespace paddle { paddle::experimental::complex128, \ __VA_ARGS__) \ default: \ - PADDLE_THROW(paddle::platform::errors::InvalidArgument( \ + PADDLE_THROW(phi::errors::InvalidArgument( \ "Invalid enum data type `%d`.", static_cast(__dtype__))); \ } \ }() diff --git a/paddle/phi/api/ext/op_kernel_info.h b/paddle/phi/api/ext/op_kernel_info.h deleted file mode 100644 index b52b0abe9e745d7a559a4f4752bb9a77e4137245..0000000000000000000000000000000000000000 --- a/paddle/phi/api/ext/op_kernel_info.h +++ /dev/null @@ -1,1256 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include -#include -#include -#include - -#include "paddle/phi/api/ext/dll_decl.h" -#include "paddle/phi/api/ext/exception.h" -#include "paddle/phi/api/ext/op_meta_info.h" -#include "paddle/phi/api/include/tensor.h" -#include "paddle/phi/common/scalar.h" -#include "paddle/phi/common/scalar_array.h" -#include "paddle/utils/any.h" -#include "paddle/utils/small_vector.h" - -#include "paddle/phi/common/data_type.h" - -/** - * Custom Kernel Info Define. - * - * Used to maintain custom kernel core information before registering. - * Pten is working on exposing headers, custom kernel depends on them, and - * we prefer outer users following pten-kernel-function-style and registering - * macro. So, we have to re-implement some structs or class and functions to - * make sure users' custom kernel functions can be registered to pten. - * - * TODO(Aganlengzi): We should upgrade following pten. - */ - -namespace paddle { -namespace framework { -class PADDLE_API OpKernelInfoHelper; -} // namespace framework - -// TODO(Aganlengzi): Simple DeviceContext temporarily for stream getting -// before phi::DeviceContext is exposed. 
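The file removed below (op_kernel_info.h) implemented, through the PD_SPECIALIZE_KernelCallHelper_* macros, the usual kernel-call-helper recursion: peel one parameter type at a time off the kernel's signature, fetch the matching value from flat input/attribute/output vectors, and finally invoke the typed kernel. A much-reduced standalone sketch of that recursion (inputs are plain floats and outputs are float pointers here; this is a simplification for illustration, not Paddle's actual helper):

#include <cassert>
#include <vector>

struct Frame {
  std::vector<float> inputs;    // flat argument storage, as in a kernel frame
  std::vector<float*> outputs;
};

template <typename Fn, Fn fn>
struct KernelImpl;

template <typename... Args, void (*fn)(Args...)>
struct KernelImpl<void (*)(Args...), fn> {
  static void Compute(Frame* frame) {
    // Append End so the recursion has a well-defined terminating case.
    Helper<Args..., End>::template Call<0, 0>(frame);
  }

 private:
  struct End {};

  template <typename... Rest>
  struct Helper;

  // Next kernel parameter is an input: take it from frame->inputs.
  template <typename... Rest>
  struct Helper<const float&, Rest...> {
    template <int in, int out, typename... Prev>
    static void Call(Frame* f, Prev&&... prev) {
      Helper<Rest...>::template Call<in + 1, out>(f, prev..., f->inputs[in]);
    }
  };

  // Next kernel parameter is an output: take it from frame->outputs.
  template <typename... Rest>
  struct Helper<float*, Rest...> {
    template <int in, int out, typename... Prev>
    static void Call(Frame* f, Prev&&... prev) {
      Helper<Rest...>::template Call<in, out + 1>(f, prev..., f->outputs[out]);
    }
  };

  // Only End is left: every parameter has been unpacked, call the kernel.
  template <typename T>
  struct Helper<T> {
    template <int in, int out, typename... Prev>
    static void Call(Frame*, Prev&&... prev) {
      fn(prev...);
    }
  };
};

void AddKernel(const float& a, const float& b, float* out) { *out = a + b; }

int main() {
  float result = 0.f;
  Frame frame{{1.f, 2.f}, {&result}};
  KernelImpl<decltype(&AddKernel), &AddKernel>::Compute(&frame);
  assert(result == 3.f);
  return 0;
}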
-class DeviceContext { - public: - DeviceContext() { stream_ = nullptr; } - void set_stream(void* stream) { stream_ = stream; } - void* stream() const { return stream_; } - - private: - void* stream_; -}; -class CPUContext : public DeviceContext {}; - -// TODO(Aganlengzi): Use paddle::Tensor before DenseTensor is exposed -using Tensor = paddle::experimental::Tensor; -using Scalar = phi::Scalar; -using ScalarArray = phi::ScalarArray; - -// Record custom kernel core information -// We can not use phi::KernelFn directly, so users' custom kernel function -// is signatured to `CustomKernelFunc', notice that the first parameter is -// fixed to `const DeviceContext&'. -using CustomKernelFunc = - void (*)(const DeviceContext& dev_ctx, - const std::vector& inputs, - const std::vector>& vec_inputs, - const std::vector& attrs, - std::vector* outputs, - std::vector>* vec_outputs); - -////////////////////// Kernel Function (PD_PT_KERNEL) //////////////////////// -#define PD_SPECIALIZE_KernelCallHelper_FOR_DEV_CONTEXT(device_ctx) \ - template \ - struct CustomComputeCallHelper { \ - template \ - static void Compute(const DeviceContext& dev_ctx, \ - const std::vector& inputs, \ - const std::vector>& vec_inputs, \ - const std::vector& attrs, \ - std::vector* outputs, \ - std::vector>* vec_outputs, \ - PreviousArgs... pargs) { \ - static_assert(in_idx == 0, \ - "Kernel's DeviceContext should appear before Inputs."); \ - static_assert(vec_in_idx == 0, \ - "Kernel's DeviceContext should appear before Inputs."); \ - static_assert( \ - attr_idx == 0, \ - "Kernel's DeviceContext should appear before Attributes."); \ - static_assert(out_idx == 0, \ - "Kernel's DeviceContext should appear before Outputs."); \ - static_assert(vec_out_idx == 0, \ - "Kernel's DeviceContext should appear before Outputs."); \ - const device_ctx& arg = static_cast(dev_ctx); \ - CustomComputeCallHelper::template Compute( \ - dev_ctx, \ - inputs, \ - vec_inputs, \ - attrs, \ - outputs, \ - vec_outputs, \ - pargs..., \ - arg); \ - } \ - } - -#define PD_SPECIALIZE_KernelCallHelper_FOR_INPUT(tensor_type) \ - template \ - struct CustomComputeCallHelper { \ - template \ - static void Compute(const DeviceContext& dev_ctx, \ - const std::vector& inputs, \ - const std::vector>& vec_inputs, \ - const std::vector& attrs, \ - std::vector* outputs, \ - std::vector>* vec_outputs, \ - PreviousArgs... pargs) { \ - static_assert(attr_idx == 0, \ - "Kernel's Input should appear before Attributes."); \ - static_assert(out_idx == 0, \ - "Kernel's Input should appear before Outputs."); \ - static_assert(vec_out_idx == 0, \ - "Kernel's Input should appear before Outputs."); \ - const Tensor& arg = inputs[in_idx]; \ - CustomComputeCallHelper::template Compute( \ - dev_ctx, \ - inputs, \ - vec_inputs, \ - attrs, \ - outputs, \ - vec_outputs, \ - pargs..., \ - arg); \ - } \ - } - -#define PD_SPECIALIZE_KernelCallHelper_FOR_MULTI_INPUT(tensor_type) \ - template \ - struct CustomComputeCallHelper&, Tail...> { \ - template \ - static void Compute(const DeviceContext& dev_ctx, \ - const std::vector& inputs, \ - const std::vector>& vec_inputs, \ - const std::vector& attrs, \ - std::vector* outputs, \ - std::vector>* vec_outputs, \ - PreviousArgs... 
pargs) { \ - static_assert(attr_idx == 0, \ - "Kernel's Input should appear before Attributes."); \ - static_assert(out_idx == 0, \ - "Kernel's Input should appear before Outputs."); \ - static_assert(vec_out_idx == 0, \ - "Kernel's Input should appear before Outputs."); \ - const std::vector& arg = vec_inputs[vec_in_idx]; \ - CustomComputeCallHelper::template Compute( \ - dev_ctx, \ - inputs, \ - vec_inputs, \ - attrs, \ - outputs, \ - vec_outputs, \ - pargs..., \ - arg); \ - } \ - } - -#define PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(attr_type) \ - template \ - struct CustomComputeCallHelper { \ - template \ - static void Compute(const DeviceContext& dev_ctx, \ - const std::vector& inputs, \ - const std::vector>& vec_inputs, \ - const std::vector& attrs, \ - std::vector* outputs, \ - std::vector>* vec_outputs, \ - PreviousArgs... pargs) { \ - static_assert(out_idx == 0, \ - "Kernel's Attributes should appear before Outputs."); \ - static_assert(vec_out_idx == 0, \ - "Kernel's Attributes should appear before Outputs."); \ - try { \ - attr_type arg = paddle::any_cast(attrs[attr_idx]); \ - return CustomComputeCallHelper::template Compute< \ - dev_ctx_idx, \ - in_idx, \ - vec_in_idx, \ - attr_idx + 1, \ - out_idx, \ - vec_out_idx>(dev_ctx, \ - inputs, \ - vec_inputs, \ - attrs, \ - outputs, \ - vec_outputs, \ - pargs..., \ - arg); \ - } catch (paddle::bad_any_cast&) { \ - PD_THROW( \ - "Attribute cast error in custom operator. Expected " #attr_type \ - " value."); \ - } \ - } \ - } - -#define PD_SPECIALIZE_KernelCallHelper_FOR_OUTPUT(tensor_type) \ - template \ - struct CustomComputeCallHelper { \ - template \ - static void Compute(const DeviceContext& dev_ctx, \ - const std::vector& inputs, \ - const std::vector>& vec_inputs, \ - const std::vector& attrs, \ - std::vector* outputs, \ - std::vector>* vec_outputs, \ - PreviousArgs... pargs) { \ - tensor_type* arg = (*outputs)[out_idx]; \ - CustomComputeCallHelper::template Compute( \ - dev_ctx, \ - inputs, \ - vec_inputs, \ - attrs, \ - outputs, \ - vec_outputs, \ - pargs..., \ - arg); \ - } \ - } - -#define PD_SPECIALIZE_KernelCallHelper_FOR_MULTI_OUTPUT(tensor_type) \ - template \ - struct CustomComputeCallHelper, Tail...> { \ - template \ - static void Compute(const DeviceContext& dev_ctx, \ - const std::vector& inputs, \ - const std::vector>& vec_inputs, \ - const std::vector& attrs, \ - std::vector* outputs, \ - std::vector>* vec_outputs, \ - PreviousArgs... pargs) { \ - std::vector arg = (*vec_outputs)[vec_out_idx]; \ - CustomComputeCallHelper::template Compute( \ - dev_ctx, \ - inputs, \ - vec_inputs, \ - attrs, \ - outputs, \ - vec_outputs, \ - pargs..., \ - arg); \ - } \ - } - -template -struct PtenTypeTag {}; - -template -struct CustomKernelFuncImpl; - -template -struct CustomKernelFuncImpl { - static void Compute(const DeviceContext& dev_ctx, - const std::vector& inputs, - const std::vector>& vec_inputs, - const std::vector& attrs, - std::vector* outputs, - std::vector>* vec_outputs) { - CustomComputeCallHelper>:: - template Compute<0, 0, 0, 0, 0, 0>( - dev_ctx, inputs, vec_inputs, attrs, outputs, vec_outputs); - } - - // NOTE: Tensor in args is paddle::Tensor but not DenseTensor - static void VariadicCompute(const DeviceContext& dev_ctx, Args... 
args) { - return impl_fn(static_cast(dev_ctx), std::forward(args)...); - } - - private: - template - struct CustomComputeCallHelper; - - /* DeviceContext Helpers */ - PD_SPECIALIZE_KernelCallHelper_FOR_DEV_CONTEXT(CPUContext); - - /* Input Helpers */ - PD_SPECIALIZE_KernelCallHelper_FOR_INPUT(Tensor); - PD_SPECIALIZE_KernelCallHelper_FOR_MULTI_INPUT(Tensor); - - /* Attribute Helpers */ - PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(bool); - PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(float); - PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(double); - PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(int); - PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(int64_t); - PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(phi::dtype::float16); - PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(DataType); - PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const Scalar&); - PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const ScalarArray&); - PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const std::vector&); - PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const std::vector&); - - /* Output Helpers */ - PD_SPECIALIZE_KernelCallHelper_FOR_OUTPUT(Tensor); - PD_SPECIALIZE_KernelCallHelper_FOR_MULTI_OUTPUT(Tensor); - - // End: base template - template - struct CustomComputeCallHelper> { - template - static void Compute(const DeviceContext& dev_ctx, - const std::vector& inputs, - const std::vector>& vec_inputs, - const std::vector& attrs, - std::vector* outputs, - std::vector>* vec_outputs, - DevCtx device_ctx, - Args... args) { - return impl_fn(device_ctx, args...); - } - }; -}; - -#define PD_PT_KERNEL(...) \ - ::paddle::CustomKernelFuncImpl::Compute - -#define PD_PT_VARIADIC_KERNEL(...) \ - reinterpret_cast( \ - &::paddle::CustomKernelFuncImpl::VariadicCompute) - -////////////////////// Op Kernel Info depended structs ////////////////////// -// TODO(Aganlengzi): Re-define TensorArgDef and AttributeArgDef temporarily. -// TensorArgDef follows phi::TensorArgDef in kernel_factory.h, the -// difference is that custom_kernel needs extra `is_vector' to ensure we can -// deal with case like vector with only one element. -struct TensorArgDef { - phi::Backend backend; - phi::DataLayout layout; - phi::DataType dtype; - bool is_vector{false}; - - TensorArgDef(phi::Backend in_backend, - phi::DataLayout in_layout, - phi::DataType in_dtype, - bool is_vector = false) - : backend(in_backend), - layout(in_layout), - dtype(in_dtype), - is_vector(is_vector) {} - - TensorArgDef& SetBackend(phi::Backend in_backend) { - backend = in_backend; - return *this; - } - - TensorArgDef& SetDataLayout(phi::DataLayout in_layout) { - layout = in_layout; - return *this; - } - - TensorArgDef& SetDataType(phi::DataType in_dtype) { - dtype = in_dtype; - return *this; - } -}; - -// AttributeArgDef follows phi::AttributeArgDef in kernel_factory.h -struct AttributeArgDef { - std::type_index type_index; - - explicit AttributeArgDef(std::type_index type_index) - : type_index(type_index) {} -}; - -////////////////////// Op Kernel Info ////////////////////// -// OpKernelInfo stores all info parsed from user kernel function, includes: -// 0. op_name and kernel key(backend, data_layout and data_type) -// 1. unified custom kernel function -// 2. variadic kernel function(use paddle::Tensor) -// 3. 
args info and user defined change for specific arg -class PADDLE_API OpKernelInfo { - public: - explicit OpKernelInfo(const std::string& op_name, - phi::Backend backend, - phi::DataLayout data_layout, - phi::DataType data_type) - : op_name_(op_name), - backend_(backend), - layout_(data_layout), - dtype_(data_type) {} - - // format: PD_PT_KERNEL(...) - OpKernelInfo& SetKernelFn(CustomKernelFunc&& func); - // format: PD_PT_VARIADIC_KERNEL(...) - OpKernelInfo& SetVariadicKernelFn(void* func); - - // for Args parsing and storing - void AppendInput(phi::Backend backend, - phi::DataLayout layout, - phi::DataType dtype, - bool is_vector = false) { - input_defs_.emplace_back(TensorArgDef(backend, layout, dtype, is_vector)); - } - - void AppendOutput(phi::Backend backend, - phi::DataLayout layout, - phi::DataType dtype, - bool is_vector = false) { - output_defs_.emplace_back(TensorArgDef(backend, layout, dtype, is_vector)); - } - - void AppendAttribute(std::type_index type_index) { - attribute_defs_.emplace_back(AttributeArgDef(type_index)); - } - - // for Args user-def function - TensorArgDef& InputAt(size_t idx) { return input_defs_.at(idx); } - TensorArgDef& OutputAt(size_t idx) { return output_defs_.at(idx); } - - const phi::Backend& GetBackend() const { return backend_; } - const phi::DataLayout& GetDataLayout() const { return layout_; } - const phi::DataType& GetDataType() const { return dtype_; } - - private: - friend class framework::OpKernelInfoHelper; - - // 1. op info - std::string op_name_; - - // 2. kernel key info - phi::Backend backend_{phi::Backend::UNDEFINED}; - phi::DataLayout layout_{phi::DataLayout::UNDEFINED}; - phi::DataType dtype_{phi::DataType::UNDEFINED}; - - // 3. args info - paddle::SmallVector input_defs_{{}}; - paddle::SmallVector output_defs_{{}}; - paddle::SmallVector attribute_defs_{{}}; - - // 4. 
func info - CustomKernelFunc kernel_fn_{nullptr}; - void* variadic_kernel_fn_{nullptr}; -}; - -////////////////////// Op Kernel Args Parser ////////////////////// -// Define CustomKernelArgsParseFunctor for args parsing -// We have to store parsed info into OpKernelInfo before -// mapping to phi::KernelArgsDef in phi::Kernel -template -struct CustomKernelArgsParseFunctor; - -template -struct CustomKernelArgsParseFunctor { - using Args = std::tuple; - enum : std::size_t { Arity = sizeof...(Args_) }; - using Indices = std::make_index_sequence; - template - using Arg = typename std::tuple_element::type; - - static void Parse(OpKernelInfo* op_kernel_info) { - const phi::Backend& backend = op_kernel_info->GetBackend(); - const phi::DataLayout& layout = op_kernel_info->GetDataLayout(); - const phi::DataType& dtype = op_kernel_info->GetDataType(); - - auto default_tensor_layout = phi::DataLayout::NCHW; - if (layout != phi::DataLayout::ANY) { - default_tensor_layout = layout; - } - auto args_type = ParseArgType(Indices{}); - for (auto arg_type : args_type) { - if (arg_type == std::type_index(typeid(const CPUContext&))) { - // do nothing, skip context arg now - } else if (arg_type == std::type_index(typeid(const Tensor&))) { - op_kernel_info->AppendInput(backend, default_tensor_layout, dtype); - } else if (arg_type == - std::type_index(typeid(const std::vector&))) { - op_kernel_info->AppendInput( - backend, default_tensor_layout, dtype, true); - } else if (arg_type == std::type_index(typeid(Tensor*))) { - op_kernel_info->AppendOutput(backend, default_tensor_layout, dtype); - } else if (arg_type == std::type_index(typeid(std::vector))) { - op_kernel_info->AppendOutput( - backend, default_tensor_layout, dtype, true); - } else { - op_kernel_info->AppendAttribute(arg_type); - } - } - } - - private: - template - static std::vector ParseArgType( - std::index_sequence) { - return {std::type_index(typeid(Arg))...}; - } -}; - -#define PD_PT_ARGS_PARSE(...) \ - ::paddle::CustomKernelArgsParseFunctor::Parse - -//////////////// Op Kernel Info Map ///////////////// -// all user custom kernels information are stored in this map -class PADDLE_API OpKernelInfoMap { - public: - static OpKernelInfoMap& Instance() { - static OpKernelInfoMap g_custom_kernel_info_map; - return g_custom_kernel_info_map; - } - - std::vector& operator[](const std::string& name); - - const std::unordered_map>& GetMap() - const; - - private: - OpKernelInfoMap() = default; - std::unordered_map> map_; - - PD_DISABLE_COPY_AND_ASSIGN(OpKernelInfoMap); -}; - -//////////////// Op Kernel Info Builder ///////////////// -// format: PD_PT_ARGS_PARSE(...) 
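// Illustrative sketch: the PD_SPECIALIZE_KernelCallHelper_* helpers above peel
// one typed argument at a time off the user kernel's signature and forward it
// from the type-erased containers into the real call. A minimal, self-contained
// version of the same technique, assuming a kernel that takes only `double`
// inputs and a single `double*` output (no device context, attributes, or
// vector arguments); `SimpleKernelImpl` and `Add` are hypothetical names:

#include <vector>

template <typename T>
struct TypeTag {};

template <typename Func, Func fn>
struct SimpleKernelImpl;

template <typename... Args, void (*fn)(Args...)>
struct SimpleKernelImpl<void (*)(Args...), fn> {
  static void Compute(const std::vector<double>& ins, double* out) {
    // Start peeling at input index 0; TypeTag<int> marks the end of Args.
    Helper<Args..., TypeTag<int>>::template Call<0>(ins, out);
  }

 private:
  template <typename... Rest>
  struct Helper;

  // Peel one scalar input and recurse on the rest of the signature.
  template <typename... Rest>
  struct Helper<double, Rest...> {
    template <int in_idx, typename... Prev>
    static void Call(const std::vector<double>& ins, double* out,
                     Prev... prev) {
      Helper<Rest...>::template Call<in_idx + 1>(ins, out, prev...,
                                                 ins[in_idx]);
    }
  };

  // Peel the output pointer without consuming an input slot.
  template <typename... Rest>
  struct Helper<double*, Rest...> {
    template <int in_idx, typename... Prev>
    static void Call(const std::vector<double>& ins, double* out,
                     Prev... prev) {
      Helper<Rest...>::template Call<in_idx>(ins, out, prev..., out);
    }
  };

  // End of the signature: every argument has been peeled, invoke the kernel.
  template <typename T>
  struct Helper<TypeTag<T>> {
    template <int in_idx, typename... Prev>
    static void Call(const std::vector<double>&, double*, Prev... prev) {
      fn(prev...);
    }
  };
};

// Usage with a hypothetical kernel:
//   void Add(double a, double b, double* out) { *out = a + b; }
//   double result = 0.0;
//   SimpleKernelImpl<decltype(&Add), &Add>::Compute({1.0, 2.0}, &result);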
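// Illustrative sketch: the OpKernelInfoBuilder declared below is driven by the
// PD_REGISTER_KERNEL macro defined further down in this header. Assuming a
// hypothetical user kernel `custom::AddKernel` registered under the equally
// hypothetical name `custom_add`, a registration against this API looks
// roughly like the following; the trailing brace block is the args_def_fn
// that the macro declares and the user fills in:

namespace custom {

template <typename T, typename Context>
void AddKernel(const Context& dev_ctx,
               const paddle::Tensor& x,
               const paddle::Tensor& y,
               paddle::Tensor* out) {
  // device-specific implementation elided
}

}  // namespace custom

PD_REGISTER_KERNEL(custom_add, CPU, ALL_LAYOUT, custom::AddKernel,
                   float, double) {
  // Optional per-argument overrides recorded on the parsed OpKernelInfo,
  // e.g. relax the layout requirement of the first input.
  kernel->InputAt(0).SetDataLayout(phi::DataLayout::ANY);
}

// Each cpp_dtype expands to one static OpKernelInfoBuilder that appends an
// OpKernelInfo entry to OpKernelInfoMap::Instance(), stores the type-erased
// kernel via PD_PT_KERNEL / PD_PT_VARIADIC_KERNEL, and runs PD_PT_ARGS_PARSE
// so input/output/attribute definitions are recorded before
// RegisterAllCustomKernel() hands them to the framework.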
-using CustomKernelArgsParseFn = void (*)(OpKernelInfo* op_kernel_info); -using CustomKernelArgsDefFn = void (*)(OpKernelInfo* kernel); - -class PADDLE_API OpKernelInfoBuilder { - public: - explicit OpKernelInfoBuilder(std::string&& op_name, - phi::Backend backend, - phi::DataLayout data_layout, - phi::DataType data_type); - - OpKernelInfoBuilder& SetKernelFn(CustomKernelFunc func); - OpKernelInfoBuilder& SetVariadicKernelFn(void* func); - OpKernelInfoBuilder& ArgsParse(CustomKernelArgsParseFn func); - OpKernelInfoBuilder& ArgsDef(CustomKernelArgsDefFn func); - - private: - // op name - std::string op_name_; - - // kernel key info - phi::Backend backend_{phi::Backend::UNDEFINED}; - phi::DataLayout layout_{phi::DataLayout::UNDEFINED}; - phi::DataType dtype_{phi::DataType::UNDEFINED}; - - // ref current info ptr - OpKernelInfo* info_ptr_; -}; -/////////////////////// Custom kernel register API ///////////////////////// -// For inference: compile directly with framework -// Call after PD_REGISTER_KERNEL(...) -void RegisterAllCustomKernel(); - -//////////////// Custom kernel register macro ///////////////////// -// Refer to paddle/phi/core/kernel_registry.h, we can not use -// PT_REGISTER_KERNEL directly, common macros and functions are -// not ready for custom kernel now. -// Difference: custom_kernel stores all kernels' info into global -// g_custom_kernel_info_map before loading and registering into -// pten kernel management. Only providing PD_REGISTER_KERNEL which -// supports 2 template arguments. - -#define PD_BACKEND(arg__) phi::Backend::arg__ -#define PD_DATALAYOUT(arg__) phi::DataLayout::arg__ -#define PD_DATATYPE(arg__) phi::DataType::arg__ - -#define PD_NARGS(...) _PD_NARGS((__VA_ARGS__, _PD_RESQ_N())) -#define _PD_NARGS(...) _PD_ARG_N(__VA_ARGS__) -#define _PD_ARG_N_EXPAND( \ - _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, N, ...) \ - N -#define _PD_ARG_N(args) _PD_ARG_N_EXPAND args -#define _PD_RESQ_N() 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 - -#define PD_CONCATENATE(arg1, arg2) PD_CONCATENATE1(arg1, arg2) -#define PD_CONCATENATE1(arg1, arg2) PD_CONCATENATE2(arg1, arg2) -#define PD_CONCATENATE2(arg1, arg2) arg1##arg2 - -#define PD_EXPAND(x) x - -#ifdef __COUNTER__ -#define PD_ID __COUNTER__ -#else -#define PD_ID __LINE__ -#endif - -#define PD_REGISTER_KERNEL(kernel_name, backend, layout, func, cpp_dtype, ...) \ - STATIC_ASSERT_GLOBAL_NAMESPACE( \ - _reg_custom_kernel_ns_check_##kernel_name##_##backend##_##layout, \ - "PD_REGISTER_KERNEL must be called in global namespace."); \ - _PD_REGISTER_2TA_KERNEL( \ - kernel_name, backend, layout, func, cpp_dtype, ##__VA_ARGS__) - -// WIN32 is not supported -#define _PD_REGISTER_2TA_KERNEL( \ - kernel_name, backend, layout, meta_kernel_fn, cpp_dtype, ...) \ - PD_KERNEL_INSTANTIATION(meta_kernel_fn, backend, cpp_dtype, ##__VA_ARGS__); \ - static void __PD_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout( \ - ::paddle::OpKernelInfo* kernel); \ - PD_KERNEL_REGISTRAR_INIT( \ - kernel_name, \ - backend, \ - layout, \ - &__PD_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout, \ - meta_kernel_fn, \ - cpp_dtype, \ - ##__VA_ARGS__); \ - void __PD_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout( \ - ::paddle::OpKernelInfo* kernel) - -#define PD_KERNEL_INSTANTIATION(meta_kernel_fn, backend, cpp_dtype, ...) 
\ - _PD_KERNEL_INSTANTIATION(PD_NARGS(cpp_dtype, ##__VA_ARGS__), \ - meta_kernel_fn, \ - backend, \ - cpp_dtype, \ - ##__VA_ARGS__) - -#define _PD_KERNEL_INSTANTIATION(N, meta_kernel_fn, backend, cpp_dtype, ...) \ - PD_CONCATENATE(_PD_KERNEL_INSTANTIATION_, N) \ - (meta_kernel_fn, backend, cpp_dtype, ##__VA_ARGS__) - -#define _PD_KERNEL_INSTANTIATION_1(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn -#define _PD_KERNEL_INSTANTIATION_2(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PD_EXPAND(_PD_KERNEL_INSTANTIATION_1(meta_kernel_fn, backend, ##__VA_ARGS__)) -#define _PD_KERNEL_INSTANTIATION_3(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PD_EXPAND(_PD_KERNEL_INSTANTIATION_2(meta_kernel_fn, backend, ##__VA_ARGS__)) -#define _PD_KERNEL_INSTANTIATION_4(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PD_EXPAND(_PD_KERNEL_INSTANTIATION_3(meta_kernel_fn, backend, ##__VA_ARGS__)) -#define _PD_KERNEL_INSTANTIATION_5(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PD_EXPAND(_PD_KERNEL_INSTANTIATION_4(meta_kernel_fn, backend, ##__VA_ARGS__)) -#define _PD_KERNEL_INSTANTIATION_6(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PD_EXPAND(_PD_KERNEL_INSTANTIATION_5(meta_kernel_fn, backend, ##__VA_ARGS__)) -#define _PD_KERNEL_INSTANTIATION_7(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PD_EXPAND(_PD_KERNEL_INSTANTIATION_6(meta_kernel_fn, backend, ##__VA_ARGS__)) -#define _PD_KERNEL_INSTANTIATION_8(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PD_EXPAND(_PD_KERNEL_INSTANTIATION_7(meta_kernel_fn, backend, ##__VA_ARGS__)) -#define _PD_KERNEL_INSTANTIATION_9(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PD_EXPAND(_PD_KERNEL_INSTANTIATION_8(meta_kernel_fn, backend, ##__VA_ARGS__)) -#define _PD_KERNEL_INSTANTIATION_10(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PD_EXPAND(_PD_KERNEL_INSTANTIATION_9(meta_kernel_fn, backend, ##__VA_ARGS__)) -#define _PD_KERNEL_INSTANTIATION_11(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PD_EXPAND(_PD_KERNEL_INSTANTIATION_10(meta_kernel_fn, backend, ##__VA_ARGS__)) -#define _PD_KERNEL_INSTANTIATION_12(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PD_EXPAND(_PD_KERNEL_INSTANTIATION_11(meta_kernel_fn, backend, ##__VA_ARGS__)) -#define _PD_KERNEL_INSTANTIATION_13(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PD_EXPAND(_PD_KERNEL_INSTANTIATION_12(meta_kernel_fn, backend, ##__VA_ARGS__)) -#define _PD_KERNEL_INSTANTIATION_14(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PD_EXPAND(_PD_KERNEL_INSTANTIATION_13(meta_kernel_fn, backend, ##__VA_ARGS__)) -#define _PD_KERNEL_INSTANTIATION_15(meta_kernel_fn, backend, cpp_dtype, ...) 
\ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PD_EXPAND(_PD_KERNEL_INSTANTIATION_14(meta_kernel_fn, backend, ##__VA_ARGS__)) - -#define PD_KERNEL_REGISTRAR_INIT( \ - kernel_name, backend, layout, args_def_fn, meta_kernel_fn, cpp_dtype, ...) \ - _PD_KERNEL_REGISTRAR_INIT(PD_NARGS(cpp_dtype, ##__VA_ARGS__), \ - kernel_name, \ - backend, \ - layout, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ##__VA_ARGS__) - -// clang-format off - -/* The =pre-commit always treats this macro into the wrong format, - and multi-line macros cannot be skipped with NOLINT.*/ -#define _PD_KERNEL_REGISTRAR_INIT(N, \ - kernel_name, \ - backend, \ - layout, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - PD_CONCATENATE(_PD_KERNEL_REGISTRAR_INIT_, N) ( \ - kernel_name, \ - backend, \ - layout, \ - PD_ID, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ##__VA_ARGS__) - -// clang-format on - -#define _PD_KERNEL_REGISTRAR_INIT_1(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static ::paddle::OpKernelInfoBuilder PD_CONCATENATE( \ - custom_kernel_info_##kernel_name##_##backend##_##layout##_, \ - registrar_id) = \ - ::paddle::OpKernelInfoBuilder( \ - #kernel_name, \ - PD_BACKEND(backend), \ - PD_DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type()) \ - .SetKernelFn(PD_PT_KERNEL( \ - meta_kernel_fn)) \ - .SetVariadicKernelFn(PD_PT_VARIADIC_KERNEL( \ - meta_kernel_fn)) \ - .ArgsParse(PD_PT_ARGS_PARSE( \ - meta_kernel_fn)) \ - .ArgsDef(args_def_fn); - -#define _PD_KERNEL_REGISTRAR_INIT_2(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static ::paddle::OpKernelInfoBuilder PD_CONCATENATE( \ - custom_kernel_info_##kernel_name##_##backend##_##layout##_, \ - registrar_id) = \ - ::paddle::OpKernelInfoBuilder( \ - #kernel_name, \ - PD_BACKEND(backend), \ - PD_DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type()) \ - .SetKernelFn(PD_PT_KERNEL( \ - meta_kernel_fn)) \ - .SetVariadicKernelFn(PD_PT_VARIADIC_KERNEL( \ - meta_kernel_fn)) \ - .ArgsParse(PD_PT_ARGS_PARSE( \ - meta_kernel_fn)) \ - .ArgsDef(args_def_fn); \ - PD_EXPAND(_PD_KERNEL_REGISTRAR_INIT_1(kernel_name, \ - backend, \ - layout, \ - PD_ID, \ - args_def_fn, \ - meta_kernel_fn, \ - ##__VA_ARGS__)) - -#define _PD_KERNEL_REGISTRAR_INIT_3(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static ::paddle::OpKernelInfoBuilder PD_CONCATENATE( \ - custom_kernel_info_##kernel_name##_##backend##_##layout##_, \ - registrar_id) = \ - ::paddle::OpKernelInfoBuilder( \ - #kernel_name, \ - PD_BACKEND(backend), \ - PD_DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type()) \ - .SetKernelFn(PD_PT_KERNEL( \ - meta_kernel_fn)) \ - .SetVariadicKernelFn(PD_PT_VARIADIC_KERNEL( \ - meta_kernel_fn)) \ - .ArgsParse(PD_PT_ARGS_PARSE( \ - meta_kernel_fn)) \ - .ArgsDef(args_def_fn); \ - PD_EXPAND(_PD_KERNEL_REGISTRAR_INIT_2(kernel_name, \ - backend, \ - layout, \ - PD_ID, \ - args_def_fn, \ - meta_kernel_fn, \ - ##__VA_ARGS__)) - -#define _PD_KERNEL_REGISTRAR_INIT_4(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) 
\ - static ::paddle::OpKernelInfoBuilder PD_CONCATENATE( \ - custom_kernel_info_##kernel_name##_##backend##_##layout##_, \ - registrar_id) = \ - ::paddle::OpKernelInfoBuilder( \ - #kernel_name, \ - PD_BACKEND(backend), \ - PD_DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type()) \ - .SetKernelFn(PD_PT_KERNEL( \ - meta_kernel_fn)) \ - .SetVariadicKernelFn(PD_PT_VARIADIC_KERNEL( \ - meta_kernel_fn)) \ - .ArgsParse(PD_PT_ARGS_PARSE( \ - meta_kernel_fn)) \ - .ArgsDef(args_def_fn); \ - PD_EXPAND(_PD_KERNEL_REGISTRAR_INIT_3(kernel_name, \ - backend, \ - layout, \ - PD_ID, \ - args_def_fn, \ - meta_kernel_fn, \ - ##__VA_ARGS__)) - -#define _PD_KERNEL_REGISTRAR_INIT_5(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static ::paddle::OpKernelInfoBuilder PD_CONCATENATE( \ - custom_kernel_info_##kernel_name##_##backend##_##layout##_, \ - registrar_id) = \ - ::paddle::OpKernelInfoBuilder( \ - #kernel_name, \ - PD_BACKEND(backend), \ - PD_DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type()) \ - .SetKernelFn(PD_PT_KERNEL( \ - meta_kernel_fn)) \ - .SetVariadicKernelFn(PD_PT_VARIADIC_KERNEL( \ - meta_kernel_fn)) \ - .ArgsParse(PD_PT_ARGS_PARSE( \ - meta_kernel_fn)) \ - .ArgsDef(args_def_fn); \ - PD_EXPAND(_PD_KERNEL_REGISTRAR_INIT_4(kernel_name, \ - backend, \ - layout, \ - PD_ID, \ - args_def_fn, \ - meta_kernel_fn, \ - ##__VA_ARGS__)) - -#define _PD_KERNEL_REGISTRAR_INIT_6(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static ::paddle::OpKernelInfoBuilder PD_CONCATENATE( \ - custom_kernel_info_##kernel_name##_##backend##_##layout##_, \ - registrar_id) = \ - ::paddle::OpKernelInfoBuilder( \ - #kernel_name, \ - PD_BACKEND(backend), \ - PD_DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type()) \ - .SetKernelFn(PD_PT_KERNEL( \ - meta_kernel_fn)) \ - .SetVariadicKernelFn(PD_PT_VARIADIC_KERNEL( \ - meta_kernel_fn)) \ - .ArgsParse(PD_PT_ARGS_PARSE( \ - meta_kernel_fn)) \ - .ArgsDef(args_def_fn); \ - PD_EXPAND(_PD_KERNEL_REGISTRAR_INIT_5(kernel_name, \ - backend, \ - layout, \ - PD_ID, \ - args_def_fn, \ - meta_kernel_fn, \ - ##__VA_ARGS__)) - -#define _PD_KERNEL_REGISTRAR_INIT_7(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static ::paddle::OpKernelInfoBuilder PD_CONCATENATE( \ - custom_kernel_info_##kernel_name##_##backend##_##layout##_, \ - registrar_id) = \ - ::paddle::OpKernelInfoBuilder( \ - #kernel_name, \ - PD_BACKEND(backend), \ - PD_DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type()) \ - .SetKernelFn(PD_PT_KERNEL( \ - meta_kernel_fn)) \ - .SetVariadicKernelFn(PD_PT_VARIADIC_KERNEL( \ - meta_kernel_fn)) \ - .ArgsParse(PD_PT_ARGS_PARSE( \ - meta_kernel_fn)) \ - .ArgsDef(args_def_fn); \ - PD_EXPAND(_PD_KERNEL_REGISTRAR_INIT_6(kernel_name, \ - backend, \ - layout, \ - PD_ID, \ - args_def_fn, \ - meta_kernel_fn, \ - ##__VA_ARGS__)) - -#define _PD_KERNEL_REGISTRAR_INIT_8(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) 
\ - static ::paddle::OpKernelInfoBuilder PD_CONCATENATE( \ - custom_kernel_info_##kernel_name##_##backend##_##layout##_, \ - registrar_id) = \ - ::paddle::OpKernelInfoBuilder( \ - #kernel_name, \ - PD_BACKEND(backend), \ - PD_DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type()) \ - .SetKernelFn(PD_PT_KERNEL( \ - meta_kernel_fn)) \ - .SetVariadicKernelFn(PD_PT_VARIADIC_KERNEL( \ - meta_kernel_fn)) \ - .ArgsParse(PD_PT_ARGS_PARSE( \ - meta_kernel_fn)) \ - .ArgsDef(args_def_fn); \ - PD_EXPAND(_PD_KERNEL_REGISTRAR_INIT_7(kernel_name, \ - backend, \ - layout, \ - PD_ID, \ - args_def_fn, \ - meta_kernel_fn, \ - ##__VA_ARGS__)) - -#define _PD_KERNEL_REGISTRAR_INIT_9(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static ::paddle::OpKernelInfoBuilder PD_CONCATENATE( \ - custom_kernel_info_##kernel_name##_##backend##_##layout##_, \ - registrar_id) = \ - ::paddle::OpKernelInfoBuilder( \ - #kernel_name, \ - PD_BACKEND(backend), \ - PD_DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type()) \ - .SetKernelFn(PD_PT_KERNEL( \ - meta_kernel_fn)) \ - .SetVariadicKernelFn(PD_PT_VARIADIC_KERNEL( \ - meta_kernel_fn)) \ - .ArgsParse(PD_PT_ARGS_PARSE( \ - meta_kernel_fn)) \ - .ArgsDef(args_def_fn); \ - PD_EXPAND(_PD_KERNEL_REGISTRAR_INIT_8(kernel_name, \ - backend, \ - layout, \ - PD_ID, \ - args_def_fn, \ - meta_kernel_fn, \ - ##__VA_ARGS__)) - -#define _PD_KERNEL_REGISTRAR_INIT_10(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static ::paddle::OpKernelInfoBuilder PD_CONCATENATE( \ - custom_kernel_info_##kernel_name##_##backend##_##layout##_, \ - registrar_id) = \ - ::paddle::OpKernelInfoBuilder( \ - #kernel_name, \ - PD_BACKEND(backend), \ - PD_DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type()) \ - .SetKernelFn(PD_PT_KERNEL( \ - meta_kernel_fn)) \ - .SetVariadicKernelFn(PD_PT_VARIADIC_KERNEL( \ - meta_kernel_fn)) \ - .ArgsParse(PD_PT_ARGS_PARSE( \ - meta_kernel_fn)) \ - .ArgsDef(args_def_fn); \ - PD_EXPAND(_PD_KERNEL_REGISTRAR_INIT_9(kernel_name, \ - backend, \ - layout, \ - PD_ID, \ - args_def_fn, \ - meta_kernel_fn, \ - ##__VA_ARGS__)) - -#define _PD_KERNEL_REGISTRAR_INIT_11(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static ::paddle::OpKernelInfoBuilder PD_CONCATENATE( \ - custom_kernel_info_##kernel_name##_##backend##_##layout##_, \ - registrar_id) = \ - ::paddle::OpKernelInfoBuilder( \ - #kernel_name, \ - PD_BACKEND(backend), \ - PD_DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type()) \ - .SetKernelFn(PD_PT_KERNEL( \ - meta_kernel_fn)) \ - .SetVariadicKernelFn(PD_PT_VARIADIC_KERNEL( \ - meta_kernel_fn)) \ - .ArgsParse(PD_PT_ARGS_PARSE( \ - meta_kernel_fn)) \ - .ArgsDef(args_def_fn); \ - PD_EXPAND(_PD_KERNEL_REGISTRAR_INIT_10(kernel_name, \ - backend, \ - layout, \ - PD_ID, \ - args_def_fn, \ - meta_kernel_fn, \ - ##__VA_ARGS__)) - -#define _PD_KERNEL_REGISTRAR_INIT_12(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) 
\ - static ::paddle::OpKernelInfoBuilder PD_CONCATENATE( \ - custom_kernel_info_##kernel_name##_##backend##_##layout##_, \ - registrar_id) = \ - ::paddle::OpKernelInfoBuilder( \ - #kernel_name, \ - PD_BACKEND(backend), \ - PD_DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type()) \ - .SetKernelFn(PD_PT_KERNEL( \ - meta_kernel_fn)) \ - .SetVariadicKernelFn(PD_PT_VARIADIC_KERNEL( \ - meta_kernel_fn)) \ - .ArgsParse(PD_PT_ARGS_PARSE( \ - meta_kernel_fn)) \ - .ArgsDef(args_def_fn); \ - PD_EXPAND(_PD_KERNEL_REGISTRAR_INIT_11(kernel_name, \ - backend, \ - layout, \ - PD_ID, \ - args_def_fn, \ - meta_kernel_fn, \ - ##__VA_ARGS__)) - -#define _PD_KERNEL_REGISTRAR_INIT_13(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static ::paddle::OpKernelInfoBuilder PD_CONCATENATE( \ - custom_kernel_info_##kernel_name##_##backend##_##layout##_, \ - registrar_id) = \ - ::paddle::OpKernelInfoBuilder( \ - #kernel_name, \ - PD_BACKEND(backend), \ - PD_DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type()) \ - .SetKernelFn(PD_PT_KERNEL( \ - meta_kernel_fn)) \ - .SetVariadicKernelFn(PD_PT_VARIADIC_KERNEL( \ - meta_kernel_fn)) \ - .ArgsParse(PD_PT_ARGS_PARSE( \ - meta_kernel_fn)) \ - .ArgsDef(args_def_fn); \ - PD_EXPAND(_PD_KERNEL_REGISTRAR_INIT_12(kernel_name, \ - backend, \ - layout, \ - PD_ID, \ - args_def_fn, \ - meta_kernel_fn, \ - ##__VA_ARGS__)) - -#define _PD_KERNEL_REGISTRAR_INIT_14(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static ::paddle::OpKernelInfoBuilder PD_CONCATENATE( \ - custom_kernel_info_##kernel_name##_##backend##_##layout##_, \ - registrar_id) = \ - ::paddle::OpKernelInfoBuilder( \ - #kernel_name, \ - PD_BACKEND(backend), \ - PD_DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type()) \ - .SetKernelFn(PD_PT_KERNEL( \ - meta_kernel_fn)) \ - .SetVariadicKernelFn(PD_PT_VARIADIC_KERNEL( \ - meta_kernel_fn)) \ - .ArgsParse(PD_PT_ARGS_PARSE( \ - meta_kernel_fn)) \ - .ArgsDef(args_def_fn); \ - PD_EXPAND(_PD_KERNEL_REGISTRAR_INIT_13(kernel_name, \ - backend, \ - layout, \ - PD_ID, \ - args_def_fn, \ - meta_kernel_fn, \ - ##__VA_ARGS__)) - -#define _PD_KERNEL_REGISTRAR_INIT_15(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) 
\ - static ::paddle::OpKernelInfoBuilder PD_CONCATENATE( \ - custom_kernel_info_##kernel_name##_##backend##_##layout##_, \ - registrar_id) = \ - ::paddle::OpKernelInfoBuilder( \ - #kernel_name, \ - PD_BACKEND(backend), \ - PD_DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type()) \ - .SetKernelFn(PD_PT_KERNEL( \ - meta_kernel_fn)) \ - .SetVariadicKernelFn(PD_PT_VARIADIC_KERNEL( \ - meta_kernel_fn)) \ - .ArgsParse(PD_PT_ARGS_PARSE( \ - meta_kernel_fn)) \ - .ArgsDef(args_def_fn); \ - PD_EXPAND(_PD_KERNEL_REGISTRAR_INIT_14(kernel_name, \ - backend, \ - layout, \ - PD_ID, \ - args_def_fn, \ - meta_kernel_fn, \ - ##__VA_ARGS__)) -} // namespace paddle diff --git a/paddle/phi/api/lib/CMakeLists.txt b/paddle/phi/api/lib/CMakeLists.txt index 175bf34c0da66fbd4ee8bc8451e5b35334b813ce..1ebddc3d3cd1baefcfcb362806d522fe2b3bcb72 100644 --- a/paddle/phi/api/lib/CMakeLists.txt +++ b/paddle/phi/api/lib/CMakeLists.txt @@ -90,10 +90,10 @@ cc_library(manual_api SRCS manual_api.cc DEPS pten_tensor_raw pten kernel_dispat cc_library(pten_tensor SRCS tensor_method.cc DEPS pten_tensor_raw pten_function_api) cc_library(op_meta_info SRCS op_meta_info.cc DEPS pten_tensor) -cc_library(op_kernel_info SRCS op_kernel_info.cc DEPS pten_tensor_raw) + +cc_library(wrapped_infermeta SRCS ${wrapped_infermeta_source_file} DEPS pten) cc_library(sparse_api SRCS sparse_api.cc DEPS pten_tensor pten kernel_dispatch pten_data_transform) -cc_library(pten_function_api SRCS ${api_source_file} DEPS pten_tensor pten kernel_dispatch pten_data_transform) +cc_library(pten_function_api SRCS ${api_source_file} DEPS pten_tensor pten kernel_dispatch pten_data_transform wrapped_infermeta) cc_library(pten_dygraph_api SRCS ${dygraph_api_source_file} DEPS pten_tensor pten kernel_dispatch pten_data_transform) cc_library(pten_bw_function_api SRCS ${bw_api_source_file} DEPS pten_tensor pten kernel_dispatch backward_infermeta pten_data_transform pten_function_api) -cc_library(wrapped_infermeta SRCS ${wrapped_infermeta_source_file} DEPS pten) diff --git a/paddle/phi/api/lib/api_declare.h b/paddle/phi/api/lib/api_declare.h index 650161a933a8cb9ba02d1385eef3c7bd0dc09a08..26408290bd325e60952f8f88d413b90451544044 100644 --- a/paddle/phi/api/lib/api_declare.h +++ b/paddle/phi/api/lib/api_declare.h @@ -17,6 +17,6 @@ limitations under the License. */ // api symbols declare, remove in the future #include "paddle/phi/api/lib/api_registry.h" -PT_DECLARE_API(Math); -PT_DECLARE_API(Utils); -PT_DECLARE_API(SparseApi); +PD_DECLARE_API(Math); +PD_DECLARE_API(Utils); +PD_DECLARE_API(SparseApi); diff --git a/paddle/phi/api/lib/api_registry.h b/paddle/phi/api/lib/api_registry.h index 2812bede8e09ba99577efd69d928d89e8431cf25..3783620ea449b46ab17ae1ac7d9f7e80ef08cae9 100644 --- a/paddle/phi/api/lib/api_registry.h +++ b/paddle/phi/api/lib/api_registry.h @@ -36,10 +36,10 @@ namespace experimental { */ // use to declare symbol -#define PT_REGISTER_API(name) \ +#define PD_REGISTER_API(name) \ PADDLE_API int RegisterSymbolsFor##name() { return 0; } -#define PT_DECLARE_API(name) \ +#define PD_DECLARE_API(name) \ extern PADDLE_API int RegisterSymbolsFor##name(); \ UNUSED static int use_pten_api_##name = RegisterSymbolsFor##name() diff --git a/paddle/phi/api/lib/manual_api.cc b/paddle/phi/api/lib/manual_api.cc index e0da15eac39b79f3b8ffde3f4c068d02ce28ae6c..7bd4711cc3f308173ce6fd12225faa46f516cb91 100644 --- a/paddle/phi/api/lib/manual_api.cc +++ b/paddle/phi/api/lib/manual_api.cc @@ -27,15 +27,15 @@ limitations under the License. 
*/ #include "paddle/phi/core/meta_tensor.h" #include "paddle/phi/infermeta/unary.h" -PT_DECLARE_KERNEL(copy, CPU, ALL_LAYOUT); -PT_DECLARE_KERNEL(split, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(copy, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(split, CPU, ALL_LAYOUT); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PT_DECLARE_KERNEL(copy, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(copy, GPU, ALL_LAYOUT); #endif #ifdef PADDLE_WITH_XPU -PT_DECLARE_KERNEL(copy, XPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(copy, XPU, ALL_LAYOUT); #endif namespace paddle { @@ -147,4 +147,4 @@ PADDLE_API std::vector split(const Tensor& x, } // namespace experimental } // namespace paddle -PT_REGISTER_API(Utils); +PD_REGISTER_API(Utils); diff --git a/paddle/phi/api/lib/op_kernel_info.cc b/paddle/phi/api/lib/op_kernel_info.cc deleted file mode 100644 index 78b4955f321da0a3b37cc766287806acd37f37ac..0000000000000000000000000000000000000000 --- a/paddle/phi/api/lib/op_kernel_info.cc +++ /dev/null @@ -1,108 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/phi/api/ext/op_kernel_info.h" -#include "paddle/fluid/framework/custom_kernel.h" - -namespace paddle { - -////////////////////// Op Kernel Info ////////////////////// - -OpKernelInfo& OpKernelInfo::SetKernelFn(CustomKernelFunc&& func) { - kernel_fn_ = std::forward(func); - return *this; -} - -OpKernelInfo& OpKernelInfo::SetVariadicKernelFn(void* func) { - variadic_kernel_fn_ = func; - return *this; -} - -//////////////// Op Kernel Info Map ///////////////// - -std::vector& OpKernelInfoMap::operator[]( - const std::string& name) { - return map_[name]; -} - -const std::unordered_map>& -OpKernelInfoMap::GetMap() const { - return map_; -} - -//////////////// Op Kernel Info Builder ///////////////// - -OpKernelInfoBuilder::OpKernelInfoBuilder(std::string&& op_name, - phi::Backend backend, - phi::DataLayout data_layout, - phi::DataType data_type) { - // 1. member assign - op_name_ = std::forward(op_name); - backend_ = backend; - layout_ = data_layout; - dtype_ = data_type; - - // 2. info parse - auto& info_vector = OpKernelInfoMap::Instance()[op_name_]; - auto op_kernel_info = OpKernelInfo(op_name_, backend_, layout_, dtype_); - info_vector.emplace_back(std::move(op_kernel_info)); - - // 3. 
get current info ptr - info_ptr_ = &(info_vector.back()); -} - -OpKernelInfoBuilder& OpKernelInfoBuilder::SetKernelFn(CustomKernelFunc func) { - info_ptr_->SetKernelFn(std::forward(func)); - return *this; -} - -OpKernelInfoBuilder& OpKernelInfoBuilder::SetVariadicKernelFn(void* func) { - info_ptr_->SetVariadicKernelFn(func); - return *this; -} - -OpKernelInfoBuilder& OpKernelInfoBuilder::ArgsParse( - CustomKernelArgsParseFn func) { - func(this->info_ptr_); - return *this; -} - -OpKernelInfoBuilder& OpKernelInfoBuilder::ArgsDef(CustomKernelArgsDefFn func) { - func(this->info_ptr_); - return *this; -} - -/////////////////////// Op register API ///////////////////////// - -// For inference: compile directly with framework -// Call after PD_REGISTER_KERNEL(...) -void RegisterAllCustomKernel() { - auto& op_kernel_info_map = OpKernelInfoMap::Instance(); - framework::RegisterKernelWithMetaInfoMap(op_kernel_info_map); -} - -} // namespace paddle - -#ifdef __cplusplus -extern "C" { -#endif - -// C-API to get global OpKernelInfoMap. -paddle::OpKernelInfoMap& PD_GetOpKernelInfoMap() { - return paddle::OpKernelInfoMap::Instance(); -} - -#ifdef __cplusplus -} // end extern "C" -#endif diff --git a/paddle/phi/api/lib/sparse_api.cc b/paddle/phi/api/lib/sparse_api.cc index 5a22d617492d2121de3acdb2e10bcaaa60f78a24..cc90c2b819daefd725a71f2787d75e42e37899bd 100644 --- a/paddle/phi/api/lib/sparse_api.cc +++ b/paddle/phi/api/lib/sparse_api.cc @@ -22,20 +22,20 @@ limitations under the License. */ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/infermeta/unary.h" -PT_DECLARE_KERNEL(dense_to_sparse_coo, CPU, ALL_LAYOUT); -PT_DECLARE_KERNEL(sparse_csr_to_coo, CPU, ALL_LAYOUT); -PT_DECLARE_KERNEL(dense_to_sparse_csr, CPU, ALL_LAYOUT); -PT_DECLARE_KERNEL(sparse_coo_to_csr, CPU, ALL_LAYOUT); -PT_DECLARE_KERNEL(sparse_coo_to_dense, CPU, ALL_LAYOUT); -PT_DECLARE_KERNEL(sparse_csr_to_dense, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(dense_to_sparse_coo, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(sparse_csr_to_coo, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(dense_to_sparse_csr, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(sparse_coo_to_csr, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(sparse_coo_to_dense, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(sparse_csr_to_dense, CPU, ALL_LAYOUT); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PT_DECLARE_KERNEL(dense_to_sparse_coo, GPU, ALL_LAYOUT); -PT_DECLARE_KERNEL(sparse_csr_to_coo, GPU, ALL_LAYOUT); -PT_DECLARE_KERNEL(dense_to_sparse_csr, GPU, ALL_LAYOUT); -PT_DECLARE_KERNEL(sparse_coo_to_csr, GPU, ALL_LAYOUT); -PT_DECLARE_KERNEL(sparse_coo_to_dense, GPU, ALL_LAYOUT); -PT_DECLARE_KERNEL(sparse_csr_to_dense, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(dense_to_sparse_coo, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(sparse_csr_to_coo, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(dense_to_sparse_csr, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(sparse_coo_to_csr, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(sparse_coo_to_dense, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(sparse_csr_to_dense, GPU, ALL_LAYOUT); #endif namespace paddle { @@ -228,4 +228,4 @@ PADDLE_API Tensor to_dense(const Tensor& x, Backend backend) { } // namespace experimental } // namespace paddle -PT_REGISTER_API(SparseApi); +PD_REGISTER_API(SparseApi); diff --git a/paddle/phi/api/lib/utils/storage.cc b/paddle/phi/api/lib/utils/storage.cc index db3f5f0c8f98bcd4831ba7be69537e9db9efbee2..09ff18d10e312f1f1be130bb2411316dca184458 100644 --- a/paddle/phi/api/lib/utils/storage.cc +++ b/paddle/phi/api/lib/utils/storage.cc @@ -19,7 +19,7 @@ namespace experimental { 
ExternalStorage::ExternalStorage(void* ptr, size_t size, - const paddle::platform::Place& place) + const phi::Place& place) : phi::Storage(std::make_shared(ptr, size, place)), size_(size) {} @@ -29,11 +29,11 @@ ExternalStorage::ExternalStorage(const phi::intrusive_ptr& root, : Storage(std::make_shared( static_cast(root->data()) + delta, size, root->place())), size_(size) { - PADDLE_ENFORCE_LE(static_cast(delta + size), - root->size(), - paddle::platform::errors::InvalidArgument( - "The size of the external storage does " - "not meet the metadata requirements.")); + PADDLE_ENFORCE_LE( + static_cast(delta + size), + root->size(), + phi::errors::InvalidArgument("The size of the external storage does " + "not meet the metadata requirements.")); } } // namespace experimental diff --git a/paddle/phi/api/lib/utils/storage.h b/paddle/phi/api/lib/utils/storage.h index ede5f804836621a88a294d05cbae6a15c9eceb81..c2eedd0fa63f787d7aff6e5f20d807f363bc8b95 100644 --- a/paddle/phi/api/lib/utils/storage.h +++ b/paddle/phi/api/lib/utils/storage.h @@ -30,7 +30,7 @@ class ExternalStorage : public phi::Storage { static const char* name() { return "ExternalStorage"; } void Realloc(size_t n) override { - PADDLE_THROW(paddle::platform::errors::Unavailable( + PADDLE_THROW(phi::errors::Unavailable( "The external shared storage cannot be reallocated.")); } @@ -55,7 +55,7 @@ class ExternalStorage : public phi::Storage { const phi::Place& place() const override { PADDLE_ENFORCE_NOT_NULL( data_, - paddle::platform::errors::Unavailable( + phi::errors::Unavailable( "Unable to visit place as data_ has not been initialized yet.")); return data_->place(); } diff --git a/paddle/phi/backends/CMakeLists.txt b/paddle/phi/backends/CMakeLists.txt index 441bd0a8c303b5e45f173f20e78ca2e65b9fc314..38366d57841b006726de386a32a5bd09a80f05a7 100644 --- a/paddle/phi/backends/CMakeLists.txt +++ b/paddle/phi/backends/CMakeLists.txt @@ -21,3 +21,7 @@ endif() if(WITH_GPU) add_dependencies(pten_context gpu_context) endif() + +if(WITH_CUSTOM_DEVICE) + add_dependencies(pten_context custom_context) +endif() diff --git a/paddle/phi/backends/all_context.h b/paddle/phi/backends/all_context.h index b53c5ce5c780cb09bd752de1c27c6ef87776aff2..3fe03905e42dd33afeedb3a04c2deae6fb0ca1ee 100644 --- a/paddle/phi/backends/all_context.h +++ b/paddle/phi/backends/all_context.h @@ -21,12 +21,15 @@ limitations under the License. */ // path replacement after implementing pten DeviceContext #include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/backends/custom/custom_context.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/xpu/xpu_context.h" +#ifndef PADDLE_WITH_CUSTOM_KERNEL // TODO(wilber): DeviceContextPool nees include fluid file. 
#include "paddle/fluid/platform/device_context.h" namespace phi { using DeviceContextPool = paddle::platform::DeviceContextPool; } // namespace phi +#endif diff --git a/paddle/phi/backends/custom/custom_context.cc b/paddle/phi/backends/custom/custom_context.cc index 445f550839160f79a757b50c74080cf3741aa76f..bde3b6a08539b51e06442ef6090f99cbea7e9de9 100644 --- a/paddle/phi/backends/custom/custom_context.cc +++ b/paddle/phi/backends/custom/custom_context.cc @@ -32,8 +32,8 @@ struct CustomContext::Impl { const Place& GetPlace() const { return place_; } - C_Stream stream() const { - return reinterpret_cast(stream_->raw_stream()); + void* stream() const { + return reinterpret_cast(stream_->raw_stream()); } void Wait() const { stream_->Wait(); } @@ -47,7 +47,7 @@ void CustomContext::Init() { impl_->Init(); } const Place& CustomContext::GetPlace() const { return impl_->GetPlace(); } -C_Stream CustomContext::stream() const { return impl_->stream(); } +void* CustomContext::stream() const { return impl_->stream(); } void CustomContext::Wait() const { return impl_->Wait(); } diff --git a/paddle/phi/backends/custom/custom_context.h b/paddle/phi/backends/custom/custom_context.h index 109f5e53707f6ed3a04efb5680e6ec42649e13ef..37b0ee21219b59a0a79d748f6cd4ab0bc289440b 100644 --- a/paddle/phi/backends/custom/custom_context.h +++ b/paddle/phi/backends/custom/custom_context.h @@ -15,7 +15,6 @@ limitations under the License. */ #pragma once #include -#include "paddle/fluid/platform/device/device_ext.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/device_context.h" @@ -30,7 +29,7 @@ class CustomContext : public DeviceContext { const Place& GetPlace() const override; /*! \brief Return stream in the device context. */ - C_Stream stream() const; + void* stream() const; // Wait for all operations completion in the stream. void Wait() const override; diff --git a/paddle/phi/backends/dynload/cudnn.cc b/paddle/phi/backends/dynload/cudnn.cc index ff000d27c4f2e185c88259e2353e476b1ff9220b..02d626d5f98f9fc0c260a55c846031634b68e144 100644 --- a/paddle/phi/backends/dynload/cudnn.cc +++ b/paddle/phi/backends/dynload/cudnn.cc @@ -54,7 +54,7 @@ bool HasCUDNN() { void EnforceCUDNNLoaded(const char* fn_name) { PADDLE_ENFORCE_NOT_NULL( cudnn_dso_handle, - paddle::platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "Cannot load cudnn shared library. Cannot invoke method %s.", fn_name)); } diff --git a/paddle/phi/backends/dynload/cufft.cc b/paddle/phi/backends/dynload/cufft.cc index 14240af41046c3a735b30392b0ab7685bc3d5806..596a68c1ed6aad96942ddd2b5eee82b8102e2444 100644 --- a/paddle/phi/backends/dynload/cufft.cc +++ b/paddle/phi/backends/dynload/cufft.cc @@ -33,7 +33,7 @@ bool HasCUFFT() { void EnforceCUFFTLoaded(const char* fn_name) { PADDLE_ENFORCE_NOT_NULL( cufft_dso_handle, - paddle::platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "Cannot load cufft shared library. Cannot invoke method %s.", fn_name)); } diff --git a/paddle/phi/backends/dynload/dynamic_loader.cc b/paddle/phi/backends/dynload/dynamic_loader.cc index 473c58b33eebc46a62b6b31af10d6b71b0fff53d..2f35e22a18f820cd15325d8516447e3652c132f1 100644 --- a/paddle/phi/backends/dynload/dynamic_loader.cc +++ b/paddle/phi/backends/dynload/dynamic_loader.cc @@ -24,7 +24,7 @@ limitations under the License. */ #include #endif -// TODO(wilber): The pten computing library requires a component to manage flags +// TODO(wilber): The phi computing library requires a component to manage flags // (maybe not use gflags). 
#include "gflags/gflags.h" #include "glog/logging.h" @@ -299,8 +299,8 @@ static inline void* GetDsoHandleFromSearchPath( #endif // !_WIN32 if (throw_on_error) { // NOTE: Special error report case, no need to change its format - PADDLE_THROW(paddle::platform::errors::PreconditionNotMet( - error_msg, dso_name, errorno)); + PADDLE_THROW( + phi::errors::PreconditionNotMet(error_msg, dso_name, errorno)); } else { LOG(WARNING) << paddle::string::Sprintf(error_msg, dso_name, errorno); } @@ -547,14 +547,11 @@ void* GetOpDsoHandle(const std::string& dso_name) { void* GetNvtxDsoHandle() { #if defined(__APPLE__) || defined(__OSX__) - PADDLE_THROW( - paddle::platform::errors::Unimplemented("Nvtx do not support Apple.")); + PADDLE_THROW(phi::errors::Unimplemented("Nvtx do not support Apple.")); #elif defined(_WIN32) - PADDLE_THROW( - paddle::platform::errors::Unimplemented("Nvtx do not support Windows.")); + PADDLE_THROW(phi::errors::Unimplemented("Nvtx do not support Windows.")); #elif !defined(PADDLE_WITH_CUDA) - PADDLE_THROW(paddle::platform::errors::Unimplemented( - "Nvtx do not support without CUDA.")); + PADDLE_THROW(phi::errors::Unimplemented("Nvtx do not support without CUDA.")); #else return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvToolsExt.so"); #endif diff --git a/paddle/phi/backends/dynload/miopen.cc b/paddle/phi/backends/dynload/miopen.cc index a57574dbab13bc88065cb91b9b175f164799584e..e7916873ccfde7e1e5d0933045c9b44557f2f07a 100644 --- a/paddle/phi/backends/dynload/miopen.cc +++ b/paddle/phi/backends/dynload/miopen.cc @@ -58,7 +58,7 @@ bool HasCUDNN() { void EnforceCUDNNLoaded(const char* fn_name) { PADDLE_ENFORCE_NOT_NULL( miopen_dso_handle, - paddle::platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "Cannot load miopen shared library. Cannot invoke method %s.", fn_name)); } diff --git a/paddle/phi/backends/dynload/tensorrt.h b/paddle/phi/backends/dynload/tensorrt.h index 77f25ec0b5aaff99fcaba8cae418d4045dfedf3a..cd8c6457f1b91b938f1ef927119c9ec63a7b6e1b 100644 --- a/paddle/phi/backends/dynload/tensorrt.h +++ b/paddle/phi/backends/dynload/tensorrt.h @@ -54,21 +54,21 @@ extern void* tensorrt_plugin_dso_handle; }; \ extern DynLoad__##__name __name -#define DECLARE_DYNAMIC_LOAD_TENSORRT_NON_POINTER_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \ - std::call_once(tensorrt_dso_flag, []() { \ - tensorrt_dso_handle = phi::dynload::GetTensorRtHandle(); \ - }); \ - static void* p_##__name = dlsym(tensorrt_dso_handle, #__name); \ - PADDLE_ENFORCE_NOT_NULL(p_##__name, \ - paddle::platform::errors::Unavailable( \ - "Load tensorrt api %s failed", #__name)); \ - using tensorrt_func = decltype(&::__name); \ - return reinterpret_cast(p_##__name)(args...); \ - } \ - }; \ +#define DECLARE_DYNAMIC_LOAD_TENSORRT_NON_POINTER_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) 
{ \ + std::call_once(tensorrt_dso_flag, []() { \ + tensorrt_dso_handle = phi::dynload::GetTensorRtHandle(); \ + }); \ + static void* p_##__name = dlsym(tensorrt_dso_handle, #__name); \ + PADDLE_ENFORCE_NOT_NULL( \ + p_##__name, \ + phi::errors::Unavailable("Load tensorrt api %s failed", #__name)); \ + using tensorrt_func = decltype(&::__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ extern DynLoad__##__name __name #define DECLARE_DYNAMIC_LOAD_TENSORRT_PLUGIN_WRAP(__name) \ @@ -80,7 +80,7 @@ extern void* tensorrt_plugin_dso_handle; }); \ static void* p_##__name = dlsym(tensorrt_plugin_dso_handle, #__name); \ PADDLE_ENFORCE_NOT_NULL(p_##__name, \ - paddle::platform::errors::Unavailable( \ + phi::errors::Unavailable( \ "Load tensorrt plugin %s failed", #__name)); \ using tensorrt_plugin_func = decltype(&::__name); \ return reinterpret_cast(p_##__name)(args...); \ diff --git a/paddle/phi/backends/gpu/cuda/cuda_info.cc b/paddle/phi/backends/gpu/cuda/cuda_info.cc index f8e4ec02bc39e3406437a0503d4cd9622565dbeb..7be21e85f0005b9bfe7849ac6f12561cf108c7e3 100644 --- a/paddle/phi/backends/gpu/cuda/cuda_info.cc +++ b/paddle/phi/backends/gpu/cuda/cuda_info.cc @@ -14,7 +14,7 @@ #include "paddle/phi/backends/gpu/gpu_info.h" -// TODO(pten): remove fluid headers. +// TODO(phi): remove fluid headers. #include "paddle/fluid/platform/enforce.h" static std::once_flag g_device_props_size_init_flag; @@ -74,13 +74,13 @@ int GetGPUDeviceCount() { } int GetGPUComputeCapability(int id) { - PADDLE_ENFORCE_LT(id, - GetGPUDeviceCount(), - paddle::platform::errors::InvalidArgument( - "Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - id, - GetGPUDeviceCount())); + PADDLE_ENFORCE_LT( + id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, + GetGPUDeviceCount())); int major, minor; auto major_error_code = cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, id); @@ -93,26 +93,26 @@ int GetGPUComputeCapability(int id) { } int GetGPURuntimeVersion(int id) { - PADDLE_ENFORCE_LT(id, - GetGPUDeviceCount(), - paddle::platform::errors::InvalidArgument( - "Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - id, - GetGPUDeviceCount())); + PADDLE_ENFORCE_LT( + id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, + GetGPUDeviceCount())); int runtime_version = 0; PADDLE_ENFORCE_GPU_SUCCESS(cudaRuntimeGetVersion(&runtime_version)); return runtime_version; } int GetGPUDriverVersion(int id) { - PADDLE_ENFORCE_LT(id, - GetGPUDeviceCount(), - paddle::platform::errors::InvalidArgument( - "Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - id, - GetGPUDeviceCount())); + PADDLE_ENFORCE_LT( + id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, + GetGPUDeviceCount())); int driver_version = 0; PADDLE_ENFORCE_GPU_SUCCESS(cudaDriverGetVersion(&driver_version)); return driver_version; @@ -125,13 +125,13 @@ bool TensorCoreAvailable() { } int GetGPUMultiProcessors(int id) { - PADDLE_ENFORCE_LT(id, - GetGPUDeviceCount(), - paddle::platform::errors::InvalidArgument( - "Device id must be less than GPU count, " - "but received id is: %d. 
GPU count is: %d.", - id, - GetGPUDeviceCount())); + PADDLE_ENFORCE_LT( + id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, + GetGPUDeviceCount())); int count; PADDLE_ENFORCE_GPU_SUCCESS( cudaDeviceGetAttribute(&count, cudaDevAttrMultiProcessorCount, id)); @@ -139,13 +139,13 @@ int GetGPUMultiProcessors(int id) { } int GetGPUMaxThreadsPerMultiProcessor(int id) { - PADDLE_ENFORCE_LT(id, - GetGPUDeviceCount(), - paddle::platform::errors::InvalidArgument( - "Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - id, - GetGPUDeviceCount())); + PADDLE_ENFORCE_LT( + id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, + GetGPUDeviceCount())); int count; PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceGetAttribute( &count, cudaDevAttrMaxThreadsPerMultiProcessor, id)); @@ -154,13 +154,13 @@ int GetGPUMaxThreadsPerMultiProcessor(int id) { } int GetGPUMaxThreadsPerBlock(int id) { - PADDLE_ENFORCE_LT(id, - GetGPUDeviceCount(), - paddle::platform::errors::InvalidArgument( - "Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - id, - GetGPUDeviceCount())); + PADDLE_ENFORCE_LT( + id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, + GetGPUDeviceCount())); int count; PADDLE_ENFORCE_GPU_SUCCESS( cudaDeviceGetAttribute(&count, cudaDevAttrMaxThreadsPerBlock, id)); @@ -174,13 +174,13 @@ int GetCurrentDeviceId() { } std::array GetGpuMaxGridDimSize(int id) { - PADDLE_ENFORCE_LT(id, - GetGPUDeviceCount(), - paddle::platform::errors::InvalidArgument( - "Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - id, - GetGPUDeviceCount())); + PADDLE_ENFORCE_LT( + id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, + GetGPUDeviceCount())); std::array ret; int size; auto error_code_x = cudaDeviceGetAttribute(&size, cudaDevAttrMaxGridDimX, id); @@ -213,7 +213,7 @@ const gpuDeviceProp &GetDeviceProperties(int id) { } if (id < 0 || id >= static_cast(g_device_props.size())) { - PADDLE_THROW(paddle::platform::errors::OutOfRange( + PADDLE_THROW(phi::errors::OutOfRange( "The device id %d is out of range [0, %d), where %d is the number of " "devices on this machine. Because the device id should be greater than " "or equal to zero and smaller than the number of gpus. Please input " @@ -233,13 +233,13 @@ const gpuDeviceProp &GetDeviceProperties(int id) { void SetDeviceId(int id) { // TODO(qijun): find a better way to cache the cuda device count - PADDLE_ENFORCE_LT(id, - GetGPUDeviceCount(), - paddle::platform::errors::InvalidArgument( - "Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - id, - GetGPUDeviceCount())); + PADDLE_ENFORCE_LT( + id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. 
GPU count is: %d.", + id, + GetGPUDeviceCount())); PADDLE_RETRY_CUDA_SUCCESS(cudaSetDevice(id)); } @@ -294,13 +294,13 @@ gpuError_t GpuGetLastError() { return cudaGetLastError(); } // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#um-requirements // for more detail about managed memory requirements bool IsGPUManagedMemorySupported(int dev_id) { - PADDLE_ENFORCE_LT(dev_id, - GetGPUDeviceCount(), - paddle::platform::errors::InvalidArgument( - "Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - dev_id, - GetGPUDeviceCount())); + PADDLE_ENFORCE_LT( + dev_id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + dev_id, + GetGPUDeviceCount())); #if defined(__linux__) || defined(_WIN32) int ManagedMemoryAttr; PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceGetAttribute( @@ -312,13 +312,13 @@ bool IsGPUManagedMemorySupported(int dev_id) { } bool IsGPUManagedMemoryOversubscriptionSupported(int dev_id) { - PADDLE_ENFORCE_LT(dev_id, - GetGPUDeviceCount(), - paddle::platform::errors::InvalidArgument( - "Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - dev_id, - GetGPUDeviceCount())); + PADDLE_ENFORCE_LT( + dev_id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + dev_id, + GetGPUDeviceCount())); #ifdef __linux__ return IsGPUManagedMemorySupported(dev_id) && GetGPUComputeCapability(dev_id) >= 60; diff --git a/paddle/phi/backends/gpu/gpu_launch_config.h b/paddle/phi/backends/gpu/gpu_launch_config.h index 21193755044579eb4f19936dca1c2b6b3c5b4bea..e45b465122588263e47d3ccda47c29bb8bf3b6bd 100644 --- a/paddle/phi/backends/gpu/gpu_launch_config.h +++ b/paddle/phi/backends/gpu/gpu_launch_config.h @@ -29,6 +29,7 @@ #include #include #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/enforce.h" #ifdef __HIPCC__ // HIP results in error or nan if > 256 @@ -100,12 +101,12 @@ struct GpuLaunchConfig { inline GpuLaunchConfig GetGpuLaunchConfig1D(const phi::GPUContext& context, int64_t numel, int vec_size = 1) { - PADDLE_ENFORCE_GT(numel, - 0, - paddle::platform::errors::InvalidArgument( - "element quantity should be greater than 0," - " but received value is: %d.", - numel)); + PADDLE_ENFORCE_GT( + numel, + 0, + phi::errors::InvalidArgument("element quantity should be greater than 0," + " but received value is: %d.", + numel)); // Get compute_capability const int capability = context.GetComputeCapability(); /* If thread number per block is 64/128/256/512, cuda performs better.*/ @@ -142,18 +143,18 @@ inline GpuLaunchConfig GetGpuLaunchConfig1D(const phi::GPUContext& context, inline GpuLaunchConfig GetGpuLaunchConfig2D(const phi::GPUContext& context, int x_dim, int y_dim) { - PADDLE_ENFORCE_GT(x_dim, - 0, - paddle::platform::errors::InvalidArgument( - "x dim number should greater than 0," - " but received value is: %d", - x_dim)); - PADDLE_ENFORCE_GT(y_dim, - 0, - paddle::platform::errors::InvalidArgument( - "y dim number should greater than 0," - " but received value is: %d", - y_dim)); + PADDLE_ENFORCE_GT( + x_dim, + 0, + phi::errors::InvalidArgument("x dim number should greater than 0," + " but received value is: %d", + x_dim)); + PADDLE_ENFORCE_GT( + y_dim, + 0, + phi::errors::InvalidArgument("y dim number should greater than 0," + " but received value is: %d", + y_dim)); const int kThreadsPerBlock = 256; int block_cols 
= (std::min)(x_dim, kThreadsPerBlock); diff --git a/paddle/phi/backends/gpu/rocm/rocm_info.cc b/paddle/phi/backends/gpu/rocm/rocm_info.cc index c7390cfb6a2198904f081ffbb8f5f4f8532324e2..11dd4f724878266d52fdcbeee031b6ac6a9a9438 100644 --- a/paddle/phi/backends/gpu/rocm/rocm_info.cc +++ b/paddle/phi/backends/gpu/rocm/rocm_info.cc @@ -78,13 +78,13 @@ int GetGPUDeviceCount() { } int GetGPUComputeCapability(int id) { - PADDLE_ENFORCE_LT(id, - GetGPUDeviceCount(), - paddle::platform::errors::InvalidArgument( - "Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - id, - GetGPUDeviceCount())); + PADDLE_ENFORCE_LT( + id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, + GetGPUDeviceCount())); int major, minor; auto major_error_code = hipDeviceGetAttribute( &major, hipDeviceAttributeComputeCapabilityMajor, id); @@ -97,26 +97,26 @@ int GetGPUComputeCapability(int id) { } int GetGPURuntimeVersion(int id) { - PADDLE_ENFORCE_LT(id, - GetGPUDeviceCount(), - paddle::platform::errors::InvalidArgument( - "Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - id, - GetGPUDeviceCount())); + PADDLE_ENFORCE_LT( + id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, + GetGPUDeviceCount())); int runtime_version = 0; PADDLE_ENFORCE_GPU_SUCCESS(hipRuntimeGetVersion(&runtime_version)); return runtime_version; } int GetGPUDriverVersion(int id) { - PADDLE_ENFORCE_LT(id, - GetGPUDeviceCount(), - paddle::platform::errors::InvalidArgument( - "Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - id, - GetGPUDeviceCount())); + PADDLE_ENFORCE_LT( + id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, + GetGPUDeviceCount())); int driver_version = 0; PADDLE_ENFORCE_GPU_SUCCESS(hipDriverGetVersion(&driver_version)); return driver_version; @@ -125,13 +125,13 @@ int GetGPUDriverVersion(int id) { bool TensorCoreAvailable() { return false; } int GetGPUMultiProcessors(int id) { - PADDLE_ENFORCE_LT(id, - GetGPUDeviceCount(), - paddle::platform::errors::InvalidArgument( - "Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - id, - GetGPUDeviceCount())); + PADDLE_ENFORCE_LT( + id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, + GetGPUDeviceCount())); int count; PADDLE_ENFORCE_GPU_SUCCESS( hipDeviceGetAttribute(&count, hipDeviceAttributeMultiprocessorCount, id)); @@ -139,13 +139,13 @@ int GetGPUMultiProcessors(int id) { } int GetGPUMaxThreadsPerMultiProcessor(int id) { - PADDLE_ENFORCE_LT(id, - GetGPUDeviceCount(), - paddle::platform::errors::InvalidArgument( - "Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - id, - GetGPUDeviceCount())); + PADDLE_ENFORCE_LT( + id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. 
GPU count is: %d.", + id, + GetGPUDeviceCount())); int count; PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceGetAttribute( &count, hipDeviceAttributeMaxThreadsPerMultiProcessor, id)); @@ -154,13 +154,13 @@ int GetGPUMaxThreadsPerMultiProcessor(int id) { } int GetGPUMaxThreadsPerBlock(int id) { - PADDLE_ENFORCE_LT(id, - GetGPUDeviceCount(), - paddle::platform::errors::InvalidArgument( - "Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - id, - GetGPUDeviceCount())); + PADDLE_ENFORCE_LT( + id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, + GetGPUDeviceCount())); int count; PADDLE_ENFORCE_GPU_SUCCESS( hipDeviceGetAttribute(&count, hipDeviceAttributeMaxThreadsPerBlock, id)); @@ -174,13 +174,13 @@ int GetCurrentDeviceId() { } std::array GetGpuMaxGridDimSize(int id) { - PADDLE_ENFORCE_LT(id, - GetGPUDeviceCount(), - paddle::platform::errors::InvalidArgument( - "Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - id, - GetGPUDeviceCount())); + PADDLE_ENFORCE_LT( + id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, + GetGPUDeviceCount())); std::array ret; int size; auto error_code_x = @@ -216,7 +216,7 @@ const gpuDeviceProp &GetDeviceProperties(int id) { } if (id < 0 || id >= static_cast(g_device_props.size())) { - PADDLE_THROW(paddle::platform::errors::OutOfRange( + PADDLE_THROW(phi::errors::OutOfRange( "The device id %d is out of range [0, %d), where %d is the number of " "devices on this machine. Because the device id should be greater than " "or equal to zero and smaller than the number of gpus. Please input " @@ -235,13 +235,13 @@ const gpuDeviceProp &GetDeviceProperties(int id) { void SetDeviceId(int id) { // TODO(qijun): find a better way to cache the cuda device count - PADDLE_ENFORCE_LT(id, - GetGPUDeviceCount(), - paddle::platform::errors::InvalidArgument( - "Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - id, - GetGPUDeviceCount())); + PADDLE_ENFORCE_LT( + id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + id, + GetGPUDeviceCount())); PADDLE_RETRY_CUDA_SUCCESS(hipSetDevice(id)); } @@ -293,13 +293,13 @@ void GpuDeviceSync() { PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize()); } gpuError_t GpuGetLastError() { return hipGetLastError(); } bool IsGPUManagedMemorySupported(int dev_id) { - PADDLE_ENFORCE_LT(dev_id, - GetGPUDeviceCount(), - paddle::platform::errors::InvalidArgument( - "Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - dev_id, - GetGPUDeviceCount())); + PADDLE_ENFORCE_LT( + dev_id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + dev_id, + GetGPUDeviceCount())); #if defined(__linux__) || defined(_WIN32) int ManagedMemoryAttr; PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceGetAttribute( @@ -311,13 +311,13 @@ bool IsGPUManagedMemorySupported(int dev_id) { } bool IsGPUManagedMemoryOversubscriptionSupported(int dev_id) { - PADDLE_ENFORCE_LT(dev_id, - GetGPUDeviceCount(), - paddle::platform::errors::InvalidArgument( - "Device id must be less than GPU count, " - "but received id is: %d. 
GPU count is: %d.", - dev_id, - GetGPUDeviceCount())); + PADDLE_ENFORCE_LT( + dev_id, + GetGPUDeviceCount(), + phi::errors::InvalidArgument("Device id must be less than GPU count, " + "but received id is: %d. GPU count is: %d.", + dev_id, + GetGPUDeviceCount())); #ifdef __linux__ return IsGPUManagedMemorySupported(dev_id) && GetGPUComputeCapability(dev_id) >= 60; diff --git a/paddle/phi/backends/xpu/enforce_xpu.h b/paddle/phi/backends/xpu/enforce_xpu.h index bcfebf6d49fb87b7fa1a0fc29595f6f20ca57f77..29b048ead852dd91788316c2284b438d7dcbd61c 100644 --- a/paddle/phi/backends/xpu/enforce_xpu.h +++ b/paddle/phi/backends/xpu/enforce_xpu.h @@ -173,7 +173,7 @@ DEFINE_EXTERNAL_API_TYPE(BKCLResult_t, BKCL_SUCCESS); ::phi::backends::xpu::details::ExternalApiType< \ __XPU_STATUS_TYPE__>::kSuccess; \ if (UNLIKELY(__cond__ != __success_type__)) { \ - auto __summary__ = paddle::platform::errors::External( \ + auto __summary__ = phi::errors::External( \ ::phi::backends::xpu::build_xpu_error_msg(__cond__)); \ __THROW_ERROR_INTERNAL__(__summary__); \ } \ @@ -183,7 +183,7 @@ DEFINE_EXTERNAL_API_TYPE(BKCLResult_t, BKCL_SUCCESS); do { \ auto __cond__ = (COND); \ if (UNLIKELY(__cond__ != baidu::xpu::api::Error_t::SUCCESS)) { \ - auto __summary__ = paddle::platform::errors::External( \ + auto __summary__ = phi::errors::External( \ ::phi::backends::xpu::build_xpu_xdnn_error_msg(__cond__, MSG)); \ __THROW_ERROR_INTERNAL__(__summary__); \ } \ @@ -192,7 +192,7 @@ DEFINE_EXTERNAL_API_TYPE(BKCLResult_t, BKCL_SUCCESS); #define PADDLE_ENFORCE_XDNN_NOT_NULL(ptr) \ do { \ if (UNLIKELY(ptr == nullptr)) { \ - auto __summary__ = paddle::platform::errors::External( \ + auto __summary__ = phi::errors::External( \ ::phi::backends::xpu::build_xpu_xdnn_error_msg( \ baidu::xpu::api::Error_t::NO_ENOUGH_WORKSPACE, \ "XPU memory is not enough")); \ diff --git a/paddle/phi/backends/xpu/xpu_info.cc b/paddle/phi/backends/xpu/xpu_info.cc index 527e13238082ec154b3ece67ca719425ae40d211..96e95df7a9886f2bb1b5485c822a98d4f42b5f12 100644 --- a/paddle/phi/backends/xpu/xpu_info.cc +++ b/paddle/phi/backends/xpu/xpu_info.cc @@ -100,7 +100,7 @@ void SetXPUDeviceId(int id) { PADDLE_ENFORCE_LT( id, GetXPUDeviceCount(), - paddle::platform::errors::InvalidArgument("id must less than XPU count")); + phi::errors::InvalidArgument("id must less than XPU count")); PADDLE_ENFORCE_XPU_SUCCESS(xpu_set_device(id)); } diff --git a/paddle/phi/common/backend.h b/paddle/phi/common/backend.h index f7c39eacae9bd1192def55aedccd04fdfc1ccd33..1d3e4369c69489fc13ec6938fbb9377e93765bb9 100644 --- a/paddle/phi/common/backend.h +++ b/paddle/phi/common/backend.h @@ -71,17 +71,17 @@ enum class Backend : uint8_t { * Of course, we have also considered solving this problem through different * named macros, for example, if we define * - * PT_REGISTER_KERNEL_FOR_ALL_BACKEND + * PD_REGISTER_KERNEL_FOR_ALL_BACKEND * * Based on this design pattern, the dtype and layout also have the same * requirements, this cause we need to define a series of macros * - * PT_REGISTER_KERNEL_FOR_ALL_DTYPE - * PT_REGISTER_KERNEL_FOR_ALL_LAYOUT - * PT_REGISTER_KERNEL_FOR_ALL_BACKEND_AND_LAYOUT - * PT_REGISTER_KERNEL_FOR_ALL_BACKEND_AND_DTYPE - * PT_REGISTER_KERNEL_FOR_ALL_LAYOUT_AND_DTYPE - * PT_REGISTER_KERNEL_FOR_ALL_BACKEND_AND_LAYOUT_AND_DTYPE + * PD_REGISTER_KERNEL_FOR_ALL_DTYPE + * PD_REGISTER_KERNEL_FOR_ALL_LAYOUT + * PD_REGISTER_KERNEL_FOR_ALL_BACKEND_AND_LAYOUT + * PD_REGISTER_KERNEL_FOR_ALL_BACKEND_AND_DTYPE + * PD_REGISTER_KERNEL_FOR_ALL_LAYOUT_AND_DTYPE + * 
PD_REGISTER_KERNEL_FOR_ALL_BACKEND_AND_LAYOUT_AND_DTYPE * * It makes the system of registering macros more complicated, we think * this is not a simple design, so we still adopt the design of providing @@ -130,6 +130,29 @@ inline std::ostream& operator<<(std::ostream& os, Backend backend) { return os; } +inline Backend StringToBackend(const char* backend_cstr) { + std::string s(backend_cstr); + if (s == std::string("Undefined")) { + return Backend::UNDEFINED; + } + if (s == std::string("CPU")) { + return Backend::CPU; + } else if (s == std::string("GPU")) { + return Backend::GPU; + } else if (s == std::string("XPU")) { + return Backend::XPU; + } else if (s == std::string("NPU")) { + return Backend::NPU; + } else if (s == std::string("MKLDNN")) { + return Backend::MKLDNN; + } else if (s == std::string("CUDNN")) { + return Backend::CUDNN; + } else { + return static_cast(static_cast(Backend::NUM_BACKENDS) + + phi::GetOrRegisterGlobalDeviceTypeId(s)); + } +} + } // namespace experimental } // namespace paddle diff --git a/paddle/phi/common/place.h b/paddle/phi/common/place.h index b6adb1c2932bff5842ef74947c149f23b8b79a02..36fb910cad6c705952a0e3858eb09810d1ea6f5f 100644 --- a/paddle/phi/common/place.h +++ b/paddle/phi/common/place.h @@ -188,6 +188,7 @@ class MLUPlace : public Place { class CustomPlace : public Place { public: + CustomPlace() : Place(AllocationType::CUSTOM, 0, "") {} explicit CustomPlace(const std::string dev_type) : Place(AllocationType::CUSTOM, 0, dev_type) {} CustomPlace(const std::string dev_type, int device_id) diff --git a/paddle/phi/common/scalar.h b/paddle/phi/common/scalar.h index 1da77a0fa196413436030fc2864514cc222af6f8..72cef89d300c8d60811bde7cf667275b37fedc6f 100644 --- a/paddle/phi/common/scalar.h +++ b/paddle/phi/common/scalar.h @@ -25,7 +25,6 @@ namespace experimental { template class ScalarBase { public: - bool FromTensor() const { return is_from_tensor_; } // Constructor support implicit ScalarBase(double val) : dtype_(DataType::FLOAT64) { // NOLINT data_.f64 = val; @@ -157,6 +156,10 @@ class ScalarBase { CopyScalar(other, this); } + bool FromTensor() const { return is_from_tensor_; } + + void SetFromTensor(bool from_tensor) { is_from_tensor_ = from_tensor; } + template inline RT to() const { switch (dtype_) { @@ -191,6 +194,8 @@ class ScalarBase { } } + DataType dtype() const { return dtype_; } + private: template friend void CopyScalar(const ScalarBase& src, ScalarBase* dst); diff --git a/paddle/phi/core/CMakeLists.txt b/paddle/phi/core/CMakeLists.txt index 18f209377bafc787268e3e510931661d6dff1cb8..6ada0630699054ba573f018175d9ba0724216e1b 100644 --- a/paddle/phi/core/CMakeLists.txt +++ b/paddle/phi/core/CMakeLists.txt @@ -13,8 +13,8 @@ cc_library(kernel_context SRCS kernel_context.cc DEPS pten_enforce pten_context) cc_library(ddim SRCS ddim.cc DEPS pten_enforce) cc_library(tensor_base SRCS tensor_base.cc allocator.cc DEPS pten_enforce) -cc_library(tensor_meta SRCS tensor_meta.cc DEPS pten_enforce mixed_vector) -cc_library(lod_utils SRCS lod_utils.cc DEPS pten_enforce mixed_vector) +cc_library(tensor_meta SRCS tensor_meta.cc DEPS pten_enforce) +cc_library(lod_utils SRCS lod_utils.cc DEPS pten_enforce) cc_library(pten_device_context SRCS device_context.cc DEPS tensor_base) cc_library(dense_tensor SRCS dense_tensor.cc dense_tensor_impl.cc DEPS fluid_convert_utils tensor_meta tensor_base) @@ -23,7 +23,9 @@ cc_library(sparse_csr_tensor SRCS sparse_csr_tensor.cc DEPS dense_tensor tensor_ cc_library(meta_tensor SRCS meta_tensor.cc DEPS tensor_base tensor_meta 
dense_tensor) cc_library(infermeta_utils SRCS infermeta_utils.cc DEPS meta_tensor) -cc_library(selected_rows SRCS selected_rows_impl.cc DEPS dense_tensor mixed_vector pten_enforce ddim) +cc_library(selected_rows SRCS selected_rows_impl.cc DEPS dense_tensor pten_enforce ddim memcpy) + +cc_library(pten_custom_kernel SRCS custom_kernel.cc DEPS kernel_factory convert_utils) # Will remove once we implemented MKLDNN_Tensor if(WITH_MKLDNN) diff --git a/paddle/phi/core/compat/convert_utils.cc b/paddle/phi/core/compat/convert_utils.cc index b4e7e127995ec2d0eeda788e9d6e6f9ccf12f8b1..a5b7b869b948dfb17b9f58a455bb336a4f021c4f 100644 --- a/paddle/phi/core/compat/convert_utils.cc +++ b/paddle/phi/core/compat/convert_utils.cc @@ -30,6 +30,8 @@ Backend TransToPtenBackend(const phi::Place& place) { return Backend::CPU; } else if (place.GetType() == phi::AllocationType::GPU) { return Backend::GPU; + } else if (place.GetType() == phi::AllocationType::XPU) { + return Backend::XPU; } else if (place.GetType() == phi::AllocationType::CUSTOM) { return static_cast( static_cast(Backend::NUM_BACKENDS) + diff --git a/paddle/phi/core/compat/op_utils.h b/paddle/phi/core/compat/op_utils.h index 5c0c440d8942c83d10bfe092b3fc1782944f1719..ec810d4e16340862faaabe0799e19245551b44c3 100644 --- a/paddle/phi/core/compat/op_utils.h +++ b/paddle/phi/core/compat/op_utils.h @@ -164,34 +164,34 @@ struct ArgumentMappingFnRegistrar { } }; -#define PT_REGISTER_BASE_KERNEL_NAME(op_type, base_kernel_name) \ +#define PD_REGISTER_BASE_KERNEL_NAME(op_type, base_kernel_name) \ PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \ - pt_register_base_kernel_name_ns_check_##op_type, \ - "PT_REGISTER_BASE_KERNEL_NAME must be called in global namespace."); \ + PD_REGISTER_base_kernel_name_ns_check_##op_type, \ + "PD_REGISTER_BASE_KERNEL_NAME must be called in global namespace."); \ static const ::phi::BaseKernelNameRegistrar \ __registrar_base_kernel_name_for_##op_type(#op_type, #base_kernel_name); \ int TouchBaseKernelNameSymbol_##op_type() { return 0; } -#define PT_DECLARE_BASE_KERNEL_NAME(op_type) \ +#define PD_DECLARE_BASE_KERNEL_NAME(op_type) \ PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \ - pt_declare_ai_name_ns_check_##op_type, \ - "PT_DECLARE_BASE_KERNEL_NAME must be called in global namespace."); \ + PD_DECLARE_ai_name_ns_check_##op_type, \ + "PD_DECLARE_BASE_KERNEL_NAME must be called in global namespace."); \ extern int TouchBaseKernelNameSymbol_##op_type(); \ UNUSED static int __declare_base_kernel_name_symbol_for_##op_type = \ TouchBaseKernelNameSymbol_##op_type() -#define PT_REGISTER_ARG_MAPPING_FN(op_type, arg_mapping_fn) \ +#define PD_REGISTER_ARG_MAPPING_FN(op_type, arg_mapping_fn) \ PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \ - pt_register_arg_map_fn_ns_check_##op_type, \ - "PT_REGISTER_ARG_MAPPING_FN must be called in global namespace."); \ + PD_REGISTER_arg_map_fn_ns_check_##op_type, \ + "PD_REGISTER_ARG_MAPPING_FN must be called in global namespace."); \ static const ::phi::ArgumentMappingFnRegistrar \ __registrar_arg_map_fn_for_##op_type(#op_type, arg_mapping_fn); \ int TouchArgumentMappingFnSymbol_##op_type() { return 0; } -#define PT_DECLARE_ARG_MAPPING_FN(op_type) \ +#define PD_DECLARE_ARG_MAPPING_FN(op_type) \ PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \ - pt_declare_arg_map_fn_ns_check_##op_type, \ - "PT_DECLARE_ARG_MAPPING_FN must be called in global namespace."); \ + PD_DECLARE_arg_map_fn_ns_check_##op_type, \ + "PD_DECLARE_ARG_MAPPING_FN must be called in global namespace."); \ extern int TouchArgumentMappingFnSymbol_##op_type(); \ UNUSED static int 
__declare_arg_map_fn_symbol_for_##op_type = \
      TouchArgumentMappingFnSymbol_##op_type()
diff --git a/paddle/phi/core/custom_kernel.cc b/paddle/phi/core/custom_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..75ff9cc28600373eb1f074c0ed91b774ec9ab85a
--- /dev/null
+++ b/paddle/phi/core/custom_kernel.cc
@@ -0,0 +1,66 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/core/custom_kernel.h"
+
+namespace phi {
+
+void RegisterCustomKernels(const CustomKernelMap& custom_kernel_map) {
+  auto& kernel_info_map = custom_kernel_map.GetMap();
+  VLOG(3) << "Size of custom_kernel_map: " << kernel_info_map.size();
+
+  for (auto& pair : kernel_info_map) {
+    PADDLE_ENFORCE_EQ(
+        KernelFactory::Instance().HasCompatiblePtenKernel(pair.first),
+        true,
+        phi::errors::InvalidArgument(
+            "The kernel %s is not ready for custom kernel registration.",
+            pair.first));
+
+    for (auto& info_pair : pair.second) {
+      auto& kernels = KernelFactory::Instance().kernels();
+      PADDLE_ENFORCE_EQ(
+          kernels[pair.first].find(info_pair.first),
+          kernels[pair.first].end(),
+          phi::errors::InvalidArgument(
+              "The operator <%s>'s kernel: %s already exists in Paddle. "
+              "Please contribute a PR if the kernel needs to be optimized. "
+              "Custom kernels do NOT support replacing kernels that "
+              "already exist in Paddle.",
+              pair.first,
+              info_pair.first));
+
+      kernels[pair.first][info_pair.first] = info_pair.second;
+
+      VLOG(3) << "Succeeded in registering operator <" << pair.first
+              << ">'s kernel: " << info_pair.first
+              << " to Paddle. It will be used like native ones.";
+    }
+  }
+}
+
+} // namespace phi
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// C-API to get global CustomKernelMap.
+phi::CustomKernelMap& PD_GetCustomKernelMap() {
+  return phi::CustomKernelMap::Instance();
+}
+
+#ifdef __cplusplus
+} // end extern "C"
+#endif
diff --git a/paddle/phi/core/custom_kernel.h b/paddle/phi/core/custom_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..20ae2b7bb7360ab6878617234784157584e01858
--- /dev/null
+++ b/paddle/phi/core/custom_kernel.h
@@ -0,0 +1,49 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/kernel_factory.h"
+#include "paddle/phi/core/macros.h"
+
+namespace phi {
+/**
+ * Note:
+ * Used to store kernels' info before they are registered to KernelFactory. 
+ */ +class CustomKernelMap { + public: + static CustomKernelMap& Instance() { + static CustomKernelMap g_custom_kernel_info_map; + return g_custom_kernel_info_map; + } + + KernelNameMap& Kernels() { return kernels_; } + + const KernelNameMap& GetMap() const { return kernels_; } + + private: + CustomKernelMap() = default; + DISABLE_COPY_AND_ASSIGN(CustomKernelMap); + + KernelNameMap kernels_; +}; + +/** + * Note: + * Used to register custom kernels to KernelFactory. + */ +void RegisterCustomKernels(const CustomKernelMap& custom_kernel_map); + +} // namespace phi diff --git a/paddle/phi/core/ddim.h b/paddle/phi/core/ddim.h index 1d186fe3b43fe00965db2ff32c51d43d6b7a3c11..ce462d8d954023a1ccd2ff4d33e1cf9611b40513 100644 --- a/paddle/phi/core/ddim.h +++ b/paddle/phi/core/ddim.h @@ -13,6 +13,7 @@ // limitations under the License. #pragma once #include +#include #include #include #include diff --git a/paddle/phi/core/dense_tensor.cc b/paddle/phi/core/dense_tensor.cc index a363d3cbaaa340e183dfa3281800db4a9f72b104..44cb63e2b874bd2df9b034ecf9f03053d1888c94 100644 --- a/paddle/phi/core/dense_tensor.cc +++ b/paddle/phi/core/dense_tensor.cc @@ -73,7 +73,7 @@ void* DenseTensor::AllocateFrom(Allocator* allocator, size_t requested_size) { PADDLE_ENFORCE_NOT_NULL( allocator, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Required allocator shall not be nullptr, but received nullptr.")); if (this->dtype() != dtype) { VLOG(10) << "change data type in mutbale_data, target dtype - " << dtype; @@ -81,13 +81,13 @@ void* DenseTensor::AllocateFrom(Allocator* allocator, } PADDLE_ENFORCE( valid(), - paddle::platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "The meta data must be valid when call the mutable data function.")); size_t bytes = numel() * SizeOf(this->dtype()); if (requested_size) { PADDLE_ENFORCE_GE(requested_size, bytes, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The reserved size %d should be enough to meet the " "volume required by metadata %d.", requested_size, @@ -112,7 +112,7 @@ const T* DenseTensor::data() const { check_memory_size(); PADDLE_ENFORCE( (dtype() == paddle::experimental::CppTypeToDataType::Type()), - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The type of data we are trying to retrieve does not match the " "type of data currently contained in the container.")); return static_cast(data()); @@ -123,7 +123,7 @@ T* DenseTensor::data() { check_memory_size(); PADDLE_ENFORCE( (dtype() == paddle::experimental::CppTypeToDataType::Type()), - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The type of data we are trying to retrieve does not match the " "type of data currently contained in the container.")); return static_cast(data()); @@ -133,7 +133,7 @@ void* DenseTensor::data() { check_memory_size(); PADDLE_ENFORCE_NOT_NULL( holder_, - paddle::platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "The storage must be valid when call the data function.")); return reinterpret_cast(reinterpret_cast(holder_->ptr()) + meta_.offset); @@ -143,7 +143,7 @@ const void* DenseTensor::data() const { check_memory_size(); PADDLE_ENFORCE_NOT_NULL( holder_, - paddle::platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "The storage must be valid when call the data function.")); return reinterpret_cast( reinterpret_cast(holder_->ptr()) + meta_.offset); @@ -151,7 +151,7 @@ const void* DenseTensor::data() const { void 
DenseTensor::set_meta(DenseTensorMeta&& meta) { PADDLE_ENFORCE(!meta_.valid(), - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Only when the original attribute of Tensor is " "incomplete, can it be reset.")); meta_ = std::move(meta); @@ -160,7 +160,7 @@ void DenseTensor::set_meta(DenseTensorMeta&& meta) { void DenseTensor::set_meta(const DenseTensorMeta& meta) { PADDLE_ENFORCE( meta.valid(), - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Input meta is invalid, please check the meta attribute.")); meta_.dims = meta.dims; meta_.dtype = meta.dtype; diff --git a/paddle/phi/core/dense_tensor.h b/paddle/phi/core/dense_tensor.h index 622cedf1d7f91e843efe979c40b9cb298ca3181f..0dddd63099bbca66281c747fd35b8346a2ded726 100644 --- a/paddle/phi/core/dense_tensor.h +++ b/paddle/phi/core/dense_tensor.h @@ -171,6 +171,9 @@ class DenseTensor : public TensorBase, DenseTensorMeta meta_; std::shared_ptr holder_; +#ifndef PADDLE_WITH_CUSTOM_KERNEL #include "paddle/phi/core/dense_tensor.inl" +#endif }; + } // namespace phi diff --git a/paddle/phi/core/dense_tensor.inl b/paddle/phi/core/dense_tensor.inl index 0547776acad1f3e08752f8ee14d7acf235bdfab4..a422a95346e8b65e91a7404d70c213847e1dcf3e 100644 --- a/paddle/phi/core/dense_tensor.inl +++ b/paddle/phi/core/dense_tensor.inl @@ -54,22 +54,22 @@ DenseTensor(intrusive_ptr storage, DenseTensorMeta&& meta); inline bool IsInitialized() const { return holder_ != nullptr; } template -T* mutable_data(const paddle::platform::Place& place, +T* mutable_data(const phi::Place& place, size_t requested_size = 0); template T* mutable_data(const DDim& dims, - const paddle::platform::Place& place, + const phi::Place& place, size_t requested_size = 0); -void* mutable_data(const paddle::platform::Place& place, +void* mutable_data(const phi::Place& place, paddle::experimental::DataType type, size_t requested_size = 0); -void* mutable_data(const paddle::platform::Place& place, +void* mutable_data(const phi::Place& place, size_t requested_size = 0); -void* mutable_data(const paddle::platform::Place& place, +void* mutable_data(const phi::Place& place, paddle::experimental::DataType type, const phi::Stream& stream); diff --git a/paddle/phi/core/device_context.cc b/paddle/phi/core/device_context.cc index c3e0d2a75228b3211e5d76f95c2f8ff8089b6415..9c1d85251f8926141341ee6b8c15e29164894ee7 100644 --- a/paddle/phi/core/device_context.cc +++ b/paddle/phi/core/device_context.cc @@ -119,22 +119,39 @@ struct DeviceContext::Impl { gen, phi::errors::InvalidArgument( "Required generator shall not be nullptr, but received nullptr.")); - generator_ = gen; + device_generator_ = gen; } Generator* GetGenerator() const { PADDLE_ENFORCE_NOT_NULL( - generator_, + device_generator_, phi::errors::InvalidArgument("Required generator_ shall not be " "nullptr, but received nullptr.")); - return generator_; + return device_generator_; + } + + void SetHostGenerator(Generator* gen) { + PADDLE_ENFORCE_NOT_NULL( + gen, + phi::errors::InvalidArgument( + "Required generator shall not be nullptr, but received nullptr.")); + host_generator_ = gen; + } + + Generator* GetHostGenerator() const { + PADDLE_ENFORCE_NOT_NULL( + host_generator_, + phi::errors::InvalidArgument("Required generator_ shall not be " + "nullptr, but received nullptr.")); + return host_generator_; } private: const Allocator* device_allocator_{nullptr}; const Allocator* host_allocator_{nullptr}; const Allocator* zero_allocator_{nullptr}; - Generator* generator_{nullptr}; + Generator* 
device_generator_{nullptr}; + Generator* host_generator_{nullptr}; }; DeviceContext::DeviceContext() { impl_ = std::make_unique(); } @@ -143,6 +160,8 @@ DeviceContext::DeviceContext(const DeviceContext& other) { impl_->SetHostAllocator(&other.GetHostAllocator()); impl_->SetAllocator(&other.GetAllocator()); impl_->SetZeroAllocator(&other.GetZeroAllocator()); + impl_->SetHostGenerator(other.GetHostGenerator()); + impl_->SetGenerator(other.GetGenerator()); } DeviceContext::DeviceContext(DeviceContext&& other) { @@ -224,4 +243,12 @@ void DeviceContext::SetGenerator(Generator* gen) { impl_->SetGenerator(gen); } Generator* DeviceContext::GetGenerator() const { return impl_->GetGenerator(); } +void DeviceContext::SetHostGenerator(Generator* gen) { + impl_->SetHostGenerator(gen); +} + +Generator* DeviceContext::GetHostGenerator() const { + return impl_->GetHostGenerator(); +} + } // namespace phi diff --git a/paddle/phi/core/device_context.h b/paddle/phi/core/device_context.h index 7c1411e3bef3740f11ff39947028ead4d0357771..689f4e4e66d15f60aec873a9e9b9c07797833487 100644 --- a/paddle/phi/core/device_context.h +++ b/paddle/phi/core/device_context.h @@ -132,6 +132,19 @@ class DeviceContext { */ Generator* GetGenerator() const; + /** + * @brief Set the host generator for special op. + * + * @param Generator + */ + void SetHostGenerator(Generator*); + /** + * @brief Get the host generator object. + * + * @return Generator + */ + Generator* GetHostGenerator() const; + private: struct Impl; std::unique_ptr impl_; diff --git a/paddle/phi/core/hostdevice.h b/paddle/phi/core/hostdevice.h index 08fe3125287d76654173324e42a2d0773aab444c..0869df143235fcd937d75e7dba908c4efbd7ee95 100644 --- a/paddle/phi/core/hostdevice.h +++ b/paddle/phi/core/hostdevice.h @@ -18,14 +18,14 @@ #include #endif -#ifdef __xpu_kp__ +#if defined(__xpu__) #include #include "xpu/kernel/cluster_header.h" #include "xpu/kernel/debug.h" #include "xpu/kernel/math.h" #endif -#if (defined(__CUDACC__) || defined(__HIPCC__) || defined(__xpu_kp__)) +#if (defined(__CUDACC__) || defined(__HIPCC__) || defined(__xpu__)) #define HOSTDEVICE __host__ __device__ #define DEVICE __device__ #define HOST __host__ diff --git a/paddle/phi/core/infermeta_utils.cc b/paddle/phi/core/infermeta_utils.cc index d21232ed82296cb48af5c72a32264e5c8fd76085..f3dd056911ecf81d5ca0954114acbd1a3ac19ad9 100644 --- a/paddle/phi/core/infermeta_utils.cc +++ b/paddle/phi/core/infermeta_utils.cc @@ -67,6 +67,14 @@ const MetaTensor& InferMetaContext::InputAt(size_t idx) const { return *inputs_.at(idx); } +paddle::optional InferMetaContext::OptionalInputAt( + size_t idx) const { + const auto& input = inputs_.at(idx); + return input ? paddle::optional{static_cast< + const phi::MetaTensor&>(*input)} + : paddle::optional{paddle::none}; +} + std::vector InferMetaContext::InputsBetween(size_t start, size_t end) const { std::vector result; diff --git a/paddle/phi/core/infermeta_utils.h b/paddle/phi/core/infermeta_utils.h index 2b98ab22bcdbd43a1863c2d59d93e31c510368b8..203dbb269841ec8616b94c89603af3904eb572c3 100644 --- a/paddle/phi/core/infermeta_utils.h +++ b/paddle/phi/core/infermeta_utils.h @@ -25,6 +25,7 @@ limitations under the License. 
*/ #include "paddle/phi/core/macros.h" #include "paddle/phi/core/meta_tensor.h" #include "paddle/phi/core/type_defs.h" +#include "paddle/utils/any.h" #include "paddle/utils/flat_hash_map.h" #include "paddle/utils/small_vector.h" @@ -50,6 +51,9 @@ class InferMetaContext { const MetaConfig& GetMetaConfig() const; const MetaTensor& InputAt(size_t idx) const; + + paddle::optional OptionalInputAt(size_t idx) const; + std::vector InputsBetween(size_t start, size_t end) const; MetaTensor* MutableOutputAt(size_t idx); std::vector MutableOutputBetween(size_t start, size_t end); @@ -134,6 +138,24 @@ struct InferMetaFnImpl { } }; + template + struct InferMetaFnCallHelper, Tail...> { + template + static void Call(InferMetaContext* ctx, PreviousArgs&... pargs) { + static_assert(attr_idx == 0, + "InferMeta's Input should appear before Attributes."); + static_assert(out_idx == 0, + "InferMeta's Input should appear before Outputs."); + const std::pair range = ctx->InputRangeAt(in_idx); + auto arg = ctx->OptionalInputAt(range.first); + + InferMetaFnCallHelper< + Tail...>::template Call(ctx, + pargs..., + arg); + } + }; + template struct InferMetaFnCallHelper&, Tail...> { template @@ -282,10 +304,10 @@ struct InferMetaFnRegistrar { } }; -#define PT_REGISTER_INFER_META_FN(kernel_name_prefix, variadic_infer_meta_fn) \ +#define PD_REGISTER_INFER_META_FN(kernel_name_prefix, variadic_infer_meta_fn) \ PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \ - pt_register_infer_meta_fn_ns_check_##kernel_name_prefix, \ - "PT_REGISTER_INFER_META_FN must be called in global namespace."); \ + PD_REGISTER_infer_meta_fn_ns_check_##kernel_name_prefix, \ + "PD_REGISTER_INFER_META_FN must be called in global namespace."); \ static const ::phi::InferMetaFnRegistrar \ __registrar_arg_map_fn_for_##kernel_name_prefix( \ #kernel_name_prefix, PT_INFER_META(variadic_infer_meta_fn)) diff --git a/paddle/phi/core/kernel_context.cc b/paddle/phi/core/kernel_context.cc index 3c7222f7a5379fe1f9d6c87ffdb38d6e6a8fa48c..a32e0e44f469694c62ff33863971d3b04004ff37 100644 --- a/paddle/phi/core/kernel_context.cc +++ b/paddle/phi/core/kernel_context.cc @@ -69,7 +69,7 @@ void KernelContext::AssignInputRange(std::pair&& range, size_t idx) { } else if (idx == input_range_.size()) { input_range_.emplace_back(range); } else { - PADDLE_THROW(paddle::platform::errors::PreconditionNotMet( + PADDLE_THROW(phi::errors::PreconditionNotMet( "Invalid idx when trying to set InputRange, " "index is `%d`, it is greater than the size(%d) of InputRange.", idx, @@ -83,7 +83,7 @@ void KernelContext::AssignOutputRange(std::pair&& range, size_t idx) { } else if (idx == output_range_.size()) { output_range_.emplace_back(range); } else { - PADDLE_THROW(paddle::platform::errors::PreconditionNotMet( + PADDLE_THROW(phi::errors::PreconditionNotMet( "Invalid idx when trying to set InputRange, " "index is `%d`, it is greater than the size(%d) of InputRange.", idx, diff --git a/paddle/phi/core/kernel_context.h b/paddle/phi/core/kernel_context.h index 0b960004fcb2729181b8f8d91b7d4cb041b01ca8..57e2db60c24caea8cbac323d9c47bdb53acc8a8c 100644 --- a/paddle/phi/core/kernel_context.h +++ b/paddle/phi/core/kernel_context.h @@ -22,6 +22,7 @@ #include "paddle/phi/core/tensor_base.h" #include "paddle/phi/core/tensor_utils.h" #include "paddle/utils/any.h" +#include "paddle/utils/optional.h" #include "paddle/utils/small_vector.h" namespace phi { diff --git a/paddle/phi/core/kernel_registry.h b/paddle/phi/core/kernel_registry.h index 
577e9e28cf3791880a34114201075447f6d9eaf0..6a1688947b986549e1feaf39cdf6c73749b0ff3a 100644 --- a/paddle/phi/core/kernel_registry.h +++ b/paddle/phi/core/kernel_registry.h @@ -21,6 +21,7 @@ #include #include +#include "paddle/phi/core/custom_kernel.h" #include "paddle/phi/core/kernel_factory.h" #include "paddle/phi/core/kernel_utils.h" #include "paddle/phi/core/macros.h" @@ -62,6 +63,9 @@ struct KernelArgsParseFunctor { #elif defined(PADDLE_WITH_XPU) || arg_type == std::type_index(typeid(const XPUContext&))) { +#elif defined(PADDLE_WITH_CUSTOM_DEVICE) + || + arg_type == std::type_index(typeid(const CustomContext&))) { #else ) { #endif @@ -83,11 +87,13 @@ struct KernelArgsParseFunctor { default_tensor_layout, default_key.dtype(), arg_type); +#ifndef PADDLE_WITH_CUSTOM_KERNEL } else if (arg_type == std::type_index(typeid(const SelectedRows&))) { args_def->AppendInput(default_key.backend(), default_tensor_layout, default_key.dtype(), arg_type); +#endif } else if (arg_type == std::type_index(typeid(DenseTensor*))) { args_def->AppendOutput(default_key.backend(), default_tensor_layout, @@ -99,11 +105,13 @@ struct KernelArgsParseFunctor { default_tensor_layout, default_key.dtype(), arg_type); +#ifndef PADDLE_WITH_CUSTOM_KERNEL } else if (arg_type == std::type_index(typeid(SelectedRows*))) { args_def->AppendOutput(default_key.backend(), default_tensor_layout, default_key.dtype(), arg_type); +#endif } else { // Attribute deal with // TODO(chenweihang): now here allow any types of attribute, maybe @@ -121,20 +129,28 @@ struct KernelArgsParseFunctor { } }; +// NOTE: used for making a difference between inner or outer registration. +enum class RegType : uint8_t { + INNER = 0, + OUTER, +}; + // TODO(chenweihang): Polish the kernel selection logic, support the selection // of ALL_DTYPE kernel, and simplify the constructor struct KernelRegistrar { public: - KernelRegistrar(const char* kernel_name_cstr, - Backend backend, + KernelRegistrar(RegType reg_type, + const char* kernel_name_cstr, + const char* backend_cstr, DataLayout layout, DataType dtype, KernelArgsParseFn args_parse_fn, KernelArgsDefFn args_def_fn, KernelFn kernel_fn, void* variadic_kernel_fn) { - ConstructKernel(kernel_name_cstr, - backend, + ConstructKernel(reg_type, + kernel_name_cstr, + backend_cstr, layout, dtype, args_parse_fn, @@ -143,8 +159,9 @@ struct KernelRegistrar { variadic_kernel_fn); } - KernelRegistrar(const char* kernel_name_cstr, - Backend backend, + KernelRegistrar(RegType reg_type, + const char* kernel_name_cstr, + const char* backend_cstr, DataLayout layout, KernelArgsParseFn args_parse_fn, KernelArgsDefFn args_def_fn, @@ -160,8 +177,9 @@ struct KernelRegistrar { dtype == static_cast(DataType::UINT16)) { continue; } - ConstructKernel(kernel_name_cstr, - backend, + ConstructKernel(reg_type, + kernel_name_cstr, + backend_cstr, layout, static_cast(dtype), args_parse_fn, @@ -172,8 +190,9 @@ struct KernelRegistrar { } private: - void ConstructKernel(const char* kernel_name_cstr, - Backend backend, + void ConstructKernel(RegType reg_type, + const char* kernel_name_cstr, + const char* backend_cstr, DataLayout layout, DataType dtype, KernelArgsParseFn args_parse_fn, @@ -181,11 +200,16 @@ struct KernelRegistrar { KernelFn kernel_fn, void* variadic_kernel_fn) { std::string kernel_name(kernel_name_cstr); - KernelKey kernel_key(backend, layout, dtype); + KernelKey kernel_key( + paddle::experimental::StringToBackend(backend_cstr), layout, dtype); Kernel kernel(kernel_fn, variadic_kernel_fn); args_parse_fn(kernel_key, 
kernel.mutable_args_def()); args_def_fn(kernel_key, &kernel); - KernelFactory::Instance().kernels()[kernel_name][kernel_key] = kernel; + if (reg_type == RegType::INNER) { + KernelFactory::Instance().kernels()[kernel_name][kernel_key] = kernel; + } else { + CustomKernelMap::Instance().Kernels()[kernel_name][kernel_key] = kernel; + } } }; @@ -210,7 +234,7 @@ struct KernelRegistrar { #define _PT_ARG_N(args) _PT_ARG_N_EXPAND args #define _PT_RESQ_N() 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 -/** PT_REGISTER_KERNEL +/** PD_REGISTER_KERNEL * * The most frequently used kernel registration macro, used for kernel * registration with only data type as template parameter, and the function @@ -219,22 +243,39 @@ struct KernelRegistrar { * * Note: `2TA` means `2 template argument` */ -#define PT_REGISTER_KERNEL(kernel_name, backend, layout, meta_kernel_fn, ...) \ - PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \ - pt_register_tp_kernel_ns_check_##kernel_name##_##backend##_##layout, \ - "PT_REGISTER_KERNEL must be called in global namespace."); \ - PT_EXPAND(_PT_REGISTER_2TA_KERNEL( \ - kernel_name, backend, layout, meta_kernel_fn, __VA_ARGS__)) +#define PD_REGISTER_KERNEL(kernel_name, backend, layout, meta_kernel_fn, ...) \ + _PD_REGISTER_KERNEL(::phi::RegType::INNER, \ + kernel_name, \ + backend, \ + ::phi::backend##Context, \ + layout, \ + meta_kernel_fn, \ + __VA_ARGS__) + +#define _PD_REGISTER_KERNEL( \ + reg_type, kernel_name, backend, context, layout, meta_kernel_fn, ...) \ + PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \ + PD_REGISTER_tp_kernel_ns_check_##kernel_name##_##backend##_##layout, \ + "PD_REGISTER_KERNEL must be called in global namespace."); \ + PT_EXPAND(_PD_REGISTER_2TA_KERNEL(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + meta_kernel_fn, \ + __VA_ARGS__)) #ifndef _WIN32 -#define _PT_REGISTER_2TA_KERNEL( \ - kernel_name, backend, layout, meta_kernel_fn, ...) \ - PT_KERNEL_INSTANTIATION(meta_kernel_fn, backend, __VA_ARGS__); \ +#define _PD_REGISTER_2TA_KERNEL( \ + reg_type, kernel_name, backend, context, layout, meta_kernel_fn, ...) \ + PT_KERNEL_INSTANTIATION(meta_kernel_fn, backend, context, __VA_ARGS__); \ static void __PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout( \ const ::phi::KernelKey& kernel_key, ::phi::Kernel* kernel); \ PT_KERNEL_REGISTRAR_INIT( \ + reg_type, \ kernel_name, \ backend, \ + context, \ layout, \ &__PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout, \ meta_kernel_fn, \ @@ -254,13 +295,15 @@ struct KernelRegistrar { * * And msvc can work without template instantiation */ -#define _PT_REGISTER_2TA_KERNEL( \ - kernel_name, backend, layout, meta_kernel_fn, ...) \ +#define _PD_REGISTER_2TA_KERNEL( \ + reg_type, kernel_name, backend, context, layout, meta_kernel_fn, ...) \ static void __PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout( \ const ::phi::KernelKey& kernel_key, ::phi::Kernel* kernel); \ PT_EXPAND(PT_KERNEL_REGISTRAR_INIT( \ + reg_type, \ kernel_name, \ backend, \ + context, \ layout, \ &__PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout, \ meta_kernel_fn, \ @@ -269,82 +312,119 @@ struct KernelRegistrar { const ::phi::KernelKey& kernel_key, ::phi::Kernel* kernel) #endif -#define PT_KERNEL_INSTANTIATION(meta_kernel_fn, backend, ...) \ - _PT_KERNEL_INSTANTIATION( \ - PT_NARGS(__VA_ARGS__), meta_kernel_fn, backend, __VA_ARGS__) - -#define _PT_KERNEL_INSTANTIATION(N, meta_kernel_fn, backend, ...) 
\ - PT_CONCATENATE(_PT_KERNEL_INSTANTIATION_, N) \ - (meta_kernel_fn, backend, __VA_ARGS__) - -#define _PT_KERNEL_INSTANTIATION_1(meta_kernel_fn, backend, cpp_dtype) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn -#define _PT_KERNEL_INSTANTIATION_2(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_1(meta_kernel_fn, backend, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION_3(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_2(meta_kernel_fn, backend, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION_4(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_3(meta_kernel_fn, backend, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION_5(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_4(meta_kernel_fn, backend, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION_6(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_5(meta_kernel_fn, backend, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION_7(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_6(meta_kernel_fn, backend, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION_8(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_7(meta_kernel_fn, backend, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION_9(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_8(meta_kernel_fn, backend, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION_10(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_9(meta_kernel_fn, backend, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION_11(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_10(meta_kernel_fn, backend, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION_12(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_11(meta_kernel_fn, backend, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION_13(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_12(meta_kernel_fn, backend, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION_14(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_13(meta_kernel_fn, backend, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION_15(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_14(meta_kernel_fn, backend, __VA_ARGS__)) - -#define PT_KERNEL_REGISTRAR_INIT( \ - kernel_name, backend, layout, args_def_fn, meta_kernel_fn, ...) 
\ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT(PT_NARGS(__VA_ARGS__), \ - kernel_name, \ - backend, \ - layout, \ - args_def_fn, \ - meta_kernel_fn, \ +#define PT_KERNEL_INSTANTIATION(meta_kernel_fn, backend, context, ...) \ + _PT_KERNEL_INSTANTIATION( \ + PT_NARGS(__VA_ARGS__), meta_kernel_fn, backend, context, __VA_ARGS__) + +#define _PT_KERNEL_INSTANTIATION(N, meta_kernel_fn, backend, context, ...) \ + PT_CONCATENATE(_PT_KERNEL_INSTANTIATION_, N) \ + (meta_kernel_fn, backend, context, __VA_ARGS__) + +#define _PT_KERNEL_INSTANTIATION_1( \ + meta_kernel_fn, backend, context, cpp_dtype) \ + template decltype( \ + meta_kernel_fn) meta_kernel_fn +#define _PT_KERNEL_INSTANTIATION_2( \ + meta_kernel_fn, backend, context, cpp_dtype, ...) \ + template decltype( \ + meta_kernel_fn) meta_kernel_fn; \ + PT_EXPAND(_PT_KERNEL_INSTANTIATION_1( \ + meta_kernel_fn, backend, context, __VA_ARGS__)) +#define _PT_KERNEL_INSTANTIATION_3( \ + meta_kernel_fn, backend, context, cpp_dtype, ...) \ + template decltype( \ + meta_kernel_fn) meta_kernel_fn; \ + PT_EXPAND(_PT_KERNEL_INSTANTIATION_2( \ + meta_kernel_fn, backend, context, __VA_ARGS__)) +#define _PT_KERNEL_INSTANTIATION_4( \ + meta_kernel_fn, backend, context, cpp_dtype, ...) \ + template decltype( \ + meta_kernel_fn) meta_kernel_fn; \ + PT_EXPAND(_PT_KERNEL_INSTANTIATION_3( \ + meta_kernel_fn, backend, context, __VA_ARGS__)) +#define _PT_KERNEL_INSTANTIATION_5( \ + meta_kernel_fn, backend, context, cpp_dtype, ...) \ + template decltype( \ + meta_kernel_fn) meta_kernel_fn; \ + PT_EXPAND(_PT_KERNEL_INSTANTIATION_4( \ + meta_kernel_fn, backend, context, __VA_ARGS__)) +#define _PT_KERNEL_INSTANTIATION_6( \ + meta_kernel_fn, backend, context, cpp_dtype, ...) \ + template decltype( \ + meta_kernel_fn) meta_kernel_fn; \ + PT_EXPAND(_PT_KERNEL_INSTANTIATION_5( \ + meta_kernel_fn, backend, context, __VA_ARGS__)) +#define _PT_KERNEL_INSTANTIATION_7( \ + meta_kernel_fn, backend, context, cpp_dtype, ...) \ + template decltype( \ + meta_kernel_fn) meta_kernel_fn; \ + PT_EXPAND(_PT_KERNEL_INSTANTIATION_6( \ + meta_kernel_fn, backend, context, __VA_ARGS__)) +#define _PT_KERNEL_INSTANTIATION_8( \ + meta_kernel_fn, backend, context, cpp_dtype, ...) \ + template decltype( \ + meta_kernel_fn) meta_kernel_fn; \ + PT_EXPAND(_PT_KERNEL_INSTANTIATION_7( \ + meta_kernel_fn, backend, context, __VA_ARGS__)) +#define _PT_KERNEL_INSTANTIATION_9( \ + meta_kernel_fn, backend, context, cpp_dtype, ...) \ + template decltype( \ + meta_kernel_fn) meta_kernel_fn; \ + PT_EXPAND(_PT_KERNEL_INSTANTIATION_8( \ + meta_kernel_fn, backend, context, __VA_ARGS__)) +#define _PT_KERNEL_INSTANTIATION_10( \ + meta_kernel_fn, backend, context, cpp_dtype, ...) \ + template decltype( \ + meta_kernel_fn) meta_kernel_fn; \ + PT_EXPAND(_PT_KERNEL_INSTANTIATION_9( \ + meta_kernel_fn, backend, context, __VA_ARGS__)) +#define _PT_KERNEL_INSTANTIATION_11( \ + meta_kernel_fn, backend, context, cpp_dtype, ...) \ + template decltype( \ + meta_kernel_fn) meta_kernel_fn; \ + PT_EXPAND(_PT_KERNEL_INSTANTIATION_10( \ + meta_kernel_fn, backend, context, __VA_ARGS__)) +#define _PT_KERNEL_INSTANTIATION_12( \ + meta_kernel_fn, backend, context, cpp_dtype, ...) \ + template decltype( \ + meta_kernel_fn) meta_kernel_fn; \ + PT_EXPAND(_PT_KERNEL_INSTANTIATION_11( \ + meta_kernel_fn, backend, context, __VA_ARGS__)) +#define _PT_KERNEL_INSTANTIATION_13( \ + meta_kernel_fn, backend, context, cpp_dtype, ...) 
\ + template decltype( \ + meta_kernel_fn) meta_kernel_fn; \ + PT_EXPAND(_PT_KERNEL_INSTANTIATION_12( \ + meta_kernel_fn, backend, context, __VA_ARGS__)) +#define _PT_KERNEL_INSTANTIATION_14( \ + meta_kernel_fn, backend, context, cpp_dtype, ...) \ + template decltype( \ + meta_kernel_fn) meta_kernel_fn; \ + PT_EXPAND(_PT_KERNEL_INSTANTIATION_13( \ + meta_kernel_fn, backend, context, __VA_ARGS__)) +#define _PT_KERNEL_INSTANTIATION_15( \ + meta_kernel_fn, backend, context, cpp_dtype, ...) \ + template decltype( \ + meta_kernel_fn) meta_kernel_fn; \ + PT_EXPAND(_PT_KERNEL_INSTANTIATION_14( \ + meta_kernel_fn, backend, context, __VA_ARGS__)) + +#define PT_KERNEL_REGISTRAR_INIT(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + args_def_fn, \ + meta_kernel_fn, \ + ...) \ + PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT(PT_NARGS(__VA_ARGS__), \ + reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + args_def_fn, \ + meta_kernel_fn, \ __VA_ARGS__)) // clang-format off @@ -352,15 +432,19 @@ struct KernelRegistrar { /* The =pre-commit always treats this macro into the wrong format, and multi-line macros cannot be skipped with NOLINT.*/ #define _PT_KERNEL_REGISTRAR_INIT(N, \ + reg_type, \ kernel_name, \ backend, \ + context, \ layout, \ args_def_fn, \ meta_kernel_fn, \ ...) \ PT_EXPAND(PT_CONCATENATE(_PT_KERNEL_REGISTRAR_INIT_, N) ( \ + reg_type, \ kernel_name, \ backend, \ + context, \ layout, \ PT_ID, \ args_def_fn, \ @@ -369,413 +453,492 @@ struct KernelRegistrar { // clang-format on -#define _PT_KERNEL_REGISTRAR_INIT_1(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype) \ - static const ::phi::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ - #kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type(), \ - ::phi::KernelArgsParseFunctor)>::Parse, \ - args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ +#define _PT_KERNEL_REGISTRAR_INIT_1(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + registrar_id, \ + args_def_fn, \ + meta_kernel_fn, \ + cpp_dtype) \ + static const ::phi::KernelRegistrar PT_CONCATENATE( \ + __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ + reg_type, \ + #kernel_name, \ + #backend, \ + DATALAYOUT(layout), \ + ::paddle::experimental::CppTypeToDataType::Type(), \ + ::phi::KernelArgsParseFunctor)>::Parse, \ + args_def_fn, \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL(meta_kernel_fn)); \ int TouchKernelSymbolFor_##kernel_name##_##backend##_##layout() { return 0; } -#define _PT_KERNEL_REGISTRAR_INIT_2(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static const ::phi::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ - #kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type(), \ - ::phi::KernelArgsParseFunctor)>::Parse, \ - args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_1(kernel_name, \ - backend, \ - layout, \ - PT_ID, \ - args_def_fn, \ - meta_kernel_fn, \ +#define _PT_KERNEL_REGISTRAR_INIT_2(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + registrar_id, \ + args_def_fn, \ + meta_kernel_fn, \ + cpp_dtype, \ + ...) 
\ + static const ::phi::KernelRegistrar PT_CONCATENATE( \ + __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ + reg_type, \ + #kernel_name, \ + #backend, \ + DATALAYOUT(layout), \ + ::paddle::experimental::CppTypeToDataType::Type(), \ + ::phi::KernelArgsParseFunctor)>::Parse, \ + args_def_fn, \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL(meta_kernel_fn)); \ + PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_1(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + PT_ID, \ + args_def_fn, \ + meta_kernel_fn, \ __VA_ARGS__)) -#define _PT_KERNEL_REGISTRAR_INIT_3(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static const ::phi::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ - #kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type(), \ - ::phi::KernelArgsParseFunctor)>::Parse, \ - args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_2(kernel_name, \ - backend, \ - layout, \ - PT_ID, \ - args_def_fn, \ - meta_kernel_fn, \ +#define _PT_KERNEL_REGISTRAR_INIT_3(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + registrar_id, \ + args_def_fn, \ + meta_kernel_fn, \ + cpp_dtype, \ + ...) \ + static const ::phi::KernelRegistrar PT_CONCATENATE( \ + __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ + reg_type, \ + #kernel_name, \ + #backend, \ + DATALAYOUT(layout), \ + ::paddle::experimental::CppTypeToDataType::Type(), \ + ::phi::KernelArgsParseFunctor)>::Parse, \ + args_def_fn, \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL(meta_kernel_fn)); \ + PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_2(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + PT_ID, \ + args_def_fn, \ + meta_kernel_fn, \ __VA_ARGS__)) -#define _PT_KERNEL_REGISTRAR_INIT_4(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static const ::phi::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ - #kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type(), \ - ::phi::KernelArgsParseFunctor)>::Parse, \ - args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_3(kernel_name, \ - backend, \ - layout, \ - PT_ID, \ - args_def_fn, \ - meta_kernel_fn, \ +#define _PT_KERNEL_REGISTRAR_INIT_4(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + registrar_id, \ + args_def_fn, \ + meta_kernel_fn, \ + cpp_dtype, \ + ...) \ + static const ::phi::KernelRegistrar PT_CONCATENATE( \ + __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ + reg_type, \ + #kernel_name, \ + #backend, \ + DATALAYOUT(layout), \ + ::paddle::experimental::CppTypeToDataType::Type(), \ + ::phi::KernelArgsParseFunctor)>::Parse, \ + args_def_fn, \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL(meta_kernel_fn)); \ + PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_3(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + PT_ID, \ + args_def_fn, \ + meta_kernel_fn, \ __VA_ARGS__)) -#define _PT_KERNEL_REGISTRAR_INIT_5(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) 
\ - static const ::phi::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ - #kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type(), \ - ::phi::KernelArgsParseFunctor)>::Parse, \ - args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_4(kernel_name, \ - backend, \ - layout, \ - PT_ID, \ - args_def_fn, \ - meta_kernel_fn, \ +#define _PT_KERNEL_REGISTRAR_INIT_5(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + registrar_id, \ + args_def_fn, \ + meta_kernel_fn, \ + cpp_dtype, \ + ...) \ + static const ::phi::KernelRegistrar PT_CONCATENATE( \ + __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ + reg_type, \ + #kernel_name, \ + #backend, \ + DATALAYOUT(layout), \ + ::paddle::experimental::CppTypeToDataType::Type(), \ + ::phi::KernelArgsParseFunctor)>::Parse, \ + args_def_fn, \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL(meta_kernel_fn)); \ + PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_4(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + PT_ID, \ + args_def_fn, \ + meta_kernel_fn, \ __VA_ARGS__)) -#define _PT_KERNEL_REGISTRAR_INIT_6(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static const ::phi::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ - #kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type(), \ - ::phi::KernelArgsParseFunctor)>::Parse, \ - args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_5(kernel_name, \ - backend, \ - layout, \ - PT_ID, \ - args_def_fn, \ - meta_kernel_fn, \ +#define _PT_KERNEL_REGISTRAR_INIT_6(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + registrar_id, \ + args_def_fn, \ + meta_kernel_fn, \ + cpp_dtype, \ + ...) \ + static const ::phi::KernelRegistrar PT_CONCATENATE( \ + __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ + reg_type, \ + #kernel_name, \ + #backend, \ + DATALAYOUT(layout), \ + ::paddle::experimental::CppTypeToDataType::Type(), \ + ::phi::KernelArgsParseFunctor)>::Parse, \ + args_def_fn, \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL(meta_kernel_fn)); \ + PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_5(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + PT_ID, \ + args_def_fn, \ + meta_kernel_fn, \ __VA_ARGS__)) -#define _PT_KERNEL_REGISTRAR_INIT_7(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static const ::phi::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ - #kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type(), \ - ::phi::KernelArgsParseFunctor)>::Parse, \ - args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_6(kernel_name, \ - backend, \ - layout, \ - PT_ID, \ - args_def_fn, \ - meta_kernel_fn, \ +#define _PT_KERNEL_REGISTRAR_INIT_7(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + registrar_id, \ + args_def_fn, \ + meta_kernel_fn, \ + cpp_dtype, \ + ...) 
\ + static const ::phi::KernelRegistrar PT_CONCATENATE( \ + __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ + reg_type, \ + #kernel_name, \ + #backend, \ + DATALAYOUT(layout), \ + ::paddle::experimental::CppTypeToDataType::Type(), \ + ::phi::KernelArgsParseFunctor)>::Parse, \ + args_def_fn, \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL(meta_kernel_fn)); \ + PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_6(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + PT_ID, \ + args_def_fn, \ + meta_kernel_fn, \ __VA_ARGS__)) -#define _PT_KERNEL_REGISTRAR_INIT_8(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static const ::phi::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ - #kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type(), \ - ::phi::KernelArgsParseFunctor)>::Parse, \ - args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_7(kernel_name, \ - backend, \ - layout, \ - PT_ID, \ - args_def_fn, \ - meta_kernel_fn, \ +#define _PT_KERNEL_REGISTRAR_INIT_8(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + registrar_id, \ + args_def_fn, \ + meta_kernel_fn, \ + cpp_dtype, \ + ...) \ + static const ::phi::KernelRegistrar PT_CONCATENATE( \ + __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ + reg_type, \ + #kernel_name, \ + #backend, \ + DATALAYOUT(layout), \ + ::paddle::experimental::CppTypeToDataType::Type(), \ + ::phi::KernelArgsParseFunctor)>::Parse, \ + args_def_fn, \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL(meta_kernel_fn)); \ + PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_7(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + PT_ID, \ + args_def_fn, \ + meta_kernel_fn, \ __VA_ARGS__)) -#define _PT_KERNEL_REGISTRAR_INIT_9(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static const ::phi::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ - #kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type(), \ - ::phi::KernelArgsParseFunctor)>::Parse, \ - args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_8(kernel_name, \ - backend, \ - layout, \ - PT_ID, \ - args_def_fn, \ - meta_kernel_fn, \ +#define _PT_KERNEL_REGISTRAR_INIT_9(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + registrar_id, \ + args_def_fn, \ + meta_kernel_fn, \ + cpp_dtype, \ + ...) \ + static const ::phi::KernelRegistrar PT_CONCATENATE( \ + __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ + reg_type, \ + #kernel_name, \ + #backend, \ + DATALAYOUT(layout), \ + ::paddle::experimental::CppTypeToDataType::Type(), \ + ::phi::KernelArgsParseFunctor)>::Parse, \ + args_def_fn, \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL(meta_kernel_fn)); \ + PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_8(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + PT_ID, \ + args_def_fn, \ + meta_kernel_fn, \ __VA_ARGS__)) -#define _PT_KERNEL_REGISTRAR_INIT_10(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) 
\ - static const ::phi::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ - #kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type(), \ - ::phi::KernelArgsParseFunctor)>::Parse, \ - args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_9(kernel_name, \ - backend, \ - layout, \ - PT_ID, \ - args_def_fn, \ - meta_kernel_fn, \ +#define _PT_KERNEL_REGISTRAR_INIT_10(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + registrar_id, \ + args_def_fn, \ + meta_kernel_fn, \ + cpp_dtype, \ + ...) \ + static const ::phi::KernelRegistrar PT_CONCATENATE( \ + __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ + reg_type, \ + #kernel_name, \ + #backend, \ + DATALAYOUT(layout), \ + ::paddle::experimental::CppTypeToDataType::Type(), \ + ::phi::KernelArgsParseFunctor)>::Parse, \ + args_def_fn, \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL(meta_kernel_fn)); \ + PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_9(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + PT_ID, \ + args_def_fn, \ + meta_kernel_fn, \ __VA_ARGS__)) -#define _PT_KERNEL_REGISTRAR_INIT_11(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static const ::phi::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ - #kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type(), \ - ::phi::KernelArgsParseFunctor)>::Parse, \ - args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_10(kernel_name, \ - backend, \ - layout, \ - PT_ID, \ - args_def_fn, \ - meta_kernel_fn, \ +#define _PT_KERNEL_REGISTRAR_INIT_11(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + registrar_id, \ + args_def_fn, \ + meta_kernel_fn, \ + cpp_dtype, \ + ...) \ + static const ::phi::KernelRegistrar PT_CONCATENATE( \ + __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ + reg_type, \ + #kernel_name, \ + #backend, \ + DATALAYOUT(layout), \ + ::paddle::experimental::CppTypeToDataType::Type(), \ + ::phi::KernelArgsParseFunctor)>::Parse, \ + args_def_fn, \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL(meta_kernel_fn)); \ + PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_10(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + PT_ID, \ + args_def_fn, \ + meta_kernel_fn, \ __VA_ARGS__)) -#define _PT_KERNEL_REGISTRAR_INIT_12(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static const ::phi::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ - #kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type(), \ - ::phi::KernelArgsParseFunctor)>::Parse, \ - args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_11(kernel_name, \ - backend, \ - layout, \ - PT_ID, \ - args_def_fn, \ - meta_kernel_fn, \ +#define _PT_KERNEL_REGISTRAR_INIT_12(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + registrar_id, \ + args_def_fn, \ + meta_kernel_fn, \ + cpp_dtype, \ + ...) 
\ + static const ::phi::KernelRegistrar PT_CONCATENATE( \ + __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ + reg_type, \ + #kernel_name, \ + #backend, \ + DATALAYOUT(layout), \ + ::paddle::experimental::CppTypeToDataType::Type(), \ + ::phi::KernelArgsParseFunctor)>::Parse, \ + args_def_fn, \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL(meta_kernel_fn)); \ + PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_11(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + PT_ID, \ + args_def_fn, \ + meta_kernel_fn, \ __VA_ARGS__)) -#define _PT_KERNEL_REGISTRAR_INIT_13(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static const ::phi::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ - #kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type(), \ - ::phi::KernelArgsParseFunctor)>::Parse, \ - args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_12(kernel_name, \ - backend, \ - layout, \ - PT_ID, \ - args_def_fn, \ - meta_kernel_fn, \ +#define _PT_KERNEL_REGISTRAR_INIT_13(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + registrar_id, \ + args_def_fn, \ + meta_kernel_fn, \ + cpp_dtype, \ + ...) \ + static const ::phi::KernelRegistrar PT_CONCATENATE( \ + __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ + reg_type, \ + #kernel_name, \ + #backend, \ + DATALAYOUT(layout), \ + ::paddle::experimental::CppTypeToDataType::Type(), \ + ::phi::KernelArgsParseFunctor)>::Parse, \ + args_def_fn, \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL(meta_kernel_fn)); \ + PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_12(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + PT_ID, \ + args_def_fn, \ + meta_kernel_fn, \ __VA_ARGS__)) -#define _PT_KERNEL_REGISTRAR_INIT_14(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static const ::phi::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ - #kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type(), \ - ::phi::KernelArgsParseFunctor)>::Parse, \ - args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_13(kernel_name, \ - backend, \ - layout, \ - PT_ID, \ - args_def_fn, \ - meta_kernel_fn, \ +#define _PT_KERNEL_REGISTRAR_INIT_14(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + registrar_id, \ + args_def_fn, \ + meta_kernel_fn, \ + cpp_dtype, \ + ...) \ + static const ::phi::KernelRegistrar PT_CONCATENATE( \ + __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ + reg_type, \ + #kernel_name, \ + #backend, \ + DATALAYOUT(layout), \ + ::paddle::experimental::CppTypeToDataType::Type(), \ + ::phi::KernelArgsParseFunctor)>::Parse, \ + args_def_fn, \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL(meta_kernel_fn)); \ + PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_13(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + PT_ID, \ + args_def_fn, \ + meta_kernel_fn, \ __VA_ARGS__)) -#define _PT_KERNEL_REGISTRAR_INIT_15(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) 
\ - static const ::phi::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ - #kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type(), \ - ::phi::KernelArgsParseFunctor)>::Parse, \ - args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_14(kernel_name, \ - backend, \ - layout, \ - PT_ID, \ - args_def_fn, \ - meta_kernel_fn, \ +#define _PT_KERNEL_REGISTRAR_INIT_15(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + registrar_id, \ + args_def_fn, \ + meta_kernel_fn, \ + cpp_dtype, \ + ...) \ + static const ::phi::KernelRegistrar PT_CONCATENATE( \ + __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ + reg_type, \ + #kernel_name, \ + #backend, \ + DATALAYOUT(layout), \ + ::paddle::experimental::CppTypeToDataType::Type(), \ + ::phi::KernelArgsParseFunctor)>::Parse, \ + args_def_fn, \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL(meta_kernel_fn)); \ + PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_14(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + PT_ID, \ + args_def_fn, \ + meta_kernel_fn, \ __VA_ARGS__)) - -/** PT_REGISTER_GENERAL_KERNEL +/** PD_REGISTER_GENERAL_KERNEL * * Basic Kernel register marco, used to register a instantiated kernel function * with one template argument. */ -#define PT_REGISTER_GENERAL_KERNEL( \ - kernel_name, backend, layout, kernel_fn, dtype) \ +#define PD_REGISTER_GENERAL_KERNEL( \ + kernel_name, backend, layout, kernel_fn, dtype) \ + _PD_REGISTER_GENERAL_KERNEL( \ + ::phi::RegType::INNER, kernel_name, backend, layout, kernel_fn, dtype) + +#define _PD_REGISTER_GENERAL_KERNEL( \ + reg_type, kernel_name, backend, layout, kernel_fn, dtype) \ PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \ - pt_register_no_t_kernel_ns_check_##kernel_name##_##backend##_##layout, \ - "PT_REGISTER_NO_TEMPLATE_KERNEL must be called in global namespace."); \ - _PT_REGISTER_GENERAL_KERNEL(kernel_name, backend, layout, kernel_fn, dtype) + PD_REGISTER_no_t_kernel_ns_check_##kernel_name##_##backend##_##layout, \ + "PD_REGISTER_NO_TEMPLATE_KERNEL must be called in global namespace."); \ + __PD_REGISTER_GENERAL_KERNEL( \ + reg_type, kernel_name, backend, layout, kernel_fn, dtype) #ifndef _WIN32 -#define _PT_REGISTER_GENERAL_KERNEL( \ - kernel_name, backend, layout, kernel_fn, dtype) \ +#define __PD_REGISTER_GENERAL_KERNEL( \ + reg_type, kernel_name, backend, layout, kernel_fn, dtype) \ template decltype(kernel_fn) kernel_fn; \ static void __PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout( \ const ::phi::KernelKey& kernel_key, ::phi::Kernel* kernel); \ static const ::phi::KernelRegistrar \ __reg_pt_kernel_##kernel_name##_##backend##_##layout( \ + reg_type, \ #kernel_name, \ - BACKEND(backend), \ + #backend, \ DATALAYOUT(layout), \ ::phi::KernelArgsParseFunctor::Parse, \ &__PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout, \ @@ -787,14 +950,15 @@ struct KernelRegistrar { void __PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout( \ const ::phi::KernelKey& kernel_key, ::phi::Kernel* kernel) #else -#define _PT_REGISTER_GENERAL_KERNEL( \ - kernel_name, backend, layout, kernel_fn, dtype) \ +#define __PD_REGISTER_GENERAL_KERNEL( \ + reg_type, kernel_name, backend, layout, kernel_fn, dtype) \ static void __PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout( \ const ::phi::KernelKey& kernel_key, ::phi::Kernel* kernel); \ static const 
::phi::KernelRegistrar \ __reg_pt_kernel_##kernel_name##_##backend##_##layout( \ + reg_type, \ #kernel_name, \ - BACKEND(backend), \ + #backend, \ DATALAYOUT(layout), \ ::phi::KernelArgsParseFunctor::Parse, \ &__PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout, \ @@ -807,18 +971,48 @@ struct KernelRegistrar { const ::phi::KernelKey& kernel_key, ::phi::Kernel* kernel) #endif -/** PT_DECLARE_KERNEL +/** PD_DECLARE_KERNEL * * Used to export the symbols of the file where the kernel is located, * to avoid being removed by linker */ -#define PT_DECLARE_KERNEL(kernel_name, backend, layout) \ +#define PD_DECLARE_KERNEL(kernel_name, backend, layout) \ PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \ - pt_declare_tp_kernel_ns_check_##kernel_name##_##backend##_##layout, \ - "PT_DECLARE_KERNEL must be called in global namespace."); \ + PD_DECLARE_tp_kernel_ns_check_##kernel_name##_##backend##_##layout, \ + "PD_DECLARE_KERNEL must be called in global namespace."); \ extern int TouchKernelSymbolFor_##kernel_name##_##backend##_##layout(); \ UNUSED static int \ __declare_kernel_symbol_for_##kernel_name##_##backend##_##layout = \ TouchKernelSymbolFor_##kernel_name##_##backend##_##layout() +/** PD_REGISTER_BUILTIN_KERNEL + * + * Used to register kernels for built-in backends. + * Support CPU GPU XPU. + */ +#define PD_REGISTER_BUILTIN_KERNEL( \ + kernel_name, backend, layout, meta_kernel_fn, ...) \ + _PD_REGISTER_KERNEL(::phi::RegType::OUTER, \ + kernel_name, \ + backend, \ + ::phi::backend##Context, \ + layout, \ + meta_kernel_fn, \ + __VA_ARGS__) + +/** PD_REGISTER_PLUGIN_KERNEL + * + * Used to register kernels for plug-in backends. + * Support user-defined backend such as 'Ascend910'. + */ +#define PD_REGISTER_PLUGIN_KERNEL( \ + kernel_name, backend, layout, meta_kernel_fn, ...) 
\ + _PD_REGISTER_KERNEL(::phi::RegType::OUTER, \ + kernel_name, \ + backend, \ + ::phi::CustomContext, \ + layout, \ + meta_kernel_fn, \ + __VA_ARGS__) + } // namespace phi diff --git a/paddle/phi/core/kernel_utils.h b/paddle/phi/core/kernel_utils.h index 8c7d096eab0916d984819cfe85810a90cd29e631..862f61b20400e674b26f1277caa11a56d85f4e73 100644 --- a/paddle/phi/core/kernel_utils.h +++ b/paddle/phi/core/kernel_utils.h @@ -15,6 +15,7 @@ #pragma once #include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/backends/custom/custom_context.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/xpu/xpu_context.h" #include "paddle/phi/common/scalar.h" @@ -22,7 +23,9 @@ #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/kernel_context.h" +#ifndef PADDLE_WITH_CUSTOM_KERNEL #include "paddle/phi/core/selected_rows.h" +#endif #include "paddle/phi/core/sparse_coo_tensor.h" #include "paddle/phi/core/sparse_csr_tensor.h" #include "paddle/phi/core/type_defs.h" @@ -210,13 +213,18 @@ struct KernelImpl { #ifdef PADDLE_WITH_XPU PT_SPECIALIZE_KernelCallHelper_FOR_DEVICE_CONTEXT(XPUContext); #endif +#ifdef PADDLE_WITH_CUSTOM_DEVICE + PT_SPECIALIZE_KernelCallHelper_FOR_DEVICE_CONTEXT(CustomContext); +#endif /* Input Helpers */ PT_SPECIALIZE_KernelCallHelper_FOR_INPUT(DenseTensor); PT_SPECIALIZE_KernelCallHelper_FOR_OPTIONAL_INPUT(DenseTensor); PT_SPECIALIZE_KernelCallHelper_FOR_MULTI_INPUT(DenseTensor); +#ifndef PADDLE_WITH_CUSTOM_KERNEL PT_SPECIALIZE_KernelCallHelper_FOR_INPUT(SelectedRows); +#endif PT_SPECIALIZE_KernelCallHelper_FOR_INPUT(SparseCooTensor); PT_SPECIALIZE_KernelCallHelper_FOR_OPTIONAL_INPUT(SparseCooTensor); @@ -250,7 +258,9 @@ struct KernelImpl { PT_SPECIALIZE_KernelCallHelper_FOR_OUTPUT(DenseTensor); PT_SPECIALIZE_KernelCallHelper_FOR_MULTI_OUTPUT(DenseTensor); +#ifndef PADDLE_WITH_CUSTOM_KERNEL PT_SPECIALIZE_KernelCallHelper_FOR_OUTPUT(SelectedRows); +#endif PT_SPECIALIZE_KernelCallHelper_FOR_OUTPUT(SparseCooTensor); PT_SPECIALIZE_KernelCallHelper_FOR_MULTI_OUTPUT(SparseCooTensor); diff --git a/paddle/phi/core/lod_utils.h b/paddle/phi/core/lod_utils.h index 2b0be4d93429d222afbf28d9de0a7bced19a498b..147fca4cb576ce1625df83cca95d3701e082e6f6 100644 --- a/paddle/phi/core/lod_utils.h +++ b/paddle/phi/core/lod_utils.h @@ -13,12 +13,11 @@ // limitations under the License. #pragma once - -// See Note [ Why still include the fluid headers? 
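Note (illustrative sketch, not part of the patch): the _PT_KERNEL_REGISTRAR_INIT_N chain above peels one cpp_dtype off the argument list per expansion, emitting one static ::phi::KernelRegistrar per dtype, and now threads the new reg_type and context parameters through every level. Because the backend argument is stringized (#backend) rather than mapped through BACKEND(), PD_REGISTER_PLUGIN_KERNEL can accept backend names outside the built-in enum. A hypothetical out-of-tree registration might look like the sketch below; the backend name "Ascend910" comes from the comment above, while the kernel body, namespace, and dtype list are illustrative assumptions.

#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/kernel_registry.h"

namespace custom_plugin {

// Templated phi-style kernel: the registrar instantiates it once per listed dtype.
template <typename T, typename Context>
void AddKernel(const Context& dev_ctx,
               const phi::DenseTensor& x,
               const phi::DenseTensor& y,
               phi::DenseTensor* out) {
  // device-specific implementation elided in this sketch
}

}  // namespace custom_plugin

// RegType::OUTER and phi::CustomContext are supplied by the macro itself.
PD_REGISTER_PLUGIN_KERNEL(
    add, Ascend910, ALL_LAYOUT, custom_plugin::AddKernel, float, double) {}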
] -#include "paddle/fluid/framework/mixed_vector.h" +#include +#include namespace phi { -using LoD = std::vector>; +using LoD = std::vector>; void AppendLoD(LoD* lod, const LoD& lod_length); @@ -34,4 +33,4 @@ void AppendLoD(LoD* lod, const LoD& lod_length); */ LoD ConvertToLengthBasedLoD(const LoD& offset_lod); -} // namespace pten +} // namespace phi diff --git a/paddle/phi/core/selected_rows.h b/paddle/phi/core/selected_rows.h index cd48777b8ea61d58991923ea5919d7555d0a219b..7ee475b4d5d9e03d0931587f2a607f5f4950a426 100644 --- a/paddle/phi/core/selected_rows.h +++ b/paddle/phi/core/selected_rows.h @@ -55,25 +55,17 @@ class SelectedRows : public TensorBase, void set_height(int64_t height) { impl_->set_height(height); } - const paddle::framework::Vector& rows() const { - return impl_->rows(); - } + const std::vector& rows() const { return impl_->rows(); } - paddle::framework::Vector* mutable_rows() { - return impl_->mutable_rows(); - } - - void set_rows(const paddle::framework::Vector& rows) { - impl_->set_rows(rows); - } + std::vector* mutable_rows() { return impl_->mutable_rows(); } + void set_rows(const std::vector& rows) { impl_->set_rows(rows); } /* * @brief Get the index of key in rows * * @return -1 if the key does not exists. */ int64_t Index(int64_t key) const { return impl_->Index(key); } - /* * @brief whether has the specified key in the table. * diff --git a/paddle/phi/core/selected_rows_impl.cc b/paddle/phi/core/selected_rows_impl.cc index 920e9935d5899de82eb2cdd81616f8466916d7e3..7e5fd51343a09aa4ae974ad30f3265169489862c 100644 --- a/paddle/phi/core/selected_rows_impl.cc +++ b/paddle/phi/core/selected_rows_impl.cc @@ -28,7 +28,7 @@ struct ReAllocateVisitor { template void operator()() const { phi::DenseTensor cpu_tensor; - paddle::platform::CPUPlace cpu; + phi::CPUPlace cpu; T* ptr = cpu_tensor.mutable_data(dims_, cpu); const T* old_ptr = tensor_->memory_size() == 0 ? 
nullptr : tensor_->data(); @@ -57,7 +57,7 @@ struct TensorCopyVisitor { template void apply() const { // TODO(Yancey1989): support other place - paddle::platform::CPUPlace cpu; + phi::CPUPlace cpu; paddle::memory::Copy(cpu, dst_->mutable_data(cpu) + dst_offset_, cpu, @@ -82,7 +82,7 @@ struct TensorFillVisitor { template void apply() const { // TODO(qiao): support other place - paddle::platform::CPUPlace cpu; + phi::CPUPlace cpu; auto* tensor_data = dst_->mutable_data(cpu); auto* start = tensor_data + dst_offset_; auto* end = start + size_; @@ -121,16 +121,16 @@ int64_t SelectedRowsImpl::AutoGrownIndex(int64_t key, auto iter = id_to_index_.find(key); if (iter == id_to_index_.end()) { rwlock_->UNLock(); - PADDLE_ENFORCE_EQ(auto_grown, - true, - paddle::platform::errors::NotFound( - "Input key(%lld) is not found.", key)); + PADDLE_ENFORCE_EQ( + auto_grown, + true, + phi::errors::NotFound("Input key(%lld) is not found.", key)); rwlock_->WRLock(); auto map_size = id_to_index_.size(); auto vector_size = rows_.size(); if (map_size != vector_size) { rwlock_->UNLock(); - PADDLE_THROW(paddle::platform::errors::InvalidArgument( + PADDLE_THROW(phi::errors::InvalidArgument( "Row map size(%zu) should be equal to rows size(%zu).", map_size, vector_size)); @@ -140,7 +140,7 @@ int64_t SelectedRowsImpl::AutoGrownIndex(int64_t key, int row_num = rows_.size(); if (row_num == value_->dims()[0]) { rwlock_->UNLock(); - PADDLE_THROW(paddle::platform::errors::InvalidArgument( + PADDLE_THROW(phi::errors::InvalidArgument( "Selected rows is full, then length exceed the length of first " "dimension (%d).", row_num)); @@ -187,7 +187,7 @@ void SelectedRowsImpl::Get(const phi::DenseTensor& ids, PADDLE_ENFORCE_EQ( value_width, value->numel() / value->dims()[0], - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Output tensor should have the same shape with table " "except the first dimmension, excepted value width not counting " "the first dimension is %d, actual value width is %d.", diff --git a/paddle/phi/core/selected_rows_impl.h b/paddle/phi/core/selected_rows_impl.h index 86579e529371ad1289e8c792725b642b3a8e117c..3c54b59a159ddfdac25ad64f083cde97cfdd39f6 100644 --- a/paddle/phi/core/selected_rows_impl.h +++ b/paddle/phi/core/selected_rows_impl.h @@ -27,8 +27,6 @@ limitations under the License. */ #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/utils/rw_lock.h" -// See Note [ Why still include the fluid headers? ] -#include "paddle/fluid/framework/mixed_vector.h" namespace phi { class SelectedRowsImpl { /* @@ -68,13 +66,11 @@ class SelectedRowsImpl { void set_height(int64_t height) { height_ = height; } - const paddle::framework::Vector& rows() const { return rows_; } + const std::vector& rows() const { return rows_; } - paddle::framework::Vector* mutable_rows() { return &rows_; } + std::vector* mutable_rows() { return &rows_; } - void set_rows(const paddle::framework::Vector& rows) { - rows_ = rows; - } + void set_rows(const std::vector& rows) { rows_ = rows; } /* * @brief Get the index of key in rows @@ -84,7 +80,7 @@ class SelectedRowsImpl { int64_t Index(int64_t key) const { auto it = std::find(rows_.begin(), rows_.end(), key); if (it == rows_.end()) { - PADDLE_THROW(paddle::platform::errors::NotFound( + PADDLE_THROW(phi::errors::NotFound( "Input id (%lld) is not in current rows table.", key)); } return static_cast(std::distance(rows_.begin(), it)); @@ -156,10 +152,7 @@ class SelectedRowsImpl { /// \brief Returns the dims of the tensor. /// \return The dims of the tensor. 
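Note (illustrative sketch, not part of the patch): with the mixed_vector include dropped, SelectedRows and SelectedRowsImpl now hand out plain std::vector<int64_t> rows. A hypothetical caller, assuming the class's existing default constructor, works with standard containers only:

phi::SelectedRows sr;                          // assumed default-constructible
sr.set_height(10);
sr.set_rows(std::vector<int64_t>{0, 4, 7});    // previously paddle::framework::Vector<int64_t>
sr.mutable_rows()->push_back(9);
const std::vector<int64_t>& rows = sr.rows();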
- const DDim& dims() const noexcept { - return value_->dims(); - // return phi::make_ddim(dims); - } + const DDim& dims() const noexcept { return value_->dims(); } /// \brief Returns the data type of the tensor. /// \return The data type of the tensor. @@ -185,7 +178,7 @@ class SelectedRowsImpl { // Notice: rows can be duplicate. We can have {0, 4, 7, 0, 5, 7, 9} here. // SelectedRowsImpl are simply concated when adding together. Until a // SelectedRowsImpl add a Tensor, will the duplicate rows be handled. - paddle::framework::Vector rows_; + std::vector rows_; std::unordered_map id_to_index_; // should not be used when rows_ has duplicate member std::unique_ptr value_{nullptr}; diff --git a/paddle/phi/core/sparse_coo_tensor.cc b/paddle/phi/core/sparse_coo_tensor.cc index 1659f09248be02a74243a2de071606a9a8d5667c..f2987e36d3db0163c275562562bf5d6bf7aa91af 100644 --- a/paddle/phi/core/sparse_coo_tensor.cc +++ b/paddle/phi/core/sparse_coo_tensor.cc @@ -69,17 +69,17 @@ void SparseCooTensor::Resize(const DDim& dense_dims, const int64_t non_zero_num) { PADDLE_ENFORCE_GE(non_zero_num, this->nnz(), - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "the non_zero_num must be greater than or equal to the " "origin non_zero_num.")); PADDLE_ENFORCE_GE(sparse_dim, 1, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "the sparse_dim must be greater than or equal 1.")); PADDLE_ENFORCE_LE( sparse_dim, dense_dims.size(), - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "the sparse_dim must be less than or equal dense_dims.")); DDim indices_dims = phi::make_ddim({sparse_dim, non_zero_num}); diff --git a/paddle/phi/core/sparse_csr_tensor.cc b/paddle/phi/core/sparse_csr_tensor.cc index 7f7cd76378cc4932063ecd105147f0bc1a9d07b7..cbf5f941b665d8ae2be58472069d2e04891afe29 100644 --- a/paddle/phi/core/sparse_csr_tensor.cc +++ b/paddle/phi/core/sparse_csr_tensor.cc @@ -20,7 +20,7 @@ inline void check_shape(const DDim& dims) { bool valid = dims.size() == 2 || dims.size() == 3; PADDLE_ENFORCE(valid, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "the SparseCsrTensor only support 2-D Tensor.")); } #define Check(non_zero_crows, non_zero_cols, non_zero_elements, dims) \ @@ -29,12 +29,12 @@ inline void check_shape(const DDim& dims) { PADDLE_ENFORCE_EQ( \ non_zero_cols.place(), \ non_zero_crows.place(), \ - paddle::platform::errors::InvalidArgument( \ + phi::errors::InvalidArgument( \ "non_zero_crows and non_zero_cols must have the same place.")); \ PADDLE_ENFORCE_EQ( \ non_zero_cols.place(), \ non_zero_elements.place(), \ - paddle::platform::errors::InvalidArgument( \ + phi::errors::InvalidArgument( \ "non_zero_cols and non_zero_elements must have the same place.")); \ } @@ -77,7 +77,7 @@ void* SparseCsrTensor::AllocateFrom(Allocator* allocator, void SparseCsrTensor::Resize(const DDim& dense_dims, const int64_t non_zero_num) { PADDLE_ENFORCE(this->initialized(), - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "the SparseCsrTensor must be initialized when call Resize " "function.")); check_shape(dense_dims); diff --git a/paddle/phi/core/tensor_meta.h b/paddle/phi/core/tensor_meta.h index d5e5e2aa001fd4358bf35179316ddd7519840d05..3d2da542c74176017492bdb9f567396f81308d6a 100644 --- a/paddle/phi/core/tensor_meta.h +++ b/paddle/phi/core/tensor_meta.h @@ -20,16 +20,20 @@ limitations under the License. 
*/ #include "paddle/phi/common/data_type.h" #include "paddle/phi/common/layout.h" #include "paddle/phi/core/ddim.h" +#include "paddle/utils/any.h" +#include "paddle/utils/optional.h" // Note: mixed_vector include many header now, LoD will be // used on CUDA device? Can we use small_vector here? // @zhanlve: Rollback to original LoD for now +#ifndef PADDLE_WITH_CUSTOM_KERNEL #include "paddle/fluid/framework/mixed_vector.h" +#endif namespace phi { using DDim = phi::DDim; -using LoD = std::vector>; +using LoD = std::vector>; /// \brief The meta data of dense tensor. Take the structure type /// and use all default operations. /// diff --git a/paddle/phi/core/tensor_utils.h b/paddle/phi/core/tensor_utils.h index 04db7c0877ad81f2aa54241871fe7dca79380946..676a590ecbce23a107bcc891c37ac69406854035 100644 --- a/paddle/phi/core/tensor_utils.h +++ b/paddle/phi/core/tensor_utils.h @@ -31,25 +31,25 @@ class DenseTensorUtils { size_t bytes = tensor.numel() * SizeOf(tensor.dtype()); PADDLE_ENFORCE_GE(tensor.capacity(), bytes, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The memory size %d should be enough to meet the " "volume required by metadata %d.", tensor.capacity(), bytes)); - PADDLE_ENFORCE_GE(begin_idx, - 0, - paddle::platform::errors::OutOfRange( - "The start row index must be greater than 0." - "But received the start index is d%.", - begin_idx)); - PADDLE_ENFORCE_LE(end_idx, - tensor.dims()[0], - paddle::platform::errors::OutOfRange( - "The end row index is out of bound.")); + PADDLE_ENFORCE_GE( + begin_idx, + 0, + phi::errors::OutOfRange("The start row index must be greater than 0." + "But received the start index is d%.", + begin_idx)); + PADDLE_ENFORCE_LE( + end_idx, + tensor.dims()[0], + phi::errors::OutOfRange("The end row index is out of bound.")); PADDLE_ENFORCE_LT( begin_idx, end_idx, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The start row index must be less than the end row index." "But received the start index = %d, the end index = %d.", begin_idx, diff --git a/paddle/phi/infermeta/binary.cc b/paddle/phi/infermeta/binary.cc index f79b5982f6194c8fe52b32320014add744942623..58cd43998b8a5eeba52f324dc1609d72c61ff95b 100644 --- a/paddle/phi/infermeta/binary.cc +++ b/paddle/phi/infermeta/binary.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/phi/infermeta/binary.h" +#include "paddle/phi/core/ddim.h" #include "paddle/phi/kernels/funcs/common_shape.h" namespace phi { @@ -22,7 +23,7 @@ void DotInferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out) { auto x_rank = static_cast(x_dims.size()); PADDLE_ENFORCE_EQ(true, 1 == x_rank || 2 == x_rank, - paddle::platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "ShapeError: The dimensions of input tensor X (%s) " "should be 1 or 2", x_dims.to_str())); @@ -31,7 +32,7 @@ void DotInferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out) { PADDLE_ENFORCE_EQ( true, x_rank == static_cast(y_dims.size()), - paddle::platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "ShapeError: The shape of input tensor Y: %s should match with " "input tenosr X: %s", y_dims.to_str(), @@ -46,7 +47,7 @@ void DotInferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out) { PADDLE_ENFORCE_EQ(true, shape_match, - paddle::platform::errors::PreconditionNotMet( + phi::errors::PreconditionNotMet( "ShapeError: The shape of input tensor X: %s should " "be exactly the same " "with input tensor Y: %s", @@ -70,12 +71,12 @@ void MatmulInferMeta(const MetaTensor& x, auto ndims_y = dims_y.size(); PADDLE_ENFORCE_GT(ndims_x, 0UL, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The Input(x) dims size must be greater than 0," " but reviced dims size is 0. ")); PADDLE_ENFORCE_GT(ndims_y, 0UL, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The Input(y) dims size must be greater than 0," " but reviced dims size is 0. ")); @@ -149,7 +150,7 @@ void ElementwiseRawInferMeta(const MetaTensor& x, if (x_dims.size() == y_dims.size()) { PADDLE_ENFORCE_EQ((axis == -1) || (axis == 0), true, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "axis should be -1 or 0 while the dimension of " "tensor X (%s) is equal to the dimension of " "tensor Y (%s), but received axis: %s", @@ -159,7 +160,7 @@ void ElementwiseRawInferMeta(const MetaTensor& x, } PADDLE_ENFORCE_EQ((axis >= (-1 * max_dim)) && (axis < max_dim), true, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The axis range must be [%s, %s), but axis is %s. 
" "Please set the axis again.", -1 * max_dim, @@ -188,4 +189,128 @@ void ElementwiseRawInferMeta(const MetaTensor& x, out->share_lod(x); } +void HuberLossInferMeta(const MetaTensor& input, + const MetaTensor& label, + float delta, + MetaTensor* out, + MetaTensor* residual, + MetaConfig config) { + auto input_dims = input.dims(); + auto label_dims = label.dims(); + + PADDLE_ENFORCE_EQ(input_dims.size(), + label_dims.size(), + phi::errors::InvalidArgument( + "Input(input) rank and Input(label) rank should be " + "same, but received input rank(%d) != label rank(%d)", + input_dims.size(), + label_dims.size())); + + bool contain_unknown_dim = phi::contain_unknown_dim(input_dims) || + phi::contain_unknown_dim(label_dims); + if (config.is_runtime || !contain_unknown_dim) { + PADDLE_ENFORCE_EQ( + input_dims, + label_dims, + phi::errors::InvalidArgument( + "The Input(input) and Input(label) should have the same " + "shape, but received input shape [%s] != label shape [%s]", + input_dims, + label_dims)); + } + + auto out_dims = label_dims; + residual->set_dims(out_dims); + out->set_dims(out_dims); + out->share_lod(input); +} + +void CrossInferMeta(const MetaTensor& x, + const MetaTensor& y, + int axis, + MetaTensor* out) { + auto x_dim = x.dims(); + auto y_dim = y.dims(); + auto dim = axis; + + bool dims_match = phi::funcs::CheckDims(x_dim, y_dim); + PADDLE_ENFORCE_EQ( + dims_match, + true, + phi::errors::InvalidArgument("The 'shape' of Input(X) should be equal to " + "the 'shape' of Input(Y). But received " + "Input(X).dimensions = [%s], " + "Input(Y).dimensions = [%s]", + x_dim, + y_dim)); + + if (dim != DDim::kMaxRank) { + PADDLE_ENFORCE_EQ( + dim < x_dim.size() && dim >= (0 - x_dim.size()), + true, + phi::errors::OutOfRange( + "Attr(dim) is out of range, It's expected " + "to be in range of [-%d, %d]. But received Attr(dim) = %d.", + x_dim.size(), + x_dim.size() - 1, + dim)); + if (dim < 0) { + dim += x_dim.size(); + } + PADDLE_ENFORCE_EQ(x_dim[dim] == 3 && y_dim[dim] == 3, + true, + phi::errors::InvalidArgument( + "Input(X/Y).dims()[dim] should be equal to 3." + "But received Input(X/Y).dims()[dim] = %d.", + x_dim[dim])); + } + out->set_dims(x_dim); + out->set_dtype(x.dtype()); + out->set_layout(x.layout()); + out->share_lod(x); +} + +void Atan2InferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out) { + auto in_dims = x.dims(); + out->set_dims(in_dims); +} + +void BCELossInferMeta(const MetaTensor& input, + const MetaTensor& label, + MetaTensor* out, + MetaConfig config) { + auto input_dims = input.dims(); + auto label_dims = label.dims(); + + int rank = input_dims.size(); + PADDLE_ENFORCE_EQ(rank, + label_dims.size(), + phi::errors::InvalidArgument( + "Input(X) and Input(Label) shall have the same rank." + "But received: the rank of Input(X) is [%d], " + "the rank of Input(Label) is [%d].", + rank, + label_dims.size())); + + bool check = true; + if ((!config.is_runtime) && + (phi::product(input_dims) <= 0 || phi::product(label_dims) <= 0)) { + check = false; + } + + if (check) { + PADDLE_ENFORCE_EQ(input_dims, + label_dims, + phi::errors::InvalidArgument( + "Input(X) and Input(Label) shall have the same " + "shape. 
But received: the shape of Input(X) is " + "[%s], the shape of Input(Label) is [%s].", + input_dims, + label_dims)); + } + + out->set_dims(input_dims); + out->share_lod(input); +} + } // namespace phi diff --git a/paddle/phi/infermeta/binary.h b/paddle/phi/infermeta/binary.h index 5e3214127ee2361117a215ad7623b040599519df..02750482dccaabd53f360fcc361bfdc8e788b89e 100644 --- a/paddle/phi/infermeta/binary.h +++ b/paddle/phi/infermeta/binary.h @@ -45,4 +45,22 @@ void ElementwiseRawInferMeta(const MetaTensor& x_meta, const MetaTensor& y_meta, int axis, MetaTensor* out); + +void HuberLossInferMeta(const MetaTensor& input_meta, + const MetaTensor& label_meta, + float delta, + MetaTensor* out, + MetaTensor* residual, + MetaConfig config = MetaConfig()); + +void CrossInferMeta(const MetaTensor& x, + const MetaTensor& y, + int axis, + MetaTensor* out); + +void Atan2InferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out); +void BCELossInferMeta(const MetaTensor& input, + const MetaTensor& label, + MetaTensor* out, + MetaConfig config = MetaConfig()); } // namespace phi diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc index 5e7dd1de69d7d0f3de5ef7e67dc8d1f48373abdb..d72033f95285738f20c75b5d2a678fe4811e8a18 100644 --- a/paddle/phi/infermeta/multiary.cc +++ b/paddle/phi/infermeta/multiary.cc @@ -24,7 +24,7 @@ void ConcatInferMeta(const std::vector& x, MetaConfig config) { PADDLE_ENFORCE_GE(x.size(), 0UL, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The size of input meta vector should be greater" "than 0.")); @@ -34,7 +34,7 @@ void ConcatInferMeta(const std::vector& x, PADDLE_ENFORCE_EQ( axis >= -rank && axis < rank, true, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The axis is expected to be in range of [%d, %d), but got %d", -rank, rank, diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index 4b13545e038f0970c5ed60ca3c4fefaeb6edba58..ca71d6a56d8e785ab18e047e6ae552f5994cc0f0 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -18,6 +18,7 @@ limitations under the License. */ #include "paddle/phi/common/data_type.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/kernels/funcs/unfold_functor.h" namespace phi { @@ -37,11 +38,11 @@ void FlattenInferMeta(const MetaTensor& x, if (stop_axis < 0) { stop_axis = stop_axis + in_dims_size; } - PADDLE_ENFORCE_GE(stop_axis, - start_axis, - paddle::platform::errors::InvalidArgument( - "The stop_axis should be greater" - "than or equal to start_axis.")); + PADDLE_ENFORCE_GE( + stop_axis, + start_axis, + phi::errors::InvalidArgument("The stop_axis should be greater" + "than or equal to start_axis.")); int64_t outer = 1; std::vector out_shape; @@ -112,7 +113,7 @@ static phi::DDim ValidateShape(const std::vector shape, PADDLE_ENFORCE_EQ( unk_dim_idx, -1, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Only one dimension value of 'shape' in ReshapeOp can " "be -1. But received shape = [%s], shape[%d] is also -1.", phi::make_ddim(shape), @@ -122,7 +123,7 @@ static phi::DDim ValidateShape(const std::vector shape, PADDLE_ENFORCE_LT( static_cast(i), in_dims.size(), - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The index of 0 in `shape` must be less than " "the input tensor X's dimensions. 
" "But received shape = [%s], shape[%d] = 0, X's shape = [%s], " @@ -135,7 +136,7 @@ static phi::DDim ValidateShape(const std::vector shape, PADDLE_ENFORCE_GT( shape[i], 0, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Each dimension value of 'shape' in ReshapeOp must not " "be negative except one unknown dimension. " "But received shape = [%s], shape[%d] = %d.", @@ -160,7 +161,7 @@ static phi::DDim ValidateShape(const std::vector shape, PADDLE_ENFORCE_EQ( output_shape[unk_dim_idx] * capacity, -in_size, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The 'shape' attribute in ReshapeOp is invalid. " "The input tensor X'size must be divisible by known " "capacity of 'shape'. " @@ -178,7 +179,7 @@ static phi::DDim ValidateShape(const std::vector shape, PADDLE_ENFORCE_EQ( capacity, in_size, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The 'shape' in ReshapeOp is invalid. " "The input tensor X'size must be equal to the capacity of " "'shape'. " @@ -198,7 +199,7 @@ static phi::DDim ValidateShape(const std::vector shape, PADDLE_ENFORCE_LE( capacity, in_size, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The 'shape' in ReshapeOp is invalid. " "The input tensor X's shape = [%s], X's capacity = %d." "But the target shape of Out is [%s], the " @@ -363,7 +364,7 @@ void SplitInferMeta(const MetaTensor& x, PADDLE_ENFORCE_EQ( axis_value >= -rank && axis_value < rank, true, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The axis is expected to be in range of [%d, %d), but got %d", -rank, rank, @@ -382,7 +383,7 @@ void SplitInferMeta(const MetaTensor& x, PADDLE_ENFORCE_EQ(input_axis_dim % num, 0, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The input's size along the split dimension " "must be evenly divisible by Attr(num_or_sections). " "But received Attr(num_or_sections) " @@ -415,7 +416,7 @@ void SplitInferMeta(const MetaTensor& x, if (config.is_runtime) { PADDLE_ENFORCE_LE(num_of_unknow, 1, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Only one dimension value of Attr(num_or_sections) " "in SplitOp can be -1. " "But received Attr(num_or_sections) = [%s].", @@ -429,7 +430,7 @@ void SplitInferMeta(const MetaTensor& x, PADDLE_ENFORCE_LT( sum_of_section, input_axis_dim, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Sum of Attr(num_or_sections) other than unknown section " "must be less than the input's " "size " @@ -446,7 +447,7 @@ void SplitInferMeta(const MetaTensor& x, PADDLE_ENFORCE_EQ( sum_of_section, input_axis_dim, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Sum of Attr(num_or_sections) must be equal to the input's " "size " "along the split dimension. But received Attr(num_or_sections)" @@ -484,6 +485,25 @@ void SplitInferMeta(const MetaTensor& x, } } +void UnbindInferMeta(const MetaTensor& x, + int axis, + std::vector* outs) { + auto in_dims = x.dims(); + std::vector out_dim; + axis = axis < 0 ? 
in_dims.size() + axis : axis; + for (int i = 0; i < in_dims.size(); ++i) { + if (i != axis) out_dim.push_back(in_dims[i]); + } + auto out_dims = phi::make_ddim(out_dim); + + for (size_t i = 0; i < outs->size(); ++i) { + (*outs)[i].set_dtype(x.dtype()); + (*outs)[i].set_dims(out_dims); + (*outs)[i].set_layout(x.layout()); + (*outs)[i].share_lod(x); + } +} + void TraceInferMeta( const MetaTensor& x, int offset, int axis1, int axis2, MetaTensor* out) { int dim1 = axis1; @@ -537,7 +557,165 @@ void TraceInferMeta( out->set_dims(phi::make_ddim(sizes)); } +void UnfoldInferMeta(const MetaTensor& x, + const std::vector& kernel_sizes, + const std::vector& strides, + const std::vector& paddings, + const std::vector& dilations, + MetaTensor* out, + MetaConfig config) { + auto in_dims = x.dims(); + // Only [N, C, H, W] input supported now + PADDLE_ENFORCE_EQ( + in_dims.size(), + 4, + phi::errors::InvalidArgument( + "Input should be 4-D tensor of format [N, C, H, W], but get %u", + in_dims.size())); + PADDLE_ENFORCE_EQ( + in_dims.size() - kernel_sizes.size(), + 2U, + phi::errors::InvalidArgument( + "The dims of X should be larger than that of kernel_sizes " + "by a number of 2, due to the batch size and input channel dim. " + "But recieved dims(X:%u) - dims(kernel_sizes:%u) != 2", + in_dims.size(), + kernel_sizes.size())); + PADDLE_ENFORCE_EQ( + strides.size(), + kernel_sizes.size(), + phi::errors::InvalidArgument( + "The dims of strides should be the same with that of kernel_sizes. " + "But recieved dims(strides: %u) != dims(kernel_sizes: %u).", + strides.size(), + kernel_sizes.size())); + PADDLE_ENFORCE_EQ( + paddings.size(), + 2 * strides.size(), + phi::errors::InvalidArgument( + "The dims of paddings should be 2 times of that of strides. " + "But recieved dims(paddings: %u) != 2*dims(strides: %u).", + paddings.size(), + strides.size())); + PADDLE_ENFORCE_EQ( + strides.size(), + dilations.size(), + phi::errors::InvalidArgument( + "The dims of strides should be the same with that of dilations. 
" + "But recieved dims(strides: %u) != dims(dilations: %u).", + strides.size(), + dilations.size())); + + // check kernel_sizes + PADDLE_ENFORCE_GT(kernel_sizes[0], + 0, + phi::errors::InvalidArgument( + "The `kernel_sizes` should be greater than zero, " + "but recieved kernel_height: %d kernel_width: %d.", + kernel_sizes[0], + kernel_sizes[1])); + PADDLE_ENFORCE_GT(kernel_sizes[1], + 0, + phi::errors::InvalidArgument( + "The `kernel_sizes` should be greater than zero, " + "but recieved kernel_height: %d kernel_width: %d.", + kernel_sizes[0], + kernel_sizes[1])); + // check strides + PADDLE_ENFORCE_GT(strides[0], + 0, + phi::errors::InvalidArgument( + "The `strides` should be greater than zero, " + "but recieved strides_height: %d strides_width: %d.", + strides[0], + strides[1])); + PADDLE_ENFORCE_GT(strides[1], + 0, + phi::errors::InvalidArgument( + "The `strides` should be greater than zero, " + "but recieved strides_height: %d strides_width: %d.", + strides[0], + strides[1])); + // check dilations + PADDLE_ENFORCE_GT( + dilations[0], + 0, + phi::errors::InvalidArgument( + "The `dilations` should be greater than zero, " + "but recieved dilations_height: %d dilations_width: %d.", + dilations[0], + dilations[1])); + PADDLE_ENFORCE_GT( + dilations[1], + 0, + phi::errors::InvalidArgument( + "The `dilations` should be greater than zero, " + "but recieved dilations_height: %d dilations_width: %d.", + dilations[0], + dilations[1])); + + std::vector out_dims; + out_dims.push_back(in_dims[0]); + int output_channels = in_dims[1] * kernel_sizes[0] * kernel_sizes[1]; + out_dims.push_back(output_channels); + + int output_height = phi::funcs::CalcOutputSize(in_dims[2], + kernel_sizes[0], + dilations[0], + paddings[0], + paddings[2], + strides[0]); + int output_width = phi::funcs::CalcOutputSize(in_dims[3], + kernel_sizes[1], + dilations[1], + paddings[1], + paddings[3], + strides[1]); + if (config.is_runtime) { + // only check output height and width in runtime + PADDLE_ENFORCE_GT( + output_height, + 0, + phi::errors::InvalidArgument( + "The sliding blocks calculated from input spatial size " + "(%d, %d), kernel_sizes (%d, %d), strides (%d, %d), " + "dilations (%d, %d), is (%d, %d), which should be a " + "positive integer.", + in_dims[2], + in_dims[3], + kernel_sizes[0], + kernel_sizes[1], + strides[0], + strides[1], + dilations[0], + dilations[1], + output_height, + output_width)); + PADDLE_ENFORCE_GT( + output_width, + 0, + phi::errors::InvalidArgument( + "The sliding blocks calculated from input spatial size " + "(%d, %d), kernel_sizes (%d, %d), strides (%d, %d), " + "dilations (%d, %d), is (%d, %d), which should be a " + "positive integer.", + in_dims[2], + in_dims[3], + kernel_sizes[0], + kernel_sizes[1], + strides[0], + strides[1], + dilations[0], + dilations[1], + output_height, + output_width)); + } + int output_col_length = output_height * output_width; + out_dims.push_back(output_col_length); + out->set_dims(phi::make_ddim(out_dims)); +} + } // namespace phi -PT_REGISTER_INFER_META_FN(copy_to, phi::CopyToInferMeta); -PT_REGISTER_INFER_META_FN(split, phi::SplitInferMeta); +PD_REGISTER_INFER_META_FN(copy_to, phi::CopyToInferMeta); +PD_REGISTER_INFER_META_FN(split, phi::SplitInferMeta); diff --git a/paddle/phi/infermeta/unary.h b/paddle/phi/infermeta/unary.h index 2ab425d42cd33ec49adf704a54e85e6f1714e19c..21cbe76bb13c0e372668466e1ba0ed415c77f660 100644 --- a/paddle/phi/infermeta/unary.h +++ b/paddle/phi/infermeta/unary.h @@ -90,7 +90,18 @@ void SplitInferMeta(const MetaTensor& x_meta, 
std::vector* out, MetaConfig config = MetaConfig()); +void UnbindInferMeta(const MetaTensor& x, + int axis, + std::vector* outs); void TraceInferMeta( const MetaTensor& x, int offset, int axis1, int axis2, MetaTensor* out); +void UnfoldInferMeta(const MetaTensor& x, + const std::vector& kernel_sizes, + const std::vector& strides, + const std::vector& paddings, + const std::vector& dilations, + MetaTensor* out, + MetaConfig config = MetaConfig()); + } // namespace phi diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt index 4f78a6500f434c130558059554a29cd559527a11..ef085e71f5dcc295a417f0c6aa83fc7cdfc20a8d 100644 --- a/paddle/phi/kernels/CMakeLists.txt +++ b/paddle/phi/kernels/CMakeLists.txt @@ -10,7 +10,7 @@ add_subdirectory(funcs) set_property(GLOBAL PROPERTY PTEN_KERNELS "") set(COMMON_KERNEL_DEPS dense_tensor sparse_coo_tensor sparse_csr_tensor kernel_context kernel_factory arg_map_context convert_utils lod_utils) -set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} eigen_function blas math_function) +set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} eigen_function blas math_function im2col concat_and_split_functor) # remove this dep after removing fluid deps on tensor creation set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} pten_api_utils) set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} infermeta) diff --git a/paddle/phi/kernels/atan2_grad_kernel.h b/paddle/phi/kernels/atan2_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..ddd87c9da156d4b9ff983972010b90a74a231c4a --- /dev/null +++ b/paddle/phi/kernels/atan2_grad_kernel.h @@ -0,0 +1,29 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void Atan2GradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out_grad, + DenseTensor* x_grad, + DenseTensor* y_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/atan2_kernel.h b/paddle/phi/kernels/atan2_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..38276fa4f73ce5c0c94383a90e6f6f711efd9bcf --- /dev/null +++ b/paddle/phi/kernels/atan2_kernel.h @@ -0,0 +1,27 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void Atan2Kernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/kernels/bce_loss_grad_kernel.h b/paddle/phi/kernels/bce_loss_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..14bf52196ac40d81bb925c3fa10c021f173d5218 --- /dev/null +++ b/paddle/phi/kernels/bce_loss_grad_kernel.h @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void BCELossGradKernel(const Context& dev_ctx, + const DenseTensor& input, + const DenseTensor& label, + const DenseTensor& out_grad, + DenseTensor* input_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/bce_loss_kernel.h b/paddle/phi/kernels/bce_loss_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..6459ea911666e7151c2e2fc6645f4a477215f82b --- /dev/null +++ b/paddle/phi/kernels/bce_loss_kernel.h @@ -0,0 +1,27 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void BCELossKernel(const Context& dev_ctx, + const DenseTensor& input, + const DenseTensor& label, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/fluid/operators/huber_loss_op.cu b/paddle/phi/kernels/complex_grad_kernel.h similarity index 50% rename from paddle/fluid/operators/huber_loss_op.cu rename to paddle/phi/kernels/complex_grad_kernel.h index 4ce6856a7eade1b314d8aef1d039424ad42e07cf..505d4d374424141ad71da863d1fd7a6424fb35ef 100644 --- a/paddle/fluid/operators/huber_loss_op.cu +++ b/paddle/phi/kernels/complex_grad_kernel.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -11,14 +11,21 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/huber_loss_op.h" - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - huber_loss, - ops::HuberLossKernel, - ops::HuberLossKernel); -REGISTER_OP_CUDA_KERNEL( - huber_loss_grad, - ops::HuberLossGradKernel, - ops::HuberLossGradKernel); + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void RealGradKernel(const Context& dev_ctx, + const DenseTensor& dout, + DenseTensor* dx); + +template +void ImagGradKernel(const Context& dev_ctx, + const DenseTensor& dout, + DenseTensor* dx); + +} // namespace phi diff --git a/paddle/phi/kernels/complex_kernel.h b/paddle/phi/kernels/complex_kernel.h index cfe9da23880363ccddc84ec39beb9038170e76cb..44bfae9820aa84cb33784f108ace6aa0ab8b5281 100644 --- a/paddle/phi/kernels/complex_kernel.h +++ b/paddle/phi/kernels/complex_kernel.h @@ -50,4 +50,14 @@ DenseTensor Conj(const Context& dev_ctx, const DenseTensor& x) { return x; } +template +void RealKernel(const DeviceContext& dev_ctx, + const DenseTensor& x, + DenseTensor* out); + +template +void ImagKernel(const DeviceContext& dev_ctx, + const DenseTensor& x, + DenseTensor* out); + } // namespace phi diff --git a/paddle/phi/kernels/cpu/abs_grad_kernel.cc b/paddle/phi/kernels/cpu/abs_grad_kernel.cc index 3c90a348d86a4ccdc1f6a5c1cd53815e00e1fa79..ca42a5eb2976f62708544e3d3bdd31f63d2a004f 100644 --- a/paddle/phi/kernels/cpu/abs_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/abs_grad_kernel.cc @@ -19,7 +19,7 @@ using phi::dtype::complex; -PT_REGISTER_KERNEL(abs_grad, +PD_REGISTER_KERNEL(abs_grad, CPU, ALL_LAYOUT, phi::AbsGradKernel, @@ -29,7 +29,7 @@ PT_REGISTER_KERNEL(abs_grad, int64_t, complex, complex) {} -PT_REGISTER_KERNEL(abs_double_grad, +PD_REGISTER_KERNEL(abs_double_grad, CPU, ALL_LAYOUT, phi::AbsDoubleGradKernel, diff --git a/paddle/phi/kernels/cpu/abs_kernel.cc b/paddle/phi/kernels/cpu/abs_kernel.cc index 97bd89832870cc1d2a9031c266441bfa4c732ef2..71d818c45e6f3f28697d3496cc9ae8a0d209ce6e 100644 --- a/paddle/phi/kernels/cpu/abs_kernel.cc +++ b/paddle/phi/kernels/cpu/abs_kernel.cc @@ -36,7 +36,7 @@ void AbsKernel(const Context& ctx, const DenseTensor& x, DenseTensor* out) { } // namespace phi -PT_REGISTER_KERNEL(abs, +PD_REGISTER_KERNEL(abs, CPU, ALL_LAYOUT, phi::AbsKernel, diff --git a/paddle/phi/kernels/cpu/atan2_grad_kernel.cc b/paddle/phi/kernels/cpu/atan2_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..6ff7431f0c8c556770b54e1328251e5996850fc9 --- /dev/null +++ b/paddle/phi/kernels/cpu/atan2_grad_kernel.cc @@ -0,0 +1,27 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/atan2_grad_kernel.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/device_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/atan2_grad_kernel_impl.h" + +PD_REGISTER_KERNEL(atan2_grad, + CPU, + ALL_LAYOUT, + phi::Atan2GradKernel, + float, + double, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/cpu/atan2_kernel.cc b/paddle/phi/kernels/cpu/atan2_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..eb38a6c90b7938ef16cf9d56dfdb93903cc3c6a1 --- /dev/null +++ b/paddle/phi/kernels/cpu/atan2_kernel.cc @@ -0,0 +1,29 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/atan2_kernel.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/device_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/atan2_kernel_impl.h" + +PD_REGISTER_KERNEL(atan2, + CPU, + ALL_LAYOUT, + phi::Atan2Kernel, + float, + double, + phi::dtype::float16, + int, + int64_t) {} diff --git a/paddle/phi/kernels/cpu/bce_loss_grad_kernel.cc b/paddle/phi/kernels/cpu/bce_loss_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..6859451e8be32d6d70003d6ce790810d1cc815aa --- /dev/null +++ b/paddle/phi/kernels/cpu/bce_loss_grad_kernel.cc @@ -0,0 +1,47 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/bce_loss_grad_kernel.h" + +#include // for max +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void BCELossGradKernel(const Context& dev_ctx, + const DenseTensor& input, + const DenseTensor& label, + const DenseTensor& out_grad, + DenseTensor* input_grad) { + auto dx_data = dev_ctx.template Alloc(input_grad); + auto dout_data = out_grad.data(); + auto x_data = input.data(); + auto label_data = label.data(); + + int x_numel = input.numel(); + + // dx = dout * ((x - label)/(x - x^2)) + for (int i = 0; i < x_numel; ++i) { + dx_data[i] = + dout_data[i] * ((x_data[i] - label_data[i]) / + std::max((static_cast(1) - x_data[i]) * x_data[i], + static_cast(1e-12))); + } +} +} // namespace phi + +PD_REGISTER_KERNEL( + bce_loss_grad, CPU, ALL_LAYOUT, phi::BCELossGradKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/bce_loss_kernel.cc b/paddle/phi/kernels/cpu/bce_loss_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..76b979365148468c883962f07db1b923e7ef25b8 --- /dev/null +++ b/paddle/phi/kernels/cpu/bce_loss_kernel.cc @@ -0,0 +1,59 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/bce_loss_kernel.h" + +#include // for max +#include "paddle/fluid/operators/math.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void BCELossKernel(const Context& dev_ctx, + const DenseTensor& input, + const DenseTensor& label, + DenseTensor* out) { + auto x_data = input.data(); + auto label_data = label.data(); + auto out_data = dev_ctx.template Alloc(out); + auto x_numel = input.numel(); + + // out = -(label * ln(x) + (1 - label) * ln(1 - x)) = (label - 1) * ln(1 - + // x) - label * ln(x) + for (int64_t i = 0; i < x_numel; ++i) { + PADDLE_ENFORCE_GE( + x_data[i], + static_cast(0), + phi::errors::InvalidArgument( + "Illegal input, input must be greater than or equal to 0")); + PADDLE_ENFORCE_LE( + x_data[i], + static_cast(1), + phi::errors::InvalidArgument( + "Illegal input, input must be less than or equal to 1")); + out_data[i] = + (label_data[i] - static_cast(1)) * + std::max(paddle::operators::real_log(static_cast(1) - x_data[i]), + (T)(-100)) - + label_data[i] * + std::max(paddle::operators::real_log(x_data[i]), (T)(-100)); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + bce_loss, CPU, ALL_LAYOUT, phi::BCELossKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/bernoulli_kernel.cc b/paddle/phi/kernels/cpu/bernoulli_kernel.cc index 4ba965a4e5f1d2beb6a114b64ca5fa211804bbcb..09c07d9ec9dea028bd3b1921056b78bc97c07ec2 100644 --- a/paddle/phi/kernels/cpu/bernoulli_kernel.cc +++ b/paddle/phi/kernels/cpu/bernoulli_kernel.cc @@ -51,5 +51,5 @@ void BernoulliKernel(const Context& ctx, } // namespace phi -PT_REGISTER_KERNEL( +PD_REGISTER_KERNEL( bernoulli, CPU, ALL_LAYOUT, phi::BernoulliKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/cast_kernel.cc b/paddle/phi/kernels/cpu/cast_kernel.cc index 4e95a37270dd43a4f3f45eb3a26b1c0500e0aaf2..c2c207bfaf25e5bea9faed36c85a5755884e5669 100644 --- a/paddle/phi/kernels/cpu/cast_kernel.cc +++ b/paddle/phi/kernels/cpu/cast_kernel.cc @@ -58,7 +58,7 @@ void CastKernel(const Context& dev_ctx, } // namespace phi -PT_REGISTER_KERNEL(cast, +PD_REGISTER_KERNEL(cast, CPU, ALL_LAYOUT, phi::CastKernel, diff --git a/paddle/phi/kernels/cpu/complex_grad_kernel.cc b/paddle/phi/kernels/cpu/complex_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..5c1d50f5bf27d2cf7b3e0078f1bcab13d1b898a8 --- /dev/null +++ b/paddle/phi/kernels/cpu/complex_grad_kernel.cc @@ -0,0 +1,33 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/complex_grad_kernel.h" +#include "paddle/phi/kernels/impl/complex_grad_kernel_impl.h" + +#include "paddle/phi/common/complex.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL(real_grad, + CPU, + ALL_LAYOUT, + phi::RealGradKernel, + phi::dtype::complex, + phi::dtype::complex) {} + +PD_REGISTER_KERNEL(imag_grad, + CPU, + ALL_LAYOUT, + phi::ImagGradKernel, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/cpu/complex_kernel.cc b/paddle/phi/kernels/cpu/complex_kernel.cc index 3a886c3378524c62c53aae9951de4db17aad9acc..801502e16737d1ef5ffa475916d5e144d2e8d86b 100644 --- a/paddle/phi/kernels/cpu/complex_kernel.cc +++ b/paddle/phi/kernels/cpu/complex_kernel.cc @@ -21,7 +21,7 @@ // See Note [ Why still include the fluid headers? ] #include "paddle/phi/common/complex.h" -PT_REGISTER_KERNEL(conj, +PD_REGISTER_KERNEL(conj, CPU, ALL_LAYOUT, phi::ConjKernel, @@ -31,3 +31,17 @@ PT_REGISTER_KERNEL(conj, double, int, int64_t) {} + +PD_REGISTER_KERNEL(real, + CPU, + ALL_LAYOUT, + phi::RealKernel, + phi::dtype::complex, + phi::dtype::complex) {} + +PD_REGISTER_KERNEL(imag, + CPU, + ALL_LAYOUT, + phi::ImagKernel, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/cpu/concat_and_split.h b/paddle/phi/kernels/cpu/concat_and_split.h deleted file mode 100644 index 88cfc5db8f2e852ee26f2300afb5a93cf06274c1..0000000000000000000000000000000000000000 --- a/paddle/phi/kernels/cpu/concat_and_split.h +++ /dev/null @@ -1,138 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include "paddle/phi/backends/cpu/cpu_context.h" -#include "paddle/phi/core/dense_tensor.h" - -namespace phi { - -/* - * \brief Concatenate the input tensors along the dimension axis. - * TODO(zcd): maybe it needs to be more detailed. 
- * Examples: - * Input[0] = [[1,2],[3,4]] - * Input[1] = [[5,6]] - * axis = 0 - * - * Output = [[1,2], - * [3,4], - * [5,6]] - */ - -template -void ConcatImpl(const Context& context, - const std::vector& input, - int axis, - DenseTensor* output) { - // TODO(zcd): Add input data validity checking - size_t num = input.size(); - - int64_t rows = 1; - auto dim_0 = input[0].dims(); - for (int i = 0; i < axis; ++i) { - rows *= dim_0[i]; - } - int64_t out_rows = rows, out_cols = 0; - - std::vector input_cols(input.size()); - for (size_t i = 0; i < num; ++i) { - int64_t t_cols = input[i].numel() / rows; - out_cols += t_cols; - input_cols[i] = t_cols; - } - auto cpu_place = context.GetPlace(); - - // computation - auto output_data = output->data(); - int64_t col_idx = 0; - for (size_t j = 0; j < num; ++j) { - int64_t col_len = input_cols[j]; - auto input_data = input[j].data(); - for (int64_t k = 0; k < out_rows; ++k) { - paddle::memory::Copy(cpu_place, - output_data + k * out_cols + col_idx, - cpu_place, - input_data + k * col_len, - sizeof(T) * col_len); - } - col_idx += col_len; - } -} - -/* - * \brief Split the input tensors along the dimension axis into outputs. - * TODO(zcd): maybe it needs to be more detailed. - * Examples: - * Input = [[1,2], - * [3,4], - * [5,6]] - * axis = 0 - * - * Output[0] = [[1,2],[3,4]] - * Output[1] = [[5,6]] - */ -template -void SplitImpl(const Context& context, - const DenseTensor& input, - const std::vector& ref_inputs, - const int axis, - std::vector* outputs) { - // NOTE(zhiqiu): split a tensor of shape [0,3,4] at axis=1, result in 3 - // tensors of shape [0,1,4] - if (input.numel() == 0) { - return; - } - - // TODO(zcd): Add input data validity checking - size_t num = outputs->size(); - - int input_rows = 1; - auto dim_0 = ref_inputs[0]->dims(); - for (int i = 0; i < axis; ++i) { - input_rows *= dim_0[i]; - } - - int input_cols = 0; - - std::vector output_cols(outputs->size()); - for (size_t i = 0; i < num; ++i) { - int t_cols = ref_inputs[i]->numel() / input_rows; - input_cols += t_cols; - output_cols[i] = t_cols; - } - auto cpu_place = context.GetPlace(); - - // computation - for (int k = 0; k < input_rows; ++k) { - const T* src_ptr = input.data() + k * input_cols; - int col_idx = 0; - for (size_t j = 0; j < num; ++j) { - int col_len = output_cols[j]; - auto* out_tensor = outputs->at(j); - if (out_tensor != nullptr) { - T* dst_ptr = out_tensor->data() + k * col_len; - paddle::memory::Copy(cpu_place, - dst_ptr, - cpu_place, - src_ptr + col_idx, - sizeof(T) * col_len); - } - col_idx += col_len; - } - } -} - -} // namespace phi diff --git a/paddle/phi/kernels/cpu/concat_kernel.cc b/paddle/phi/kernels/cpu/concat_kernel.cc index 7f4cce379e04d4744f2544788feec28ba0a915e2..18bb8837b105d91e3e13a0a7519b08c9c47202c4 100644 --- a/paddle/phi/kernels/cpu/concat_kernel.cc +++ b/paddle/phi/kernels/cpu/concat_kernel.cc @@ -22,7 +22,7 @@ #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/lod_utils.h" -#include "paddle/phi/kernels/cpu/concat_and_split.h" +#include "paddle/phi/kernels/funcs/concat_and_split_functor.h" #include "paddle/phi/kernels/funcs/concat_funcs.h" namespace phi { @@ -54,7 +54,7 @@ void ConcatKernel(const Context& dev_ctx, PADDLE_ENFORCE_EQ( x[i].lod().size(), lod_size_0, - paddle::platform::errors::Unimplemented( + phi::errors::Unimplemented( "The lod level of all input LoDTensors should be same. " "Maybe different lod level of input LoDTensors can concat," "it is not supported currently. 
The lod level of %dth input " @@ -104,13 +104,14 @@ void ConcatKernel(const Context& dev_ctx, continue; } } - ConcatImpl(dev_ctx, inputs, axis, out); + phi::funcs::ConcatFunctor functor; + functor(dev_ctx, inputs, axis, out); } } } // namespace phi -PT_REGISTER_KERNEL(concat, +PD_REGISTER_KERNEL(concat, CPU, ALL_LAYOUT, phi::ConcatKernel, diff --git a/paddle/phi/kernels/cpu/copy_kernel.cc b/paddle/phi/kernels/cpu/copy_kernel.cc index 8a79a5f6b1941e1fcd24d5a1f05d1094628ca28d..7dcd75d39e4df5b7bc634c4e16f7843bf5044c94 100644 --- a/paddle/phi/kernels/cpu/copy_kernel.cc +++ b/paddle/phi/kernels/cpu/copy_kernel.cc @@ -56,5 +56,5 @@ void Copy(const Context& dev_ctx, } // namespace phi -PT_REGISTER_GENERAL_KERNEL( +PD_REGISTER_GENERAL_KERNEL( copy, CPU, ALL_LAYOUT, phi::Copy, ALL_DTYPE) {} diff --git a/paddle/phi/kernels/cpu/cross_grad_kernel.cc b/paddle/phi/kernels/cpu/cross_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..390420008e6ea107573bbc2038c3a82af19b06e6 --- /dev/null +++ b/paddle/phi/kernels/cpu/cross_grad_kernel.cc @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/cross_grad_kernel.h" +#include "paddle/phi/kernels/impl/cross_grad_kernel_impl.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL(cross_grad, + CPU, + ALL_LAYOUT, + phi::CrossGradKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/cpu/cross_kernel.cc b/paddle/phi/kernels/cpu/cross_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..a63f33174eacda551e595affc34343030468f2c5 --- /dev/null +++ b/paddle/phi/kernels/cpu/cross_kernel.cc @@ -0,0 +1,22 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
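The cross_grad kernel registered here only pulls in the shared impl/cross_grad_kernel_impl.h header, so the arithmetic itself is not visible in this patch. As a reminder of the underlying operation, a standalone sketch of the 3-component cross product that the cross op applies along its chosen axis (illustrative only, plain scalars rather than DenseTensors):

```cpp
// Illustrative sketch of the per-slice 3-vector cross product used by the
// cross op; not the phi implementation.
#include <array>
#include <cstdio>

std::array<double, 3> cross(const std::array<double, 3>& a,
                            const std::array<double, 3>& b) {
  return {a[1] * b[2] - a[2] * b[1],
          a[2] * b[0] - a[0] * b[2],
          a[0] * b[1] - a[1] * b[0]};
}

int main() {
  std::array<double, 3> a = {1, 0, 0}, b = {0, 1, 0};
  auto c = cross(a, b);
  std::printf("%g %g %g\n", c[0], c[1], c[2]);  // 0 0 1
  return 0;
}
```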
+ +#include "paddle/phi/kernels/cross_kernel.h" +#include "paddle/phi/kernels/impl/cross_kernel_impl.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL( + cross, CPU, ALL_LAYOUT, phi::CrossKernel, float, double, int, int64_t) {} diff --git a/paddle/phi/kernels/cpu/diagonal_grad_kernel.cc b/paddle/phi/kernels/cpu/diagonal_grad_kernel.cc index 351b2335386a8b60c725c43d80bff8fc5872eb16..c3c290b4fe91ec1ecee6f0026ed5af39288e2618 100644 --- a/paddle/phi/kernels/cpu/diagonal_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/diagonal_grad_kernel.cc @@ -82,7 +82,7 @@ void DiagonalGradKernel(const Context& dev_ctx, } } } // namespace phi -PT_REGISTER_KERNEL(diagonal_grad, +PD_REGISTER_KERNEL(diagonal_grad, CPU, ALL_LAYOUT, phi::DiagonalGradKernel, diff --git a/paddle/phi/kernels/cpu/diagonal_kernel.cc b/paddle/phi/kernels/cpu/diagonal_kernel.cc index 79f09008f3e2e48cce5ec4f431b6541450c3d710..df17b458e1166b49815d405a4e7d97c5384ab4f0 100644 --- a/paddle/phi/kernels/cpu/diagonal_kernel.cc +++ b/paddle/phi/kernels/cpu/diagonal_kernel.cc @@ -79,7 +79,7 @@ void DiagonalKernel(const Context& dev_ctx, } } } // namespace phi -PT_REGISTER_KERNEL(diagonal, +PD_REGISTER_KERNEL(diagonal, CPU, ALL_LAYOUT, phi::DiagonalKernel, diff --git a/paddle/phi/kernels/cpu/digamma_grad_kernel.cc b/paddle/phi/kernels/cpu/digamma_grad_kernel.cc index 5cb86eef498bd325c8beda7c08f5e76b57f417b0..da1b5ae556609c05a91623cf9cac408e190868b9 100644 --- a/paddle/phi/kernels/cpu/digamma_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/digamma_grad_kernel.cc @@ -19,5 +19,5 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/digamma_grad_kernel_impl.h" -PT_REGISTER_KERNEL( +PD_REGISTER_KERNEL( digamma_grad, CPU, ALL_LAYOUT, phi::DigammaGradKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/digamma_kernel.cc b/paddle/phi/kernels/cpu/digamma_kernel.cc index 0013d8ee7740b8a396ebf127698b6be0b53067d0..ee120a29b6061efcadfb88ecce8ba3235d865ca1 100644 --- a/paddle/phi/kernels/cpu/digamma_kernel.cc +++ b/paddle/phi/kernels/cpu/digamma_kernel.cc @@ -19,5 +19,5 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/digamma_kernel_impl.h" -PT_REGISTER_KERNEL( +PD_REGISTER_KERNEL( digamma, CPU, ALL_LAYOUT, phi::DigammaKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/dot_grad_kernel.cc b/paddle/phi/kernels/cpu/dot_grad_kernel.cc index 729bc9aa3a3acad547269613cbfb66e75ff20ead..a2abdb7c00900ecd103562430d1f965cbaf92d4e 100644 --- a/paddle/phi/kernels/cpu/dot_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/dot_grad_kernel.cc @@ -20,7 +20,7 @@ #include "paddle/phi/common/complex.h" -PT_REGISTER_KERNEL(dot_grad, +PD_REGISTER_KERNEL(dot_grad, CPU, ALL_LAYOUT, phi::DotGradKernel, diff --git a/paddle/phi/kernels/cpu/dot_kernel.cc b/paddle/phi/kernels/cpu/dot_kernel.cc index f4f5d1ffeb544dfa006444ce746e076c1d6258ae..3518501a6b63d160d32ecefc57236d4e2aa7b1fa 100644 --- a/paddle/phi/kernels/cpu/dot_kernel.cc +++ b/paddle/phi/kernels/cpu/dot_kernel.cc @@ -49,7 +49,7 @@ void DotKernel(const Context& dev_ctx, using complex64 = ::phi::dtype::complex; using complex128 = ::phi::dtype::complex; -PT_REGISTER_KERNEL(dot, +PD_REGISTER_KERNEL(dot, CPU, ALL_LAYOUT, phi::DotKernel, diff --git a/paddle/phi/kernels/cpu/elementwise.h b/paddle/phi/kernels/cpu/elementwise.h index c692038d24a0a885d21b9c632709b143681a438d..28bf5ab743f6d5d0608fe65c00d5a0de2af3415b 100644 --- a/paddle/phi/kernels/cpu/elementwise.h +++ 
b/paddle/phi/kernels/cpu/elementwise.h @@ -127,7 +127,7 @@ struct SameDimsDivideFunctor< const DenseTensor& x, const DenseTensor& y, DenseTensor* z) { - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "If use SameDimsDivideFunctor, template args(T) must be floating " "point. "); } @@ -278,12 +278,10 @@ void CommonForwardBroadcastCPU(const DenseTensor& x, std::vector index_array(max_dim, 0); const T* x_data = x.data(); const T* y_data = y.data(); - PADDLE_ENFORCE_NOT_NULL(x_data, - paddle::platform::errors::InvalidArgument( - "The input X should not be empty.")); - PADDLE_ENFORCE_NOT_NULL(y_data, - paddle::platform::errors::InvalidArgument( - "The input Y should not be empty.")); + PADDLE_ENFORCE_NOT_NULL( + x_data, phi::errors::InvalidArgument("The input X should not be empty.")); + PADDLE_ENFORCE_NOT_NULL( + y_data, phi::errors::InvalidArgument("The input Y should not be empty.")); OutType* out_data = ctx.Alloc(z); const int out_size = std::accumulate( @@ -317,12 +315,12 @@ void CommonElementwiseBroadcastForward(const CPUContext& dev_ctx, PADDLE_ENFORCE_GE( axis, 0, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Axis should be great than or equal to 0, but received axis is %d.", axis)); PADDLE_ENFORCE_LT(axis, max_dim, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Axis should be less than %d, but received axis is %d.", max_dim, axis)); @@ -385,12 +383,12 @@ void ElementwiseCompute(const CPUContext& dev_ctx, PADDLE_ENFORCE_GE( axis, 0, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Axis should be great than or equal to 0, but received axis is %d.", axis)); PADDLE_ENFORCE_LT(axis, max_dim, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Axis should be less than %d, but received axis is %d.", max_dim, axis)); @@ -630,12 +628,12 @@ void ElemwiseGradComputeWithBroadcast(const CPUContext& ctx, PADDLE_ENFORCE_GE( axis, 0, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Axis should be great than or equal to 0, but received axis is %d.", axis)); PADDLE_ENFORCE_LT(axis, max_dim, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Axis should be less than %d, but received axis is %d.", max_dim, axis)); diff --git a/paddle/phi/kernels/cpu/elementwise_grad_kernel.cc b/paddle/phi/kernels/cpu/elementwise_grad_kernel.cc index 2d1b2a3bd7c3fa4d40d6544a704ef984d7fac1fc..0b29091367c83acee19e703f450d16602f322f3c 100644 --- a/paddle/phi/kernels/cpu/elementwise_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/elementwise_grad_kernel.cc @@ -125,7 +125,7 @@ void SubtractDoubleGradKernel(const Context& dev_ctx, } // namespace phi -PT_REGISTER_KERNEL(add_grad, +PD_REGISTER_KERNEL(add_grad, CPU, ALL_LAYOUT, phi::AddGradKernel, @@ -137,7 +137,7 @@ PT_REGISTER_KERNEL(add_grad, phi::dtype::complex, phi::dtype::complex) {} -PT_REGISTER_KERNEL(add_double_grad, +PD_REGISTER_KERNEL(add_double_grad, CPU, ALL_LAYOUT, phi::AddDoubleGradKernel, @@ -149,7 +149,7 @@ PT_REGISTER_KERNEL(add_double_grad, phi::dtype::complex, phi::dtype::complex) {} -PT_REGISTER_KERNEL(add_triple_grad, +PD_REGISTER_KERNEL(add_triple_grad, CPU, ALL_LAYOUT, phi::AddTripleGradKernel, @@ -161,7 +161,7 @@ PT_REGISTER_KERNEL(add_triple_grad, phi::dtype::complex, phi::dtype::complex) {} -PT_REGISTER_KERNEL(subtract_grad, +PD_REGISTER_KERNEL(subtract_grad, CPU, ALL_LAYOUT, phi::SubtractGradKernel, @@ -173,7 +173,7 @@ PT_REGISTER_KERNEL(subtract_grad, 
phi::dtype::complex, phi::dtype::complex) {} -PT_REGISTER_KERNEL(subtract_double_grad, +PD_REGISTER_KERNEL(subtract_double_grad, CPU, ALL_LAYOUT, phi::SubtractDoubleGradKernel, diff --git a/paddle/phi/kernels/cpu/expand_grad_kernel.cc b/paddle/phi/kernels/cpu/expand_grad_kernel.cc index 427b6441b2d24c8ea1862cb7ae0168a3009c54dc..4799a6aa7afdf85a759d5940edea05e885b965e3 100644 --- a/paddle/phi/kernels/cpu/expand_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/expand_grad_kernel.cc @@ -19,7 +19,7 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/expand_grad_kernel_impl.h" -PT_REGISTER_KERNEL(expand_grad, +PD_REGISTER_KERNEL(expand_grad, CPU, ALL_LAYOUT, phi::ExpandGradKernel, diff --git a/paddle/phi/kernels/cpu/expand_kernel.cc b/paddle/phi/kernels/cpu/expand_kernel.cc index cce367c8eb832469a223c4c54d462b6f7c9b4237..077048976729fddefe8162f8eebb4961843dd2e0 100644 --- a/paddle/phi/kernels/cpu/expand_kernel.cc +++ b/paddle/phi/kernels/cpu/expand_kernel.cc @@ -19,7 +19,7 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/expand_kernel_impl.h" -PT_REGISTER_KERNEL(expand, +PD_REGISTER_KERNEL(expand, CPU, ALL_LAYOUT, phi::ExpandKernel, diff --git a/paddle/phi/kernels/cpu/flip_kernel.cc b/paddle/phi/kernels/cpu/flip_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..fa1625d65bdc9b2604ba405744fe3def7a2e7282 --- /dev/null +++ b/paddle/phi/kernels/cpu/flip_kernel.cc @@ -0,0 +1,77 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/flip_kernel.h" + +#include + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +constexpr size_t dim_bitset_size = 64; + +template +void FlipKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& axis, + DenseTensor* out) { + auto x_dims = x.dims(); + const int total_dims = x_dims.size(); + std::bitset dim_bitset; + for (size_t i = 0; i < axis.size(); ++i) { + int dim = axis[i]; + if (axis[i] < 0) { + dim += total_dims; + } + dim_bitset[dim] = true; + } + auto x_strides = phi::stride(x_dims); + auto numel = x.numel(); + const T* x_data = x.data(); + T* out_data = dev_ctx.template Alloc(out); +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for +#endif + for (int64_t i = 0; i < numel; ++i) { + int64_t cur_indices = i; + int64_t rem = 0; + int64_t dst_offset = 0; + + for (int d = 0; d < total_dims; ++d) { + int64_t temp = cur_indices; + cur_indices = cur_indices / x_strides[d]; + rem = temp - cur_indices * x_strides[d]; + dst_offset += dim_bitset[d] ? 
(x_dims[d] - 1 - cur_indices) * x_strides[d] + : cur_indices * x_strides[d]; + cur_indices = rem; + } + out_data[i] = x_data[dst_offset]; + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(flip, + CPU, + ALL_LAYOUT, + phi::FlipKernel, + float, + double, + int32_t, + int64_t, + bool, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/cpu/full_kernel.cc b/paddle/phi/kernels/cpu/full_kernel.cc index b55eb109f7de32ced5c8a316edd6aa2811b7e77d..6b0183d31c6ec3dc3e6712043f27678c3f3a6bb2 100644 --- a/paddle/phi/kernels/cpu/full_kernel.cc +++ b/paddle/phi/kernels/cpu/full_kernel.cc @@ -73,7 +73,7 @@ void FullLikeKernel(const Context& dev_ctx, } // namespace phi -PT_REGISTER_KERNEL(full, +PD_REGISTER_KERNEL(full, CPU, ALL_LAYOUT, phi::FullKernel, @@ -89,7 +89,7 @@ PT_REGISTER_KERNEL(full, phi::dtype::complex, phi::dtype::complex) {} -PT_REGISTER_KERNEL(full_like, +PD_REGISTER_KERNEL(full_like, CPU, ALL_LAYOUT, phi::FullLikeKernel, @@ -99,4 +99,6 @@ PT_REGISTER_KERNEL(full_like, int, int64_t, bool, - phi::dtype::float16) {} + phi::dtype::float16) { + kernel->InputAt(0).SetBackend(phi::Backend::ALL_BACKEND); +} diff --git a/paddle/phi/kernels/cpu/histogram_kernel.cc b/paddle/phi/kernels/cpu/histogram_kernel.cc index fbcf47c3070e68470a2eecf3b4c6eaa6c37926d2..82b88f868d8a70cd61073b65bb24fd195baeb5c2 100644 --- a/paddle/phi/kernels/cpu/histogram_kernel.cc +++ b/paddle/phi/kernels/cpu/histogram_kernel.cc @@ -77,7 +77,7 @@ void HistogramKernel(const Context& dev_ctx, } // namespace phi -PT_REGISTER_KERNEL(histogram, +PD_REGISTER_KERNEL(histogram, CPU, ALL_LAYOUT, phi::HistogramKernel, diff --git a/paddle/phi/kernels/cpu/huber_loss_grad_kernel.cc b/paddle/phi/kernels/cpu/huber_loss_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..654f2c9400af00484e6921aae63aeb0d93b521ae --- /dev/null +++ b/paddle/phi/kernels/cpu/huber_loss_grad_kernel.cc @@ -0,0 +1,22 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/huber_loss_grad_kernel.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/huber_loss_grad_kernel_impl.h" + +PD_REGISTER_KERNEL( + huber_loss_grad, CPU, ALL_LAYOUT, phi::HuberLossGradKernel, float, double) { +} diff --git a/paddle/phi/kernels/cpu/huber_loss_kernel.cc b/paddle/phi/kernels/cpu/huber_loss_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..702c0589057af7079e6e0a41f1058063922790fe --- /dev/null +++ b/paddle/phi/kernels/cpu/huber_loss_kernel.cc @@ -0,0 +1,21 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
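Stepping back to the FlipKernel added a few hunks above: its inner loop converts a linear index into per-dimension coordinates using the tensor strides, mirrors the coordinates of flipped axes, and rebuilds a source offset. A standalone sketch of that same stride walk on a small row-major 2x3 array flipped along axis 1 (values and shape are made up for the example):

```cpp
// Same destination-offset computation as FlipKernel, applied to a 2x3
// row-major array flipped along axis 1.
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  const std::vector<int64_t> dims = {2, 3};
  const std::vector<int64_t> strides = {3, 1};   // row-major strides of [2, 3]
  const std::vector<bool> flip = {false, true};  // flip axis 1 only
  const int src[6] = {1, 2, 3, 4, 5, 6};
  int dst[6];

  for (int64_t i = 0; i < 6; ++i) {
    int64_t cur = i, dst_offset = 0;
    for (size_t d = 0; d < dims.size(); ++d) {
      int64_t idx = cur / strides[d];
      cur -= idx * strides[d];
      dst_offset += (flip[d] ? dims[d] - 1 - idx : idx) * strides[d];
    }
    dst[i] = src[dst_offset];
  }
  for (int v : dst) std::printf("%d ", v);  // 3 2 1 6 5 4
  return 0;
}
```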
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/huber_loss_kernel.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/huber_loss_kernel_impl.h" + +PD_REGISTER_KERNEL( + huber_loss, CPU, ALL_LAYOUT, phi::HuberLossKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/label_smooth_grad_kernel.cc b/paddle/phi/kernels/cpu/label_smooth_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..74664fb270b2d27a56e7eb6634b50f167b2764ba --- /dev/null +++ b/paddle/phi/kernels/cpu/label_smooth_grad_kernel.cc @@ -0,0 +1,45 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/label_smooth_grad_kernel.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" + +namespace phi { + +template +void LabelSmoothGradKernel(const Context& ctx, + const DenseTensor& out_grad, + float epsilon, + DenseTensor* label_grad) { + ctx.template Alloc(label_grad); + auto d_out_dim = out_grad.dims()[out_grad.dims().size() - 1]; + if (d_out_dim != 0) { + auto d_out = EigenVector::Flatten(out_grad); + auto d_in = EigenVector::Flatten(*label_grad); + + auto& dev = *ctx.eigen_device(); + d_in.device(dev) = static_cast(1 - epsilon) * d_out; + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(label_smooth_grad, + CPU, + ALL_LAYOUT, + phi::LabelSmoothGradKernel, + float, + double) {} diff --git a/paddle/phi/kernels/cpu/label_smooth_kernel.cc b/paddle/phi/kernels/cpu/label_smooth_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..c76fb826cdfcce4a37c1d97de0ef37217249a727 --- /dev/null +++ b/paddle/phi/kernels/cpu/label_smooth_kernel.cc @@ -0,0 +1,50 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
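The label smoothing pair being added here uses out = (1 - epsilon) * label + epsilon / num_classes when no prior distribution is supplied, and the gradient kernel above correspondingly scales the incoming gradient by (1 - epsilon). A small numeric sketch of the forward formula, independent of the phi/Eigen code (the one-hot label and epsilon are example values):

```cpp
// Illustrative sketch of uniform label smoothing; not the phi implementation.
#include <cstdio>
#include <vector>

int main() {
  const float epsilon = 0.1f;
  std::vector<float> label = {0.f, 0.f, 1.f, 0.f};  // one-hot, 4 classes
  for (float& v : label) {
    v = (1.f - epsilon) * v + epsilon / label.size();
  }
  for (float v : label) std::printf("%.4f ", v);  // 0.0250 0.0250 0.9250 0.0250
  return 0;
}
```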
+ +#include "paddle/phi/kernels/label_smooth_kernel.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" + +namespace phi { + +template +void LabelSmoothKernel(const Context& ctx, + const DenseTensor& label, + paddle::optional prior_dist, + float epsilon, + DenseTensor* out) { + auto label_dim = label.dims()[label.dims().size() - 1]; + ctx.template Alloc(out); + auto& dev = *ctx.eigen_device(); + if (label_dim != 0) { + auto eigen_out = EigenVector::Flatten(*out); + auto eigen_in = EigenVector::Flatten(label); + if (prior_dist.is_initialized()) { + auto dist = EigenVector::Flatten(*prior_dist.get_ptr()); + eigen_out.device(dev) = + static_cast(1 - epsilon) * eigen_in + + static_cast(epsilon) * + dist.broadcast(Eigen::DSizes(label.numel() / label_dim)); + } else { + eigen_out.device(dev) = static_cast(1 - epsilon) * eigen_in + + static_cast(epsilon / label_dim); + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + label_smooth, CPU, ALL_LAYOUT, phi::LabelSmoothKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/lerp_grad_kernel.cc b/paddle/phi/kernels/cpu/lerp_grad_kernel.cc index 7cfb42dbcf96faef7a2b4a4d9f95b8d3a1cb28d6..d74919011ec5da08b700b974393fcc70de22b21c 100644 --- a/paddle/phi/kernels/cpu/lerp_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/lerp_grad_kernel.cc @@ -17,5 +17,5 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/lerp_grad_kernel_impl.h" -PT_REGISTER_KERNEL( +PD_REGISTER_KERNEL( lerp_grad, CPU, ALL_LAYOUT, phi::LerpGradKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/lerp_kernel.cc b/paddle/phi/kernels/cpu/lerp_kernel.cc index 97083c96464c305c1ccdb0ff674ce5aac372a335..7adfc35bfa321e8c111a11998e3b0b683009e619 100644 --- a/paddle/phi/kernels/cpu/lerp_kernel.cc +++ b/paddle/phi/kernels/cpu/lerp_kernel.cc @@ -17,4 +17,4 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/lerp_kernel_impl.h" -PT_REGISTER_KERNEL(lerp, CPU, ALL_LAYOUT, phi::LerpKernel, float, double) {} +PD_REGISTER_KERNEL(lerp, CPU, ALL_LAYOUT, phi::LerpKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/masked_select_grad_kernel.cc b/paddle/phi/kernels/cpu/masked_select_grad_kernel.cc index 071bbba1975e40abe65cce3b50972cb282e45c95..7fe41e686af8c54d1d105ffe5ff43c5e9c7a92e8 100644 --- a/paddle/phi/kernels/cpu/masked_select_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/masked_select_grad_kernel.cc @@ -43,7 +43,7 @@ void MaskedSelectGradKernel(const Context& dev_ctx, } // namespace phi -PT_REGISTER_KERNEL(masked_select_grad, +PD_REGISTER_KERNEL(masked_select_grad, CPU, ALL_LAYOUT, phi::MaskedSelectGradKernel, diff --git a/paddle/phi/kernels/cpu/masked_select_kernel.cc b/paddle/phi/kernels/cpu/masked_select_kernel.cc index 08fc3f69f01e17c7e18b0f1307781d9d5290e801..f377658d507f6086101e1cdb0f0ab1891536e771 100644 --- a/paddle/phi/kernels/cpu/masked_select_kernel.cc +++ b/paddle/phi/kernels/cpu/masked_select_kernel.cc @@ -48,7 +48,7 @@ void MaskedSelectKernel(const Context& dev_ctx, DDim out_dim{out_size}; out->Resize(out_dim); - auto out_data = out->mutable_data(paddle::platform::CPUPlace()); + auto out_data = out->mutable_data(phi::CPUPlace()); int index = 0; for (int i = 0; i < mask_size; i++) { @@ -61,7 +61,7 @@ void MaskedSelectKernel(const Context& dev_ctx, } // namespace phi -PT_REGISTER_KERNEL(masked_select, +PD_REGISTER_KERNEL(masked_select, CPU, ALL_LAYOUT, phi::MaskedSelectKernel, diff --git 
a/paddle/phi/kernels/cpu/math_kernel.cc b/paddle/phi/kernels/cpu/math_kernel.cc index 862ee42296c9244a37a018023d5f3d215b8204e0..581c5f90f35e5cadb239291d143ce54d499c017e 100644 --- a/paddle/phi/kernels/cpu/math_kernel.cc +++ b/paddle/phi/kernels/cpu/math_kernel.cc @@ -118,7 +118,7 @@ using complex128 = ::phi::dtype::complex; // NOTE(chenweihang): using bfloat16 will cause redefine with xpu bfloat16 // using bfloat16 = ::phi::dtype::bfloat16; -PT_REGISTER_KERNEL(add_raw, +PD_REGISTER_KERNEL(add_raw, CPU, ALL_LAYOUT, phi::AddRawKernel, @@ -129,7 +129,7 @@ PT_REGISTER_KERNEL(add_raw, int64_t, complex64, complex128) {} -PT_REGISTER_KERNEL(subtract_raw, +PD_REGISTER_KERNEL(subtract_raw, CPU, ALL_LAYOUT, phi::SubtractRawKernel, @@ -140,7 +140,7 @@ PT_REGISTER_KERNEL(subtract_raw, int64_t, complex64, complex128) {} -PT_REGISTER_KERNEL(divide_raw, +PD_REGISTER_KERNEL(divide_raw, CPU, ALL_LAYOUT, phi::DivideRawKernel, @@ -150,7 +150,7 @@ PT_REGISTER_KERNEL(divide_raw, int64_t, complex64, complex128) {} -PT_REGISTER_KERNEL(multiply_raw, +PD_REGISTER_KERNEL(multiply_raw, CPU, ALL_LAYOUT, phi::MultiplyRawKernel, @@ -161,7 +161,7 @@ PT_REGISTER_KERNEL(multiply_raw, bool, complex64, complex128) {} -PT_REGISTER_KERNEL(sum_raw, +PD_REGISTER_KERNEL(sum_raw, CPU, ALL_LAYOUT, phi::SumRawKernel, @@ -176,5 +176,5 @@ PT_REGISTER_KERNEL(sum_raw, complex128) { kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED); } -PT_REGISTER_KERNEL( +PD_REGISTER_KERNEL( mean_raw, CPU, ALL_LAYOUT, phi::MeanRawKernel, float, double, bool) {} diff --git a/paddle/phi/kernels/cpu/matmul_grad_kernel.cc b/paddle/phi/kernels/cpu/matmul_grad_kernel.cc index 56a185e4ade064f91b1e7a52ff48997c7e9941e1..c68e8115e898b3701b9f568ac501260615b69ad4 100644 --- a/paddle/phi/kernels/cpu/matmul_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/matmul_grad_kernel.cc @@ -19,7 +19,7 @@ limitations under the License. */ #include "paddle/phi/kernels/impl/matmul_grad_kernel_impl.h" -PT_REGISTER_KERNEL(matmul_grad, +PD_REGISTER_KERNEL(matmul_grad, CPU, ALL_LAYOUT, phi::MatmulGradKernel, @@ -28,7 +28,7 @@ PT_REGISTER_KERNEL(matmul_grad, phi::dtype::complex, phi::dtype::complex) {} -PT_REGISTER_KERNEL(matmul_double_grad, +PD_REGISTER_KERNEL(matmul_double_grad, CPU, ALL_LAYOUT, phi::MatmulDoubleGradKernel, @@ -37,7 +37,7 @@ PT_REGISTER_KERNEL(matmul_double_grad, phi::dtype::complex, phi::dtype::complex) {} -PT_REGISTER_KERNEL(matmul_triple_grad, +PD_REGISTER_KERNEL(matmul_triple_grad, CPU, ALL_LAYOUT, phi::MatmulTripleGradKernel, diff --git a/paddle/phi/kernels/cpu/matmul_kernel.cc b/paddle/phi/kernels/cpu/matmul_kernel.cc index 8676aec3eccb475a9de346e34e15c01c195aebbb..2bf56c07a5bc7485fd29d6ac347a5311915d8f36 100644 --- a/paddle/phi/kernels/cpu/matmul_kernel.cc +++ b/paddle/phi/kernels/cpu/matmul_kernel.cc @@ -20,7 +20,7 @@ limitations under the License. 
*/ #include "paddle/phi/common/complex.h" #include "paddle/phi/kernels/impl/matmul_kernel_impl.h" -PT_REGISTER_KERNEL(matmul, +PD_REGISTER_KERNEL(matmul, CPU, ALL_LAYOUT, phi::MatmulKernel, diff --git a/paddle/phi/kernels/cpu/norm_grad_kernel.cc b/paddle/phi/kernels/cpu/norm_grad_kernel.cc index d2073c07244bd54acbfcf7bf81028684f3ea739b..597207a05a226ac598d9141b42d5682bed5364f1 100644 --- a/paddle/phi/kernels/cpu/norm_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/norm_grad_kernel.cc @@ -83,5 +83,5 @@ void NormGradKernel(const Context& ctx, } // namespace phi -PT_REGISTER_KERNEL( +PD_REGISTER_KERNEL( norm_grad, CPU, ALL_LAYOUT, phi::NormGradKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/norm_kernel.cc b/paddle/phi/kernels/cpu/norm_kernel.cc index e8f35b5fe7efd8dc04f16dffa877af082456a14d..50906d9c3bb9495817e81678b60fe3e426a22444 100644 --- a/paddle/phi/kernels/cpu/norm_kernel.cc +++ b/paddle/phi/kernels/cpu/norm_kernel.cc @@ -76,4 +76,4 @@ void NormKernel(const Context& ctx, } // namespace phi -PT_REGISTER_KERNEL(norm, CPU, ALL_LAYOUT, phi::NormKernel, float, double) {} +PD_REGISTER_KERNEL(norm, CPU, ALL_LAYOUT, phi::NormKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/poisson_grad_kernel.cc b/paddle/phi/kernels/cpu/poisson_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..4e274a7af9ff30428b71dec8367deca71dbb4fe5 --- /dev/null +++ b/paddle/phi/kernels/cpu/poisson_grad_kernel.cc @@ -0,0 +1,19 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/poisson_grad_kernel_impl.h" + +PD_REGISTER_KERNEL( + poisson_grad, CPU, ALL_LAYOUT, phi::PoissonGradKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/poisson_kernel.cc b/paddle/phi/kernels/cpu/poisson_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..6a3e32c2f07853f57e123e64660cd6bc50d8574b --- /dev/null +++ b/paddle/phi/kernels/cpu/poisson_kernel.cc @@ -0,0 +1,41 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/poisson_kernel.h" + +namespace phi { + +template +void PoissonKernel(const Context& ctx, const DenseTensor& x, DenseTensor* out) { + const T* x_data = x.data(); + T* out_data = ctx.template Alloc(out); + int64_t size = x.numel(); + + auto gen = ctx.GetGenerator(); + auto engine = gen->GetCPUEngine(); + + for (int64_t i = 0; i < size; ++i) { + std::poisson_distribution<> dist(x_data[i]); + out_data[i] = static_cast(dist(*engine)); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + poisson, CPU, ALL_LAYOUT, phi::PoissonKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/randperm_kernel.cc b/paddle/phi/kernels/cpu/randperm_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..28092c8df6d153c6b5e787027f0c2239bd257cc1 --- /dev/null +++ b/paddle/phi/kernels/cpu/randperm_kernel.cc @@ -0,0 +1,46 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/randperm_kernel.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/device_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void RandpermKernel(const Context& ctx, + int n, + DataType dtype, + DenseTensor* out) { + T* out_data = ctx.template Alloc(out); + auto gen_ptr = ctx.GetHostGenerator(); + auto engine = gen_ptr->GetCPUEngine(); + + for (int i = 0; i < n; ++i) { + out_data[i] = static_cast(i); + } + std::shuffle(out_data, out_data + n, *engine); +} + +} // namespace phi + +PD_REGISTER_KERNEL(randperm, + CPU, + ALL_LAYOUT, + phi::RandpermKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/cpu/scale_kernel.cc b/paddle/phi/kernels/cpu/scale_kernel.cc index 156afb8798de40000dcdea7d613734b92f1bc162..e929b5bd7219b60acb226374f67a0bc511c41723 100644 --- a/paddle/phi/kernels/cpu/scale_kernel.cc +++ b/paddle/phi/kernels/cpu/scale_kernel.cc @@ -51,7 +51,7 @@ void ScaleKernel(const Context& dev_ctx, } // namespace phi -PT_REGISTER_KERNEL(scale, +PD_REGISTER_KERNEL(scale, CPU, ALL_LAYOUT, phi::ScaleKernel, diff --git a/paddle/phi/kernels/cpu/sign_kernel.cc b/paddle/phi/kernels/cpu/sign_kernel.cc index 6be931904d133159b907d296d17aebdba9bc2501..5fe11ffbd6d5c08b5072b61ab23d6fbea1879b53 100644 --- a/paddle/phi/kernels/cpu/sign_kernel.cc +++ b/paddle/phi/kernels/cpu/sign_kernel.cc @@ -21,4 +21,4 @@ limitations under the License. */ // See Note [ Why still include the fluid headers? 
] #include "paddle/phi/common/bfloat16.h" -PT_REGISTER_KERNEL(sign, CPU, ALL_LAYOUT, phi::SignKernel, float, double) {} +PD_REGISTER_KERNEL(sign, CPU, ALL_LAYOUT, phi::SignKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/split_kernel.cc b/paddle/phi/kernels/cpu/split_kernel.cc index d02909f007da462089903d0f0764e2cf86231ede..722681fb7bc3f9d9f75b92468b89931910dd532e 100644 --- a/paddle/phi/kernels/cpu/split_kernel.cc +++ b/paddle/phi/kernels/cpu/split_kernel.cc @@ -19,7 +19,7 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/infermeta/unary.h" -#include "paddle/phi/kernels/cpu/concat_and_split.h" +#include "paddle/phi/kernels/funcs/concat_and_split_functor.h" namespace phi { template @@ -54,13 +54,14 @@ void SplitKernel(const Context& dev_ctx, paddle::operators::StridedMemcpyWithAxis0( dev_ctx, x, shape_refer, &outs); } else { - SplitImpl(dev_ctx, x, shape_refer, axis, &outs); + phi::funcs::SplitFunctor functor; + functor(dev_ctx, x, shape_refer, axis, &outs); } } } // namespace phi -PT_REGISTER_KERNEL(split, +PD_REGISTER_KERNEL(split, CPU, ALL_LAYOUT, phi::SplitKernel, @@ -69,4 +70,5 @@ PT_REGISTER_KERNEL(split, int64_t, int, bool, - phi::dtype::float16) {} + phi::dtype::float16, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/cpu/trace_grad_kernel.cc b/paddle/phi/kernels/cpu/trace_grad_kernel.cc index e6ffd99bc53bd837aa3ef5ea142890fd4786249d..2167851b197d142a3e9c4b104175fd9147de6972 100644 --- a/paddle/phi/kernels/cpu/trace_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/trace_grad_kernel.cc @@ -18,7 +18,7 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/trace_grad_kernel_impl.h" -PT_REGISTER_KERNEL(trace_grad, +PD_REGISTER_KERNEL(trace_grad, CPU, ALL_LAYOUT, phi::TraceGradKernel, diff --git a/paddle/phi/kernels/cpu/trace_kernel.cc b/paddle/phi/kernels/cpu/trace_kernel.cc index 2b2cda6491d48487834321b376920f8943ea3650..3646e226519139430818c0f17b3f40c61c516dbd 100644 --- a/paddle/phi/kernels/cpu/trace_kernel.cc +++ b/paddle/phi/kernels/cpu/trace_kernel.cc @@ -45,7 +45,7 @@ void TraceKernel(const Context& dev_ctx, } // namespace phi -PT_REGISTER_KERNEL(trace, +PD_REGISTER_KERNEL(trace, CPU, ALL_LAYOUT, phi::TraceKernel, diff --git a/paddle/phi/kernels/cpu/trunc_grad_kernel.cc b/paddle/phi/kernels/cpu/trunc_grad_kernel.cc index 7fc677c16ef7397e0963bbd1c9eed3ac49f136e0..4d85dd609e2d1f14cc476a1c53ba0506e6b519a5 100644 --- a/paddle/phi/kernels/cpu/trunc_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/trunc_grad_kernel.cc @@ -30,7 +30,7 @@ void TruncGradKernel(const Context& dev_ctx, } // namespace phi -PT_REGISTER_KERNEL(trunc_grad, +PD_REGISTER_KERNEL(trunc_grad, CPU, ALL_LAYOUT, phi::TruncGradKernel, diff --git a/paddle/phi/kernels/cpu/trunc_kernel.cc b/paddle/phi/kernels/cpu/trunc_kernel.cc index 10e42196679fa546f7611b97fbcda812bedf4b23..babae6ce7c9318f7cb4ba1f15aedbe38de5ebbd3 100644 --- a/paddle/phi/kernels/cpu/trunc_kernel.cc +++ b/paddle/phi/kernels/cpu/trunc_kernel.cc @@ -35,5 +35,5 @@ void TruncKernel(const Context& dev_ctx, } // namespace phi -PT_REGISTER_KERNEL( +PD_REGISTER_KERNEL( trunc, CPU, ALL_LAYOUT, phi::TruncKernel, float, double, int, int64_t) {} diff --git a/paddle/phi/kernels/cpu/unbind_kernel.cc b/paddle/phi/kernels/cpu/unbind_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..655f8c8aafbf201dc07db0fa1af79605c2a76763 --- /dev/null +++ b/paddle/phi/kernels/cpu/unbind_kernel.cc @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/unbind_kernel.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/unbind_kernel_impl.h" + +PD_REGISTER_KERNEL(unbind, + CPU, + ALL_LAYOUT, + phi::UnbindKernel, + float, + double, + phi::dtype::float16, + phi::dtype::bfloat16, + int, + int64_t) {} diff --git a/paddle/phi/kernels/cpu/unfold_grad_kernel.cc b/paddle/phi/kernels/cpu/unfold_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..c97005dd84547eeb04603da6dc29b922715b936a --- /dev/null +++ b/paddle/phi/kernels/cpu/unfold_grad_kernel.cc @@ -0,0 +1,21 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/unfold_grad_kernel.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/unfold_grad_kernel_impl.h" + +PD_REGISTER_KERNEL( + unfold_grad, CPU, ALL_LAYOUT, phi::UnfoldGradKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/unfold_kernel.cc b/paddle/phi/kernels/cpu/unfold_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..e38d8acd098204e82245ab697967b8c209bfb0e6 --- /dev/null +++ b/paddle/phi/kernels/cpu/unfold_kernel.cc @@ -0,0 +1,20 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
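The unbind kernel registered above splits a tensor along one axis into separate tensors, with the actual copy logic living in impl/unbind_kernel_impl.h. A standalone sketch of the axis-0 case on a plain row-major buffer (illustrative only, not the phi code):

```cpp
// Unbind a [n, m] row-major buffer along axis 0 into n length-m outputs.
#include <cstdio>
#include <vector>

int main() {
  const int n = 3, m = 2;
  std::vector<float> x = {1, 2, 3, 4, 5, 6};  // shape [3, 2]
  std::vector<std::vector<float>> outs(n);
  for (int i = 0; i < n; ++i) {
    outs[i].assign(x.begin() + i * m, x.begin() + (i + 1) * m);
  }
  for (auto& o : outs) std::printf("[%g %g] ", o[0], o[1]);  // [1 2] [3 4] [5 6]
  return 0;
}
```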
+ +#include "paddle/phi/kernels/unfold_kernel.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/unfold_kernel_impl.h" + +PD_REGISTER_KERNEL(unfold, CPU, ALL_LAYOUT, phi::UnfoldKernel, float, double) {} diff --git a/paddle/phi/kernels/cross_grad_kernel.h b/paddle/phi/kernels/cross_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..9ea0804a94b6b5d145a13c8f794a9f01498bf7db --- /dev/null +++ b/paddle/phi/kernels/cross_grad_kernel.h @@ -0,0 +1,30 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void CrossGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out_grad, + int axis, + DenseTensor* x_grad, + DenseTensor* y_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/cross_kernel.h b/paddle/phi/kernels/cross_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..567889e0783452bf17630a074528cfbf3658ec38 --- /dev/null +++ b/paddle/phi/kernels/cross_kernel.h @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void CrossKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/kernels/empty_kernel.cc b/paddle/phi/kernels/empty_kernel.cc index 6d9e733b2f57677c70e259f39d20c332a5fff195..8109d3879cb21edd85d19612a62d9a8e0711e456 100644 --- a/paddle/phi/kernels/empty_kernel.cc +++ b/paddle/phi/kernels/empty_kernel.cc @@ -38,7 +38,7 @@ void EmptyLikeKernel(const Context& dev_ctx, } // namespace phi -PT_REGISTER_KERNEL(empty, +PD_REGISTER_KERNEL(empty, CPU, ALL_LAYOUT, phi::EmptyKernel, @@ -54,7 +54,7 @@ PT_REGISTER_KERNEL(empty, phi::dtype::complex, phi::dtype::complex) {} -PT_REGISTER_KERNEL(empty_like, +PD_REGISTER_KERNEL(empty_like, CPU, ALL_LAYOUT, phi::EmptyLikeKernel, @@ -71,7 +71,7 @@ PT_REGISTER_KERNEL(empty_like, phi::dtype::complex) {} #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PT_REGISTER_KERNEL(empty, +PD_REGISTER_KERNEL(empty, GPU, ALL_LAYOUT, phi::EmptyKernel, @@ -86,7 +86,7 @@ PT_REGISTER_KERNEL(empty, phi::dtype::complex, phi::dtype::complex) {} -PT_REGISTER_KERNEL(empty_like, +PD_REGISTER_KERNEL(empty_like, GPU, ALL_LAYOUT, phi::EmptyLikeKernel, diff --git a/paddle/phi/kernels/flatten_grad_kernel.cc b/paddle/phi/kernels/flatten_grad_kernel.cc index 33e6c2724982a7c916636d2f782898eedf875225..7e8010a43f3d1898309ff72ab7189c58d4ece71d 100644 --- a/paddle/phi/kernels/flatten_grad_kernel.cc +++ b/paddle/phi/kernels/flatten_grad_kernel.cc @@ -32,7 +32,7 @@ void FlattenGradKernel(const Context& dev_ctx, } // namespace phi -PT_REGISTER_KERNEL(flatten_grad, +PD_REGISTER_KERNEL(flatten_grad, CPU, ALL_LAYOUT, phi::FlattenGradKernel, @@ -44,7 +44,7 @@ PT_REGISTER_KERNEL(flatten_grad, int64_t) {} #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PT_REGISTER_KERNEL(flatten_grad, +PD_REGISTER_KERNEL(flatten_grad, GPU, ALL_LAYOUT, phi::FlattenGradKernel, @@ -59,7 +59,7 @@ PT_REGISTER_KERNEL(flatten_grad, #endif #ifdef PADDLE_WITH_XPU -PT_REGISTER_KERNEL(flatten_grad, +PD_REGISTER_KERNEL(flatten_grad, XPU, ALL_LAYOUT, phi::FlattenGradKernel, diff --git a/paddle/phi/kernels/flatten_kernel.cc b/paddle/phi/kernels/flatten_kernel.cc index 1ac444aa1792f4645c44feb117a5eacc409b0017..12eaab92d5211c08143ba72058cd4443aca1501c 100644 --- a/paddle/phi/kernels/flatten_kernel.cc +++ b/paddle/phi/kernels/flatten_kernel.cc @@ -48,7 +48,7 @@ void FlattenWithXShape(const Context& dev_ctx, } // namespace phi -PT_REGISTER_KERNEL(flatten, +PD_REGISTER_KERNEL(flatten, CPU, ALL_LAYOUT, phi::FlattenKernel, @@ -60,7 +60,7 @@ PT_REGISTER_KERNEL(flatten, int, int64_t) {} -PT_REGISTER_KERNEL(flatten_with_xshape, +PD_REGISTER_KERNEL(flatten_with_xshape, CPU, ALL_LAYOUT, phi::FlattenWithXShape, @@ -73,7 +73,7 @@ PT_REGISTER_KERNEL(flatten_with_xshape, int64_t) {} #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PT_REGISTER_KERNEL(flatten, +PD_REGISTER_KERNEL(flatten, GPU, ALL_LAYOUT, phi::FlattenKernel, @@ -86,7 +86,7 @@ PT_REGISTER_KERNEL(flatten, int, int64_t) {} -PT_REGISTER_KERNEL(flatten_with_xshape, +PD_REGISTER_KERNEL(flatten_with_xshape, GPU, ALL_LAYOUT, phi::FlattenWithXShape, @@ -101,7 +101,7 @@ PT_REGISTER_KERNEL(flatten_with_xshape, #endif #ifdef PADDLE_WITH_XPU -PT_REGISTER_KERNEL(flatten, +PD_REGISTER_KERNEL(flatten, XPU, ALL_LAYOUT, phi::FlattenKernel, @@ -112,7 +112,7 @@ PT_REGISTER_KERNEL(flatten, int, int64_t) {} -PT_REGISTER_KERNEL(flatten_with_xshape, 
+PD_REGISTER_KERNEL(flatten_with_xshape, XPU, ALL_LAYOUT, phi::FlattenWithXShape, diff --git a/paddle/phi/kernels/flip_kernel.h b/paddle/phi/kernels/flip_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..4470486fec0fb6ba1e176d9696bf43b559b62485 --- /dev/null +++ b/paddle/phi/kernels/flip_kernel.h @@ -0,0 +1,29 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void FlipKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& axis, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/kernels/funcs/CMakeLists.txt b/paddle/phi/kernels/funcs/CMakeLists.txt index ba0c848df434ed403c29a5754043784066f7ef2a..aa4fac169200753639c48f5e9b5fa8c3bbfbd33c 100644 --- a/paddle/phi/kernels/funcs/CMakeLists.txt +++ b/paddle/phi/kernels/funcs/CMakeLists.txt @@ -3,3 +3,4 @@ add_subdirectory(blas) add_subdirectory(lapack) math_library(math_function DEPS blas dense_tensor tensor) +math_library(concat_and_split_functor DEPS dense_tensor) diff --git a/paddle/phi/kernels/funcs/broadcast_function.h b/paddle/phi/kernels/funcs/broadcast_function.h index be57b8630f89578e8de48f6dc581cb6fc37a1048..84a36b849afa1c4cdcc1a0f4d4ada598944a1faa 100644 --- a/paddle/phi/kernels/funcs/broadcast_function.h +++ b/paddle/phi/kernels/funcs/broadcast_function.h @@ -16,7 +16,7 @@ limitations under the License. 
*/ #include "paddle/phi/kernels/funcs/elementwise_base.h" -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__) namespace kps = phi::kps; @@ -122,7 +122,7 @@ struct DimensionsTransform { explicit DimensionsTransform(const std::vector &ins, const phi::DDim &dims, int axis) { - const int N = max(static_cast(ins.size()), 2); + const int N = std::max(static_cast(ins.size()), 2); dim_size = dims.size(); out_dims = phi::vectorize(dims); in_dims.resize(N); @@ -183,7 +183,7 @@ struct DimensionsTransform { } }; -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__) template __device__ __forceinline__ void LoadData( @@ -268,7 +268,7 @@ __global__ void VectorizedBroadcastKernel( int block_offset = BLOCK_ID_X * BLOCK_NUM_X * VecSize; int stride = BLOCK_NUM_X * GRID_NUM_X * VecSize; -#ifdef PADDLE_WITH_XPU2 +#ifdef PADDLE_WITH_XPU_KP for (; block_offset < main_offset; block_offset += stride) { VectorizedBroadcastKernelImpl outs_data; for (int i = 0; i < NumOuts; ++i) { - outs_data[i] = ctx.Alloc((*outs)[i]); + outs_data[i] = (_ptr_ OutT *)(ctx.Alloc((*outs)[i])); } for (int i = 0; i < Arity; i++) { use_broadcast[i] = (ins[i]->numel() != numel); - ins_data[i] = (_ptr_ InT *)(ins[i]->data()); + ins_data[i] = (const _ptr_ InT *)(ins[i]->data()); if (use_broadcast[i]) { // get the broadcast config, // if data shape is[m, n], then you should set data_dim = {n, m} @@ -363,7 +363,7 @@ void LaunchBroadcastKernel(const KPDevice &ctx, } } -#ifdef PADDLE_WITH_XPU2 +#ifdef PADDLE_WITH_XPU_KP const int threads = 64; const int blocks = 8; int main_offset = (numel / (VecSize * threads)) * VecSize * threads; diff --git a/paddle/phi/kernels/funcs/common_shape.h b/paddle/phi/kernels/funcs/common_shape.h index e14241d03c3af09bd1d0201da0f53ffadd2b2c4a..d5289dcc22cbc546acc4980403e7e4641abe39f1 100644 --- a/paddle/phi/kernels/funcs/common_shape.h +++ b/paddle/phi/kernels/funcs/common_shape.h @@ -42,12 +42,12 @@ inline void GetBroadcastDimsArrays(const DDim &x_dims, PADDLE_ENFORCE_GE( axis, 0, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Axis should be great than or equal to 0, but received axis is %d.", axis)); PADDLE_ENFORCE_LT(axis, max_dim, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Axis should be less than %d, but received axis is %d.", max_dim, axis)); @@ -72,7 +72,7 @@ inline void GetBroadcastDimsArrays(const DDim &x_dims, x_dims_array[i] == y_dims_array[i] || x_dims_array[i] <= 1 || y_dims_array[i] <= 1, true, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Broadcast dimension mismatch. Operands could " "not be broadcast together with the shape of X = [%s] and " "the shape of Y = [%s]. 
Received [%d] in X is not equal to " @@ -128,5 +128,17 @@ static void GetBroadcastDims(const DDim &in_dims, } } +inline bool CheckDims(const DDim &dims_x, const DDim &dims_y) { + if (dims_x.size() != dims_y.size()) { + return false; + } + for (int i = 0; i < dims_x.size(); i++) { + if (dims_x[i] != dims_y[i]) { + return false; + } + } + return true; +} + } // namespace funcs } // namespace phi diff --git a/paddle/phi/kernels/funcs/complex_functors.h b/paddle/phi/kernels/funcs/complex_functors.h index 450adfcc68b7e84e27a2f6bf2c6c22551bab8892..86dbdd099ecde72e932cc6cfa492486b65c7ebc2 100644 --- a/paddle/phi/kernels/funcs/complex_functors.h +++ b/paddle/phi/kernels/funcs/complex_functors.h @@ -154,6 +154,53 @@ struct AbsFunctor>> { int64_t numel_; }; +template +struct AbsGradCUDAFunctor { + HOSTDEVICE inline AbsGradCUDAFunctor() {} + + HOSTDEVICE inline T operator()(const T x, const T dout) const { + T output; + if (x == T(0)) { + output = T(0); + } else { + output = T(dout) * (x / T(std::abs(x))); + } + return output; + } +}; + +template <> +struct AbsGradCUDAFunctor> { + HOSTDEVICE inline AbsGradCUDAFunctor() {} + HOSTDEVICE inline phi::dtype::complex operator()( + const phi::dtype::complex x, const float dout) const { + phi::dtype::complex output; + if (x == phi::dtype::complex(0)) { + output = phi::dtype::complex(0); + } else { + output = phi::dtype::complex(dout) * + (x / phi::dtype::complex(abs(x))); + } + return output; + } +}; + +template <> +struct AbsGradCUDAFunctor> { + HOSTDEVICE inline AbsGradCUDAFunctor() {} + HOSTDEVICE inline phi::dtype::complex operator()( + const phi::dtype::complex x, const double dout) const { + phi::dtype::complex output; + if (x == phi::dtype::complex(0)) { + output = phi::dtype::complex(0); + } else { + output = phi::dtype::complex(dout) * + (x / phi::dtype::complex(abs(x))); + } + return output; + } +}; + template struct AbsGradFunctor { AbsGradFunctor(const Real* dout, const T* x, T* output, int64_t numel) diff --git a/paddle/phi/kernels/funcs/concat_and_split_functor.cc b/paddle/phi/kernels/funcs/concat_and_split_functor.cc new file mode 100644 index 0000000000000000000000000000000000000000..c8405703a5c16ae9eae583638d1c89c22a736531 --- /dev/null +++ b/paddle/phi/kernels/funcs/concat_and_split_functor.cc @@ -0,0 +1,146 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include + +#include "paddle/fluid/framework/convert_utils.h" +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/phi/core/utils/data_type.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/funcs/concat_and_split_functor.h" + +namespace phi { +namespace funcs { + +/* + * All tensors' dimension should be the same and the values of + * each dimension must be the same, except the axis dimension. + */ +template +struct ConcatFunctor { + void operator()(const phi::CPUContext& context, + const std::vector& input, + int axis, + phi::DenseTensor* output) { + // TODO(zcd): Add input data validity checking + size_t num = input.size(); + + int64_t rows = 1; + auto dim_0 = input[0].dims(); + for (int i = 0; i < axis; ++i) { + rows *= dim_0[i]; + } + int64_t out_rows = rows, out_cols = 0; + + std::vector input_cols(input.size()); + for (size_t i = 0; i < num; ++i) { + int64_t t_cols = input[i].numel() / rows; + out_cols += t_cols; + input_cols[i] = t_cols; + } + auto cpu_place = context.GetPlace(); + + // computation + auto output_data = output->data(); + int64_t col_idx = 0; + for (size_t j = 0; j < num; ++j) { + int64_t col_len = input_cols[j]; + auto input_data = input[j].data(); + for (int64_t k = 0; k < out_rows; ++k) { + paddle::memory::Copy(cpu_place, + output_data + k * out_cols + col_idx, + cpu_place, + input_data + k * col_len, + sizeof(T) * col_len); + } + col_idx += col_len; + } + } +}; + +/* + * All tensors' dimension should be the same and the values of + * each dimension must be the same, except the axis dimension. 
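For intuition, the CPU ConcatFunctor above views every tensor as a 2-D [rows x cols] block: rows is the product of the dimensions before axis, and each input contributes numel() / rows columns. A worked example with illustrative shapes (not taken from the patch): concatenating [2, 3] and [2, 5] along axis = 1 gives rows = 2, input_cols = {3, 5}, out_cols = 8; for each row k the copy loop writes input 0 to output_data + k * 8 + 0 and input 1 to output_data + k * 8 + 3, i.e. col_idx is the running sum of the earlier inputs' column counts.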
+ */ +template +struct SplitFunctor { + public: + void operator()(const phi::CPUContext& context, + const phi::DenseTensor& input, + const std::vector& ref_inputs, + int axis, + std::vector* outputs) { + // NOTE(zhiqiu): split a tensor of shape [0,3,4] at axis=1, result in 3 + // tensors of shape [0,1,4] + if (input.numel() == 0) { + return; + } + + // TODO(zcd): Add input data validity checking + size_t num = outputs->size(); + + int input_rows = 1; + auto dim_0 = ref_inputs[0]->dims(); + for (int i = 0; i < axis; ++i) { + input_rows *= dim_0[i]; + } + + int input_cols = 0; + + std::vector output_cols(outputs->size()); + for (size_t i = 0; i < num; ++i) { + int t_cols = ref_inputs[i]->numel() / input_rows; + input_cols += t_cols; + output_cols[i] = t_cols; + } + auto cpu_place = context.GetPlace(); + + // computation + for (int k = 0; k < input_rows; ++k) { + const T* src_ptr = input.data() + k * input_cols; + int col_idx = 0; + for (size_t j = 0; j < num; ++j) { + int col_len = output_cols[j]; + auto* out_tensor = outputs->at(j); + if (out_tensor != nullptr) { + T* dst_ptr = out_tensor->data() + k * col_len; + paddle::memory::Copy(cpu_place, + dst_ptr, + cpu_place, + src_ptr + col_idx, + sizeof(T) * col_len); + } + col_idx += col_len; + } + } + } +}; + +#define DEFINE_FUNCTOR(type) \ + template class ConcatFunctor; \ + template class SplitFunctor; + +FOR_ALL_TYPES(DEFINE_FUNCTOR); + +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/funcs/concat_and_split_functor.cu b/paddle/phi/kernels/funcs/concat_and_split_functor.cu new file mode 100644 index 0000000000000000000000000000000000000000..2abfdb606e7e6c410f6f9deb45aed536bea88207 --- /dev/null +++ b/paddle/phi/kernels/funcs/concat_and_split_functor.cu @@ -0,0 +1,584 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
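The CPU SplitFunctor above is the inverse bookkeeping: output_cols[j] = ref_inputs[j]->numel() / input_rows, and for every input row the j-th column slice is copied into output j. A standalone sketch of that copy pattern on plain buffers (hypothetical example, no DenseTensor involved, not part of the patch):

    #include <cstddef>
    #include <cstring>
    #include <vector>

    int main() {
      // Split a row-major [2 x 8] buffer into [2 x 3] and [2 x 5] along axis 1.
      std::vector<float> input(2 * 8);
      for (int i = 0; i < 16; ++i) input[i] = static_cast<float>(i);
      std::vector<float> out0(2 * 3), out1(2 * 5);
      std::vector<float*> outs = {out0.data(), out1.data()};
      std::vector<int> out_cols = {3, 5};
      const int in_rows = 2, in_cols = 8;
      for (int k = 0; k < in_rows; ++k) {  // same loop order as the functor
        int col_idx = 0;
        for (std::size_t j = 0; j < outs.size(); ++j) {
          std::memcpy(outs[j] + k * out_cols[j],
                      input.data() + k * in_cols + col_idx,
                      sizeof(float) * out_cols[j]);
          col_idx += out_cols[j];
        }
      }
      return 0;
    }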
*/ + +#include +#include +#include + +#include "paddle/fluid/framework/convert_utils.h" +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/enforce.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/utils/data_type.h" +#include "paddle/phi/kernels/funcs/concat_and_split_functor.h" + +namespace phi { +namespace funcs { + +template +__global__ void ConcatKernel_(const T** inputs, + const int64_t* input_cols, + int col_size, + const int64_t output_rows, + const int64_t output_cols, + T* output) { + int tid_x = blockIdx.x * blockDim.x + threadIdx.x; + int curr_segment = 0; + int curr_offset = input_cols[0]; + for (; tid_x < output_cols; tid_x += blockDim.x * gridDim.x) { + int curr_col_offset = input_cols[curr_segment + 1]; + while (curr_col_offset <= tid_x) { + curr_offset = curr_col_offset; + ++curr_segment; + curr_col_offset = input_cols[curr_segment + 1]; + } + + int local_col = tid_x - curr_offset; + int segment_width = curr_col_offset - curr_offset; + + const T* input_ptr = inputs[curr_segment]; + int tid_y = blockIdx.y * blockDim.y + threadIdx.y; + for (; tid_y < output_rows; tid_y += blockDim.y * gridDim.y) + output[tid_y * output_cols + tid_x] = + input_ptr[tid_y * segment_width + local_col]; + } +} + +template +__device__ void ConcatKernelDetail(const T** inputs_data, + const int fixed_in_col, + const int out_rows, + const int out_cols, + T* output_data) { + int tid_x = blockIdx.x * blockDim.x + threadIdx.x; + for (; tid_x < out_cols; tid_x += blockDim.x * gridDim.x) { + int split = tid_x * 1.0 / fixed_in_col; + int in_offset = tid_x - split * fixed_in_col; + const T* input_ptr = inputs_data[split]; + int tid_y = blockIdx.y * blockDim.y + threadIdx.y; + for (; tid_y < out_rows; tid_y += blockDim.y * gridDim.y) { + output_data[tid_y * out_cols + tid_x] = + input_ptr[tid_y * fixed_in_col + in_offset]; + } + } +} + +template +__global__ void ConcatKernel_(const T* input_addr0, + const T* input_addr1, + const int64_t fixed_in_col, + const int64_t out_rows, + const int64_t out_cols, + T* output_data) { + const T* inputs_data[2]; + inputs_data[0] = input_addr0; + inputs_data[1] = input_addr1; + ConcatKernelDetail( + inputs_data, fixed_in_col, out_rows, out_cols, output_data); +} + +template +__global__ void ConcatKernel_(const T* input_addr0, + const T* input_addr1, + const T* input_addr2, + const int64_t fixed_in_col, + const int64_t out_rows, + const int64_t out_cols, + T* output_data) { + const T* inputs_data[3]; + inputs_data[0] = input_addr0; + inputs_data[1] = input_addr1; + inputs_data[2] = input_addr2; + ConcatKernelDetail( + inputs_data, fixed_in_col, out_rows, out_cols, output_data); +} + +template +__global__ void ConcatKernel_(const T* input_addr0, + const T* input_addr1, + const T* input_addr2, + const T* input_addr3, + const int64_t fixed_in_col, + const int64_t out_rows, + const int64_t out_cols, + T* output_data) { + const T* inputs_data[4]; + inputs_data[0] = input_addr0; + inputs_data[1] = input_addr1; + inputs_data[2] = input_addr2; + inputs_data[3] = input_addr3; + ConcatKernelDetail( + inputs_data, fixed_in_col, out_rows, out_cols, output_data); +} + +template +__global__ void ConcatKernel_(const T** inputs_data, + const int in_num, + const int64_t fixed_in_col, + 
const int64_t out_rows, + const int64_t out_cols, + T* output_data) { + ConcatKernelDetail( + inputs_data, fixed_in_col, out_rows, out_cols, output_data); +} + +template +__global__ void SplitKernel_(const T* input_data, + const int64_t in_row, + const int64_t in_col, + const int64_t* out_cols, + int out_cols_size, + T** outputs_data) { + int tid_x = blockIdx.x * blockDim.x + threadIdx.x; + int curr_segment = 0; + int curr_offset = out_cols[0]; + for (; tid_x < in_col; tid_x += blockDim.x * gridDim.x) { + int curr_col_offset = out_cols[curr_segment + 1]; + while (curr_col_offset <= tid_x) { + curr_offset = curr_col_offset; + ++curr_segment; + curr_col_offset = out_cols[curr_segment + 1]; + } + + int local_col = tid_x - curr_offset; + int segment_width = curr_col_offset - curr_offset; + T* output_ptr = outputs_data[curr_segment]; + if (output_ptr != nullptr) { + int tid_y = blockIdx.y * blockDim.y + threadIdx.y; + for (; tid_y < in_row; tid_y += blockDim.y * gridDim.y) + output_ptr[tid_y * segment_width + local_col] = + input_data[tid_y * in_col + tid_x]; + } + } +} + +template +__device__ void SplitKernelDetail(const T* input_data, + const int in_row, + const int in_col, + const int fixed_out_col, + T** outputs_data) { + int tid_x = blockIdx.x * blockDim.x + threadIdx.x; + for (; tid_x < in_col; tid_x += blockDim.x * gridDim.x) { + int split = tid_x / fixed_out_col; + int in_offset = tid_x - split * fixed_out_col; + T* output_ptr = outputs_data[split]; + if (output_ptr != nullptr) { + int tid_y = blockIdx.y * blockDim.y + threadIdx.y; + for (; tid_y < in_row; tid_y += blockDim.y * gridDim.y) + output_ptr[tid_y * fixed_out_col + in_offset] = + input_data[tid_y * in_col + tid_x]; + } + } +} + +template +__global__ void SplitKernel_(const T* input_data, + const int64_t in_row, + const int64_t in_col, + const int64_t fixed_out_col, + T** outputs_data) { + SplitKernelDetail(input_data, in_row, in_col, fixed_out_col, outputs_data); +} + +template +__global__ void SplitKernel_(const T* input_data, + const int64_t in_row, + const int64_t in_col, + const int64_t fixed_out_col, + T* outputs_addr0, + T* outputs_addr1) { + T* outputs_data[2]; + outputs_data[0] = outputs_addr0; + outputs_data[1] = outputs_addr1; + SplitKernelDetail(input_data, in_row, in_col, fixed_out_col, outputs_data); +} + +template +__global__ void SplitKernel_(const T* input_data, + const int64_t in_row, + const int64_t in_col, + const int64_t fixed_out_col, + T* outputs_addr0, + T* outputs_addr1, + T* outputs_addr2) { + T* outputs_data[3]; + outputs_data[0] = outputs_addr0; + outputs_data[1] = outputs_addr1; + outputs_data[2] = outputs_addr2; + SplitKernelDetail(input_data, in_row, in_col, fixed_out_col, outputs_data); +} + +template +__global__ void SplitKernel_(const T* input_data, + const int64_t in_row, + const int64_t in_col, + const int64_t fixed_out_col, + T* outputs_addr0, + T* outputs_addr1, + T* outputs_addr2, + T* outputs_addr3) { + T* outputs_data[4]; + outputs_data[0] = outputs_addr0; + outputs_data[1] = outputs_addr1; + outputs_data[2] = outputs_addr2; + outputs_data[3] = outputs_addr3; + SplitKernelDetail(input_data, in_row, in_col, fixed_out_col, outputs_data); +} + +static inline void GetBlockDims(const phi::GPUContext& context, + int64_t num_rows, + int64_t num_cols, + dim3* block_dims, + dim3* grid_dims) { + // Set the thread block and grid according to CurrentDeviceId + const int kThreadsPerBlock = 1024; + int block_cols = kThreadsPerBlock; + if (num_cols < kThreadsPerBlock) { // block_cols is aligned by 32. 
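// Illustrative arithmetic for the statement below (the numbers are an example,
// not from the patch): num_cols = 70 gives ((70 + 31) >> 5) << 5
// = (101 >> 5) << 5 = 3 << 5 = 96, i.e. the column count is rounded up to the
// next multiple of 32 so each thread block spans whole warps.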
+ block_cols = ((num_cols + 31) >> 5) << 5; + } + int block_rows = kThreadsPerBlock / block_cols; + *block_dims = dim3(block_cols, block_rows, 1); + + int max_threads = context.GetMaxPhysicalThreadCount(); + int64_t max_blocks = std::max(max_threads / kThreadsPerBlock, 1); + + int grid_cols = + std::min((num_cols + block_cols - 1) / block_cols, max_blocks); + int grid_rows = std::min(max_blocks / grid_cols, + std::max(num_rows / block_rows, (int64_t)1)); + *grid_dims = dim3(grid_cols, grid_rows, 1); +} + +/* + * All tensors' dimension should be the same and the values of + * each dimension must be the same, except the axis dimension. + */ + +template +struct ConcatFunctor { + void operator()(const phi::GPUContext& context, + const std::vector& input, + int axis, + phi::DenseTensor* output) { + // TODO(zcd): Add input data validity checking + int in_num = input.size(); + int64_t in_row = 1; + auto dim_0 = input[0].dims(); + for (int i = 0; i < axis; ++i) { + in_row *= dim_0[i]; + } + int64_t in_col = input[0].numel() / in_row; + int64_t out_row = in_row, out_col = 0; + + int inputs_col_num = in_num + 1; + std::vector inputs_data_vec(in_num); + std::vector inputs_col_vec(inputs_col_num); + const T** inputs_data = inputs_data_vec.data(); + int64_t* inputs_col = inputs_col_vec.data(); + +// There are some differences between hip runtime and NV runtime. +// In NV, when the pageable memory data less than 64K is transferred from +// hosttodevice, it will be automatically asynchronous. +// However, only pinned memory in hip can copy asynchronously +// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#concurrent-execution-host-device +// 3.2.6.1. Concurrent Execution between Host and Device +// Memory copies from host to device of a memory block of 64 KB or less +#ifdef PADDLE_WITH_HIP + paddle::memory::AllocationPtr data_alloc, col_alloc; + // TODO(chentianyu03): try to find a method to remove the Alloc function + data_alloc = paddle::memory::Alloc(paddle::platform::CUDAPinnedPlace(), + in_num * sizeof(T*)); + inputs_data = reinterpret_cast(data_alloc->ptr()); + // TODO(chentianyu03): try to find a method to remove the Alloc function + col_alloc = paddle::memory::Alloc(paddle::platform::CUDAPinnedPlace(), + inputs_col_num * sizeof(int)); + inputs_col = reinterpret_cast(col_alloc->ptr()); +#endif + + inputs_col[0] = 0; + bool has_same_shape = true; + for (int i = 0; i < in_num; ++i) { + int64_t t_cols = input[i].numel() / in_row; + if (has_same_shape) { + if (t_cols != in_col) has_same_shape = false; + } + out_col += t_cols; + inputs_col[i + 1] = out_col; + inputs_data[i] = input[i].data(); + } + + dim3 block_dims; + dim3 grid_dims; + GetBlockDims(context, out_row, out_col, &block_dims, &grid_dims); + + paddle::memory::allocation::AllocationPtr tmp_dev_ins_data; + const T** dev_ins_data = nullptr; + if (!has_same_shape || in_num < 2 || in_num > 4) { + tmp_dev_ins_data = paddle::memory::Alloc(context, in_num * sizeof(T*)); + auto* restored = paddle::platform::RestoreHostMemIfCapturingCUDAGraph( + inputs_data, in_num); + paddle::memory::Copy(context.GetPlace(), + tmp_dev_ins_data->ptr(), + paddle::platform::CPUPlace(), + restored, + in_num * sizeof(T*), + context.stream()); + dev_ins_data = reinterpret_cast(tmp_dev_ins_data->ptr()); + } + + if (has_same_shape) { + if (in_num == 2) { + ConcatKernel_<<>>( + inputs_data[0], + inputs_data[1], + in_col, + out_row, + out_col, + output->data()); + } else if (in_num == 3) { + ConcatKernel_<<>>( + inputs_data[0], + inputs_data[1], + 
inputs_data[2], + in_col, + out_row, + out_col, + output->data()); + } else if (in_num == 4) { + ConcatKernel_<<>>( + inputs_data[0], + inputs_data[1], + inputs_data[2], + inputs_data[3], + in_col, + out_row, + out_col, + output->data()); + } else { + ConcatKernel_<<>>( + dev_ins_data, in_num, in_col, out_row, out_col, output->data()); + } + } else { + auto tmp_dev_ins_col_data = + paddle::memory::Alloc(context, inputs_col_num * sizeof(int64_t)); + + auto* restored = paddle::platform::RestoreHostMemIfCapturingCUDAGraph( + inputs_col, inputs_col_num); + paddle::memory::Copy(context.GetPlace(), + tmp_dev_ins_col_data->ptr(), + paddle::platform::CPUPlace(), + restored, + inputs_col_num * sizeof(int64_t), + context.stream()); + int64_t* dev_ins_col_data = + static_cast(tmp_dev_ins_col_data->ptr()); + + ConcatKernel_<<>>( + dev_ins_data, + dev_ins_col_data, + static_cast(inputs_col_num), + out_row, + out_col, + output->data()); + } + +#ifdef PADDLE_WITH_HIP + // Prevent the pinned memory value from being covered and release the memory + // after the launch kernel of the stream is executed (reapply pinned memory + // next time) + auto* data_alloc_released = data_alloc.release(); + auto* col_alloc_released = col_alloc.release(); + context.AddStreamCallback([data_alloc_released, col_alloc_released] { + paddle::memory::allocation::Allocator::AllocationDeleter( + data_alloc_released); + paddle::memory::allocation::Allocator::AllocationDeleter( + col_alloc_released); + }); +#endif + } +}; + +template +class SplitFunctor { + public: + void operator()(const phi::GPUContext& context, + const phi::DenseTensor& input, + const std::vector& ref_inputs, + int axis, + std::vector* outputs) { + // NOTE(zhiqiu): split a tensor of shape [0,3,4] at axis=1, result in 3 + // tensors of shape [0,1,4] + if (input.numel() == 0) { + return; + } + + // TODO(zcd): Add input data validity checking + int o_num = outputs->size(); + int64_t out_row = 1; + auto dim_0 = ref_inputs[0]->dims(); + for (int i = 0; i < axis; ++i) { + out_row *= dim_0[i]; + } + + int64_t out0_col = ref_inputs[0]->numel() / out_row; + int64_t in_col = 0, in_row = out_row; + bool has_same_shape = true; + + int outputs_cols_num = o_num + 1; + std::vector outputs_data_vec(o_num); + std::vector outputs_cols_vec(outputs_cols_num); + T** outputs_data = outputs_data_vec.data(); + int64_t* outputs_cols = outputs_cols_vec.data(); + +// There are some differences between hip runtime and NV runtime. +// In NV, when the pageable memory data less than 64K is transferred from +// hosttodevice, it will be automatically asynchronous. +// However, only pinned memory in hip can copy asynchronously +// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#concurrent-execution-host-device +// 3.2.6.1. 
Concurrent Execution between Host and Device +// Memory copies from host to device of a memory block of 64 KB or less +#ifdef PADDLE_WITH_HIP + paddle::memory::AllocationPtr data_alloc, cols_alloc; + // TODO(chentianyu03): try to find a method to remove the Alloc function + data_alloc = paddle::memory::Alloc(paddle::platform::CUDAPinnedPlace(), + o_num * sizeof(T*)); + outputs_data = reinterpret_cast(data_alloc->ptr()); + // TODO(chentianyu03): try to find a method to remove the Alloc function + cols_alloc = paddle::memory::Alloc(paddle::platform::CUDAPinnedPlace(), + (outputs_cols_num) * sizeof(int64_t)); + outputs_cols = reinterpret_cast(cols_alloc->ptr()); +#endif + + outputs_cols[0] = 0; + for (int i = 0; i < o_num; ++i) { + int64_t t_col = ref_inputs.at(i)->numel() / out_row; + if (has_same_shape) { + if (t_col != out0_col) has_same_shape = false; + } + in_col += t_col; + outputs_cols[i + 1] = in_col; + if (outputs->at(i) != nullptr) { + outputs_data[i] = outputs->at(i)->data(); + } else { + outputs_data[i] = nullptr; + } + } + + dim3 block_dims; + dim3 grid_dims; + GetBlockDims(context, out_row, in_col, &block_dims, &grid_dims); + + paddle::memory::allocation::AllocationPtr tmp_dev_outs_data; + T** dev_out_gpu_data = nullptr; + if (!has_same_shape || o_num < 2 || o_num > 4) { + // TODO(chentianyu03): try to find a method to remove the Alloc function + tmp_dev_outs_data = paddle::memory::Alloc(context, o_num * sizeof(T*)); + auto* restored = paddle::platform::RestoreHostMemIfCapturingCUDAGraph( + outputs_data, o_num); + paddle::memory::Copy(context.GetPlace(), + tmp_dev_outs_data->ptr(), + paddle::platform::CPUPlace(), + restored, + o_num * sizeof(T*), + context.stream()); + dev_out_gpu_data = reinterpret_cast(tmp_dev_outs_data->ptr()); + } + + if (has_same_shape) { + if (o_num == 2) { + SplitKernel_<<>>( + input.data(), + in_row, + in_col, + out0_col, + outputs_data[0], + outputs_data[1]); + } else if (o_num == 3) { + SplitKernel_<<>>( + input.data(), + in_row, + in_col, + out0_col, + outputs_data[0], + outputs_data[1], + outputs_data[2]); + } else if (o_num == 4) { + SplitKernel_<<>>( + input.data(), + in_row, + in_col, + out0_col, + outputs_data[0], + outputs_data[1], + outputs_data[2], + outputs_data[3]); + } else { + SplitKernel_<<>>( + input.data(), in_row, in_col, out0_col, dev_out_gpu_data); + } + } else { + auto tmp_dev_ins_col_data = + // TODO(chentianyu03): try to find a method to remove the Alloc + // function + paddle::memory::Alloc(context, outputs_cols_num * sizeof(int64_t)); + auto* restored = paddle::platform::RestoreHostMemIfCapturingCUDAGraph( + outputs_cols, outputs_cols_num); + paddle::memory::Copy(context.GetPlace(), + tmp_dev_ins_col_data->ptr(), + paddle::platform::CPUPlace(), + restored, + outputs_cols_num * sizeof(int64_t), + context.stream()); + int64_t* dev_outs_col_data = + reinterpret_cast(tmp_dev_ins_col_data->ptr()); + + SplitKernel_<<>>( + input.data(), + in_row, + in_col, + dev_outs_col_data, + static_cast(outputs_cols_num), + dev_out_gpu_data); + } +#ifdef PADDLE_WITH_HIP + // Prevent the pinned memory value from being covered and release the memory + // after the launch kernel of the stream is executed (reapply pinned memory + // next time) + auto* data_alloc_released = data_alloc.release(); + auto* cols_alloc_released = cols_alloc.release(); + context.AddStreamCallback([data_alloc_released, cols_alloc_released] { + paddle::memory::allocation::Allocator::AllocationDeleter( + data_alloc_released); + 
paddle::memory::allocation::Allocator::AllocationDeleter( + cols_alloc_released); + }); +#endif + } +}; + +#define DEFINE_FUNCTOR(type) \ + template class ConcatFunctor; \ + template class SplitFunctor + +FOR_ALL_TYPES(DEFINE_FUNCTOR); + +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/funcs/concat_and_split_functor.h b/paddle/phi/kernels/funcs/concat_and_split_functor.h new file mode 100644 index 0000000000000000000000000000000000000000..3af4d878d3cab03eb80a6ba878cc4fa5a62103c9 --- /dev/null +++ b/paddle/phi/kernels/funcs/concat_and_split_functor.h @@ -0,0 +1,90 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include + +#include "paddle/fluid/framework/convert_utils.h" +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/utils/data_type.h" + +namespace phi { +namespace funcs { + +/* + * \brief Concatenate the input tensors along the dimension axis. + * TODO(zcd): maybe it needs to be more detailed. + * Examples: + * Input[0] = [[1,2],[3,4]] + * Input[1] = [[5,6]] + * axis = 0 + * + * Output = [[1,2], + * [3,4], + * [5,6]] + */ +template +struct ConcatFunctor { + void operator()(const Context& context, + const std::vector& input, + int axis, + phi::DenseTensor* output); +}; + +/* + * \brief Split the input tensors along the dimension axis into outputs. + * TODO(zcd): maybe it needs to be more detailed. 
+ * Examples: + * Input = [[1,2], + * [3,4], + * [5,6]] + * axis = 0 + * + * Output[0] = [[1,2],[3,4]] + * Output[1] = [[5,6]] + */ +template +class SplitFunctor { + public: + void operator()(const Context& context, + const phi::DenseTensor& input, + const std::vector& ref_inputs, + int axis, + std::vector* outputs); +}; + +} // namespace funcs +} // namespace phi + +#define FOR_ALL_TYPES(macro) \ + macro(int); \ + macro(float); \ + macro(double); \ + macro(bool); \ + macro(int64_t); \ + macro(int16_t); \ + macro(uint8_t); \ + macro(int8_t); \ + macro(phi::dtype::float16); \ + macro(phi::dtype::bfloat16); \ + macro(phi::dtype::complex); \ + macro(phi::dtype::complex); diff --git a/paddle/phi/kernels/funcs/concat_funcs.h b/paddle/phi/kernels/funcs/concat_funcs.h index 63f0c8058acc16f1665bda7d6a2b91cdc24ef2b0..32237e2cc236657db5a99fdd64392da4ff900562 100644 --- a/paddle/phi/kernels/funcs/concat_funcs.h +++ b/paddle/phi/kernels/funcs/concat_funcs.h @@ -23,7 +23,7 @@ static inline int64_t ComputeAxis(int64_t axis, int64_t rank) { PADDLE_ENFORCE_EQ( axis >= -rank && axis < rank, true, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The axis is expected to be in range of [%d, %d), but got %d", -rank, rank, @@ -42,17 +42,17 @@ static inline phi::DDim ComputeAndCheckShape( auto out_dims = inputs_dims[0]; size_t in_zero_dims_size = out_dims.size(); for (size_t i = 1; i < n; i++) { - PADDLE_ENFORCE_EQ(inputs_dims[i].size(), - out_dims.size(), - paddle::platform::errors::InvalidArgument( - "The shape of input[0] and input[%d] " - "is expected to be equal." - "But received input[0]'s shape = " - "[%s], input[%d]'s shape = [%s].", - i, - inputs_dims[0], - i, - inputs_dims[i])); + PADDLE_ENFORCE_EQ( + inputs_dims[i].size(), + out_dims.size(), + phi::errors::InvalidArgument("The shape of input[0] and input[%d] " + "is expected to be equal." + "But received input[0]'s shape = " + "[%s], input[%d]'s shape = [%s].", + i, + inputs_dims[0], + i, + inputs_dims[i])); for (size_t j = 0; j < in_zero_dims_size; j++) { if (j == axis) { if (is_runtime) { @@ -71,7 +71,7 @@ static inline phi::DDim ComputeAndCheckShape( // check all shape in run time PADDLE_ENFORCE_EQ(inputs_dims[0][j], inputs_dims[i][j], - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The %d-th dimension of input[0] and input[%d] " "is expected to be equal." "But received input[0]'s shape = " @@ -92,4 +92,4 @@ static inline phi::DDim ComputeAndCheckShape( } } // namespace funcs -} // namespace pten +} // namespace phi diff --git a/paddle/phi/kernels/funcs/eigen/common.h b/paddle/phi/kernels/funcs/eigen/common.h index dc64d3b122f1014ddfed081269859d46c26f43ad..d34427df0e499b78fccdfe80660277152560e34d 100644 --- a/paddle/phi/kernels/funcs/eigen/common.h +++ b/paddle/phi/kernels/funcs/eigen/common.h @@ -21,7 +21,7 @@ limitations under the License. */ namespace phi { -// EigenDim converts paddle::platform::DDim into Eigen::DSizes. +// EigenDim converts phi::DDim into Eigen::DSizes. template struct EigenDim { using Type = Eigen::DSizes; @@ -29,7 +29,7 @@ struct EigenDim { static Type From(const DDim& dims) { PADDLE_ENFORCE_EQ(arity(dims), D, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Input dimension size should be equal to %d, but " "received dimension size is %d.", arity(dims), @@ -42,7 +42,7 @@ struct EigenDim { } }; -// Interpret paddle::platform::Tensor as EigenTensor and EigenConstTensor. +// Interpret phi::Tensor as EigenTensor and EigenConstTensor. 
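The DEFINE_FUNCTOR / FOR_ALL_TYPES pair above is the usual explicit-instantiation setup: concat_and_split_functor.h only declares ConcatFunctor and SplitFunctor, while the .cc and .cu files define them and instantiate them once per supported element type, so callers only need the header. A stripped-down sketch of the same idiom (Square and FOR_EXAMPLE_TYPES are illustrative names, not from the patch):

    // Header part: declaration only; definitions live in one translation unit.
    template <typename T>
    struct Square {
      T operator()(T x) const;
    };

    // Source part: out-of-line definition plus macro-generated explicit
    // instantiations, mirroring DEFINE_FUNCTOR / FOR_ALL_TYPES above.
    template <typename T>
    T Square<T>::operator()(T x) const {
      return x * x;
    }

    #define INSTANTIATE_SQUARE(type) template struct Square<type>;
    #define FOR_EXAMPLE_TYPES(macro) macro(float) macro(double) macro(int)
    FOR_EXAMPLE_TYPES(INSTANTIATE_SQUARE)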
template { int rank = tensor.dims().size(); PADDLE_ENFORCE_EQ((num_col_dims > 0 && num_col_dims < rank), true, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Input dimension number(num_col_dims) must be " "between 0 and %d, but received number is %d.", rank, @@ -100,7 +100,7 @@ struct EigenMatrix : public EigenTensor { int rank = tensor.dims().size(); PADDLE_ENFORCE_EQ((num_col_dims > 0 && num_col_dims < rank), true, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Input dimension number(num_col_dims) must be " "between 0 and %d, but received number is %d.", rank, diff --git a/paddle/phi/kernels/funcs/eigen/extensions.h b/paddle/phi/kernels/funcs/eigen/extensions.h index 5fc8f76d988d1449cc41a89a1740ffeb9a3b05df..fbb9d8e3d2ef552750fc98d10a63d230661adf49 100644 --- a/paddle/phi/kernels/funcs/eigen/extensions.h +++ b/paddle/phi/kernels/funcs/eigen/extensions.h @@ -14,6 +14,8 @@ #pragma once +#ifndef __xpu__ + #include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/complex.h" #include "paddle/phi/common/float16.h" @@ -435,3 +437,5 @@ HOSTDEVICE inline float16 maxi(const float16& a, const float16& b) { } // namespace numext } // namespace Eigen + +#endif // __xpu__ diff --git a/paddle/phi/kernels/funcs/elementwise_base.h b/paddle/phi/kernels/funcs/elementwise_base.h index 9fb2dac6c425f6224da713fb6ada636355b42c26..47f1593a11eb9e29cc90b7db36650826734ac27f 100644 --- a/paddle/phi/kernels/funcs/elementwise_base.h +++ b/paddle/phi/kernels/funcs/elementwise_base.h @@ -21,12 +21,13 @@ limitations under the License. */ #include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/funcs/math_function.h" -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__) #include "paddle/fluid/platform/aligned_vector.h" #include "paddle/fluid/platform/function_traits.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/kernels/primitive/kernel_primitives.h" +#define HOSTDEVICE __host__ __device__ namespace kps = phi::kps; #endif @@ -343,7 +344,7 @@ inline void get_mid_dims(const DDim &x_dims, if (x_dims[i + axis] != y_dims[i]) { PADDLE_ENFORCE_EQ(y_dims[i] == 1 || x_dims[i + axis] == 1, true, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "Broadcast dimension mismatch. Operands " "could not be broadcast together with the shape of " "X = [%s] and the shape of Y = [%s]. Received [%d] " @@ -436,7 +437,7 @@ inline void ElementwiseGradPreProcess(const DenseTensor &dout, } } -#if defined(__NVCC__) || defined(__HIPCC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__) // static unroller template