Unverified commit e75c01f9, authored by Wang Xin, committed by GitHub

clean up WITH_MLU (#52546)

Parent 075d6b14
@@ -53,7 +53,6 @@ option(WITH_TENSORRT "Compile PaddlePaddle with NVIDIA TensorRT" OFF)
option(WITH_XPU "Compile PaddlePaddle with BAIDU KUNLUN XPU" OFF)
option(WITH_XPU_KP "Compile PaddlePaddle with BAIDU XPU compiler " OFF)
option(WITH_XPU_XFT "Compile PaddlePaddle with BAIDU XPU-XFT" OFF)
option(WITH_MLU "Compile PaddlePaddle with CAMBRICON MLU" OFF)
option(WITH_WIN_DUMP_DBG "Compile with windows core dump debug mode" OFF)
option(WITH_ASCEND "Compile PaddlePaddle with ASCEND" OFF)
option(WITH_ROCM "Compile PaddlePaddle with ROCM platform" OFF)
@@ -81,9 +80,6 @@ endif()
if(WITH_GPU AND WITH_ROCM)
message(FATAL_ERROR "Error when compile CUDA and ROCM at the same time")
endif()
if(WITH_GPU AND WITH_MLU)
message(FATAL_ERROR "Error when compile GPU and MLU at the same time")
endif()
if(WITH_GPU AND NOT APPLE)
enable_language(CUDA)
@@ -430,14 +426,6 @@ if(NOT WITH_XPU AND WITH_XPU_BKCL)
CACHE STRING "Disable BKCL when compiling without XPU" FORCE)
endif()
if(NOT WITH_MLU AND WITH_CNCL)
message(
WARNING "Disable CNCL when compiling without MLU. Force WITH_MLU=OFF.")
set(WITH_MLU
OFF
CACHE STRING "Disable CNCL when compiling without MLU" FORCE)
endif()
if(WITH_NCCL)
add_definitions("-DPADDLE_WITH_NCCL")
include(nccl)
@@ -469,10 +457,6 @@ if(WITH_GPU)
endif()
endif()
if(WITH_MLU)
include(neuware)
endif()
if(WITH_ROCM)
include(hip)
include(miopen) # set miopen libraries, must before configure
...
@@ -116,11 +116,6 @@ if(WITH_IPU)
add_definitions(-DPADDLE_WITH_IPU)
endif()
if(WITH_MLU)
message(STATUS "Compile with MLU!")
add_definitions(-DPADDLE_WITH_MLU)
endif()
if(WITH_GPU)
add_definitions(-DPADDLE_WITH_CUDA)
add_definitions(-DEIGEN_USE_GPU)
...
if(NOT WITH_MLU)
return()
endif()
if(NOT ENV{NEUWARE_HOME})
set(NEUWARE_HOME "/usr/local/neuware")
else()
set(NEUWARE_HOME $ENV{NEUWARE_HOME})
endif()
message(STATUS "NEUWARE_HOME: " ${NEUWARE_HOME})
set(NEUWARE_INCLUDE_DIR ${NEUWARE_HOME}/include)
set(NEUWARE_LIB_DIR ${NEUWARE_HOME}/lib64)
include_directories(${NEUWARE_INCLUDE_DIR})
set(CNNL_LIB ${NEUWARE_LIB_DIR}/libcnnl.so)
set(MLUOP_LIB ${NEUWARE_LIB_DIR}/libmluops.so)
set(CNRT_LIB ${NEUWARE_LIB_DIR}/libcnrt.so)
set(CNDRV_LIB ${NEUWARE_LIB_DIR}/libcndrv.so)
set(CNPAPI_LIB ${NEUWARE_LIB_DIR}/libcnpapi.so)
generate_dummy_static_lib(LIB_NAME "neuware_lib" GENERATOR "neuware.cmake")
set(NEUWARE_LIB_DEPS ${CNNL_LIB} ${MLUOP_LIB} ${CNRT_LIB} ${CNDRV_LIB}
${CNPAPI_LIB})
if(WITH_CNCL)
message(STATUS "Compile with CNCL!")
add_definitions(-DPADDLE_WITH_CNCL)
set(CNCL_LIB ${NEUWARE_LIB_DIR}/libcncl.so)
list(APPEND NEUWARE_LIB_DEPS ${CNCL_LIB})
endif()
target_link_libraries(neuware_lib ${NEUWARE_LIB_DEPS})
@@ -74,9 +74,6 @@ function(op_library TARGET)
set(MKLDNN_FILE)
set(op_common_deps operator op_registry math_function layer
common_infer_shape_functions)
if(WITH_MLU)
set(op_common_deps ${op_common_deps} mlu_baseop)
endif()
# Option `UNITY` is used to specify that operator `TARGET` will compiles with Unity Build.
set(options UNITY)
@@ -169,12 +166,6 @@ function(op_library TARGET)
list(APPEND xpu_kp_cc_srcs ${TARGET}.kps)
endif()
endif()
if(WITH_MLU)
string(REPLACE "_op" "_op_mlu" MLU_FILE "${TARGET}")
if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${MLU_FILE}.cc)
list(APPEND mlu_cc_srcs ${MLU_FILE}.cc)
endif()
endif()
else()
foreach(src ${op_library_SRCS})
if(WITH_ROCM AND ${src} MATCHES ".*_cudnn_op.cu$")
@@ -201,8 +192,6 @@ function(op_library TARGET)
list(APPEND xpu_kp_cc_srcs ${src})
elseif(WITH_XPU_KP AND ${src} MATCHES ".*\\.kps$")
list(APPEND xpu_kp_cc_srcs ${src})
elseif(WITH_MLU AND ${src} MATCHES ".*_op_mlu.cc$")
list(APPEND mlu_cc_srcs ${src})
elseif(${src} MATCHES ".*\\.cc$")
list(APPEND cc_srcs ${src})
elseif((WITH_ROCM OR WITH_GPU) AND ${src} MATCHES ".*\\.kps$")
@@ -519,18 +508,6 @@ function(op_library TARGET)
endforeach()
endif()
# pybind USE_OP_DEVICE_KERNEL for MLU
if(WITH_MLU AND ${mlu_cc_srcs_len} GREATER 0)
foreach(mlu_src ${mlu_cc_srcs})
set(op_name "")
find_register(${mlu_src} "REGISTER_OP_MLU_KERNEL" op_name)
if(NOT ${op_name} EQUAL "")
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${op_name}, MLU);\n")
set(pybind_flag 1)
endif()
endforeach()
endif()
# pybind USE_OP_DEVICE_KERNEL for MKLDNN
if(WITH_MKLDNN AND ${mkldnn_cc_srcs_len} GREATER 0)
# Append first implemented MKLDNN activation operator
...
@@ -356,11 +356,6 @@ if(WITH_XPU)
list(APPEND third_party_deps extern_xpu)
endif()
if(WITH_MLU)
include(external/concurrentqueue) # download, build, install concurrentqueue
list(APPEND third_party_deps extern_concurrentqueue)
endif()
if(WITH_PSLIB)
include(external/pslib) # download, build, install pslib
list(APPEND third_party_deps extern_pslib)
...
@@ -99,11 +99,6 @@ struct DLDeviceVisitor
"platform::NPUPinnedPlace is not supported"));
}
inline ::DLDevice operator()(const platform::MLUPlace &place) const {
PADDLE_THROW(
platform::errors::Unimplemented("platform::MLUPlace is not supported"));
}
inline ::DLDevice operator()(const platform::CustomPlace &place) const {
PADDLE_THROW(platform::errors::Unimplemented(
"platform::CustomPlace is not supported"));
...
@@ -516,17 +516,6 @@ void Executor::RunPartialPreparedContext(ExecutorPrepareContext* ctx,
#else
PADDLE_THROW(
platform::errors::Unimplemented("No IPU gc found in CPU/IPU paddle"));
#endif
} else if (platform::is_mlu_place(place_)) {
#ifdef PADDLE_WITH_MLU
if (IsFastEagerDeletionModeEnabled()) {
gc.reset(new MLUUnsafeFastGarbageCollector(place_, max_memory_size));
} else {
gc.reset(new MLUDefaultStreamGarbageCollector(place_, max_memory_size));
}
#else
PADDLE_THROW(
platform::errors::Unimplemented("No MLU gc found in CPU/MLU paddle"));
#endif
} else if (platform::is_custom_place(place_)) {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
...
@@ -125,56 +125,6 @@ void CUDAPinnedGarbageCollector::ClearCallback(
}
#endif
#ifdef PADDLE_WITH_MLU
MLUDefaultStreamGarbageCollector::MLUDefaultStreamGarbageCollector(
const platform::MLUPlace &place, size_t max_memory_size)
: GarbageCollector(place, max_memory_size) {}
void MLUDefaultStreamGarbageCollector::Wait() const {
static_cast<platform::MLUDeviceContext *>(this->dev_ctx_)
->WaitStreamCallback();
}
void MLUDefaultStreamGarbageCollector::ClearCallback(
const std::function<void()> &callback) {
static_cast<platform::MLUDeviceContext *>(this->dev_ctx_)
->AddStreamCallback(callback);
}
MLUUnsafeFastGarbageCollector::MLUUnsafeFastGarbageCollector(
const platform::MLUPlace &place, size_t max_memory_size)
: GarbageCollector(place, max_memory_size) {}
void MLUUnsafeFastGarbageCollector::ClearCallback(
const std::function<void()> &callback) {
callback();
}
MLUStreamGarbageCollector::MLUStreamGarbageCollector(
const platform::MLUPlace &place, size_t max_memory_size)
: GarbageCollector(place, max_memory_size) {
platform::MLUDeviceGuard guard(place.device);
PADDLE_ENFORCE_MLU_SUCCESS(cnrtQueueCreate(&stream_));
callback_manager_.reset(
new platform::StreamCallbackManager<mluStream>(stream_));
}
MLUStreamGarbageCollector::~MLUStreamGarbageCollector() {
auto place = this->dev_ctx_->GetPlace();
platform::MLUDeviceGuard guard(place.device);
PADDLE_ENFORCE_MLU_SUCCESS(cnrtQueueSync(stream_));
PADDLE_ENFORCE_MLU_SUCCESS(cnrtQueueDestroy(stream_));
}
mluStream MLUStreamGarbageCollector::stream() const { return stream_; }
void MLUStreamGarbageCollector::Wait() const { callback_manager_->Wait(); }
void MLUStreamGarbageCollector::ClearCallback(
const std::function<void()> &callback) {
callback_manager_->AddCallback(callback);
}
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
CustomDefaultStreamGarbageCollector::CustomDefaultStreamGarbageCollector(
const platform::CustomPlace &place, size_t max_memory_size)
...
@@ -22,9 +22,6 @@
#include "gflags/gflags.h"
#include "paddle/fluid/platform/device_context.h"
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/platform/device/mlu/device_context.h"
#endif
#include "paddle/fluid/platform/stream_callback_manager.h" #include "paddle/fluid/platform/stream_callback_manager.h"
namespace paddle { namespace paddle {
...@@ -139,46 +136,6 @@ class CUDAPinnedGarbageCollector : public GarbageCollector { ...@@ -139,46 +136,6 @@ class CUDAPinnedGarbageCollector : public GarbageCollector {
}; };
#endif #endif
#ifdef PADDLE_WITH_MLU
class MLUDefaultStreamGarbageCollector : public GarbageCollector {
public:
MLUDefaultStreamGarbageCollector(const platform::MLUPlace &place,
size_t max_memory_size);
void Wait() const override;
protected:
void ClearCallback(const std::function<void()> &callback) override;
};
class MLUUnsafeFastGarbageCollector : public GarbageCollector {
public:
MLUUnsafeFastGarbageCollector(const platform::MLUPlace &place,
size_t max_memory_size);
protected:
void ClearCallback(const std::function<void()> &callback) override;
};
class MLUStreamGarbageCollector : public GarbageCollector {
public:
MLUStreamGarbageCollector(const platform::MLUPlace &place,
size_t max_memory_size);
~MLUStreamGarbageCollector();
void Wait() const override;
mluStream stream() const;
protected:
void ClearCallback(const std::function<void()> &callback) override;
private:
mluStream stream_;
std::unique_ptr<platform::StreamCallbackManager<mluStream>> callback_manager_;
};
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
class CustomDefaultStreamGarbageCollector : public GarbageCollector {
public:
...
@@ -376,9 +376,6 @@ struct OpKernelRegistrarFunctorEx<PlaceType,
#define REGISTER_OP_NPU_KERNEL(op_type, ...) \
REGISTER_OP_KERNEL(op_type, NPU, ::paddle::platform::NPUPlace, __VA_ARGS__)
#define REGISTER_OP_MLU_KERNEL(op_type, ...) \
REGISTER_OP_KERNEL(op_type, MLU, ::paddle::platform::MLUPlace, __VA_ARGS__)
#define REGISTER_OP_KERNEL_EX(op_type, library_type, place_class, \
customized_name, \
customized_type_value, \
@@ -421,12 +418,6 @@ struct OpKernelRegistrarFunctorEx<PlaceType,
::paddle::framework::OpKernelType::kDefaultCustomizedTypeValue, \
__VA_ARGS__)
#define REGISTER_OP_MLU_KERNEL_FUNCTOR(op_type, ...) \
REGISTER_OP_KERNEL_EX( \
op_type, MLU, ::paddle::platform::MLUPlace, DEFAULT_TYPE, \
::paddle::framework::OpKernelType::kDefaultCustomizedTypeValue, \
__VA_ARGS__)
#define REGISTER_OP_IPU_KERNEL_FUNCTOR(op_type, ...) \
REGISTER_OP_KERNEL_EX( \
op_type, IPU, ::paddle::platform::IPUPlace, DEFAULT_TYPE, \
...
@@ -57,10 +57,6 @@ class DenseTensor;
#include "paddle/fluid/platform/mkldnn_op_list.h"
#endif
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/platform/device/mlu/mlu_info.h"
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
#endif
@@ -770,16 +766,6 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
#else
auto dev_id = place.device;
platform::SetXPUDeviceId(dev_id);
#endif
} else if (platform::is_mlu_place(place)) {
#ifndef PADDLE_WITH_MLU
PADDLE_THROW(platform::errors::Unavailable(
"Cannot run operator on place %s, please recompile paddle or "
"reinstall Paddle with MLU support.",
place));
#else
auto dev_id = place.device;
platform::SetMLUDeviceId(dev_id);
#endif
} else if (platform::is_custom_place(place)) {
#ifndef PADDLE_WITH_CUSTOM_DEVICE
@@ -2301,16 +2287,6 @@ void OperatorWithKernel::ChooseKernel(const ExecutionContext& ctx) const {
}
#endif
#ifdef PADDLE_WITH_MLU
if (kernel_iter == kernels.end() &&
platform::is_mlu_place(expected_kernel_key.place_)) {
VLOG(3) << "missing MLU kernel: " << type_
<< ", expected_kernel_key:" << expected_kernel_key
<< ", fallbacking to CPU one!";
expected_kernel_key.place_ = platform::CPUPlace();
kernel_iter = kernels.find(expected_kernel_key);
}
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
if (kernel_iter == kernels.end() &&
platform::is_custom_place(expected_kernel_key.place_)) {
...
@@ -522,19 +522,6 @@ ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) {
PADDLE_THROW(platform::errors::PermissionDenied(
"Paddle can't use CUDA device since it's not compiled with CUDA,"
"Please recompile or reinstall Paddle with GPU support."));
#endif
} else if (platform::is_mlu_place(place)) {
#ifdef PADDLE_WITH_MLU
if (IsFastEagerDeletionModeEnabled()) {
gc.reset(new MLUUnsafeFastGarbageCollector(place, max_memory_size));
} else {
gc.reset(new MLUStreamGarbageCollector(place, max_memory_size));
}
VLOG(10) << "Created " << i << "-th GarbageCollector at " << place;
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"Paddle can't use MLU device since it's not compiled with MLU,"
"Please recompile or reinstall Paddle with MLU support."));
#endif
} else if (platform::is_xpu_place(place)) {
#if defined(PADDLE_WITH_XPU)
...
@@ -112,15 +112,6 @@ phi::KernelKey FallBackToCpu(const phi::KernelKey& kernel_key,
phi::Backend::CPU, kernel_key.layout(), kernel_key.dtype());
}
#endif
#ifdef PADDLE_WITH_MLU
if (kernel_key.backend() == phi::Backend::MLU) {
VLOG(3) << "phi missing MLU kernel: " << op.Type()
<< ", expected_kernel_key:" << kernel_key
<< ", fallback to CPU one!";
return phi::KernelKey(
phi::Backend::CPU, kernel_key.layout(), kernel_key.dtype());
}
#endif
#ifdef PADDLE_WITH_IPU
if (kernel_key.backend() == phi::Backend::IPU) {
VLOG(3) << "phi missing IPU kernel: " << op.Type()
...
@@ -267,59 +267,6 @@ void TensorCopyImpl(const TENSOR& src,
"Copying from %s to %s is not supported.", src_place, dst_place));
}
#endif
#ifdef PADDLE_WITH_MLU
else if (platform::is_mlu_place(src_place) && // NOLINT
platform::is_cpu_place(dst_place)) {
auto src_mlu_place = src_place;
auto dst_cpu_place = dst_place;
auto stream =
reinterpret_cast<const platform::MLUDeviceContext&>(ctx).stream();
memory::Copy(dst_cpu_place, dst_ptr, src_mlu_place, src_ptr, size, stream);
}
else if (platform::is_cpu_place(src_place) && // NOLINT
platform::is_mlu_place(dst_place)) {
auto src_cpu_place = src_place;
auto dst_mlu_place = dst_place;
auto stream =
reinterpret_cast<const platform::MLUDeviceContext&>(ctx).stream();
memory::Copy(dst_mlu_place, dst_ptr, src_cpu_place, src_ptr, size, stream);
}
else if (platform::is_mlu_place(src_place) && // NOLINT
platform::is_mlu_place(dst_place)) {
auto src_mlu_place = src_place;
auto dst_mlu_place = dst_place;
auto stream =
reinterpret_cast<const platform::MLUDeviceContext&>(ctx).stream();
memory::Copy(dst_mlu_place, dst_ptr, src_mlu_place, src_ptr, size, stream);
}
else { // NOLINT
PADDLE_THROW(platform::errors::Unimplemented(
"Copying from %s to %s is not supported.", src_place, dst_place));
}
#endif
#ifdef PADDLE_WITH_IPU
else if (platform::is_ipu_place(src_place) && // NOLINT
platform::is_cpu_place(dst_place)) {
memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
}
else if (platform::is_cpu_place(src_place) && // NOLINT
platform::is_ipu_place(dst_place)) {
memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
}
else if (platform::is_ipu_place(src_place) && // NOLINT
platform::is_ipu_place(dst_place)) {
if (src_ptr == dst_ptr) {
VLOG(3) << "Skip copy the same data sync from " << src_place << " to "
<< dst_place;
return;
}
memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
}
else { // NOLINT
PADDLE_THROW(platform::errors::Unimplemented(
"Copying from %s to %s is not supported.", src_place, dst_place));
}
#endif
}
template <typename TENSOR>
@@ -480,29 +427,6 @@ void TensorCopySync(const phi::DenseTensor& src,
"Copy from %s to %s is not supported.", src_place, dst_place));
}
#endif
#ifdef PADDLE_WITH_MLU
else if (platform::is_mlu_place(src_place) && // NOLINT
platform::is_cpu_place(dst_place)) {
memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr);
}
else if (platform::is_cpu_place(src_place) && // NOLINT
platform::is_mlu_place(dst_place)) {
memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr);
}
else if (platform::is_mlu_place(src_place) && // NOLINT
platform::is_mlu_place(dst_place)) {
if (src_ptr == dst_ptr) {
VLOG(3) << "Skip copy the same data async from " << src_place << " to "
<< dst_place;
return;
}
memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr);
}
else { // NOLINT
PADDLE_THROW(platform::errors::Unimplemented(
"Copy from %s to %s is not supported.", src_place, dst_place));
}
#endif
#ifdef PADDLE_WITH_IPU
else if (platform::is_ipu_place(src_place) && // NOLINT
platform::is_cpu_place(dst_place)) {
@@ -604,31 +528,6 @@ void TensorToStream(std::ostream& os,
#else
PADDLE_THROW(platform::errors::Unimplemented(
"XPUPlace is not supported when not compiled with XPU"));
#endif
} else if (platform::is_mlu_place(tensor.place())) {
#ifdef PADDLE_WITH_MLU
constexpr size_t kBufSize = 1024 * 1024 * 64; // 64MB
std::unique_ptr<char[]> buf(new char[kBufSize]);
auto& mlu_dev_ctx =
static_cast<const platform::MLUDeviceContext&>(dev_ctx);
platform::CPUPlace cpu;
uintptr_t data = reinterpret_cast<uintptr_t>(data_ptr);
while (size != 0) {
size_t size_to_write = std::min(kBufSize, static_cast<size_t>(size));
memory::Copy(cpu,
buf.get(),
tensor.place(),
reinterpret_cast<const void*>(data),
size_to_write,
mlu_dev_ctx.stream());
mlu_dev_ctx.Wait();
os.write(buf.get(), size_to_write);
data += size_to_write;
size -= size_to_write;
}
#else
PADDLE_THROW(platform::errors::Unimplemented(
"MLUPlace is not supported when not compiled with MLU"));
#endif
} else if (platform::is_custom_place(tensor.place())) {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
@@ -720,8 +619,7 @@ void TensorFromStream(std::istream& is,
platform::is_npu_place(dev_ctx.GetPlace()) ||
platform::is_custom_place(dev_ctx.GetPlace())) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
    defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_MLU) || \
    defined(PADDLE_WITH_CUSTOM_DEVICE)
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
    defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_CUSTOM_DEVICE)
phi::DenseTensor cpu_tensor;
cpu_tensor.Resize(phi::make_ddim(shape));
framework::VisitDataType(
@@ -741,12 +639,6 @@ void TensorFromStream(std::istream& is,
} else if (platform::is_xpu_place(dev_ctx.GetPlace())) {
PADDLE_THROW(platform::errors::Unimplemented(
"XPUPlace is not supported when not compiled with XPU"));
} else if (platform::is_mlu_place(dev_ctx.GetPlace())) {
PADDLE_THROW(platform::errors::Unimplemented(
"MLUPlace is not supported when not compiled with MLU"));
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"NPUPlace is not supported when not compiled with NPU"));
}
#endif
} else {
@@ -803,8 +695,7 @@ void TensorFromStream(std::istream& is,
platform::is_npu_place(dev_ctx.GetPlace()) ||
platform::is_custom_place(dev_ctx.GetPlace())) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
    defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_MLU) || \
    defined(PADDLE_WITH_CUSTOM_DEVICE)
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
    defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_CUSTOM_DEVICE)
phi::DenseTensor cpu_tensor;
cpu_tensor.Resize(phi::make_ddim(dims));
framework::VisitDataType(
@@ -824,9 +715,6 @@ void TensorFromStream(std::istream& is,
} else if (platform::is_xpu_place(dev_ctx.GetPlace())) {
PADDLE_THROW(platform::errors::Unimplemented(
"XPUPlace is not supported when not compiled with XPU"));
} else if (platform::is_mlu_place(dev_ctx.GetPlace())) {
PADDLE_THROW(platform::errors::Unimplemented(
"MLUPlace is not supported when not compiled with MLU"));
} else if (platform::is_npu_place(dev_ctx.GetPlace())) {
PADDLE_THROW(platform::errors::Unimplemented(
"NPUPlace is not supported when not compiled with NPU"));
...
@@ -26,9 +26,6 @@ limitations under the License. */
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/fluid/platform/device_context.h"
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/platform/device/mlu/device_context.h"
#endif
#include "paddle/fluid/memory/memory.h" #include "paddle/fluid/memory/memory.h"
#include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/dense_tensor.h"
...@@ -142,11 +139,6 @@ void TensorFromArray(const T* src, ...@@ -142,11 +139,6 @@ void TensorFromArray(const T* src,
reinterpret_cast<const phi::GPUContext&>(ctx).stream()); reinterpret_cast<const phi::GPUContext&>(ctx).stream());
} }
#endif #endif
#ifdef PADDLE_WITH_MLU
else if (platform::is_mlu_place(dst_place)) { // NOLINT
memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr);
}
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
else if (platform::is_custom_place(dst_place)) { // NOLINT
memory::Copy(
@@ -193,11 +185,6 @@ void TensorFromVector(const std::vector<T>& src,
reinterpret_cast<const phi::GPUContext&>(ctx).stream());
}
#endif
#ifdef PADDLE_WITH_MLU
else if (platform::is_mlu_place(dst_place)) { // NOLINT
memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr);
}
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
else if (platform::is_custom_place(dst_place)) { // NOLINT
memory::Copy(
@@ -332,17 +319,6 @@ void TensorToVector(const phi::DenseTensor& src,
memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size);
}
#endif
#ifdef PADDLE_WITH_MLU
else if (platform::is_mlu_place(src.place())) { // NOLINT
memory::Copy(
dst_place,
dst_ptr,
src.place(),
src_ptr,
size,
reinterpret_cast<const platform::MLUDeviceContext&>(ctx).stream());
}
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
else if (platform::is_custom_place(src.place())) { // NOLINT
memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size, nullptr);
@@ -385,11 +361,6 @@ inline void TensorToVector(const phi::DenseTensor& src,
memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size);
}
#endif
#ifdef PADDLE_WITH_MLU
else if (platform::is_mlu_place(src.place())) { // NOLINT
memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size, nullptr);
}
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
else if (platform::is_custom_place(src.place())) { // NOLINT
memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size, nullptr);
...
@@ -177,10 +177,6 @@ if(WITH_GLOO)
endif()
endif()
if(WITH_MLU)
set(MLU_DEPS mlu_baseop)
endif()
if(NOT WITH_ASCEND_CL)
cc_library(
gradient_accumulator
...
@@ -159,15 +159,6 @@ AmpOperators::AmpOperators()
OpSupportedInfos("XPU", paddle::framework::proto::VarType::BF16));
unsupported_bf16_ops_->insert(unsupported_ops_xpu_bf16.begin(),
unsupported_ops_xpu_bf16.end());
#elif defined(PADDLE_WITH_MLU)
auto unsupported_ops_mlu_fp16 = std::get<2>(
OpSupportedInfos("MLU", paddle::framework::proto::VarType::FP16));
unsupported_fp16_ops_->insert(unsupported_ops_mlu_fp16.begin(),
unsupported_ops_mlu_fp16.end());
auto unsupported_ops_mlu_bf16 = std::get<2>(
OpSupportedInfos("MLU", paddle::framework::proto::VarType::BF16));
unsupported_bf16_ops_->insert(unsupported_ops_mlu_bf16.begin(),
unsupported_ops_mlu_bf16.end());
#endif
VLOG(4) << allow_ops_->size() << " " << block_ops_->size() << " "
<< unsupported_fp16_ops_->size() << " "
...
@@ -34,9 +34,6 @@
#include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "xpu/refactor/math.h"
#endif
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
#include "paddle/phi/backends/device_manager.h"
#endif
@@ -288,41 +285,6 @@ void TensorAdd(const VarType& src, VarType* dst) {
}
#endif
#ifdef PADDLE_WITH_MLU
if (platform::is_mlu_place(place)) {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
platform::DeviceContext* ctx = pool.Get(place);
auto dev_ctx = dynamic_cast<platform::MLUDeviceContext*>(ctx);
if (data_type == framework::DataTypeTrait<float>::DataType()) {
dst_tensor->mutable_data<float>(place);
} else if (data_type ==
framework::DataTypeTrait<platform::float16>::DataType()) {
dst_tensor->mutable_data<platform::float16>(place);
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"Gradient accumulation of data type (%s) on place (%s) is not "
"supported in imperative mode",
framework::DataTypeToString(data_type),
place));
}
static const float alpha = 1.f;
static const float beta = 1.f;
operators::MLUCnnlTensorDesc src_tensor_desc(src_tensor);
operators::MLUCnnlTensorDesc dst_tensor_desc(*dst_tensor);
PADDLE_ENFORCE_MLU_SUCCESS(
cnnlAssignAdd(dev_ctx->cnnl_handle(),
static_cast<const void*>(&alpha),
src_tensor_desc.get(),
operators::GetBasePtr(&src_tensor),
nullptr,
0,
static_cast<const void*>(&beta),
dst_tensor_desc.get(),
operators::GetBasePtr(dst_tensor)));
return;
}
#endif
PADDLE_THROW(platform::errors::Unimplemented(
"Gradient accumulation of data type (%s) on place (%s) is not "
"supported in imperative mode",
...
@@ -150,48 +150,6 @@ PreparedOp::PreparedOp(const framework::OperatorBase& op,
kernel_signature_(std::move(kernel_signature)),
phi_kernel_(phi_kernel) {}
#ifdef PADDLE_WITH_MLU
static void tokenize(const std::string& ops,
char delim,
std::unordered_set<std::string>* op_set) {
std::string::size_type beg = 0;
for (uint64_t end = 0; (end = ops.find(delim, end)) != std::string::npos;
++end) {
op_set->insert(ops.substr(beg, end - beg));
beg = end + 1;
}
op_set->insert(ops.substr(beg));
}
static bool is_in_mlu_black_list(const std::string& op_name) {
static bool inited = false;
static std::unordered_set<std::string> mlu_black_list;
static std::mutex s_mtx;
if (!inited) {
std::lock_guard<std::mutex> guard(s_mtx);
if (!inited) {
if (std::getenv("MLU_BLACK_LIST") != nullptr) {
std::string ops(std::getenv("MLU_BLACK_LIST"));
tokenize(ops, ',', &mlu_black_list);
}
inited = true;
VLOG(3) << "MLU Black List: ";
for (auto iter = mlu_black_list.begin(); iter != mlu_black_list.end();
++iter) {
VLOG(3) << *iter << " ";
}
}
}
if (mlu_black_list.find(op_name) != mlu_black_list.end()) {
return true;
}
return false;
}
#endif
template <typename VarType>
PreparedOp PrepareImpl(
const NameVarMap<VarType>& ins,
@@ -258,12 +216,6 @@ PreparedOp PrepareImpl(
op.Type(), expected_kernel_key.dtype());
#endif
#ifdef PADDLE_WITH_MLU
if (is_in_mlu_black_list(op.Type())) {
expected_kernel_key.set_backend(phi::Backend::CPU);
}
#endif
bool has_phi_kernel = false;
const auto* arg_map_fn = phi_op_utils_map.GetArgumentMappingFn(op.Type());
@@ -468,16 +420,6 @@ PreparedOp PrepareImpl(
kernel_iter = kernels.find(fluid_kernel_type);
}
#endif
#ifdef PADDLE_WITH_MLU
if (kernel_iter == kernels.end() &&
paddle::platform::is_mlu_place(fluid_kernel_type.place_)) {
VLOG(3) << "missing MLU kernel: " << op.Type()
<< ", expected_kernel_key:" << fluid_kernel_type
<< ", fallbacking to CPU one!";
fluid_kernel_type.place_ = platform::CPUPlace();
kernel_iter = kernels.find(fluid_kernel_type);
}
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
if (kernel_iter == kernels.end() &&
paddle::platform::is_custom_place(fluid_kernel_type.place_)) {
...
@@ -147,15 +147,6 @@ paddle::framework::GarbageCollector* Tracer::MutableGarbageCollectorIfNotExists(
PADDLE_THROW(platform::errors::PermissionDenied(
"Paddle can't use IPU device since it's not compiled with IPU,"
"Please recompile or reinstall Paddle with IPU support."));
#endif
} else if (platform::is_mlu_place(place)) {
#if defined(PADDLE_WITH_MLU)
gc.reset(new framework::MLUDefaultStreamGarbageCollector(place, 0));
VLOG(10) << "Created GarbageCollector at " << place;
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"Paddle can't use MLU device since it's not compiled with MLU,"
"Please recompile or reinstall Paddle with MLU support."));
#endif
} else if (platform::is_custom_place(place)) {
#if defined(PADDLE_WITH_CUSTOM_DEVICE)
@@ -300,13 +291,6 @@ void Tracer::TraceOpImpl(const std::string& type,
} else if (platform::is_npu_place(place)) {
PADDLE_THROW(platform::errors::PreconditionNotMet(
"PaddlePaddle should compile with NPU if use NPUPlace."));
} else if (platform::is_mlu_place(place)) {
#ifdef PADDLE_WITH_MLU
platform::SetMLUDeviceId(place.device);
#else
PADDLE_THROW(platform::errors::PreconditionNotMet(
"PaddlePaddle should compile with MLU if use MLUPlace."));
#endif
} else if (platform::is_custom_place(place)) {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
phi::DeviceManager::SetDevice(place);
...
@@ -58,10 +58,6 @@
#include "paddle/fluid/platform/device/ipu/ipu_info.h"
#endif
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/platform/device/mlu/mlu_info.h"
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
#include "paddle/fluid/memory/allocation/custom_allocator.h"
#include "paddle/fluid/platform/device/device_wrapper.h"
@@ -194,11 +190,6 @@ class AllocatorFacadePrivate {
InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id));
}
#endif
#ifdef PADDLE_WITH_MLU
for (int dev_id = 0; dev_id < platform::GetMLUDeviceCount(); ++dev_id) {
InitNaiveBestFitMLUAllocator(platform::MLUPlace(dev_id));
}
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
auto device_types = phi::DeviceManager::GetAllCustomDeviceTypes();
for (const auto& dev_type : device_types) {
@@ -254,11 +245,6 @@ class AllocatorFacadePrivate {
InitNaiveBestFitIPUAllocator(platform::IPUPlace(dev_id));
}
#endif
#ifdef PADDLE_WITH_MLU
for (int dev_id = 0; dev_id < platform::GetMLUDeviceCount(); ++dev_id) {
InitNaiveBestFitMLUAllocator(platform::MLUPlace(dev_id));
}
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
auto device_types = phi::DeviceManager::GetAllCustomDeviceTypes();
for (const auto& dev_type : device_types) {
@@ -290,11 +276,6 @@ class AllocatorFacadePrivate {
InitThreadLocalCUDAAllocator(platform::CUDAPlace(dev_id));
}
InitNaiveBestFitCUDAPinnedAllocator();
#endif
#ifdef PADDLE_WITH_MLU
for (int dev_id = 0; dev_id < platform::GetMLUDeviceCount(); ++dev_id) {
InitNaiveBestFitMLUAllocator(platform::MLUPlace(dev_id));
}
#endif
break;
}
@@ -801,12 +782,6 @@ class AllocatorFacadePrivate {
}
#endif
#ifdef PADDLE_WITH_MLU
void InitNaiveBestFitMLUAllocator(platform::MLUPlace p) {
allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p);
}
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
void InitNaiveBestFitCustomDeviceAllocator(platform::CustomPlace p) {
allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p);
@@ -851,13 +826,6 @@ class AllocatorFacadePrivate {
system_allocators_[p] = CreateCUDAAllocator(p);
}
#endif
#ifdef PADDLE_WITH_MLU
int device_count = platform::GetMLUDeviceCount();
for (int i = 0; i < device_count; ++i) {
platform::MLUPlace p(i);
system_allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p);
}
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
auto device_types = phi::DeviceManager::GetAllCustomDeviceTypes();
for (const auto& dev_type : device_types) {
@@ -894,12 +862,6 @@ class AllocatorFacadePrivate {
places.emplace_back(platform::IPUPlace(dev_id));
}
#endif
#ifdef PADDLE_WITH_MLU
int device_count = platform::GetMLUDeviceCount();
for (int dev_id = 0; dev_id < device_count; ++dev_id) {
places.emplace_back(platform::MLUPlace(dev_id));
}
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
auto device_types = phi::DeviceManager::GetAllCustomDeviceTypes();
for (const auto& dev_type : device_types) {
...
@@ -56,9 +56,6 @@ BuddyAllocator::BuddyAllocator(
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
init_allocate_size_func_ = &platform::GpuInitAllocSize;
re_allocate_size_func_ = &platform::GpuReallocSize;
#elif defined(PADDLE_WITH_MLU)
init_allocate_size_func_ = &platform::MLUInitAllocSize;
re_allocate_size_func_ = &platform::MLUReallocSize;
#endif
}
#endif
@@ -253,9 +250,6 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool(
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
allocate_bytes = DeviceAllocateSize(
&platform::GpuInitAllocSize, &platform::GpuReallocSize, request_bytes);
#elif defined(PADDLE_WITH_MLU)
allocate_bytes = DeviceAllocateSize(
&platform::MLUInitAllocSize, &platform::MLUReallocSize, request_bytes);
#endif
#endif
...
@@ -25,9 +25,6 @@ limitations under the License. */
#include "gflags/gflags.h"
#include "gtest/gtest.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/platform/device/mlu/mlu_info.h"
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
DECLARE_double(fraction_of_gpu_memory_to_use);
@@ -395,202 +392,6 @@ TEST(BuddyAllocator, Release) {
}
#endif
#ifdef PADDLE_WITH_MLU
TEST(BuddyAllocator, MluFraction) {
// In a 16 GB machine, the pool size will be about 160 MB
FLAGS_fraction_of_gpu_memory_to_use = 0.01;
FLAGS_initial_gpu_memory_in_mb = 0;
FLAGS_reallocate_gpu_memory_in_mb = 0;
BuddyAllocator buddy_allocator(
std::unique_ptr<SystemAllocator>(new MLUAllocator(0)),
platform::MLUMinChunkSize(),
platform::MLUMaxChunkSize());
// Less than pool size
TestBuddyAllocator(&buddy_allocator, 10);
TestBuddyAllocator(&buddy_allocator, 10 << 10);
TestBuddyAllocator(&buddy_allocator, 10 << 20);
buddy_allocator.Release();
// Greater than max chunk size
TestBuddyAllocator(&buddy_allocator,
600 << 20,
/* use_system_allocator = */ true);
TestBuddyAllocator(&buddy_allocator,
1 * static_cast<size_t>(1 << 30),
/* use_system_allocator = */ true);
}
TEST(BuddyAllocator, InitRealloc) {
FLAGS_initial_gpu_memory_in_mb = 100;
FLAGS_reallocate_gpu_memory_in_mb = 50;
EXPECT_EQ(platform::MLUMaxChunkSize(), static_cast<size_t>(100 << 20));
BuddyAllocator buddy_allocator(
std::unique_ptr<SystemAllocator>(new MLUAllocator(0)),
platform::MLUMinChunkSize(),
platform::MLUMaxChunkSize());
// Less then initial size and reallocate size
TestBuddyAllocator(&buddy_allocator, 10 << 20);
// Between initial size and reallocate size and not exceed pool
TestBuddyAllocator(&buddy_allocator, 80 << 20);
TestBuddyAllocator(&buddy_allocator, 99 << 20);
// Greater than max chunk size
TestBuddyAllocator(&buddy_allocator,
101 << 20,
/* use_system_allocator = */ true);
TestBuddyAllocator(&buddy_allocator,
1 * static_cast<size_t>(1 << 30),
/* use_system_allocator = */ true);
}
TEST(BuddyAllocator, ReallocSizeGreaterThanInit) {
FLAGS_initial_gpu_memory_in_mb = 5;
FLAGS_reallocate_gpu_memory_in_mb = 10;
EXPECT_EQ(platform::MLUMaxChunkSize(), static_cast<size_t>(10 << 20));
BuddyAllocator buddy_allocator(
std::unique_ptr<SystemAllocator>(new MLUAllocator(0)),
platform::MLUMinChunkSize(),
platform::MLUMaxChunkSize());
// Less than initial size and reallocate size
TestBuddyAllocator(&buddy_allocator, 1 << 20);
// Between initial size and reallocate size and exceed pool
TestBuddyAllocator(&buddy_allocator, 6 << 20);
TestBuddyAllocator(&buddy_allocator, 8 << 20);
TestBuddyAllocator(&buddy_allocator, 9 << 20);
// Greater than max trunk size
TestBuddyAllocator(&buddy_allocator,
11 << 20,
/* use_system_allocator = */ true);
TestBuddyAllocator(&buddy_allocator,
1 * static_cast<size_t>(1 << 30),
/* use_system_allocator = */ true);
}
TEST(BuddyAllocator, FractionRefillPool) {
FLAGS_fraction_of_gpu_memory_to_use = 0.6;
FLAGS_initial_gpu_memory_in_mb = 0;
FLAGS_reallocate_gpu_memory_in_mb = 0;
size_t max_chunk_size = platform::MLUMaxChunkSize();
BuddyAllocator buddy_allocator(
std::unique_ptr<SystemAllocator>(new MLUAllocator(0)),
platform::MLUMinChunkSize(),
max_chunk_size);
// Less than pool size
int* p0 = TestBuddyAllocator(&buddy_allocator,
max_chunk_size - 1000,
/* use_system_allocator = */ false,
/* free_ptr = */ false);
// Max chunk size should be same during allocation
EXPECT_EQ(max_chunk_size, buddy_allocator.GetMaxChunkSize());
size_t alloc =
platform::MLUAvailableMemToAlloc() * FLAGS_fraction_of_gpu_memory_to_use;
// Exceed pool trigger refilling size of fraction of avaiable mlu, and should
// be able to alloc 60% of the remaining MLU
int* p1 = TestBuddyAllocator(&buddy_allocator,
alloc,
/* use_system_allocator = */ false,
/* free_ptr = */ false);
// Max chunk size should be same during allocation
EXPECT_EQ(max_chunk_size, buddy_allocator.GetMaxChunkSize());
alloc =
platform::MLUAvailableMemToAlloc() * FLAGS_fraction_of_gpu_memory_to_use;
// Exceed pool trigger refilling size of fraction of avaiable mlu, and should
// be able to alloc 60% of the remaining MLU
TestBuddyAllocator(&buddy_allocator,
alloc,
/* use_system_allocator = */ false);
// Max chunk size should be same during allocation
EXPECT_EQ(max_chunk_size, buddy_allocator.GetMaxChunkSize());
buddy_allocator.Free(p0);
buddy_allocator.Free(p1);
}
TEST(BuddyAllocator, AllocFromAvailable) {
FLAGS_fraction_of_gpu_memory_to_use = 0.7;
FLAGS_initial_gpu_memory_in_mb = 0;
FLAGS_reallocate_gpu_memory_in_mb = 0;
size_t total = 0, available = 0;
platform::SetMLUDeviceId(0);
platform::MLUMemoryUsage(&available, &total);
// Take half of available MLU
void* p;
cnrtStatus result = cnrtMalloc(&p, available >> 1);
EXPECT_TRUE(result == cnrtSuccess);
// BuddyAllocator should be able to alloc the remaining MLU
BuddyAllocator buddy_allocator(
std::unique_ptr<SystemAllocator>(new MLUAllocator(0)),
platform::MLUMinChunkSize(),
platform::MLUMaxChunkSize());
TestBuddyAllocator(&buddy_allocator, 10);
TestBuddyAllocator(&buddy_allocator, 10 << 10);
TestBuddyAllocator(&buddy_allocator, 10 << 20);
TestBuddyAllocator(&buddy_allocator, static_cast<size_t>(1 << 30));
if (p) {
EXPECT_TRUE(cnrtFree(p) == cnrtSuccess);
}
}
TEST(BuddyAllocator, AllocFromAvailableWhenFractionIsOne) {
FLAGS_fraction_of_gpu_memory_to_use = 1.0;
FLAGS_initial_gpu_memory_in_mb = 0;
FLAGS_reallocate_gpu_memory_in_mb = 0;
void* p = nullptr;
EXPECT_TRUE(cnrtMalloc(&p, static_cast<size_t>(1) << 30) == cnrtSuccess);
// BuddyAllocator should be able to alloc the remaining MLU
BuddyAllocator buddy_allocator(
std::unique_ptr<SystemAllocator>(new MLUAllocator(0)),
platform::MLUMinChunkSize(),
platform::MLUMaxChunkSize());
TestBuddyAllocator(&buddy_allocator, static_cast<size_t>(1) << 30);
TestBuddyAllocator(&buddy_allocator, static_cast<size_t>(1) << 30);
if (p) {
EXPECT_TRUE(cnrtFree(p) == cnrtSuccess);
}
}
TEST(BuddyAllocator, Release) {
// In a 8 GB machine, the pool size will be about 800 MB
FLAGS_fraction_of_gpu_memory_to_use = 0.1;
FLAGS_initial_gpu_memory_in_mb = 0;
FLAGS_reallocate_gpu_memory_in_mb = 0;
BuddyAllocator buddy_allocator(
std::unique_ptr<SystemAllocator>(new MLUAllocator(0)),
platform::MLUMinChunkSize(),
platform::MLUMaxChunkSize());
// Less than pool size
TestBuddyAllocator(&buddy_allocator, 10);
TestBuddyAllocator(&buddy_allocator, 10 << 10);
TestBuddyAllocator(&buddy_allocator, 50 << 20);
buddy_allocator.Release();
}
#endif
} // namespace detail
} // namespace memory
} // namespace paddle
@@ -420,140 +420,6 @@ uint64_t Release<platform::CUDAPinnedPlace>(
#endif
}
// For MLU
#ifdef PADDLE_WITH_MLU
class MLUBuddyAllocatorList {
private:
MLUBuddyAllocatorList() : devices_(platform::GetMLUSelectedDevices()) {
auto mlu_num = devices_.size();
allocators_.resize(mlu_num);
init_flags_.reserve(mlu_num);
for (size_t i = 0; i < mlu_num; ++i) {
init_flags_.emplace_back(new std::once_flag());
}
}
static MLUBuddyAllocatorList *CreateNewInstance() {
return new MLUBuddyAllocatorList();
}
public:
static MLUBuddyAllocatorList *Instance() {
static auto *instance = CreateNewInstance();
return instance;
}
BuddyAllocator *Get(int mlu_id) {
auto pos = std::distance(
devices_.begin(), std::find(devices_.begin(), devices_.end(), mlu_id));
PADDLE_ENFORCE_LT(pos,
devices_.size(),
platform::errors::OutOfRange(
"The index exceeds the size of devices, the size of "
"devices is %d, the index is %d",
devices_.size(),
pos));
std::call_once(*init_flags_[pos], [this, pos] {
platform::SetMLUDeviceId(devices_[pos]);
allocators_[pos].reset(
new BuddyAllocator(std::unique_ptr<detail::SystemAllocator>(
new detail::MLUAllocator(devices_[pos])),
platform::MLUMinChunkSize(),
platform::MLUMaxChunkSize()));
VLOG(10) << "\n\nNOTE:\n"
<< "You can set GFlags environment variable "
<< "(mlu reuse gpu GFlags) "
<< "'FLAGS_fraction_of_gpu_memory_to_use' "
<< "or 'FLAGS_initial_gpu_memory_in_mb' "
<< "or 'FLAGS_reallocate_gpu_memory_in_mb' "
<< "to change the memory size for MLU usage.\n"
<< "Current 'FLAGS_fraction_of_gpu_memory_to_use' value is "
<< FLAGS_fraction_of_gpu_memory_to_use
<< ". Current 'FLAGS_initial_gpu_memory_in_mb' value is "
<< FLAGS_initial_gpu_memory_in_mb
<< ". Current 'FLAGS_reallocate_gpu_memory_in_mb' value is "
<< FLAGS_reallocate_gpu_memory_in_mb << "\n\n";
});
return allocators_[pos].get();
}
private:
std::vector<int> devices_;
std::vector<std::unique_ptr<std::once_flag>> init_flags_;
std::vector<std::unique_ptr<BuddyAllocator>> allocators_;
};
BuddyAllocator *GetMLUBuddyAllocator(int mlu_id) {
return MLUBuddyAllocatorList::Instance()->Get(mlu_id);
}
#endif
template <>
size_t Used<platform::MLUPlace>(const platform::MLUPlace &place) {
#ifdef PADDLE_WITH_MLU
return GetMLUBuddyAllocator(place.device)->Used();
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"'MLUPlace' is not supported in CPU only device."));
#endif
}
template <>
void *Alloc<platform::MLUPlace>(const platform::MLUPlace &place, size_t size) {
#ifdef PADDLE_WITH_MLU
auto *buddy_allocator = GetMLUBuddyAllocator(place.device);
auto *ptr = buddy_allocator->Alloc(size);
if (ptr == nullptr) {
platform::MLUDeviceGuard(place.device);
size_t avail = 0, total = 0;
platform::MLUMemoryUsage(&avail, &total);
PADDLE_THROW(platform::errors::ResourceExhausted(
"Cannot allocate %s in MLU %d, avaliable %s, total %s, MLUMinChunkSize "
"%s, MLUMinChunkSize %s, MLU memory used: %s.",
string::HumanReadableSize(size),
place.device,
string::HumanReadableSize(avail),
string::HumanReadableSize(total),
string::HumanReadableSize(buddy_allocator->GetMinChunkSize()),
string::HumanReadableSize(buddy_allocator->GetMaxChunkSize()),
string::HumanReadableSize(Used<platform::MLUPlace>(place))));
} else {
if (FLAGS_init_allocated_mem) {
cnrtMemset(ptr, 0xEF, size);
}
}
return ptr;
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"'MLUPlace' is not supported in CPU only device."));
#endif
}
template <>
void Free<platform::MLUPlace>(const platform::MLUPlace &place,
void *p,
size_t size) {
#ifdef PADDLE_WITH_MLU
VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
GetMLUBuddyAllocator(place.device)->Free(p);
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"'MLUPlace' is not supported in CPU only device."));
#endif
}
template <>
uint64_t Release<platform::MLUPlace>(const platform::MLUPlace &place) {
#ifdef PADDLE_WITH_MLU
return GetMLUBuddyAllocator(place.device)->Release();
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"'MLUPlace' is not supported in CPU only device."));
#endif
}
// For CustomDevice
#ifdef PADDLE_WITH_CUSTOM_DEVICE
class BuddyAllocatorList {
...
@@ -61,21 +61,6 @@ TEST(NaiveBestFitAllocatorTest, CudaPinnedAlloc) {
}
#endif
#ifdef PADDLE_WITH_MLU
TEST(NaiveBestFitAllocatorTest, MluAlloc) {
NaiveBestFitAllocator alloc{platform::MLUPlace(0)};
{
size_t size = (1 << 20);
auto allocation = alloc.Allocate(size);
}
sleep(10);
alloc.Release(platform::MLUPlace(0));
size_t size = (1 << 20);
auto allocation = alloc.Allocate(size);
alloc.Release(platform::MLUPlace(0));
}
#endif
} // namespace allocation
} // namespace memory
} // namespace paddle
@@ -31,9 +31,6 @@ limitations under the License. */
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/phi/backends/cpu/cpu_info.h"
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/platform/device/mlu/mlu_info.h"
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/fluid/platform/cuda_device_guard.h"
@@ -287,78 +284,6 @@ bool CUDAPinnedAllocator::UseGpu() const { return false; }
#endif
#ifdef PADDLE_WITH_MLU
void* MLUAllocator::Alloc(size_t* index, size_t size) {
if (size <= 0) return nullptr;
void* p;
auto result = platform::RecordedMLUMalloc(&p, size, mlu_id_);
if (result == cnrtSuccess) {
*index = 0;
mlu_alloc_size_ += size;
return p;
} else {
size_t avail, total, actual_avail, actual_total;
bool is_limited = platform::RecordedMLUMemGetInfo(
&avail, &total, &actual_avail, &actual_total, mlu_id_);
size_t allocated = total - avail;
std::string err_msg;
if (is_limited) {
auto limit_size = (total >> 20);
err_msg = string::Sprintf(
"\n 3) Set environment variable `FLAGS_gpu_memory_limit_mb` to a "
"larger value. Currently `FLAGS_gpu_memory_limit_mb` is %d, so the "
"maximum MLU memory usage is limited to %d MB.\n"
" The command is `export FLAGS_gpu_memory_limit_mb=xxx`.",
limit_size,
limit_size);
}
PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted(
"\n\nOut of memory error on MLU %d. "
"Cannot allocate %s memory on MLU %d, %s memory has been allocated and "
"available memory is only %s.\n\n"
"Please check whether there is any other process using MLU %d.\n"
"1. If yes, please stop them, or start PaddlePaddle on another MLU.\n"
"2. If no, please try one of the following suggestions:\n"
" 1) Decrease the batch size of your model.\n"
" 2) FLAGS_fraction_of_gpu_memory_to_use is %.2lf now, "
"please set it to a higher value but less than 1.0.\n"
" The command is "
"`export FLAGS_fraction_of_gpu_memory_to_use=xxx`.%s\n\n",
mlu_id_,
string::HumanReadableSize(size),
mlu_id_,
string::HumanReadableSize(allocated),
string::HumanReadableSize(avail),
mlu_id_,
FLAGS_fraction_of_gpu_memory_to_use,
err_msg));
}
}
void MLUAllocator::Free(void* p, size_t size, size_t index) {
PADDLE_ENFORCE_EQ(index,
0,
platform::errors::InvalidArgument(
"The index should be 0, index is %d", index));
PADDLE_ENFORCE_GE(mlu_alloc_size_,
size,
platform::errors::InvalidArgument(
"The size of memory (%d) to free exceeds the size of "
"allocated gpu memory (%d)",
size,
mlu_alloc_size_));
mlu_alloc_size_ -= size;
platform::RecordedMLUFree(p, size, mlu_id_);
}
bool MLUAllocator::UseGpu() const { return true; }
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
void* CustomAllocator::Alloc(size_t* index, size_t size) {
  if (size <= 0) return nullptr;
...
...@@ -68,21 +68,6 @@ class CUDAPinnedAllocator : public SystemAllocator {
};
#endif
#ifdef PADDLE_WITH_MLU
class MLUAllocator : public SystemAllocator {
public:
explicit MLUAllocator(int mlu_id) : mlu_id_(mlu_id) {}
virtual void* Alloc(size_t* index, size_t size);
virtual void Free(void* p, size_t size, size_t index);
virtual bool UseGpu() const;
private:
size_t mlu_alloc_size_ = 0;
int mlu_id_;
};
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
class CustomAllocator : public SystemAllocator {
 public:
...
...@@ -82,23 +82,3 @@ TEST(GPUAllocator, AllocFailure) {
}
}
#endif
#ifdef PADDLE_WITH_MLU
TEST(MLUAllocator, Alloc) {
paddle::memory::detail::MLUAllocator a(0);
TestAllocator(&a, 2048);
TestAllocator(&a, 0);
}
TEST(MLUAllocator, AllocFailure) {
paddle::memory::detail::MLUAllocator allocator(0);
size_t index;
size_t alloc_size = (static_cast<size_t>(1) << 40); // Very large number
try {
allocator.Alloc(&index, alloc_size);
ASSERT_TRUE(false);
} catch (paddle::memory::allocation::BadAlloc&) {
PADDLE_ENFORCE_MLU_SUCCESS(cnrtGetLastError());
}
}
#endif
...@@ -23,10 +23,6 @@ limitations under the License. */
#include "paddle/fluid/platform/device/xpu/xpu_header.h"
#endif
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/platform/device/mlu/mlu_info.h"
#endif
namespace paddle {
namespace memory {
...@@ -736,226 +732,6 @@ void Copy<phi::Place, phi::GPUPinnedPlace>(phi::Place dst_place,
}
#endif
#ifdef PADDLE_WITH_MLU
template <>
void Copy<platform::CPUPlace, platform::MLUPlace>(platform::CPUPlace dst_place,
void* dst,
platform::MLUPlace src_place,
const void* src,
size_t num,
void* stream) {
if (UNLIKELY(num == 0)) return;
platform::SetMLUDeviceId(src_place.device);
if (stream) {
VLOG(4) << "Async memory::Copy " << num << " Bytes from " << src_place
<< " to " << dst_place << " by mlu stream(" << stream << ")";
platform::RecordEvent record_event("MLUMemcpyD2HAsync:MLU->CPU",
platform::TracerEventType::UserDefined,
1);
platform::MLUMemcpyD2HAsync(
dst, src, num, reinterpret_cast<mluStream>(stream));
} else {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
static_cast<platform::MLUDeviceContext*>(pool.Get(src_place))->Wait();
VLOG(4) << "Sync memory::Copy " << num << " Bytes from " << src_place
<< " to " << dst_place;
platform::RecordEvent record_event(
"MLUMemcpyD2HSync:MLU->CPU", platform::TracerEventType::UserDefined, 1);
platform::MLUMemcpyD2HSync(dst, src, num);
}
}
template <>
void Copy<platform::MLUPlace, platform::CPUPlace>(platform::MLUPlace dst_place,
void* dst,
platform::CPUPlace src_place,
const void* src,
size_t num,
void* stream) {
if (UNLIKELY(num == 0)) return;
platform::SetMLUDeviceId(dst_place.device);
if (stream) {
VLOG(4) << "Async memory::Copy " << num << " Bytes from " << src_place
<< " to " << dst_place << " by mlu stream(" << stream << ")";
platform::RecordEvent record_event("MLUMemcpyH2DAsync:CPU->MLU",
platform::TracerEventType::UserDefined,
1);
platform::MLUMemcpyH2DAsync(
dst, src, num, reinterpret_cast<mluStream>(stream));
} else {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
static_cast<platform::MLUDeviceContext*>(pool.Get(dst_place))->Wait();
VLOG(4) << "Sync memory::Copy " << num << " Bytes from " << src_place
<< " to " << dst_place;
platform::RecordEvent record_event(
"MLUMemcpyH2DSync:CPU->MLU", platform::TracerEventType::UserDefined, 1);
platform::MLUMemcpyH2DSync(dst, src, num);
}
}
template <>
void Copy<platform::MLUPlace, platform::MLUPlace>(platform::MLUPlace dst_place,
void* dst,
platform::MLUPlace src_place,
const void* src,
size_t num,
void* stream) {
if (UNLIKELY(num == 0)) return;
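  // Same-device copies use the D2D memcpy path (asynchronous when a stream is
  // passed in); copies between two different MLU devices go through the
  // peer-to-peer path below.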
if (dst_place == src_place) {
platform::SetMLUDeviceId(dst_place.device);
if (stream) {
VLOG(4) << "Async memory::Copy " << num << " Bytes from " << src_place
<< " to " << dst_place << " by mlu stream(" << stream << ")";
platform::RecordEvent record_event("MLUMemcpyD2DAsync(same_mlu):MLU->MLU",
platform::TracerEventType::UserDefined,
1);
platform::MLUMemcpyD2DAsync(
dst, src, num, reinterpret_cast<mluStream>(stream));
} else {
platform::DeviceContextPool& pool =
platform::DeviceContextPool::Instance();
static_cast<platform::MLUDeviceContext*>(pool.Get(src_place))->Wait();
VLOG(4) << "Sync memory::Copy " << num << " Bytes from " << src_place
<< " to " << dst_place;
platform::RecordEvent record_event("MLUMemcpyD2DSync(same_mlu):MLU->MLU",
platform::TracerEventType::UserDefined,
1);
platform::MLUMemcpyD2DSync(dst, src, num);
}
} else {
if (stream) {
VLOG(4) << "Async memory::Copy " << num << " Bytes from " << src_place
<< " to " << dst_place << " by mlu stream(" << stream << ")";
platform::RecordEvent record_event("MLUMemcpyPeerAsync:MLU->MLU",
platform::TracerEventType::UserDefined,
1);
platform::MLUMemcpyPeerAsync(dst,
dst_place.device,
src,
src_place.device,
num,
reinterpret_cast<mluStream>(stream));
} else {
VLOG(4) << "Sync memory::Copy " << num << " Bytes from " << src_place
<< " to " << dst_place;
platform::RecordEvent record_event("MLUMemcpyPeerSync:MLU->MLU",
platform::TracerEventType::UserDefined,
1);
platform::MLUMemcpyPeerSync(
dst, dst_place.device, src, src_place.device, num);
}
}
}
// NOTE: only for CPUPlace and MLUPlace.
template <>
void Copy<phi::Place, phi::Place>(phi::Place dst_place,
void* dst,
phi::Place src_place,
const void* src,
size_t num,
void* stream) {
if (src_place.GetType() == phi::AllocationType::CPU &&
dst_place.GetType() == phi::AllocationType::CPU) {
platform::CPUPlace place_dst, place_src;
return Copy(place_dst, dst, place_src, src, num);
} else if (src_place.GetType() == phi::AllocationType::CPU &&
dst_place.GetType() == phi::AllocationType::MLU) {
platform::MLUPlace place_dst(dst_place.GetDeviceId());
platform::CPUPlace place_src;
return Copy(place_dst, dst, place_src, src, num, stream);
} else if (src_place.GetType() == phi::AllocationType::MLU &&
dst_place.GetType() == phi::AllocationType::CPU) {
platform::MLUPlace place_src(src_place.GetDeviceId());
platform::CPUPlace place_dst;
return Copy(place_dst, dst, place_src, src, num, stream);
} else if (src_place.GetType() == phi::AllocationType::MLU &&
dst_place.GetType() == phi::AllocationType::MLU) {
platform::MLUPlace place_src(src_place.GetDeviceId());
platform::MLUPlace place_dst(dst_place.GetDeviceId());
return Copy(place_dst, dst, place_src, src, num, stream);
#ifdef PADDLE_WITH_CUSTOM_DEVICE
} else if (src_place.GetType() == phi::AllocationType::CPU && // NOLINT
dst_place.GetType() == phi::AllocationType::CUSTOM) {
platform::CPUPlace place_src;
platform::CustomPlace place_dst(dst_place);
return Copy(place_dst, dst, place_src, src, num, stream);
} else if (src_place.GetType() == phi::AllocationType::CUSTOM && // NOLINT
dst_place.GetType() == phi::AllocationType::CPU) {
platform::CustomPlace place_src(src_place);
platform::CPUPlace place_dst;
return Copy(place_dst, dst, place_src, src, num, stream);
} else if (src_place.GetType() == phi::AllocationType::CUSTOM && // NOLINT
dst_place.GetType() == phi::AllocationType::CUSTOM) {
platform::CustomPlace place_src(src_place);
platform::CustomPlace place_dst(dst_place);
return Copy(place_dst, dst, place_src, src, num, stream);
#endif
}
}
// NOTE: only for (CPUPlace and MLUPlace) -> (MLUPlace)
template <>
void Copy<phi::MLUPlace, phi::Place>(phi::MLUPlace dst_place,
void* dst,
phi::Place src_place,
const void* src,
size_t num,
void* stream) {
Copy(phi::Place(dst_place.GetType(), dst_place.GetDeviceId()),
dst,
src_place,
src,
num,
stream);
}
// NOTE: only for (MLUPlace) -> (CPUPlace and MLUPlace)
template <>
void Copy<phi::Place, phi::MLUPlace>(phi::Place dst_place,
void* dst,
phi::MLUPlace src_place,
const void* src,
size_t num,
void* stream) {
Copy(dst_place,
dst,
phi::Place(src_place.GetType(), src_place.GetDeviceId()),
src,
num,
stream);
}
// NOTE: only for (MLUPlace) -> (CPUPlace) with mluStream.
template <>
void Copy<phi::CPUPlace, phi::Place>(phi::CPUPlace dst_place,
void* dst,
phi::Place src_place,
const void* src,
size_t num,
void* stream) {
Copy(phi::Place(dst_place.GetType()), dst, src_place, src, num, stream);
}
// NOTE: only for (CPUPlace) -> (MLUPlace) with mluStream.
template <>
void Copy<phi::Place, phi::CPUPlace>(phi::Place dst_place,
void* dst,
phi::CPUPlace src_place,
const void* src,
size_t num,
void* stream) {
Copy(dst_place, dst, phi::Place(src_place.GetType()), src, num, stream);
}
#endif // PADDLE_WITH_MLU
// NOTE: Only for CPUPlace, XPUPlace and PinnedPlace.
template <>
void Copy<phi::Place, phi::Place>(phi::Place dst_place,
...
...@@ -16,9 +16,6 @@ limitations under the License. */
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/place.h"
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/platform/device/mlu/device_context.h"
#endif
namespace paddle {
namespace memory {
...
...@@ -55,10 +55,6 @@ if (WITH_LITE)
  add_subdirectory(lite)
endif()
if (WITH_MLU)
add_subdirectory(mlu)
endif()
if(WITH_CINN)
  add_subdirectory(cinn)
endif()
...@@ -135,10 +131,6 @@ if (WITH_ASCEND_CL)
  op_library(sync_batch_norm_op)
endif()
if (WITH_MLU)
op_library(sync_batch_norm_op)
endif()
op_library(lstm_op DEPS ${OP_HEADER_DEPS} lstm_compute)
op_library(recurrent_op DEPS ${OP_HEADER_DEPS})
...
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/amp/fp16_type_traits.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
template <typename T>
class CheckFiniteAndUnscaleMLUKernel : public framework::OpKernel<T> {
using MPDType = typename details::MPTypeTrait<T>::Type;
public:
void Compute(const framework::ExecutionContext& ctx) const {
auto& dev_ctx = ctx.template device_context<platform::MLUDeviceContext>();
const auto xs = ctx.MultiInput<phi::DenseTensor>("X");
const auto* scale = ctx.Input<phi::DenseTensor>("Scale");
auto outs = ctx.MultiOutput<phi::DenseTensor>("Out");
auto* found_inf = ctx.Output<phi::DenseTensor>("FoundInfinite");
found_inf->mutable_data<bool>(dev_ctx.GetPlace());
MLUCnnlTensorDesc scale_desc(*scale);
MLUCnnlTensorDesc found_inf_desc(
*found_inf, CNNL_LAYOUT_ARRAY, ToCnnlDataType<bool>());
for (size_t i = 0; i < xs.size(); ++i) {
const auto* x = xs[i];
auto* out = outs[i];
out->mutable_data<T>(ctx.GetPlace());
// check is_finite or is_nan
phi::DenseTensor is_finite(found_inf->type());
if (i != 0) {
is_finite.Resize(phi::make_ddim({1}));
is_finite.mutable_data<bool>(ctx.GetPlace());
} else {
is_finite.ShareDataWith(*found_inf);
}
MLUCnnlTensorDesc x_desc(*x);
MLUCnnlTensorDesc out_desc(*out);
MLUCnnl::IsNanInf(
ctx, x_desc.get(), GetBasePtr(x), GetBasePtr(&is_finite));
      // accumulate the per-input nan/inf flag into found_inf with a logical
      // OR after checking every input
if (i != 0) {
MLUCnnlTensorDesc is_finite_desc(
is_finite, CNNL_LAYOUT_ARRAY, ToCnnlDataType<bool>());
MLUCnnl::Logic(ctx,
CNNL_LOGIC_OP_OR,
found_inf_desc.get(),
GetBasePtr(found_inf),
is_finite_desc.get(),
GetBasePtr(&is_finite),
found_inf_desc.get(),
GetBasePtr(found_inf));
}
      // The expected logic is:
      //   out = in,         if found_inf == true
      //   out = in / scale, if found_inf == false
      // But when found_inf is true the data of Out should not be used anyway,
      // so on MLU we always compute out as in / scale.
phi::DenseTensor float_x;
phi::DenseTensor float_out;
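      // For float16 inputs the division is carried out in float32 for
      // precision: cast x up to fp32, divide by scale, then cast the quotient
      // back to fp16.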
if (std::is_same<T, paddle::platform::float16>::value) {
float_x.Resize(x->dims());
float_out.Resize(out->dims());
float_x.mutable_data<MPDType>(ctx.GetPlace());
float_out.mutable_data<MPDType>(ctx.GetPlace());
MLUCnnlTensorDesc float_x_desc(float_x);
MLUCnnlTensorDesc float_out_desc(float_out);
auto cast_fp16_type =
GetCastDataType(DataType::FLOAT16, DataType::FLOAT32);
MLUCnnl::Cast(ctx,
cast_fp16_type,
x_desc.get(),
GetBasePtr(x),
float_x_desc.get(),
GetBasePtr(&float_x));
MLUCnnl::Div(ctx,
CNNL_COMPUTATION_HIGH_PRECISION,
float_x_desc.get(),
GetBasePtr(&float_x),
scale_desc.get(),
GetBasePtr(scale),
float_out_desc.get(),
GetBasePtr(&float_out));
auto cast_fp32_type =
GetCastDataType(DataType::FLOAT32, DataType::FLOAT16);
MLUCnnl::Cast(ctx,
cast_fp32_type,
float_out_desc.get(),
GetBasePtr(&float_out),
out_desc.get(),
GetBasePtr(out));
} else {
MLUCnnl::Div(ctx,
CNNL_COMPUTATION_HIGH_PRECISION,
x_desc.get(),
GetBasePtr(x),
scale_desc.get(),
GetBasePtr(scale),
out_desc.get(),
GetBasePtr(out));
}
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(check_finite_and_unscale,
ops::CheckFiniteAndUnscaleMLUKernel<float>,
ops::CheckFiniteAndUnscaleMLUKernel<plat::float16>);
...@@ -21,9 +21,7 @@ limitations under the License. */
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/infermeta/unary.h"
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#endif
#include "paddle/fluid/prim/api/composite_backward/composite_backward_api.h" #include "paddle/fluid/prim/api/composite_backward/composite_backward_api.h"
#include "paddle/fluid/prim/utils/static/composite_grad_desc_maker.h" #include "paddle/fluid/prim/utils/static/composite_grad_desc_maker.h"
#include "paddle/fluid/prim/utils/static/desc_tensor.h" #include "paddle/fluid/prim/utils/static/desc_tensor.h"
...@@ -119,21 +117,6 @@ class CastOp : public framework::OperatorWithKernel { ...@@ -119,21 +117,6 @@ class CastOp : public framework::OperatorWithKernel {
} }
// NOTE(jiahongyu): Above codes originally enclosed by PADDLE_WITH_MKLDNN // NOTE(jiahongyu): Above codes originally enclosed by PADDLE_WITH_MKLDNN
#ifdef PADDLE_WITH_MLU
auto src_type = static_cast<VT::Type>(ctx.Attr<int>("in_dtype"));
auto dst_type = static_cast<VT::Type>(ctx.Attr<int>("out_dtype"));
if (src_type == dst_type || MLUSupportsCast(src_type, dst_type)) {
return phi::KernelKey(framework::TransToProtoVarType(tensor->dtype()),
tensor_place);
} else {
VLOG(3) << "MLU not support cast type: "
<< framework::DataTypeToString(src_type)
<< " to type: " << framework::DataTypeToString(dst_type)
<< ", fallbacking to CPU one!";
return phi::KernelKey(framework::TransToProtoVarType(tensor->dtype()),
platform::CPUPlace());
}
#endif
  return phi::KernelKey(framework::TransToProtoVarType(tensor->dtype()),
                        tensor_place);
}
...
...@@ -23,9 +23,6 @@
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/fluid/framework/convert_utils.h"
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#endif
#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/phi/infermeta/multiary.h" #include "paddle/phi/infermeta/multiary.h"
...@@ -57,17 +54,8 @@ struct FillConstantVisitor { ...@@ -57,17 +54,8 @@ struct FillConstantVisitor {
void apply(typename std::enable_if<!(std::is_same<T, int8_t>::value || void apply(typename std::enable_if<!(std::is_same<T, int8_t>::value ||
std::is_same<T, int16_t>::value)>::type std::is_same<T, int16_t>::value)>::type
* = nullptr) const { * = nullptr) const {
#if defined(PADDLE_WITH_MLU)
if (platform::is_mlu_place(context_.GetPlace())) {
FillMLUTensorWithHostValue<T>(context_, static_cast<T>(value_), tensor_);
} else {
phi::funcs::SetConstant<DeviceContext, T> set_constant;
set_constant(dev_ctx_, tensor_, static_cast<T>(value_));
}
#else
    phi::funcs::SetConstant<DeviceContext, T> set_constant;
    set_constant(dev_ctx_, tensor_, static_cast<T>(value_));
#endif
}
const DeviceContext &dev_ctx_;
...@@ -509,14 +497,6 @@ REGISTER_OPERATOR(coalesce_tensor,
namespace ops = paddle::operators;
namespace plat = paddle::platform;
#if defined(PADDLE_WITH_MLU)
REGISTER_OP_MLU_KERNEL(
coalesce_tensor,
ops::CoalesceTensorOpKernel<phi::CPUContext, plat::float16>,
ops::CoalesceTensorOpKernel<phi::CPUContext, int>,
ops::CoalesceTensorOpKernel<phi::CPUContext, float>);
#endif
REGISTER_OP_VERSION(coalesce_tensor)
    .AddCheckpoint(
        R"ROC(
...
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/barrier_op.h"
#if defined(PADDLE_WITH_CNCL)
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/device/mlu/cncl_helper.h"
#endif
namespace paddle {
namespace operators {
template <typename T>
class BarrierOpMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
#if defined(PADDLE_WITH_CNCL)
auto in = ctx.Input<phi::DenseTensor>("X");
auto out = ctx.Output<phi::DenseTensor>("Out");
auto place = ctx.GetPlace();
cnclDataType_t dtype =
platform::ToCNCLDataType(framework::TransToProtoVarType(in->dtype()));
int64_t numel = in->numel();
const void* sendbuff = in->data();
void* recvbuff = out->mutable_data<T>(place);
int rid = ctx.Attr<int>("ring_id");
auto cncl_comm = platform::CNCLCommContext::Instance().Get(rid, place);
auto* comm = cncl_comm->comm();
auto comm_stream = cncl_comm->stream();
auto& dev_ctx =
ctx.template device_context<paddle::platform::MLUDeviceContext>();
cnclReduceOp_t cncl_red_type = cnclSum;
dev_ctx.Wait();
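    // The barrier is emulated with a cnclAllReduce: every rank must reach
    // this call before the reduction can complete, and the queue sync below
    // blocks until it does.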
PADDLE_ENFORCE_MLU_SUCCESS(cnclAllReduce(
sendbuff, recvbuff, numel, dtype, cncl_red_type, comm, comm_stream));
PADDLE_ENFORCE_MLU_SUCCESS(cnrtQueueSync(comm_stream));
#else
PADDLE_THROW(platform::errors::Unavailable(
"PaddlePaddle should compile with CNCL."));
#endif
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(barrier, ops::BarrierOpMLUKernel<int>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/c_allgather_op.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#if defined(PADDLE_WITH_CNCL)
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/device/mlu/cncl_helper.h"
#endif
#include "paddle/fluid/framework/convert_utils.h"
namespace paddle {
namespace operators {
template <typename T>
class CAllGatherOpMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto place = ctx.GetPlace();
auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
#if defined(PADDLE_WITH_CNCL)
auto x = ctx.Input<phi::DenseTensor>("X");
auto out = ctx.Output<phi::DenseTensor>("Out");
int nranks = ctx.Attr<int>("nranks");
int rid = ctx.Attr<int>("ring_id");
auto comm = platform::CNCLCommContext::Instance().Get(rid, place);
PADDLE_ENFORCE_EQ(
nranks,
comm->nranks(),
platform::errors::InvalidArgument(
"nranks: %s should equal to %s", nranks, comm->nranks()));
framework::DDim out_dims = x->dims();
out_dims[0] *= nranks;
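    // Every rank contributes a tensor shaped like X, so the gathered output
    // is nranks times larger along dim 0.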
out->mutable_data<T>(out_dims, place);
uint32_t send_numel = x->numel();
void* send_buff;
void* recv_buff;
phi::DenseTensor in_tensor, out_tensor;
if (framework::TransToProtoVarType(x->dtype()) ==
framework::proto::VarType::INT64) {
      // cast from int64 to int32 since cncl does not support int64
in_tensor.mutable_data<int32_t>(x->dims(), place);
out_tensor.mutable_data<int32_t>(out->dims(), place);
MLUCnnlTensorDesc x_int64_desc(*x);
MLUCnnlTensorDesc x_int32_desc(in_tensor);
cnnlCastDataType_t cast_type = GetCastDataType(VT::INT64, VT::INT32);
MLUCnnl::Cast(ctx,
cast_type,
x_int64_desc.get(),
GetBasePtr(x),
x_int32_desc.get(),
GetBasePtr(&in_tensor));
send_buff = reinterpret_cast<void*>(in_tensor.data<int32_t>());
recv_buff = reinterpret_cast<void*>(out_tensor.data<int32_t>());
} else {
in_tensor.ShareDataWith(*x);
out_tensor.ShareDataWith(*out);
send_buff = reinterpret_cast<void*>(in_tensor.data<T>());
recv_buff = reinterpret_cast<void*>(out_tensor.data<T>());
}
mluStream stream = nullptr;
if (ctx.Attr<bool>("use_calc_stream")) {
stream = static_cast<platform::MLUDeviceContext*>(dev_ctx)->stream();
} else {
stream = comm->stream();
}
cnclDataType_t dtype = platform::ToCNCLDataType(
framework::TransToProtoVarType(in_tensor.dtype()));
PADDLE_ENFORCE_MLU_SUCCESS(cnclAllGather(
send_buff, recv_buff, send_numel, dtype, comm->comm(), stream));
if (framework::TransToProtoVarType(x->dtype()) ==
framework::proto::VarType::INT64) {
      // cast the int32 out_tensor back into the int64 output
MLUCnnlTensorDesc out_int64_desc(*out);
MLUCnnlTensorDesc out_int32_desc(out_tensor);
cnnlCastDataType_t cast_type = GetCastDataType(VT::INT32, VT::INT64);
MLUCnnl::Cast(ctx,
cast_type,
out_int32_desc.get(),
GetBasePtr(&out_tensor),
out_int64_desc.get(),
GetBasePtr(out));
}
#else
PADDLE_THROW(platform::errors::PreconditionNotMet(
"PaddlePaddle should compile with MLU."));
#endif
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(c_allgather,
ops::CAllGatherOpMLUKernel<float>,
ops::CAllGatherOpMLUKernel<uint8_t>,
ops::CAllGatherOpMLUKernel<int>,
ops::CAllGatherOpMLUKernel<int8_t>,
ops::CAllGatherOpMLUKernel<int16_t>,
ops::CAllGatherOpMLUKernel<int64_t>,
ops::CAllGatherOpMLUKernel<plat::float16>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/c_allreduce_op.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(c_allreduce_max,
ops::CAllReduceOpMLUKernel<ops::kRedMax, float>,
ops::CAllReduceOpMLUKernel<ops::kRedMax, plat::float16>,
ops::CAllReduceOpMLUKernel<ops::kRedMax, int>,
ops::CAllReduceOpMLUKernel<ops::kRedMax, int16_t>,
ops::CAllReduceOpMLUKernel<ops::kRedMax, int8_t>,
ops::CAllReduceOpMLUKernel<ops::kRedMax, uint8_t>)
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/c_allreduce_op.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(c_allreduce_min,
ops::CAllReduceOpMLUKernel<ops::kRedMin, float>,
ops::CAllReduceOpMLUKernel<ops::kRedMin, plat::float16>,
ops::CAllReduceOpMLUKernel<ops::kRedMin, int>,
ops::CAllReduceOpMLUKernel<ops::kRedMin, int16_t>,
ops::CAllReduceOpMLUKernel<ops::kRedMin, int8_t>,
ops::CAllReduceOpMLUKernel<ops::kRedMin, uint8_t>)
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/c_allreduce_op.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(c_allreduce_prod,
ops::CAllReduceOpMLUKernel<ops::kRedProd, float>,
ops::CAllReduceOpMLUKernel<ops::kRedProd, plat::float16>,
ops::CAllReduceOpMLUKernel<ops::kRedProd, int>,
ops::CAllReduceOpMLUKernel<ops::kRedProd, int16_t>,
ops::CAllReduceOpMLUKernel<ops::kRedProd, int8_t>,
ops::CAllReduceOpMLUKernel<ops::kRedProd, uint8_t>)
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/c_allreduce_op.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(c_allreduce_sum,
ops::CAllReduceOpMLUKernel<ops::kRedSum, float>,
ops::CAllReduceOpMLUKernel<ops::kRedSum, plat::float16>,
ops::CAllReduceOpMLUKernel<ops::kRedSum, int>,
ops::CAllReduceOpMLUKernel<ops::kRedSum, int16_t>,
ops::CAllReduceOpMLUKernel<ops::kRedSum, int8_t>,
ops::CAllReduceOpMLUKernel<ops::kRedSum, uint8_t>)
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/c_broadcast_op.h"
#if defined(PADDLE_WITH_CNCL)
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/device/mlu/cncl_helper.h"
#endif
namespace paddle {
namespace operators {
template <typename T>
class CBroadcastOPMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
#if defined(PADDLE_WITH_CNCL)
auto x = ctx.Input<phi::DenseTensor>("X");
auto out = ctx.Output<phi::DenseTensor>("Out");
int numel = x->numel();
cnclDataType_t dtype =
platform::ToCNCLDataType(framework::TransToProtoVarType(x->dtype()));
int rid = ctx.Attr<int>("ring_id");
auto place = ctx.GetPlace();
auto comm = platform::CNCLCommContext::Instance().Get(rid, place);
mluStream stream = nullptr;
if (ctx.Attr<bool>("use_calc_stream")) {
auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
stream = static_cast<platform::MLUDeviceContext*>(dev_ctx)->stream();
} else {
stream = comm->stream();
}
int root = ctx.Attr<int>("root");
if (root == comm->rank()) {
PADDLE_ENFORCE_MLU_SUCCESS(
cnclBcast(reinterpret_cast<void*>(const_cast<T*>(x->data<T>())),
numel,
dtype,
root,
comm->comm(),
stream));
VLOG(3) << "rank " << comm->rank() << " invoke Bcast. sent "
<< x->numel();
if (out != x) {
framework::TensorCopy(
*static_cast<const phi::DenseTensor*>(x),
place,
*platform::DeviceContextPool::Instance().Get(place),
static_cast<phi::DenseTensor*>(out));
}
} else {
PADDLE_ENFORCE_MLU_SUCCESS(cnclBcast(out->mutable_data<T>(place),
numel,
dtype,
root,
comm->comm(),
stream));
VLOG(3) << "rank " << comm->rank() << " invoke Bcast. received "
<< phi::product(out->dims());
}
out->Resize(x->dims());
out->set_lod(x->lod());
#else
PADDLE_THROW(platform::errors::PreconditionNotMet(
"PaddlePaddle should compile with MLU."));
#endif
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(c_broadcast,
ops::CBroadcastOPMLUKernel<float>,
ops::CBroadcastOPMLUKernel<plat::float16>,
ops::CBroadcastOPMLUKernel<int>,
ops::CBroadcastOPMLUKernel<int16_t>,
ops::CBroadcastOPMLUKernel<int8_t>,
ops::CBroadcastOPMLUKernel<uint8_t>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/c_reduce_op.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(c_reduce_max,
ops::CReduceOpMLUKernel<ops::kRedMax, float>,
ops::CReduceOpMLUKernel<ops::kRedMax, plat::float16>,
ops::CReduceOpMLUKernel<ops::kRedMax, int>,
ops::CReduceOpMLUKernel<ops::kRedMax, int16_t>,
ops::CReduceOpMLUKernel<ops::kRedMax, int8_t>,
ops::CReduceOpMLUKernel<ops::kRedMax, uint8_t>)
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/c_reduce_op.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(c_reduce_min,
ops::CReduceOpMLUKernel<ops::kRedMin, float>,
ops::CReduceOpMLUKernel<ops::kRedMin, plat::float16>,
ops::CReduceOpMLUKernel<ops::kRedMin, int>,
ops::CReduceOpMLUKernel<ops::kRedMin, int16_t>,
ops::CReduceOpMLUKernel<ops::kRedMin, int8_t>,
ops::CReduceOpMLUKernel<ops::kRedMin, uint8_t>)
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/c_reduce_op.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(c_reduce_prod,
ops::CReduceOpMLUKernel<ops::kRedProd, float>,
ops::CReduceOpMLUKernel<ops::kRedProd, plat::float16>,
ops::CReduceOpMLUKernel<ops::kRedProd, int>,
ops::CReduceOpMLUKernel<ops::kRedProd, int16_t>,
ops::CReduceOpMLUKernel<ops::kRedProd, int8_t>,
ops::CReduceOpMLUKernel<ops::kRedProd, uint8_t>)
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/c_reduce_op.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(c_reduce_sum,
ops::CReduceOpMLUKernel<ops::kRedSum, float>,
ops::CReduceOpMLUKernel<ops::kRedSum, plat::float16>,
ops::CReduceOpMLUKernel<ops::kRedSum, int>,
ops::CReduceOpMLUKernel<ops::kRedSum, int16_t>,
ops::CReduceOpMLUKernel<ops::kRedSum, int8_t>,
ops::CReduceOpMLUKernel<ops::kRedSum, uint8_t>)
...@@ -33,22 +33,7 @@ Call calculation stream synchronization.
namespace ops = paddle::operators;
namespace plat = paddle::platform;
using MLU = plat::MLUPlace;
REGISTER_OP_WITHOUT_GRADIENT(c_sync_calc_stream,
                             ops::CSyncCalcStreamOp,
                             ops::CSyncCalcStreamOpMaker);
REGISTER_OP_NPU_KERNEL(c_sync_calc_stream,
ops::CSyncCalcStreamKernel<float, MLU>,
ops::CSyncCalcStreamKernel<double, MLU>,
ops::CSyncCalcStreamKernel<int, MLU>,
ops::CSyncCalcStreamKernel<int64_t, MLU>,
ops::CSyncCalcStreamKernel<plat::float16, MLU>);
REGISTER_OP_MLU_KERNEL(c_sync_calc_stream,
ops::CSyncCalcStreamKernel<float, MLU>,
ops::CSyncCalcStreamKernel<double, MLU>,
ops::CSyncCalcStreamKernel<int, MLU>,
ops::CSyncCalcStreamKernel<int64_t, MLU>,
ops::CSyncCalcStreamKernel<plat::float16, MLU>);
...@@ -56,6 +56,3 @@ REGISTER_OP_WITHOUT_GRADIENT(c_sync_comm_stream,
REGISTER_OP_NPU_KERNEL(c_sync_comm_stream,
                       ops::CSyncCommStreamKernel<float, plat::NPUPlace>);
REGISTER_OP_MLU_KERNEL(c_sync_comm_stream,
ops::CSyncCommStreamKernel<float, plat::MLUPlace>);
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/collective/c_allreduce_op.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(mp_allreduce_sum,
ops::CAllReduceOpMLUKernel<ops::kRedSum, float>,
ops::CAllReduceOpMLUKernel<ops::kRedSum, plat::float16>,
ops::CAllReduceOpMLUKernel<ops::kRedSum, int>,
ops::CAllReduceOpMLUKernel<ops::kRedSum, int16_t>,
ops::CAllReduceOpMLUKernel<ops::kRedSum, int8_t>,
ops::CAllReduceOpMLUKernel<ops::kRedSum, uint8_t>)
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class EqualMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* y = ctx.Input<phi::DenseTensor>("Y");
auto* out = ctx.Output<phi::DenseTensor>("Out");
out->mutable_data<bool>(ctx.GetPlace());
MLUCnnlTensorDesc input_x(
*x, CNNL_LAYOUT_ARRAY, ToCnnlDataType(x->dtype()));
MLUCnnlTensorDesc input_y(
*y, CNNL_LAYOUT_ARRAY, ToCnnlDataType(y->dtype()));
MLUCnnlTensorDesc output(
*out, CNNL_LAYOUT_ARRAY, ToCnnlDataType(out->dtype()));
MLUCnnl::Logic(ctx,
CNNL_LOGIC_OP_EQ,
input_x.get(),
GetBasePtr(x),
input_y.get(),
GetBasePtr(y),
output.get(),
GetBasePtr(out));
}
};
template <typename DeviceContext, typename T>
class NotEqualMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* y = ctx.Input<phi::DenseTensor>("Y");
auto* out = ctx.Output<phi::DenseTensor>("Out");
out->mutable_data<bool>(ctx.GetPlace());
MLUCnnlTensorDesc input_x(
*x, CNNL_LAYOUT_ARRAY, ToCnnlDataType(x->dtype()));
MLUCnnlTensorDesc input_y(
*y, CNNL_LAYOUT_ARRAY, ToCnnlDataType(y->dtype()));
MLUCnnlTensorDesc output(
*out, CNNL_LAYOUT_ARRAY, ToCnnlDataType(out->dtype()));
MLUCnnl::Logic(ctx,
CNNL_LOGIC_OP_NE,
input_x.get(),
GetBasePtr(x),
input_y.get(),
GetBasePtr(y),
output.get(),
GetBasePtr(out));
}
};
template <typename DeviceContext, typename T>
class LessThanMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* y = ctx.Input<phi::DenseTensor>("Y");
auto* out = ctx.Output<phi::DenseTensor>("Out");
out->mutable_data<bool>(ctx.GetPlace());
MLUCnnlTensorDesc input_x(
*x, CNNL_LAYOUT_ARRAY, ToCnnlDataType(x->dtype()));
MLUCnnlTensorDesc input_y(
*y, CNNL_LAYOUT_ARRAY, ToCnnlDataType(y->dtype()));
MLUCnnlTensorDesc output(
*out, CNNL_LAYOUT_ARRAY, ToCnnlDataType(out->dtype()));
MLUCnnl::Logic(ctx,
CNNL_LOGIC_OP_LT,
input_x.get(),
GetBasePtr(x),
input_y.get(),
GetBasePtr(y),
output.get(),
GetBasePtr(out));
}
};
template <typename DeviceContext, typename T>
class LessEqualMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* y = ctx.Input<phi::DenseTensor>("Y");
auto* out = ctx.Output<phi::DenseTensor>("Out");
out->mutable_data<bool>(ctx.GetPlace());
MLUCnnlTensorDesc input_x(
*x, CNNL_LAYOUT_ARRAY, ToCnnlDataType(x->dtype()));
MLUCnnlTensorDesc input_y(
*y, CNNL_LAYOUT_ARRAY, ToCnnlDataType(y->dtype()));
MLUCnnlTensorDesc output(
*out, CNNL_LAYOUT_ARRAY, ToCnnlDataType(out->dtype()));
MLUCnnl::Logic(ctx,
CNNL_LOGIC_OP_LE,
input_x.get(),
GetBasePtr(x),
input_y.get(),
GetBasePtr(y),
output.get(),
GetBasePtr(out));
}
};
template <typename DeviceContext, typename T>
class GreaterThanMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* y = ctx.Input<phi::DenseTensor>("Y");
auto* out = ctx.Output<phi::DenseTensor>("Out");
out->mutable_data<bool>(ctx.GetPlace());
MLUCnnlTensorDesc input_x(
*x, CNNL_LAYOUT_ARRAY, ToCnnlDataType(x->dtype()));
MLUCnnlTensorDesc input_y(
*y, CNNL_LAYOUT_ARRAY, ToCnnlDataType(y->dtype()));
MLUCnnlTensorDesc output(
*out, CNNL_LAYOUT_ARRAY, ToCnnlDataType(out->dtype()));
MLUCnnl::Logic(ctx,
CNNL_LOGIC_OP_GT,
input_x.get(),
GetBasePtr(x),
input_y.get(),
GetBasePtr(y),
output.get(),
GetBasePtr(out));
}
};
template <typename DeviceContext, typename T>
class GreaterEqualMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* y = ctx.Input<phi::DenseTensor>("Y");
auto* out = ctx.Output<phi::DenseTensor>("Out");
out->mutable_data<bool>(ctx.GetPlace());
MLUCnnlTensorDesc input_x(
*x, CNNL_LAYOUT_ARRAY, ToCnnlDataType(x->dtype()));
MLUCnnlTensorDesc input_y(
*y, CNNL_LAYOUT_ARRAY, ToCnnlDataType(y->dtype()));
MLUCnnlTensorDesc output(
*out, CNNL_LAYOUT_ARRAY, ToCnnlDataType(out->dtype()));
MLUCnnl::Logic(ctx,
CNNL_LOGIC_OP_GE,
input_x.get(),
GetBasePtr(x),
input_y.get(),
GetBasePtr(y),
output.get(),
GetBasePtr(out));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(
equal,
ops::EqualMLUKernel<plat::MLUDeviceContext, plat::float16>,
ops::EqualMLUKernel<plat::MLUDeviceContext, float>,
ops::EqualMLUKernel<plat::MLUDeviceContext, int8_t>,
ops::EqualMLUKernel<plat::MLUDeviceContext, uint8_t>,
ops::EqualMLUKernel<plat::MLUDeviceContext, int16_t>,
ops::EqualMLUKernel<plat::MLUDeviceContext, int>,
ops::EqualMLUKernel<plat::MLUDeviceContext, bool>);
REGISTER_OP_MLU_KERNEL(
not_equal,
ops::NotEqualMLUKernel<plat::MLUDeviceContext, plat::float16>,
ops::NotEqualMLUKernel<plat::MLUDeviceContext, float>,
ops::NotEqualMLUKernel<plat::MLUDeviceContext, int8_t>,
ops::NotEqualMLUKernel<plat::MLUDeviceContext, uint8_t>,
ops::NotEqualMLUKernel<plat::MLUDeviceContext, int16_t>,
ops::NotEqualMLUKernel<plat::MLUDeviceContext, int>,
ops::NotEqualMLUKernel<plat::MLUDeviceContext, bool>);
REGISTER_OP_MLU_KERNEL(
less_than,
ops::LessThanMLUKernel<plat::MLUDeviceContext, plat::float16>,
ops::LessThanMLUKernel<plat::MLUDeviceContext, float>,
ops::LessThanMLUKernel<plat::MLUDeviceContext, int8_t>,
ops::LessThanMLUKernel<plat::MLUDeviceContext, uint8_t>,
ops::LessThanMLUKernel<plat::MLUDeviceContext, int16_t>,
ops::LessThanMLUKernel<plat::MLUDeviceContext, int>,
ops::LessThanMLUKernel<plat::MLUDeviceContext, bool>);
REGISTER_OP_MLU_KERNEL(
less_equal,
ops::LessEqualMLUKernel<plat::MLUDeviceContext, plat::float16>,
ops::LessEqualMLUKernel<plat::MLUDeviceContext, float>,
ops::LessEqualMLUKernel<plat::MLUDeviceContext, int8_t>,
ops::LessEqualMLUKernel<plat::MLUDeviceContext, uint8_t>,
ops::LessEqualMLUKernel<plat::MLUDeviceContext, int16_t>,
ops::LessEqualMLUKernel<plat::MLUDeviceContext, int>,
ops::LessEqualMLUKernel<plat::MLUDeviceContext, bool>);
REGISTER_OP_MLU_KERNEL(
greater_than,
ops::GreaterThanMLUKernel<plat::MLUDeviceContext, plat::float16>,
ops::GreaterThanMLUKernel<plat::MLUDeviceContext, float>,
ops::GreaterThanMLUKernel<plat::MLUDeviceContext, int8_t>,
ops::GreaterThanMLUKernel<plat::MLUDeviceContext, uint8_t>,
ops::GreaterThanMLUKernel<plat::MLUDeviceContext, int16_t>,
ops::GreaterThanMLUKernel<plat::MLUDeviceContext, int>,
ops::GreaterThanMLUKernel<plat::MLUDeviceContext, bool>);
REGISTER_OP_MLU_KERNEL(
greater_equal,
ops::GreaterEqualMLUKernel<plat::MLUDeviceContext, plat::float16>,
ops::GreaterEqualMLUKernel<plat::MLUDeviceContext, float>,
ops::GreaterEqualMLUKernel<plat::MLUDeviceContext, int8_t>,
ops::GreaterEqualMLUKernel<plat::MLUDeviceContext, uint8_t>,
ops::GreaterEqualMLUKernel<plat::MLUDeviceContext, int16_t>,
ops::GreaterEqualMLUKernel<plat::MLUDeviceContext, int>,
ops::GreaterEqualMLUKernel<plat::MLUDeviceContext, bool>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
template <typename T, cnnlLogicOp_t log_method>
class LogicalMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* y = ctx.Input<phi::DenseTensor>("Y");
auto* out = ctx.Output<phi::DenseTensor>("Out");
out->mutable_data<T>(ctx.GetPlace());
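    // logical_not is unary and has no "Y" input, so alias y to x; this lets
    // the same binary MLUCnnl::Logic call serve all four logical ops.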
if (log_method == CNNL_LOGIC_OP_NOT) {
y = x;
}
MLUCnnlTensorDesc x_desc(*x);
MLUCnnlTensorDesc y_desc(*y);
MLUCnnlTensorDesc out_desc(*out);
MLUCnnl::Logic(ctx,
log_method,
x_desc.get(),
GetBasePtr(x),
y_desc.get(),
GetBasePtr(y),
out_desc.get(),
GetBasePtr(out));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_MLU_KERNEL(logical_not,
ops::LogicalMLUKernel<bool, CNNL_LOGIC_OP_NOT>,
ops::LogicalMLUKernel<int8_t, CNNL_LOGIC_OP_NOT>,
ops::LogicalMLUKernel<int16_t, CNNL_LOGIC_OP_NOT>,
ops::LogicalMLUKernel<int, CNNL_LOGIC_OP_NOT>,
ops::LogicalMLUKernel<float, CNNL_LOGIC_OP_NOT>);
REGISTER_OP_MLU_KERNEL(logical_and,
ops::LogicalMLUKernel<bool, CNNL_LOGIC_OP_AND>,
ops::LogicalMLUKernel<int8_t, CNNL_LOGIC_OP_AND>,
ops::LogicalMLUKernel<int16_t, CNNL_LOGIC_OP_AND>,
ops::LogicalMLUKernel<int, CNNL_LOGIC_OP_AND>,
ops::LogicalMLUKernel<float, CNNL_LOGIC_OP_AND>);
REGISTER_OP_MLU_KERNEL(logical_or,
ops::LogicalMLUKernel<bool, CNNL_LOGIC_OP_OR>,
ops::LogicalMLUKernel<int8_t, CNNL_LOGIC_OP_OR>,
ops::LogicalMLUKernel<int16_t, CNNL_LOGIC_OP_OR>,
ops::LogicalMLUKernel<int, CNNL_LOGIC_OP_OR>,
ops::LogicalMLUKernel<float, CNNL_LOGIC_OP_OR>);
REGISTER_OP_MLU_KERNEL(logical_xor,
ops::LogicalMLUKernel<bool, CNNL_LOGIC_OP_XOR>,
ops::LogicalMLUKernel<int8_t, CNNL_LOGIC_OP_XOR>,
ops::LogicalMLUKernel<int16_t, CNNL_LOGIC_OP_XOR>,
ops::LogicalMLUKernel<int, CNNL_LOGIC_OP_XOR>,
ops::LogicalMLUKernel<float, CNNL_LOGIC_OP_XOR>);
...@@ -38,11 +38,6 @@ if(WITH_XPU)
  detection_library(prior_box_op SRCS prior_box_op.cc)
  detection_library(yolo_box_op SRCS yolo_box_op.cc)
  detection_library(generate_proposals_v2_op SRCS generate_proposals_v2_op.cc)
elseif(WITH_MLU)
detection_library(iou_similarity_op SRCS iou_similarity_op.cc
iou_similarity_op_mlu.cc)
detection_library(prior_box_op SRCS prior_box_op.cc prior_box_op_mlu.cc)
detection_library(yolo_box_op SRCS yolo_box_op.cc yolo_box_op_mlu.cc)
else()
  detection_library(iou_similarity_op SRCS iou_similarity_op.cc
                    iou_similarity_op.cu)
...
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/detection/iou_similarity_op.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
template <typename T>
struct IouFunction {
public:
explicit IouFunction(const framework::ExecutionContext& ctx) : ctx(ctx) {
place = ctx.GetPlace();
}
void Transpose(const phi::DenseTensor* x,
phi::DenseTensor* y,
const std::vector<int>& axis) {
// y should be init first
TransposeFromMLUTensor<T>(ctx, axis, x, y, false /*need_reshape_or_alloc*/);
}
void Add(const phi::DenseTensor* x,
const phi::DenseTensor* y,
phi::DenseTensor* z) {
    // z should be init first
MLUCnnlTensorDesc x_desc(*x);
MLUCnnlTensorDesc y_desc(*y);
MLUCnnlTensorDesc z_desc(*z);
MLUCnnlOpTensorDesc add_op_desc(
CNNL_OP_TENSOR_ADD, ToCnnlDataType<T>(), CNNL_NOT_PROPAGATE_NAN);
MLUCnnl::OpTensor(ctx,
add_op_desc.get(),
x_desc.get(),
GetBasePtr(x),
y_desc.get(),
GetBasePtr(y),
z_desc.get(),
GetBasePtr(z),
ToCnnlDataType<T>());
}
void Sub(const phi::DenseTensor* x,
const phi::DenseTensor* y,
phi::DenseTensor* z) {
    // z should be init first
MLUCnnlTensorDesc x_desc(*x);
MLUCnnlTensorDesc y_desc(*y);
MLUCnnlTensorDesc z_desc(*z);
MLUCnnlOpTensorDesc sub_op_desc(
CNNL_OP_TENSOR_SUB, ToCnnlDataType<T>(), CNNL_NOT_PROPAGATE_NAN);
MLUCnnl::OpTensor(ctx,
sub_op_desc.get(),
x_desc.get(),
GetBasePtr(x),
y_desc.get(),
GetBasePtr(y),
z_desc.get(),
GetBasePtr(z),
ToCnnlDataType<T>());
}
void Mul(const phi::DenseTensor* x,
const phi::DenseTensor* y,
phi::DenseTensor* z) {
// z should be init first
MLUCnnlTensorDesc x_desc(*x);
MLUCnnlTensorDesc y_desc(*y);
MLUCnnlTensorDesc z_desc(*z);
MLUCnnlOpTensorDesc mul_op_desc(
CNNL_OP_TENSOR_MUL, ToCnnlDataType<T>(), CNNL_NOT_PROPAGATE_NAN);
MLUCnnl::OpTensor(ctx,
mul_op_desc.get(),
x_desc.get(),
GetBasePtr(x),
y_desc.get(),
GetBasePtr(y),
z_desc.get(),
GetBasePtr(z),
ToCnnlDataType<T>());
}
void DivNoNan(const phi::DenseTensor* x,
const phi::DenseTensor* y,
phi::DenseTensor* z) {
// z should be init first
MLUCnnlTensorDesc x_desc(*x);
MLUCnnlTensorDesc y_desc(*y);
MLUCnnlTensorDesc z_desc(*z);
cnnlComputationPreference_t prefer = CNNL_COMPUTATION_FAST;
MLUCnnl::DivNoNan(ctx,
prefer,
x_desc.get(),
GetBasePtr(x),
y_desc.get(),
GetBasePtr(y),
z_desc.get(),
GetBasePtr(z));
}
void Adds(const phi::DenseTensor* x, float scalar, phi::DenseTensor* y) {
// y should be init first
MLUCnnlTensorDesc x_desc(*x);
MLUCnnlTensorDesc y_desc(*y);
float alpha = 1.0;
float beta = scalar;
MLUCnnl::Transform(ctx,
&alpha,
&beta,
x_desc.get(),
GetBasePtr(x),
y_desc.get(),
GetBasePtr(y));
}
void Maximum(const phi::DenseTensor* x,
const phi::DenseTensor* y,
phi::DenseTensor* z) {
// z should be init first
MLUCnnlTensorDesc x_desc(*x);
MLUCnnlTensorDesc y_desc(*y);
MLUCnnlTensorDesc z_desc(*z);
MLUCnnl::Maximum(ctx,
x_desc.get(),
GetBasePtr(x),
y_desc.get(),
GetBasePtr(y),
z_desc.get(),
GetBasePtr(z));
}
void Minimum(const phi::DenseTensor* x,
const phi::DenseTensor* y,
phi::DenseTensor* z) {
// z should be init first
MLUCnnlTensorDesc x_desc(*x);
MLUCnnlTensorDesc y_desc(*y);
MLUCnnlTensorDesc z_desc(*z);
MLUCnnl::Minimum(ctx,
x_desc.get(),
GetBasePtr(x),
y_desc.get(),
GetBasePtr(y),
z_desc.get(),
GetBasePtr(z));
}
private:
platform::Place place;
const framework::ExecutionContext& ctx;
};
template <typename T>
class IouSimilarityMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* y = ctx.Input<phi::DenseTensor>("Y");
bool normalized = ctx.Attr<bool>("box_normalized");
auto* out = ctx.Output<phi::DenseTensor>("Out");
auto _type = x->dtype();
auto place = ctx.GetPlace();
IouFunction<T> F(ctx);
auto N = x->dims()[0];
auto M = y->dims()[0];
out->mutable_data<T>({N, M}, place);
phi::DenseTensor xt(_type);
phi::DenseTensor yt(_type);
xt.mutable_data<T>({4, N}, place);
yt.mutable_data<T>({4, M}, place);
std::vector<int> vec_trans = {1, 0};
F.Transpose(x, &xt, vec_trans);
F.Transpose(y, &yt, vec_trans);
phi::DenseTensor xmin1 = xt.Slice(0, 1);
phi::DenseTensor ymin1 = xt.Slice(1, 2);
phi::DenseTensor xmax1 = xt.Slice(2, 3);
phi::DenseTensor ymax1 = xt.Slice(3, 4);
phi::DenseTensor xmin2 = yt.Slice(0, 1);
phi::DenseTensor ymin2 = yt.Slice(1, 2);
phi::DenseTensor xmax2 = yt.Slice(2, 3);
phi::DenseTensor ymax2 = yt.Slice(3, 4);
xmin1.Resize({N, 1});
ymin1.Resize({N, 1});
xmax1.Resize({N, 1});
ymax1.Resize({N, 1});
xmin2.Resize({1, M});
ymin2.Resize({1, M});
xmax2.Resize({1, M});
ymax2.Resize({1, M});
phi::DenseTensor w1(_type);
phi::DenseTensor h1(_type);
phi::DenseTensor w2(_type);
phi::DenseTensor h2(_type);
phi::DenseTensor area1(_type);
phi::DenseTensor area2(_type);
w1.mutable_data<T>({N, 1}, place);
h1.mutable_data<T>({N, 1}, place);
w2.mutable_data<T>({1, M}, place);
h2.mutable_data<T>({1, M}, place);
area1.mutable_data<T>({N, 1}, place);
area2.mutable_data<T>({1, M}, place);
F.Sub(&xmax1, &xmin1, &w1);
F.Sub(&ymax1, &ymin1, &h1);
F.Sub(&xmax2, &xmin2, &w2);
F.Sub(&ymax2, &ymin2, &h2);
if (!normalized) {
F.Adds(&w1, 1.0f, &w1);
F.Adds(&h1, 1.0f, &h1);
F.Adds(&w2, 1.0f, &w2);
F.Adds(&h2, 1.0f, &h2);
}
F.Mul(&w1, &h1, &area1);
F.Mul(&w2, &h2, &area2);
phi::DenseTensor inter_xmax(_type);
phi::DenseTensor inter_ymax(_type);
phi::DenseTensor inter_xmin(_type);
phi::DenseTensor inter_ymin(_type);
inter_xmax.mutable_data<T>({N, M}, place);
inter_ymax.mutable_data<T>({N, M}, place);
inter_xmin.mutable_data<T>({N, M}, place);
inter_ymin.mutable_data<T>({N, M}, place);
F.Minimum(&xmax1, &xmax2, &inter_xmax);
F.Minimum(&ymax1, &ymax2, &inter_ymax);
F.Maximum(&xmin1, &xmin2, &inter_xmin);
F.Maximum(&ymin1, &ymin2, &inter_ymin);
phi::DenseTensor inter_w(_type);
phi::DenseTensor inter_h(_type);
inter_w.mutable_data<T>({N, M}, place);
inter_h.mutable_data<T>({N, M}, place);
F.Sub(&inter_xmax, &inter_xmin, &inter_w);
F.Sub(&inter_ymax, &inter_ymin, &inter_h);
if (!normalized) {
F.Adds(&inter_w, 1.0f, &inter_w);
F.Adds(&inter_h, 1.0f, &inter_h);
}
phi::DenseTensor zeros(_type);
zeros.mutable_data<T>({1}, place);
FillMLUTensorWithHostValue<T>(ctx, static_cast<T>(0), &zeros);
F.Maximum(&inter_w, &zeros, &inter_w);
F.Maximum(&inter_h, &zeros, &inter_h);
F.Mul(&inter_w, &inter_h, out);
phi::DenseTensor union_area(_type);
union_area.mutable_data<T>({N, M}, place);
F.Add(&area1, &area2, &union_area);
F.Sub(&union_area, out, &union_area);
F.DivNoNan(out, &union_area, out);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(iou_similarity,
ops::IouSimilarityMLUKernel<float>,
ops::IouSimilarityMLUKernel<plat::float16>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/detection/prior_box_op.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
template <typename T>
class PriorBoxMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* input = ctx.Input<phi::DenseTensor>("Input");
auto* image = ctx.Input<phi::DenseTensor>("Image");
auto* boxes = ctx.Output<phi::DenseTensor>("Boxes");
auto* variances = ctx.Output<phi::DenseTensor>("Variances");
float step_w = ctx.Attr<float>("step_w");
float step_h = ctx.Attr<float>("step_h");
float offset = ctx.Attr<float>("offset");
bool clip = ctx.Attr<bool>("clip");
bool min_max_aspect_ratios_order =
ctx.Attr<bool>("min_max_aspect_ratios_order");
int im_width = image->dims()[3];
int im_height = image->dims()[2];
int width = input->dims()[3];
int height = input->dims()[2];
auto aspect_ratios = ctx.Attr<std::vector<float>>("aspect_ratios");
bool flip = ctx.Attr<bool>("flip");
std::vector<float> new_aspect_ratios;
ExpandAspectRatios(aspect_ratios, flip, &new_aspect_ratios);
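// PriorBox generates one default (prior) box per min_size / max_size /
// aspect-ratio combination for every cell of the [height, width] feature map,
// mapped back onto the [im_height, im_width] image; the variance vector is
// attached to every generated box.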
auto& dev_ctx = ctx.template device_context<platform::MLUDeviceContext>();
phi::DenseTensor ratios;
paddle::framework::TensorFromVector(new_aspect_ratios, dev_ctx, &ratios);
MLUOpTensorDesc new_aspect_ratios_desc(ratios);
auto min_sizes = ctx.Attr<std::vector<float>>("min_sizes");
phi::DenseTensor min;
paddle::framework::TensorFromVector(min_sizes, dev_ctx, &min);
MLUOpTensorDesc min_sizes_desc(min);
auto max_sizes = ctx.Attr<std::vector<float>>("max_sizes");
phi::DenseTensor max;
paddle::framework::TensorFromVector(max_sizes, dev_ctx, &max);
MLUOpTensorDesc max_sizes_desc(max);
auto variances_attr = ctx.Attr<std::vector<float>>("variances");
phi::DenseTensor var_tensor;
paddle::framework::TensorFromVector(variances_attr, dev_ctx, &var_tensor);
MLUOpTensorDesc variances_attr_desc(var_tensor);
auto place = ctx.GetPlace();
boxes->mutable_data<T>(place);
variances->mutable_data<T>(place);
MLUOpTensorDesc var_desc(*variances);
MLUOpTensorDesc output_desc(*boxes);
MLUOP::OpPriorBox(ctx,
min_sizes_desc.get(),
GetBasePtr(&min),
new_aspect_ratios_desc.get(),
GetBasePtr(&ratios),
variances_attr_desc.get(),
GetBasePtr(&var_tensor),
max_sizes_desc.get(),
GetBasePtr(&max),
height,
width,
im_height,
im_width,
step_h,
step_w,
offset,
clip,
min_max_aspect_ratios_order,
output_desc.get(),
GetBasePtr(boxes),
var_desc.get(),
GetBasePtr(variances));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(prior_box, ops::PriorBoxMLUKernel<float>);
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
template <typename T>
class YoloBoxMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* img_size = ctx.Input<phi::DenseTensor>("ImgSize");
auto* boxes = ctx.Output<phi::DenseTensor>("Boxes");
auto* scores = ctx.Output<phi::DenseTensor>("Scores");
const std::vector<int> anchors = ctx.Attr<std::vector<int>>("anchors");
auto class_num = ctx.Attr<int>("class_num");
auto conf_thresh = ctx.Attr<float>("conf_thresh");
auto downsample_ratio = ctx.Attr<int>("downsample_ratio");
auto clip_bbox = ctx.Attr<bool>("clip_bbox");
auto scale = ctx.Attr<float>("scale_x_y");
auto iou_aware = ctx.Attr<bool>("iou_aware");
auto iou_aware_factor = ctx.Attr<float>("iou_aware_factor");
int anchor_num = anchors.size() / 2;
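// anchors stores (width, height) pairs, hence anchor_num = anchors.size() / 2.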
int64_t size = anchors.size();
auto dim_x = x->dims();
int n = dim_x[0];
int s = anchor_num;
int h = dim_x[2];
int w = dim_x[3];
// The outputs of mluOpYoloBox: a 4-D tensor with shape [N, anchor_num, 4, H*W]
// holding the box coordinates, and a 4-D tensor with shape
// [N, anchor_num, class_num, H*W] holding the classification scores.
std::vector<int64_t> boxes_dim_mluops({n, s, 4, h * w});
std::vector<int64_t> scores_dim_mluops({n, s, class_num, h * w});
// The Paddle op expects a 3-D tensor with shape [N, M, 4] for the box
// coordinates and a 3-D tensor with shape [N, M, class_num] for the
// classification scores.
std::vector<int64_t> boxes_out_dim({n, s, h * w, 4});
std::vector<int64_t> scores_out_dim({n, s, h * w, class_num});
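// The MLU op writes [N, S, 4, H*W] / [N, S, class_num, H*W]; transposing the
// last two dims yields [N, S, H*W, 4] / [N, S, H*W, class_num], which matches
// Paddle's [N, M, ...] layout once S and H*W are flattened into M.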
auto& dev_ctx = ctx.template device_context<MLUDeviceContext>();
phi::DenseTensor boxes_tensor_mluops =
ctx.AllocateTmpTensor<T, MLUDeviceContext>({n, s, 4, h * w}, dev_ctx);
phi::DenseTensor scores_tensor_mluops =
ctx.AllocateTmpTensor<T, MLUDeviceContext>({n, s, class_num, h * w},
dev_ctx);
MLUOpTensorDesc boxes_trans_desc_mluops(
4, boxes_dim_mluops.data(), ToMluOpDataType<T>());
MLUCnnlTensorDesc boxes_trans_desc_cnnl(
4, boxes_dim_mluops.data(), ToCnnlDataType<T>());
MLUOpTensorDesc scores_trans_desc_mluops(
4, scores_dim_mluops.data(), ToMluOpDataType<T>());
MLUCnnlTensorDesc scores_trans_desc_cnnl(
4, scores_dim_mluops.data(), ToCnnlDataType<T>());
boxes->mutable_data<T>(ctx.GetPlace());
scores->mutable_data<T>(ctx.GetPlace());
FillMLUTensorWithHostValue(ctx, static_cast<T>(0), boxes);
FillMLUTensorWithHostValue(ctx, static_cast<T>(0), scores);
MLUOpTensorDesc x_desc(*x, MLUOP_LAYOUT_ARRAY, ToMluOpDataType<T>());
MLUOpTensorDesc img_size_desc(
*img_size, MLUOP_LAYOUT_ARRAY, ToMluOpDataType<int32_t>());
phi::DenseTensor anchors_temp(framework::TransToPhiDataType(VT::INT32));
anchors_temp.Resize({size});
paddle::framework::TensorFromVector(
anchors, ctx.device_context(), &anchors_temp);
MLUOpTensorDesc anchors_desc(anchors_temp);
MLUCnnlTensorDesc boxes_desc_cnnl(
4, boxes_out_dim.data(), ToCnnlDataType<T>());
MLUCnnlTensorDesc scores_desc_cnnl(
4, scores_out_dim.data(), ToCnnlDataType<T>());
MLUOP::OpYoloBox(ctx,
x_desc.get(),
GetBasePtr(x),
img_size_desc.get(),
GetBasePtr(img_size),
anchors_desc.get(),
GetBasePtr(&anchors_temp),
class_num,
conf_thresh,
downsample_ratio,
clip_bbox,
scale,
iou_aware,
iou_aware_factor,
boxes_trans_desc_mluops.get(),
GetBasePtr(&boxes_tensor_mluops),
scores_trans_desc_mluops.get(),
GetBasePtr(&scores_tensor_mluops));
const std::vector<int> perm = {0, 1, 3, 2};
// transpose the boxes from [N, S, 4, H*W] to [N, S, H*W, 4]
MLUCnnl::Transpose(ctx,
perm,
4,
boxes_trans_desc_cnnl.get(),
GetBasePtr(&boxes_tensor_mluops),
boxes_desc_cnnl.get(),
GetBasePtr(boxes));
// transpose the scores from [N, S, class_num, H*W] to [N, S, H*W,
// class_num]
MLUCnnl::Transpose(ctx,
perm,
4,
scores_trans_desc_cnnl.get(),
GetBasePtr(&scores_tensor_mluops),
scores_desc_cnnl.get(),
GetBasePtr(scores));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(yolo_box, ops::YoloBoxMLUKernel<float>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/elementwise/elementwise_mlu.h"
namespace paddle {
namespace operators {
template <typename T>
class ElementwiseAddMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
MLUOpTensorKernel<T>(ctx, CNNL_OP_TENSOR_ADD);
}
};
template <typename T>
class ElementwiseAddGradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto& dev_ctx =
ctx.template device_context<paddle::platform::MLUDeviceContext>();
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* y = ctx.Input<phi::DenseTensor>("Y");
auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<phi::DenseTensor>(framework::GradVarName("Y"));
int axis = ctx.Attr<int>("axis");
axis = (axis == -1 ? std::abs(x->dims().size() - y->dims().size()) : axis);
MLUCnnlTensorDesc dout_desc(*dout);
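// For out = x + y the gradient w.r.t. both inputs is dout itself; when an
// input was broadcast, dout is reduce-summed over the broadcast axes,
// otherwise it is copied through unchanged.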
if (dx) {
dx->mutable_data<T>(ctx.GetPlace());
if (dx->dims() != dout->dims()) {
std::vector<int> dst_dims_vec;
std::vector<int> reduce_axes;
GetReduceAxesAndDstDims(
axis, dout->dims(), dx->dims(), &reduce_axes, &dst_dims_vec);
MLUCnnlReduceDesc reduction_desc(reduce_axes,
CNNL_REDUCE_ADD,
ToCnnlDataType<T>(),
CNNL_NOT_PROPAGATE_NAN,
CNNL_REDUCE_NO_INDICES,
CNNL_32BIT_INDICES);
MLUCnnlTensorDesc dx_desc(
dst_dims_vec.size(), dst_dims_vec.data(), ToCnnlDataType<T>());
MLUCnnl::Reduce(ctx,
true /*need_workspace*/,
reduction_desc.get(),
nullptr,
dout_desc.get(),
GetBasePtr(dout),
0,
nullptr,
nullptr,
dx_desc.get(),
GetBasePtr(dx));
} else {
framework::TensorCopy(*dout, ctx.GetPlace(), dev_ctx, dx);
}
}
if (dy) {
dy->mutable_data<T>(ctx.GetPlace());
if (dy->dims() != dout->dims()) {
std::vector<int> dst_dims_vec;
std::vector<int> reduce_axes;
GetReduceAxesAndDstDims(
axis, dout->dims(), dy->dims(), &reduce_axes, &dst_dims_vec);
MLUCnnlReduceDesc reduction_desc(reduce_axes,
CNNL_REDUCE_ADD,
ToCnnlDataType<T>(),
CNNL_NOT_PROPAGATE_NAN,
CNNL_REDUCE_NO_INDICES,
CNNL_32BIT_INDICES);
MLUCnnlTensorDesc dy_desc(
dst_dims_vec.size(), dst_dims_vec.data(), ToCnnlDataType<T>());
MLUCnnl::Reduce(ctx,
true /*need_workspace*/,
reduction_desc.get(),
nullptr,
dout_desc.get(),
GetBasePtr(dout),
0,
nullptr,
nullptr,
dy_desc.get(),
GetBasePtr(dy));
} else {
framework::TensorCopy(*dout, ctx.GetPlace(), dev_ctx, dy);
}
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(elementwise_add,
ops::ElementwiseAddMLUKernel<float>,
ops::ElementwiseAddMLUKernel<plat::float16>);
REGISTER_OP_MLU_KERNEL(elementwise_add_grad,
ops::ElementwiseAddGradMLUKernel<float>,
ops::ElementwiseAddGradMLUKernel<plat::float16>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <memory>
#include <string>
#include "paddle/fluid/operators/elementwise/elementwise_div_op.h"
#include "paddle/fluid/operators/elementwise/elementwise_mlu.h"
namespace paddle {
namespace operators {
template <typename T>
class ElementwiseDivMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
MLUBinaryOp<DIV, T>(ctx);
}
};
template <typename T>
class ElementwiseDivGradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* out = ctx.Input<phi::DenseTensor>("Out");
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* y = ctx.Input<phi::DenseTensor>("Y");
auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<phi::DenseTensor>(framework::GradVarName("Y"));
int axis = ctx.Attr<int>("axis");
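// For out = x / y: dx = dout / y and dy = -out * dout / y. Both branches reuse
// dout_div_y = dout / y computed once below, then reduce-sum over broadcast
// axes whenever dx or dy is smaller than dout.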
const auto& x_dims = x->dims();
const auto& y_dims = y->dims();
axis = (axis < 0 ? (std::abs(x_dims.size() - y_dims.size()) + axis + 1)
: axis);
int max_dim = std::max(x_dims.size(), y_dims.size());
std::vector<int> x_dims_array(max_dim);
std::vector<int> y_dims_array(max_dim);
std::vector<int> out_dims_array(max_dim);
GetBroadcastDimsArrays(x_dims,
y_dims,
x_dims_array.data(),
y_dims_array.data(),
out_dims_array.data(),
max_dim,
axis);
MLUCnnlTensorDesc x_desc(max_dim, x_dims_array.data(), ToCnnlDataType<T>());
MLUCnnlTensorDesc y_desc(max_dim, y_dims_array.data(), ToCnnlDataType<T>());
MLUCnnlTensorDesc dout_desc(*dout);
MLUCnnlOpTensorDesc mul_op_desc(
CNNL_OP_TENSOR_MUL, ToCnnlDataType<T>(), CNNL_NOT_PROPAGATE_NAN);
// compute dout/y == 1/y * dout
phi::DenseTensor dout_div_y(dout->dtype());
dout_div_y.Resize(dout->dims());
dout_div_y.mutable_data<T>(ctx.GetPlace());
MLUBinary<DIV>(ctx,
CNNL_COMPUTATION_HIGH_PRECISION,
dout_desc.get(),
GetBasePtr(dout),
y_desc.get(),
GetBasePtr(y),
dout_desc.get(),
GetBasePtr(&dout_div_y));
if (dx) {
// compute dx = dout/y = 1/y * dout
if (dx->dims() != dout->dims()) {
dx->mutable_data<T>(ctx.GetPlace());
std::vector<int> reduce_axes;
GetReduceAxes(axis, dout_div_y.dims(), dx->dims(), &reduce_axes);
MLUCnnlReduceDesc reduction_desc(reduce_axes,
CNNL_REDUCE_ADD,
ToCnnlDataType<T>(),
CNNL_NOT_PROPAGATE_NAN,
CNNL_REDUCE_NO_INDICES,
CNNL_32BIT_INDICES);
MLUCnnlTensorDesc dx_desc(*dx);
MLUCnnl::Reduce(ctx,
true /*need_workspace*/,
reduction_desc.get(),
nullptr,
dout_desc.get(),
GetBasePtr(&dout_div_y),
0,
nullptr,
nullptr,
dx_desc.get(),
GetBasePtr(dx));
} else {
dx->ShareDataWith(dout_div_y);
}
}
if (dy) {
// compute dy = -out * (dout/y) = -out/y * dout
phi::DenseTensor neg_out(out->type());
neg_out.mutable_data<T>(out->dims(), ctx.GetPlace());
MLUCnnlTensorDesc out_desc(*out);
MLUUnary<NEG>(ctx,
CNNL_COMPUTATION_HIGH_PRECISION,
out_desc.get(),
GetBasePtr(out),
out_desc.get(),
GetBasePtr(&neg_out));
phi::DenseTensor dy_temp(y->dtype());
dy_temp.Resize(dout->dims());
dy_temp.mutable_data<T>(ctx.GetPlace());
MLUCnnl::OpTensor(ctx,
mul_op_desc.get(),
dout_desc.get(),
GetBasePtr(&neg_out),
dout_desc.get(),
GetBasePtr(&dout_div_y),
dout_desc.get(),
GetBasePtr(&dy_temp),
ToCnnlDataType<T>());
if (dy->dims() != dout->dims()) {
dy->mutable_data<T>(ctx.GetPlace());
std::vector<int> reduce_axes;
GetReduceAxes(axis, dy_temp.dims(), dy->dims(), &reduce_axes);
MLUCnnlReduceDesc reduction_desc(reduce_axes,
CNNL_REDUCE_ADD,
ToCnnlDataType<T>(),
CNNL_NOT_PROPAGATE_NAN,
CNNL_REDUCE_NO_INDICES,
CNNL_32BIT_INDICES);
MLUCnnlTensorDesc dy_desc(*dy);
MLUCnnl::Reduce(ctx,
true /*need_workspace*/,
reduction_desc.get(),
nullptr,
dout_desc.get(),
GetBasePtr(&dy_temp),
0,
nullptr,
nullptr,
dy_desc.get(),
GetBasePtr(dy));
} else {
dy->ShareDataWith(dy_temp);
}
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(elementwise_div,
ops::ElementwiseDivMLUKernel<int>,
ops::ElementwiseDivMLUKernel<float>,
ops::ElementwiseDivMLUKernel<plat::float16>);
REGISTER_OP_MLU_KERNEL(elementwise_div_grad,
ops::ElementwiseDivGradMLUKernel<int>,
ops::ElementwiseDivGradMLUKernel<float>,
ops::ElementwiseDivGradMLUKernel<plat::float16>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/operators/elementwise/elementwise_mlu.h"
#include "paddle/fluid/operators/elementwise/elementwise_op.h"
namespace paddle {
namespace operators {
template <typename T>
class ElementwiseMaxMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
MLUBinaryOp<MAXIMUM, T>(ctx);
}
};
template <typename T>
class ElementwiseMaxGradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
MLUMinMaxGradHelper<MAXIMUM_GRAD, T>(ctx);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_MLU_KERNEL(elementwise_max,
ops::ElementwiseMaxMLUKernel<int>,
ops::ElementwiseMaxMLUKernel<float>,
ops::ElementwiseMaxMLUKernel<paddle::platform::float16>);
REGISTER_OP_MLU_KERNEL(
elementwise_max_grad,
ops::ElementwiseMaxGradMLUKernel<int>,
ops::ElementwiseMaxGradMLUKernel<float>,
ops::ElementwiseMaxGradMLUKernel<paddle::platform::float16>);
#endif
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <memory>
#include <string>
#include "paddle/fluid/operators/elementwise/elementwise_mlu.h"
namespace paddle {
namespace operators {
template <typename T>
class ElementwiseMinMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
MLUBinaryOp<MINIMUM, T>(ctx);
}
};
template <typename T>
class ElementwiseMinGradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
MLUMinMaxGradHelper<MINIMUM_GRAD, T>(ctx);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(elementwise_min,
ops::ElementwiseMinMLUKernel<int>,
ops::ElementwiseMinMLUKernel<float>,
ops::ElementwiseMinMLUKernel<plat::float16>);
REGISTER_OP_MLU_KERNEL(elementwise_min_grad,
ops::ElementwiseMinGradMLUKernel<int>,
ops::ElementwiseMinGradMLUKernel<float>,
ops::ElementwiseMinGradMLUKernel<plat::float16>);
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#ifdef PADDLE_WITH_MLU
#include <vector>
#include "paddle/fluid/operators/elementwise/elementwise_op.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
inline void GetReduceAxes(const int axis,
const framework::DDim& src_ddims,
const framework::DDim& target_ddims,
std::vector<int>* axes) {
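// Collect the axes of src_ddims that were broadcast relative to target_ddims,
// with target_ddims aligned to src_ddims starting at `axis`.
// Hypothetical example: src = [2, 3, 4], target = [3, 1], axis = 1
// -> axes = {0, 2} (the leading dim plus the dim broadcast from 1 to 4).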
int64_t src_dim_size = src_ddims.size();
int64_t target_dim_size = target_ddims.size();
for (int64_t i = 0; i < src_dim_size; ++i) {
if (i < axis || i >= target_dim_size + axis) {
axes->push_back(i);
continue;
}
if (src_ddims[i] > target_ddims[i - axis]) {
axes->push_back(i);
}
}
}
inline void GetReduceAxesAndDstDims(const int axis,
const framework::DDim& src_ddims,
const framework::DDim& target_ddims,
std::vector<int>* reduce_axes,
std::vector<int>* dst_dims_vec) {
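// Same idea as GetReduceAxes, but also returns the destination dims with the
// reduced axes removed. Hypothetical example: src = [2, 3, 4], target = [3, 1],
// axis = 1 -> reduce_axes = {0, 2}, dst_dims_vec = {3}.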
int64_t src_dim_size = src_ddims.size();
int64_t target_dim_size = target_ddims.size();
int src_axis = (target_dim_size < src_dim_size ? axis : 0);
for (int ax = 0; ax < src_dim_size; ++ax) {
if ((ax < src_axis || ax >= src_axis + target_dim_size) ||
(src_ddims[ax] > 1 && target_ddims[ax - src_axis] == 1)) {
reduce_axes->push_back(ax);
} else {
dst_dims_vec->push_back(src_ddims[ax]);
}
}
if (dst_dims_vec->size() == 0) {
// target_var is scalar
dst_dims_vec->push_back(1);
}
}
template <typename T>
void MLUOpTensorKernel(const framework::ExecutionContext& ctx,
const cnnlOpTensorDesc_t op_tensor_op) {
PADDLE_ENFORCE_EQ(
platform::is_mlu_place(ctx.GetPlace()),
true,
platform::errors::Unavailable("This kernel only runs on MLU."));
PADDLE_ENFORCE_EQ((op_tensor_op == CNNL_OP_TENSOR_ADD) ||
(op_tensor_op == CNNL_OP_TENSOR_SUB) ||
(op_tensor_op == CNNL_OP_TENSOR_MUL),
true,
platform::errors::Unavailable(
"This kernel of MLU only support ADD, SUB, MUL."));
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* y = ctx.Input<phi::DenseTensor>("Y");
auto* out = ctx.Output<phi::DenseTensor>("Out");
out->mutable_data<T>(ctx.GetPlace());
int axis = ctx.Attr<int>("axis");
const auto& x_dims = x->dims();
const auto& y_dims = y->dims();
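// A negative axis counts from the trailing dimensions; map it to the offset at
// which y's dims align with x's before building the broadcast shapes.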
axis =
(axis < 0 ? (std::abs(x_dims.size() - y_dims.size()) + axis + 1) : axis);
int max_dim = std::max(x_dims.size(), y_dims.size());
std::vector<int> x_dims_array(max_dim);
std::vector<int> y_dims_array(max_dim);
std::vector<int> out_dims_array(max_dim);
GetBroadcastDimsArrays(x_dims,
y_dims,
x_dims_array.data(),
y_dims_array.data(),
out_dims_array.data(),
max_dim,
axis);
MLUCnnlTensorDesc x_desc(max_dim, x_dims_array.data(), ToCnnlDataType<T>());
MLUCnnlTensorDesc y_desc(max_dim, y_dims_array.data(), ToCnnlDataType<T>());
MLUCnnlTensorDesc out_desc(*out);
MLUCnnlOpTensorDesc op_tensor_desc(
op_tensor_op, ToCnnlDataType<T>(), CNNL_NOT_PROPAGATE_NAN);
MLUCnnl::OpTensor(ctx,
op_tensor_desc.get(),
x_desc.get(),
GetBasePtr(x),
y_desc.get(),
GetBasePtr(y),
out_desc.get(),
GetBasePtr(out),
ToCnnlDataType<T>());
}
// ------------------ BinaryOp -----------------
enum BINARY_FUNCTOR {
DIV,
DIVNONAN,
MAXIMUM,
MINIMUM,
POW,
};
template <BINARY_FUNCTOR func>
void MLUBinary(const framework::ExecutionContext& ctx,
cnnlComputationPreference_t prefer,
const cnnlTensorDescriptor_t x_desc,
const void* x,
const cnnlTensorDescriptor_t y_desc,
const void* y,
const cnnlTensorDescriptor_t out_desc,
void* out);
template <>
inline void MLUBinary<DIV>(const framework::ExecutionContext& ctx,
cnnlComputationPreference_t prefer,
const cnnlTensorDescriptor_t x_desc,
const void* x,
const cnnlTensorDescriptor_t y_desc,
const void* y,
const cnnlTensorDescriptor_t out_desc,
void* out) {
MLUCnnl::Div(ctx, prefer, x_desc, x, y_desc, y, out_desc, out);
}
template <>
inline void MLUBinary<MAXIMUM>(
const framework::ExecutionContext& ctx,
cnnlComputationPreference_t prefer, // unused, kept for a uniform interface
const cnnlTensorDescriptor_t x_desc,
const void* x,
const cnnlTensorDescriptor_t y_desc,
const void* y,
const cnnlTensorDescriptor_t out_desc,
void* out) {
MLUCnnl::Maximum(ctx, x_desc, x, y_desc, y, out_desc, out);
}
template <>
inline void MLUBinary<MINIMUM>(const framework::ExecutionContext& ctx,
cnnlComputationPreference_t prefer,
const cnnlTensorDescriptor_t in1_desc,
const void* in1,
const cnnlTensorDescriptor_t in2_desc,
const void* in2,
const cnnlTensorDescriptor_t out_desc,
void* out) {
MLUCnnl::Minimum(ctx, in1_desc, in1, in2_desc, in2, out_desc, out);
}
template <>
inline void MLUBinary<POW>(const framework::ExecutionContext& ctx,
cnnlComputationPreference_t prefer,
const cnnlTensorDescriptor_t x_desc,
const void* x,
const cnnlTensorDescriptor_t y_desc,
const void* y,
const cnnlTensorDescriptor_t out_desc,
void* out) {
MLUCnnl::Pow(ctx, prefer, x_desc, x, y_desc, y, out_desc, out);
}
template <BINARY_FUNCTOR Functor, typename T>
void MLUBinaryOp(const framework::ExecutionContext& ctx) {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* y = ctx.Input<phi::DenseTensor>("Y");
auto* out = ctx.Output<phi::DenseTensor>("Out");
out->mutable_data<T>(ctx.GetPlace());
int axis = ctx.Attr<int>("axis");
const auto& x_dims = x->dims();
const auto& y_dims = y->dims();
axis =
(axis < 0 ? (std::abs(x_dims.size() - y_dims.size()) + axis + 1) : axis);
int max_dim = std::max(x_dims.size(), y_dims.size());
std::vector<int> x_dims_array(max_dim);
std::vector<int> y_dims_array(max_dim);
std::vector<int> out_dims_array(max_dim);
GetBroadcastDimsArrays(x_dims,
y_dims,
x_dims_array.data(),
y_dims_array.data(),
out_dims_array.data(),
max_dim,
axis);
MLUCnnlTensorDesc x_desc(max_dim, x_dims_array.data(), ToCnnlDataType<T>());
MLUCnnlTensorDesc y_desc(max_dim, y_dims_array.data(), ToCnnlDataType<T>());
MLUCnnlTensorDesc out_desc(*out, CNNL_LAYOUT_ARRAY, ToCnnlDataType<T>());
cnnlComputationPreference_t prefer_type = CNNL_COMPUTATION_HIGH_PRECISION;
MLUBinary<Functor>(ctx,
prefer_type,
x_desc.get(),
GetBasePtr(x),
y_desc.get(),
GetBasePtr(y),
out_desc.get(),
GetBasePtr(out));
}
// ------------------ UnaryOp -----------------
enum UNARY_FUNCTOR {
NEG,
RECIPROCAL,
};
template <UNARY_FUNCTOR func>
void MLUUnary(const framework::ExecutionContext& ctx,
cnnlComputationPreference_t prefer,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
template <>
inline void MLUUnary<NEG>(const framework::ExecutionContext& ctx,
cnnlComputationPreference_t prefer,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output) {
MLUCnnl::Neg(ctx, input_desc, input, output_desc, output);
}
template <>
inline void MLUUnary<RECIPROCAL>(const framework::ExecutionContext& ctx,
cnnlComputationPreference_t prefer,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output) {
MLUCnnl::Reciprocal(ctx, input_desc, input, output_desc, output);
}
template <UNARY_FUNCTOR Functor, typename Tin, typename Tout = Tin>
void MLUUnaryOp(const framework::ExecutionContext& ctx) {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* out = ctx.Output<phi::DenseTensor>("Out");
out->mutable_data<Tout>(ctx.GetPlace());
MLUCnnlTensorDesc x_desc(x, CNNL_LAYOUT_ARRAY, ToCnnlDataType<Tin>());
MLUCnnlTensorDesc out_desc(*out, CNNL_LAYOUT_ARRAY, ToCnnlDataType<Tout>());
cnnlComputationPreference_t prefer_type = CNNL_COMPUTATION_HIGH_PRECISION;
MLUUnary<Functor>(ctx,
prefer_type,
x_desc.get(),
GetBasePtr(x),
out_desc.get(),
GetBasePtr(out));
}
// ------------------ MLUElementwiseGradOp -----------------
enum MINMAX_GRAD_FUNCTOR {
MAXIMUM_GRAD,
MINIMUM_GRAD,
};
template <MINMAX_GRAD_FUNCTOR Functor, typename Tin, typename Tout = Tin>
void MLUMinMaxGradHelper(const framework::ExecutionContext& ctx) {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* y = ctx.Input<phi::DenseTensor>("Y");
auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<phi::DenseTensor>(framework::GradVarName("Y"));
int axis = ctx.Attr<int>("axis");
const auto& x_dims = x->dims();
const auto& y_dims = y->dims();
axis =
(axis < 0 ? (std::abs(x_dims.size() - y_dims.size()) + axis + 1) : axis);
int max_dim = std::max(x_dims.size(), y_dims.size());
std::vector<int> x_dims_array(max_dim);
std::vector<int> y_dims_array(max_dim);
std::vector<int> out_dims_array(max_dim);
GetBroadcastDimsArrays(x_dims,
y_dims,
x_dims_array.data(),
y_dims_array.data(),
out_dims_array.data(),
max_dim,
axis);
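// Gradient scheme: for z = max(x, y), dz/dx is 1 where x >= y and 0 elsewhere
// (x <= y for min). A 0/1 mask is built with a logic op, then dx = dout * mask
// and dy = dout - dx; broadcast axes are reduce-summed afterwards when dx or
// dy has fewer elements than dout.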
// mask = Logic(x, y), using GE for the max grad and LE for the min grad
cnnlLogicOp_t logic =
Functor == MAXIMUM_GRAD ? CNNL_LOGIC_OP_GE : CNNL_LOGIC_OP_LE;
phi::DenseTensor mask(x->dtype());
mask.Resize(phi::make_ddim(out_dims_array));
mask.mutable_data<Tin>(ctx.GetPlace());
cnnlDataType_t data_type = ToCnnlDataType<Tin>();
MLUCnnlTensorDesc x_desc(max_dim, x_dims_array.data(), data_type);
MLUCnnlTensorDesc y_desc(max_dim, y_dims_array.data(), data_type);
MLUCnnlTensorDesc mask_desc(max_dim, out_dims_array.data(), data_type);
MLUCnnl::Logic(ctx,
logic,
x_desc.get(),
GetBasePtr(x),
y_desc.get(),
GetBasePtr(y),
mask_desc.get(),
GetBasePtr(&mask));
// dx = Mul(dz, mask)
phi::DenseTensor dx_temp(x->dtype());
dx_temp.Resize(dout->dims());
dx_temp.mutable_data<Tout>(ctx.GetPlace());
MLUCnnlTensorDesc dout_desc(*dout);
MLUCnnlOpTensorDesc mul_op_desc(
CNNL_OP_TENSOR_MUL, data_type, CNNL_NOT_PROPAGATE_NAN);
MLUCnnl::OpTensor(ctx,
mul_op_desc.get(),
dout_desc.get(),
GetBasePtr(dout),
dout_desc.get(),
GetBasePtr(&mask),
dout_desc.get(),
GetBasePtr(&dx_temp),
data_type);
// dy = Sub(dz, dx)
phi::DenseTensor dy_temp(y->dtype());
dy_temp.Resize(dout->dims());
dy_temp.mutable_data<Tout>(ctx.GetPlace());
MLUCnnlOpTensorDesc sub_op_desc(
CNNL_OP_TENSOR_SUB, data_type, CNNL_NOT_PROPAGATE_NAN);
MLUCnnl::OpTensor(ctx,
sub_op_desc.get(),
dout_desc.get(),
GetBasePtr(dout),
dout_desc.get(),
GetBasePtr(&dx_temp),
dout_desc.get(),
GetBasePtr(&dy_temp),
data_type);
if (dx) {
if (dx->dims() != dout->dims()) {
dx->mutable_data<Tout>(ctx.GetPlace());
std::vector<int> reduce_axes;
GetReduceAxes(axis, dx_temp.dims(), dx->dims(), &reduce_axes);
MLUCnnlReduceDesc reduction_desc(reduce_axes,
CNNL_REDUCE_ADD,
data_type,
CNNL_NOT_PROPAGATE_NAN,
CNNL_REDUCE_NO_INDICES,
CNNL_32BIT_INDICES);
MLUCnnlTensorDesc dx_desc(*dx);
MLUCnnl::Reduce(ctx,
true /*need_workspace*/,
reduction_desc.get(),
nullptr,
dout_desc.get(),
GetBasePtr(&dx_temp),
0,
nullptr,
nullptr,
dx_desc.get(),
GetBasePtr(dx));
} else {
dx->ShareDataWith(dx_temp);
}
}
if (dy) {
if (dy->dims() != dout->dims()) {
dy->mutable_data<Tout>(ctx.GetPlace());
std::vector<int> reduce_axes;
GetReduceAxes(axis, dy_temp.dims(), dy->dims(), &reduce_axes);
MLUCnnlReduceDesc reduction_desc(reduce_axes,
CNNL_REDUCE_ADD,
data_type,
CNNL_NOT_PROPAGATE_NAN,
CNNL_REDUCE_NO_INDICES,
CNNL_32BIT_INDICES);
MLUCnnlTensorDesc dy_desc(*dy);
MLUCnnl::Reduce(ctx,
true /*need_workspace*/,
reduction_desc.get(),
nullptr,
dout_desc.get(),
GetBasePtr(&dy_temp),
0,
nullptr,
nullptr,
dy_desc.get(),
GetBasePtr(dy));
} else {
dy->ShareDataWith(dy_temp);
}
}
}
} // namespace operators
} // namespace paddle
#endif
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/elementwise/elementwise_mlu.h"
namespace paddle {
namespace operators {
using MLUDeviceContext = platform::MLUDeviceContext;
template <typename T>
class ElementwiseMulMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
MLUOpTensorKernel<T>(ctx, CNNL_OP_TENSOR_MUL);
}
};
template <typename T>
class ElementwiseMulGradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* y = ctx.Input<phi::DenseTensor>("Y");
auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<phi::DenseTensor>(framework::GradVarName("Y"));
int axis = ctx.Attr<int>("axis");
const auto& x_dims = x->dims();
const auto& y_dims = y->dims();
axis = (axis < 0 ? (std::abs(x_dims.size() - y_dims.size()) + axis + 1)
: axis);
int max_dim = std::max(x_dims.size(), y_dims.size());
std::vector<int> x_dims_array(max_dim);
std::vector<int> y_dims_array(max_dim);
std::vector<int> out_dims_array(max_dim);
GetBroadcastDimsArrays(x_dims,
y_dims,
x_dims_array.data(),
y_dims_array.data(),
out_dims_array.data(),
max_dim,
axis);
MLUCnnlTensorDesc x_desc(max_dim, x_dims_array.data(), ToCnnlDataType<T>());
MLUCnnlTensorDesc y_desc(max_dim, y_dims_array.data(), ToCnnlDataType<T>());
MLUCnnlTensorDesc dout_desc(*dout);
MLUCnnlOpTensorDesc mul_op_desc(
CNNL_OP_TENSOR_MUL, ToCnnlDataType<T>(), CNNL_NOT_PROPAGATE_NAN);
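// For out = x * y the gradients are dx = dout * y and dy = dout * x. When an
// input was broadcast, the product is first formed at dout's shape and then
// reduce-summed over the broadcast axes.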
if (dx) {
dx->mutable_data<T>(ctx.GetPlace());
if (dx->dims() == dout->dims()) {
MLUCnnl::OpTensor(ctx,
mul_op_desc.get(),
dout_desc.get(),
GetBasePtr(dout),
y_desc.get(),
GetBasePtr(y),
x_desc.get(),
GetBasePtr(dx),
ToCnnlDataType<T>());
} else {
phi::DenseTensor dx_temp(x->dtype());
dx_temp.Resize(dout->dims());
dx_temp.mutable_data<T>(ctx.GetPlace());
MLUCnnl::OpTensor(ctx,
mul_op_desc.get(),
dout_desc.get(),
GetBasePtr(dout),
y_desc.get(),
GetBasePtr(y),
dout_desc.get(),
GetBasePtr(&dx_temp),
ToCnnlDataType<T>());
std::vector<int> reduce_axes;
GetReduceAxes(axis, dx_temp.dims(), dx->dims(), &reduce_axes);
MLUCnnlReduceDesc reduction_desc(reduce_axes,
CNNL_REDUCE_ADD,
ToCnnlDataType<T>(),
CNNL_NOT_PROPAGATE_NAN,
CNNL_REDUCE_NO_INDICES,
CNNL_32BIT_INDICES);
MLUCnnlTensorDesc dx_desc(*dx);
MLUCnnl::Reduce(ctx,
true /*need_workspace*/,
reduction_desc.get(),
nullptr,
dout_desc.get(),
GetBasePtr(&dx_temp),
0,
nullptr,
nullptr,
dx_desc.get(),
GetBasePtr(dx));
}
}
if (dy) {
dy->mutable_data<T>(ctx.GetPlace());
if (dy->dims() == dout->dims()) {
MLUCnnl::OpTensor(ctx,
mul_op_desc.get(),
dout_desc.get(),
GetBasePtr(dout),
x_desc.get(),
GetBasePtr(x),
y_desc.get(),
GetBasePtr(dy),
ToCnnlDataType<T>());
} else {
phi::DenseTensor dy_temp(y->dtype());
dy_temp.Resize(dout->dims());
dy_temp.mutable_data<T>(ctx.GetPlace());
MLUCnnl::OpTensor(ctx,
mul_op_desc.get(),
dout_desc.get(),
GetBasePtr(dout),
x_desc.get(),
GetBasePtr(x),
dout_desc.get(),
GetBasePtr(&dy_temp),
ToCnnlDataType<T>());
std::vector<int> reduce_axes;
GetReduceAxes(axis, dy_temp.dims(), dy->dims(), &reduce_axes);
MLUCnnlReduceDesc reduction_desc(reduce_axes,
CNNL_REDUCE_ADD,
ToCnnlDataType<T>(),
CNNL_NOT_PROPAGATE_NAN,
CNNL_REDUCE_NO_INDICES,
CNNL_32BIT_INDICES);
MLUCnnlTensorDesc dy_desc(*dy);
MLUCnnl::Reduce(ctx,
true /*need_workspace*/,
reduction_desc.get(),
nullptr,
dout_desc.get(),
GetBasePtr(&dy_temp),
0,
nullptr,
nullptr,
dy_desc.get(),
GetBasePtr(dy));
}
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_MLU_KERNEL(elementwise_mul,
ops::ElementwiseMulMLUKernel<float>,
ops::ElementwiseMulMLUKernel<paddle::platform::float16>,
ops::ElementwiseMulMLUKernel<int>);
REGISTER_OP_MLU_KERNEL(
elementwise_mul_grad,
ops::ElementwiseMulGradMLUKernel<float>,
ops::ElementwiseMulGradMLUKernel<paddle::platform::float16>,
ops::ElementwiseMulGradMLUKernel<int>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/elementwise/elementwise_mlu.h"
#include "paddle/fluid/operators/elementwise/elementwise_op.h"
namespace paddle {
namespace operators {
template <typename T>
class ElementwisePowMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
MLUBinaryOp<POW, T>(ctx);
}
};
template <typename T>
class ElementwisePowGradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* y = ctx.Input<phi::DenseTensor>("Y");
auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<phi::DenseTensor>(framework::GradVarName("Y"));
int axis = ctx.Attr<int>("axis");
auto place = ctx.GetPlace();
auto x_dims = x->dims();
auto y_dims = y->dims();
axis =
(axis < 0 ? std::abs(x_dims.size() - y_dims.size()) + axis + 1 : axis);
int max_dim = std::max(x_dims.size(), y_dims.size());
std::vector<int> x_dims_array(max_dim);
std::vector<int> y_dims_array(max_dim);
std::vector<int> out_dims_array(max_dim);
GetBroadcastDimsArrays(x_dims,
y_dims,
x_dims_array.data(),
y_dims_array.data(),
out_dims_array.data(),
max_dim,
axis);
cnnlDataType_t data_type = ToCnnlDataType<T>();
MLUCnnlTensorDesc x_desc(max_dim, x_dims_array.data(), data_type);
MLUCnnlTensorDesc y_desc(max_dim, y_dims_array.data(), data_type);
MLUCnnlTensorDesc out_desc(max_dim, out_dims_array.data(), data_type);
auto dout_dims = dout->dims();
if (dx) {
// dx = dout * y * pow(x, y - 1);
phi::DenseTensor one_dx(y->type());
one_dx.mutable_data<T>(phi::make_ddim(y_dims_array), place);
FillMLUTensorWithHostValue(ctx, static_cast<T>(1), &one_dx);
phi::DenseTensor sub_dx(y->type());
sub_dx.mutable_data<T>(phi::make_ddim(y_dims_array), place);
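// sub_dx = y - 1: a tensor of ones is subtracted from y, giving the exponent
// used by pow(x, y - 1) below.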
MLUCnnlOpTensorDesc op_tensor_desc(
CNNL_OP_TENSOR_SUB, data_type, CNNL_NOT_PROPAGATE_NAN);
MLUCnnl::OpTensor(ctx,
op_tensor_desc.get(),
y_desc.get(),
GetBasePtr(y),
y_desc.get(),
GetBasePtr(&one_dx),
y_desc.get(),
GetBasePtr(&sub_dx),
data_type);
phi::DenseTensor tmp_dx(x->type());
tmp_dx.mutable_data<T>(phi::make_ddim(out_dims_array), place);
MLUCnnl::Pow(ctx,
CNNL_COMPUTATION_HIGH_PRECISION,
x_desc.get(),
GetBasePtr(x),
y_desc.get(),
GetBasePtr(&sub_dx),
out_desc.get(),
GetBasePtr(&tmp_dx));
MLUCnnl::MulAx(ctx,
y_desc.get(),
GetBasePtr(y),
out_desc.get(),
GetBasePtr(&tmp_dx));
MLUCnnl::MulAx(ctx,
out_desc.get(),
GetBasePtr(dout),
out_desc.get(),
GetBasePtr(&tmp_dx));
if (x_dims != dout_dims) {
dx->mutable_data<T>(place);
std::vector<int> reduce_axes;
GetReduceAxes(axis, dout_dims, x_dims, &reduce_axes);
if (!reduce_axes.empty()) {
MLUCnnlReduceDesc reduction_desc(reduce_axes,
CNNL_REDUCE_ADD,
data_type,
CNNL_NOT_PROPAGATE_NAN,
CNNL_REDUCE_NO_INDICES,
CNNL_32BIT_INDICES);
MLUCnnlTensorDesc dx_desc(*dx);
MLUCnnl::Reduce(ctx,
true /*need_workspace*/,
reduction_desc.get(),
nullptr,
out_desc.get(),
GetBasePtr(&tmp_dx),
0,
nullptr,
nullptr,
dx_desc.get(),
GetBasePtr(dx));
}
} else {
dx->ShareDataWith(tmp_dx);
}
}
if (dy) {
// dy = dout * log(x) * pow(x, y)
phi::DenseTensor tmp_dy(y->type());
tmp_dy.mutable_data<T>(phi::make_ddim(out_dims_array), place);
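// tmp_dy starts as pow(x, y) and is then scaled by ln(x) and by dout through
// the MulAx calls below, producing dy before any broadcast reduction.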
MLUCnnl::Pow(ctx,
CNNL_COMPUTATION_HIGH_PRECISION,
x_desc.get(),
GetBasePtr(x),
y_desc.get(),
GetBasePtr(y),
out_desc.get(),
GetBasePtr(&tmp_dy));
phi::DenseTensor log_x(x->type());
log_x.mutable_data<T>(x->dims(), place);
MLUCnnl::Log(ctx,
CNNL_COMPUTATION_HIGH_PRECISION,
CNNL_LOG_E,
x_desc.get(),
GetBasePtr(x),
x_desc.get(),
GetBasePtr(&log_x));
MLUCnnl::MulAx(ctx,
x_desc.get(),
GetBasePtr(&log_x),
out_desc.get(),
GetBasePtr(&tmp_dy));
MLUCnnl::MulAx(ctx,
out_desc.get(),
GetBasePtr(dout),
out_desc.get(),
GetBasePtr(&tmp_dy));
if (y_dims != dout_dims) {
dy->mutable_data<T>(place);
std::vector<int> reduce_axes;
GetReduceAxes(axis, dout_dims, y_dims, &reduce_axes);
if (!reduce_axes.empty()) {
MLUCnnlReduceDesc reduction_desc(reduce_axes,
CNNL_REDUCE_ADD,
data_type,
CNNL_NOT_PROPAGATE_NAN,
CNNL_REDUCE_NO_INDICES,
CNNL_32BIT_INDICES);
MLUCnnlTensorDesc dy_desc(*dy);
MLUCnnl::Reduce(ctx,
true /*need_workspace*/,
reduction_desc.get(),
nullptr,
out_desc.get(),
GetBasePtr(&tmp_dy),
0,
nullptr,
nullptr,
dy_desc.get(),
GetBasePtr(dy));
}
} else {
dy->ShareDataWith(tmp_dy);
}
}
if (!dx && !dy) {
PADDLE_THROW(platform::errors::Unavailable(
"Not support all outputs to be empty."));
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(elementwise_pow,
ops::ElementwisePowMLUKernel<plat::float16>,
ops::ElementwisePowMLUKernel<float>);
REGISTER_OP_MLU_KERNEL(elementwise_pow_grad,
ops::ElementwisePowGradMLUKernel<plat::float16>,
ops::ElementwisePowGradMLUKernel<float>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <memory>
#include <string>
#include "paddle/fluid/operators/elementwise/elementwise_mlu.h"
namespace paddle {
namespace operators {
template <typename T>
class ElementwiseSubMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
MLUOpTensorKernel<T>(ctx, CNNL_OP_TENSOR_SUB);
}
};
template <typename T>
class ElementwiseSubGradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto& dev_ctx =
ctx.template device_context<paddle::platform::MLUDeviceContext>();
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* y = ctx.Input<phi::DenseTensor>("Y");
auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<phi::DenseTensor>(framework::GradVarName("Y"));
int axis = ctx.Attr<int>("axis");
axis = (axis == -1 ? std::abs(x->dims().size() - y->dims().size()) : axis);
MLUCnnlTensorDesc dout_desc(*dout);
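// For out = x - y: dx = dout and dy = -dout; a broadcast input first has dout
// reduce-summed over the broadcast axes, then copied (for dx) or negated
// (for dy).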
if (dx) {
dx->mutable_data<T>(ctx.GetPlace());
if (dx->dims() != dout->dims()) {
std::vector<int> dst_dims_vec;
std::vector<int> reduce_axes;
GetReduceAxesAndDstDims(
axis, dout->dims(), dx->dims(), &reduce_axes, &dst_dims_vec);
MLUCnnlReduceDesc reduction_desc(reduce_axes,
CNNL_REDUCE_ADD,
ToCnnlDataType<T>(),
CNNL_NOT_PROPAGATE_NAN,
CNNL_REDUCE_NO_INDICES,
CNNL_32BIT_INDICES);
MLUCnnlTensorDesc dx_desc(
dst_dims_vec.size(), dst_dims_vec.data(), ToCnnlDataType<T>());
MLUCnnl::Reduce(ctx,
true /*need_workspace*/,
reduction_desc.get(),
nullptr,
dout_desc.get(),
GetBasePtr(dout),
0,
nullptr,
nullptr,
dx_desc.get(),
GetBasePtr(dx));
} else {
framework::TensorCopy(*dout, ctx.GetPlace(), dev_ctx, dx);
}
}
if (dy) {
dy->mutable_data<T>(ctx.GetPlace());
phi::DenseTensor* tmp_dout = const_cast<phi::DenseTensor*>(dout);
if (dy->dims() != dout->dims()) {
std::vector<int> dst_dims_vec;
std::vector<int> reduce_axes;
GetReduceAxesAndDstDims(
axis, dout->dims(), dy->dims(), &reduce_axes, &dst_dims_vec);
MLUCnnlReduceDesc reduction_desc(reduce_axes,
CNNL_REDUCE_ADD,
ToCnnlDataType<T>(),
CNNL_NOT_PROPAGATE_NAN,
CNNL_REDUCE_NO_INDICES,
CNNL_32BIT_INDICES);
MLUCnnlTensorDesc dy_desc(
dst_dims_vec.size(), dst_dims_vec.data(), ToCnnlDataType<T>());
MLUCnnl::Reduce(ctx,
true /*need_workspace*/,
reduction_desc.get(),
nullptr,
dout_desc.get(),
GetBasePtr(dout),
0,
nullptr,
nullptr,
dy_desc.get(),
GetBasePtr(dy));
tmp_dout = dy;
}
// negate: dy = -tmp_dout, where tmp_dout is dout or the already-reduced dout
// when the shapes differ
MLUCnnlTensorDesc tmp_dout_desc(*tmp_dout);
MLUCnnlTensorDesc dy_desc(*dy);
MLUUnary<NEG>(ctx,
CNNL_COMPUTATION_HIGH_PRECISION,
tmp_dout_desc.get(),
GetBasePtr(tmp_dout),
dy_desc.get(),
GetBasePtr(dy));
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(elementwise_sub,
ops::ElementwiseSubMLUKernel<int>,
ops::ElementwiseSubMLUKernel<float>,
ops::ElementwiseSubMLUKernel<plat::float16>);
REGISTER_OP_MLU_KERNEL(elementwise_sub_grad,
ops::ElementwiseSubGradMLUKernel<int>,
ops::ElementwiseSubGradMLUKernel<float>,
ops::ElementwiseSubGradMLUKernel<plat::float16>);
...@@ -43,13 +43,6 @@ inline std::vector<int> get_expand_shape( ...@@ -43,13 +43,6 @@ inline std::vector<int> get_expand_shape(
*shape_tensor, platform::CPUPlace(), &cpu_shape_tensor); *shape_tensor, platform::CPUPlace(), &cpu_shape_tensor);
shape_data = cpu_shape_tensor.data<int>(); shape_data = cpu_shape_tensor.data<int>();
} }
#endif
#ifdef PADDLE_WITH_MLU
if (platform::is_mlu_place(shape_tensor->place())) {
paddle::framework::TensorCopySync(
*shape_tensor, platform::CPUPlace(), &cpu_shape_tensor);
shape_data = cpu_shape_tensor.data<int>();
}
#endif #endif
auto vec_shape = auto vec_shape =
std::vector<int>(shape_data, shape_data + shape_tensor->numel()); std::vector<int>(shape_data, shape_data + shape_tensor->numel());
...@@ -74,13 +67,6 @@ inline std::vector<int> get_expand_shape( ...@@ -74,13 +67,6 @@ inline std::vector<int> get_expand_shape(
paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), &temp); paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), &temp);
vec_epxand_shape.push_back(*temp.data<int32_t>()); vec_epxand_shape.push_back(*temp.data<int32_t>());
} }
#endif
#ifdef PADDLE_WITH_MLU
else if (platform::is_mlu_place(tensor->place())) { // NOLINT
phi::DenseTensor temp;
paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), &temp);
vec_epxand_shape.push_back(*temp.data<int32_t>());
}
#endif #endif
else { // NOLINT else { // NOLINT
vec_epxand_shape.push_back(*tensor->data<int32_t>()); vec_epxand_shape.push_back(*tensor->data<int32_t>());
......
...@@ -6,11 +6,7 @@ if(WITH_XPU) ...@@ -6,11 +6,7 @@ if(WITH_XPU)
endif() endif()
# please add new math_library in alphabetical order # please add new math_library in alphabetical order
if(WITH_MLU) math_library(concat_and_split DEPS concat_and_split_functor)
math_library(concat_and_split DEPS concat_and_split_functor mlu_baseop)
else()
math_library(concat_and_split DEPS concat_and_split_functor)
endif()
math_library(context_project DEPS im2col math_function) math_library(context_project DEPS im2col math_function)
math_library(cos_sim_functor) math_library(cos_sim_functor)
math_library(depthwise_conv) math_library(depthwise_conv)
......
...@@ -17,9 +17,6 @@ limitations under the License. */ ...@@ -17,9 +17,6 @@ limitations under the License. */
#include "paddle/phi/kernels/funcs/concat_and_split_functor.h" #include "paddle/phi/kernels/funcs/concat_and_split_functor.h"
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#endif
#include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/bfloat16.h"
#include "paddle/phi/common/float16.h" #include "paddle/phi/common/float16.h"
...@@ -181,100 +178,6 @@ class SplitFunctor<platform::XPUDeviceContext, T> { ...@@ -181,100 +178,6 @@ class SplitFunctor<platform::XPUDeviceContext, T> {
}; };
#endif #endif
#ifdef PADDLE_WITH_MLU
template <typename T>
class ConcatFunctor<platform::MLUDeviceContext, T> {
public:
void operator()(const platform::MLUDeviceContext& context,
const std::vector<phi::DenseTensor>& input,
int axis,
phi::DenseTensor* output) {
int dev_id = context.GetPlace().GetDeviceId();
platform::MLUDeviceGuard guard(dev_id);
auto ins_size = input.size();
const int axis_t = axis;
const int ins_size_t = ins_size;
// build a CNNL tensor descriptor and a raw data pointer for every input
std::vector<const void*> inputs;
std::vector<MLUCnnlTensorDesc> input_descs;
std::vector<cnnlTensorDescriptor_t> desc_vector;
for (size_t i = 0; i < ins_size; i++) {
input_descs.emplace_back(MLUCnnlTensorDesc(
input[i], CNNL_LAYOUT_ARRAY, ToCnnlDataType(input[i].dtype())));
desc_vector.push_back(input_descs.back().get());
inputs.push_back(input[i].data());
}
// init out tensors
MLUCnnlTensorDesc output_desc(
*output, CNNL_LAYOUT_ARRAY, ToCnnlDataType(output->dtype()));
// launch the CNNL concat
MLUCnnl::Concat(context,
ins_size_t,
axis_t,
desc_vector.data(),
inputs.data(),
output_desc.get(),
GetBasePtr(output));
}
};
template <typename T>
class SplitFunctor<platform::MLUDeviceContext, T> {
public:
void operator()(const platform::MLUDeviceContext& context,
const phi::DenseTensor& input,
const std::vector<const phi::DenseTensor*>& ref_inputs,
const int axis,
std::vector<phi::DenseTensor*>* outputs) {
if (input.numel() == 0) {
return;
}
int dev_id = context.GetPlace().GetDeviceId();
platform::MLUDeviceGuard guard(dev_id);
auto in_dims = input.dims();
auto out_size = outputs->size();
std::vector<framework::DDim> outs_dims(out_size, in_dims);
for (size_t i = 0; i < out_size; ++i) {
outs_dims[i][axis] = ref_inputs[i]->dims()[axis];
}
// init out tensors
std::vector<void*> vct_tensor;
std::vector<MLUCnnlTensorDesc> output_descs;
std::vector<cnnlTensorDescriptor_t> desc_vector;
for (size_t i = 0; i < out_size; i++) {
(*outputs)[i]->Resize(outs_dims[i]);
output_descs.emplace_back(
MLUCnnlTensorDesc(*(*outputs)[i],
CNNL_LAYOUT_ARRAY,
ToCnnlDataType((*outputs)[i]->dtype())));
desc_vector.push_back(output_descs.back().get());
vct_tensor.push_back(GetBasePtr((*outputs)[i]));
}
// init in tensors
MLUCnnlTensorDesc input_desc(
input, CNNL_LAYOUT_ARRAY, ToCnnlDataType(input.dtype()));
// launch the CNNL split
MLUCnnl::Split(context,
out_size,
axis,
input_desc.get(),
input.data(),
desc_vector.data(),
vct_tensor.data());
}
};
#endif
#define DEFINE_FUNCTOR(type) \ #define DEFINE_FUNCTOR(type) \
template class ConcatFunctor<phi::CPUContext, type>; \ template class ConcatFunctor<phi::CPUContext, type>; \
template class SplitFunctor<phi::CPUContext, type>; template class SplitFunctor<phi::CPUContext, type>;
...@@ -289,20 +192,6 @@ FOR_ALL_TYPES(DEFINE_FUNCTOR); ...@@ -289,20 +192,6 @@ FOR_ALL_TYPES(DEFINE_FUNCTOR);
DEFINE_XPU_FUNCTOR(float) DEFINE_XPU_FUNCTOR(float)
DEFINE_XPU_FUNCTOR(platform::float16) DEFINE_XPU_FUNCTOR(platform::float16)
#endif #endif
#ifdef PADDLE_WITH_MLU
#define DEFINE_MLU_FUNCTOR(type) \
template class ConcatFunctor<platform::MLUDeviceContext, type>; \
template class SplitFunctor<platform::MLUDeviceContext, type>;
DEFINE_MLU_FUNCTOR(float)
DEFINE_MLU_FUNCTOR(platform::float16)
DEFINE_MLU_FUNCTOR(int64_t)
DEFINE_MLU_FUNCTOR(bool)
DEFINE_MLU_FUNCTOR(int)
DEFINE_MLU_FUNCTOR(int8_t)
DEFINE_MLU_FUNCTOR(int16_t)
DEFINE_MLU_FUNCTOR(uint8_t)
#endif
} // namespace math } // namespace math
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
template <typename T>
class AccuracyMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* indices = ctx.Input<phi::DenseTensor>("Indices");
auto* label = ctx.Input<phi::DenseTensor>("Label");
auto* accuracy = ctx.Output<phi::DenseTensor>("Accuracy");
auto* correct = ctx.Output<phi::DenseTensor>("Correct");
auto* total = ctx.Output<phi::DenseTensor>("Total");
int num_samples = indices->dims()[0];
if (num_samples == 0) {
return;
}
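// Indices holds the top-k predictions of each sample; a sample is correct if
// any of its top-k entries equals its label:
//   correct  = sum_i( max_k( indices[i][k] == label[i] ) )
//   accuracy = correct / num_samples
// implemented below with a logic op, a max-reduce over k and a sum-reduce
// over the samples.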
// cast `indices` or `label` if their type is not INT32
phi::DenseTensor indices_int32(framework::TransToPhiDataType(VT::INT32));
phi::DenseTensor label_int32(framework::TransToPhiDataType(VT::INT32));
auto indices_type = framework::TransToProtoVarType(indices->type());
if (indices_type != VT::INT32) {
PADDLE_ENFORCE_EQ(MLUSupportsCast(indices_type, VT::INT32),
true,
platform::errors::Unimplemented(
"In accuracy mlu kernel, cast indices from [%s] to "
"[%s] is not supported.",
framework::DataTypeToString(indices_type),
framework::DataTypeToString(VT::INT32)));
indices_int32.Resize(indices->dims());
indices_int32.mutable_data<int>(ctx.GetPlace());
MLUCnnlTensorDesc org_indices_desc(*indices);
MLUCnnlTensorDesc indices_int32_desc(indices_int32);
cnnlCastDataType_t cast_type = GetCastDataType(indices_type, VT::INT32);
MLUCnnl::Cast(ctx,
cast_type,
org_indices_desc.get(),
GetBasePtr(indices),
indices_int32_desc.get(),
GetBasePtr(&indices_int32));
} else {
indices_int32.ShareDataWith(*indices);
}
auto label_type = framework::TransToProtoVarType(label->type());
if (label_type != VT::INT32) {
PADDLE_ENFORCE_EQ(
MLUSupportsCast(label_type, VT::INT32),
true,
platform::errors::Unimplemented(
"In accuracy mlu kernel, cast label from [%s] to [%s] "
"is not supported.",
framework::DataTypeToString(label_type),
framework::DataTypeToString(VT::INT32)));
label_int32.Resize(label->dims());
label_int32.mutable_data<int>(ctx.GetPlace());
MLUCnnlTensorDesc org_label_desc(*label);
MLUCnnlTensorDesc label_int32_desc(label_int32);
cnnlCastDataType_t cast_type = GetCastDataType(label_type, VT::INT32);
MLUCnnl::Cast(ctx,
cast_type,
org_label_desc.get(),
GetBasePtr(label),
label_int32_desc.get(),
GetBasePtr(&label_int32));
} else {
label_int32.ShareDataWith(*label);
}
// equal
MLUCnnlTensorDesc indices_int32_desc(indices_int32);
MLUCnnlTensorDesc label_int32_desc(label_int32);
phi::DenseTensor equal_tensor(framework::TransToPhiDataType(VT::BOOL));
equal_tensor.Resize(indices->dims());
equal_tensor.mutable_data<bool>(ctx.GetPlace());
MLUCnnlTensorDesc equal_tensor_desc(equal_tensor);
MLUCnnl::Logic(ctx,
CNNL_LOGIC_OP_EQ,
indices_int32_desc.get(),
GetBasePtr(&indices_int32),
label_int32_desc.get(),
GetBasePtr(&label_int32),
equal_tensor_desc.get(),
GetBasePtr(&equal_tensor));
// cast equal
phi::DenseTensor equal_fp32(framework::TransToPhiDataType(VT::FP32));
equal_fp32.Resize(indices->dims());
equal_fp32.mutable_data<float>(ctx.GetPlace());
MLUCnnlTensorDesc equal_fp32_desc(equal_fp32);
cnnlCastDataType_t equal_cast_type = GetCastDataType(VT::BOOL, VT::FP32);
MLUCnnl::Cast(ctx,
equal_cast_type,
equal_tensor_desc.get(),
GetBasePtr(&equal_tensor),
equal_fp32_desc.get(),
GetBasePtr(&equal_fp32));
// [correct]
// reduce_max
phi::DenseTensor correct_max(framework::TransToPhiDataType(VT::FP32));
correct_max.Resize(phi::make_ddim({num_samples}));
correct_max.mutable_data<float>(ctx.GetPlace());
MLUCnnlTensorDesc correct_max_desc(correct_max);
MLUCnnlReduceDesc reduce_max_desc({1},
CNNL_REDUCE_MAX,
ToCnnlDataType<float>(),
CNNL_NOT_PROPAGATE_NAN,
CNNL_REDUCE_NO_INDICES,
CNNL_32BIT_INDICES);
MLUCnnl::Reduce(ctx,
true /*need_workspace*/,
reduce_max_desc.get(),
nullptr,
equal_fp32_desc.get(),
GetBasePtr(&equal_fp32),
0 /*indices_size*/,
nullptr,
nullptr,
correct_max_desc.get(),
GetBasePtr(&correct_max));
// reduce_sum
phi::DenseTensor correct_sum(framework::TransToPhiDataType(VT::FP32));
correct_sum.Resize(correct->dims());
correct_sum.mutable_data<float>(ctx.GetPlace());
MLUCnnlTensorDesc correct_sum_desc(correct_sum);
MLUCnnlReduceDesc reduce_sum_desc({0},
CNNL_REDUCE_ADD,
ToCnnlDataType<float>(),
CNNL_NOT_PROPAGATE_NAN,
CNNL_REDUCE_NO_INDICES,
CNNL_32BIT_INDICES);
MLUCnnl::Reduce(ctx,
true /*need_workspace*/,
reduce_sum_desc.get(),
nullptr,
correct_max_desc.get(),
GetBasePtr(&correct_max),
0 /*indices_size*/,
nullptr,
nullptr,
correct_sum_desc.get(),
GetBasePtr(&correct_sum));
// cast to int
correct->mutable_data<int>(ctx.GetPlace());
MLUCnnlTensorDesc correct_desc(*correct);
cnnlCastDataType_t correct_cast_type = GetCastDataType(VT::FP32, VT::INT32);
MLUCnnl::Cast(ctx,
correct_cast_type,
correct_sum_desc.get(),
GetBasePtr(&correct_sum),
correct_desc.get(),
GetBasePtr(correct));
// [total]
total->mutable_data<int>(ctx.GetPlace());
MLUCnnlTensorDesc total_desc(*total);
MLUCnnl::Fill(ctx,
CNNL_POINTER_MODE_HOST,
&num_samples,
total_desc.get(),
GetBasePtr(total));
// use `total` of type `float32` for calculating accuracy
phi::DenseTensor total_fp32(framework::TransToPhiDataType(VT::FP32));
total_fp32.Resize(total->dims());
total_fp32.mutable_data<float>(ctx.GetPlace());
MLUCnnlTensorDesc total_fp32_desc(total_fp32);
float num_samples_fp32 = static_cast<float>(num_samples);
MLUCnnl::Fill(ctx,
CNNL_POINTER_MODE_HOST,
&num_samples_fp32,
total_fp32_desc.get(),
GetBasePtr(&total_fp32));
// [accuracy]
accuracy->mutable_data<float>(ctx.GetPlace());
MLUCnnlTensorDesc accuracy_desc(*accuracy);
MLUCnnl::Div(ctx,
CNNL_COMPUTATION_HIGH_PRECISION,
correct_sum_desc.get(),
GetBasePtr(&correct_sum),
total_fp32_desc.get(),
GetBasePtr(&total_fp32),
accuracy_desc.get(),
GetBasePtr(accuracy));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_MLU_KERNEL(accuracy,
ops::AccuracyMLUKernel<float>,
ops::AccuracyMLUKernel<paddle::platform::float16>,
ops::AccuracyMLUKernel<int16_t>,
ops::AccuracyMLUKernel<int64_t>,
ops::AccuracyMLUKernel<uint8_t>,
ops::AccuracyMLUKernel<int>);
if(WITH_MLU)
cc_library(
mlu_baseop
SRCS mlu_baseop.cc
DEPS neuware_lib device_context)
cc_test(
activation_op_mlu_test
SRCS activation_op_mlu_test.cc
DEPS op_registry activation_op scope device_context executor)
endif()
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include <sys/time.h>  // gettimeofday()/timeval used in the timing loops below
#include "paddle/fluid/operators/activation_op.h"
#include "paddle/fluid/platform/device/mlu/device_context.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace fw = paddle::framework;
namespace plat = paddle::platform;
USE_OP_ITSELF(relu);
USE_OP_DEVICE_KERNEL(relu, MLU);
// relu
template <typename T>
inline T relu(T x) {
return x > 0 ? x : static_cast<T>(0);
}
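// Reference gradient used by CompareGrad below: dX = dOut where Out > 0 and 0
// otherwise; the x argument is unused by the formula.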
template <typename T>
inline T relu_grad_dx(T x, T out, T dout) {
return out > 0 ? dout : 0;
}
template <typename T>
void Compare(fw::Scope* scope,
const plat::DeviceContext& ctx,
std::string op_type) {
// init
auto x = scope->Var("X");
auto tensor_x = x->GetMutable<phi::DenseTensor>();
const int num = 10;
std::vector<T> init_x;
for (int64_t i = 0; i < num * num; ++i) {
init_x.push_back(static_cast<T>(i - 50));
}
paddle::framework::TensorFromVector(init_x, ctx, tensor_x);
tensor_x->Resize({num, num});
auto place = ctx.GetPlace();
auto out = scope->Var("Out");
auto tensor_out = out->GetMutable<phi::DenseTensor>();
fw::AttributeMap attrs;
auto op = fw::OpRegistry::CreateOp(
op_type, {{"X", {"X"}}}, {{"Out", {"Out"}}}, attrs);
op->Run(*scope, place);
ctx.Wait();
// eval time
struct timeval start, end;
gettimeofday(&start, NULL);
for (int i = 0; i < 100; i++) {
op->Run(*scope, place);
}
ctx.Wait();
gettimeofday(&end, NULL);
int micros =
(((end.tv_sec - start.tv_sec) * 1000000) + end.tv_usec) - (start.tv_usec);
printf("used time: %d\n", micros / 100);
// eval value
std::vector<T> out_vec;
paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec);
ctx.Wait();
for (uint32_t i = 0; i < out_vec.size(); i++) {
EXPECT_FLOAT_EQ(out_vec[i], relu<T>(init_x[i]));
}
}
template <typename T>
void CompareGrad(fw::Scope* scope,
const plat::DeviceContext& ctx,
std::string op_type) {
auto dout = scope->Var("DOut");
auto tensor_dout = dout->GetMutable<phi::DenseTensor>();
auto out = scope->Var("Out");
auto tensor_out = out->GetMutable<phi::DenseTensor>();
const int num = 10;
std::vector<T> init_dout;
for (int64_t i = 0; i < num * num; ++i) {
init_dout.push_back(static_cast<T>(1.0));
}
std::vector<T> init_out;
for (int64_t i = 0; i < num * num; ++i) {
init_out.push_back(static_cast<T>(i - 50));
}
paddle::framework::TensorFromVector(init_dout, ctx, tensor_dout);
tensor_dout->Resize({num, num});
paddle::framework::TensorFromVector(init_out, ctx, tensor_out);
tensor_out->Resize({num, num});
auto dx = scope->Var("DX");
auto tensor_dx = dx->GetMutable<phi::DenseTensor>();
// run
auto place = ctx.GetPlace();
fw::AttributeMap attrs;
auto op = fw::OpRegistry::CreateOp(op_type,
{{"Out@GRAD", {"DOut"}}, {"Out", {"Out"}}},
{{"X@GRAD", {"DX"}}},
attrs);
op->Run(*scope, place);
ctx.Wait();
// eval time
struct timeval start, end;
gettimeofday(&start, NULL);
for (int i = 0; i < 100; i++) {
op->Run(*scope, place);
}
ctx.Wait();
gettimeofday(&end, NULL);
int micros =
(((end.tv_sec - start.tv_sec) * 1000000) + end.tv_usec) - (start.tv_usec);
printf("used time: %d\n", micros / 100);
// eval value
std::vector<T> dx_vec;
paddle::framework::TensorToVector(*tensor_dx, ctx, &dx_vec);
ctx.Wait();
for (uint32_t i = 0; i < dx_vec.size(); i++) {
EXPECT_FLOAT_EQ(dx_vec[i],
relu_grad_dx<T>(dx_vec[i], init_out[i], init_dout[i]));
}
}
TEST(relu, MLU_fp32) {
fw::Scope scope;
auto* ctx = plat::DeviceContextPool::Instance().Get(plat::MLUPlace(0));
Compare<float>(&scope, *ctx, "relu");
}
TEST(relu_grad, MLU_fp32) {
fw::Scope scope;
auto* ctx = plat::DeviceContextPool::Instance().Get(plat::MLUPlace(0));
CompareGrad<float>(&scope, *ctx, "relu_grad");
}
(Source diff not shown: the file is too large to display; view the blob instead.)
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <cn_api.h>
#include <cnnl.h>
#include <concurrentqueue.h>
#include <mlu_op.h>
#include <string>
#include <vector>
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/framework/type_defs.h"
#include "paddle/fluid/platform/device/mlu/enforce.h"
namespace paddle {
namespace operators {
using DataLayout = phi::DataLayout;
using ExecutionContext = framework::ExecutionContext;
using DeviceContextPool = platform::DeviceContextPool;
using MLUDeviceContext = platform::MLUDeviceContext;
const std::map<std::string, cnnlReduceOp_t> MLUReduceOpMap = {
{"reduce_all", CNNL_REDUCE_AND},
{"reduce_any", CNNL_REDUCE_OR},
{"reduce_max", CNNL_REDUCE_MAX},
{"reduce_mean", CNNL_REDUCE_AVG},
{"reduce_min", CNNL_REDUCE_MIN},
{"reduce_sum", CNNL_REDUCE_ADD},
{"reduce_prod", CNNL_REDUCE_MUL},
};
const std::map<std::string, cnnlInterpMode_t> MLUInterpModeMap = {
{"bilinear", CNNL_INTERP_BILINEAR},
{"nearest", CNNL_INTERP_NEAREST},
{"linear", CNNL_INTERP_LINEAR},
{"trilinear", CNNL_INTERP_TRILINEAR},
{"bicubic", CNNL_INTERP_BICUBIC}};
const std::map<std::string, cnnlInterpBackwardMode_t> MLUInterpBackwardModeMap =
{{"bilinear", CNNL_INTERP_BACKWARD_BILINEAR},
{"nearest", CNNL_INTERP_BACKWARD_NEAREST},
{"linear", CNNL_INTERP_BACKWARD_LINEAR},
{"trilinear", CNNL_INTERP_BACKWARD_TRILINEAR},
{"bicubic", CNNL_INTERP_BACKWARD_BICUBIC}};
inline cnnlReduceOp_t GetMLUCnnlReduceOp(const std::string reduce_name) {
auto iter = MLUReduceOpMap.find(reduce_name);
if (iter != MLUReduceOpMap.end()) {
return iter->second;
}
PADDLE_THROW(platform::errors::InvalidArgument(
"Not support reduce op type of MLU Device: %s", reduce_name));
}
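// Example: GetMLUCnnlReduceOp("reduce_sum") returns CNNL_REDUCE_ADD; an
// unrecognized name throws InvalidArgument. The interp-mode helpers below
// follow the same lookup-or-throw pattern.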
inline cnnlInterpMode_t GetMLUCnnlInterpMode(const std::string interp_mode) {
auto iter = MLUInterpModeMap.find(interp_mode);
if (iter != MLUInterpModeMap.end()) {
return iter->second;
}
PADDLE_THROW(platform::errors::InvalidArgument(
"Not support interp mode of MLU Device: %s", interp_mode));
}
inline cnnlInterpBackwardMode_t GetMLUCnnlInterpBackwardMode(
const std::string interp_mode) {
auto iter = MLUInterpBackwardModeMap.find(interp_mode);
if (iter != MLUInterpBackwardModeMap.end()) {
return iter->second;
}
PADDLE_THROW(platform::errors::InvalidArgument(
"Not support interp mode of MLU Device: %s", interp_mode));
}
inline const void* GetBasePtr(const phi::DenseTensor* t) { return t->data(); }
inline void* GetBasePtr(phi::DenseTensor* t) { return t->data(); }
inline cnnlDataType_t ToCnnlDataType(const phi::DataType& dtype) {
cnnlDataType_t type = CNNL_DTYPE_FLOAT;
switch (dtype) {
case DataType::FLOAT16:
type = CNNL_DTYPE_HALF;
break;
case DataType::FLOAT32:
type = CNNL_DTYPE_FLOAT;
break;
case DataType::FLOAT64:
type = CNNL_DTYPE_DOUBLE;
break;
case DataType::INT8:
type = CNNL_DTYPE_INT8;
break;
case DataType::INT16:
type = CNNL_DTYPE_INT16;
break;
case DataType::INT32:
type = CNNL_DTYPE_INT32;
break;
case DataType::INT64:
type = CNNL_DTYPE_INT64;
break;
case DataType::BOOL:
type = CNNL_DTYPE_BOOL;
break;
case DataType::UINT8:
type = CNNL_DTYPE_UINT8;
break;
default:
break;
}
return type;
}
inline cnnlDataType_t ToCnnlDataType(
const paddle::framework::proto::VarType::Type& type) {
return ToCnnlDataType(framework::TransToPhiDataType(type));
}
template <typename T>
inline cnnlDataType_t ToCnnlDataType() {
auto type = framework::ToDataType(std::type_index(typeid(T)));
return ToCnnlDataType(type);
}
inline mluOpDataType_t ToMluOpDataType(const phi::DataType& dtype) {
mluOpDataType_t type = MLUOP_DTYPE_FLOAT;
switch (dtype) {
case DataType::FLOAT16:
type = MLUOP_DTYPE_HALF;
break;
case DataType::FLOAT32:
type = MLUOP_DTYPE_FLOAT;
break;
case DataType::FLOAT64:
type = MLUOP_DTYPE_DOUBLE;
break;
case DataType::INT8:
type = MLUOP_DTYPE_INT8;
break;
case DataType::INT16:
type = MLUOP_DTYPE_INT16;
break;
case DataType::INT32:
type = MLUOP_DTYPE_INT32;
break;
case DataType::INT64:
type = MLUOP_DTYPE_INT64;
break;
case DataType::BOOL:
type = MLUOP_DTYPE_BOOL;
break;
case DataType::UINT8:
type = MLUOP_DTYPE_UINT8;
break;
default:
break;
}
return type;
}
inline mluOpDataType_t ToMluOpDataType(
const paddle::framework::proto::VarType::Type& type) {
return ToMluOpDataType(framework::TransToPhiDataType(type));
}
template <typename T>
inline mluOpDataType_t ToMluOpDataType() {
auto type = framework::ToDataType(std::type_index(typeid(T)));
return ToMluOpDataType(type);
}
// Converts (via narrowing) a type T value to a type U, and checks that the
// value has no value change due to the conversion.
template <typename WideT, typename NarrowT>
NarrowT CheckedNarrowing(const WideT& wide) {
NarrowT narrow = wide;
CHECK_EQ(narrow, wide)
<< "checked narrowing failed; values not equal post-conversion";
return narrow;
}
inline static cnnlHandle_t GetHandleFromCTX(const ExecutionContext& ctx) {
return ctx.template device_context<MLUDeviceContext>().cnnl_handle();
}
inline static mluOpHandle_t GetMLUOpHandleFromCTX(const ExecutionContext& ctx) {
return ctx.template device_context<MLUDeviceContext>().mluOp_handle();
}
inline static const MLUDeviceContext& GetDevCtxFromCTX(
const ExecutionContext& ctx) {
return ctx.template device_context<MLUDeviceContext>();
}
using VT = framework::proto::VarType;
const std::map<std::pair<VT::Type, VT::Type>, cnnlCastDataType_t>
MLU_SUPPORTED_CAST_TYPE = {
{{VT::FP32, /*cast to*/ VT::FP16}, CNNL_CAST_FLOAT_TO_HALF},
{{VT::FP32, /*cast to*/ VT::INT32}, CNNL_CAST_FLOAT_TO_INT32},
{{VT::FP32, /*cast to*/ VT::INT16}, CNNL_CAST_FLOAT_TO_INT16},
{{VT::FP32, /*cast to*/ VT::INT8}, CNNL_CAST_FLOAT_TO_INT8},
{{VT::FP32, /*cast to*/ VT::UINT8}, CNNL_CAST_FLOAT_TO_UINT8},
{{VT::FP32, /*cast to*/ VT::BOOL}, CNNL_CAST_FLOAT_TO_BOOL},
{{VT::FP16, /*cast to*/ VT::FP32}, CNNL_CAST_HALF_TO_FLOAT},
{{VT::FP16, /*cast to*/ VT::INT32}, CNNL_CAST_HALF_TO_INT32},
{{VT::FP16, /*cast to*/ VT::INT16}, CNNL_CAST_HALF_TO_INT16},
{{VT::FP16, /*cast to*/ VT::INT8}, CNNL_CAST_HALF_TO_INT8},
{{VT::FP16, /*cast to*/ VT::UINT8}, CNNL_CAST_HALF_TO_UINT8},
{{VT::FP16, /*cast to*/ VT::BOOL}, CNNL_CAST_HALF_TO_BOOL},
{{VT::INT32, /*cast to*/ VT::FP32}, CNNL_CAST_INT32_TO_FLOAT},
{{VT::INT32, /*cast to*/ VT::FP16}, CNNL_CAST_INT32_TO_HALF},
{{VT::INT32, /*cast to*/ VT::INT8}, CNNL_CAST_INT32_TO_INT8},
{{VT::INT32, /*cast to*/ VT::INT16}, CNNL_CAST_INT32_TO_INT16},
{{VT::INT16, /*cast to*/ VT::FP32}, CNNL_CAST_INT16_TO_FLOAT},
{{VT::INT16, /*cast to*/ VT::FP16}, CNNL_CAST_INT16_TO_HALF},
{{VT::INT16, /*cast to*/ VT::INT32}, CNNL_CAST_INT16_TO_INT32},
{{VT::INT8, /*cast to*/ VT::FP32}, CNNL_CAST_INT8_TO_FLOAT},
{{VT::INT8, /*cast to*/ VT::FP16}, CNNL_CAST_INT8_TO_HALF},
{{VT::INT8, /*cast to*/ VT::INT32}, CNNL_CAST_INT8_TO_INT32},
{{VT::UINT8, /*cast to*/ VT::FP32}, CNNL_CAST_UINT8_TO_FLOAT},
{{VT::UINT8, /*cast to*/ VT::FP16}, CNNL_CAST_UINT8_TO_HALF},
{{VT::BOOL, /*cast to*/ VT::FP32}, CNNL_CAST_BOOL_TO_FLOAT},
{{VT::BOOL, /*cast to*/ VT::FP16}, CNNL_CAST_BOOL_TO_HALF},
{{VT::BOOL, /*cast to*/ VT::INT32}, CNNL_CAST_BOOL_TO_INT32},
{{VT::UINT8, /*cast to*/ VT::INT32}, CNNL_CAST_UINT8_TO_INT32},
{{VT::INT32, /*cast to*/ VT::INT64}, CNNL_CAST_INT32_TO_INT64},
{{VT::INT64, /*cast to*/ VT::INT32}, CNNL_CAST_INT64_TO_INT32},
{{VT::INT32, /*cast to*/ VT::BOOL}, CNNL_CAST_INT32_TO_BOOL},
{{VT::UINT8, /*cast to*/ VT::INT64}, CNNL_CAST_UINT8_TO_INT64},
{{VT::INT8, /*cast to*/ VT::INT16}, CNNL_CAST_INT8_TO_INT16},
{{VT::FP32, /*cast to*/ VT::FP64}, CNNL_CAST_FLOAT_TO_DOUBLE},
{{VT::FP64, /*cast to*/ VT::FP32}, CNNL_CAST_DOUBLE_TO_FLOAT},
{{VT::INT64, /*cast to*/ VT::FP32}, CNNL_CAST_INT64_TO_FLOAT},
{{VT::INT64, /*cast to*/ VT::FP16}, CNNL_CAST_INT64_TO_HALF},
{{VT::FP32, /*cast to*/ VT::INT64}, CNNL_CAST_FLOAT_TO_INT64},
{{VT::FP16, /*cast to*/ VT::INT64}, CNNL_CAST_HALF_TO_INT64},
};
cnnlCastDataType_t GetCastDataType(const VT::Type& src_type,
const VT::Type& dst_type);
cnnlCastDataType_t GetCastDataType(const DataType& src_type,
const DataType& dst_type);
bool MLUSupportsCast(const VT::Type& src_type, const VT::Type& dst_type);
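// Typical cast flow (as in the accuracy kernel above): check
// MLUSupportsCast(src, dst) first, obtain the descriptor via
// GetCastDataType(src, dst), then launch MLUCnnl::Cast with the source and
// destination tensor descriptors. The supported pairs are enumerated in
// MLU_SUPPORTED_CAST_TYPE above.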
cnnlDeviceType_t GetCnnlDev(int dev_ordinal);
using CnnlTensorDesc = cnnlTensorDescriptor_t;
class MLUCnnlTensorDesc {
public:
MLUCnnlTensorDesc() {}
// SE_DISALLOW_COPY_AND_ASSIGN
MLUCnnlTensorDesc(const MLUCnnlTensorDesc& desc) = delete;
MLUCnnlTensorDesc& operator=(const MLUCnnlTensorDesc&) = delete;
MLUCnnlTensorDesc(MLUCnnlTensorDesc&& rhs)
: raw_tensor_desc(rhs.raw_tensor_desc) {
rhs.raw_tensor_desc = nullptr;
}
MLUCnnlTensorDesc& operator=(MLUCnnlTensorDesc&& rhs);
MLUCnnlTensorDesc(const int tensor_dim,
const int dim_sizes[],
const cnnlDataType_t tensor_dtype);
MLUCnnlTensorDesc(const int tensor_dim,
const int dim_sizes[],
const cnnlDataType_t tensor_dtype,
const cnnlTensorLayout_t layout);
MLUCnnlTensorDesc(const int tensor_dim,
const int dim_sizes[],
const cnnlDataType_t tensor_dtype,
int position);
MLUCnnlTensorDesc(const int tensor_dim,
const int64_t dim_sizes[],
const cnnlDataType_t tensor_dtype);
MLUCnnlTensorDesc(const int tensor_dim,
const int64_t dim_sizes[],
const cnnlDataType_t tensor_dtype,
const cnnlTensorLayout_t layout);
MLUCnnlTensorDesc(const int tensor_dim,
const int64_t dim_sizes[],
const cnnlDataType_t tensor_dtype,
int position);
MLUCnnlTensorDesc(const phi::DenseTensor& tensor,
const cnnlTensorLayout_t layout,
const cnnlDataType_t tensor_dtype);
explicit MLUCnnlTensorDesc(const phi::DenseTensor& tensor);
MLUCnnlTensorDesc(const phi::DenseTensor& tensor,
cnnlTensorLayout_t layout,
const cnnlDataType_t tensor_dtype,
int position);
MLUCnnlTensorDesc(const phi::DenseTensor& tensor,
cnnlTensorLayout_t layout,
const cnnlDataType_t tensor_dtype,
int position,
float scale);
~MLUCnnlTensorDesc();
const cnnlTensorDescriptor_t get() const { return raw_tensor_desc; }
private:
cnnlTensorDescriptor_t raw_tensor_desc = nullptr;
};
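// Usage sketch (names are illustrative): descriptors are built on the stack
// around a phi::DenseTensor and passed to MLUCnnl calls via get(), e.g.
//   MLUCnnlTensorDesc in_desc(*x);
//   MLUCnnlTensorDesc out_desc(*out);
//   MLUCnnl::Abs(ctx, in_desc.get(), GetBasePtr(x), out_desc.get(),
//                GetBasePtr(out));
// The class is move-only: copying is deleted and a move nulls out the source
// so the raw cnnlTensorDescriptor_t has a single owner.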
class MLUOpTensorDesc {
public:
MLUOpTensorDesc() {}
// SE_DISALLOW_COPY_AND_ASSIGN
MLUOpTensorDesc(const MLUOpTensorDesc& desc) = delete;
MLUOpTensorDesc& operator=(const MLUOpTensorDesc&) = delete;
MLUOpTensorDesc(MLUOpTensorDesc&& rhs)
: raw_tensor_desc(rhs.raw_tensor_desc) {
rhs.raw_tensor_desc = nullptr;
}
MLUOpTensorDesc& operator=(MLUOpTensorDesc&& rhs);
MLUOpTensorDesc(const int tensor_dim,
const int dim_sizes[],
const mluOpDataType_t tensor_dtype);
MLUOpTensorDesc(const int tensor_dim,
const int dim_sizes[],
const mluOpDataType_t tensor_dtype,
const mluOpTensorLayout_t layout);
MLUOpTensorDesc(const int tensor_dim,
const int dim_sizes[],
const mluOpDataType_t tensor_dtype,
int position);
MLUOpTensorDesc(const int tensor_dim,
const int64_t dim_sizes[],
const mluOpDataType_t tensor_dtype);
MLUOpTensorDesc(const int tensor_dim,
const int64_t dim_sizes[],
const mluOpDataType_t tensor_dtype,
const mluOpTensorLayout_t layout);
MLUOpTensorDesc(const int tensor_dim,
const int64_t dim_sizes[],
const mluOpDataType_t tensor_dtype,
int position);
MLUOpTensorDesc(const phi::DenseTensor& tensor,
const mluOpTensorLayout_t layout,
const mluOpDataType_t tensor_dtype);
explicit MLUOpTensorDesc(const phi::DenseTensor& tensor);
MLUOpTensorDesc(const phi::DenseTensor& tensor,
mluOpTensorLayout_t layout,
const mluOpDataType_t tensor_dtype,
int position);
MLUOpTensorDesc(const phi::DenseTensor& tensor,
mluOpTensorLayout_t layout,
const mluOpDataType_t tensor_dtype,
int position,
float scale);
~MLUOpTensorDesc();
const mluOpTensorDescriptor_t get() const { return raw_tensor_desc; }
private:
mluOpTensorDescriptor_t raw_tensor_desc = nullptr;
};
class MLUCnnlActivationDesc {
public:
MLUCnnlActivationDesc(const MLUCnnlActivationDesc& desc) = delete;
MLUCnnlActivationDesc& operator=(const MLUCnnlActivationDesc& desc) = delete;
MLUCnnlActivationDesc(const cnnlActivationMode_t act_mode, const float coef);
MLUCnnlActivationDesc(const cnnlActivationMode_t act_mode,
const float coef,
const float sliced_dim,
const float selu_alpha,
const float selu_lambda);
const cnnlActivationDescriptor_t get() const;
~MLUCnnlActivationDesc();
private:
cnnlActivationDescriptor_t active_desc_ = nullptr;
};
class MLUCnnlPoolingDesc {
public:
MLUCnnlPoolingDesc(const MLUCnnlPoolingDesc& desc) = delete;
MLUCnnlPoolingDesc& operator=(const MLUCnnlPoolingDesc& desc) = delete;
MLUCnnlPoolingDesc(const cnnlPoolingMode_t mode,
const cnnlNanPropagation_t maxpooling_nan_opt,
int window_rows,
int window_cols,
int64_t pad_up,
int64_t pad_down,
int64_t pad_left,
int64_t pad_right,
int row_stride,
int col_stride,
int row_dilation,
int col_dilation,
bool ceil_mode);
MLUCnnlPoolingDesc(const cnnlPoolingMode_t mode,
const cnnlNanPropagation_t maxpooling_nan_opt,
const int tensor_rank,
const std::vector<int>& window,
const std::vector<int>& padding,
const std::vector<int>& stride);
const cnnlPoolingDescriptor_t get() const;
~MLUCnnlPoolingDesc();
private:
cnnlPoolingDescriptor_t pooling_desc_ = nullptr;
};
class MLUCnnlRandomGeneratorDesc {
public:
MLUCnnlRandomGeneratorDesc(const ExecutionContext& ctx, const int seed);
const cnnlRandGenerator_t get() const;
phi::DenseTensor& get_state();
~MLUCnnlRandomGeneratorDesc();
private:
phi::DenseTensor mlu_state;
cnnlRandGenerator_t mlu_generator = nullptr;
};
const std::shared_ptr<MLUCnnlRandomGeneratorDesc>& GetMLURandomGenerator(
const ExecutionContext& ctx, const int64_t device_id, const int seed);
class MLUCnnlReduceDesc {
public:
MLUCnnlReduceDesc(const MLUCnnlReduceDesc& desc) = delete;
MLUCnnlReduceDesc& operator=(const MLUCnnlReduceDesc& desc) = delete;
MLUCnnlReduceDesc(const std::vector<int>& axis_vec,
const cnnlReduceOp_t reduce_op,
const cnnlDataType_t data_type,
const cnnlNanPropagation_t nan_propagation,
const cnnlReduceIndices_t reduce_indices,
const cnnlIndicesType_t indices_type);
const cnnlReduceDescriptor_t get() const;
~MLUCnnlReduceDesc();
private:
cnnlReduceDescriptor_t reduction_desc_ = nullptr;
};
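// Usage sketch mirroring the accuracy kernel above: describe the axes and
// reduce op, then call MLUCnnl::Reduce with need_workspace=true, e.g.
//   MLUCnnlReduceDesc sum_desc({0}, CNNL_REDUCE_ADD, ToCnnlDataType<float>(),
//                              CNNL_NOT_PROPAGATE_NAN, CNNL_REDUCE_NO_INDICES,
//                              CNNL_32BIT_INDICES);
//   MLUCnnl::Reduce(ctx, true /*need_workspace*/, sum_desc.get(), nullptr,
//                   in_desc.get(), GetBasePtr(&in), 0 /*indices_size*/,
//                   nullptr, nullptr, out_desc.get(), GetBasePtr(&out));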
class MLUCnnlOpTensorDesc {
public:
MLUCnnlOpTensorDesc(const MLUCnnlOpTensorDesc& desc) = delete;
void operator=(const MLUCnnlOpTensorDesc&) = delete;
MLUCnnlOpTensorDesc(cnnlOpTensorDesc_t op_tensor_op,
cnnlDataType_t op_tensor_comp_type,
cnnlNanPropagation_t op_tensor_nan_opt);
const cnnlOpTensorDescriptor_t get() const;
~MLUCnnlOpTensorDesc();
private:
cnnlOpTensorDescriptor_t op_tensor_desc_ = nullptr;
};
class MLUCnnlNMSDesc {
public:
MLUCnnlNMSDesc(const MLUCnnlNMSDesc& desc) = delete;
MLUCnnlNMSDesc& operator=(const MLUCnnlNMSDesc& desc) = delete;
MLUCnnlNMSDesc(const cnnlNmsOutputMode_t mode,
const float iou_threshold,
const int max_output_size,
const float confidence_threshold,
const int input_layout);
const cnnlNmsDescriptor_t get() const;
~MLUCnnlNMSDesc();
private:
cnnlNmsDescriptor_t nms_desc_ = nullptr;
};
class MLUCnnlConvolutionDesc {
public:
MLUCnnlConvolutionDesc(const int dims,
const int pad[],
const int stride[],
const int dilation[],
const int group_count,
const cnnlDataType_t tensor_dtype);
MLUCnnlConvolutionDesc(const int dims,
const int64_t pad[],
const int64_t stride[],
const int64_t dilation[],
const int group_count,
const cnnlDataType_t tensor_dtype);
MLUCnnlConvolutionDesc(const MLUCnnlConvolutionDesc& desc) = delete;
MLUCnnlConvolutionDesc& operator=(const MLUCnnlConvolutionDesc& desc) =
delete;
const cnnlConvolutionDescriptor_t get() const;
~MLUCnnlConvolutionDesc();
private:
cnnlConvolutionDescriptor_t conv_desc_ = nullptr;
};
class MLUCnnlBatchSpaceDesc {
public:
MLUCnnlBatchSpaceDesc(uint32_t block_shape[],
uint32_t paddings[],
const uint32_t block_shape_size,
const uint32_t paddings_size);
void getBatch2spaceNdextraInputSize(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t input_desc);
void getSpace2batchNdextraInputSize(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t input_desc);
void initSpace2batchNdExtraInput(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t input_desc,
void* extra_host_input);
void initBatch2spaceNdExtraInput(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t input_desc,
void* extra_host_input);
const cnnlSpaceBatchNdDescriptor_t get() const;
size_t getExtraInputSize() const;
~MLUCnnlBatchSpaceDesc();
private:
cnnlSpaceBatchNdDescriptor_t op_desc_ = nullptr;
size_t extra_input_size_;
};
class MLUCnnlTrigonDesc {
public:
explicit MLUCnnlTrigonDesc(
const cnnlTrigonFunctionMode_t trigon_function_mode);
const cnnlTrigonDescriptor_t get() const;
~MLUCnnlTrigonDesc();
private:
cnnlTrigonDescriptor_t trigon_desc_ = nullptr;
};
class MLUCnnlDCNDesc {
public:
MLUCnnlDCNDesc(int dimNb,
const int* pad,
const int* stride,
const int* dilation,
int deformable_group,
int conv_group,
int im2col_step);
const cnnlDCNDescriptor_t get() const;
~MLUCnnlDCNDesc();
private:
cnnlDCNDescriptor_t dcn_desc_ = nullptr;
};
class MLUCnnlGridSampleDesc {
public:
MLUCnnlGridSampleDesc(const std::string& interp_mode_str,
const std::string& padding_mode_str,
bool align_corners);
const cnnlGridSampleDescriptor_t get() const;
~MLUCnnlGridSampleDesc();
private:
cnnlGridSampleDescriptor_t grid_sample_desc_ = nullptr;
};
class MLUSeqDataDesc {
public:
MLUSeqDataDesc(const MLUSeqDataDesc& desc) = delete;
MLUSeqDataDesc& operator=(const MLUSeqDataDesc& desc) = delete;
MLUSeqDataDesc(cnnlSeqDataLayout_t layout,
cnnlDataType_t dtype,
int dimNb,
const int dimSize[],
int seqLengthArraySize,
const int seqLengthArray[],
void* paddingFill);
const cnnlSeqDataDescriptor_t get() const;
~MLUSeqDataDesc();
private:
cnnlSeqDataDescriptor_t seq_data_desc_ = nullptr;
};
class MLURNNDesc {
public:
MLURNNDesc(const MLURNNDesc& desc) = delete;
MLURNNDesc& operator=(const MLURNNDesc& desc) = delete;
MLURNNDesc(const int hidden_size,
const int num_layers,
const cnnlRNNInputMode_t input_mode,
const cnnlDirectionMode_t direction,
const cnnlRNNMode_t rnn_mode);
MLURNNDesc(cnnlRNNMode_t cell_mode,
cnnlRNNBiasMode_t bias_mode,
cnnlDirectionMode_t direction,
cnnlRNNInputMode_t input_mode,
cnnlDataType_t data_type,
cnnlDataType_t math_prec,
int input_size,
int hidden_size,
int proj_size,
int layer_num,
void* dropout_desc,
cnnlRNNPaddingMode_t padding_mode);
void SetRNNProjectionLayers(const int rec_proj_size,
const int out_proj_size) {
PADDLE_ENFORCE_MLU_SUCCESS(
cnnlSetRNNProjectionLayers(rnn_desc_, rec_proj_size, out_proj_size));
}
void SetPeepholeMode(const cnnlRNNPeepholeMode_t peephole_mode) {
PADDLE_ENFORCE_MLU_SUCCESS(
cnnlSetRNNPeepholeMode(rnn_desc_, peephole_mode));
}
void SetRNNBiasMode(const cnnlRNNBiasMode_t bias_mode) {
PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetRNNBiasMode(rnn_desc_, bias_mode));
}
void SetRNNMaskMode(const cnnlRNNMaskMode_t mask_mode) {
PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetRNNMaskMode(rnn_desc_, mask_mode));
}
void SetRNNClip(const cnnlRNNClipMode_t clip_mode,
const cnnlNanPropagation_t clip_nan_opt,
const double left_clip,
const double right_clip) {
PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetRNNClip(
rnn_desc_, clip_mode, clip_nan_opt, left_clip, right_clip));
}
void SetRNNPaddingMode(const cnnlRNNPaddingMode_t padding_mode) {
PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetRNNPaddingMode(rnn_desc_, padding_mode));
}
const cnnlRNNDescriptor_t get() const;
~MLURNNDesc();
private:
cnnlRNNDescriptor_t rnn_desc_ = nullptr;
};
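// MLUCnnl groups static, stateless wrappers around individual CNNL kernels.
// Each wrapper takes the ExecutionContext (or MLUDeviceContext) to reach the
// cnnl handle, the tensor descriptors of its operands, and raw device
// pointers obtained via GetBasePtr().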
class MLUCnnl {
public:
static void Active(const ExecutionContext& ctx,
cnnlActivationDescriptor_t active_desc,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void ActiveGrad(const ExecutionContext& ctx,
cnnlActivationDescriptor_t active_desc,
const void* alpha,
const void* beta,
const cnnlTensorDescriptor_t y_desc,
const void* y,
const cnnlTensorDescriptor_t diff_y_desc,
const void* diff_y,
const cnnlTensorDescriptor_t x_desc,
const void* x,
const cnnlTensorDescriptor_t diff_x_desc,
void* diff_x);
static void Concat(const ExecutionContext& ctx,
const int pack_num,
const int axis,
const cnnlTensorDescriptor_t inputs_desc[],
const void* const inputs[],
const cnnlTensorDescriptor_t output_desc,
void* output);
static void Concat(const MLUDeviceContext& dev_ctx,
const int pack_num,
const int axis,
const cnnlTensorDescriptor_t inputs_desc[],
const void* const inputs[],
const cnnlTensorDescriptor_t output_desc,
void* output);
static void Cast(const ExecutionContext& ctx,
cnnlCastDataType_t cast_type,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void Clip(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const void* min,
const void* max,
void* y);
static void HardtanhBackward(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t x_desc,
const void* x,
const cnnlTensorDescriptor_t diff_y_desc,
const void* diff_y,
const float max_val,
const float min_val,
const cnnlTensorDescriptor_t diff_x_desc,
void* diff_x);
static void Div(const ExecutionContext& ctx,
cnnlComputationPreference_t prefer,
const cnnlTensorDescriptor_t in0_desc,
const void* in0,
const cnnlTensorDescriptor_t in1_desc,
const void* in1,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void Fill(const ExecutionContext& ctx,
const cnnlPointerMode_t pointer_mode,
const void* value_ptr,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void LRN(const ExecutionContext& ctx,
const int local_size,
const double alpha,
const double beta,
const double k,
const cnnlTensorDescriptor_t input_quant_desc,
const void* input_quant,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void QuantifyOffline(const ExecutionContext& context,
cnnlQuantizeMode_t mode,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void QuantifyOnline(const ExecutionContext& context,
const int bitwidth,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const bool compute_scale,
void* position,
void* scale,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void SGD(const ExecutionContext& context,
const cnnlTensorDescriptor_t grad_desc,
const void* grad,
const void* lr,
const cnnlTensorDescriptor_t var_desc,
void* var);
static void ApplyAdaGrad(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t grad_desc,
const void* grad,
const cnnlTensorDescriptor_t accum_desc,
void* accum,
const cnnlTensorDescriptor_t var_desc,
void* var,
const void* lr,
const bool update_slots);
static void ApplyRMSProp(const ExecutionContext& context,
const cnnlTensorDescriptor_t grad_desc,
const void* grad,
const void* lr,
const void* rho,
const void* momentum,
const void* epsilon,
const cnnlTensorDescriptor_t var_desc,
void* var,
const cnnlTensorDescriptor_t ms_desc,
void* ms,
const cnnlTensorDescriptor_t mom_desc,
void* mom);
static void ApplyCenterRMSProp(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t grad_desc,
const void* grad,
const void* lr,
const void* rho,
const void* momentum,
const void* epsilon,
const cnnlTensorDescriptor_t var_desc,
void* var,
const cnnlTensorDescriptor_t mg_desc,
void* mg,
const cnnlTensorDescriptor_t ms_desc,
void* ms,
const cnnlTensorDescriptor_t mom_desc,
void* mom);
static void ApplyAdam(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t var_desc,
void* var,
const cnnlTensorDescriptor_t m_desc,
void* m,
const cnnlTensorDescriptor_t v_desc,
void* v,
const cnnlTensorDescriptor_t grad_desc,
const void* grad,
const void* lr,
const void* beta1,
const void* beta2,
const void* beta1_power,
const void* beta2_power,
const void* epsilon,
const bool use_nesterov);
static void ApplyAdaMax(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t grad_desc,
const cnnlTensorDescriptor_t var_desc,
void* var,
const cnnlTensorDescriptor_t m_desc,
void* m,
const cnnlTensorDescriptor_t v_desc,
void* v,
const void* diff,
const void* lr,
const void* beta1,
const void* beta2,
const void* beta1_power,
const void* epsilon);
static void ApplyMomentum(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t grad_desc,
const void* grad,
const bool use_nesterov,
const void* lr,
const void* momentum,
void* var,
void* accum);
static void ApplyKerasMomentum(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t grad_desc,
const void* grad,
const bool use_nesterov,
const void* lr,
const void* momentum,
void* var,
void* accum);
static void ApplyAdadelta(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t grad_desc,
const void* diff,
const void* lr,
const void* rho,
const void* epsilon,
void* var,
void* accum,
void* accum_update);
static void SparseSoftmaxXentWithLogits(
const ExecutionContext& ctx,
cnnlSoftmaxMode_t mode,
const cnnlTensorDescriptor_t x_desc,
const void* input,
const cnnlTensorDescriptor_t label_desc,
const void* label,
const cnnlTensorDescriptor_t y_desc,
void* output,
const cnnlTensorDescriptor_t diff_y_desc,
void* back_out);
static void RandomUniform(const ExecutionContext& ctx,
const int num,
const cnnlDataType_t data_type,
const cnnlRandGenerator_t mlu_generator,
void* mlu_state,
void* output);
static void FusedDropout(const ExecutionContext& ctx,
const cnnlRandGenerator_t generator,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const float p,
void* state,
const cnnlTensorDescriptor_t mask_desc,
const void* mask,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void Cumsum(const ExecutionContext& ctx,
const int axis,
const bool exclusive,
const bool reverse,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void BroadcastTo(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void GatherFunctor(const ExecutionContext& ctx,
const int axis,
const int batch_dims,
const cnnlTensorDescriptor_t params_desc,
const void* params,
const cnnlTensorDescriptor_t indices_desc,
const void* indices,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void ScatterRefFunctor(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t params_desc,
const void* params,
const cnnlTensorDescriptor_t updates_desc,
const void* updates,
const cnnlTensorDescriptor_t indices_desc,
const void* indices,
const cnnlScatterRefMode_t mode);
static void ScatterFunctor(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t params_desc,
void* params,
const cnnlTensorDescriptor_t updates_desc,
const void* updates,
const cnnlTensorDescriptor_t indices_desc,
const void* indices,
const int dim,
const cnnlScatterMode_t mode = CNNL_SCATTER);
static void Range(const ExecutionContext& ctx,
const void* start,
const void* end,
const void* step,
const cnnlDataType_t output_dtype,
void* output);
static void Round(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void TopK(const ExecutionContext& ctx,
const int k,
const int dim,
const bool largest,
const bool sorted,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t values_output_desc,
void* values_out,
const cnnlTensorDescriptor_t indices_output_desc,
void* indices_out);
static void StridedSlice(const ExecutionContext& ctx,
const int begin[],
const int end[],
const int strides[],
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void Split(const ExecutionContext& ctx,
int split_num,
int axis,
const cnnlTensorDescriptor_t input_desc,
const void* input_ptr,
const cnnlTensorDescriptor_t output_descs[],
void* output_ptrs[]);
static void Split(const MLUDeviceContext& dev_ctx,
int split_num,
int axis,
const cnnlTensorDescriptor_t input_desc,
const void* input_ptr,
const cnnlTensorDescriptor_t output_descs[],
void* output_ptrs[]);
static void Scale(const ExecutionContext& ctx,
const int axis,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t alpha_desc,
const void* alpha,
const cnnlTensorDescriptor_t beta_desc,
const void* beta,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void AddN(const ExecutionContext& ctx,
uint32_t input_num,
const cnnlTensorDescriptor_t inputs_desc[],
const void* inputs[],
const cnnlTensorDescriptor_t output_desc,
void* output);
static void Log(const ExecutionContext& ctx,
cnnlComputationPreference_t prefer,
cnnlLogBase_t log_base,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void StridedSliceGrad(const ExecutionContext& ctx,
const int begin[],
const int end[],
const int strides[],
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void Logic(const ExecutionContext& ctx,
const cnnlLogicOp_t log_method,
const cnnlTensorDescriptor_t input1_desc,
const void* input1,
const cnnlTensorDescriptor_t input2_desc,
const void* input2,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void Select(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t condition_desc,
const void* condition_ptr,
const cnnlTensorDescriptor_t then_desc,
const void* then_ptr,
const cnnlTensorDescriptor_t else_desc,
const void* else_ptr,
const cnnlTensorDescriptor_t output_desc,
void* output_ptr);
static void AssignAdd(const ExecutionContext& ctx,
const void* alpha,
const void* beta,
const cnnlTensorDescriptor_t update_desc,
const void* update,
const cnnlTensorDescriptor_t param_desc,
void* param);
static void AssignSub(const ExecutionContext& ctx,
const void* alpha,
const void* beta,
const cnnlTensorDescriptor_t update_desc,
const void* update,
const cnnlTensorDescriptor_t param_desc,
void* param);
static void Assign(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t update_desc,
const void* update,
const cnnlTensorDescriptor_t param_desc,
void* param);
static void GatherNd(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t params_desc,
const void* params,
const cnnlTensorDescriptor_t indices_desc,
const void* indices,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void BatchToSpace(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output,
const cnnlSpaceBatchParam_t param);
static void BatchToSpaceNd(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t input_desc,
const void* input,
cnnlSpaceBatchNdDescriptor_t param,
void* extra_device_input,
size_t extra_input_size,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void PoolingForward(const ExecutionContext& ctx,
cnnlPoolingMode_t pool_mode,
int64_t output_h,
int64_t output_w,
cnnlPoolingDescriptor_t pooling_desc,
const void* alpha,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const void* beta,
const void* extra_input_ptr,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void AdaptivePoolingForward(const ExecutionContext& ctx,
cnnlPoolingMode_t pool_mode,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output,
const cnnlTensorDescriptor_t index_desc,
void* index);
static void Pool3D(const ExecutionContext& ctx,
cnnlPoolingMode_t pool_mode,
const std::vector<int64_t>& output_shape,
cnnlPoolingDescriptor_t pooling_desc,
const void* alpha,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const void* beta,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void Pad(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const void* paddings,
const void* padding_value,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void Matmul(const ExecutionContext& ctx,
const bool transpose_a,
const bool transpose_b,
const cnnlTensorDescriptor_t in0_desc,
const void* in0,
const cnnlTensorDescriptor_t in1_desc,
const void* in1,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void BatchMatmul(const ExecutionContext& ctx,
const bool transpose_a,
const bool transpose_b,
const cnnlTensorDescriptor_t in0_desc,
const void* in0,
const cnnlTensorDescriptor_t in1_desc,
const void* in1,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void MulAx(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t alpha_desc,
const void* alpha,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void OpTensor(const ExecutionContext& ctx,
const cnnlOpTensorDescriptor_t op_tensor_desc,
const cnnlTensorDescriptor_t a_desc,
const void* a,
const cnnlTensorDescriptor_t b_desc,
const void* b,
const cnnlTensorDescriptor_t output_desc,
void* output,
const cnnlDataType_t dtype,
const float alpha1_float = 1.f,
const float alpha2_float = 1.f,
const float beta_float = 0.f);
static void BiasAddGrad(const ExecutionContext& ctx,
const int axis,
const cnnlTensorDescriptor_t out_backprop_desc,
const void* out_backprop,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void OneHot(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t desc_indices,
const void* indices,
const int depth,
const void* on_value,
const void* off_value,
const int axis,
cnnlDataType_t output_data_type,
void* output);
static void NonMaxSuppression(const ExecutionContext& ctx,
const cnnlNmsDescriptor_t nms_desc,
const cnnlTensorDescriptor_t boxes_desc,
const void* boxes,
const cnnlTensorDescriptor_t confidence_desc,
const void* confidence,
const cnnlTensorDescriptor_t output_desc,
void* output,
void* output_size);
static void SoftmaxCrossEntropyWithLogits(
const ExecutionContext& ctx,
cnnlSoftmaxMode_t mode,
cnnlComputationPreference_t prefer,
const cnnlTensorDescriptor_t input_desc,
const void* logits_in,
const cnnlTensorDescriptor_t label_desc,
const void* labels_in,
const cnnlTensorDescriptor_t loss_out_desc,
void* loss_out,
const cnnlTensorDescriptor_t back_out_desc,
void* back_out);
static void SoftmaxForward(const ExecutionContext& ctx,
cnnlSoftmaxAlgorithm_t algorithm,
cnnlSoftmaxMode_t mode,
const void* alpha,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const void* beta,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void SoftmaxBackward(const ExecutionContext& ctx,
cnnlSoftmaxAlgorithm_t algorithm,
cnnlSoftmaxMode_t mode,
const cnnlTensorDescriptor_t y_desc,
const void* y,
const cnnlTensorDescriptor_t diff_y_desc,
const void* diff_y,
const cnnlTensorDescriptor_t diff_x_desc,
void* diff_x);
static void Softplus(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t features_desc,
const void* features,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void SoftplusGrad(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t gradients_desc,
const void* gradients,
const cnnlTensorDescriptor_t features_desc,
const void* features,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void RsqrtGrad(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t data_desc,
const void* y,
const void* diff_y,
void* output);
static void SqrtGrad(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t data_desc,
const void* y,
const void* diff_y,
void* output);
static void ConvolutionForward(const ExecutionContext& ctx,
cnnlConvolutionDescriptor_t conv_desc_,
const void* alpha,
const void* beta,
const cnnlTensorDescriptor_t bias_desc,
const void* bias_ptr,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t filter_desc,
const void* filter,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void FusedConvBNQuantify(const ExecutionContext& ctx,
cnnlConvolutionDescriptor_t conv_desc,
const void* epsilon_ptr,
const int fused_ops_number,
const cnnlDataType_t tensor_dtype,
const int input_position,
const float input_scale,
const int filter_position,
const float filter_scale,
const cnnlTensorDescriptor_t scale_desc,
const void* scale_ptr,
const cnnlTensorDescriptor_t offset_desc,
const void* offset_ptr,
const cnnlTensorDescriptor_t mean_desc,
const void* mean_ptr,
const cnnlTensorDescriptor_t variance_desc,
const void* variance_ptr,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t filter_desc,
const void* filter,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void Tile(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void UnsortedSegmentSum(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t data_desc,
const void* data,
const cnnlTensorDescriptor_t ids_desc,
const int* segment_ids,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void Reduce(const ExecutionContext& ctx,
const bool need_workspace,
const cnnlReduceDescriptor_t reduction_desc,
const void* alpha,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const size_t indices_size,
void* indices,
const void* beta,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void FloorDiv(const ExecutionContext& ctx,
cnnlComputationPreference_t prefer,
const cnnlTensorDescriptor_t input1_desc,
const void* input1,
const cnnlTensorDescriptor_t input2_desc,
const void* input2,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void FloorMod(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t input1_desc,
const void* input1,
const cnnlTensorDescriptor_t input2_desc,
const void* input2,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void Maximum(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t input1_desc,
const void* input1,
const cnnlTensorDescriptor_t input2_desc,
const void* input2,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void Minimum(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t input1_desc,
const void* input1,
const cnnlTensorDescriptor_t input2_desc,
const void* input2,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void Pow(const ExecutionContext& ctx,
cnnlComputationPreference_t prefer,
const cnnlTensorDescriptor_t input1_desc,
const void* input1,
const cnnlTensorDescriptor_t input2_desc,
const void* input2,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void PowR(const ExecutionContext& ctx,
cnnlComputationPreference_t prefer,
const cnnlTensorDescriptor_t input1_desc,
const void* input1,
const cnnlTensorDescriptor_t input2_desc,
const void* input2,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void DivNoNan(const ExecutionContext& ctx,
cnnlComputationPreference_t prefer,
const cnnlTensorDescriptor_t input1_desc,
const void* input1,
const cnnlTensorDescriptor_t input2_desc,
const void* input2,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void SquaredDifference(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t input1_desc,
const void* input1,
const cnnlTensorDescriptor_t input2_desc,
const void* input2,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void L2Loss(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t input_desc,
const void* input,
void* output);
static void Abs(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void Neg(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void Floor(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void Ceil(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void IsNan(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void Square(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void Sqrt(const ExecutionContext& ctx,
cnnlComputationPreference_t prefer,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void Rsqrt(const ExecutionContext& ctx,
cnnlComputationPreference_t prefer,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void Cos(const ExecutionContext& ctx,
cnnlComputationPreference_t prefer,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void Sin(const ExecutionContext& ctx,
cnnlComputationPreference_t prefer,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void TrigonForward(const ExecutionContext& ctx,
const cnnlTrigonDescriptor_t trigon_desc,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void Exp(const ExecutionContext& ctx,
cnnlComputationPreference_t prefer,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void Sign(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void IndexSelect(const ExecutionContext& ctx,
const int dim,
cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t index_desc,
const void* index,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void IsFinite(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void IsNanInf(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t input_desc,
const void* input,
void* output);
static void Erf(const ExecutionContext& ctx,
cnnlComputationPreference_t prefer,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void Log1p(const ExecutionContext& ctx,
cnnlComputationPreference_t prefer,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void LogicalNot(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void DynamicStitch(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t* indices_desc,
const int** indices,
const cnnlTensorDescriptor_t* data_desc,
const void** data,
const int size,
int* indices_dims,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void CropAndResize(const ExecutionContext& ctx,
const std::string method_name,
const float extrapolation_value,
const cnnlTensorDescriptor_t image_desc,
const void* image,
const cnnlTensorDescriptor_t boxes_desc,
const void* boxes,
const cnnlTensorDescriptor_t box_index_desc,
const void* box_index,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void CropAndResizeBackwardImage(
const ExecutionContext& ctx,
const std::string method_name,
const cnnlTensorDescriptor_t image_desc,
const void* image,
const cnnlTensorDescriptor_t boxes_desc,
const void* boxes,
const cnnlTensorDescriptor_t box_idx_desc,
const void* box_idx,
const cnnlTensorDescriptor_t grads_image_desc,
void* grads_image);
static void CropAndResizeBackwardBoxes(
const ExecutionContext& ctx,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t image_desc,
const void* image,
const cnnlTensorDescriptor_t boxes_desc,
const void* boxes,
const cnnlTensorDescriptor_t box_idx_desc,
const void* box_idx,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void PoolingBackward(const ExecutionContext& ctx,
const cnnlPoolingDescriptor_t pooling_desc,
const void* alpha,
const cnnlTensorDescriptor_t y_desc,
const void* y,
const cnnlTensorDescriptor_t diff_y_desc,
const void* diff_y,
const cnnlTensorDescriptor_t x_desc,
const void* x,
const void* beta,
const cnnlTensorDescriptor_t diff_x_desc,
void* diff_x);
static void AdaptivePoolingBackward(const ExecutionContext& ctx,
const cnnlPoolingMode_t pool_mode,
const cnnlTensorDescriptor_t y_desc,
const void* y,
const cnnlTensorDescriptor_t index_desc,
const void* index,
const cnnlTensorDescriptor_t diff_x_desc,
void* diff_x);
static void PoolingIndex(const ExecutionContext& ctx,
const cnnlPoolingDescriptor_t pooling_desc,
const cnnlTensorDescriptor_t x_desc,
const void* x,
const cnnlTensorDescriptor_t y_desc,
void* y);
static void SpaceToBatch(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output,
const int64_t block_shape[]);
static void SpaceToBatchNd(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t input_desc,
const void* input,
cnnlSpaceBatchNdDescriptor_t param,
void* extra_device_input,
size_t extra_input_size,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void Interp(const ExecutionContext& ctx,
const cnnlInterpMode_t mode,
const bool align_corners,
const bool half_pixel_centers,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void InterpBackward(const ExecutionContext& ctx,
const cnnlInterpBackwardMode_t mode,
const bool align_corners,
const bool half_pixel_centers,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void QuantizeParam(const ExecutionContext& ctx,
const cnnlQuantizeMode_t mode,
const int bitwidth,
const cnnlTensorDescriptor_t input_desc,
const void* input,
void* position,
void* scale,
void* offset);
static void QuantizeMatMul(const ExecutionContext& ctx,
const bool transpose_a,
const bool transpose_b,
const cnnlTensorDescriptor_t a_desc,
const void* a,
const void* a_position,
const void* a_scale,
const void* a_offset,
const cnnlTensorDescriptor_t b_desc,
const void* b,
const void* b_position,
const void* b_scale,
const void* b_offset,
const cnnlDataType_t quant_type,
const cnnlDataType_t data_type,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void QuantizeBatchMatMul(const ExecutionContext& ctx,
const bool adj_x,
const bool adj_y,
const cnnlTensorDescriptor_t a_desc,
const void* a,
const void* a_position,
const void* a_scale,
const void* a_offset,
const cnnlTensorDescriptor_t b_desc,
const void* b,
const void* b_position,
const void* b_scale,
const void* b_offset,
const cnnlDataType_t quant_type,
const cnnlDataType_t data_type,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void QuantizeBatchMatMulBCast(const ExecutionContext& ctx,
const bool adj_x,
const bool adj_y,
const cnnlTensorDescriptor_t a_desc,
const void* a,
const void* a_position,
const void* a_scale,
const void* a_offset,
const cnnlTensorDescriptor_t b_desc,
const void* b,
const void* b_position,
const void* b_scale,
const void* b_offset,
const cnnlDataType_t quant_type,
const cnnlDataType_t data_type,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void FusedBatchNorm(const ExecutionContext& ctx,
const bool is_training,
const cnnlTensorDescriptor_t x_desc,
const void* x,
const cnnlTensorDescriptor_t scale_desc,
const void* scale,
const void* offset,
const void* estimated_mean,
const void* estimated_variance,
float epsilon,
float momentum,
const cnnlTensorDescriptor_t output_desc,
void* output,
void* batch_mean,
void* batch_var,
void* saved_mean,
void* saved_var);
static void FusedBatchNormGrad(const ExecutionContext& ctx,
const bool is_training,
const cnnlTensorDescriptor_t y_backprop_desc,
const void* y_backprop,
const cnnlTensorDescriptor_t x_desc,
const void* x,
const cnnlTensorDescriptor_t scale_desc,
const void* scale,
const void* saved_mean,
const void* saved_var,
float epsilon,
const cnnlTensorDescriptor_t x_backprop_desc,
void* x_backprop,
void* scale_backprop,
void* offset_backprop);
static void LayerNormForward(const ExecutionContext& ctx,
int axis,
const cnnlTensorDescriptor_t x_desc,
const void* x,
const cnnlTensorDescriptor_t weight_bias_desc,
const void* weight,
const void* bias,
float eps,
const cnnlTensorDescriptor_t y_desc,
void* y,
const cnnlTensorDescriptor_t mean_rstd_desc,
void* saved_mean,
void* saved_rstd);
static void LayerNormBackward(const ExecutionContext& ctx,
int axis,
const cnnlTensorDescriptor_t x_desc,
const void* x,
const cnnlTensorDescriptor_t diff_z_desc,
const void* diff_z,
const cnnlTensorDescriptor_t weight_bias_desc,
const void* weight,
const cnnlTensorDescriptor_t mean_rstd_desc,
const void* saved_mean,
const void* saved_rstd,
const cnnlTensorDescriptor_t diff_x_desc,
void* diff_x,
void* diff_weight,
void* diff_bias);
static void Transpose(const ExecutionContext& ctx,
const std::vector<int> perm,
const int input_dim,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void TrilTriu(const ExecutionContext& ctx,
const int diagonal_k,
const bool tri_up_mode,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void MatrixBandPart(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t data_desc,
const void* input,
const int num_lower,
const int num_upper,
void* output);
static void NumTrue(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t x_desc,
const void* x,
const cnnlTensorDescriptor_t num_true_desc,
void* num_true);
static void Where(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t x_desc,
const void* x,
const cnnlTensorDescriptor_t num_true_desc,
const void* num_true,
const bool as_tuple,
const cnnlTensorDescriptor_t y_desc,
void* y);
static void Conv2D(const ExecutionContext& ctx,
const cnnlConvolutionDescriptor_t conv_desc,
const cnnlDataType_t tensor_dtype,
const cnnlDataType_t dt_onchip,
const void* input_position,
const void* input_scale,
const void* input_offset,
const void* filter_position,
const void* filter_scale,
const void* filter_offset,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t filter_desc,
const void* filter,
const cnnlTensorDescriptor_t bias_desc,
const void* bias,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void ConvBackpropInput(const ExecutionContext& ctx,
const cnnlConvolutionDescriptor_t conv_desc,
const cnnlTensorDescriptor_t filter_desc,
const void* filter,
const cnnlTensorDescriptor_t out_backprop_desc,
const void* out_backprop,
const cnnlTensorDescriptor_t in_backprop_desc,
void* in_backprop);
static void QuantizeConvBackpropInput(
const ExecutionContext& ctx,
const cnnlConvolutionDescriptor_t conv_desc,
const cnnlDataType_t tensor_dtype,
const cnnlDataType_t dt_onchip,
const void* filter_position,
const void* filter_scale,
const void* filter_offset,
const void* out_backprop_position,
const void* out_backprop_scale,
const void* out_backprop_offset,
const cnnlTensorDescriptor_t input_desc,
const void* filter,
const cnnlTensorDescriptor_t out_backprop_desc,
const void* out_backprop,
const cnnlTensorDescriptor_t in_backprop_desc,
void* in_backprop);
static void ConvBackpropFilter(
const ExecutionContext& ctx,
const cnnlConvolutionDescriptor_t conv_desc,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t out_backprop_desc,
const void* out_backprop,
const cnnlTensorDescriptor_t filter_backprop_desc,
void* filter_backprop);
static void QuantizeConvBackpropFilter(
const ExecutionContext& ctx,
const cnnlConvolutionDescriptor_t conv_desc,
const cnnlDataType_t tensor_dtype,
const cnnlDataType_t dt_onchip,
const void* input_position,
const void* input_scale,
const void* input_offset,
const void* out_backprop_position,
const void* out_backprop_scale,
const void* out_backprop_offset,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t out_backprop_desc,
const void* out_backprop,
const cnnlTensorDescriptor_t filter_backprop_desc,
void* filter_backprop);
static void DCNForward(const ExecutionContext& ctx,
const cnnlDCNDescriptor_t dcn_desc,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t offset_desc,
const void* offset,
const cnnlTensorDescriptor_t mask_desc,
const void* mask,
const cnnlTensorDescriptor_t weight_desc,
const void* weight,
const cnnlTensorDescriptor_t bias_desc,
const void* bias,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void DCNBackwardData(const ExecutionContext& ctx,
const cnnlDCNDescriptor_t dcn_desc,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t offset_desc,
const void* offset,
const cnnlTensorDescriptor_t mask_desc,
const void* mask,
const cnnlTensorDescriptor_t weight_desc,
const void* weight,
const cnnlTensorDescriptor_t grad_output_desc,
const void* grad_output,
const cnnlTensorDescriptor_t grad_input_desc,
void* grad_input,
const cnnlTensorDescriptor_t grad_offset_desc,
void* grad_offset,
const cnnlTensorDescriptor_t grad_mask_desc,
void* grad_mask);
static void DCNBackwardWeight(const ExecutionContext& ctx,
const cnnlDCNDescriptor_t dcn_desc,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t offset_desc,
const void* offset,
const cnnlTensorDescriptor_t mask_desc,
const void* mask,
const cnnlTensorDescriptor_t grad_output_desc,
const void* grad_output,
const cnnlTensorDescriptor_t grad_weight_desc,
void* grad_weight,
const cnnlTensorDescriptor_t grad_bias_desc,
void* grad_bias);
static void InTopK(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t predictions_desc,
const void* predictions,
const cnnlTensorDescriptor_t targets_desc,
const void* targets,
const cnnlTensorDescriptor_t k_desc,
const void* k,
const int k_int,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void ScatterNd(const ExecutionContext& ctx,
cnnlScatterNdMode_t mode,
const cnnlTensorDescriptor_t indices_desc,
const void* indices,
const cnnlTensorDescriptor_t updates_desc,
const void* updates,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void BitWise(const ExecutionContext& ctx,
const cnnlBitComputeOp_t optype,
const cnnlTensorDescriptor_t input1_desc,
const void* input1,
const cnnlTensorDescriptor_t input2_desc,
const void* input2,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void QR(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t a_desc,
const void* a,
const cnnlTensorDescriptor_t q_desc,
void* q,
const cnnlTensorDescriptor_t r_desc,
void* r,
const bool some);
static void Reciprocal(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void BceLoss(const ExecutionContext& ctx,
const cnnlBceLossReduction_t reduction,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t target_desc,
const void* target,
const cnnlTensorDescriptor_t weight_desc,
const void* weight,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void BceLossBackward(const ExecutionContext& ctx,
const cnnlBceLossReduction_t reduction,
const cnnlTensorDescriptor_t grad_desc,
const void* grad,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t target_desc,
const void* target,
const cnnlTensorDescriptor_t weight_desc,
const void* weight,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void SmoothL1LossForward(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t x_desc,
const void* x,
const cnnlTensorDescriptor_t t_desc,
const void* target,
const float beta,
const cnnlSmoothL1LossAlgorithm_t algorithm,
const cnnlTensorDescriptor_t y_desc,
void* y);
static void SmoothL1LossBackward(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t x_desc,
const void* x,
const cnnlTensorDescriptor_t target_desc,
const void* target,
const cnnlTensorDescriptor_t dy_desc,
const void* dy,
const float beta,
const cnnlSmoothL1LossAlgorithm_t algorithm,
const cnnlTensorDescriptor_t dx_desc,
void* dx);
static void EmbeddingForward(const ExecutionContext& ctx,
const int padding_idx,
const cnnlTensorDescriptor_t weight_desc,
const void* weight,
const cnnlTensorDescriptor_t indices_desc,
const int* indices,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void RNNForward(const ExecutionContext& ctx,
const cnnlRNNDescriptor_t rnn_desc,
const int dev_seq_lengths[],
const void* weight_param_ptr,
size_t weightspace_size,
const cnnlSeqDataDescriptor_t x_desc,
const void* x,
const cnnlSeqDataDescriptor_t y_desc,
void* y,
const cnnlTensorDescriptor_t h_desc,
const void* hx,
void* hy,
const cnnlTensorDescriptor_t c_desc,
const void* cx,
void* cy,
void* reservespace_ptr);
static void RNNBackward(const ExecutionContext& ctx,
const cnnlRNNDescriptor_t rnn_desc,
cnnlWgradMode_t add_grad,
const int dev_seq_lengths[],
const void* weight_param_ptr,
void* dweight_param_ptr,
size_t weightspace_size,
const cnnlSeqDataDescriptor_t x_desc,
const void* x,
void* dx,
const cnnlSeqDataDescriptor_t y_desc,
const void* y,
const void* dy,
const cnnlTensorDescriptor_t hx_desc,
const void* hx,
const void* dhy,
void* dhx,
const cnnlTensorDescriptor_t cx_desc,
const void* cx,
const void* dcy,
void* dcx,
void* reservespace_ptr,
size_t reservespace_size);
static void Mask(const ExecutionContext& ctx,
cnnlMaskedOp_t masked_mode,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t masked_desc,
const void* masked,
const cnnlTensorDescriptor_t value_desc,
const void* value,
const cnnlTensorDescriptor_t output_desc,
void* output,
uint32_t* number);
static void Transform(const ExecutionContext& ctx,
const void* alpha,
const void* beta,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void EmbeddingBackward(const ExecutionContext& ctx,
int padding_idx,
bool scale_grad_by_freq,
const cnnlTensorDescriptor_t indices_desc,
const void* indices,
const cnnlTensorDescriptor_t diff_desc,
const void* diff,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void BceWithLogits(const ExecutionContext& ctx,
cnnlBceWithLogitsReduction_t reduction,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t target_desc,
const void* target,
const cnnlTensorDescriptor_t weight_desc,
const void* weight,
const cnnlTensorDescriptor_t pos_weight_desc,
const void* pos_weight,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void BceWithLogitsBackward(
const ExecutionContext& ctx,
cnnlBceWithLogitsReduction_t reduction,
const cnnlTensorDescriptor_t grad_desc,
const void* grad,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t target_desc,
const void* target,
const cnnlTensorDescriptor_t weight_desc,
const void* weight,
const cnnlTensorDescriptor_t pos_weight_desc,
const void* pos_weight,
const cnnlTensorDescriptor_t diff_input_desc,
void* diff_input);
static void RoiAlign(const ExecutionContext& ctx,
const int pooled_height,
const int pooled_width,
const int sampling_ratio,
const float spatial_scale,
const bool aligned,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t boxes_desc,
const void* boxes,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void RoiAlignBackward(const ExecutionContext& ctx,
const int sampling_ratio,
const float spatial_scale,
const bool aligned,
const cnnlTensorDescriptor_t grads_desc,
const void* grads,
const cnnlTensorDescriptor_t boxes_desc,
const void* boxes,
const cnnlTensorDescriptor_t grads_image_desc,
void* grads_image);
static void GridSample(const ExecutionContext& ctx,
const cnnlGridSampleDescriptor_t grid_sample_desc,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t grid_desc,
const void* grid,
const cnnlTensorDescriptor_t output_desc,
void* output);
static void SyncBatchNormStats(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t x_desc,
const void* x,
const float eps,
const cnnlTensorDescriptor_t mean_desc,
void* mean,
const cnnlTensorDescriptor_t invstd_desc,
void* invstd);
static void SyncBatchNormGatherStatsWithCounts(
const ExecutionContext& ctx,
float momentum,
float eps,
const cnnlTensorDescriptor_t mean_all_desc,
const void* mean_all,
const cnnlTensorDescriptor_t invstd_all_desc,
const void* invstd_all,
const cnnlTensorDescriptor_t moving_mean_desc,
void* moving_mean,
const cnnlTensorDescriptor_t moving_var_desc,
void* moving_var,
const cnnlTensorDescriptor_t count_all_desc,
const void* count_all,
const cnnlTensorDescriptor_t mean_desc,
void* mean,
const cnnlTensorDescriptor_t invstd_desc,
void* invstd);
static void SyncBatchNormElemt(const ExecutionContext& ctx,
const cnnlTensorDescriptor_t x_desc,
const void* x,
const cnnlTensorDescriptor_t mean_desc,
const void* mean,
const cnnlTensorDescriptor_t invstd_desc,
const void* invstd,
const cnnlTensorDescriptor_t weight_desc,
const void* weight,
const cnnlTensorDescriptor_t bias_desc,
const void* bias,
const cnnlTensorDescriptor_t y_desc,
void* y);
static void SyncBatchnormBackwardReduce(
const ExecutionContext& ctx,
const cnnlTensorDescriptor_t desc_dz,
const void* dz,
const cnnlTensorDescriptor_t desc_x,
const void* x,
const cnnlTensorDescriptor_t desc_mean,
const void* mean,
const cnnlTensorDescriptor_t desc_invstd,
const void* invstd,
const cnnlTensorDescriptor_t desc_dweight,
void* dweight,
const cnnlTensorDescriptor_t desc_dbias,
void* dbias,
const cnnlTensorDescriptor_t desc_sum_dy,
void* sum_dy,
const cnnlTensorDescriptor_t desc_sum_dy_xmu,
void* sum_dy_xmu,
const bool needs_input_grad0,
const bool needs_input_grad1,
const bool needs_input_grad2);
static void SyncBatchNormBackwardElemt(
const ExecutionContext& ctx,
const cnnlTensorDescriptor_t diff_y_desc,
const void* diff_y,
const cnnlTensorDescriptor_t x_desc,
const void* x,
const cnnlTensorDescriptor_t mean_desc,
const void* mean,
const cnnlTensorDescriptor_t invstd_desc,
const void* invstd,
const cnnlTensorDescriptor_t weight_desc,
const void* weight,
const cnnlTensorDescriptor_t sum_dy_desc,
const void* sum_dy,
const cnnlTensorDescriptor_t sum_dy_xmu_desc,
const void* sum_dy_xmu,
const cnnlTensorDescriptor_t count_desc,
const void* count,
const cnnlTensorDescriptor_t diff_x_desc,
void* diff_x);
};
class MLUOP {
public:
static void OpYoloBox(const ExecutionContext& ctx,
const mluOpTensorDescriptor_t x_desc,
const void* x,
const mluOpTensorDescriptor_t img_size_desc,
const void* img_size,
const mluOpTensorDescriptor_t anchors_desc,
const void* anchors,
const int class_num,
const float conf_thresh,
const int downsample_ratio,
const bool clip_bbox,
const float scale,
const bool iou_aware,
const float iou_aware_factor,
const mluOpTensorDescriptor_t boxes_desc,
void* boxes,
const mluOpTensorDescriptor_t scores_desc,
void* scores);
static void OpPriorBox(const ExecutionContext& ctx,
const mluOpTensorDescriptor_t min_sizes_desc,
const void* min_sizes,
const mluOpTensorDescriptor_t aspect_ratios_desc,
const void* aspect_ratios,
const mluOpTensorDescriptor_t variances_desc,
const void* variances,
const mluOpTensorDescriptor_t max_sizes_desc,
const void* max_sizes,
const int height,
const int width,
const int im_height,
const int im_width,
const float step_h,
const float step_w,
const float offset,
const bool clip,
const bool min_max_aspect_ratios_order,
const mluOpTensorDescriptor_t output_desc,
void* output,
const mluOpTensorDescriptor_t var_desc,
void* var);
};
const std::map<const std::string, std::pair<std::vector<int>, std::vector<int>>>
TransPermMap = {
// trans_mode, (forward_perm, backward_perm)
{"3D_NCHW2NHWC", {{0, 2, 1}, {0, 2, 1}}},
{"4D_NCHW2NHWC", {{0, 2, 3, 1}, {0, 3, 1, 2}}},
{"5D_NCHWD2NDHWC", {{0, 4, 2, 3, 1}, {0, 4, 2, 3, 1}}},
{"5D_NHWDC2NDHWC", {{0, 3, 1, 2, 4}, {0, 2, 3, 4, 1}}}};
inline void SetMLUTransposePerm(const framework::DDim& dims,
const DataLayout& data_layout,
std::vector<int>* forward_perm,
std::vector<int>* backward_perm,
std::vector<int>* out_shape) {
const int dim_size = dims.size();
PADDLE_ENFORCE_EQ((dim_size >= 3) && (dim_size <= 5),
true,
platform::errors::InvalidArgument(
"MLUTransposePerm func only support (dim_size >= 3) && "
"(dim_size <= 5), but now dim_size is %d.",
dim_size));
PADDLE_ENFORCE_EQ(
(data_layout == DataLayout::kNCHW) || (data_layout == DataLayout::kNHWC),
true,
platform::errors::InvalidArgument(
"MLUTransposePerm func only support DataLayout: kNCHW or kNHWC, but "
"now data_layout is %s.",
data_layout));
// case 1: Paddle's NCHW layout must be permuted to MLU's NHWC when dims == 3 or 4
// case 2: Paddle's NHWDC and NCHWD layouts must be permuted to MLU's NDHWC when dims == 5
std::string map_key = "";
if (data_layout == DataLayout::kNCHW) {
switch (dim_size) {
case 3:
map_key = "3D_NCHW2NHWC";
break;
case 4:
map_key = "4D_NCHW2NHWC";
break;
case 5:
map_key = "5D_NCHWD2NDHWC";
break;
}
} else if (data_layout == DataLayout::kNHWC && dim_size == 5) {
map_key = "5D_NHWDC2NDHWC";
}
assert(map_key != "");
forward_perm->assign(TransPermMap.at(map_key).first.begin(),
TransPermMap.at(map_key).first.end());
backward_perm->assign(TransPermMap.at(map_key).second.begin(),
TransPermMap.at(map_key).second.end());
auto in_dims = phi::vectorize(dims);
for (size_t i = 0; i < in_dims.size(); i++) {
out_shape->push_back(in_dims[forward_perm->at(i)]);
}
}
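// Example (hypothetical shapes, for illustration only): for a 4-D NCHW input
// with dims = [2, 3, 4, 5], map_key is "4D_NCHW2NHWC", so forward_perm is
// {0, 2, 3, 1}, backward_perm is {0, 3, 1, 2}, and out_shape becomes
// [2, 4, 5, 3] (NHWC); applying backward_perm to that shape restores NCHW.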
template <typename T>
inline void TransposeFromMLUTensor(const ExecutionContext& ctx,
const std::vector<int> perm,
const phi::DenseTensor* transformed_input,
phi::DenseTensor* transformed_output,
bool need_reshape_or_alloc) {
const int dim_size = perm.size();
if (need_reshape_or_alloc) {
std::vector<int> output_shape;
auto input_dims = transformed_input->dims();
for (int i = 0; i < dim_size; ++i) {
output_shape.push_back(input_dims[perm[i]]);
}
transformed_output->mutable_data<T>(
framework::DDim(output_shape.data(), dim_size), ctx.GetPlace());
}
MLUCnnlTensorDesc trans_in_desc(
*transformed_input, CNNL_LAYOUT_ARRAY, ToCnnlDataType<T>());
MLUCnnlTensorDesc trans_out_desc(
*transformed_output, CNNL_LAYOUT_ARRAY, ToCnnlDataType<T>());
MLUCnnl::Transpose(ctx,
perm,
dim_size,
trans_in_desc.get(),
GetBasePtr(transformed_input),
trans_out_desc.get(),
GetBasePtr(transformed_output));
}
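// Typical call site (a sketch only; the tensor names are illustrative):
//   std::vector<int> perm = {0, 2, 3, 1};  // NCHW -> NHWC
//   TransposeFromMLUTensor<T>(ctx, perm, &input_tensor, &trans_input,
//                             true /*need_reshape_or_alloc*/);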
template <typename T>
inline void FillMLUTensorWithHostValue(const ExecutionContext& ctx,
T value,
phi::DenseTensor* out) {
MLUCnnlTensorDesc out_desc(*out);
MLUCnnl::Fill(
ctx, CNNL_POINTER_MODE_HOST, &value, out_desc.get(), GetBasePtr(out));
}
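// Example usage (as in the reduce_max_grad kernel below, where t_zero is a
// temporary tensor to be zero-filled):
//   FillMLUTensorWithHostValue<T>(ctx, static_cast<T>(0), &t_zero);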
} // namespace operators
} // namespace paddle
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/reduce_ops/reduce_op_mlu.h"
namespace paddle {
namespace operators {
template <typename T>
class AdamMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
const auto* param_var = ctx.InputVar("Param");
PADDLE_ENFORCE_EQ(param_var->IsType<phi::DenseTensor>(),
true,
platform::errors::InvalidArgument(
"The Var(%s)'s type should be phi::DenseTensor, "
"but the received is %s",
ctx.InputNames("Param").front(),
framework::ToTypeName(param_var->Type())));
auto* param = ctx.Input<phi::DenseTensor>("Param");
auto* grad_var = ctx.InputVar("Grad");
PADDLE_ENFORCE_EQ(grad_var->IsType<phi::DenseTensor>(),
true,
platform::errors::InvalidArgument(
"The Grad(%s)'s type should be phi::DenseTensor, "
"but the received is %s",
ctx.InputNames("Grad").front(),
framework::ToTypeName(grad_var->Type())));
auto* grad = ctx.Input<phi::DenseTensor>("Grad");
auto* mom1 = ctx.Input<phi::DenseTensor>("Moment1");
auto* mom2 = ctx.Input<phi::DenseTensor>("Moment2");
auto* lr = ctx.Input<phi::DenseTensor>("LearningRate");
auto* beta1_pow = ctx.Input<phi::DenseTensor>("Beta1Pow");
auto* beta2_pow = ctx.Input<phi::DenseTensor>("Beta2Pow");
auto* param_out = ctx.Output<phi::DenseTensor>("ParamOut");
auto* mom1_out = ctx.Output<phi::DenseTensor>("Moment1Out");
auto* mom2_out = ctx.Output<phi::DenseTensor>("Moment2Out");
auto* beta1_pow_out = ctx.Output<phi::DenseTensor>("Beta1PowOut");
auto* beta2_pow_out = ctx.Output<phi::DenseTensor>("Beta2PowOut");
bool skip_update = false;
if (ctx.HasInput("SkipUpdate")) {
auto* skip_update_tensor = ctx.Input<phi::DenseTensor>("SkipUpdate");
PADDLE_ENFORCE_EQ(skip_update_tensor->numel(),
1,
platform::errors::InvalidArgument(
"Input(SkipUpdate) size must be 1, but get %d",
skip_update_tensor->numel()));
std::vector<bool> skip_update_vec;
paddle::framework::TensorToVector(
*skip_update_tensor, ctx.device_context(), &skip_update_vec);
ctx.device_context().Wait();
skip_update = skip_update_vec[0];
}
// skip_update=true, just copy input to output, and TensorCopy will call
// mutable_data
if (skip_update) {
VLOG(4) << "Adam skip update";
framework::TensorCopy(
*param,
ctx.GetPlace(),
ctx.template device_context<platform::MLUDeviceContext>(),
param_out);
framework::TensorCopy(
*mom1,
ctx.GetPlace(),
ctx.template device_context<platform::MLUDeviceContext>(),
mom1_out);
framework::TensorCopy(
*mom2,
ctx.GetPlace(),
ctx.template device_context<platform::MLUDeviceContext>(),
mom2_out);
framework::TensorCopy(
*beta1_pow,
beta1_pow->place(),
ctx.template device_context<platform::MLUDeviceContext>(),
beta1_pow_out);
framework::TensorCopy(
*beta2_pow,
beta2_pow->place(),
ctx.template device_context<platform::MLUDeviceContext>(),
beta2_pow_out);
return;
}
bool use_global_beta_pow = ctx.Attr<bool>("use_global_beta_pow");
VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow;
param_out->ShareDataWith(*param);
mom1_out->ShareDataWith(*mom1);
mom2_out->ShareDataWith(*mom2);
phi::DenseTensor beta1_pow_tmp;
phi::DenseTensor beta2_pow_tmp;
if (beta1_pow->place() == platform::CPUPlace()) {
T beta1 = *beta1_pow->data<T>();
beta1_pow_tmp.mutable_data<T>({1}, ctx.GetPlace());
MLUCnnlTensorDesc beta1_pow_tmp_desc(beta1_pow_tmp);
MLUCnnl::Fill(ctx,
CNNL_POINTER_MODE_HOST,
&beta1,
beta1_pow_tmp_desc.get(),
GetBasePtr(&beta1_pow_tmp));
beta1_pow = &beta1_pow_tmp;
}
if (beta2_pow->place() == platform::CPUPlace()) {
T beta2 = *beta2_pow->data<T>();
beta2_pow_tmp.mutable_data<T>({1}, ctx.GetPlace());
MLUCnnlTensorDesc beta2_pow_tmp_desc(beta2_pow_tmp);
MLUCnnl::Fill(ctx,
CNNL_POINTER_MODE_HOST,
&beta2,
beta2_pow_tmp_desc.get(),
GetBasePtr(&beta2_pow_tmp));
beta2_pow = &beta2_pow_tmp;
}
VLOG(3) << "beta1_pow.numel() : " << beta1_pow->numel()
<< "beta2_pow.numel() : " << beta2_pow->numel();
VLOG(3) << "param.numel(): " << param->numel();
PADDLE_ENFORCE_EQ(beta1_pow_out->numel(),
1,
platform::errors::InvalidArgument(
"beta1 pow output size should be 1, but received "
"value is:%d.",
beta1_pow_out->numel()));
PADDLE_ENFORCE_EQ(beta2_pow_out->numel(),
1,
platform::errors::InvalidArgument(
"beta2 pow output size should be 1, but received "
"value is:%d.",
beta2_pow_out->numel()));
const phi::DenseTensor* beta1_tensor = nullptr;
const phi::DenseTensor* beta2_tensor = nullptr;
const phi::DenseTensor* epsilon_tensor = nullptr;
phi::DenseTensor beta1_tmp(phi::DataType::FLOAT32);
phi::DenseTensor beta2_tmp(phi::DataType::FLOAT32);
phi::DenseTensor epsilon_tmp(phi::DataType::FLOAT32);
if (ctx.HasInput("Beta1Tensor")) {
beta1_tensor = ctx.Input<phi::DenseTensor>("Beta1Tensor");
PADDLE_ENFORCE_EQ(beta1_tensor->numel(),
1,
platform::errors::InvalidArgument(
"Input(Beta1Tensor) size must be 1, but get %d",
beta1_tensor->numel()));
} else {
T beta1 = static_cast<T>(ctx.Attr<float>("beta1"));
beta1_tmp.mutable_data<T>({1}, ctx.GetPlace());
MLUCnnlTensorDesc beta1_tmp_desc(beta1_tmp);
MLUCnnl::Fill(ctx,
CNNL_POINTER_MODE_HOST,
&beta1,
beta1_tmp_desc.get(),
GetBasePtr(&beta1_tmp));
beta1_tensor = &beta1_tmp;
}
if (ctx.HasInput("Beta2Tensor")) {
beta2_tensor = ctx.Input<phi::DenseTensor>("Beta2Tensor");
PADDLE_ENFORCE_EQ(beta2_tensor->numel(),
1,
platform::errors::InvalidArgument(
"Input(Beta2Tensor) size must be 1, but get %d",
beta2_tensor->numel()));
} else {
T beta2 = static_cast<T>(ctx.Attr<float>("beta2"));
beta2_tmp.mutable_data<T>({1}, ctx.GetPlace());
MLUCnnlTensorDesc beta2_tmp_desc(beta2_tmp);
MLUCnnl::Fill(ctx,
CNNL_POINTER_MODE_HOST,
&beta2,
beta2_tmp_desc.get(),
GetBasePtr(&beta2_tmp));
beta2_tensor = &beta2_tmp;
}
if (ctx.HasInput("EpsilonTensor")) {
epsilon_tensor = ctx.Input<phi::DenseTensor>("EpsilonTensor");
PADDLE_ENFORCE_EQ(epsilon_tensor->numel(),
1,
platform::errors::InvalidArgument(
"Input(EpsilonTensor) size must be 1, but get %d",
epsilon_tensor->numel()));
} else {
T epsilon = static_cast<T>(ctx.Attr<float>("epsilon"));
epsilon_tmp.mutable_data<T>({1}, ctx.GetPlace());
MLUCnnlTensorDesc epsilon_tmp_desc(epsilon_tmp);
MLUCnnl::Fill(ctx,
CNNL_POINTER_MODE_HOST,
&epsilon,
epsilon_tmp_desc.get(),
GetBasePtr(&epsilon_tmp));
epsilon_tensor = &epsilon_tmp;
}
MLUCnnlTensorDesc param_desc(*param);
MLUCnnlTensorDesc mom1_desc(*mom1);
MLUCnnlTensorDesc mom2_desc(*mom2);
MLUCnnlTensorDesc grad_desc(*grad);
MLUCnnl::ApplyAdam(ctx,
param_desc.get(),
GetBasePtr(param_out),
mom1_desc.get(),
GetBasePtr(mom1_out),
mom2_desc.get(),
GetBasePtr(mom2_out),
grad_desc.get(),
GetBasePtr(grad),
GetBasePtr(lr),
GetBasePtr(beta1_tensor),
GetBasePtr(beta2_tensor),
GetBasePtr(beta1_pow),
GetBasePtr(beta2_pow),
GetBasePtr(epsilon_tensor),
/*use_nesterov*/ false);
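// Unless beta pow is managed globally (use_global_beta_pow), advance the
// accumulators on device: beta1_pow_out = beta1_pow * beta1 and
// beta2_pow_out = beta2_pow * beta2.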
if (!use_global_beta_pow) {
beta1_pow_out->mutable_data<T>(ctx.GetPlace());
beta2_pow_out->mutable_data<T>(ctx.GetPlace());
MLUCnnlTensorDesc beta1_desc(*beta1_tensor);
MLUCnnlOpTensorDesc mul_op_desc(
CNNL_OP_TENSOR_MUL, ToCnnlDataType<T>(), CNNL_NOT_PROPAGATE_NAN);
MLUCnnl::OpTensor(ctx,
mul_op_desc.get(),
beta1_desc.get(),
GetBasePtr(beta1_pow),
beta1_desc.get(),
GetBasePtr(beta1_tensor),
beta1_desc.get(),
GetBasePtr(beta1_pow_out),
ToCnnlDataType<T>());
MLUCnnl::OpTensor(ctx,
mul_op_desc.get(),
beta1_desc.get(),
GetBasePtr(beta2_pow),
beta1_desc.get(),
GetBasePtr(beta2_tensor),
beta1_desc.get(),
GetBasePtr(beta2_pow_out),
ToCnnlDataType<T>());
}
}
};
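// AdamW extends Adam with decoupled weight decay. When with_decay is true and
// the update is not skipped, the non-multi_precision path first scales the
// parameter as param <- param - lr * coeff * param (the OpTensor(MUL) call
// below) and then runs AdamMLUKernel::Compute; the multi_precision path casts
// MasterParam to Param before the update and casts ParamOut back to
// MasterParamOut afterwards.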
template <typename T>
class AdamWMLUKernel : public AdamMLUKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
VLOG(3) << "MLU AdamW Kernel";
bool skip_update = false;
if (ctx.HasInput("SkipUpdate")) {
VLOG(3) << "Has SkipUpdate";
auto* skip_update_tensor = ctx.Input<phi::DenseTensor>("SkipUpdate");
PADDLE_ENFORCE_EQ(skip_update_tensor->numel(),
1,
platform::errors::InvalidArgument(
"Input(SkipUpdate) size must be 1, but get %d",
skip_update_tensor->numel()));
std::vector<bool> skip_update_vec;
paddle::framework::TensorToVector(
*skip_update_tensor, ctx.device_context(), &skip_update_vec);
ctx.device_context().Wait();
skip_update = skip_update_vec[0];
}
bool with_decay = ctx.Attr<bool>("with_decay");
const bool multi_precision = ctx.Attr<bool>("multi_precision");
auto* param_out = ctx.Output<phi::DenseTensor>("ParamOut");
auto* master_param_out = ctx.Output<phi::DenseTensor>("MasterParamOut");
const auto* master_param = ctx.Input<phi::DenseTensor>("MasterParam");
VLOG(3) << "Skip update: " << skip_update << ", With decay: " << with_decay;
if (!skip_update && with_decay) {
auto* param = ctx.Input<phi::DenseTensor>("Param");
MLUCnnlTensorDesc param_desc(*param);
if (multi_precision) {
VLOG(3) << "[adamw] multi_precision, cast masterparam to param.";
bool has_master =
ctx.HasInput("MasterParam") && ctx.HasOutput("MasterParamOut");
PADDLE_ENFORCE_EQ(
has_master,
true,
platform::errors::InvalidArgument(
"The Input(MasterParam) and Output(MasterParamOut) "
"should not be null when "
"the attr `multi_precision` is true"));
// cast masterparam (fp32) to param (fp16), then paramout (fp16) to
// masterparamout (fp32)
MLUCnnlTensorDesc master_param_desc(*master_param);
cnnlCastDataType_t cast_type = GetCastDataType(
framework::TransToProtoVarType(master_param->dtype()),
framework::TransToProtoVarType(param->dtype()));
MLUCnnl::Cast(ctx,
cast_type,
master_param_desc.get(),
GetBasePtr(master_param),
param_desc.get(),
const_cast<void*>(GetBasePtr(param)));
} else {
const auto* param_var = ctx.InputVar("Param");
PADDLE_ENFORCE_EQ(param_var->IsType<phi::DenseTensor>(),
true,
platform::errors::InvalidArgument(
"The Var(%s)'s type should be phi::DenseTensor, "
"but the received is %s",
ctx.InputNames("Param").front(),
framework::ToTypeName(param_var->Type())));
auto* lr = ctx.Input<phi::DenseTensor>("LearningRate");
float coeff = ctx.Attr<float>("coeff");
// update param with decay coeff: mul(-1 * lr, coeff * param) + param
MLUCnnlTensorDesc lr_desc(*lr);
MLUCnnlOpTensorDesc mul_op_desc(
CNNL_OP_TENSOR_MUL, ToCnnlDataType<T>(), CNNL_NOT_PROPAGATE_NAN);
MLUCnnl::OpTensor(ctx,
mul_op_desc.get(),
lr_desc.get(),
GetBasePtr(lr),
param_desc.get(),
GetBasePtr(param),
param_desc.get(),
const_cast<void*>(GetBasePtr(param)),
ToCnnlDataType<T>(),
/*alpha1*/ -1.f,
/*alpha2*/ coeff,
/*beta*/ 1.f);
}
}
AdamMLUKernel<T>::Compute(ctx);
if (multi_precision) {
VLOG(3) << "[adamw] multi_precision, cast paramout to masterparamout.";
// cast paramout to masterparamout
master_param_out->mutable_data<float>(ctx.GetPlace());
cnnlCastDataType_t cast_type = GetCastDataType(
framework::TransToProtoVarType(param_out->dtype()),
framework::TransToProtoVarType(master_param_out->dtype()));
MLUCnnlTensorDesc param_out_desc(*param_out);
MLUCnnlTensorDesc master_param_out_desc(*master_param_out);
MLUCnnl::Cast(ctx,
cast_type,
param_out_desc.get(),
GetBasePtr(param_out),
master_param_out_desc.get(),
GetBasePtr(master_param_out));
}
}
};
template <typename T>
class MergedAdamMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
// Get inputs and outputs
auto params = ctx.MultiInput<phi::DenseTensor>("Param");
auto grads = ctx.MultiInput<phi::DenseTensor>("Grad");
auto lrs = ctx.MultiInput<phi::DenseTensor>("LearningRate");
auto mom1s = ctx.MultiInput<phi::DenseTensor>("Moment1");
auto mom2s = ctx.MultiInput<phi::DenseTensor>("Moment2");
auto beta1_pows = ctx.MultiInput<phi::DenseTensor>("Beta1Pow");
auto beta2_pows = ctx.MultiInput<phi::DenseTensor>("Beta2Pow");
auto master_params = ctx.MultiInput<phi::DenseTensor>("MasterParam");
auto param_outs = ctx.MultiOutput<phi::DenseTensor>("ParamOut");
auto mom1_outs = ctx.MultiOutput<phi::DenseTensor>("Moment1Out");
auto mom2_outs = ctx.MultiOutput<phi::DenseTensor>("Moment2Out");
auto beta1_pow_outs = ctx.MultiOutput<phi::DenseTensor>("Beta1PowOut");
auto beta2_pow_outs = ctx.MultiOutput<phi::DenseTensor>("Beta2PowOut");
// Check validation of inputs and outputs
size_t param_num = params.size();
PADDLE_ENFORCE_EQ(param_num,
param_outs.size(),
platform::errors::InvalidArgument(
"The size of Output(ParamOut) must be equal to "
"Input(Param), but got the size of Output(ParamOut) "
"is %d, the size of Input(Param) is %d.",
param_outs.size(),
param_num));
bool skip_update = false;
if (ctx.HasInput("SkipUpdate")) {
auto* skip_update_tensor = ctx.Input<phi::DenseTensor>("SkipUpdate");
PADDLE_ENFORCE_EQ(skip_update_tensor->numel(),
1,
platform::errors::InvalidArgument(
"Input(SkipUpdate) size must be 1, but get %d",
skip_update_tensor->numel()));
std::vector<bool> skip_update_vec;
paddle::framework::TensorToVector(
*skip_update_tensor, ctx.device_context(), &skip_update_vec);
ctx.device_context().Wait();
skip_update = skip_update_vec[0];
}
// skip_update=true, just copy input to output, and TensorCopy will call
// mutable_data
if (skip_update) {
VLOG(4) << "MergedAdam skip update";
for (size_t i = 0; i < param_num; ++i) {
framework::TensorCopy(
*params[i],
ctx.GetPlace(),
ctx.template device_context<platform::MLUDeviceContext>(),
param_outs[i]);
framework::TensorCopy(
*mom1s[i],
ctx.GetPlace(),
ctx.template device_context<platform::MLUDeviceContext>(),
mom1_outs[i]);
framework::TensorCopy(
*mom2s[i],
ctx.GetPlace(),
ctx.template device_context<platform::MLUDeviceContext>(),
mom2_outs[i]);
framework::TensorCopy(
*beta1_pows[i],
beta1_pows[i]->place(),
ctx.template device_context<platform::MLUDeviceContext>(),
beta1_pow_outs[i]);
framework::TensorCopy(
*beta2_pows[i],
beta2_pows[i]->place(),
ctx.template device_context<platform::MLUDeviceContext>(),
beta2_pow_outs[i]);
}
return;
}
bool use_global_beta_pow = ctx.Attr<bool>("use_global_beta_pow");
VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow;
// Get beta1, beta2 and epsilon from attribute.
const phi::DenseTensor* beta1_tensor = nullptr;
const phi::DenseTensor* beta2_tensor = nullptr;
const phi::DenseTensor* epsilon_tensor = nullptr;
phi::DenseTensor beta1_tmp(phi::DataType::FLOAT32);
phi::DenseTensor beta2_tmp(phi::DataType::FLOAT32);
phi::DenseTensor epsilon_tmp(phi::DataType::FLOAT32);
T beta1 = static_cast<T>(ctx.Attr<float>("beta1"));
T beta2 = static_cast<T>(ctx.Attr<float>("beta2"));
T epsilon = static_cast<T>(ctx.Attr<float>("epsilon"));
beta1_tmp.mutable_data<T>({1}, ctx.GetPlace());
beta2_tmp.mutable_data<T>({1}, ctx.GetPlace());
epsilon_tmp.mutable_data<T>({1}, ctx.GetPlace());
MLUCnnlTensorDesc beta1_tmp_desc(beta1_tmp);
MLUCnnlTensorDesc beta2_tmp_desc(beta2_tmp);
MLUCnnlTensorDesc epsilon_tmp_desc(epsilon_tmp);
MLUCnnl::Fill(ctx,
CNNL_POINTER_MODE_HOST,
&beta1,
beta1_tmp_desc.get(),
GetBasePtr(&beta1_tmp));
MLUCnnl::Fill(ctx,
CNNL_POINTER_MODE_HOST,
&beta2,
beta2_tmp_desc.get(),
GetBasePtr(&beta2_tmp));
MLUCnnl::Fill(ctx,
CNNL_POINTER_MODE_HOST,
&epsilon,
epsilon_tmp_desc.get(),
GetBasePtr(&epsilon_tmp));
beta1_tensor = &beta1_tmp;
beta2_tensor = &beta2_tmp;
epsilon_tensor = &epsilon_tmp;
// Loop over all parameters and apply the Adam update to each one.
for (size_t i = 0; i < param_num; ++i) {
VLOG(4) << "[MergedAdam] loop: " << i;
param_outs[i]->ShareDataWith(*params[i]);
mom1_outs[i]->ShareDataWith(*mom1s[i]);
mom2_outs[i]->ShareDataWith(*mom2s[i]);
phi::DenseTensor beta1_pow_tmp;
phi::DenseTensor beta2_pow_tmp;
if (beta1_pows[i]->place() == platform::CPUPlace()) {
T beta1 = *beta1_pows[i]->data<T>();
beta1_pow_tmp.mutable_data<T>({1}, ctx.GetPlace());
MLUCnnlTensorDesc beta1_pow_tmp_desc(beta1_pow_tmp);
MLUCnnl::Fill(ctx,
CNNL_POINTER_MODE_HOST,
&beta1,
beta1_pow_tmp_desc.get(),
GetBasePtr(&beta1_pow_tmp));
beta1_pows[i] = &beta1_pow_tmp;
}
if (beta2_pows[i]->place() == platform::CPUPlace()) {
T beta2 = *beta2_pows[i]->data<T>();
beta2_pow_tmp.mutable_data<T>({1}, ctx.GetPlace());
MLUCnnlTensorDesc beta2_pow_tmp_desc(beta2_pow_tmp);
MLUCnnl::Fill(ctx,
CNNL_POINTER_MODE_HOST,
&beta2,
beta2_pow_tmp_desc.get(),
GetBasePtr(&beta2_pow_tmp));
beta2_pows[i] = &beta2_pow_tmp;
}
VLOG(3) << "beta1_pow.numel() : " << beta1_pows[i]->numel()
<< "beta2_pow.numel() : " << beta2_pows[i]->numel();
VLOG(3) << "param.numel(): " << params[i]->numel();
PADDLE_ENFORCE_EQ(beta1_pow_outs[i]->numel(),
1,
platform::errors::InvalidArgument(
"beta1 pow output size should be 1, but received "
"value is:%d.",
beta1_pow_outs[i]->numel()));
PADDLE_ENFORCE_EQ(beta2_pow_outs[i]->numel(),
1,
platform::errors::InvalidArgument(
"beta2 pow output size should be 1, but received "
"value is:%d.",
beta2_pow_outs[i]->numel()));
MLUCnnlTensorDesc param_desc(*params[i]);
MLUCnnlTensorDesc mom1_desc(*mom1s[i]);
MLUCnnlTensorDesc mom2_desc(*mom2s[i]);
MLUCnnlTensorDesc grad_desc(*grads[i]);
MLUCnnl::ApplyAdam(ctx,
param_desc.get(),
GetBasePtr(param_outs[i]),
mom1_desc.get(),
GetBasePtr(mom1_outs[i]),
mom2_desc.get(),
GetBasePtr(mom2_outs[i]),
grad_desc.get(),
GetBasePtr(grads[i]),
GetBasePtr(lrs[i]),
GetBasePtr(beta1_tensor),
GetBasePtr(beta2_tensor),
GetBasePtr(beta1_pows[i]),
GetBasePtr(beta2_pows[i]),
GetBasePtr(epsilon_tensor),
/*use_nesterov*/ false);
if (!use_global_beta_pow) {
beta1_pow_outs[i]->mutable_data<T>(ctx.GetPlace());
beta2_pow_outs[i]->mutable_data<T>(ctx.GetPlace());
MLUCnnlTensorDesc beta1_desc(*beta1_tensor);
MLUCnnlOpTensorDesc mul_op_desc(
CNNL_OP_TENSOR_MUL, ToCnnlDataType<T>(), CNNL_NOT_PROPAGATE_NAN);
MLUCnnl::OpTensor(ctx,
mul_op_desc.get(),
beta1_desc.get(),
GetBasePtr(beta1_pows[i]),
beta1_desc.get(),
GetBasePtr(beta1_tensor),
beta1_desc.get(),
GetBasePtr(beta1_pow_outs[i]),
ToCnnlDataType<T>());
MLUCnnl::OpTensor(ctx,
mul_op_desc.get(),
beta1_desc.get(),
GetBasePtr(beta2_pows[i]),
beta1_desc.get(),
GetBasePtr(beta2_tensor),
beta1_desc.get(),
GetBasePtr(beta2_pow_outs[i]),
ToCnnlDataType<T>());
}
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(adam,
ops::AdamMLUKernel<float>,
ops::AdamMLUKernel<plat::float16>);
REGISTER_OP_MLU_KERNEL(adamw,
ops::AdamWMLUKernel<float>,
ops::AdamWMLUKernel<plat::float16>);
REGISTER_OP_MLU_KERNEL(merged_adam,
ops::MergedAdamMLUKernel<float>,
ops::MergedAdamMLUKernel<plat::float16>);
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/amp/fp16_type_traits.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#include "paddle/fluid/platform/for_range.h"
#include "paddle/fluid/platform/macros.h"
#include "paddle/phi/kernels/impl/momentum_kernel_impl.h"
namespace paddle {
namespace operators {
template <typename T>
class MLUMergedMomentumOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto params = ctx.MultiInput<phi::DenseTensor>("Param");
auto params_out = ctx.MultiOutput<phi::DenseTensor>("ParamOut");
size_t n = params.size();
PADDLE_ENFORCE_EQ(n,
params_out.size(),
platform::errors::InvalidArgument(
"The size of Output(ParamOut) must be equal to "
"Input(Param), but got the size of Output(ParamOut) "
"is %d, the size of Input(Param) is %d.",
params_out.size(),
n));
for (size_t i = 0; i < n; ++i) {
PADDLE_ENFORCE_EQ(params[i],
params_out[i],
platform::errors::InvalidArgument(
"The size of Input(Param) and Output(ParamOut) "
"must be the same Tensors."));
}
auto grads = ctx.MultiInput<phi::DenseTensor>("Grad");
PADDLE_ENFORCE_EQ(
n,
grads.size(),
platform::errors::InvalidArgument(
"The size of Input(Grad) must be equal to Input(Param), but got "
"the size of Input(Grad) is %d, the size of Input(Param) is %d.",
grads.size(),
n));
auto velocitys = ctx.MultiInput<phi::DenseTensor>("Velocity");
PADDLE_ENFORCE_EQ(n,
velocitys.size(),
platform::errors::InvalidArgument(
"The size of Input(Velocity) must be equal to "
"Input(Param), but got the size of Input(Velocity) "
"is %d, the size of Input(Param) is %d.",
velocitys.size(),
n));
auto velocitys_out = ctx.MultiOutput<phi::DenseTensor>("VelocityOut");
PADDLE_ENFORCE_EQ(
n,
velocitys_out.size(),
platform::errors::InvalidArgument(
"The size of Output(VelocityOut) must be "
"equal to Input(Param), but got the size of Output(VelocityOut) is "
"%d, the size of Input(Param) is %d.",
velocitys_out.size(),
n));
for (size_t i = 0; i < n; ++i) {
PADDLE_ENFORCE_EQ(velocitys[i],
velocitys_out[i],
platform::errors::InvalidArgument(
"Input(Velocity) and Output(VelocityOut) must be "
"the same Tensors."));
}
auto mu = static_cast<T>(ctx.Attr<float>("mu"));
auto lrs = ctx.MultiInput<phi::DenseTensor>("LearningRate");
if (lrs.size() != 1) {
PADDLE_ENFORCE_EQ(
n,
lrs.size(),
platform::errors::InvalidArgument(
"If the size of Input(LearningRate) is not 1, the size of "
"Input(LearningRate) must be "
"equal to Input(Param), but got the size of Input(LearningRate) "
"is %d, the size of Input(Param) is %d.",
lrs.size(),
n));
}
auto use_nesterov = ctx.Attr<bool>("use_nesterov");
auto regularization_methods =
ctx.Attr<std::vector<std::string>>("regularization_method");
auto regularization_coeffs =
ctx.Attr<std::vector<float>>("regularization_coeff");
if (regularization_methods.size() != 0) {
PADDLE_ENFORCE_EQ(
n,
regularization_methods.size(),
platform::errors::InvalidArgument(
"The size of Attr(regularization_method) must be equal "
"to Input(Param), but got the size of "
"Attr(regularization_method) is %d, the size of Input(Param) is "
"%d.",
regularization_methods.size(),
n));
PADDLE_ENFORCE_EQ(
n,
regularization_coeffs.size(),
platform::errors::InvalidArgument(
"The size of Attr(regularization_coeff) must be equal "
"to Input(Param), but got the size of Attr(regularization_coeff) "
"is %d, the size of Input(Param) is %d.",
regularization_coeffs.size(),
n));
}
VLOG(5) << "use_nesterov: " << use_nesterov
<< ", regularization_methods.size(): "
<< regularization_methods.size()
<< ", regularization_coeffs.size(): "
<< regularization_coeffs.size();
auto& dev_ctx = ctx.template device_context<platform::MLUDeviceContext>();
phi::DenseTensor mu_tensor =
ctx.AllocateTmpTensor<T, MLUDeviceContext>({1}, dev_ctx);
MLUCnnlTensorDesc mu_tensor_desc(mu_tensor);
MLUCnnl::Fill(ctx,
CNNL_POINTER_MODE_HOST,
&mu,
mu_tensor_desc.get(),
GetBasePtr(&mu_tensor));
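// For each parameter: optionally fold L2 decay into the gradient (the
// OpTensor(ADD) call below presumably computes regularized_grad =
// regularization_coeff * param + grad), then call MLUCnnl::ApplyMomentum.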
for (size_t idx = 0; idx < n; ++idx) {
phi::RegularizationType regularization_flag =
regularization_methods.size() > 0 &&
regularization_methods[idx] == "l2_decay"
? phi::RegularizationType::kL2DECAY
: phi::RegularizationType::kNONE;
T regularization_coeff = static_cast<T>(0.0);
if (regularization_coeffs.size() != 0) {
regularization_coeff = static_cast<T>(regularization_coeffs[idx]);
}
auto learning_rate = lrs.size() > 1 ? lrs[idx] : lrs[0];
auto param_out = params_out[idx];
auto velocity_out = velocitys_out[idx];
auto grad = grads[idx];
phi::DenseTensor regularized_grad;
MLUCnnlTensorDesc param_desc(*param_out);
if (regularization_flag == phi::RegularizationType::kL2DECAY) {
regularized_grad = ctx.AllocateTmpTensor<T, MLUDeviceContext>(
param_out->dims(), dev_ctx);
MLUCnnlOpTensorDesc op_tensor_desc(
CNNL_OP_TENSOR_ADD, ToCnnlDataType<T>(), CNNL_NOT_PROPAGATE_NAN);
MLUCnnl::OpTensor(ctx,
op_tensor_desc.get(),
param_desc.get(),
GetBasePtr(param_out),
param_desc.get(),
GetBasePtr(grad),
param_desc.get(),
GetBasePtr(&regularized_grad),
ToCnnlDataType<T>(),
regularization_coeff);
} else {
regularized_grad = *grad;
}
MLUCnnl::ApplyMomentum(ctx,
param_desc.get(),
GetBasePtr(&regularized_grad),
use_nesterov,
GetBasePtr(learning_rate),
GetBasePtr(&mu_tensor),
GetBasePtr(param_out),
GetBasePtr(velocity_out));
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(merged_momentum,
ops::MLUMergedMomentumOpKernel<float>,
ops::MLUMergedMomentumOpKernel<plat::float16>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#include "paddle/fluid/operators/optimizers/momentum_op.h"
#include "paddle/phi/kernels/impl/momentum_kernel_impl.h"
namespace paddle {
namespace operators {
template <typename T>
class MLUMomentumOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto& dev_ctx = ctx.template device_context<platform::MLUDeviceContext>();
std::string regularization_method =
ctx.Attr<std::string>("regularization_method");
auto regularization_coeff = ctx.Attr<float>("regularization_coeff");
phi::RegularizationType regularization_flag{
phi::RegularizationType::kNONE}; // disable regularization
if (regularization_method == "l2_decay") {
regularization_flag = phi::RegularizationType::kL2DECAY;
}
T mu = static_cast<T>(ctx.Attr<float>("mu"));
bool use_nesterov = ctx.Attr<bool>("use_nesterov");
auto learning_rate = ctx.Input<phi::DenseTensor>("LearningRate");
auto param = ctx.Input<phi::DenseTensor>("Param");
auto velocity = ctx.Input<phi::DenseTensor>("Velocity");
auto param_out = ctx.Output<phi::DenseTensor>("ParamOut");
auto velocity_out = ctx.Output<phi::DenseTensor>("VelocityOut");
param_out->mutable_data<T>(ctx.GetPlace());
velocity_out->mutable_data<T>(ctx.GetPlace());
auto* grad_var = ctx.InputVar("Grad");
if (grad_var->IsType<phi::DenseTensor>()) {
auto grad = ctx.Input<phi::DenseTensor>("Grad");
phi::DenseTensor mu_tensor =
ctx.AllocateTmpTensor<T, MLUDeviceContext>({1}, dev_ctx);
MLUCnnlTensorDesc mu_tensor_desc(mu_tensor);
MLUCnnl::Fill(ctx,
CNNL_POINTER_MODE_HOST,
&mu,
mu_tensor_desc.get(),
GetBasePtr(&mu_tensor));
phi::DenseTensor regularized_grad;
MLUCnnlTensorDesc param_desc(*param);
if (regularization_flag == phi::RegularizationType::kL2DECAY) {
regularized_grad =
ctx.AllocateTmpTensor<T, MLUDeviceContext>(param->dims(), dev_ctx);
MLUCnnlOpTensorDesc op_tensor_desc(
CNNL_OP_TENSOR_ADD, ToCnnlDataType<T>(), CNNL_NOT_PROPAGATE_NAN);
MLUCnnl::OpTensor(ctx,
op_tensor_desc.get(),
param_desc.get(),
GetBasePtr(param),
param_desc.get(),
GetBasePtr(grad),
param_desc.get(),
GetBasePtr(&regularized_grad),
ToCnnlDataType<T>(),
regularization_coeff);
} else {
regularized_grad = *grad;
}
framework::TensorCopy(*param, ctx.GetPlace(), dev_ctx, param_out);
framework::TensorCopy(*velocity, ctx.GetPlace(), dev_ctx, velocity_out);
MLUCnnl::ApplyMomentum(ctx,
param_desc.get(),
GetBasePtr(&regularized_grad),
use_nesterov,
GetBasePtr(learning_rate),
GetBasePtr(&mu_tensor),
GetBasePtr(param_out),
GetBasePtr(velocity_out));
} else if (grad_var->IsType<phi::SelectedRows>()) {
PADDLE_ENFORCE_EQ(
false,
true,
platform::errors::PermissionDenied("Unsupport SparseMomentum"));
} else {
PADDLE_ENFORCE_EQ(false,
true,
platform::errors::PermissionDenied(
"Unsupported Variable Type of Grad "
"in MomentumOp. Excepted LodTensor "
"or SelectedRows, But received [%s]",
paddle::framework::ToTypeName(grad_var->Type())));
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(momentum,
ops::MLUMomentumOpKernel<float>,
ops::MLUMomentumOpKernel<plat::float16>);
...@@ -63,21 +63,6 @@ BufferedReader::BufferedReader(
}
#endif
#ifdef PADDLE_WITH_MLU
if (platform::is_mlu_place(place_)) {
int dev_idx = place_.device;
compute_stream_ =
((platform::MLUDeviceContext *)(platform::DeviceContextPool::Instance()
.Get(place_)))
->stream();
events_.resize(buffer_size);
for (auto &event : events_) {
event = platform::MluEventResourcePool::Instance().New(dev_idx);
}
stream_ = platform::MluStreamResourcePool::Instance().New(dev_idx);
}
#endif
#ifdef PADDLE_WITH_XPU
if (platform::is_xpu_place(place_)) {
int dev_idx = place_.device;
...@@ -260,57 +245,6 @@ void BufferedReader::ReadAsync(size_t i) {
}
#endif
#ifdef PADDLE_WITH_MLU
if (platform::is_mlu_place(place_)) {
TensorVec &mlu = mlu_buffer_[i];
if (mlu.empty()) {
mlu.resize(cpu.size());
} else {
PADDLE_ENFORCE_EQ(
mlu.size(),
cpu.size(),
platform::errors::InvalidArgument(
"Input tensor number on MLU and CPU devices are not matched. "
"The number on MLU is %d, on CPU is %d",
mlu.size(),
cpu.size()));
}
std::vector<void *> mlu_ptrs;
mlu_ptrs.reserve(cpu.size());
for (size_t i = 0; i < cpu.size(); ++i) {
mlu[i].Resize(cpu[i].dims());
mlu[i].set_layout(cpu[i].layout());
mlu_ptrs.emplace_back(mlu[i].mutable_data(place_, cpu[i].type()));
}
platform::SetMLUDeviceId(place_.device);
PADDLE_ENFORCE_MLU_SUCCESS(
cnPlaceNotifier(events_[i].get(), compute_stream_));
PADDLE_ENFORCE_MLU_SUCCESS(cnWaitNotifier(events_[i].get()));
platform::RecordEvent record_event("BufferedReader:MemoryCopy",
platform::TracerEventType::UserDefined,
1);
for (size_t i = 0; i < cpu.size(); ++i) {
auto cpu_place = cpu[i].place();
auto cpu_ptr = cpu[i].data();
auto mlu_ptr = mlu_ptrs[i];
auto size = cpu[i].numel() * phi::SizeOf(cpu[i].dtype());
if ((platform::is_mlu_place(cpu_place))) {
memory::Copy(
place_, mlu_ptr, cpu_place, cpu_ptr, size, stream_.get());
} else {
memory::Copy(
place_, mlu_ptr, cpu_place, cpu_ptr, size, stream_.get());
platform::MLUStreamSync(stream_.get());
}
mlu[i].set_lod(cpu[i].lod());
}
platform::MLUStreamSync(stream_.get());
}
#endif
#ifdef PADDLE_WITH_XPU
if (platform::is_xpu_place(place_)) {
TensorVec &xpu = xpu_buffer_[i];
...
...@@ -26,10 +26,6 @@
#include "paddle/fluid/platform/device/gpu/gpu_resource_pool.h"
#endif
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/platform/device/mlu/mlu_info.h"
#include "paddle/fluid/platform/device/mlu/mlu_resource_pool.h"
#endif
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/platform/device/xpu/xpu_info.h"
#include "paddle/fluid/platform/device/xpu/xpu_resource_pool.h"
...@@ -92,12 +88,6 @@ class BufferedReader : public framework::DecoratedReader {
std::vector<std::shared_ptr<platform::CudaEventObject>> events_;
#endif
#ifdef PADDLE_WITH_MLU
mluStream compute_stream_;
std::shared_ptr<platform::MluStreamObject> stream_;
std::vector<std::shared_ptr<platform::MluEventObject>> events_;
#endif
#ifdef PADDLE_WITH_XPU
xpuStream compute_stream_;
std::shared_ptr<platform::XpuStreamObject> stream_;
...
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h"
namespace paddle {
namespace operators {
template <typename T>
class ReduceMaxMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* input = context.Input<phi::DenseTensor>("X");
auto* output = context.Output<phi::DenseTensor>("Out");
int out_dtype = context.Attr<int>("out_dtype");
bool reduce_all = context.Attr<bool>("reduce_all");
auto dims = context.Attr<std::vector<int>>("dim");
auto input_dims = input->dims();
const auto& input_dim_size = input->dims().size();
std::vector<int> reduce_dims;
if (reduce_all) {
for (int i = 0; i < input_dims.size(); i++) {
reduce_dims.push_back(static_cast<int>(i));
}
} else {
for (size_t i = 0; i < dims.size(); ++i) {
if (dims[i] < 0) {
reduce_dims.push_back(dims[i] + input_dim_size);
} else {
reduce_dims.push_back(dims[i]);
}
}
}
auto place = context.GetPlace();
phi::DenseTensor cast_out(input->type());
cast_out.Resize(output->dims());
cast_out.mutable_data<T>(place);
auto cast_out_dtype = framework::TransToProtoVarType(input->dtype());
if (out_dtype != -1) {
cast_out_dtype = static_cast<framework::proto::VarType::Type>(out_dtype);
}
if (framework::TransToProtoVarType(input->type()) != cast_out_dtype) {
if (cast_out_dtype == framework::proto::VarType::FP32) {
output->mutable_data<float>(place);
} else if (cast_out_dtype == framework::proto::VarType::FP16) {
output->mutable_data<paddle::platform::float16>(place);
} else if (cast_out_dtype == framework::proto::VarType::INT32) {
output->mutable_data<int32_t>(place);
}
} else {
output->ShareDataWith(cast_out);
}
MLUCnnlTensorDesc input_desc(
*input, CNNL_LAYOUT_ARRAY, ToCnnlDataType(input->dtype()));
MLUCnnlTensorDesc output_desc(
*output, CNNL_LAYOUT_ARRAY, ToCnnlDataType(output->dtype()));
MLUCnnlReduceDesc reduction_desc(reduce_dims,
CNNL_REDUCE_MAX,
ToCnnlDataType<T>(),
CNNL_NOT_PROPAGATE_NAN,
CNNL_REDUCE_NO_INDICES,
CNNL_32BIT_INDICES);
MLUCnnl::Reduce(context,
true /*need_workspace*/,
reduction_desc.get(),
nullptr,
input_desc.get(),
GetBasePtr(input),
0 /*indices_size*/,
nullptr,
nullptr,
output_desc.get(),
GetBasePtr(output));
}
};
template <typename T>
class ReduceMaxGradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* x = context.Input<phi::DenseTensor>("X");
auto* out = context.Input<phi::DenseTensor>("Out");
auto* out_grad =
context.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto reduce_dims = context.Attr<std::vector<int>>("dim");
bool reduce_all = context.Attr<bool>("reduce_all");
int in_dtype = context.Attr<int>("in_dtype");
PADDLE_ENFORCE_EQ(
in_dtype == -1,
true,
platform::errors::InvalidArgument(
"MLU only support in_dtype == -1 in reduce_max_grad op."));
auto* x_grad =
context.Output<phi::DenseTensor>(framework::GradVarName("X"));
x_grad->mutable_data<T>(context.GetPlace());
auto place = context.GetPlace();
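// Gradient of reduce_max: broadcast Out and Out@GRAD back to the shape of X,
// then pass the gradient through only where the input equals the maximum,
// i.e. x_grad = (x == broadcast(out)) ? broadcast(out_grad) : 0.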
// broadcast
auto x_dims_vec = phi::vectorize(x->dims());
if (reduce_all) {
reduce_dims.clear();
for (size_t d = 0; d < x_dims_vec.size(); ++d) {
reduce_dims.push_back(static_cast<int>(d));
}
}
phi::DenseTensor tmp_out, tmp_out_grad;
auto tmp_out_dims_vec = x_dims_vec;
for (auto d : reduce_dims) {
if (d < 0) {
d += x_dims_vec.size();
}
tmp_out_dims_vec[d] = 1;
}
tmp_out.ShareDataWith(*out);
tmp_out.Resize(phi::make_ddim(tmp_out_dims_vec));
tmp_out_grad.ShareDataWith(*out_grad);
tmp_out_grad.Resize(phi::make_ddim(tmp_out_dims_vec));
phi::DenseTensor transformed_out(x->type());
transformed_out.Resize(phi::make_ddim(x_dims_vec));
transformed_out.mutable_data<T>(place);
MLUCnnlTensorDesc tmp_out_desc(tmp_out);
MLUCnnlTensorDesc transformed_out_desc(transformed_out);
MLUCnnl::BroadcastTo(context,
tmp_out_desc.get(),
GetBasePtr(&tmp_out),
transformed_out_desc.get(),
GetBasePtr(&transformed_out));
phi::DenseTensor transformed_out_grad(x->type());
transformed_out_grad.Resize(phi::make_ddim(x_dims_vec));
transformed_out_grad.mutable_data<T>(place);
MLUCnnlTensorDesc tmp_out_grad_desc(tmp_out_grad);
MLUCnnlTensorDesc transformed_out_grad_desc(transformed_out_grad);
MLUCnnl::BroadcastTo(context,
tmp_out_grad_desc.get(),
GetBasePtr(&tmp_out_grad),
transformed_out_grad_desc.get(),
GetBasePtr(&transformed_out_grad));
// compare
phi::DenseTensor equal_cond;
equal_cond.mutable_data<bool>(x_grad->dims(), place);
MLUCnnlTensorDesc x_desc(*x);
MLUCnnlTensorDesc equal_cond_desc(equal_cond);
MLUCnnl::Logic(context,
CNNL_LOGIC_OP_EQ,
x_desc.get(),
GetBasePtr(x),
transformed_out_desc.get(),
GetBasePtr(&transformed_out),
equal_cond_desc.get(),
GetBasePtr(&equal_cond));
// select
phi::DenseTensor t_zero;
t_zero.mutable_data<T>(x_grad->dims(), place);
FillMLUTensorWithHostValue<T>(context, static_cast<T>(0), &t_zero);
t_zero.Resize(x_grad->dims());
MLUCnnlTensorDesc t_zero_desc(t_zero);
MLUCnnlTensorDesc x_grad_desc(*x_grad);
MLUCnnl::Select(context,
equal_cond_desc.get(),
GetBasePtr(&equal_cond),
transformed_out_grad_desc.get(),
GetBasePtr(&transformed_out_grad),
t_zero_desc.get(),
GetBasePtr(&t_zero),
x_grad_desc.get(),
GetBasePtr(x_grad));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(reduce_max,
ops::ReduceMaxMLUKernel<float>,
ops::ReduceMaxMLUKernel<plat::float16>,
ops::ReduceMaxMLUKernel<int>);
REGISTER_OP_MLU_KERNEL(reduce_max_grad,
ops::ReduceMaxGradMLUKernel<float>,
ops::ReduceMaxGradMLUKernel<plat::float16>,
ops::ReduceMaxGradMLUKernel<int>);
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/reduce_ops/reduce_op_mlu.h"
namespace paddle {
namespace operators {
template <typename T>
class ReduceMeanMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
MLUReduceOp<T>(context, "reduce_mean");
}
};
template <typename T>
class ReduceMeanGradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* input = context.Input<phi::DenseTensor>("X");
auto* output_grad =
context.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto* input_grad =
context.Output<phi::DenseTensor>(framework::GradVarName("X"));
input_grad->mutable_data<T>(context.GetPlace());
bool reduce_all = context.Attr<bool>("reduce_all");
auto reduce_dims = context.Attr<std::vector<int>>("dim");
auto input_dims = phi::vectorize(input->dims());
int reduce_numel = 1;
if (reduce_all) {
reduce_dims.clear();
for (size_t d = 0; d < input_dims.size(); ++d) {
reduce_dims.push_back(static_cast<int>(d));
}
}
for (auto& d : reduce_dims) {
if (d < 0) {
d = d + input_dims.size();
}
reduce_numel *= input_dims[d];
}
phi::DenseTensor tmp_output_grad(output_grad->dtype());
auto tmp_output_dims = input_dims;
for (auto d : reduce_dims) {
tmp_output_dims[d] = 1;
}
tmp_output_grad.ShareDataWith(*output_grad);
tmp_output_grad.Resize(phi::make_ddim(tmp_output_dims));
MLUCnnlTensorDesc output_grad_desc(tmp_output_grad,
CNNL_LAYOUT_ARRAY,
ToCnnlDataType(tmp_output_grad.dtype()));
MLUCnnlTensorDesc input_grad_desc(
*input_grad, CNNL_LAYOUT_ARRAY, ToCnnlDataType(input_grad->dtype()));
auto value = static_cast<T>(1.0 / static_cast<float>(reduce_numel));
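// Gradient of reduce_mean: every input element receives dL/dy / N, where N is
// the number of reduced elements. X@GRAD is first filled with 1/N and then
// multiplied by the output gradient, which keeps the reduced dims as size 1 so
// that the element-wise multiply broadcasts it over the full input shape.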
MLUCnnl::Fill(context,
CNNL_POINTER_MODE_HOST,
&value,
input_grad_desc.get(),
GetBasePtr(input_grad));
MLUCnnlOpTensorDesc op_tensor_desc(
CNNL_OP_TENSOR_MUL, ToCnnlDataType<T>(), CNNL_NOT_PROPAGATE_NAN);
MLUCnnl::OpTensor(context,
op_tensor_desc.get(),
output_grad_desc.get(),
GetBasePtr(&tmp_output_grad),
input_grad_desc.get(),
GetBasePtr(input_grad),
input_grad_desc.get(),
GetBasePtr(input_grad),
ToCnnlDataType<T>());
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(reduce_mean,
ops::ReduceMeanMLUKernel<float>,
ops::ReduceMeanMLUKernel<plat::float16>);
REGISTER_OP_MLU_KERNEL(reduce_mean_grad,
ops::ReduceMeanGradMLUKernel<float>,
ops::ReduceMeanGradMLUKernel<plat::float16>);
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h"
namespace paddle {
namespace operators {
template <typename T>
class ReduceMinMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* input = context.Input<phi::DenseTensor>("X");
auto* output = context.Output<phi::DenseTensor>("Out");
int out_dtype = context.Attr<int>("out_dtype");
bool reduce_all = context.Attr<bool>("reduce_all");
auto dims = context.Attr<std::vector<int>>("dim");
auto input_dims = input->dims();
const auto& input_dim_size = input->dims().size();
std::vector<int> reduce_dims;
if (reduce_all) {
for (int i = 0; i < input_dims.size(); i++) {
reduce_dims.push_back(static_cast<int>(i));
}
} else {
for (size_t i = 0; i < dims.size(); ++i) {
if (dims[i] < 0) {
reduce_dims.push_back(dims[i] + input_dim_size);
} else {
reduce_dims.push_back(dims[i]);
}
}
}
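// Negative axes follow Python-style indexing and are normalized by adding the
// input rank, e.g. dim = -1 reduces over the last dimension.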
auto place = context.GetPlace();
phi::DenseTensor cast_out(input->type());
cast_out.Resize(output->dims());
cast_out.mutable_data<T>(place);
auto cast_out_dtype = framework::TransToProtoVarType(input->dtype());
if (out_dtype != -1) {
cast_out_dtype = static_cast<framework::proto::VarType::Type>(out_dtype);
}
if (framework::TransToProtoVarType(input->type()) != cast_out_dtype) {
if (cast_out_dtype == framework::proto::VarType::FP32) {
output->mutable_data<float>(place);
} else if (cast_out_dtype == framework::proto::VarType::FP16) {
output->mutable_data<paddle::platform::float16>(place);
} else if (cast_out_dtype == framework::proto::VarType::INT32) {
output->mutable_data<int32_t>(place);
}
} else {
output->ShareDataWith(cast_out);
}
MLUCnnlTensorDesc input_desc(
*input, CNNL_LAYOUT_ARRAY, ToCnnlDataType(input->dtype()));
MLUCnnlTensorDesc output_desc(
*output, CNNL_LAYOUT_ARRAY, ToCnnlDataType(output->dtype()));
MLUCnnlReduceDesc reduction_desc(reduce_dims,
CNNL_REDUCE_MIN,
ToCnnlDataType<T>(),
CNNL_NOT_PROPAGATE_NAN,
CNNL_REDUCE_NO_INDICES,
CNNL_32BIT_INDICES);
MLUCnnl::Reduce(context,
true /*need_workspace*/,
reduction_desc.get(),
nullptr,
input_desc.get(),
GetBasePtr(input),
0 /*indices_size*/,
nullptr,
nullptr,
output_desc.get(),
GetBasePtr(output));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(reduce_min,
ops::ReduceMinMLUKernel<float>,
ops::ReduceMinMLUKernel<plat::float16>,
ops::ReduceMinMLUKernel<int>);
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#ifdef PADDLE_WITH_MLU
#include <string>
#include <vector>
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#include "paddle/fluid/operators/reduce_ops/reduce_op.h"
namespace paddle {
namespace operators {
template <typename T>
void MLUReduceOp(const framework::ExecutionContext& context,
std::string reduce_name) {
PADDLE_ENFORCE_EQ(
platform::is_mlu_place(context.GetPlace()),
true,
platform::errors::Unavailable("This kernel only runs on MLU."));
auto* input = context.Input<phi::DenseTensor>("X");
auto* output = context.Output<phi::DenseTensor>("Out");
output->mutable_data<T>(context.GetPlace());
bool reduce_all = context.Attr<bool>("reduce_all");
auto dims = context.Attr<std::vector<int>>("dim");
auto input_dims = phi::vectorize(input->dims());
const auto& input_dim_size = input->dims().size();
std::vector<int> reduce_dims;
if (reduce_all) {
for (size_t i = 0; i < input_dims.size(); i++) {
reduce_dims.push_back(static_cast<int>(i));
}
} else {
for (size_t i = 0; i < dims.size(); ++i) {
if (dims[i] < 0) {
reduce_dims.push_back(dims[i] + input_dim_size);
} else {
reduce_dims.push_back(dims[i]);
}
}
}
MLUCnnlTensorDesc input_desc(
*input, CNNL_LAYOUT_ARRAY, ToCnnlDataType(input->dtype()));
MLUCnnlTensorDesc output_desc(
*output, CNNL_LAYOUT_ARRAY, ToCnnlDataType(output->dtype()));
cnnlReduceOp_t reduce_op = GetMLUCnnlReduceOp(reduce_name);
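// GetMLUCnnlReduceOp maps the op name ("reduce_mean", "reduce_prod",
// "reduce_sum", ...) to the corresponding CNNL reduction mode; apart from that
// mode, the call sequence is identical for these ops, which is why their
// forward kernels simply call this helper with the op name.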
MLUCnnlReduceDesc reduction_desc(reduce_dims,
reduce_op,
ToCnnlDataType<T>(),
CNNL_NOT_PROPAGATE_NAN,
CNNL_REDUCE_NO_INDICES,
CNNL_32BIT_INDICES);
MLUCnnl::Reduce(context,
true /*need_workspace*/,
reduction_desc.get(),
nullptr,
input_desc.get(),
GetBasePtr(input),
0 /*indices_size*/,
nullptr,
nullptr,
output_desc.get(),
GetBasePtr(output));
}
} // namespace operators
} // namespace paddle
#endif
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/reduce_ops/reduce_op_mlu.h"
namespace paddle {
namespace operators {
template <typename T>
class ReduceMeanMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
MLUReduceOp<T>(context, "reduce_prod");
}
};
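// NOTE: the class name ReduceMeanMLUKernel is reused in this file, but the
// kernel dispatches the "reduce_prod" reduction and is registered below for
// the reduce_prod op.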
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(reduce_prod,
ops::ReduceMeanMLUKernel<float>,
ops::ReduceMeanMLUKernel<plat::float16>,
ops::ReduceMeanMLUKernel<int>);
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/reduce_ops/reduce_op_mlu.h"
namespace paddle {
namespace operators {
template <typename T>
class ReduceSumMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
MLUReduceOp<T>(context, "reduce_sum");
}
};
template <typename T>
class ReduceSumGradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* in = context.Input<phi::DenseTensor>("X");
auto* out_grad =
context.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto* in_grad =
context.Output<phi::DenseTensor>(framework::GradVarName("X"));
in_grad->mutable_data<T>(context.GetPlace());
bool reduce_all = context.Attr<bool>("reduce_all");
auto reduce_dims = context.Attr<std::vector<int>>("dim");
auto in_dims = phi::vectorize(in->dims());
if (reduce_all) {
reduce_dims.clear();
for (size_t d = 0; d < in_dims.size(); ++d) {
reduce_dims.push_back(static_cast<int>(d));
}
}
for (auto& d : reduce_dims) {
if (d < 0) {
d = d + in_dims.size();
}
}
phi::DenseTensor tmp_out(out_grad->dtype());
auto tmp_output_dims = in_dims;
for (auto d : reduce_dims) {
tmp_output_dims[d] = 1;
}
tmp_out.ShareDataWith(*out_grad);
tmp_out.Resize(phi::make_ddim(tmp_output_dims));
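// Gradient of reduce_sum: d(sum)/dx is 1 for every element, so X@GRAD is just
// Out@GRAD broadcast back to the input shape, with the reduced dims restored
// as size 1 so that BroadcastTo can expand them.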
MLUCnnlTensorDesc out_desc(tmp_out, CNNL_LAYOUT_ARRAY, ToCnnlDataType<T>());
MLUCnnlTensorDesc in_grad_desc(
*in_grad, CNNL_LAYOUT_ARRAY, ToCnnlDataType<T>());
MLUCnnl::BroadcastTo(context,
out_desc.get(),
GetBasePtr(&tmp_out),
in_grad_desc.get(),
GetBasePtr(in_grad));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(reduce_sum,
ops::ReduceSumMLUKernel<float>,
ops::ReduceSumMLUKernel<int>,
ops::ReduceSumMLUKernel<plat::float16>);
REGISTER_OP_MLU_KERNEL(reduce_sum_grad,
ops::ReduceSumGradMLUKernel<float>,
ops::ReduceSumGradMLUKernel<plat::float16>);
...@@ -41,18 +41,6 @@ class SoftmaxWithCrossEntropyOpMaker ...@@ -41,18 +41,6 @@ class SoftmaxWithCrossEntropyOpMaker
"The outputs value of softmax activation by given the input batch, " "The outputs value of softmax activation by given the input batch, "
"which will be used in backward calculation.") "which will be used in backward calculation.")
.AsIntermediate(); .AsIntermediate();
#if defined(PADDLE_WITH_MLU)
AddOutput(
"Backprop",
"(Tensor, default: Tensor<float>), A tensor in same shape with "
"Input(Logits). "
"The intermediate value used for backward calculation. The calculation "
"is :"
"exp(logits -max_logits) / sum(exp(logits - max_logits)) - labels, "
"where labels is ont-hot."
"Currently, the tensor is generated and used in npu/mlu kernel. ")
.AsIntermediate();
#endif
AddOutput("Loss", AddOutput("Loss",
"(Tensor, default: Tensor<float>), A tensor in same shape with " "(Tensor, default: Tensor<float>), A tensor in same shape with "
"Input(Logits) " "Input(Logits) "
...@@ -135,12 +123,6 @@ class SoftmaxWithCrossEntropyOp : public framework::OperatorWithKernel { ...@@ -135,12 +123,6 @@ class SoftmaxWithCrossEntropyOp : public framework::OperatorWithKernel {
true, true,
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
"Output(Softmax) should be not null.")); "Output(Softmax) should be not null."));
#if defined(PADDLE_WITH_MLU)
PADDLE_ENFORCE_EQ(ctx->HasOutput("Backprop"),
true,
platform::errors::InvalidArgument(
"Output(Backprop) should be not null."));
#endif
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
ctx->HasOutput("Loss"), ctx->HasOutput("Loss"),
true, true,
...@@ -235,12 +217,6 @@ class SoftmaxWithCrossEntropyOpGrad : public framework::OperatorWithKernel { ...@@ -235,12 +217,6 @@ class SoftmaxWithCrossEntropyOpGrad : public framework::OperatorWithKernel {
true, true,
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
"Input(Softmax) should be not null.")); "Input(Softmax) should be not null."));
#if defined(PADDLE_WITH_MLU)
PADDLE_ENFORCE_EQ(ctx->HasInput("Backprop"),
true,
platform::errors::InvalidArgument(
"Input(Backprop) should be not null."));
#endif
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
ctx->HasInput("Label"), ctx->HasInput("Label"),
true, true,
...@@ -324,9 +300,6 @@ class SoftmaxGradMaker : public framework::SingleGradOpMaker<T> { ...@@ -324,9 +300,6 @@ class SoftmaxGradMaker : public framework::SingleGradOpMaker<T> {
grad_op->SetType("softmax_with_cross_entropy_grad"); grad_op->SetType("softmax_with_cross_entropy_grad");
grad_op->SetInput("Label", this->Input("Label")); grad_op->SetInput("Label", this->Input("Label"));
grad_op->SetInput("Softmax", this->Output("Softmax")); grad_op->SetInput("Softmax", this->Output("Softmax"));
#if defined(PADDLE_WITH_MLU)
grad_op->SetInput("Backprop", this->Output("Backprop"));
#endif
grad_op->SetInput(framework::GradVarName("Loss"), this->OutputGrad("Loss")); grad_op->SetInput(framework::GradVarName("Loss"), this->OutputGrad("Loss"));
grad_op->SetOutput(framework::GradVarName("Logits"), grad_op->SetOutput(framework::GradVarName("Logits"),
this->InputGrad("Logits")); this->InputGrad("Logits"));
...@@ -356,26 +329,8 @@ REGISTER_OPERATOR(softmax_with_cross_entropy_grad, ...@@ -356,26 +329,8 @@ REGISTER_OPERATOR(softmax_with_cross_entropy_grad,
ops::SoftmaxWithCrossEntropyGradInplaceInferer); ops::SoftmaxWithCrossEntropyGradInplaceInferer);
REGISTER_OP_VERSION(softmax_with_cross_entropy) REGISTER_OP_VERSION(softmax_with_cross_entropy)
#if defined(PADDLE_WITH_MLU)
.AddCheckpoint(
R"ROC(
Add a new attribute [use_softmax] )ROC",
paddle::framework::compatible::OpVersionDesc().NewAttr(
"use_softmax", "A flag to indicate whether to do softmax", true))
.AddCheckpoint(
R"ROC(
Add a new dispensable/intermediate output [backprop] )ROC",
paddle::framework::compatible::OpVersionDesc().NewOutput(
"Backprop",
"The intermediate value used for backward calculation. The "
"calculation is :"
"exp(logits -max_logits) / sum(exp(logits - max_logits)) - labels, "
"where labels is ont-hot."
"Currently, the tensor is generated and used in npu/mlu kernel. "));
#else
.AddCheckpoint( .AddCheckpoint(
R"ROC( R"ROC(
Add a new attribute [use_softmax] )ROC", Add a new attribute [use_softmax] )ROC",
paddle::framework::compatible::OpVersionDesc().NewAttr( paddle::framework::compatible::OpVersionDesc().NewAttr(
"use_softmax", "A flag to indicate whether to do softmax", true)); "use_softmax", "A flag to indicate whether to do softmax", true));
#endif
...@@ -92,11 +92,6 @@ inline T GetValue(const phi::DenseTensor* x) { ...@@ -92,11 +92,6 @@ inline T GetValue(const phi::DenseTensor* x) {
if (!platform::is_cpu_place(x->place())) { if (!platform::is_cpu_place(x->place())) {
phi::DenseTensor cpu_x; phi::DenseTensor cpu_x;
framework::TensorCopy(*x, platform::CPUPlace(), &cpu_x); framework::TensorCopy(*x, platform::CPUPlace(), &cpu_x);
#if defined(PADDLE_WITH_MLU)
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
const platform::DeviceContext* dev_ctx = pool.Get(x->place());
dev_ctx->Wait();
#endif
value = cpu_x.data<T>()[0]; value = cpu_x.data<T>()[0];
} else { } else {
value = x->data<T>()[0]; value = x->data<T>()[0];
......
...@@ -78,11 +78,7 @@ if(WITH_ASCEND_CL) ...@@ -78,11 +78,7 @@ if(WITH_ASCEND_CL)
set(NPU_CTX_DEPS npu_stream npu_info) set(NPU_CTX_DEPS npu_stream npu_info)
endif() endif()
if(WITH_MLU) if(WITH_ASCEND_CL)
set(MLU_CTX_DEPS mlu_device_context)
endif()
if(WITH_ASCEND_CL OR WITH_MLU)
cc_library( cc_library(
stream_callback_manager stream_callback_manager
SRCS stream_callback_manager.cc SRCS stream_callback_manager.cc
...@@ -175,10 +171,6 @@ if(WITH_XPU) ...@@ -175,10 +171,6 @@ if(WITH_XPU)
target_link_libraries(device_context xpu_resource_pool) target_link_libraries(device_context xpu_resource_pool)
endif() endif()
if(WITH_MLU)
target_link_libraries(device_context mlu_resource_pool)
endif()
if(WITH_CUSTOM_DEVICE) if(WITH_CUSTOM_DEVICE)
target_link_libraries(device_context custom_device_resource_pool) target_link_libraries(device_context custom_device_resource_pool)
endif() endif()
......
...@@ -15,11 +15,6 @@ if(WITH_IPU) ...@@ -15,11 +15,6 @@ if(WITH_IPU)
add_subdirectory(ipu) add_subdirectory(ipu)
endif() endif()
# MLU
if(WITH_MLU)
add_subdirectory(mlu)
endif()
if(WITH_CUSTOM_DEVICE) if(WITH_CUSTOM_DEVICE)
add_subdirectory(custom) add_subdirectory(custom)
endif() endif()
...@@ -25,11 +25,6 @@ limitations under the License. */ ...@@ -25,11 +25,6 @@ limitations under the License. */
#include "paddle/fluid/platform/device/xpu/xpu_info.h" #include "paddle/fluid/platform/device/xpu/xpu_info.h"
#endif #endif
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/platform/device/mlu/enforce.h"
#include "paddle/fluid/platform/device/mlu/mlu_info.h"
#endif
#ifdef PADDLE_WITH_IPU #ifdef PADDLE_WITH_IPU
#include "paddle/fluid/platform/device/ipu/ipu_info.h" #include "paddle/fluid/platform/device/ipu/ipu_info.h"
#endif #endif
......
...@@ -33,11 +33,6 @@ limitations under the License. */ ...@@ -33,11 +33,6 @@ limitations under the License. */
#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h"
#endif #endif
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/platform/device/mlu/device_context.h"
#include "paddle/fluid/platform/device/mlu/device_context_allocator.h"
#endif
namespace paddle { namespace paddle {
namespace platform { namespace platform {
...@@ -224,18 +219,6 @@ void EmplaceDeviceContexts( ...@@ -224,18 +219,6 @@ void EmplaceDeviceContexts(
PADDLE_THROW(platform::errors::Unimplemented( PADDLE_THROW(platform::errors::Unimplemented(
"CUDAPlace is not supported. Please re-compile with WITH_GPU " "CUDAPlace is not supported. Please re-compile with WITH_GPU "
"option.")); "option."));
#endif
} else if (platform::is_mlu_place(place)) {
#ifdef PADDLE_WITH_MLU
EmplaceDeviceContext<MLUDeviceContext>(
place_to_device_context,
place,
disable_setting_default_stream_for_allocator,
/*unused*/ stream_priority);
#else
PADDLE_THROW(
platform::errors::Unimplemented("MLUPlace is not supported. Please "
"re-compile with WITH_MLU option."));
#endif #endif
} else if (platform::is_ipu_place(place)) { } else if (platform::is_ipu_place(place)) {
#ifdef PADDLE_WITH_IPU #ifdef PADDLE_WITH_IPU
......
...@@ -135,10 +135,6 @@ class IPUDeviceContext ...@@ -135,10 +135,6 @@ class IPUDeviceContext
}; };
#endif #endif
#ifdef PADDLE_WITH_MLU
class MLUDeviceContext;
#endif
#ifdef PADDLE_WITH_XPU #ifdef PADDLE_WITH_XPU
namespace xpu = baidu::xpu::api; namespace xpu = baidu::xpu::api;
using XPUDeviceContext = phi::XPUContext; using XPUDeviceContext = phi::XPUContext;
...@@ -173,11 +169,6 @@ struct DefaultDeviceContextType<phi::IPUPlace> { ...@@ -173,11 +169,6 @@ struct DefaultDeviceContextType<phi::IPUPlace> {
}; };
#endif #endif
#ifdef PADDLE_WITH_MLU
template <>
struct DefaultDeviceContextType<phi::MLUPlace>;
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
template <> template <>
struct DefaultDeviceContextType<phi::GPUPinnedPlace> { struct DefaultDeviceContextType<phi::GPUPinnedPlace> {
......
...@@ -36,10 +36,6 @@ limitations under the License. */ ...@@ -36,10 +36,6 @@ limitations under the License. */
#include "paddle/fluid/platform/device/xpu/xpu_info.h" #include "paddle/fluid/platform/device/xpu/xpu_info.h"
#endif #endif
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/platform/device/mlu/mlu_info.h"
#endif
#ifdef WITH_WIN_DUMP_DBG #ifdef WITH_WIN_DUMP_DBG
#include <stdio.h> #include <stdio.h>
#include <time.h> #include <time.h>
...@@ -195,14 +191,6 @@ void InitDevices() { ...@@ -195,14 +191,6 @@ void InitDevices() {
LOG(WARNING) LOG(WARNING)
<< "Compiled with PADDLE_WITH_IPU, but no IPU found in runtime."; << "Compiled with PADDLE_WITH_IPU, but no IPU found in runtime.";
} }
#endif
#ifdef PADDLE_WITH_MLU
try {
// use user specified MLUs in single-node multi-process mode.
devices = platform::GetMLUSelectedDevices();
} catch (const std::exception &exp) {
LOG(WARNING) << "Compiled with WITH_MLU, but no MLU found in runtime.";
}
#endif #endif
InitDevices(devices); InitDevices(devices);
}); });
...@@ -228,10 +216,6 @@ void InitDevices(const std::vector<int> devices) { ...@@ -228,10 +216,6 @@ void InitDevices(const std::vector<int> devices) {
#ifdef PADDLE_WITH_IPU #ifdef PADDLE_WITH_IPU
places.emplace_back(platform::IPUPlace(devices[i])); places.emplace_back(platform::IPUPlace(devices[i]));
#endif #endif
#ifdef PADDLE_WITH_MLU
places.emplace_back(platform::MLUPlace(devices[i]));
#endif
} }
places.emplace_back(platform::CPUPlace()); places.emplace_back(platform::CPUPlace());
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
......
...@@ -15,16 +15,13 @@ limitations under the License. */ ...@@ -15,16 +15,13 @@ limitations under the License. */
#include "gtest/gtest.h" #include "gtest/gtest.h"
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/platform/device/mlu/device_context.h"
#endif
TEST(InitDevices, CPU) { TEST(InitDevices, CPU) {
using paddle::framework::InitDevices; using paddle::framework::InitDevices;
using paddle::platform::DeviceContextPool; using paddle::platform::DeviceContextPool;
#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_XPU) && \ #if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_XPU) && \
!defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MLU) !defined(PADDLE_WITH_HIP)
InitDevices(); InitDevices();
DeviceContextPool& pool = DeviceContextPool::Instance(); DeviceContextPool& pool = DeviceContextPool::Instance();
ASSERT_EQ(pool.Size(), 1U); ASSERT_EQ(pool.Size(), 1U);
...@@ -55,18 +52,6 @@ TEST(InitDevices, XPU) { ...@@ -55,18 +52,6 @@ TEST(InitDevices, XPU) {
#endif #endif
} }
TEST(InitDevices, MLU) {
using paddle::framework::InitDevices;
using paddle::platform::DeviceContextPool;
#ifdef PADDLE_WITH_MLU
int count = paddle::platform::GetMLUDeviceCount();
InitDevices();
DeviceContextPool& pool = DeviceContextPool::Instance();
ASSERT_EQ(pool.Size(), 1U + static_cast<unsigned>(count));
#endif
}
#ifndef _WIN32 #ifndef _WIN32
TEST(SignalHandle, SignalHandle) { TEST(SignalHandle, SignalHandle) {
std::string msg = "Signal raises"; std::string msg = "Signal raises";
......
...@@ -32,7 +32,6 @@ using NPUPlace = phi::NPUPlace; ...@@ -32,7 +32,6 @@ using NPUPlace = phi::NPUPlace;
using NPUPinnedPlace = phi::NPUPinnedPlace; using NPUPinnedPlace = phi::NPUPinnedPlace;
using XPUPlace = phi::XPUPlace; using XPUPlace = phi::XPUPlace;
using IPUPlace = phi::IPUPlace; using IPUPlace = phi::IPUPlace;
using MLUPlace = phi::MLUPlace;
using CustomPlace = phi::CustomPlace; using CustomPlace = phi::CustomPlace;
using PlaceList = std::vector<Place>; using PlaceList = std::vector<Place>;
...@@ -110,15 +109,6 @@ typename Visitor::result_type VisitPlace(const Place &place, ...@@ -110,15 +109,6 @@ typename Visitor::result_type VisitPlace(const Place &place,
PADDLE_THROW(platform::errors::Unavailable( PADDLE_THROW(platform::errors::Unavailable(
"Paddle is not compiled with IPU. Cannot visit ipu device")); "Paddle is not compiled with IPU. Cannot visit ipu device"));
return typename Visitor::result_type(); return typename Visitor::result_type();
#endif
}
case phi::AllocationType::MLU: {
#ifdef PADDLE_WITH_MLU
platform::MLUPlace p(place.GetDeviceId());
return visitor(p);
#else
PADDLE_THROW(platform::errors::Unavailable(
"Paddle is not compiled with MLU. Cannot visit mlu device"));
#endif #endif
} }
case phi::AllocationType::CUSTOM: { case phi::AllocationType::CUSTOM: {
......
...@@ -19,7 +19,6 @@ TEST(Place, Equality) { ...@@ -19,7 +19,6 @@ TEST(Place, Equality) {
paddle::platform::CPUPlace cpu; paddle::platform::CPUPlace cpu;
paddle::platform::CUDAPlace g0(0), g1(1), gg0(0); paddle::platform::CUDAPlace g0(0), g1(1), gg0(0);
paddle::platform::XPUPlace x0(0), x1(1), xx0(0); paddle::platform::XPUPlace x0(0), x1(1), xx0(0);
paddle::platform::MLUPlace m0(0), m1(1), mm0(0);
EXPECT_EQ(cpu, cpu); EXPECT_EQ(cpu, cpu);
EXPECT_EQ(g0, g0); EXPECT_EQ(g0, g0);
...@@ -28,13 +27,9 @@ TEST(Place, Equality) { ...@@ -28,13 +27,9 @@ TEST(Place, Equality) {
EXPECT_EQ(x0, x0); EXPECT_EQ(x0, x0);
EXPECT_EQ(x1, x1); EXPECT_EQ(x1, x1);
EXPECT_EQ(x0, xx0); EXPECT_EQ(x0, xx0);
EXPECT_EQ(m0, m0);
EXPECT_EQ(m1, m1);
EXPECT_EQ(m0, mm0);
EXPECT_NE(g0, g1); EXPECT_NE(g0, g1);
EXPECT_NE(x0, x1); EXPECT_NE(x0, x1);
EXPECT_NE(m0, m1);
EXPECT_TRUE(paddle::platform::places_are_same_class(g0, gg0)); EXPECT_TRUE(paddle::platform::places_are_same_class(g0, gg0));
EXPECT_TRUE(paddle::platform::places_are_same_class(x0, xx0)); EXPECT_TRUE(paddle::platform::places_are_same_class(x0, xx0));
...@@ -49,11 +44,6 @@ TEST(Place, Print) { ...@@ -49,11 +44,6 @@ TEST(Place, Print) {
ss << paddle::platform::XPUPlace(1); ss << paddle::platform::XPUPlace(1);
EXPECT_EQ("Place(xpu:1)", ss.str()); EXPECT_EQ("Place(xpu:1)", ss.str());
} }
{
std::stringstream ss;
ss << paddle::platform::MLUPlace(1);
EXPECT_EQ("Place(mlu:1)", ss.str());
}
{ {
std::stringstream ss; std::stringstream ss;
ss << paddle::platform::CUDAPlace(1); ss << paddle::platform::CUDAPlace(1);
......
...@@ -6,7 +6,6 @@ cc_library( ...@@ -6,7 +6,6 @@ cc_library(
cuda_tracer cuda_tracer
SRCS cuda_tracer.cc cupti_data_process.cc SRCS cuda_tracer.cc cupti_data_process.cc
DEPS workqueue_utils enforce glog) DEPS workqueue_utils enforce glog)
add_subdirectory(mlu)
add_subdirectory(custom_device) add_subdirectory(custom_device)
cc_library( cc_library(
event_node event_node
...@@ -33,12 +32,7 @@ cc_library( ...@@ -33,12 +32,7 @@ cc_library(
cc_library( cc_library(
new_profiler new_profiler
SRCS profiler.cc SRCS profiler.cc
DEPS host_tracer DEPS host_tracer cuda_tracer profiler_utils cpu_utilization event_bind
cuda_tracer
profiler_utils
cpu_utilization
event_bind
mlu_tracer
custom_tracer) custom_tracer)
cc_test( cc_test(
test_event_node test_event_node
......
...@@ -790,11 +790,7 @@ void ChromeTracingLogger::RefineDisplayName( ...@@ -790,11 +790,7 @@ void ChromeTracingLogger::RefineDisplayName(
(*it).second * 2 + 1); (*it).second * 2 + 1);
} }
#ifdef PADDLE_WITH_MLU
static std::string device_type("MLU");
#else
static std::string device_type("GPU"); static std::string device_type("GPU");
#endif
for (auto it = deviceid_streamid_set_.begin(); for (auto it = deviceid_streamid_set_.begin();
it != deviceid_streamid_set_.end(); it != deviceid_streamid_set_.end();
......
if(WITH_MLU)
set(MLU_INFO mlu_info)
endif()
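# MLU_INFO stays empty when WITH_MLU is OFF, so mlu_tracer is still built but
# does not link against mlu_info in that case.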
cc_library(
mlu_tracer
SRCS mlu_tracer.cc cnpapi_data_process.cc
DEPS workqueue_utils enforce glog ${MLU_INFO})
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/platform/profiler/mlu/cnpapi_data_process.h"
#include <cstdio>
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/os_info.h"
#ifdef PADDLE_WITH_MLU
namespace paddle {
namespace platform {
namespace {
inline uint64_t GetTimeGap() {
static uint64_t time_gap = []() -> uint64_t {
uint64_t cpu_time = PosixInNsec();
uint64_t mlu_time = cnpapiGetTimestamp();
return (cpu_time - mlu_time);
}();
return time_gap;
}
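// The CPU and MLU timestamp sources are not aligned, so the offset between
// them is sampled once and added to every device-side timestamp to place
// kernel/memcpy/memset events on the host timeline.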
void AddKernelRecord(const cnpapiActivityKernel* kernel,
uint64_t start_ns,
TraceEventCollector* collector) {
static uint64_t time_gap = GetTimeGap();
if (kernel->start + time_gap < start_ns) {
return;
}
DeviceTraceEvent event;
event.name = demangle(kernel->name);
event.type = TracerEventType::Kernel;
event.start_ns = kernel->start + time_gap;
event.end_ns = kernel->end + time_gap;
event.device_id = kernel->device_id;
event.context_id = kernel->context_id;
event.stream_id = kernel->queue_id;
event.correlation_id = kernel->correlation_id;
event.kernel_info.block_x = kernel->dimx;
event.kernel_info.block_y = kernel->dimy;
event.kernel_info.block_z = kernel->dimz;
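// The generic kernel event has no dedicated field for the MLU kernel type, so
// it is recorded in the grid_x slot and the remaining grid fields are zeroed.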
event.kernel_info.grid_x = kernel->kernel_type;
event.kernel_info.grid_y = 0;
event.kernel_info.grid_z = 0;
event.kernel_info.queued = kernel->queued;
event.kernel_info.submitted = kernel->submitted;
event.kernel_info.completed = kernel->received;
collector->AddDeviceEvent(std::move(event));
}
const char* MemcpyKind(cnpapiActivityMemcpyType kind) {
switch (kind) {
case CNPAPI_ACTIVITY_MEMCPY_TYPE_HTOD:
return "MEMCPY_HtoD";
case CNPAPI_ACTIVITY_MEMCPY_TYPE_DTOH:
return "MEMCPY_DtoH";
case CNPAPI_ACTIVITY_MEMCPY_TYPE_DTOD:
return "MEMCPY_DtoD";
case CNPAPI_ACTIVITY_MEMCPY_TYPE_HTOH:
return "MEMCPY_HtoH";
case CNPAPI_ACTIVITY_MEMCPY_TYPE_PTOP:
return "MEMCPY_PtoP";
default:
break;
}
return "MEMCPY";
}
void AddMemcpyRecord(const cnpapiActivityMemcpy* memcpy,
uint64_t start_ns,
TraceEventCollector* collector) {
static uint64_t time_gap = GetTimeGap();
if (memcpy->start + time_gap < start_ns) {
return;
}
DeviceTraceEvent event;
event.name = MemcpyKind(memcpy->copy_type);
event.type = TracerEventType::Memcpy;
event.start_ns = memcpy->start + time_gap;
event.end_ns = memcpy->end + time_gap;
event.device_id = memcpy->device_id;
event.context_id = memcpy->context_id;
event.stream_id = memcpy->queue_id;
event.correlation_id = memcpy->correlation_id;
event.memcpy_info.num_bytes = memcpy->bytes;
snprintf(event.memcpy_info.copy_kind,
phi::kMemKindMaxLen,
"%s",
MemcpyKind(memcpy->copy_type));
collector->AddDeviceEvent(std::move(event));
}
void AddMemcpy2Record(const cnpapiActivityMemcpyPtoP* memcpy2,
uint64_t start_ns,
TraceEventCollector* collector) {
static uint64_t time_gap = GetTimeGap();
if (memcpy2->start + time_gap < start_ns) {
return;
}
DeviceTraceEvent event;
event.name = MemcpyKind(memcpy2->copy_type);
event.type = TracerEventType::Memcpy;
event.start_ns = memcpy2->start + time_gap;
event.end_ns = memcpy2->end + time_gap;
event.device_id = memcpy2->device_id;
event.context_id = memcpy2->context_id;
event.stream_id = memcpy2->queue_id;
event.correlation_id = memcpy2->correlation_id;
event.memcpy_info.num_bytes = memcpy2->bytes;
snprintf(event.memcpy_info.copy_kind,
phi::kMemKindMaxLen,
"%s",
MemcpyKind(memcpy2->copy_type));
collector->AddDeviceEvent(std::move(event));
}
void AddMemsetRecord(const cnpapiActivityMemset* memset,
uint64_t start_ns,
TraceEventCollector* collector) {
static uint64_t time_gap = GetTimeGap();
if (memset->start + time_gap < start_ns) {
return;
}
DeviceTraceEvent event;
event.name = "MEMSET";
event.type = TracerEventType::Memset;
event.start_ns = memset->start + time_gap;
event.end_ns = memset->end + time_gap;
event.device_id = memset->device_id;
event.context_id = memset->context_id;
event.stream_id = memset->queue_id;
event.correlation_id = memset->correlation_id;
event.memset_info.num_bytes = memset->bytes;
event.memset_info.value = memset->value;
collector->AddDeviceEvent(std::move(event));
}
class CnpapiRuntimeCbidStr {
public:
static const CnpapiRuntimeCbidStr& GetInstance() {
static CnpapiRuntimeCbidStr inst;
return inst;
}
std::string RuntimeKind(cnpapi_CallbackId cbid) const {
auto iter = cbid_str_.find(cbid);
if (iter == cbid_str_.end()) {
return "MLU Runtime API " + std::to_string(cbid);
}
return iter->second;
}
private:
CnpapiRuntimeCbidStr();
std::unordered_map<cnpapi_CallbackId, std::string> cbid_str_;
};
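// The constructor below registers a readable name for each traced CNDRV
// callback id; RuntimeKind() falls back to "MLU Runtime API <cbid>" for ids
// that were not registered.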
CnpapiRuntimeCbidStr::CnpapiRuntimeCbidStr() {
#define REGISTER_RUNTIME_CBID_STR(cbid) \
cbid_str_[CNPAPI_CNDRV_TRACE_CBID_##cbid] = #cbid
REGISTER_RUNTIME_CBID_STR(cnMalloc);
REGISTER_RUNTIME_CBID_STR(cnMallocHost);
REGISTER_RUNTIME_CBID_STR(cnFree);
REGISTER_RUNTIME_CBID_STR(cnFreeHost);
REGISTER_RUNTIME_CBID_STR(cnMemcpy);
REGISTER_RUNTIME_CBID_STR(cnMemcpyPeer);
REGISTER_RUNTIME_CBID_STR(cnMemcpyHtoD);
REGISTER_RUNTIME_CBID_STR(cnMemcpyDtoH);
REGISTER_RUNTIME_CBID_STR(cnMemcpyDtoD);
REGISTER_RUNTIME_CBID_STR(cnMemcpyAsync);
REGISTER_RUNTIME_CBID_STR(cnMemcpyHtoDAsync);
REGISTER_RUNTIME_CBID_STR(cnMemcpyDtoHAsync);
REGISTER_RUNTIME_CBID_STR(cnMemcpyDtoDAsync);
REGISTER_RUNTIME_CBID_STR(cnMemcpyDtoD2D);
REGISTER_RUNTIME_CBID_STR(cnMemcpyDtoD3D);
REGISTER_RUNTIME_CBID_STR(cnMemcpy2D);
REGISTER_RUNTIME_CBID_STR(cnMemcpy3D);
REGISTER_RUNTIME_CBID_STR(cnMemsetD8);
REGISTER_RUNTIME_CBID_STR(cnMemsetD16);
REGISTER_RUNTIME_CBID_STR(cnMemsetD32);
REGISTER_RUNTIME_CBID_STR(cnMemsetD8Async);
REGISTER_RUNTIME_CBID_STR(cnMemsetD16Async);
REGISTER_RUNTIME_CBID_STR(cnMemsetD32Async);
REGISTER_RUNTIME_CBID_STR(cnInvokeKernel);
REGISTER_RUNTIME_CBID_STR(cnCreateQueue);
REGISTER_RUNTIME_CBID_STR(cnDestroyQueue);
REGISTER_RUNTIME_CBID_STR(cnQueueSync);
REGISTER_RUNTIME_CBID_STR(cnQueueWaitNotifier);
REGISTER_RUNTIME_CBID_STR(cnWaitNotifier);
REGISTER_RUNTIME_CBID_STR(cnCreateNotifier);
REGISTER_RUNTIME_CBID_STR(cnDestroyNotifier);
REGISTER_RUNTIME_CBID_STR(cnPlaceNotifier);
REGISTER_RUNTIME_CBID_STR(cnCtxCreate);
REGISTER_RUNTIME_CBID_STR(cnCtxDestroy);
REGISTER_RUNTIME_CBID_STR(cnCtxGetCurrent);
REGISTER_RUNTIME_CBID_STR(cnCtxSetCurrent);
REGISTER_RUNTIME_CBID_STR(cnCtxGetDevice);
REGISTER_RUNTIME_CBID_STR(cnCtxSync);
REGISTER_RUNTIME_CBID_STR(cnInvokeHostFunc);
#undef REGISTER_RUNTIME_CBID_STR
}
void AddApiRecord(const cnpapiActivityAPI* api,
uint64_t start_ns,
TraceEventCollector* collector) {
static uint64_t time_gap = GetTimeGap();
if (api->start + time_gap < start_ns) {
return;
}
RuntimeTraceEvent event;
event.name = CnpapiRuntimeCbidStr::GetInstance().RuntimeKind(api->cbid);
event.start_ns = api->start + time_gap;
event.end_ns = api->end + time_gap;
event.process_id = api->process_id;
event.thread_id = api->thread_id;
event.correlation_id = api->correlation_id;
event.callback_id = api->cbid;
event.type = TracerEventType::MluRuntime;
collector->AddRuntimeEvent(std::move(event));
}
} // namespace
namespace details {
void ProcessCnpapiActivityRecord(const cnpapiActivity* record,
uint64_t start_ns,
TraceEventCollector* collector) {
switch (record->type) {
case CNPAPI_ACTIVITY_TYPE_KERNEL:
AddKernelRecord(reinterpret_cast<const cnpapiActivityKernel*>(record),
start_ns,
collector);
break;
case CNPAPI_ACTIVITY_TYPE_MEMCPY:
AddMemcpyRecord(reinterpret_cast<const cnpapiActivityMemcpy*>(record),
start_ns,
collector);
break;
case CNPAPI_ACTIVITY_TYPE_MEMCPY_PTOP:
AddMemcpy2Record(
reinterpret_cast<const cnpapiActivityMemcpyPtoP*>(record),
start_ns,
collector);
break;
case CNPAPI_ACTIVITY_TYPE_MEMSET:
AddMemsetRecord(reinterpret_cast<const cnpapiActivityMemset*>(record),
start_ns,
collector);
break;
case CNPAPI_ACTIVITY_TYPE_CNDRV_API:
AddApiRecord(reinterpret_cast<const cnpapiActivityAPI*>(record),
start_ns,
collector);
break;
default:
break;
}
}
} // namespace details
} // namespace platform
} // namespace paddle
#endif
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <unordered_map>
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/platform/device/mlu/mlu_info.h"
#endif
#include "paddle/fluid/platform/profiler/trace_event_collector.h"
namespace paddle {
namespace platform {
namespace details {
#ifdef PADDLE_WITH_MLU
void ProcessCnpapiActivityRecord(const cnpapiActivity* record,
uint64_t start_ns,
TraceEventCollector* collector);
#endif
} // namespace details
} // namespace platform
} // namespace paddle
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/platform/profiler/mlu/mlu_tracer.h"
#include <string>
#include <unordered_map>
#include "glog/logging.h"
#include "paddle/fluid/framework/new_executor/workqueue/workqueue_utils.h"
#include "paddle/fluid/platform/os_info.h"
#include "paddle/fluid/platform/profiler/mlu/cnpapi_data_process.h"
#define CNPAPI_CALL(call) \
do { \
cnpapiResult _status = call; \
if (_status != CNPAPI_SUCCESS) { \
const char* errstr; \
cnpapiGetResultString(_status, &errstr); \
LOG(ERROR) << "Function " << #call << " failed with error " << errstr; \
} \
} while (0)
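// CNPAPI_CALL only logs failures instead of throwing, so profiling errors are
// reported without aborting the program.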
namespace paddle {
namespace platform {
namespace {
void BufferRequestedCallback(uint64_t** buffer,
size_t* size,
size_t* max_num_records) {
constexpr size_t kBufferSize = 1 << 23; // 8 MB
constexpr size_t kBufferAlignSize = 8;
*buffer = reinterpret_cast<uint64_t*>(
paddle::framework::AlignedMalloc(kBufferSize, kBufferAlignSize));
*size = kBufferSize;
*max_num_records = 0;
}
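// cnpapi fills the 8 MB aligned buffer handed out above and returns it through
// BufferCompletedCallback, which forwards the completed records to the tracer
// and then frees the buffer.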
void BufferCompletedCallback(uint64_t* buffer, size_t size, size_t valid_size) {
if (buffer == nullptr || valid_size == 0) {
return;
}
auto mlu_tracer = &MluTracer::GetInstance();
mlu_tracer->ProcessCnpapiActivity(buffer, valid_size);
paddle::framework::AlignedFree(buffer);
}
} // namespace
MluTracer::MluTracer() {
#ifdef PADDLE_WITH_MLU
CNPAPI_CALL(cnpapiInit());
CNPAPI_CALL(cnpapiActivityRegisterCallbacks(BufferRequestedCallback,
BufferCompletedCallback));
#endif
}
void MluTracer::PrepareTracing() {
PADDLE_ENFORCE_EQ(
state_ == TracerState::UNINITED || state_ == TracerState::STOPED,
true,
platform::errors::PreconditionNotMet("MluTracer must be UNINITED"));
EnableCnpapiActivity();
state_ = TracerState::READY;
}
void MluTracer::StartTracing() {
PADDLE_ENFORCE_EQ(state_ == TracerState::READY,
true,
platform::errors::PreconditionNotMet(
"MluTracer must be READY"));
tracing_start_ns_ = PosixInNsec();
state_ = TracerState::STARTED;
}
void MluTracer::StopTracing() {
PADDLE_ENFORCE_EQ(
state_,
TracerState::STARTED,
platform::errors::PreconditionNotMet("MluTracer must be STARTED"));
DisableCnpapiActivity();
state_ = TracerState::STOPED;
}
void MluTracer::CollectTraceData(TraceEventCollector* collector) {
PADDLE_ENFORCE_EQ(
state_,
TracerState::STOPED,
platform::errors::PreconditionNotMet("MluTracer must be STOPED"));
for (auto he : collector_.HostEvents()) {
collector->AddHostEvent(std::move(he));
}
for (auto rte : collector_.RuntimeEvents()) {
collector->AddRuntimeEvent(std::move(rte));
}
for (auto de : collector_.DeviceEvents()) {
collector->AddDeviceEvent(std::move(de));
}
for (auto tn : collector_.ThreadNames()) {
collector->AddThreadName(tn.first, tn.second);
}
collector_.ClearAll();
}
void MluTracer::ProcessCnpapiActivity(uint64_t* buffer, size_t valid_size) {
#ifdef PADDLE_WITH_MLU
cnpapiActivity* record = nullptr;
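// Drain all records from the completed buffer; the loop stops when cnpapi
// reports that no more records are available (or that memory is insufficient),
// and any other failure is logged via CNPAPI_CALL.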
while (true) {
cnpapiResult status =
cnpapiActivityGetNextRecord(buffer, valid_size, &record);
if (status == CNPAPI_SUCCESS) {
details::ProcessCnpapiActivityRecord(
record, tracing_start_ns_, &collector_);
} else if (status == CNPAPI_ERROR_INSUFFICIENT_MEMORY ||
status == CNPAPI_ERROR_MAX_LIMIT_REACHED) {
break;
} else {
CNPAPI_CALL(status);
}
}
#endif
}
void MluTracer::EnableCnpapiActivity() {
#ifdef PADDLE_WITH_MLU
CNPAPI_CALL(cnpapiActivityEnable(CNPAPI_ACTIVITY_TYPE_KERNEL));
CNPAPI_CALL(cnpapiActivityEnable(CNPAPI_ACTIVITY_TYPE_MEMCPY));
CNPAPI_CALL(cnpapiActivityEnable(CNPAPI_ACTIVITY_TYPE_MEMCPY_PTOP));
CNPAPI_CALL(cnpapiActivityEnable(CNPAPI_ACTIVITY_TYPE_MEMSET));
CNPAPI_CALL(cnpapiActivityEnable(CNPAPI_ACTIVITY_TYPE_CNDRV_API));
VLOG(3) << "enable cnpapi activity";
#endif
}
void MluTracer::DisableCnpapiActivity() {
#ifdef PADDLE_WITH_MLU
CNPAPI_CALL(cnpapiActivityFlushAll());
CNPAPI_CALL(cnpapiActivityDisable(CNPAPI_ACTIVITY_TYPE_KERNEL));
CNPAPI_CALL(cnpapiActivityDisable(CNPAPI_ACTIVITY_TYPE_MEMCPY));
CNPAPI_CALL(cnpapiActivityDisable(CNPAPI_ACTIVITY_TYPE_MEMCPY_PTOP));
CNPAPI_CALL(cnpapiActivityDisable(CNPAPI_ACTIVITY_TYPE_MEMSET));
CNPAPI_CALL(cnpapiActivityDisable(CNPAPI_ACTIVITY_TYPE_CNDRV_API));
VLOG(3) << "disable cnpapi activity";
#endif
}
} // namespace platform
} // namespace paddle
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <cstdint>
#include <vector>
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/platform/device/mlu/mlu_info.h"
#endif
#include "paddle/fluid/platform/macros.h"
#include "paddle/fluid/platform/profiler/tracer_base.h"
namespace paddle {
namespace platform {
class MluTracer : public TracerBase {
public:
static MluTracer& GetInstance() {
static MluTracer instance;
return instance;
}
void PrepareTracing() override;
void StartTracing() override;
void StopTracing() override;
void CollectTraceData(TraceEventCollector* collector) override;
void ProcessCnpapiActivity(uint64_t* buffer, size_t valid_size);
private:
MluTracer();
DISABLE_COPY_AND_ASSIGN(MluTracer);
void EnableCnpapiActivity();
void DisableCnpapiActivity();
uint64_t tracing_start_ns_ = UINT64_MAX;
TraceEventCollector collector_;
};
} // namespace platform
} // namespace paddle
...@@ -29,10 +29,6 @@ ...@@ -29,10 +29,6 @@
#include "paddle/fluid/platform/profiler/custom_device/custom_tracer.h" #include "paddle/fluid/platform/profiler/custom_device/custom_tracer.h"
#include "paddle/fluid/platform/profiler/extra_info.h" #include "paddle/fluid/platform/profiler/extra_info.h"
#include "paddle/fluid/platform/profiler/host_tracer.h" #include "paddle/fluid/platform/profiler/host_tracer.h"
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/platform/device/mlu/enforce.h"
#include "paddle/fluid/platform/profiler/mlu/mlu_tracer.h"
#endif
#include "paddle/fluid/platform/profiler/trace_event_collector.h" #include "paddle/fluid/platform/profiler/trace_event_collector.h"
#include "paddle/fluid/platform/profiler/utils.h" #include "paddle/fluid/platform/profiler/utils.h"
#ifdef PADDLE_WITH_CUSTOM_DEVICE #ifdef PADDLE_WITH_CUSTOM_DEVICE
...@@ -49,9 +45,6 @@ void SynchronizeDevice() { ...@@ -49,9 +45,6 @@ void SynchronizeDevice() {
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize()); PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize());
#endif #endif
#ifdef PADDLE_WITH_MLU
PADDLE_ENFORCE_MLU_SUCCESS(cnrtSyncDevice());
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE #ifdef PADDLE_WITH_CUSTOM_DEVICE
auto dev_types = phi::DeviceManager::GetAllCustomDeviceTypes(); auto dev_types = phi::DeviceManager::GetAllCustomDeviceTypes();
for (const auto& dev_type : dev_types) { for (const auto& dev_type : dev_types) {
...@@ -86,9 +79,6 @@ bool Profiler::IsCuptiSupported() { ...@@ -86,9 +79,6 @@ bool Profiler::IsCuptiSupported() {
bool Profiler::IsCnpapiSupported() { bool Profiler::IsCnpapiSupported() {
bool supported = false; bool supported = false;
#ifdef PADDLE_WITH_MLU
supported = true;
#endif
return supported; return supported;
} }
...@@ -104,11 +94,6 @@ Profiler::Profiler(const ProfilerOptions& options, ...@@ -104,11 +94,6 @@ Profiler::Profiler(const ProfilerOptions& options,
if (trace_switch.test(kProfileGPUOptionBit)) { if (trace_switch.test(kProfileGPUOptionBit)) {
tracers_.emplace_back(&CudaTracer::GetInstance(), false); tracers_.emplace_back(&CudaTracer::GetInstance(), false);
} }
#ifdef PADDLE_WITH_MLU
if (trace_switch.test(kProfileMLUOptionBit)) {
tracers_.emplace_back(&MluTracer::GetInstance(), false);
}
#endif
if (trace_switch.test(kProfileCustomDeviceOptionBit)) { if (trace_switch.test(kProfileCustomDeviceOptionBit)) {
for (const auto& dev_type : custom_device_types) { for (const auto& dev_type : custom_device_types) {
tracers_.emplace_back(&CustomTracer::GetInstance(dev_type), false); tracers_.emplace_back(&CustomTracer::GetInstance(dev_type), false);
......
...@@ -34,10 +34,6 @@ limitations under the License. */ ...@@ -34,10 +34,6 @@ limitations under the License. */
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h> #include <hip/hip_runtime.h>
#endif #endif
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/platform/device/mlu/enforce.h"
#include "paddle/fluid/platform/device/mlu/mlu_info.h"
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE #ifdef PADDLE_WITH_CUSTOM_DEVICE
#include "paddle/phi/backends/device_manager.h" #include "paddle/phi/backends/device_manager.h"
#endif #endif
...@@ -112,13 +108,6 @@ void SynchronizeAllDevice() { ...@@ -112,13 +108,6 @@ void SynchronizeAllDevice() {
PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize()); PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize());
} }
#endif #endif
#ifdef PADDLE_WITH_MLU
int count = GetMLUDeviceCount();
for (int i = 0; i < count; i++) {
SetMLUDeviceId(i);
PADDLE_ENFORCE_MLU_SUCCESS(cnrtSyncDevice());
}
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE #ifdef PADDLE_WITH_CUSTOM_DEVICE
auto dev_types = phi::DeviceManager::GetAllCustomDeviceTypes(); auto dev_types = phi::DeviceManager::GetAllCustomDeviceTypes();
for (const auto &dev_type : dev_types) { for (const auto &dev_type : dev_types) {
......
...@@ -32,10 +32,6 @@ static void StreamCallbackFunc(gpuStream_t stream, ...@@ -32,10 +32,6 @@ static void StreamCallbackFunc(gpuStream_t stream,
StreamCallbackFunc(cudaStream_t stream, cudaError_t status, void *user_data) StreamCallbackFunc(cudaStream_t stream, cudaError_t status, void *user_data)
#endif #endif
#endif #endif
#if PADDLE_WITH_MLU
static void StreamCallbackFunc(void *user_data)
#endif
{ {
std::unique_ptr<std::function<void()>> func( std::unique_ptr<std::function<void()>> func(
reinterpret_cast<std::function<void()> *>(user_data)); reinterpret_cast<std::function<void()> *>(user_data));
...@@ -71,20 +67,12 @@ void StreamCallbackManager<Stream>::AddCallback( ...@@ -71,20 +67,12 @@ void StreamCallbackManager<Stream>::AddCallback(
cudaStreamAddCallback(stream_, StreamCallbackFunc, func, 0)); cudaStreamAddCallback(stream_, StreamCallbackFunc, func, 0));
#endif #endif
#endif #endif
#if PADDLE_WITH_MLU
VLOG(3) << "MLULaunchCallback at stream: " << stream_;
cnrtInvokeHostFunc(stream_, StreamCallbackFunc, func);
#endif
} }
template <typename Stream> template <typename Stream>
void StreamCallbackManager<Stream>::Wait() const { void StreamCallbackManager<Stream>::Wait() const {
#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_CUDA) #if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_CUDA)
platform::GpuStreamSync(stream_); platform::GpuStreamSync(stream_);
#endif
#ifdef PADDLE_WITH_MLU
PADDLE_ENFORCE_MLU_SUCCESS(cnrtQueueSync(stream_));
#endif #endif
{ {
std::lock_guard<std::mutex> lock(mtx_); std::lock_guard<std::mutex> lock(mtx_);
...@@ -100,10 +88,5 @@ template struct StreamCallbackManager<gpuStream_t>; ...@@ -100,10 +88,5 @@ template struct StreamCallbackManager<gpuStream_t>;
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
template struct StreamCallbackManager<hipStream_t>; template struct StreamCallbackManager<hipStream_t>;
#endif #endif
#ifdef PADDLE_WITH_MLU
template struct StreamCallbackManager<mluStream>;
#endif
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
...@@ -151,8 +151,6 @@ static const platform::Place PyObjectToPlace(const py::object &place_obj) { ...@@ -151,8 +151,6 @@ static const platform::Place PyObjectToPlace(const py::object &place_obj) {
return place_obj.cast<platform::IPUPlace>(); return place_obj.cast<platform::IPUPlace>();
} else if (py::isinstance<platform::Place>(place_obj)) { } else if (py::isinstance<platform::Place>(place_obj)) {
return place_obj.cast<platform::Place>(); return place_obj.cast<platform::Place>();
} else if (py::isinstance<platform::MLUPlace>(place_obj)) {
return place_obj.cast<platform::MLUPlace>();
} else if (py::isinstance<platform::CustomPlace>(place_obj)) { } else if (py::isinstance<platform::CustomPlace>(place_obj)) {
return place_obj.cast<platform::CustomPlace>(); return place_obj.cast<platform::CustomPlace>();
} else { } else {
...@@ -207,8 +205,6 @@ static void InitVarBaseAndTensor(imperative::VarBase *self, ...@@ -207,8 +205,6 @@ static void InitVarBaseAndTensor(imperative::VarBase *self,
SetTensorFromPyArray<platform::NPUPlace>(tensor, array, place, zero_copy); SetTensorFromPyArray<platform::NPUPlace>(tensor, array, place, zero_copy);
} else if (platform::is_ipu_place(place)) { } else if (platform::is_ipu_place(place)) {
SetTensorFromPyArray<platform::IPUPlace>(tensor, array, place, zero_copy); SetTensorFromPyArray<platform::IPUPlace>(tensor, array, place, zero_copy);
} else if (platform::is_mlu_place(place)) {
SetTensorFromPyArray<platform::MLUPlace>(tensor, array, place, zero_copy);
} else if (platform::is_custom_place(place)) { } else if (platform::is_custom_place(place)) {
SetTensorFromPyArray<platform::CustomPlace>( SetTensorFromPyArray<platform::CustomPlace>(
tensor, array, place, zero_copy); tensor, array, place, zero_copy);
...@@ -727,14 +723,6 @@ void BindImperative(py::module *m_ptr) { ...@@ -727,14 +723,6 @@ void BindImperative(py::module *m_ptr) {
py::arg("zero_copy") = false, py::arg("zero_copy") = false,
py::arg("name") = "", py::arg("name") = "",
py::arg("stop_gradient") = -1) py::arg("stop_gradient") = -1)
.def("__init__",
&InitVarBaseFromNumpyWithArg<platform::MLUPlace>,
py::arg("value"),
py::arg("place"),
py::arg("persistable") = false,
py::arg("zero_copy") = false,
py::arg("name") = "",
py::arg("stop_gradient") = -1)
.def("__init__", .def("__init__",
&InitVarBaseFromNumpyWithArg<platform::CustomPlace>, &InitVarBaseFromNumpyWithArg<platform::CustomPlace>,
py::arg("value"), py::arg("value"),
...@@ -773,11 +761,6 @@ void BindImperative(py::module *m_ptr) { ...@@ -773,11 +761,6 @@ void BindImperative(py::module *m_ptr) {
py::arg("tensor"), py::arg("tensor"),
py::arg("place"), py::arg("place"),
py::arg("name") = "") py::arg("name") = "")
.def("__init__",
&InitVarBaseFromTensorWithArg<platform::MLUPlace>,
py::arg("tensor"),
py::arg("place"),
py::arg("name") = "")
.def("__init__", .def("__init__",
&InitVarBaseFromTensorWithArg<platform::CustomPlace>, &InitVarBaseFromTensorWithArg<platform::CustomPlace>,
py::arg("tensor"), py::arg("tensor"),
...@@ -1878,18 +1861,6 @@ void BindImperative(py::module *m_ptr) { ...@@ -1878,18 +1861,6 @@ void BindImperative(py::module *m_ptr) {
return new_var; return new_var;
}, },
py::return_value_policy::copy) py::return_value_policy::copy)
.def(
"_copy_to",
[](const std::shared_ptr<imperative::VarBase> &self,
const platform::MLUPlace &place,
bool blocking) {
auto new_var = self->NewVarBase(place, blocking);
if (!blocking) {
IncreaseVarbaseReferenceCountUntilCopyComplete(self, place);
}
return new_var;
},
py::return_value_policy::copy)
.def( .def(
"_copy_to", "_copy_to",
[](const std::shared_ptr<imperative::VarBase> &self, [](const std::shared_ptr<imperative::VarBase> &self,
...@@ -2217,11 +2188,6 @@ void BindImperative(py::module *m_ptr) { ...@@ -2217,11 +2188,6 @@ void BindImperative(py::module *m_ptr) {
self.SetExpectedPlace(*p); self.SetExpectedPlace(*p);
VLOG(4) << "Tracer(" << &self << ")" VLOG(4) << "Tracer(" << &self << ")"
<< " set expected place " << *p; << " set expected place " << *p;
} else if (py::isinstance<platform::MLUPlace>(obj)) {
auto p = obj.cast<platform::MLUPlace *>();
self.SetExpectedPlace(*p);
VLOG(4) << "Tracer(" << &self << ")"
<< " set expected place " << *p;
} else if (py::isinstance<platform::CustomPlace>(obj)) { } else if (py::isinstance<platform::CustomPlace>(obj)) {
auto p = obj.cast<platform::CustomPlace *>(); auto p = obj.cast<platform::CustomPlace *>();
self.SetExpectedPlace(*p); self.SetExpectedPlace(*p);
...@@ -2412,28 +2378,6 @@ void BindImperative(py::module *m_ptr) { ...@@ -2412,28 +2378,6 @@ void BindImperative(py::module *m_ptr) {
inplace_map); inplace_map);
} }
}) })
.def("trace",
[](imperative::Tracer &self,
const std::string &type,
const PyNameVarBaseMap &ins,
const PyNameVarBaseMap &outs,
framework::AttributeMap attrs,
const platform::MLUPlace &place,
bool trace_backward,
const std::map<std::string, std::string> &inplace_map = {}) {
auto ins_map = ConvertToNameVarBaseMap(ins);
auto outs_map = ConvertToNameVarBaseMap(outs);
{
py::gil_scoped_release release;
self.TraceOp<imperative::VarBase>(type,
std::move(ins_map),
std::move(outs_map),
std::move(attrs),
place,
trace_backward,
inplace_map);
}
})
.def("trace", .def("trace",
[](imperative::Tracer &self, [](imperative::Tracer &self,
const std::string &type, const std::string &type,
...@@ -2505,7 +2449,6 @@ void BindImperative(py::module *m_ptr) { ...@@ -2505,7 +2449,6 @@ void BindImperative(py::module *m_ptr) {
m.def("varbase_copy", &VarBaseCopy<platform::CUDAPinnedPlace>); m.def("varbase_copy", &VarBaseCopy<platform::CUDAPinnedPlace>);
m.def("varbase_copy", &VarBaseCopy<platform::NPUPlace>); m.def("varbase_copy", &VarBaseCopy<platform::NPUPlace>);
m.def("varbase_copy", &VarBaseCopy<platform::CustomPlace>); m.def("varbase_copy", &VarBaseCopy<platform::CustomPlace>);
m.def("varbase_copy", &VarBaseCopy<platform::MLUPlace>);
m.def( m.def(
"dygraph_partial_grad", "dygraph_partial_grad",
...@@ -2616,19 +2559,6 @@ void BindImperative(py::module *m_ptr) { ...@@ -2616,19 +2559,6 @@ void BindImperative(py::module *m_ptr) {
py::arg("ring_id")); py::arg("ring_id"));
#endif #endif
#if defined(PADDLE_WITH_CNCL)
py::class_<imperative::CNCLParallelContext,
imperative::ParallelContext,
std::shared_ptr<imperative::CNCLParallelContext>>(
m, "CNCLParallelContext")
.def(py::init<const imperative::ParallelStrategy &,
const platform::MLUPlace &>())
.def("init", [](imperative::CNCLParallelContext &self) { self.Init(); })
.def("init_with_ring_id",
&imperative::CNCLParallelContext::InitWithRingID,
py::arg("ring_id"));
#endif
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
defined(PADDLE_WITH_XPU_BKCL) defined(PADDLE_WITH_XPU_BKCL)
py::class_<imperative::HeterParallelContext, py::class_<imperative::HeterParallelContext,
......
...@@ -152,10 +152,6 @@ limitations under the License. */ ...@@ -152,10 +152,6 @@ limitations under the License. */
#include "paddle/fluid/platform/device/ipu/ipu_info.h" #include "paddle/fluid/platform/device/ipu/ipu_info.h"
#endif #endif
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/platform/device/mlu/mlu_info.h"
#endif
#ifdef PADDLE_WITH_CRYPTO #ifdef PADDLE_WITH_CRYPTO
#include "paddle/fluid/pybind/crypto.h" #include "paddle/fluid/pybind/crypto.h"
#endif #endif
......
...@@ -152,10 +152,6 @@ limitations under the License. */ ...@@ -152,10 +152,6 @@ limitations under the License. */
#include "paddle/fluid/platform/device/ipu/ipu_info.h" #include "paddle/fluid/platform/device/ipu/ipu_info.h"
#endif #endif
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/platform/device/mlu/mlu_info.h"
#endif
#ifdef PADDLE_WITH_CRYPTO #ifdef PADDLE_WITH_CRYPTO
#include "paddle/fluid/pybind/crypto.h" #include "paddle/fluid/pybind/crypto.h"
#endif #endif
...@@ -194,7 +190,6 @@ PyTypeObject *g_cpuplace_pytype = nullptr; ...@@ -194,7 +190,6 @@ PyTypeObject *g_cpuplace_pytype = nullptr;
PyTypeObject *g_xpuplace_pytype = nullptr; PyTypeObject *g_xpuplace_pytype = nullptr;
PyTypeObject *g_npuplace_pytype = nullptr; PyTypeObject *g_npuplace_pytype = nullptr;
PyTypeObject *g_cudapinnedplace_pytype = nullptr; PyTypeObject *g_cudapinnedplace_pytype = nullptr;
PyTypeObject *g_mluplace_pytype = nullptr;
PyTypeObject *g_ipuplace_pytype = nullptr; PyTypeObject *g_ipuplace_pytype = nullptr;
template <typename PlaceType> template <typename PlaceType>
...@@ -371,7 +366,6 @@ void BindPlace(pybind11::module &m) { // NOLINT ...@@ -371,7 +366,6 @@ void BindPlace(pybind11::module &m) { // NOLINT
.def("_equals", &IsSamePlace<platform::CUDAPlace, platform::CPUPlace>) .def("_equals", &IsSamePlace<platform::CUDAPlace, platform::CPUPlace>)
.def("_equals", &IsSamePlace<platform::CUDAPlace, platform::XPUPlace>) .def("_equals", &IsSamePlace<platform::CUDAPlace, platform::XPUPlace>)
.def("_equals", &IsSamePlace<platform::CUDAPlace, platform::NPUPlace>) .def("_equals", &IsSamePlace<platform::CUDAPlace, platform::NPUPlace>)
.def("_equals", &IsSamePlace<platform::CUDAPlace, platform::MLUPlace>)
.def("_equals", .def("_equals",
&IsSamePlace<platform::CUDAPlace, platform::CUDAPinnedPlace>) &IsSamePlace<platform::CUDAPlace, platform::CUDAPinnedPlace>)
.def("_get_device_id", .def("_get_device_id",
...@@ -614,82 +608,8 @@ void BindPlace(pybind11::module &m) { // NOLINT ...@@ -614,82 +608,8 @@ void BindPlace(pybind11::module &m) { // NOLINT
.def("_equals", &IsSamePlace<platform::IPUPlace, platform::IPUPlace>) .def("_equals", &IsSamePlace<platform::IPUPlace, platform::IPUPlace>)
.def("_equals", .def("_equals",
&IsSamePlace<platform::IPUPlace, platform::CUDAPinnedPlace>) &IsSamePlace<platform::IPUPlace, platform::CUDAPinnedPlace>)
#ifdef PADDLE_WITH_IPU
.def("get_device_id",
[](const platform::IPUPlace &self) { return self.GetDeviceId(); })
#endif
.def("__str__", string::to_string<const platform::IPUPlace &>); .def("__str__", string::to_string<const platform::IPUPlace &>);
// MLUPlace
py::class_<platform::MLUPlace> mluplace(m, "MLUPlace", R"DOC(
MLUPlace is a descriptor of a device.
It represents a MLU device on which a tensor will be allocated and a model will run.
Examples:
.. code-block:: python
import paddle
# required: mlu
mlu_place = paddle.MLUPlace(0)
)DOC");
g_mluplace_pytype = reinterpret_cast<PyTypeObject *>(mluplace.ptr());
mluplace
.def("__init__",
[](platform::MLUPlace &self, int dev_id) {
#ifdef PADDLE_WITH_MLU
if (UNLIKELY(dev_id < 0)) {
LOG(ERROR) << string::Sprintf(
"Invalid MLUPlace(%d), device id must be 0 or "
"positive integer",
dev_id);
std::exit(-1);
}
if (UNLIKELY(dev_id >= platform::GetMLUDeviceCount())) {
if (platform::GetMLUDeviceCount() == 0) {
LOG(ERROR) << "Cannot use MLU because there is no MLU "
"detected on your "
"machine.";
std::exit(-1);
} else {
LOG(ERROR) << string::Sprintf(
"Invalid MLUPlace(%d), must inside [0, %d), because MLU "
"number on your machine is %d",
dev_id,
platform::GetMLUDeviceCount(),
platform::GetMLUDeviceCount());
std::exit(-1);
}
}
new (&self) platform::MLUPlace(dev_id);
#else
LOG(ERROR) << string::Sprintf(
"Cannot use MLU because you have installed CPU/GPU/... "
"version "
"PaddlePaddle.\n"
"If you want to use MLU, please try to install MLU version "
"PaddlePaddle by: pip install paddlepaddle-mlu\n"
"If you only have CPU, please change MLUPlace(%d) to be "
"CPUPlace().\n",
dev_id);
std::exit(-1);
#endif
})
.def("_type", &PlaceIndex<platform::MLUPlace>)
#ifdef PADDLE_WITH_MLU
.def("_equals", &IsSamePlace<platform::MLUPlace, platform::Place>)
.def("_equals", &IsSamePlace<platform::MLUPlace, platform::CUDAPlace>)
.def("_equals", &IsSamePlace<platform::MLUPlace, platform::CPUPlace>)
.def("_equals", &IsSamePlace<platform::MLUPlace, platform::XPUPlace>)
.def("_equals", &IsSamePlace<platform::MLUPlace, platform::NPUPlace>)
.def("_equals", &IsSamePlace<platform::MLUPlace, platform::IPUPlace>)
.def("_equals", &IsSamePlace<platform::MLUPlace, platform::MLUPlace>)
.def("_equals",
&IsSamePlace<platform::MLUPlace, platform::CUDAPinnedPlace>)
.def("get_device_id",
[](const platform::MLUPlace &self) { return self.GetDeviceId(); })
#endif
.def("__str__", string::to_string<const platform::MLUPlace &>);
py::class_<platform::Place> platformplace(m, "Place"); py::class_<platform::Place> platformplace(m, "Place");
g_place_pytype = reinterpret_cast<PyTypeObject *>(platformplace.ptr()); g_place_pytype = reinterpret_cast<PyTypeObject *>(platformplace.ptr());
platformplace.def(py::init<>()) platformplace.def(py::init<>())
...@@ -701,7 +621,6 @@ void BindPlace(pybind11::module &m) { // NOLINT ...@@ -701,7 +621,6 @@ void BindPlace(pybind11::module &m) { // NOLINT
.def("_equals", &IsSamePlace<platform::Place, platform::NPUPlace>) .def("_equals", &IsSamePlace<platform::Place, platform::NPUPlace>)
.def("_equals", &IsSamePlace<platform::Place, platform::IPUPlace>) .def("_equals", &IsSamePlace<platform::Place, platform::IPUPlace>)
.def("_equals", &IsSamePlace<platform::Place, platform::CUDAPinnedPlace>) .def("_equals", &IsSamePlace<platform::Place, platform::CUDAPinnedPlace>)
.def("_equals", &IsSamePlace<platform::Place, platform::MLUPlace>)
.def("_equals", &IsSamePlace<platform::Place, platform::CustomPlace>) .def("_equals", &IsSamePlace<platform::Place, platform::CustomPlace>)
.def("is_gpu_place", .def("is_gpu_place",
[](platform::Place &self) { return platform::is_gpu_place(self); }) [](platform::Place &self) { return platform::is_gpu_place(self); })
...@@ -758,10 +677,6 @@ void BindPlace(pybind11::module &m) { // NOLINT ...@@ -758,10 +677,6 @@ void BindPlace(pybind11::module &m) { // NOLINT
[](platform::Place &self, const platform::IPUPlace &ipu_place) { [](platform::Place &self, const platform::IPUPlace &ipu_place) {
self = ipu_place; self = ipu_place;
}) })
.def("set_place",
[](platform::Place &self, const platform::MLUPlace &mlu_place) {
self = mlu_place;
})
.def("set_place", .def("set_place",
[](platform::Place &self, const platform::CustomPlace &plug_place) { [](platform::Place &self, const platform::CustomPlace &plug_place) {
self = plug_place; self = plug_place;
......
...@@ -152,10 +152,6 @@ limitations under the License. */ ...@@ -152,10 +152,6 @@ limitations under the License. */
#include "paddle/fluid/platform/device/ipu/ipu_info.h" #include "paddle/fluid/platform/device/ipu/ipu_info.h"
#endif #endif
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/platform/device/mlu/mlu_info.h"
#endif
#ifdef PADDLE_WITH_CRYPTO #ifdef PADDLE_WITH_CRYPTO
#include "paddle/fluid/pybind/crypto.h" #include "paddle/fluid/pybind/crypto.h"
#endif #endif
...@@ -252,10 +248,6 @@ void BindTensor(pybind11::module &m) { // NOLINT ...@@ -252,10 +248,6 @@ void BindTensor(pybind11::module &m) { // NOLINT
[](phi::DenseTensor &self, paddle::platform::NPUPlace &place) { [](phi::DenseTensor &self, paddle::platform::NPUPlace &place) {
self.mutable_data<float>(place); self.mutable_data<float>(place);
}) })
.def("_alloc_float",
[](phi::DenseTensor &self, paddle::platform::MLUPlace &place) {
self.mutable_data<float>(place);
})
.def("_alloc_double", .def("_alloc_double",
[](phi::DenseTensor &self, paddle::platform::CPUPlace &place) { [](phi::DenseTensor &self, paddle::platform::CPUPlace &place) {
self.mutable_data<double>(place); self.mutable_data<double>(place);
...@@ -276,10 +268,6 @@ void BindTensor(pybind11::module &m) { // NOLINT ...@@ -276,10 +268,6 @@ void BindTensor(pybind11::module &m) { // NOLINT
[](phi::DenseTensor &self, paddle::platform::CUDAPlace &place) { [](phi::DenseTensor &self, paddle::platform::CUDAPlace &place) {
self.mutable_data<int>(place); self.mutable_data<int>(place);
}) })
.def("_alloc_int",
[](phi::DenseTensor &self, paddle::platform::MLUPlace &place) {
self.mutable_data<int>(place);
})
.def( .def(
"_alloc_int", "_alloc_int",
[](phi::DenseTensor &self, paddle::platform::CUDAPinnedPlace &place) { [](phi::DenseTensor &self, paddle::platform::CUDAPinnedPlace &place) {
...@@ -325,13 +313,6 @@ void BindTensor(pybind11::module &m) { // NOLINT ...@@ -325,13 +313,6 @@ void BindTensor(pybind11::module &m) { // NOLINT
return reinterpret_cast<uintptr_t>( return reinterpret_cast<uintptr_t>(
self.mutable_data(place, framework::TransToPhiDataType(type))); self.mutable_data(place, framework::TransToPhiDataType(type)));
}) })
.def("_mutable_data",
[](phi::DenseTensor &self,
paddle::platform::MLUPlace &place,
paddle::framework::proto::VarType::Type type) {
return reinterpret_cast<uintptr_t>(
self.mutable_data(place, framework::TransToPhiDataType(type)));
})
.def("_clear", &phi::DenseTensor::clear) .def("_clear", &phi::DenseTensor::clear)
.def("_mutable_data", .def("_mutable_data",
[](phi::DenseTensor &self, [](phi::DenseTensor &self,
...@@ -370,11 +351,6 @@ void BindTensor(pybind11::module &m) { // NOLINT ...@@ -370,11 +351,6 @@ void BindTensor(pybind11::module &m) { // NOLINT
py::arg("tensor"), py::arg("tensor"),
py::arg("place"), py::arg("place"),
py::arg("batch_size") = -1) py::arg("batch_size") = -1)
.def("_copy_from",
&TensorCopyFrom<paddle::platform::MLUPlace>,
py::arg("tensor"),
py::arg("place"),
py::arg("batch_size") = -1)
.def("_copy_from", .def("_copy_from",
&TensorCopyFrom<paddle::platform::IPUPlace>, &TensorCopyFrom<paddle::platform::IPUPlace>,
py::arg("tensor"), py::arg("tensor"),
...@@ -415,11 +391,6 @@ void BindTensor(pybind11::module &m) { // NOLINT ...@@ -415,11 +391,6 @@ void BindTensor(pybind11::module &m) { // NOLINT
py::arg("array"), py::arg("array"),
py::arg("place"), py::arg("place"),
py::arg("zero_copy") = false) py::arg("zero_copy") = false)
.def("set",
SetTensorFromPyArray<paddle::platform::MLUPlace>,
py::arg("array"),
py::arg("place"),
py::arg("zero_copy") = false)
.def("set", .def("set",
SetTensorFromPyArray<paddle::platform::CUDAPinnedPlace>, SetTensorFromPyArray<paddle::platform::CUDAPinnedPlace>,
py::arg("array"), py::arg("array"),
......
...@@ -292,13 +292,6 @@ T TensorGetElement(const phi::DenseTensor &self, size_t offset) { ...@@ -292,13 +292,6 @@ T TensorGetElement(const phi::DenseTensor &self, size_t offset) {
auto p = self.place(); auto p = self.place();
paddle::memory::Copy( paddle::memory::Copy(
platform::CPUPlace(), &b, p, a + offset, sizeof(T), nullptr); platform::CPUPlace(), &b, p, a + offset, sizeof(T), nullptr);
#endif
} else if (platform::is_mlu_place(self.place())) {
#ifdef PADDLE_WITH_MLU
const T *a = self.data<T>();
auto p = self.place();
paddle::memory::Copy(
platform::CPUPlace(), &b, p, a + offset, sizeof(T), nullptr);
#endif #endif
} else if (platform::is_custom_place(self.place())) { } else if (platform::is_custom_place(self.place())) {
#if defined(PADDLE_WITH_CUSTOM_DEVICE) #if defined(PADDLE_WITH_CUSTOM_DEVICE)
...@@ -336,13 +329,6 @@ void TensorSetElement(phi::DenseTensor *self, size_t offset, T elem) { ...@@ -336,13 +329,6 @@ void TensorSetElement(phi::DenseTensor *self, size_t offset, T elem) {
T *a = self->mutable_data<T>(p); T *a = self->mutable_data<T>(p);
paddle::memory::Copy( paddle::memory::Copy(
p, a + offset, platform::CPUPlace(), &elem, sizeof(T), nullptr); p, a + offset, platform::CPUPlace(), &elem, sizeof(T), nullptr);
#endif
} else if (platform::is_mlu_place(self->place())) {
#ifdef PADDLE_WITH_MLU
auto p = self->place();
T *a = self->mutable_data<T>(p);
paddle::memory::Copy(
p, a + offset, platform::CPUPlace(), &elem, sizeof(T), nullptr);
#endif #endif
} else if (platform::is_custom_place(self->place())) { } else if (platform::is_custom_place(self->place())) {
#if defined(PADDLE_WITH_CUSTOM_DEVICE) #if defined(PADDLE_WITH_CUSTOM_DEVICE)
...@@ -413,21 +399,6 @@ void SetTensorFromPyArrayT( ...@@ -413,21 +399,6 @@ void SetTensorFromPyArrayT(
PADDLE_THROW(platform::errors::PermissionDenied( PADDLE_THROW(platform::errors::PermissionDenied(
"Cannot use IPUPlace in CPU/GPU/XPU/NPU version, " "Cannot use IPUPlace in CPU/GPU/XPU/NPU version, "
"Please recompile or reinstall Paddle with IPU support.")); "Please recompile or reinstall Paddle with IPU support."));
#endif
} else if (paddle::platform::is_mlu_place(place)) {
#ifdef PADDLE_WITH_MLU
platform::Place tmp_place = place;
platform::MLUDeviceGuard guard(tmp_place.device);
auto dst = self->mutable_data<T>(place);
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto dev_ctx = static_cast<platform::MLUDeviceContext *>(pool.Get(place));
paddle::platform::MLUMemcpyH2DAsync(
dst, array.data(), array.nbytes(), dev_ctx->stream());
dev_ctx->Wait();
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"Cannot use MLUPlace in CPU/GPU version, "
"Please recompile or reinstall Paddle with MLU support."));
#endif #endif
} else if (paddle::platform::is_custom_place(place)) { } else if (paddle::platform::is_custom_place(place)) {
#ifdef PADDLE_WITH_CUSTOM_DEVICE #ifdef PADDLE_WITH_CUSTOM_DEVICE
...@@ -779,10 +750,6 @@ inline phi::DenseTensor *_getTensor(const phi::DenseTensor &self, ...@@ -779,10 +750,6 @@ inline phi::DenseTensor *_getTensor(const phi::DenseTensor &self,
} else if (platform::is_xpu_place(place)) { } else if (platform::is_xpu_place(place)) {
#ifdef PADDLE_WITH_XPU #ifdef PADDLE_WITH_XPU
output->mutable_data(place, self.dtype()); output->mutable_data(place, self.dtype());
#endif
} else if (platform::is_mlu_place(place)) {
#ifdef PADDLE_WITH_MLU
output->mutable_data(place, self.dtype());
#endif #endif
} else { } else {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
...@@ -1064,39 +1031,6 @@ inline py::array TensorToPyArray(const phi::DenseTensor &tensor, ...@@ -1064,39 +1031,6 @@ inline py::array TensorToPyArray(const phi::DenseTensor &tensor,
PADDLE_THROW(platform::errors::PermissionDenied( PADDLE_THROW(platform::errors::PermissionDenied(
"Cannot use CUDAPlace in CPU only version, " "Cannot use CUDAPlace in CPU only version, "
"Please recompile or reinstall Paddle with CUDA support.")); "Please recompile or reinstall Paddle with CUDA support."));
#endif
} else if (is_mlu_tensor) {
#ifdef PADDLE_WITH_MLU
py::array py_arr(py::dtype(py_dtype_str.c_str()), py_dims, py_strides);
PADDLE_ENFORCE_EQ(py_arr.writeable(),
true,
platform::errors::InvalidArgument(
"PyArray is not writable, in which case memory leak "
"or double free would occur"));
PADDLE_ENFORCE_EQ(
py_arr.owndata(),
true,
platform::errors::InvalidArgument(
"PyArray does not own data, in which case memory leak "
"or double free would occur"));
size_t copy_bytes = sizeof_dtype * numel;
auto p = tensor.place();
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &ctx = *pool.Get(tensor.place());
paddle::memory::Copy(
platform::CPUPlace(),
py_arr.mutable_data(),
p,
tensor_buf_ptr,
copy_bytes,
reinterpret_cast<const platform::MLUDeviceContext &>(ctx).stream());
ctx.Wait();
return py_arr;
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"Cannot use MLUPlace in CPU/GPU/XPU/NPU version, "
"Please recompile or reinstall Paddle with MLU support."));
#endif #endif
} else if (is_custom_device_tensor) { } else if (is_custom_device_tensor) {
#ifdef PADDLE_WITH_CUSTOM_DEVICE #ifdef PADDLE_WITH_CUSTOM_DEVICE
......
...@@ -21,9 +21,6 @@ limitations under the License. */ ...@@ -21,9 +21,6 @@ limitations under the License. */
#include "paddle/phi/core/errors.h" #include "paddle/phi/core/errors.h"
#include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/backends/gpu/gpu_info.h"
#ifdef PADDLE_WITH_MLU
#include "paddle/phi/backends/mlu/mlu_info.h"
#endif
namespace phi { namespace phi {
...@@ -42,11 +39,9 @@ inline size_t Alignment(size_t size, ...@@ -42,11 +39,9 @@ inline size_t Alignment(size_t size,
alignment = phi::backends::gpu::GpuMinChunkSize(); alignment = phi::backends::gpu::GpuMinChunkSize();
#elif defined(PADDLE_WITH_XPU) #elif defined(PADDLE_WITH_XPU)
alignment = alignment; alignment = alignment;
#elif defined(PADDLE_WITH_MLU)
alignment = phi::backends::mlu::MLUMinChunkSize();
#else #else
PADDLE_THROW(phi::errors::PreconditionNotMet( PADDLE_THROW(phi::errors::PreconditionNotMet(
"Fluid is not compiled with CUDA/XPU/NPU/MLU.")); "Fluid is not compiled with CUDA/XPU/NPU."));
#endif #endif
} }
} }
......
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef PADDLE_WITH_MLU
namespace phi {
namespace backends {
namespace mlu {
//! Get the minimum chunk size for MLU buddy allocator.
inline size_t MLUMinChunkSize() {
// Allow to allocate the minimum chunk size is 256 bytes.
return 1 << 8;
}
} // namespace mlu
} // namespace backends
} // namespace phi
#endif
...@@ -193,16 +193,6 @@ class IPUPlace : public Place { ...@@ -193,16 +193,6 @@ class IPUPlace : public Place {
: Place(AllocationType::IPU, place.GetDeviceId()) {} : Place(AllocationType::IPU, place.GetDeviceId()) {}
}; };
class MLUPlace : public Place {
public:
MLUPlace() : Place(AllocationType::MLU, 0) {}
explicit MLUPlace(int device_id) : Place(AllocationType::MLU, device_id) {}
MLUPlace(const MLUPlace&) = default;
MLUPlace(const Place& place) // NOLINT
: Place(AllocationType::MLU, place.GetDeviceId()) {}
};
class CustomPlace : public Place { class CustomPlace : public Place {
public: public:
CustomPlace() : Place(AllocationType::CUSTOM, 0, "") {} CustomPlace() : Place(AllocationType::CUSTOM, 0, "") {}
......
...@@ -62,15 +62,6 @@ typename Visitor::result_type VisitPlace(const phi::Place& place, ...@@ -62,15 +62,6 @@ typename Visitor::result_type VisitPlace(const phi::Place& place,
PADDLE_THROW(phi::errors::Unavailable( PADDLE_THROW(phi::errors::Unavailable(
("Paddle is not compiled with IPU. Cannot visit ipu device"))); ("Paddle is not compiled with IPU. Cannot visit ipu device")));
return typename Visitor::result_type(); return typename Visitor::result_type();
#endif
}
case phi::AllocationType::MLU: {
#ifdef PADDLE_WITH_MLU
phi::MLUPlace p(place.GetDeviceId());
return visitor(p);
#else
PADDLE_THROW(phi::errors::Unavailable(
("Paddle is not compiled with MLU. Cannot visit mlu device")));
#endif #endif
} }
case phi::AllocationType::CUSTOM: { case phi::AllocationType::CUSTOM: {
......
...@@ -1980,11 +1980,7 @@ struct HardSigmoidGradFunctor : public BaseActivationFunctor<T> { ...@@ -1980,11 +1980,7 @@ struct HardSigmoidGradFunctor : public BaseActivationFunctor<T> {
} }
static constexpr ActBwdOpFwdDeps FwdDeps() { static constexpr ActBwdOpFwdDeps FwdDeps() {
#ifdef PADDLE_WITH_MLU
return ActBwdOpFwdDeps::kDepX;
#else
return ActBwdOpFwdDeps::kDepOut; return ActBwdOpFwdDeps::kDepOut;
#endif
} }
}; };
......
...@@ -203,13 +203,6 @@ void set_constant_with_place<phi::CPUPlace>(const phi::DeviceContext& context, ...@@ -203,13 +203,6 @@ void set_constant_with_place<phi::CPUPlace>(const phi::DeviceContext& context,
phi::VisitDataType(tensor->dtype(), TensorSetConstantCPU(tensor, value)); phi::VisitDataType(tensor->dtype(), TensorSetConstantCPU(tensor, value));
} }
template <>
void set_constant_with_place<phi::MLUPlace>(const phi::DeviceContext& context,
phi::DenseTensor* tensor,
float value) {
PADDLE_THROW(phi::errors::Unimplemented("MLUPlace is not supported"));
}
template <> template <>
void set_constant_with_place<phi::GPUPinnedPlace>( void set_constant_with_place<phi::GPUPinnedPlace>(
const phi::DeviceContext& context, phi::DenseTensor* tensor, float value) { const phi::DeviceContext& context, phi::DenseTensor* tensor, float value) {
......
...@@ -56,8 +56,7 @@ inline void CopyWithContext(const Context& ctx, ...@@ -56,8 +56,7 @@ inline void CopyWithContext(const Context& ctx,
const Place& src_place, const Place& src_place,
const void* src, const void* src,
size_t num) { size_t num) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
defined(PADDLE_WITH_MLU)
memory_utils::Copy(dst_place, dst, src_place, src, num, ctx.stream()); memory_utils::Copy(dst_place, dst, src_place, src, num, ctx.stream());
#else #else
PADDLE_THROW( PADDLE_THROW(
......
...@@ -72,7 +72,6 @@ from .core import ( ...@@ -72,7 +72,6 @@ from .core import (
CUDAPlace, CUDAPlace,
CUDAPinnedPlace, CUDAPinnedPlace,
IPUPlace, IPUPlace,
MLUPlace,
CustomPlace, CustomPlace,
) )
from .lod_tensor import create_lod_tensor, create_random_int_lodtensor from .lod_tensor import create_lod_tensor, create_random_int_lodtensor
...@@ -127,7 +126,6 @@ __all__ = ( ...@@ -127,7 +126,6 @@ __all__ = (
'CUDAPlace', 'CUDAPlace',
'CUDAPinnedPlace', 'CUDAPinnedPlace',
'IPUPlace', 'IPUPlace',
'MLUPlace',
'Tensor', 'Tensor',
'ParamAttr', 'ParamAttr',
'WeightNormParamAttr', 'WeightNormParamAttr',
......
...@@ -25,7 +25,6 @@ from ..fluid.core import IPUPlace # noqa: F401 ...@@ -25,7 +25,6 @@ from ..fluid.core import IPUPlace # noqa: F401
from ..fluid.core import CUDAPlace # noqa: F401 from ..fluid.core import CUDAPlace # noqa: F401
from ..fluid.core import CUDAPinnedPlace # noqa: F401 from ..fluid.core import CUDAPinnedPlace # noqa: F401
from ..fluid.core import NPUPlace # noqa: F401 from ..fluid.core import NPUPlace # noqa: F401
from ..fluid.core import MLUPlace # noqa: F401
from ..fluid.core import CustomPlace # noqa: F401 from ..fluid.core import CustomPlace # noqa: F401
from ..fluid import core # noqa: F401 from ..fluid import core # noqa: F401
......
...@@ -170,9 +170,6 @@ if(${len} GREATER_EQUAL 1) ...@@ -170,9 +170,6 @@ if(${len} GREATER_EQUAL 1)
if(WITH_XPU) if(WITH_XPU)
target_link_libraries(${test_name} xpulib) target_link_libraries(${test_name} xpulib)
endif() endif()
if(WITH_MLU)
target_link_libraries(${test_name} neuware_lib)
endif()
if(NOT if(NOT
("${test_name}" STREQUAL "c_broadcast_op_npu_test" ("${test_name}" STREQUAL "c_broadcast_op_npu_test"
OR "${test_name}" STREQUAL "c_allreduce_sum_op_npu_test" OR "${test_name}" STREQUAL "c_allreduce_sum_op_npu_test"
......
...@@ -28,12 +28,6 @@ else() ...@@ -28,12 +28,6 @@ else()
SRCS bkcl_context_test.cc SRCS bkcl_context_test.cc
DEPS bkcl_context) DEPS bkcl_context)
endif() endif()
if(WITH_CNCL)
cc_test(
cncl_context_test
SRCS cncl_context_test.cc
DEPS cncl_context)
endif()
endif() endif()
cc_test( cc_test(
......
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/imperative/cncl_context.h"
#include <thread> // NOLINT
#include "gtest/gtest.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/platform/gen_comm_id_helper.h"
namespace imperative = paddle::imperative;
namespace platform = paddle::platform;
namespace framework = paddle::framework;
// Node1: FLAGS_selected_mlus=0 PADDLE_TRAINER_ID=0 ./cncl_context_test
// Node2: FLAGS_selected_mlus=1 PADDLE_TRAINER_ID=1 ./cncl_context_test
int nrings = 1;
imperative::ParallelStrategy GetStrategy(int local_rank) {
std::vector<std::string> eps = {"127.0.0.1:9866", "localhost:9867"};
imperative::ParallelStrategy strategy;
strategy.trainer_endpoints_ = eps;
strategy.current_endpoint_ = eps[local_rank];
strategy.nranks_ = 2;
strategy.local_rank_ = local_rank;
strategy.nrings_ = nrings;
return strategy;
}
#if defined(PADDLE_WITH_CNCL)
void Broadcast(int local_rank, int device_id) {
int data_size = 4;
float test_data = 7;
const auto& place = platform::MLUPlace(device_id);
platform::MLUDeviceContext ctx(place);
imperative::CNCLParallelContext cpc(GetStrategy(local_rank), place);
// init
cpc.Init();
framework::Variable* src_dev_var(new framework::Variable());
auto* src_dev_tensor = src_dev_var->GetMutable<phi::DenseTensor>();
src_dev_tensor->mutable_data<float>(phi::make_ddim({data_size}), place);
// fill data for rank 0 only
std::vector<float> src_vec;
if (local_rank == 0) {
for (int i = 0; i < data_size; ++i) {
src_vec.push_back(test_data);
}
framework::TensorFromVector(src_vec, ctx, src_dev_tensor);
}
ctx.Wait();
// call broadcast
cpc.Broadcast(src_dev_var, 0);
std::this_thread::sleep_for(std::chrono::milliseconds(1000));
// check result
std::vector<float> dst_vec;
framework::TensorToVector(*src_dev_tensor, ctx, &dst_vec);
ctx.Wait();
for (int i = 0; i < data_size; ++i) {
EXPECT_EQ(dst_vec[i], test_data);
}
}
TEST(Broadcast, Run) {
if (platform::GetMLUDeviceCount() >= 2) {
int local_rank = atoi(getenv("PADDLE_TRAINER_ID"));
int device_id = atoi(getenv("FLAGS_selected_mlus"));
Broadcast(local_rank, device_id);
}
}
void AllReduceByStream(int local_rank, int device_id) {
int data_size = 32;
const auto& place = platform::MLUPlace(device_id);
platform::MLUDeviceContext ctx(place);
imperative::CNCLParallelContext cpc(GetStrategy(local_rank), place);
// init
cpc.Init();
// input data
framework::Variable* src_dev_var(new framework::Variable());
auto* src_dev_tensor = src_dev_var->GetMutable<phi::DenseTensor>();
src_dev_tensor->mutable_data<float>(phi::make_ddim({data_size}), place);
// fill input data
std::vector<float> src_vec;
for (int i = 0; i < data_size; ++i) {
src_vec.push_back(1.0 + local_rank);
}
framework::TensorFromVector(src_vec, ctx, src_dev_tensor);
ctx.Wait();
// output data
framework::Variable* dst_dev_var(new framework::Variable());
auto* dst_dev_tensor = dst_dev_var->GetMutable<phi::DenseTensor>();
dst_dev_tensor->mutable_data<float>(phi::make_ddim({data_size}), place);
// call allreduce
cpc.AllReduceByStream(*src_dev_var, dst_dev_var, 0, false);
std::this_thread::sleep_for(std::chrono::milliseconds(1000));
// check result
std::vector<float> dst_vec;
framework::TensorToVector(*dst_dev_tensor, ctx, &dst_vec);
ctx.Wait();
EXPECT_EQ(dst_vec.size(), src_vec.size());
for (int i = 0; i < data_size; ++i) {
EXPECT_EQ(dst_vec[i], 3.0);
}
}
TEST(AllReduceByStream, Run) {
if (platform::GetMLUDeviceCount() >= 2) {
int local_rank = atoi(getenv("PADDLE_TRAINER_ID"));
int device_id = atoi(getenv("FLAGS_selected_mlus"));
AllReduceByStream(local_rank, device_id);
}
}
#endif
...@@ -76,8 +76,7 @@ void GroupConcatSplit(Place place, size_t size) { ...@@ -76,8 +76,7 @@ void GroupConcatSplit(Place place, size_t size) {
value.push_back(static_cast<T>(1.0 * j)); value.push_back(static_cast<T>(1.0 * j));
} }
if (std::is_same<Place, platform::CUDAPlace>::value || if (std::is_same<Place, platform::CUDAPlace>::value) {
std::is_same<Place, platform::MLUPlace>::value) {
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
defined(PADDLE_WITH_CNCL) defined(PADDLE_WITH_CNCL)
paddle::memory::Copy( paddle::memory::Copy(
...@@ -185,20 +184,5 @@ TEST(TestGroup, TestXPUConcatSplit) { ...@@ -185,20 +184,5 @@ TEST(TestGroup, TestXPUConcatSplit) {
GroupConcatSplit<float>(xpu_place, size); GroupConcatSplit<float>(xpu_place, size);
} }
#endif #endif
#if defined(PADDLE_WITH_CNCL)
TEST(TestGroup, TestMLUConcatSplit) {
platform::MLUPlace mlu_place(0);
platform::CPUPlace cpu_place;
int size = 3;
GroupConcatSplit<float>(cpu_place, size);
GroupConcatSplit<float>(mlu_place, size);
size = 15;
GroupConcatSplit<float>(cpu_place, size);
GroupConcatSplit<float>(mlu_place, size);
}
#endif
} // namespace imperative } // namespace imperative
} // namespace paddle } // namespace paddle