Unverified commit e75c01f9, authored by Wang Xin, committed by GitHub

clean up WITH_MLU (#52546)

Parent 075d6b14
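For context: WITH_MLU followed PaddlePaddle's usual per-device pattern of a CMake option, a guard against conflicting backends, and a PADDLE_WITH_MLU compile definition that gates the C++ #ifdef blocks seen in the hunks below. A condensed sketch of that CMake layer, assembled from the lines this commit deletes (nothing new is introduced here, the snippets are only collected in one place):

# Build flag for the CAMBRICON MLU backend (removed by this commit).
option(WITH_MLU "Compile PaddlePaddle with CAMBRICON MLU" OFF)

# MLU and GPU builds were mutually exclusive.
if(WITH_GPU AND WITH_MLU)
  message(FATAL_ERROR "Error when compile GPU and MLU at the same time")
endif()

# Enabling the flag defined PADDLE_WITH_MLU, which gated all C++ MLU code paths.
if(WITH_MLU)
  message(STATUS "Compile with MLU!")
  add_definitions(-DPADDLE_WITH_MLU)
endif()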
......@@ -53,7 +53,6 @@ option(WITH_TENSORRT "Compile PaddlePaddle with NVIDIA TensorRT" OFF)
option(WITH_XPU "Compile PaddlePaddle with BAIDU KUNLUN XPU" OFF)
option(WITH_XPU_KP "Compile PaddlePaddle with BAIDU XPU compiler " OFF)
option(WITH_XPU_XFT "Compile PaddlePaddle with BAIDU XPU-XFT" OFF)
option(WITH_MLU "Compile PaddlePaddle with CAMBRICON MLU" OFF)
option(WITH_WIN_DUMP_DBG "Compile with windows core dump debug mode" OFF)
option(WITH_ASCEND "Compile PaddlePaddle with ASCEND" OFF)
option(WITH_ROCM "Compile PaddlePaddle with ROCM platform" OFF)
......@@ -81,9 +80,6 @@ endif()
if(WITH_GPU AND WITH_ROCM)
message(FATAL_ERROR "Error when compile CUDA and ROCM at the same time")
endif()
if(WITH_GPU AND WITH_MLU)
message(FATAL_ERROR "Error when compile GPU and MLU at the same time")
endif()
if(WITH_GPU AND NOT APPLE)
enable_language(CUDA)
......@@ -430,14 +426,6 @@ if(NOT WITH_XPU AND WITH_XPU_BKCL)
CACHE STRING "Disable BKCL when compiling without XPU" FORCE)
endif()
if(NOT WITH_MLU AND WITH_CNCL)
message(
WARNING "Disable CNCL when compiling without MLU. Force WITH_MLU=OFF.")
set(WITH_MLU
OFF
CACHE STRING "Disable CNCL when compiling without MLU" FORCE)
endif()
if(WITH_NCCL)
add_definitions("-DPADDLE_WITH_NCCL")
include(nccl)
......@@ -469,10 +457,6 @@ if(WITH_GPU)
endif()
endif()
if(WITH_MLU)
include(neuware)
endif()
if(WITH_ROCM)
include(hip)
include(miopen) # set miopen libraries, must before configure
......
......@@ -116,11 +116,6 @@ if(WITH_IPU)
add_definitions(-DPADDLE_WITH_IPU)
endif()
if(WITH_MLU)
message(STATUS "Compile with MLU!")
add_definitions(-DPADDLE_WITH_MLU)
endif()
if(WITH_GPU)
add_definitions(-DPADDLE_WITH_CUDA)
add_definitions(-DEIGEN_USE_GPU)
......
if(NOT WITH_MLU)
return()
endif()
if(NOT ENV{NEUWARE_HOME})
set(NEUWARE_HOME "/usr/local/neuware")
else()
set(NEUWARE_HOME $ENV{NEUWARE_HOME})
endif()
message(STATUS "NEUWARE_HOME: " ${NEUWARE_HOME})
set(NEUWARE_INCLUDE_DIR ${NEUWARE_HOME}/include)
set(NEUWARE_LIB_DIR ${NEUWARE_HOME}/lib64)
include_directories(${NEUWARE_INCLUDE_DIR})
set(CNNL_LIB ${NEUWARE_LIB_DIR}/libcnnl.so)
set(MLUOP_LIB ${NEUWARE_LIB_DIR}/libmluops.so)
set(CNRT_LIB ${NEUWARE_LIB_DIR}/libcnrt.so)
set(CNDRV_LIB ${NEUWARE_LIB_DIR}/libcndrv.so)
set(CNPAPI_LIB ${NEUWARE_LIB_DIR}/libcnpapi.so)
generate_dummy_static_lib(LIB_NAME "neuware_lib" GENERATOR "neuware.cmake")
set(NEUWARE_LIB_DEPS ${CNNL_LIB} ${MLUOP_LIB} ${CNRT_LIB} ${CNDRV_LIB}
${CNPAPI_LIB})
if(WITH_CNCL)
message(STATUS "Compile with CNCL!")
add_definitions(-DPADDLE_WITH_CNCL)
set(CNCL_LIB ${NEUWARE_LIB_DIR}/libcncl.so)
list(APPEND NEUWARE_LIB_DEPS ${CNCL_LIB})
endif()
target_link_libraries(neuware_lib ${NEUWARE_LIB_DEPS})
......@@ -74,9 +74,6 @@ function(op_library TARGET)
set(MKLDNN_FILE)
set(op_common_deps operator op_registry math_function layer
common_infer_shape_functions)
if(WITH_MLU)
set(op_common_deps ${op_common_deps} mlu_baseop)
endif()
# Option `UNITY` is used to specify that operator `TARGET` will compiles with Unity Build.
set(options UNITY)
......@@ -169,12 +166,6 @@ function(op_library TARGET)
list(APPEND xpu_kp_cc_srcs ${TARGET}.kps)
endif()
endif()
if(WITH_MLU)
string(REPLACE "_op" "_op_mlu" MLU_FILE "${TARGET}")
if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${MLU_FILE}.cc)
list(APPEND mlu_cc_srcs ${MLU_FILE}.cc)
endif()
endif()
else()
foreach(src ${op_library_SRCS})
if(WITH_ROCM AND ${src} MATCHES ".*_cudnn_op.cu$")
......@@ -201,8 +192,6 @@ function(op_library TARGET)
list(APPEND xpu_kp_cc_srcs ${src})
elseif(WITH_XPU_KP AND ${src} MATCHES ".*\\.kps$")
list(APPEND xpu_kp_cc_srcs ${src})
elseif(WITH_MLU AND ${src} MATCHES ".*_op_mlu.cc$")
list(APPEND mlu_cc_srcs ${src})
elseif(${src} MATCHES ".*\\.cc$")
list(APPEND cc_srcs ${src})
elseif((WITH_ROCM OR WITH_GPU) AND ${src} MATCHES ".*\\.kps$")
......@@ -519,18 +508,6 @@ function(op_library TARGET)
endforeach()
endif()
# pybind USE_OP_DEVICE_KERNEL for MLU
if(WITH_MLU AND ${mlu_cc_srcs_len} GREATER 0)
foreach(mlu_src ${mlu_cc_srcs})
set(op_name "")
find_register(${mlu_src} "REGISTER_OP_MLU_KERNEL" op_name)
if(NOT ${op_name} EQUAL "")
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${op_name}, MLU);\n")
set(pybind_flag 1)
endif()
endforeach()
endif()
# pybind USE_OP_DEVICE_KERNEL for MKLDNN
if(WITH_MKLDNN AND ${mkldnn_cc_srcs_len} GREATER 0)
# Append first implemented MKLDNN activation operator
......
......@@ -356,11 +356,6 @@ if(WITH_XPU)
list(APPEND third_party_deps extern_xpu)
endif()
if(WITH_MLU)
include(external/concurrentqueue) # download, build, install concurrentqueue
list(APPEND third_party_deps extern_concurrentqueue)
endif()
if(WITH_PSLIB)
include(external/pslib) # download, build, install pslib
list(APPEND third_party_deps extern_pslib)
......
......@@ -99,11 +99,6 @@ struct DLDeviceVisitor
"platform::NPUPinnedPlace is not supported"));
}
inline ::DLDevice operator()(const platform::MLUPlace &place) const {
PADDLE_THROW(
platform::errors::Unimplemented("platform::MLUPlace is not supported"));
}
inline ::DLDevice operator()(const platform::CustomPlace &place) const {
PADDLE_THROW(platform::errors::Unimplemented(
"platform::CustomPlace is not supported"));
......
......@@ -516,17 +516,6 @@ void Executor::RunPartialPreparedContext(ExecutorPrepareContext* ctx,
#else
PADDLE_THROW(
platform::errors::Unimplemented("No IPU gc found in CPU/IPU paddle"));
#endif
} else if (platform::is_mlu_place(place_)) {
#ifdef PADDLE_WITH_MLU
if (IsFastEagerDeletionModeEnabled()) {
gc.reset(new MLUUnsafeFastGarbageCollector(place_, max_memory_size));
} else {
gc.reset(new MLUDefaultStreamGarbageCollector(place_, max_memory_size));
}
#else
PADDLE_THROW(
platform::errors::Unimplemented("No MLU gc found in CPU/MLU paddle"));
#endif
} else if (platform::is_custom_place(place_)) {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
......
......@@ -125,56 +125,6 @@ void CUDAPinnedGarbageCollector::ClearCallback(
}
#endif
#ifdef PADDLE_WITH_MLU
MLUDefaultStreamGarbageCollector::MLUDefaultStreamGarbageCollector(
const platform::MLUPlace &place, size_t max_memory_size)
: GarbageCollector(place, max_memory_size) {}
void MLUDefaultStreamGarbageCollector::Wait() const {
static_cast<platform::MLUDeviceContext *>(this->dev_ctx_)
->WaitStreamCallback();
}
void MLUDefaultStreamGarbageCollector::ClearCallback(
const std::function<void()> &callback) {
static_cast<platform::MLUDeviceContext *>(this->dev_ctx_)
->AddStreamCallback(callback);
}
MLUUnsafeFastGarbageCollector::MLUUnsafeFastGarbageCollector(
const platform::MLUPlace &place, size_t max_memory_size)
: GarbageCollector(place, max_memory_size) {}
void MLUUnsafeFastGarbageCollector::ClearCallback(
const std::function<void()> &callback) {
callback();
}
MLUStreamGarbageCollector::MLUStreamGarbageCollector(
const platform::MLUPlace &place, size_t max_memory_size)
: GarbageCollector(place, max_memory_size) {
platform::MLUDeviceGuard guard(place.device);
PADDLE_ENFORCE_MLU_SUCCESS(cnrtQueueCreate(&stream_));
callback_manager_.reset(
new platform::StreamCallbackManager<mluStream>(stream_));
}
MLUStreamGarbageCollector::~MLUStreamGarbageCollector() {
auto place = this->dev_ctx_->GetPlace();
platform::MLUDeviceGuard guard(place.device);
PADDLE_ENFORCE_MLU_SUCCESS(cnrtQueueSync(stream_));
PADDLE_ENFORCE_MLU_SUCCESS(cnrtQueueDestroy(stream_));
}
mluStream MLUStreamGarbageCollector::stream() const { return stream_; }
void MLUStreamGarbageCollector::Wait() const { callback_manager_->Wait(); }
void MLUStreamGarbageCollector::ClearCallback(
const std::function<void()> &callback) {
callback_manager_->AddCallback(callback);
}
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
CustomDefaultStreamGarbageCollector::CustomDefaultStreamGarbageCollector(
const platform::CustomPlace &place, size_t max_memory_size)
......
......@@ -22,9 +22,6 @@
#include "gflags/gflags.h"
#include "paddle/fluid/platform/device_context.h"
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/platform/device/mlu/device_context.h"
#endif
#include "paddle/fluid/platform/stream_callback_manager.h"
namespace paddle {
......@@ -139,46 +136,6 @@ class CUDAPinnedGarbageCollector : public GarbageCollector {
};
#endif
#ifdef PADDLE_WITH_MLU
class MLUDefaultStreamGarbageCollector : public GarbageCollector {
public:
MLUDefaultStreamGarbageCollector(const platform::MLUPlace &place,
size_t max_memory_size);
void Wait() const override;
protected:
void ClearCallback(const std::function<void()> &callback) override;
};
class MLUUnsafeFastGarbageCollector : public GarbageCollector {
public:
MLUUnsafeFastGarbageCollector(const platform::MLUPlace &place,
size_t max_memory_size);
protected:
void ClearCallback(const std::function<void()> &callback) override;
};
class MLUStreamGarbageCollector : public GarbageCollector {
public:
MLUStreamGarbageCollector(const platform::MLUPlace &place,
size_t max_memory_size);
~MLUStreamGarbageCollector();
void Wait() const override;
mluStream stream() const;
protected:
void ClearCallback(const std::function<void()> &callback) override;
private:
mluStream stream_;
std::unique_ptr<platform::StreamCallbackManager<mluStream>> callback_manager_;
};
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
class CustomDefaultStreamGarbageCollector : public GarbageCollector {
public:
......
......@@ -376,9 +376,6 @@ struct OpKernelRegistrarFunctorEx<PlaceType,
#define REGISTER_OP_NPU_KERNEL(op_type, ...) \
REGISTER_OP_KERNEL(op_type, NPU, ::paddle::platform::NPUPlace, __VA_ARGS__)
#define REGISTER_OP_MLU_KERNEL(op_type, ...) \
REGISTER_OP_KERNEL(op_type, MLU, ::paddle::platform::MLUPlace, __VA_ARGS__)
#define REGISTER_OP_KERNEL_EX(op_type, library_type, place_class, \
customized_name, \
customized_type_value, \
......@@ -421,12 +418,6 @@ struct OpKernelRegistrarFunctorEx<PlaceType,
::paddle::framework::OpKernelType::kDefaultCustomizedTypeValue, \
__VA_ARGS__)
#define REGISTER_OP_MLU_KERNEL_FUNCTOR(op_type, ...) \
REGISTER_OP_KERNEL_EX( \
op_type, MLU, ::paddle::platform::MLUPlace, DEFAULT_TYPE, \
::paddle::framework::OpKernelType::kDefaultCustomizedTypeValue, \
__VA_ARGS__)
#define REGISTER_OP_IPU_KERNEL_FUNCTOR(op_type, ...) \
REGISTER_OP_KERNEL_EX( \
op_type, IPU, ::paddle::platform::IPUPlace, DEFAULT_TYPE, \
......
......@@ -57,10 +57,6 @@ class DenseTensor;
#include "paddle/fluid/platform/mkldnn_op_list.h"
#endif
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/platform/device/mlu/mlu_info.h"
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
#endif
......@@ -770,16 +766,6 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
#else
auto dev_id = place.device;
platform::SetXPUDeviceId(dev_id);
#endif
} else if (platform::is_mlu_place(place)) {
#ifndef PADDLE_WITH_MLU
PADDLE_THROW(platform::errors::Unavailable(
"Cannot run operator on place %s, please recompile paddle or "
"reinstall Paddle with MLU support.",
place));
#else
auto dev_id = place.device;
platform::SetMLUDeviceId(dev_id);
#endif
} else if (platform::is_custom_place(place)) {
#ifndef PADDLE_WITH_CUSTOM_DEVICE
......@@ -2301,16 +2287,6 @@ void OperatorWithKernel::ChooseKernel(const ExecutionContext& ctx) const {
}
#endif
#ifdef PADDLE_WITH_MLU
if (kernel_iter == kernels.end() &&
platform::is_mlu_place(expected_kernel_key.place_)) {
VLOG(3) << "missing MLU kernel: " << type_
<< ", expected_kernel_key:" << expected_kernel_key
<< ", fallbacking to CPU one!";
expected_kernel_key.place_ = platform::CPUPlace();
kernel_iter = kernels.find(expected_kernel_key);
}
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
if (kernel_iter == kernels.end() &&
platform::is_custom_place(expected_kernel_key.place_)) {
......
......@@ -522,19 +522,6 @@ ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) {
PADDLE_THROW(platform::errors::PermissionDenied(
"Paddle can't use CUDA device since it's not compiled with CUDA,"
"Please recompile or reinstall Paddle with GPU support."));
#endif
} else if (platform::is_mlu_place(place)) {
#ifdef PADDLE_WITH_MLU
if (IsFastEagerDeletionModeEnabled()) {
gc.reset(new MLUUnsafeFastGarbageCollector(place, max_memory_size));
} else {
gc.reset(new MLUStreamGarbageCollector(place, max_memory_size));
}
VLOG(10) << "Created " << i << "-th GarbageCollector at " << place;
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"Paddle can't use MLU device since it's not compiled with MLU,"
"Please recompile or reinstall Paddle with MLU support."));
#endif
} else if (platform::is_xpu_place(place)) {
#if defined(PADDLE_WITH_XPU)
......
......@@ -112,15 +112,6 @@ phi::KernelKey FallBackToCpu(const phi::KernelKey& kernel_key,
phi::Backend::CPU, kernel_key.layout(), kernel_key.dtype());
}
#endif
#ifdef PADDLE_WITH_MLU
if (kernel_key.backend() == phi::Backend::MLU) {
VLOG(3) << "phi missing MLU kernel: " << op.Type()
<< ", expected_kernel_key:" << kernel_key
<< ", fallback to CPU one!";
return phi::KernelKey(
phi::Backend::CPU, kernel_key.layout(), kernel_key.dtype());
}
#endif
#ifdef PADDLE_WITH_IPU
if (kernel_key.backend() == phi::Backend::IPU) {
VLOG(3) << "phi missing IPU kernel: " << op.Type()
......
......@@ -267,59 +267,6 @@ void TensorCopyImpl(const TENSOR& src,
"Copying from %s to %s is not supported.", src_place, dst_place));
}
#endif
#ifdef PADDLE_WITH_MLU
else if (platform::is_mlu_place(src_place) && // NOLINT
platform::is_cpu_place(dst_place)) {
auto src_mlu_place = src_place;
auto dst_cpu_place = dst_place;
auto stream =
reinterpret_cast<const platform::MLUDeviceContext&>(ctx).stream();
memory::Copy(dst_cpu_place, dst_ptr, src_mlu_place, src_ptr, size, stream);
}
else if (platform::is_cpu_place(src_place) && // NOLINT
platform::is_mlu_place(dst_place)) {
auto src_cpu_place = src_place;
auto dst_mlu_place = dst_place;
auto stream =
reinterpret_cast<const platform::MLUDeviceContext&>(ctx).stream();
memory::Copy(dst_mlu_place, dst_ptr, src_cpu_place, src_ptr, size, stream);
}
else if (platform::is_mlu_place(src_place) && // NOLINT
platform::is_mlu_place(dst_place)) {
auto src_mlu_place = src_place;
auto dst_mlu_place = dst_place;
auto stream =
reinterpret_cast<const platform::MLUDeviceContext&>(ctx).stream();
memory::Copy(dst_mlu_place, dst_ptr, src_mlu_place, src_ptr, size, stream);
}
else { // NOLINT
PADDLE_THROW(platform::errors::Unimplemented(
"Copying from %s to %s is not supported.", src_place, dst_place));
}
#endif
#ifdef PADDLE_WITH_IPU
else if (platform::is_ipu_place(src_place) && // NOLINT
platform::is_cpu_place(dst_place)) {
memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
}
else if (platform::is_cpu_place(src_place) && // NOLINT
platform::is_ipu_place(dst_place)) {
memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
}
else if (platform::is_ipu_place(src_place) && // NOLINT
platform::is_ipu_place(dst_place)) {
if (src_ptr == dst_ptr) {
VLOG(3) << "Skip copy the same data sync from " << src_place << " to "
<< dst_place;
return;
}
memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
}
else { // NOLINT
PADDLE_THROW(platform::errors::Unimplemented(
"Copying from %s to %s is not supported.", src_place, dst_place));
}
#endif
}
template <typename TENSOR>
......@@ -480,29 +427,6 @@ void TensorCopySync(const phi::DenseTensor& src,
"Copy from %s to %s is not supported.", src_place, dst_place));
}
#endif
#ifdef PADDLE_WITH_MLU
else if (platform::is_mlu_place(src_place) && // NOLINT
platform::is_cpu_place(dst_place)) {
memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr);
}
else if (platform::is_cpu_place(src_place) && // NOLINT
platform::is_mlu_place(dst_place)) {
memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr);
}
else if (platform::is_mlu_place(src_place) && // NOLINT
platform::is_mlu_place(dst_place)) {
if (src_ptr == dst_ptr) {
VLOG(3) << "Skip copy the same data async from " << src_place << " to "
<< dst_place;
return;
}
memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr);
}
else { // NOLINT
PADDLE_THROW(platform::errors::Unimplemented(
"Copy from %s to %s is not supported.", src_place, dst_place));
}
#endif
#ifdef PADDLE_WITH_IPU
else if (platform::is_ipu_place(src_place) && // NOLINT
platform::is_cpu_place(dst_place)) {
......@@ -604,31 +528,6 @@ void TensorToStream(std::ostream& os,
#else
PADDLE_THROW(platform::errors::Unimplemented(
"XPUPlace is not supported when not compiled with XPU"));
#endif
} else if (platform::is_mlu_place(tensor.place())) {
#ifdef PADDLE_WITH_MLU
constexpr size_t kBufSize = 1024 * 1024 * 64; // 64MB
std::unique_ptr<char[]> buf(new char[kBufSize]);
auto& mlu_dev_ctx =
static_cast<const platform::MLUDeviceContext&>(dev_ctx);
platform::CPUPlace cpu;
uintptr_t data = reinterpret_cast<uintptr_t>(data_ptr);
while (size != 0) {
size_t size_to_write = std::min(kBufSize, static_cast<size_t>(size));
memory::Copy(cpu,
buf.get(),
tensor.place(),
reinterpret_cast<const void*>(data),
size_to_write,
mlu_dev_ctx.stream());
mlu_dev_ctx.Wait();
os.write(buf.get(), size_to_write);
data += size_to_write;
size -= size_to_write;
}
#else
PADDLE_THROW(platform::errors::Unimplemented(
"MLUPlace is not supported when not compiled with MLU"));
#endif
} else if (platform::is_custom_place(tensor.place())) {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
......@@ -720,8 +619,7 @@ void TensorFromStream(std::istream& is,
platform::is_npu_place(dev_ctx.GetPlace()) ||
platform::is_custom_place(dev_ctx.GetPlace())) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_MLU) || \
defined(PADDLE_WITH_CUSTOM_DEVICE)
defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_CUSTOM_DEVICE)
phi::DenseTensor cpu_tensor;
cpu_tensor.Resize(phi::make_ddim(shape));
framework::VisitDataType(
......@@ -741,12 +639,6 @@ void TensorFromStream(std::istream& is,
} else if (platform::is_xpu_place(dev_ctx.GetPlace())) {
PADDLE_THROW(platform::errors::Unimplemented(
"XPUPlace is not supported when not compiled with XPU"));
} else if (platform::is_mlu_place(dev_ctx.GetPlace())) {
PADDLE_THROW(platform::errors::Unimplemented(
"MLUPlace is not supported when not compiled with MLU"));
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"NPUPlace is not supported when not compiled with NPU"));
}
#endif
} else {
......@@ -803,8 +695,7 @@ void TensorFromStream(std::istream& is,
platform::is_npu_place(dev_ctx.GetPlace()) ||
platform::is_custom_place(dev_ctx.GetPlace())) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_MLU) || \
defined(PADDLE_WITH_CUSTOM_DEVICE)
defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_CUSTOM_DEVICE)
phi::DenseTensor cpu_tensor;
cpu_tensor.Resize(phi::make_ddim(dims));
framework::VisitDataType(
......@@ -824,9 +715,6 @@ void TensorFromStream(std::istream& is,
} else if (platform::is_xpu_place(dev_ctx.GetPlace())) {
PADDLE_THROW(platform::errors::Unimplemented(
"XPUPlace is not supported when not compiled with XPU"));
} else if (platform::is_mlu_place(dev_ctx.GetPlace())) {
PADDLE_THROW(platform::errors::Unimplemented(
"MLUPlace is not supported when not compiled with MLU"));
} else if (platform::is_npu_place(dev_ctx.GetPlace())) {
PADDLE_THROW(platform::errors::Unimplemented(
"NPUPlace is not supported when not compiled with NPU"));
......
......@@ -26,9 +26,6 @@ limitations under the License. */
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/fluid/platform/device_context.h"
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/platform/device/mlu/device_context.h"
#endif
#include "paddle/fluid/memory/memory.h"
#include "paddle/phi/core/dense_tensor.h"
......@@ -142,11 +139,6 @@ void TensorFromArray(const T* src,
reinterpret_cast<const phi::GPUContext&>(ctx).stream());
}
#endif
#ifdef PADDLE_WITH_MLU
else if (platform::is_mlu_place(dst_place)) { // NOLINT
memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr);
}
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
else if (platform::is_custom_place(dst_place)) { // NOLINT
memory::Copy(
......@@ -193,11 +185,6 @@ void TensorFromVector(const std::vector<T>& src,
reinterpret_cast<const phi::GPUContext&>(ctx).stream());
}
#endif
#ifdef PADDLE_WITH_MLU
else if (platform::is_mlu_place(dst_place)) { // NOLINT
memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, nullptr);
}
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
else if (platform::is_custom_place(dst_place)) { // NOLINT
memory::Copy(
......@@ -332,17 +319,6 @@ void TensorToVector(const phi::DenseTensor& src,
memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size);
}
#endif
#ifdef PADDLE_WITH_MLU
else if (platform::is_mlu_place(src.place())) { // NOLINT
memory::Copy(
dst_place,
dst_ptr,
src.place(),
src_ptr,
size,
reinterpret_cast<const platform::MLUDeviceContext&>(ctx).stream());
}
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
else if (platform::is_custom_place(src.place())) { // NOLINT
memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size, nullptr);
......@@ -385,11 +361,6 @@ inline void TensorToVector(const phi::DenseTensor& src,
memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size);
}
#endif
#ifdef PADDLE_WITH_MLU
else if (platform::is_mlu_place(src.place())) { // NOLINT
memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size, nullptr);
}
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
else if (platform::is_custom_place(src.place())) { // NOLINT
memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size, nullptr);
......
......@@ -177,10 +177,6 @@ if(WITH_GLOO)
endif()
endif()
if(WITH_MLU)
set(MLU_DEPS mlu_baseop)
endif()
if(NOT WITH_ASCEND_CL)
cc_library(
gradient_accumulator
......
......@@ -159,15 +159,6 @@ AmpOperators::AmpOperators()
OpSupportedInfos("XPU", paddle::framework::proto::VarType::BF16));
unsupported_bf16_ops_->insert(unsupported_ops_xpu_bf16.begin(),
unsupported_ops_xpu_bf16.end());
#elif defined(PADDLE_WITH_MLU)
auto unsupported_ops_mlu_fp16 = std::get<2>(
OpSupportedInfos("MLU", paddle::framework::proto::VarType::FP16));
unsupported_fp16_ops_->insert(unsupported_ops_mlu_fp16.begin(),
unsupported_ops_mlu_fp16.end());
auto unsupported_ops_mlu_bf16 = std::get<2>(
OpSupportedInfos("MLU", paddle::framework::proto::VarType::BF16));
unsupported_bf16_ops_->insert(unsupported_ops_mlu_bf16.begin(),
unsupported_ops_mlu_bf16.end());
#endif
VLOG(4) << allow_ops_->size() << " " << block_ops_->size() << " "
<< unsupported_fp16_ops_->size() << " "
......
......@@ -34,9 +34,6 @@
#include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "xpu/refactor/math.h"
#endif
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
#include "paddle/phi/backends/device_manager.h"
#endif
......@@ -288,41 +285,6 @@ void TensorAdd(const VarType& src, VarType* dst) {
}
#endif
#ifdef PADDLE_WITH_MLU
if (platform::is_mlu_place(place)) {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
platform::DeviceContext* ctx = pool.Get(place);
auto dev_ctx = dynamic_cast<platform::MLUDeviceContext*>(ctx);
if (data_type == framework::DataTypeTrait<float>::DataType()) {
dst_tensor->mutable_data<float>(place);
} else if (data_type ==
framework::DataTypeTrait<platform::float16>::DataType()) {
dst_tensor->mutable_data<platform::float16>(place);
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"Gradient accumulation of data type (%s) on place (%s) is not "
"supported in imperative mode",
framework::DataTypeToString(data_type),
place));
}
static const float alpha = 1.f;
static const float beta = 1.f;
operators::MLUCnnlTensorDesc src_tensor_desc(src_tensor);
operators::MLUCnnlTensorDesc dst_tensor_desc(*dst_tensor);
PADDLE_ENFORCE_MLU_SUCCESS(
cnnlAssignAdd(dev_ctx->cnnl_handle(),
static_cast<const void*>(&alpha),
src_tensor_desc.get(),
operators::GetBasePtr(&src_tensor),
nullptr,
0,
static_cast<const void*>(&beta),
dst_tensor_desc.get(),
operators::GetBasePtr(dst_tensor)));
return;
}
#endif
PADDLE_THROW(platform::errors::Unimplemented(
"Gradient accumulation of data type (%s) on place (%s) is not "
"supported in imperative mode",
......
......@@ -150,48 +150,6 @@ PreparedOp::PreparedOp(const framework::OperatorBase& op,
kernel_signature_(std::move(kernel_signature)),
phi_kernel_(phi_kernel) {}
#ifdef PADDLE_WITH_MLU
static void tokenize(const std::string& ops,
char delim,
std::unordered_set<std::string>* op_set) {
std::string::size_type beg = 0;
for (uint64_t end = 0; (end = ops.find(delim, end)) != std::string::npos;
++end) {
op_set->insert(ops.substr(beg, end - beg));
beg = end + 1;
}
op_set->insert(ops.substr(beg));
}
static bool is_in_mlu_black_list(const std::string& op_name) {
static bool inited = false;
static std::unordered_set<std::string> mlu_black_list;
static std::mutex s_mtx;
if (!inited) {
std::lock_guard<std::mutex> guard(s_mtx);
if (!inited) {
if (std::getenv("MLU_BLACK_LIST") != nullptr) {
std::string ops(std::getenv("MLU_BLACK_LIST"));
tokenize(ops, ',', &mlu_black_list);
}
inited = true;
VLOG(3) << "MLU Black List: ";
for (auto iter = mlu_black_list.begin(); iter != mlu_black_list.end();
++iter) {
VLOG(3) << *iter << " ";
}
}
}
if (mlu_black_list.find(op_name) != mlu_black_list.end()) {
return true;
}
return false;
}
#endif
template <typename VarType>
PreparedOp PrepareImpl(
const NameVarMap<VarType>& ins,
......@@ -258,12 +216,6 @@ PreparedOp PrepareImpl(
op.Type(), expected_kernel_key.dtype());
#endif
#ifdef PADDLE_WITH_MLU
if (is_in_mlu_black_list(op.Type())) {
expected_kernel_key.set_backend(phi::Backend::CPU);
}
#endif
bool has_phi_kernel = false;
const auto* arg_map_fn = phi_op_utils_map.GetArgumentMappingFn(op.Type());
......@@ -468,16 +420,6 @@ PreparedOp PrepareImpl(
kernel_iter = kernels.find(fluid_kernel_type);
}
#endif
#ifdef PADDLE_WITH_MLU
if (kernel_iter == kernels.end() &&
paddle::platform::is_mlu_place(fluid_kernel_type.place_)) {
VLOG(3) << "missing MLU kernel: " << op.Type()
<< ", expected_kernel_key:" << fluid_kernel_type
<< ", fallbacking to CPU one!";
fluid_kernel_type.place_ = platform::CPUPlace();
kernel_iter = kernels.find(fluid_kernel_type);
}
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
if (kernel_iter == kernels.end() &&
paddle::platform::is_custom_place(fluid_kernel_type.place_)) {
......
......@@ -147,15 +147,6 @@ paddle::framework::GarbageCollector* Tracer::MutableGarbageCollectorIfNotExists(
PADDLE_THROW(platform::errors::PermissionDenied(
"Paddle can't use IPU device since it's not compiled with IPU,"
"Please recompile or reinstall Paddle with IPU support."));
#endif
} else if (platform::is_mlu_place(place)) {
#if defined(PADDLE_WITH_MLU)
gc.reset(new framework::MLUDefaultStreamGarbageCollector(place, 0));
VLOG(10) << "Created GarbageCollector at " << place;
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"Paddle can't use MLU device since it's not compiled with MLU,"
"Please recompile or reinstall Paddle with MLU support."));
#endif
} else if (platform::is_custom_place(place)) {
#if defined(PADDLE_WITH_CUSTOM_DEVICE)
......@@ -300,13 +291,6 @@ void Tracer::TraceOpImpl(const std::string& type,
} else if (platform::is_npu_place(place)) {
PADDLE_THROW(platform::errors::PreconditionNotMet(
"PaddlePaddle should compile with NPU if use NPUPlace."));
} else if (platform::is_mlu_place(place)) {
#ifdef PADDLE_WITH_MLU
platform::SetMLUDeviceId(place.device);
#else
PADDLE_THROW(platform::errors::PreconditionNotMet(
"PaddlePaddle should compile with MLU if use MLUPlace."));
#endif
} else if (platform::is_custom_place(place)) {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
phi::DeviceManager::SetDevice(place);
......
......@@ -58,10 +58,6 @@
#include "paddle/fluid/platform/device/ipu/ipu_info.h"
#endif
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/platform/device/mlu/mlu_info.h"
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
#include "paddle/fluid/memory/allocation/custom_allocator.h"
#include "paddle/fluid/platform/device/device_wrapper.h"
......@@ -194,11 +190,6 @@ class AllocatorFacadePrivate {
InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id));
}
#endif
#ifdef PADDLE_WITH_MLU
for (int dev_id = 0; dev_id < platform::GetMLUDeviceCount(); ++dev_id) {
InitNaiveBestFitMLUAllocator(platform::MLUPlace(dev_id));
}
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
auto device_types = phi::DeviceManager::GetAllCustomDeviceTypes();
for (const auto& dev_type : device_types) {
......@@ -254,11 +245,6 @@ class AllocatorFacadePrivate {
InitNaiveBestFitIPUAllocator(platform::IPUPlace(dev_id));
}
#endif
#ifdef PADDLE_WITH_MLU
for (int dev_id = 0; dev_id < platform::GetMLUDeviceCount(); ++dev_id) {
InitNaiveBestFitMLUAllocator(platform::MLUPlace(dev_id));
}
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
auto device_types = phi::DeviceManager::GetAllCustomDeviceTypes();
for (const auto& dev_type : device_types) {
......@@ -290,11 +276,6 @@ class AllocatorFacadePrivate {
InitThreadLocalCUDAAllocator(platform::CUDAPlace(dev_id));
}
InitNaiveBestFitCUDAPinnedAllocator();
#endif
#ifdef PADDLE_WITH_MLU
for (int dev_id = 0; dev_id < platform::GetMLUDeviceCount(); ++dev_id) {
InitNaiveBestFitMLUAllocator(platform::MLUPlace(dev_id));
}
#endif
break;
}
......@@ -801,12 +782,6 @@ class AllocatorFacadePrivate {
}
#endif
#ifdef PADDLE_WITH_MLU
void InitNaiveBestFitMLUAllocator(platform::MLUPlace p) {
allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p);
}
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
void InitNaiveBestFitCustomDeviceAllocator(platform::CustomPlace p) {
allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p);
......@@ -851,13 +826,6 @@ class AllocatorFacadePrivate {
system_allocators_[p] = CreateCUDAAllocator(p);
}
#endif
#ifdef PADDLE_WITH_MLU
int device_count = platform::GetMLUDeviceCount();
for (int i = 0; i < device_count; ++i) {
platform::MLUPlace p(i);
system_allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p);
}
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
auto device_types = phi::DeviceManager::GetAllCustomDeviceTypes();
for (const auto& dev_type : device_types) {
......@@ -894,12 +862,6 @@ class AllocatorFacadePrivate {
places.emplace_back(platform::IPUPlace(dev_id));
}
#endif
#ifdef PADDLE_WITH_MLU
int device_count = platform::GetMLUDeviceCount();
for (int dev_id = 0; dev_id < device_count; ++dev_id) {
places.emplace_back(platform::MLUPlace(dev_id));
}
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
auto device_types = phi::DeviceManager::GetAllCustomDeviceTypes();
for (const auto& dev_type : device_types) {
......
......@@ -56,9 +56,6 @@ BuddyAllocator::BuddyAllocator(
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
init_allocate_size_func_ = &platform::GpuInitAllocSize;
re_allocate_size_func_ = &platform::GpuReallocSize;
#elif defined(PADDLE_WITH_MLU)
init_allocate_size_func_ = &platform::MLUInitAllocSize;
re_allocate_size_func_ = &platform::MLUReallocSize;
#endif
}
#endif
......@@ -253,9 +250,6 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool(
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
allocate_bytes = DeviceAllocateSize(
&platform::GpuInitAllocSize, &platform::GpuReallocSize, request_bytes);
#elif defined(PADDLE_WITH_MLU)
allocate_bytes = DeviceAllocateSize(
&platform::MLUInitAllocSize, &platform::MLUReallocSize, request_bytes);
#endif
#endif
......
......@@ -25,9 +25,6 @@ limitations under the License. */
#include "gflags/gflags.h"
#include "gtest/gtest.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/platform/device/mlu/mlu_info.h"
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
DECLARE_double(fraction_of_gpu_memory_to_use);
......@@ -395,202 +392,6 @@ TEST(BuddyAllocator, Release) {
}
#endif
#ifdef PADDLE_WITH_MLU
TEST(BuddyAllocator, MluFraction) {
// In a 16 GB machine, the pool size will be about 160 MB
FLAGS_fraction_of_gpu_memory_to_use = 0.01;
FLAGS_initial_gpu_memory_in_mb = 0;
FLAGS_reallocate_gpu_memory_in_mb = 0;
BuddyAllocator buddy_allocator(
std::unique_ptr<SystemAllocator>(new MLUAllocator(0)),
platform::MLUMinChunkSize(),
platform::MLUMaxChunkSize());
// Less than pool size
TestBuddyAllocator(&buddy_allocator, 10);
TestBuddyAllocator(&buddy_allocator, 10 << 10);
TestBuddyAllocator(&buddy_allocator, 10 << 20);
buddy_allocator.Release();
// Greater than max chunk size
TestBuddyAllocator(&buddy_allocator,
600 << 20,
/* use_system_allocator = */ true);
TestBuddyAllocator(&buddy_allocator,
1 * static_cast<size_t>(1 << 30),
/* use_system_allocator = */ true);
}
TEST(BuddyAllocator, InitRealloc) {
FLAGS_initial_gpu_memory_in_mb = 100;
FLAGS_reallocate_gpu_memory_in_mb = 50;
EXPECT_EQ(platform::MLUMaxChunkSize(), static_cast<size_t>(100 << 20));
BuddyAllocator buddy_allocator(
std::unique_ptr<SystemAllocator>(new MLUAllocator(0)),
platform::MLUMinChunkSize(),
platform::MLUMaxChunkSize());
// Less than initial size and reallocate size
TestBuddyAllocator(&buddy_allocator, 10 << 20);
// Between initial size and reallocate size and not exceed pool
TestBuddyAllocator(&buddy_allocator, 80 << 20);
TestBuddyAllocator(&buddy_allocator, 99 << 20);
// Greater than max chunk size
TestBuddyAllocator(&buddy_allocator,
101 << 20,
/* use_system_allocator = */ true);
TestBuddyAllocator(&buddy_allocator,
1 * static_cast<size_t>(1 << 30),
/* use_system_allocator = */ true);
}
TEST(BuddyAllocator, ReallocSizeGreaterThanInit) {
FLAGS_initial_gpu_memory_in_mb = 5;
FLAGS_reallocate_gpu_memory_in_mb = 10;
EXPECT_EQ(platform::MLUMaxChunkSize(), static_cast<size_t>(10 << 20));
BuddyAllocator buddy_allocator(
std::unique_ptr<SystemAllocator>(new MLUAllocator(0)),
platform::MLUMinChunkSize(),
platform::MLUMaxChunkSize());
// Less than initial size and reallocate size
TestBuddyAllocator(&buddy_allocator, 1 << 20);
// Between initial size and reallocate size and exceed pool
TestBuddyAllocator(&buddy_allocator, 6 << 20);
TestBuddyAllocator(&buddy_allocator, 8 << 20);
TestBuddyAllocator(&buddy_allocator, 9 << 20);
// Greater than max chunk size
TestBuddyAllocator(&buddy_allocator,
11 << 20,
/* use_system_allocator = */ true);
TestBuddyAllocator(&buddy_allocator,
1 * static_cast<size_t>(1 << 30),
/* use_system_allocator = */ true);
}
TEST(BuddyAllocator, FractionRefillPool) {
FLAGS_fraction_of_gpu_memory_to_use = 0.6;
FLAGS_initial_gpu_memory_in_mb = 0;
FLAGS_reallocate_gpu_memory_in_mb = 0;
size_t max_chunk_size = platform::MLUMaxChunkSize();
BuddyAllocator buddy_allocator(
std::unique_ptr<SystemAllocator>(new MLUAllocator(0)),
platform::MLUMinChunkSize(),
max_chunk_size);
// Less than pool size
int* p0 = TestBuddyAllocator(&buddy_allocator,
max_chunk_size - 1000,
/* use_system_allocator = */ false,
/* free_ptr = */ false);
// Max chunk size should be same during allocation
EXPECT_EQ(max_chunk_size, buddy_allocator.GetMaxChunkSize());
size_t alloc =
platform::MLUAvailableMemToAlloc() * FLAGS_fraction_of_gpu_memory_to_use;
// Exceeding the pool triggers refilling by a fraction of the available MLU
// memory, and should be able to alloc 60% of the remaining MLU
int* p1 = TestBuddyAllocator(&buddy_allocator,
alloc,
/* use_system_allocator = */ false,
/* free_ptr = */ false);
// Max chunk size should be same during allocation
EXPECT_EQ(max_chunk_size, buddy_allocator.GetMaxChunkSize());
alloc =
platform::MLUAvailableMemToAlloc() * FLAGS_fraction_of_gpu_memory_to_use;
// Exceeding the pool triggers refilling by a fraction of the available MLU
// memory, and should be able to alloc 60% of the remaining MLU
TestBuddyAllocator(&buddy_allocator,
alloc,
/* use_system_allocator = */ false);
// Max chunk size should be same during allocation
EXPECT_EQ(max_chunk_size, buddy_allocator.GetMaxChunkSize());
buddy_allocator.Free(p0);
buddy_allocator.Free(p1);
}
TEST(BuddyAllocator, AllocFromAvailable) {
FLAGS_fraction_of_gpu_memory_to_use = 0.7;
FLAGS_initial_gpu_memory_in_mb = 0;
FLAGS_reallocate_gpu_memory_in_mb = 0;
size_t total = 0, available = 0;
platform::SetMLUDeviceId(0);
platform::MLUMemoryUsage(&available, &total);
// Take half of available MLU
void* p;
cnrtStatus result = cnrtMalloc(&p, available >> 1);
EXPECT_TRUE(result == cnrtSuccess);
// BuddyAllocator should be able to alloc the remaining MLU
BuddyAllocator buddy_allocator(
std::unique_ptr<SystemAllocator>(new MLUAllocator(0)),
platform::MLUMinChunkSize(),
platform::MLUMaxChunkSize());
TestBuddyAllocator(&buddy_allocator, 10);
TestBuddyAllocator(&buddy_allocator, 10 << 10);
TestBuddyAllocator(&buddy_allocator, 10 << 20);
TestBuddyAllocator(&buddy_allocator, static_cast<size_t>(1 << 30));
if (p) {
EXPECT_TRUE(cnrtFree(p) == cnrtSuccess);
}
}
TEST(BuddyAllocator, AllocFromAvailableWhenFractionIsOne) {
FLAGS_fraction_of_gpu_memory_to_use = 1.0;
FLAGS_initial_gpu_memory_in_mb = 0;
FLAGS_reallocate_gpu_memory_in_mb = 0;
void* p = nullptr;
EXPECT_TRUE(cnrtMalloc(&p, static_cast<size_t>(1) << 30) == cnrtSuccess);
// BuddyAllocator should be able to alloc the remaining MLU
BuddyAllocator buddy_allocator(
std::unique_ptr<SystemAllocator>(new MLUAllocator(0)),
platform::MLUMinChunkSize(),
platform::MLUMaxChunkSize());
TestBuddyAllocator(&buddy_allocator, static_cast<size_t>(1) << 30);
TestBuddyAllocator(&buddy_allocator, static_cast<size_t>(1) << 30);
if (p) {
EXPECT_TRUE(cnrtFree(p) == cnrtSuccess);
}
}
TEST(BuddyAllocator, Release) {
// In an 8 GB machine, the pool size will be about 800 MB
FLAGS_fraction_of_gpu_memory_to_use = 0.1;
FLAGS_initial_gpu_memory_in_mb = 0;
FLAGS_reallocate_gpu_memory_in_mb = 0;
BuddyAllocator buddy_allocator(
std::unique_ptr<SystemAllocator>(new MLUAllocator(0)),
platform::MLUMinChunkSize(),
platform::MLUMaxChunkSize());
// Less than pool size
TestBuddyAllocator(&buddy_allocator, 10);
TestBuddyAllocator(&buddy_allocator, 10 << 10);
TestBuddyAllocator(&buddy_allocator, 50 << 20);
buddy_allocator.Release();
}
#endif
} // namespace detail
} // namespace memory
} // namespace paddle
......@@ -420,140 +420,6 @@ uint64_t Release<platform::CUDAPinnedPlace>(
#endif
}
// For MLU
#ifdef PADDLE_WITH_MLU
class MLUBuddyAllocatorList {
private:
MLUBuddyAllocatorList() : devices_(platform::GetMLUSelectedDevices()) {
auto mlu_num = devices_.size();
allocators_.resize(mlu_num);
init_flags_.reserve(mlu_num);
for (size_t i = 0; i < mlu_num; ++i) {
init_flags_.emplace_back(new std::once_flag());
}
}
static MLUBuddyAllocatorList *CreateNewInstance() {
return new MLUBuddyAllocatorList();
}
public:
static MLUBuddyAllocatorList *Instance() {
static auto *instance = CreateNewInstance();
return instance;
}
BuddyAllocator *Get(int mlu_id) {
auto pos = std::distance(
devices_.begin(), std::find(devices_.begin(), devices_.end(), mlu_id));
PADDLE_ENFORCE_LT(pos,
devices_.size(),
platform::errors::OutOfRange(
"The index exceeds the size of devices, the size of "
"devices is %d, the index is %d",
devices_.size(),
pos));
std::call_once(*init_flags_[pos], [this, pos] {
platform::SetMLUDeviceId(devices_[pos]);
allocators_[pos].reset(
new BuddyAllocator(std::unique_ptr<detail::SystemAllocator>(
new detail::MLUAllocator(devices_[pos])),
platform::MLUMinChunkSize(),
platform::MLUMaxChunkSize()));
VLOG(10) << "\n\nNOTE:\n"
<< "You can set GFlags environment variable "
<< "(mlu reuse gpu GFlags) "
<< "'FLAGS_fraction_of_gpu_memory_to_use' "
<< "or 'FLAGS_initial_gpu_memory_in_mb' "
<< "or 'FLAGS_reallocate_gpu_memory_in_mb' "
<< "to change the memory size for MLU usage.\n"
<< "Current 'FLAGS_fraction_of_gpu_memory_to_use' value is "
<< FLAGS_fraction_of_gpu_memory_to_use
<< ". Current 'FLAGS_initial_gpu_memory_in_mb' value is "
<< FLAGS_initial_gpu_memory_in_mb
<< ". Current 'FLAGS_reallocate_gpu_memory_in_mb' value is "
<< FLAGS_reallocate_gpu_memory_in_mb << "\n\n";
});
return allocators_[pos].get();
}
private:
std::vector<int> devices_;
std::vector<std::unique_ptr<std::once_flag>> init_flags_;
std::vector<std::unique_ptr<BuddyAllocator>> allocators_;
};
BuddyAllocator *GetMLUBuddyAllocator(int mlu_id) {
return MLUBuddyAllocatorList::Instance()->Get(mlu_id);
}
#endif
template <>
size_t Used<platform::MLUPlace>(const platform::MLUPlace &place) {
#ifdef PADDLE_WITH_MLU
return GetMLUBuddyAllocator(place.device)->Used();
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"'MLUPlace' is not supported in CPU only device."));
#endif
}
template <>
void *Alloc<platform::MLUPlace>(const platform::MLUPlace &place, size_t size) {
#ifdef PADDLE_WITH_MLU
auto *buddy_allocator = GetMLUBuddyAllocator(place.device);
auto *ptr = buddy_allocator->Alloc(size);
if (ptr == nullptr) {
platform::MLUDeviceGuard(place.device);
size_t avail = 0, total = 0;
platform::MLUMemoryUsage(&avail, &total);
PADDLE_THROW(platform::errors::ResourceExhausted(
"Cannot allocate %s in MLU %d, avaliable %s, total %s, MLUMinChunkSize "
"%s, MLUMinChunkSize %s, MLU memory used: %s.",
string::HumanReadableSize(size),
place.device,
string::HumanReadableSize(avail),
string::HumanReadableSize(total),
string::HumanReadableSize(buddy_allocator->GetMinChunkSize()),
string::HumanReadableSize(buddy_allocator->GetMaxChunkSize()),
string::HumanReadableSize(Used<platform::MLUPlace>(place))));
} else {
if (FLAGS_init_allocated_mem) {
cnrtMemset(ptr, 0xEF, size);
}
}
return ptr;
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"'MLUPlace' is not supported in CPU only device."));
#endif
}
template <>
void Free<platform::MLUPlace>(const platform::MLUPlace &place,
void *p,
size_t size) {
#ifdef PADDLE_WITH_MLU
VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
GetMLUBuddyAllocator(place.device)->Free(p);
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"'MLUPlace' is not supported in CPU only device."));
#endif
}
template <>
uint64_t Release<platform::MLUPlace>(const platform::MLUPlace &place) {
#ifdef PADDLE_WITH_MLU
return GetMLUBuddyAllocator(place.device)->Release();
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"'MLUPlace' is not supported in CPU only device."));
#endif
}
// For CustomDevice
#ifdef PADDLE_WITH_CUSTOM_DEVICE
class BuddyAllocatorList {
......
......@@ -61,21 +61,6 @@ TEST(NaiveBestFitAllocatorTest, CudaPinnedAlloc) {
}
#endif
#ifdef PADDLE_WITH_MLU
TEST(NaiveBestFitAllocatorTest, MluAlloc) {
NaiveBestFitAllocator alloc{platform::MLUPlace(0)};
{
size_t size = (1 << 20);
auto allocation = alloc.Allocate(size);
}
sleep(10);
alloc.Release(platform::MLUPlace(0));
size_t size = (1 << 20);
auto allocation = alloc.Allocate(size);
alloc.Release(platform::MLUPlace(0));
}
#endif
} // namespace allocation
} // namespace memory
} // namespace paddle
......@@ -31,9 +31,6 @@ limitations under the License. */
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/phi/backends/cpu/cpu_info.h"
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/platform/device/mlu/mlu_info.h"
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/fluid/platform/cuda_device_guard.h"
......@@ -287,78 +284,6 @@ bool CUDAPinnedAllocator::UseGpu() const { return false; }
#endif
#ifdef PADDLE_WITH_MLU
void* MLUAllocator::Alloc(size_t* index, size_t size) {
if (size <= 0) return nullptr;
void* p;
auto result = platform::RecordedMLUMalloc(&p, size, mlu_id_);
if (result == cnrtSuccess) {
*index = 0;
mlu_alloc_size_ += size;
return p;
} else {
size_t avail, total, actual_avail, actual_total;
bool is_limited = platform::RecordedMLUMemGetInfo(
&avail, &total, &actual_avail, &actual_total, mlu_id_);
size_t allocated = total - avail;
std::string err_msg;
if (is_limited) {
auto limit_size = (total >> 20);
err_msg = string::Sprintf(
"\n 3) Set environment variable `FLAGS_gpu_memory_limit_mb` to a "
"larger value. Currently `FLAGS_gpu_memory_limit_mb` is %d, so the "
"maximum MLU memory usage is limited to %d MB.\n"
" The command is `export FLAGS_gpu_memory_limit_mb=xxx`.",
limit_size,
limit_size);
}
PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted(
"\n\nOut of memory error on MLU %d. "
"Cannot allocate %s memory on MLU %d, %s memory has been allocated and "
"available memory is only %s.\n\n"
"Please check whether there is any other process using MLU %d.\n"
"1. If yes, please stop them, or start PaddlePaddle on another MLU.\n"
"2. If no, please try one of the following suggestions:\n"
" 1) Decrease the batch size of your model.\n"
" 2) FLAGS_fraction_of_gpu_memory_to_use is %.2lf now, "
"please set it to a higher value but less than 1.0.\n"
" The command is "
"`export FLAGS_fraction_of_gpu_memory_to_use=xxx`.%s\n\n",
mlu_id_,
string::HumanReadableSize(size),
mlu_id_,
string::HumanReadableSize(allocated),
string::HumanReadableSize(avail),
mlu_id_,
FLAGS_fraction_of_gpu_memory_to_use,
err_msg));
}
}
void MLUAllocator::Free(void* p, size_t size, size_t index) {
PADDLE_ENFORCE_EQ(index,
0,
platform::errors::InvalidArgument(
"The index should be 0, index is %d", index));
PADDLE_ENFORCE_GE(mlu_alloc_size_,
size,
platform::errors::InvalidArgument(
"The size of memory (%d) to free exceeds the size of "
"allocated gpu memory (%d)",
size,
mlu_alloc_size_));
mlu_alloc_size_ -= size;
platform::RecordedMLUFree(p, size, mlu_id_);
}
bool MLUAllocator::UseGpu() const { return true; }
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
void* CustomAllocator::Alloc(size_t* index, size_t size) {
if (size <= 0) return nullptr;
......
......@@ -68,21 +68,6 @@ class CUDAPinnedAllocator : public SystemAllocator {
};
#endif
#ifdef PADDLE_WITH_MLU
class MLUAllocator : public SystemAllocator {
public:
explicit MLUAllocator(int mlu_id) : mlu_id_(mlu_id) {}
virtual void* Alloc(size_t* index, size_t size);
virtual void Free(void* p, size_t size, size_t index);
virtual bool UseGpu() const;
private:
size_t mlu_alloc_size_ = 0;
int mlu_id_;
};
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
class CustomAllocator : public SystemAllocator {
public:
......
......@@ -82,23 +82,3 @@ TEST(GPUAllocator, AllocFailure) {
}
}
#endif
#ifdef PADDLE_WITH_MLU
TEST(MLUAllocator, Alloc) {
paddle::memory::detail::MLUAllocator a(0);
TestAllocator(&a, 2048);
TestAllocator(&a, 0);
}
TEST(MLUAllocator, AllocFailure) {
paddle::memory::detail::MLUAllocator allocator(0);
size_t index;
size_t alloc_size = (static_cast<size_t>(1) << 40); // Very large number
try {
allocator.Alloc(&index, alloc_size);
ASSERT_TRUE(false);
} catch (paddle::memory::allocation::BadAlloc&) {
PADDLE_ENFORCE_MLU_SUCCESS(cnrtGetLastError());
}
}
#endif
......@@ -23,10 +23,6 @@ limitations under the License. */
#include "paddle/fluid/platform/device/xpu/xpu_header.h"
#endif
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/platform/device/mlu/mlu_info.h"
#endif
namespace paddle {
namespace memory {
......@@ -736,226 +732,6 @@ void Copy<phi::Place, phi::GPUPinnedPlace>(phi::Place dst_place,
}
#endif
#ifdef PADDLE_WITH_MLU
template <>
void Copy<platform::CPUPlace, platform::MLUPlace>(platform::CPUPlace dst_place,
void* dst,
platform::MLUPlace src_place,
const void* src,
size_t num,
void* stream) {
if (UNLIKELY(num == 0)) return;
platform::SetMLUDeviceId(src_place.device);
if (stream) {
VLOG(4) << "Async memory::Copy " << num << " Bytes from " << src_place
<< " to " << dst_place << " by mlu stream(" << stream << ")";
platform::RecordEvent record_event("MLUMemcpyD2HAsync:MLU->CPU",
platform::TracerEventType::UserDefined,
1);
platform::MLUMemcpyD2HAsync(
dst, src, num, reinterpret_cast<mluStream>(stream));
} else {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
static_cast<platform::MLUDeviceContext*>(pool.Get(src_place))->Wait();
VLOG(4) << "Sync memory::Copy " << num << " Bytes from " << src_place
<< " to " << dst_place;
platform::RecordEvent record_event(
"MLUMemcpyD2HSync:MLU->CPU", platform::TracerEventType::UserDefined, 1);
platform::MLUMemcpyD2HSync(dst, src, num);
}
}
template <>
void Copy<platform::MLUPlace, platform::CPUPlace>(platform::MLUPlace dst_place,
void* dst,
platform::CPUPlace src_place,
const void* src,
size_t num,
void* stream) {
if (UNLIKELY(num == 0)) return;
platform::SetMLUDeviceId(dst_place.device);
if (stream) {
VLOG(4) << "Async memory::Copy " << num << " Bytes from " << src_place
<< " to " << dst_place << " by mlu stream(" << stream << ")";
platform::RecordEvent record_event("MLUMemcpyH2DAsync:CPU->MLU",
platform::TracerEventType::UserDefined,
1);
platform::MLUMemcpyH2DAsync(
dst, src, num, reinterpret_cast<mluStream>(stream));
} else {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
static_cast<platform::MLUDeviceContext*>(pool.Get(dst_place))->Wait();
VLOG(4) << "Sync memory::Copy " << num << " Bytes from " << src_place
<< " to " << dst_place;
platform::RecordEvent record_event(
"MLUMemcpyH2DSync:CPU->MLU", platform::TracerEventType::UserDefined, 1);
platform::MLUMemcpyH2DSync(dst, src, num);
}
}
template <>
void Copy<platform::MLUPlace, platform::MLUPlace>(platform::MLUPlace dst_place,
void* dst,
platform::MLUPlace src_place,
const void* src,
size_t num,
void* stream) {
if (UNLIKELY(num == 0)) return;
if (dst_place == src_place) {
platform::SetMLUDeviceId(dst_place.device);
if (stream) {
VLOG(4) << "Async memory::Copy " << num << " Bytes from " << src_place
<< " to " << dst_place << " by mlu stream(" << stream << ")";
platform::RecordEvent record_event("MLUMemcpyD2DAsync(same_mlu):MLU->MLU",
platform::TracerEventType::UserDefined,
1);
platform::MLUMemcpyD2DAsync(
dst, src, num, reinterpret_cast<mluStream>(stream));
} else {
platform::DeviceContextPool& pool =
platform::DeviceContextPool::Instance();
static_cast<platform::MLUDeviceContext*>(pool.Get(src_place))->Wait();
VLOG(4) << "Sync memory::Copy " << num << " Bytes from " << src_place
<< " to " << dst_place;
platform::RecordEvent record_event("MLUMemcpyD2DSync(same_mlu):MLU->MLU",
platform::TracerEventType::UserDefined,
1);
platform::MLUMemcpyD2DSync(dst, src, num);
}
} else {
if (stream) {
VLOG(4) << "Async memory::Copy " << num << " Bytes from " << src_place
<< " to " << dst_place << " by mlu stream(" << stream << ")";
platform::RecordEvent record_event("MLUMemcpyPeerAsync:MLU->MLU",
platform::TracerEventType::UserDefined,
1);
platform::MLUMemcpyPeerAsync(dst,
dst_place.device,
src,
src_place.device,
num,
reinterpret_cast<mluStream>(stream));
} else {
VLOG(4) << "Sync memory::Copy " << num << " Bytes from " << src_place
<< " to " << dst_place;
platform::RecordEvent record_event("MLUMemcpyPeerSync:MLU->MLU",
platform::TracerEventType::UserDefined,
1);
platform::MLUMemcpyPeerSync(
dst, dst_place.device, src, src_place.device, num);
}
}
}
// NOTE: only for CPUPlace and MLUPlace.
template <>
void Copy<phi::Place, phi::Place>(phi::Place dst_place,
void* dst,
phi::Place src_place,
const void* src,
size_t num,
void* stream) {
if (src_place.GetType() == phi::AllocationType::CPU &&
dst_place.GetType() == phi::AllocationType::CPU) {
platform::CPUPlace place_dst, place_src;
return Copy(place_dst, dst, place_src, src, num);
} else if (src_place.GetType() == phi::AllocationType::CPU &&
dst_place.GetType() == phi::AllocationType::MLU) {
platform::MLUPlace place_dst(dst_place.GetDeviceId());
platform::CPUPlace place_src;
return Copy(place_dst, dst, place_src, src, num, stream);
} else if (src_place.GetType() == phi::AllocationType::MLU &&
dst_place.GetType() == phi::AllocationType::CPU) {
platform::MLUPlace place_src(src_place.GetDeviceId());
platform::CPUPlace place_dst;
return Copy(place_dst, dst, place_src, src, num, stream);
} else if (src_place.GetType() == phi::AllocationType::MLU &&
dst_place.GetType() == phi::AllocationType::MLU) {
platform::MLUPlace place_src(src_place.GetDeviceId());
platform::MLUPlace place_dst(dst_place.GetDeviceId());
return Copy(place_dst, dst, place_src, src, num, stream);
#ifdef PADDLE_WITH_CUSTOM_DEVICE
} else if (src_place.GetType() == phi::AllocationType::CPU && // NOLINT
dst_place.GetType() == phi::AllocationType::CUSTOM) {
platform::CPUPlace place_src;
platform::CustomPlace place_dst(dst_place);
return Copy(place_dst, dst, place_src, src, num, stream);
} else if (src_place.GetType() == phi::AllocationType::CUSTOM && // NOLINT
dst_place.GetType() == phi::AllocationType::CPU) {
platform::CustomPlace place_src(src_place);
platform::CPUPlace place_dst;
return Copy(place_dst, dst, place_src, src, num, stream);
} else if (src_place.GetType() == phi::AllocationType::CUSTOM && // NOLINT
dst_place.GetType() == phi::AllocationType::CUSTOM) {
platform::CustomPlace place_src(src_place);
platform::CustomPlace place_dst(dst_place);
return Copy(place_dst, dst, place_src, src, num, stream);
#endif
}
}
// NOTE: only for (CPUPlace and MLUPlace) -> (MLUPlace)
template <>
void Copy<phi::MLUPlace, phi::Place>(phi::MLUPlace dst_place,
void* dst,
phi::Place src_place,
const void* src,
size_t num,
void* stream) {
Copy(phi::Place(dst_place.GetType(), dst_place.GetDeviceId()),
dst,
src_place,
src,
num,
stream);
}
// NOTE: only for (MLUPlace) -> (CPUPlace and MLUPlace)
template <>
void Copy<phi::Place, phi::MLUPlace>(phi::Place dst_place,
void* dst,
phi::MLUPlace src_place,
const void* src,
size_t num,
void* stream) {
Copy(dst_place,
dst,
phi::Place(src_place.GetType(), src_place.GetDeviceId()),
src,
num,
stream);
}
// NOTE: only for (MLUPlace) -> (CPUPlace) with mluStream.
template <>
void Copy<phi::CPUPlace, phi::Place>(phi::CPUPlace dst_place,
void* dst,
phi::Place src_place,
const void* src,
size_t num,
void* stream) {
Copy(phi::Place(dst_place.GetType()), dst, src_place, src, num, stream);
}
// NOTE: only for (CPUPlace) -> (MLUPlace) with mluStream.
template <>
void Copy<phi::Place, phi::CPUPlace>(phi::Place dst_place,
void* dst,
phi::CPUPlace src_place,
const void* src,
size_t num,
void* stream) {
Copy(dst_place, dst, phi::Place(src_place.GetType()), src, num, stream);
}
#endif // PADDLE_WITH_MLU
// NOTE: Only for CPUPlace, XPUPlace and PinnedPlace.
template <>
void Copy<phi::Place, phi::Place>(phi::Place dst_place,
......
......@@ -16,9 +16,6 @@ limitations under the License. */
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/place.h"
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/platform/device/mlu/device_context.h"
#endif
namespace paddle {
namespace memory {
......
......@@ -55,10 +55,6 @@ if (WITH_LITE)
add_subdirectory(lite)
endif()
if (WITH_MLU)
add_subdirectory(mlu)
endif()
if(WITH_CINN)
add_subdirectory(cinn)
endif()
......@@ -135,10 +131,6 @@ if (WITH_ASCEND_CL)
op_library(sync_batch_norm_op)
endif()
if (WITH_MLU)
op_library(sync_batch_norm_op)
endif()
op_library(lstm_op DEPS ${OP_HEADER_DEPS} lstm_compute)
op_library(recurrent_op DEPS ${OP_HEADER_DEPS})
......
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/amp/fp16_type_traits.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
template <typename T>
class CheckFiniteAndUnscaleMLUKernel : public framework::OpKernel<T> {
using MPDType = typename details::MPTypeTrait<T>::Type;
public:
void Compute(const framework::ExecutionContext& ctx) const {
auto& dev_ctx = ctx.template device_context<platform::MLUDeviceContext>();
const auto xs = ctx.MultiInput<phi::DenseTensor>("X");
const auto* scale = ctx.Input<phi::DenseTensor>("Scale");
auto outs = ctx.MultiOutput<phi::DenseTensor>("Out");
auto* found_inf = ctx.Output<phi::DenseTensor>("FoundInfinite");
found_inf->mutable_data<bool>(dev_ctx.GetPlace());
MLUCnnlTensorDesc scale_desc(*scale);
MLUCnnlTensorDesc found_inf_desc(
*found_inf, CNNL_LAYOUT_ARRAY, ToCnnlDataType<bool>());
for (size_t i = 0; i < xs.size(); ++i) {
const auto* x = xs[i];
auto* out = outs[i];
out->mutable_data<T>(ctx.GetPlace());
// check is_finite or is_nan
phi::DenseTensor is_finite(found_inf->type());
if (i != 0) {
is_finite.Resize(phi::make_ddim({1}));
is_finite.mutable_data<bool>(ctx.GetPlace());
} else {
is_finite.ShareDataWith(*found_inf);
}
MLUCnnlTensorDesc x_desc(*x);
MLUCnnlTensorDesc out_desc(*out);
MLUCnnl::IsNanInf(
ctx, x_desc.get(), GetBasePtr(x), GetBasePtr(&is_finite));
// save is_finite by logical_and op after checking every input
if (i != 0) {
MLUCnnlTensorDesc is_finite_desc(
is_finite, CNNL_LAYOUT_ARRAY, ToCnnlDataType<bool>());
MLUCnnl::Logic(ctx,
CNNL_LOGIC_OP_OR,
found_inf_desc.get(),
GetBasePtr(found_inf),
is_finite_desc.get(),
GetBasePtr(&is_finite),
found_inf_desc.get(),
GetBasePtr(found_inf));
}
// The normal logic is :
// out = in, if found_inf = true
// out = in/scale, if found_inf = false
// But when found_inf is true, the data of Out should not be used.
// So, on MLU, we always compute out with in/scale.
phi::DenseTensor float_x;
phi::DenseTensor float_out;
if (std::is_same<T, paddle::platform::float16>::value) {
float_x.Resize(x->dims());
float_out.Resize(out->dims());
float_x.mutable_data<MPDType>(ctx.GetPlace());
float_out.mutable_data<MPDType>(ctx.GetPlace());
MLUCnnlTensorDesc float_x_desc(float_x);
MLUCnnlTensorDesc float_out_desc(float_out);
auto cast_fp16_type =
GetCastDataType(DataType::FLOAT16, DataType::FLOAT32);
MLUCnnl::Cast(ctx,
cast_fp16_type,
x_desc.get(),
GetBasePtr(x),
float_x_desc.get(),
GetBasePtr(&float_x));
MLUCnnl::Div(ctx,
CNNL_COMPUTATION_HIGH_PRECISION,
float_x_desc.get(),
GetBasePtr(&float_x),
scale_desc.get(),
GetBasePtr(scale),
float_out_desc.get(),
GetBasePtr(&float_out));
auto cast_fp32_type =
GetCastDataType(DataType::FLOAT32, DataType::FLOAT16);
MLUCnnl::Cast(ctx,
cast_fp32_type,
float_out_desc.get(),
GetBasePtr(&float_out),
out_desc.get(),
GetBasePtr(out));
} else {
MLUCnnl::Div(ctx,
CNNL_COMPUTATION_HIGH_PRECISION,
x_desc.get(),
GetBasePtr(x),
scale_desc.get(),
GetBasePtr(scale),
out_desc.get(),
GetBasePtr(out));
}
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(check_finite_and_unscale,
ops::CheckFiniteAndUnscaleMLUKernel<float>,
ops::CheckFiniteAndUnscaleMLUKernel<plat::float16>);
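For reference, the unscale logic deleted above (always compute in/scale, and flag found_inf when any input holds a NaN/Inf) can be sketched without any CNNL calls; this is a plain standalone C++ illustration, not the Paddle API, and the function and variable names are made up for the sketch:
#include <cmath>
#include <vector>
// Sketch: mark found_inf if any input holds a NaN/Inf, and always write x/scale
// to out (when found_inf is true the outputs are not meant to be consumed).
void CheckFiniteAndUnscaleSketch(const std::vector<std::vector<float>>& xs,
                                 float scale,
                                 std::vector<std::vector<float>>* outs,
                                 bool* found_inf) {
  *found_inf = false;
  outs->resize(xs.size());
  for (size_t i = 0; i < xs.size(); ++i) {
    (*outs)[i].resize(xs[i].size());
    for (size_t j = 0; j < xs[i].size(); ++j) {
      if (!std::isfinite(xs[i][j])) *found_inf = true;
      (*outs)[i][j] = xs[i][j] / scale;
    }
  }
}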
......@@ -21,9 +21,7 @@ limitations under the License. */
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/infermeta/unary.h"
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#endif
#include "paddle/fluid/prim/api/composite_backward/composite_backward_api.h"
#include "paddle/fluid/prim/utils/static/composite_grad_desc_maker.h"
#include "paddle/fluid/prim/utils/static/desc_tensor.h"
......@@ -119,21 +117,6 @@ class CastOp : public framework::OperatorWithKernel {
}
// NOTE(jiahongyu): The code above was originally enclosed by PADDLE_WITH_MKLDNN
#ifdef PADDLE_WITH_MLU
auto src_type = static_cast<VT::Type>(ctx.Attr<int>("in_dtype"));
auto dst_type = static_cast<VT::Type>(ctx.Attr<int>("out_dtype"));
if (src_type == dst_type || MLUSupportsCast(src_type, dst_type)) {
return phi::KernelKey(framework::TransToProtoVarType(tensor->dtype()),
tensor_place);
} else {
VLOG(3) << "MLU not support cast type: "
<< framework::DataTypeToString(src_type)
<< " to type: " << framework::DataTypeToString(dst_type)
<< ", fallbacking to CPU one!";
return phi::KernelKey(framework::TransToProtoVarType(tensor->dtype()),
platform::CPUPlace());
}
#endif
return phi::KernelKey(framework::TransToProtoVarType(tensor->dtype()),
tensor_place);
}
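The deleted branch amounts to a small placement decision: stay on the MLU only when the src->dst cast pair is supported there, otherwise fall back to the CPU kernel. A minimal sketch under that assumption (supports_cast is a stand-in predicate, not a real Paddle API):
#include <functional>
enum class PlaceKind { kMLU, kCPU };
// Sketch of the fallback rule removed above: keep the cast on the device only
// when the src->dst pair is supported there; otherwise run the CPU kernel.
PlaceKind ChooseCastPlace(int src_type, int dst_type,
                          const std::function<bool(int, int)>& supports_cast) {
  if (src_type == dst_type || supports_cast(src_type, dst_type)) {
    return PlaceKind::kMLU;
  }
  return PlaceKind::kCPU;
}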
......
......@@ -23,9 +23,6 @@
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/fluid/framework/convert_utils.h"
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#endif
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/phi/infermeta/multiary.h"
......@@ -57,17 +54,8 @@ struct FillConstantVisitor {
void apply(typename std::enable_if<!(std::is_same<T, int8_t>::value ||
std::is_same<T, int16_t>::value)>::type
* = nullptr) const {
#if defined(PADDLE_WITH_MLU)
if (platform::is_mlu_place(context_.GetPlace())) {
FillMLUTensorWithHostValue<T>(context_, static_cast<T>(value_), tensor_);
} else {
phi::funcs::SetConstant<DeviceContext, T> set_constant;
set_constant(dev_ctx_, tensor_, static_cast<T>(value_));
}
#else
phi::funcs::SetConstant<DeviceContext, T> set_constant;
set_constant(dev_ctx_, tensor_, static_cast<T>(value_));
#endif
}
const DeviceContext &dev_ctx_;
......@@ -509,14 +497,6 @@ REGISTER_OPERATOR(coalesce_tensor,
namespace ops = paddle::operators;
namespace plat = paddle::platform;
#if defined(PADDLE_WITH_MLU)
REGISTER_OP_MLU_KERNEL(
coalesce_tensor,
ops::CoalesceTensorOpKernel<phi::CPUContext, plat::float16>,
ops::CoalesceTensorOpKernel<phi::CPUContext, int>,
ops::CoalesceTensorOpKernel<phi::CPUContext, float>);
#endif
REGISTER_OP_VERSION(coalesce_tensor)
.AddCheckpoint(
R"ROC(
......
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/barrier_op.h"
#if defined(PADDLE_WITH_CNCL)
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/device/mlu/cncl_helper.h"
#endif
namespace paddle {
namespace operators {
template <typename T>
class BarrierOpMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
#if defined(PADDLE_WITH_CNCL)
auto in = ctx.Input<phi::DenseTensor>("X");
auto out = ctx.Output<phi::DenseTensor>("Out");
auto place = ctx.GetPlace();
cnclDataType_t dtype =
platform::ToCNCLDataType(framework::TransToProtoVarType(in->dtype()));
int64_t numel = in->numel();
const void* sendbuff = in->data();
void* recvbuff = out->mutable_data<T>(place);
int rid = ctx.Attr<int>("ring_id");
auto cncl_comm = platform::CNCLCommContext::Instance().Get(rid, place);
auto* comm = cncl_comm->comm();
auto comm_stream = cncl_comm->stream();
auto& dev_ctx =
ctx.template device_context<paddle::platform::MLUDeviceContext>();
cnclReduceOp_t cncl_red_type = cnclSum;
dev_ctx.Wait();
PADDLE_ENFORCE_MLU_SUCCESS(cnclAllReduce(
sendbuff, recvbuff, numel, dtype, cncl_red_type, comm, comm_stream));
PADDLE_ENFORCE_MLU_SUCCESS(cnrtQueueSync(comm_stream));
#else
PADDLE_THROW(platform::errors::Unavailable(
"PaddlePaddle should compile with CNCL."));
#endif
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(barrier, ops::BarrierOpMLUKernel<int>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/c_allgather_op.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#if defined(PADDLE_WITH_CNCL)
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/device/mlu/cncl_helper.h"
#endif
#include "paddle/fluid/framework/convert_utils.h"
namespace paddle {
namespace operators {
template <typename T>
class CAllGatherOpMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto place = ctx.GetPlace();
auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
#if defined(PADDLE_WITH_CNCL)
auto x = ctx.Input<phi::DenseTensor>("X");
auto out = ctx.Output<phi::DenseTensor>("Out");
int nranks = ctx.Attr<int>("nranks");
int rid = ctx.Attr<int>("ring_id");
auto comm = platform::CNCLCommContext::Instance().Get(rid, place);
PADDLE_ENFORCE_EQ(
nranks,
comm->nranks(),
platform::errors::InvalidArgument(
"nranks: %s should equal to %s", nranks, comm->nranks()));
framework::DDim out_dims = x->dims();
out_dims[0] *= nranks;
out->mutable_data<T>(out_dims, place);
uint32_t send_numel = x->numel();
void* send_buff;
void* recv_buff;
phi::DenseTensor in_tensor, out_tensor;
if (framework::TransToProtoVarType(x->dtype()) ==
framework::proto::VarType::INT64) {
// cast from int64 to int32 since cncl does not support int64
in_tensor.mutable_data<int32_t>(x->dims(), place);
out_tensor.mutable_data<int32_t>(out->dims(), place);
MLUCnnlTensorDesc x_int64_desc(*x);
MLUCnnlTensorDesc x_int32_desc(in_tensor);
cnnlCastDataType_t cast_type = GetCastDataType(VT::INT64, VT::INT32);
MLUCnnl::Cast(ctx,
cast_type,
x_int64_desc.get(),
GetBasePtr(x),
x_int32_desc.get(),
GetBasePtr(&in_tensor));
send_buff = reinterpret_cast<void*>(in_tensor.data<int32_t>());
recv_buff = reinterpret_cast<void*>(out_tensor.data<int32_t>());
} else {
in_tensor.ShareDataWith(*x);
out_tensor.ShareDataWith(*out);
send_buff = reinterpret_cast<void*>(in_tensor.data<T>());
recv_buff = reinterpret_cast<void*>(out_tensor.data<T>());
}
mluStream stream = nullptr;
if (ctx.Attr<bool>("use_calc_stream")) {
stream = static_cast<platform::MLUDeviceContext*>(dev_ctx)->stream();
} else {
stream = comm->stream();
}
cnclDataType_t dtype = platform::ToCNCLDataType(
framework::TransToProtoVarType(in_tensor.dtype()));
PADDLE_ENFORCE_MLU_SUCCESS(cnclAllGather(
send_buff, recv_buff, send_numel, dtype, comm->comm(), stream));
if (framework::TransToProtoVarType(x->dtype()) ==
framework::proto::VarType::INT64) {
// cast the int32 out_tensor back to int64 and write it into out
MLUCnnlTensorDesc out_int64_desc(*out);
MLUCnnlTensorDesc out_int32_desc(out_tensor);
cnnlCastDataType_t cast_type = GetCastDataType(VT::INT32, VT::INT64);
MLUCnnl::Cast(ctx,
cast_type,
out_int32_desc.get(),
GetBasePtr(&out_tensor),
out_int64_desc.get(),
GetBasePtr(out));
}
#else
PADDLE_THROW(platform::errors::PreconditionNotMet(
"PaddlePaddle should compile with MLU."));
#endif
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(c_allgather,
ops::CAllGatherOpMLUKernel<float>,
ops::CAllGatherOpMLUKernel<uint8_t>,
ops::CAllGatherOpMLUKernel<int>,
ops::CAllGatherOpMLUKernel<int8_t>,
ops::CAllGatherOpMLUKernel<int16_t>,
ops::CAllGatherOpMLUKernel<int64_t>,
ops::CAllGatherOpMLUKernel<plat::float16>);
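Two details of the removed c_allgather kernel are easy to miss: the output's leading dimension is the input's multiplied by nranks, and int64 data is round-tripped through int32 because CNCL has no int64 type. A standalone sketch of that bookkeeping, with the collective mocked by concatenation (names are illustrative and values are assumed to fit in 32 bits):
#include <cstdint>
#include <vector>
// Mocked all-gather: every rank contributes a same-size buffer and the result
// is the concatenation over ranks, so the leading dimension grows by nranks.
std::vector<int32_t> AllGatherMock(
    const std::vector<std::vector<int32_t>>& per_rank) {
  std::vector<int32_t> out;
  for (const auto& r : per_rank) out.insert(out.end(), r.begin(), r.end());
  return out;
}
// int64 workaround used by the deleted kernel: narrow to int32 before the
// collective and widen back afterwards.
std::vector<int64_t> AllGatherInt64(
    const std::vector<std::vector<int64_t>>& per_rank) {
  std::vector<std::vector<int32_t>> narrowed(per_rank.size());
  for (size_t i = 0; i < per_rank.size(); ++i)
    narrowed[i].assign(per_rank[i].begin(), per_rank[i].end());
  std::vector<int32_t> gathered = AllGatherMock(narrowed);
  return std::vector<int64_t>(gathered.begin(), gathered.end());
}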
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/c_allreduce_op.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(c_allreduce_max,
ops::CAllReduceOpMLUKernel<ops::kRedMax, float>,
ops::CAllReduceOpMLUKernel<ops::kRedMax, plat::float16>,
ops::CAllReduceOpMLUKernel<ops::kRedMax, int>,
ops::CAllReduceOpMLUKernel<ops::kRedMax, int16_t>,
ops::CAllReduceOpMLUKernel<ops::kRedMax, int8_t>,
ops::CAllReduceOpMLUKernel<ops::kRedMax, uint8_t>)
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/c_allreduce_op.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(c_allreduce_min,
ops::CAllReduceOpMLUKernel<ops::kRedMin, float>,
ops::CAllReduceOpMLUKernel<ops::kRedMin, plat::float16>,
ops::CAllReduceOpMLUKernel<ops::kRedMin, int>,
ops::CAllReduceOpMLUKernel<ops::kRedMin, int16_t>,
ops::CAllReduceOpMLUKernel<ops::kRedMin, int8_t>,
ops::CAllReduceOpMLUKernel<ops::kRedMin, uint8_t>)
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/c_allreduce_op.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(c_allreduce_prod,
ops::CAllReduceOpMLUKernel<ops::kRedProd, float>,
ops::CAllReduceOpMLUKernel<ops::kRedProd, plat::float16>,
ops::CAllReduceOpMLUKernel<ops::kRedProd, int>,
ops::CAllReduceOpMLUKernel<ops::kRedProd, int16_t>,
ops::CAllReduceOpMLUKernel<ops::kRedProd, int8_t>,
ops::CAllReduceOpMLUKernel<ops::kRedProd, uint8_t>)
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/c_allreduce_op.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(c_allreduce_sum,
ops::CAllReduceOpMLUKernel<ops::kRedSum, float>,
ops::CAllReduceOpMLUKernel<ops::kRedSum, plat::float16>,
ops::CAllReduceOpMLUKernel<ops::kRedSum, int>,
ops::CAllReduceOpMLUKernel<ops::kRedSum, int16_t>,
ops::CAllReduceOpMLUKernel<ops::kRedSum, int8_t>,
ops::CAllReduceOpMLUKernel<ops::kRedSum, uint8_t>)
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/c_broadcast_op.h"
#if defined(PADDLE_WITH_CNCL)
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/device/mlu/cncl_helper.h"
#endif
namespace paddle {
namespace operators {
template <typename T>
class CBroadcastOPMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
#if defined(PADDLE_WITH_CNCL)
auto x = ctx.Input<phi::DenseTensor>("X");
auto out = ctx.Output<phi::DenseTensor>("Out");
int numel = x->numel();
cnclDataType_t dtype =
platform::ToCNCLDataType(framework::TransToProtoVarType(x->dtype()));
int rid = ctx.Attr<int>("ring_id");
auto place = ctx.GetPlace();
auto comm = platform::CNCLCommContext::Instance().Get(rid, place);
mluStream stream = nullptr;
if (ctx.Attr<bool>("use_calc_stream")) {
auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
stream = static_cast<platform::MLUDeviceContext*>(dev_ctx)->stream();
} else {
stream = comm->stream();
}
int root = ctx.Attr<int>("root");
if (root == comm->rank()) {
PADDLE_ENFORCE_MLU_SUCCESS(
cnclBcast(reinterpret_cast<void*>(const_cast<T*>(x->data<T>())),
numel,
dtype,
root,
comm->comm(),
stream));
VLOG(3) << "rank " << comm->rank() << " invoke Bcast. sent "
<< x->numel();
if (out != x) {
framework::TensorCopy(
*static_cast<const phi::DenseTensor*>(x),
place,
*platform::DeviceContextPool::Instance().Get(place),
static_cast<phi::DenseTensor*>(out));
}
} else {
PADDLE_ENFORCE_MLU_SUCCESS(cnclBcast(out->mutable_data<T>(place),
numel,
dtype,
root,
comm->comm(),
stream));
VLOG(3) << "rank " << comm->rank() << " invoke Bcast. received "
<< phi::product(out->dims());
}
out->Resize(x->dims());
out->set_lod(x->lod());
#else
PADDLE_THROW(platform::errors::PreconditionNotMet(
"PaddlePaddle should compile with MLU."));
#endif
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(c_broadcast,
ops::CBroadcastOPMLUKernel<float>,
ops::CBroadcastOPMLUKernel<plat::float16>,
ops::CBroadcastOPMLUKernel<int>,
ops::CBroadcastOPMLUKernel<int16_t>,
ops::CBroadcastOPMLUKernel<int8_t>,
ops::CBroadcastOPMLUKernel<uint8_t>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/c_reduce_op.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(c_reduce_max,
ops::CReduceOpMLUKernel<ops::kRedMax, float>,
ops::CReduceOpMLUKernel<ops::kRedMax, plat::float16>,
ops::CReduceOpMLUKernel<ops::kRedMax, int>,
ops::CReduceOpMLUKernel<ops::kRedMax, int16_t>,
ops::CReduceOpMLUKernel<ops::kRedMax, int8_t>,
ops::CReduceOpMLUKernel<ops::kRedMax, uint8_t>)
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/c_reduce_op.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(c_reduce_min,
ops::CReduceOpMLUKernel<ops::kRedMin, float>,
ops::CReduceOpMLUKernel<ops::kRedMin, plat::float16>,
ops::CReduceOpMLUKernel<ops::kRedMin, int>,
ops::CReduceOpMLUKernel<ops::kRedMin, int16_t>,
ops::CReduceOpMLUKernel<ops::kRedMin, int8_t>,
ops::CReduceOpMLUKernel<ops::kRedMin, uint8_t>)
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/c_reduce_op.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(c_reduce_prod,
ops::CReduceOpMLUKernel<ops::kRedProd, float>,
ops::CReduceOpMLUKernel<ops::kRedProd, plat::float16>,
ops::CReduceOpMLUKernel<ops::kRedProd, int>,
ops::CReduceOpMLUKernel<ops::kRedProd, int16_t>,
ops::CReduceOpMLUKernel<ops::kRedProd, int8_t>,
ops::CReduceOpMLUKernel<ops::kRedProd, uint8_t>)
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/c_reduce_op.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(c_reduce_sum,
ops::CReduceOpMLUKernel<ops::kRedSum, float>,
ops::CReduceOpMLUKernel<ops::kRedSum, plat::float16>,
ops::CReduceOpMLUKernel<ops::kRedSum, int>,
ops::CReduceOpMLUKernel<ops::kRedSum, int16_t>,
ops::CReduceOpMLUKernel<ops::kRedSum, int8_t>,
ops::CReduceOpMLUKernel<ops::kRedSum, uint8_t>)
......@@ -33,22 +33,7 @@ Call calculation stream synchronization.
namespace ops = paddle::operators;
namespace plat = paddle::platform;
using MLU = plat::MLUPlace;
REGISTER_OP_WITHOUT_GRADIENT(c_sync_calc_stream,
ops::CSyncCalcStreamOp,
ops::CSyncCalcStreamOpMaker);
REGISTER_OP_NPU_KERNEL(c_sync_calc_stream,
ops::CSyncCalcStreamKernel<float, MLU>,
ops::CSyncCalcStreamKernel<double, MLU>,
ops::CSyncCalcStreamKernel<int, MLU>,
ops::CSyncCalcStreamKernel<int64_t, MLU>,
ops::CSyncCalcStreamKernel<plat::float16, MLU>);
REGISTER_OP_MLU_KERNEL(c_sync_calc_stream,
ops::CSyncCalcStreamKernel<float, MLU>,
ops::CSyncCalcStreamKernel<double, MLU>,
ops::CSyncCalcStreamKernel<int, MLU>,
ops::CSyncCalcStreamKernel<int64_t, MLU>,
ops::CSyncCalcStreamKernel<plat::float16, MLU>);
......@@ -56,6 +56,3 @@ REGISTER_OP_WITHOUT_GRADIENT(c_sync_comm_stream,
REGISTER_OP_NPU_KERNEL(c_sync_comm_stream,
ops::CSyncCommStreamKernel<float, plat::NPUPlace>);
REGISTER_OP_MLU_KERNEL(c_sync_comm_stream,
ops::CSyncCommStreamKernel<float, plat::MLUPlace>);
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/collective/c_allreduce_op.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(mp_allreduce_sum,
ops::CAllReduceOpMLUKernel<ops::kRedSum, float>,
ops::CAllReduceOpMLUKernel<ops::kRedSum, plat::float16>,
ops::CAllReduceOpMLUKernel<ops::kRedSum, int>,
ops::CAllReduceOpMLUKernel<ops::kRedSum, int16_t>,
ops::CAllReduceOpMLUKernel<ops::kRedSum, int8_t>,
ops::CAllReduceOpMLUKernel<ops::kRedSum, uint8_t>)
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class EqualMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* y = ctx.Input<phi::DenseTensor>("Y");
auto* out = ctx.Output<phi::DenseTensor>("Out");
out->mutable_data<bool>(ctx.GetPlace());
MLUCnnlTensorDesc input_x(
*x, CNNL_LAYOUT_ARRAY, ToCnnlDataType(x->dtype()));
MLUCnnlTensorDesc input_y(
*y, CNNL_LAYOUT_ARRAY, ToCnnlDataType(y->dtype()));
MLUCnnlTensorDesc output(
*out, CNNL_LAYOUT_ARRAY, ToCnnlDataType(out->dtype()));
MLUCnnl::Logic(ctx,
CNNL_LOGIC_OP_EQ,
input_x.get(),
GetBasePtr(x),
input_y.get(),
GetBasePtr(y),
output.get(),
GetBasePtr(out));
}
};
template <typename DeviceContext, typename T>
class NotEqualMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* y = ctx.Input<phi::DenseTensor>("Y");
auto* out = ctx.Output<phi::DenseTensor>("Out");
out->mutable_data<bool>(ctx.GetPlace());
MLUCnnlTensorDesc input_x(
*x, CNNL_LAYOUT_ARRAY, ToCnnlDataType(x->dtype()));
MLUCnnlTensorDesc input_y(
*y, CNNL_LAYOUT_ARRAY, ToCnnlDataType(y->dtype()));
MLUCnnlTensorDesc output(
*out, CNNL_LAYOUT_ARRAY, ToCnnlDataType(out->dtype()));
MLUCnnl::Logic(ctx,
CNNL_LOGIC_OP_NE,
input_x.get(),
GetBasePtr(x),
input_y.get(),
GetBasePtr(y),
output.get(),
GetBasePtr(out));
}
};
template <typename DeviceContext, typename T>
class LessThanMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* y = ctx.Input<phi::DenseTensor>("Y");
auto* out = ctx.Output<phi::DenseTensor>("Out");
out->mutable_data<bool>(ctx.GetPlace());
MLUCnnlTensorDesc input_x(
*x, CNNL_LAYOUT_ARRAY, ToCnnlDataType(x->dtype()));
MLUCnnlTensorDesc input_y(
*y, CNNL_LAYOUT_ARRAY, ToCnnlDataType(y->dtype()));
MLUCnnlTensorDesc output(
*out, CNNL_LAYOUT_ARRAY, ToCnnlDataType(out->dtype()));
MLUCnnl::Logic(ctx,
CNNL_LOGIC_OP_LT,
input_x.get(),
GetBasePtr(x),
input_y.get(),
GetBasePtr(y),
output.get(),
GetBasePtr(out));
}
};
template <typename DeviceContext, typename T>
class LessEqualMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* y = ctx.Input<phi::DenseTensor>("Y");
auto* out = ctx.Output<phi::DenseTensor>("Out");
out->mutable_data<bool>(ctx.GetPlace());
MLUCnnlTensorDesc input_x(
*x, CNNL_LAYOUT_ARRAY, ToCnnlDataType(x->dtype()));
MLUCnnlTensorDesc input_y(
*y, CNNL_LAYOUT_ARRAY, ToCnnlDataType(y->dtype()));
MLUCnnlTensorDesc output(
*out, CNNL_LAYOUT_ARRAY, ToCnnlDataType(out->dtype()));
MLUCnnl::Logic(ctx,
CNNL_LOGIC_OP_LE,
input_x.get(),
GetBasePtr(x),
input_y.get(),
GetBasePtr(y),
output.get(),
GetBasePtr(out));
}
};
template <typename DeviceContext, typename T>
class GreaterThanMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* y = ctx.Input<phi::DenseTensor>("Y");
auto* out = ctx.Output<phi::DenseTensor>("Out");
out->mutable_data<bool>(ctx.GetPlace());
MLUCnnlTensorDesc input_x(
*x, CNNL_LAYOUT_ARRAY, ToCnnlDataType(x->dtype()));
MLUCnnlTensorDesc input_y(
*y, CNNL_LAYOUT_ARRAY, ToCnnlDataType(y->dtype()));
MLUCnnlTensorDesc output(
*out, CNNL_LAYOUT_ARRAY, ToCnnlDataType(out->dtype()));
MLUCnnl::Logic(ctx,
CNNL_LOGIC_OP_GT,
input_x.get(),
GetBasePtr(x),
input_y.get(),
GetBasePtr(y),
output.get(),
GetBasePtr(out));
}
};
template <typename DeviceContext, typename T>
class GreaterEqualMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* y = ctx.Input<phi::DenseTensor>("Y");
auto* out = ctx.Output<phi::DenseTensor>("Out");
out->mutable_data<bool>(ctx.GetPlace());
MLUCnnlTensorDesc input_x(
*x, CNNL_LAYOUT_ARRAY, ToCnnlDataType(x->dtype()));
MLUCnnlTensorDesc input_y(
*y, CNNL_LAYOUT_ARRAY, ToCnnlDataType(y->dtype()));
MLUCnnlTensorDesc output(
*out, CNNL_LAYOUT_ARRAY, ToCnnlDataType(out->dtype()));
MLUCnnl::Logic(ctx,
CNNL_LOGIC_OP_GE,
input_x.get(),
GetBasePtr(x),
input_y.get(),
GetBasePtr(y),
output.get(),
GetBasePtr(out));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(
equal,
ops::EqualMLUKernel<plat::MLUDeviceContext, plat::float16>,
ops::EqualMLUKernel<plat::MLUDeviceContext, float>,
ops::EqualMLUKernel<plat::MLUDeviceContext, int8_t>,
ops::EqualMLUKernel<plat::MLUDeviceContext, uint8_t>,
ops::EqualMLUKernel<plat::MLUDeviceContext, int16_t>,
ops::EqualMLUKernel<plat::MLUDeviceContext, int>,
ops::EqualMLUKernel<plat::MLUDeviceContext, bool>);
REGISTER_OP_MLU_KERNEL(
not_equal,
ops::NotEqualMLUKernel<plat::MLUDeviceContext, plat::float16>,
ops::NotEqualMLUKernel<plat::MLUDeviceContext, float>,
ops::NotEqualMLUKernel<plat::MLUDeviceContext, int8_t>,
ops::NotEqualMLUKernel<plat::MLUDeviceContext, uint8_t>,
ops::NotEqualMLUKernel<plat::MLUDeviceContext, int16_t>,
ops::NotEqualMLUKernel<plat::MLUDeviceContext, int>,
ops::NotEqualMLUKernel<plat::MLUDeviceContext, bool>);
REGISTER_OP_MLU_KERNEL(
less_than,
ops::LessThanMLUKernel<plat::MLUDeviceContext, plat::float16>,
ops::LessThanMLUKernel<plat::MLUDeviceContext, float>,
ops::LessThanMLUKernel<plat::MLUDeviceContext, int8_t>,
ops::LessThanMLUKernel<plat::MLUDeviceContext, uint8_t>,
ops::LessThanMLUKernel<plat::MLUDeviceContext, int16_t>,
ops::LessThanMLUKernel<plat::MLUDeviceContext, int>,
ops::LessThanMLUKernel<plat::MLUDeviceContext, bool>);
REGISTER_OP_MLU_KERNEL(
less_equal,
ops::LessEqualMLUKernel<plat::MLUDeviceContext, plat::float16>,
ops::LessEqualMLUKernel<plat::MLUDeviceContext, float>,
ops::LessEqualMLUKernel<plat::MLUDeviceContext, int8_t>,
ops::LessEqualMLUKernel<plat::MLUDeviceContext, uint8_t>,
ops::LessEqualMLUKernel<plat::MLUDeviceContext, int16_t>,
ops::LessEqualMLUKernel<plat::MLUDeviceContext, int>,
ops::LessEqualMLUKernel<plat::MLUDeviceContext, bool>);
REGISTER_OP_MLU_KERNEL(
greater_than,
ops::GreaterThanMLUKernel<plat::MLUDeviceContext, plat::float16>,
ops::GreaterThanMLUKernel<plat::MLUDeviceContext, float>,
ops::GreaterThanMLUKernel<plat::MLUDeviceContext, int8_t>,
ops::GreaterThanMLUKernel<plat::MLUDeviceContext, uint8_t>,
ops::GreaterThanMLUKernel<plat::MLUDeviceContext, int16_t>,
ops::GreaterThanMLUKernel<plat::MLUDeviceContext, int>,
ops::GreaterThanMLUKernel<plat::MLUDeviceContext, bool>);
REGISTER_OP_MLU_KERNEL(
greater_equal,
ops::GreaterEqualMLUKernel<plat::MLUDeviceContext, plat::float16>,
ops::GreaterEqualMLUKernel<plat::MLUDeviceContext, float>,
ops::GreaterEqualMLUKernel<plat::MLUDeviceContext, int8_t>,
ops::GreaterEqualMLUKernel<plat::MLUDeviceContext, uint8_t>,
ops::GreaterEqualMLUKernel<plat::MLUDeviceContext, int16_t>,
ops::GreaterEqualMLUKernel<plat::MLUDeviceContext, int>,
ops::GreaterEqualMLUKernel<plat::MLUDeviceContext, bool>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
template <typename T, cnnlLogicOp_t log_method>
class LogicalMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* y = ctx.Input<phi::DenseTensor>("Y");
auto* out = ctx.Output<phi::DenseTensor>("Out");
out->mutable_data<T>(ctx.GetPlace());
if (log_method == CNNL_LOGIC_OP_NOT) {
y = x;
}
MLUCnnlTensorDesc x_desc(*x);
MLUCnnlTensorDesc y_desc(*y);
MLUCnnlTensorDesc out_desc(*out);
MLUCnnl::Logic(ctx,
log_method,
x_desc.get(),
GetBasePtr(x),
y_desc.get(),
GetBasePtr(y),
out_desc.get(),
GetBasePtr(out));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_MLU_KERNEL(logical_not,
ops::LogicalMLUKernel<bool, CNNL_LOGIC_OP_NOT>,
ops::LogicalMLUKernel<int8_t, CNNL_LOGIC_OP_NOT>,
ops::LogicalMLUKernel<int16_t, CNNL_LOGIC_OP_NOT>,
ops::LogicalMLUKernel<int, CNNL_LOGIC_OP_NOT>,
ops::LogicalMLUKernel<float, CNNL_LOGIC_OP_NOT>);
REGISTER_OP_MLU_KERNEL(logical_and,
ops::LogicalMLUKernel<bool, CNNL_LOGIC_OP_AND>,
ops::LogicalMLUKernel<int8_t, CNNL_LOGIC_OP_AND>,
ops::LogicalMLUKernel<int16_t, CNNL_LOGIC_OP_AND>,
ops::LogicalMLUKernel<int, CNNL_LOGIC_OP_AND>,
ops::LogicalMLUKernel<float, CNNL_LOGIC_OP_AND>);
REGISTER_OP_MLU_KERNEL(logical_or,
ops::LogicalMLUKernel<bool, CNNL_LOGIC_OP_OR>,
ops::LogicalMLUKernel<int8_t, CNNL_LOGIC_OP_OR>,
ops::LogicalMLUKernel<int16_t, CNNL_LOGIC_OP_OR>,
ops::LogicalMLUKernel<int, CNNL_LOGIC_OP_OR>,
ops::LogicalMLUKernel<float, CNNL_LOGIC_OP_OR>);
REGISTER_OP_MLU_KERNEL(logical_xor,
ops::LogicalMLUKernel<bool, CNNL_LOGIC_OP_XOR>,
ops::LogicalMLUKernel<int8_t, CNNL_LOGIC_OP_XOR>,
ops::LogicalMLUKernel<int16_t, CNNL_LOGIC_OP_XOR>,
ops::LogicalMLUKernel<int, CNNL_LOGIC_OP_XOR>,
ops::LogicalMLUKernel<float, CNNL_LOGIC_OP_XOR>);
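All four logical ops above share one binary Logic entry point; logical_not reuses it by aliasing the second operand to the first (the `y = x;` line). A minimal standalone sketch of that dispatch idea (enum and function names are illustrative, not CNNL symbols):
#include <vector>
enum class LogicOp { kNot, kAnd, kOr, kXor };
// Sketch: one binary evaluator serves all ops; for NOT the second operand is
// simply aliased to the first, mirroring the deleted `y = x`.
std::vector<bool> LogicSketch(LogicOp op, const std::vector<bool>& x,
                              const std::vector<bool>& y_in) {
  const std::vector<bool>& y = (op == LogicOp::kNot) ? x : y_in;
  std::vector<bool> out(x.size());
  for (size_t i = 0; i < x.size(); ++i) {
    switch (op) {
      case LogicOp::kNot: out[i] = !x[i]; break;
      case LogicOp::kAnd: out[i] = x[i] && y[i]; break;
      case LogicOp::kOr:  out[i] = x[i] || y[i]; break;
      case LogicOp::kXor: out[i] = x[i] != y[i]; break;
    }
  }
  return out;
}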
......@@ -38,11 +38,6 @@ if(WITH_XPU)
detection_library(prior_box_op SRCS prior_box_op.cc)
detection_library(yolo_box_op SRCS yolo_box_op.cc)
detection_library(generate_proposals_v2_op SRCS generate_proposals_v2_op.cc)
elseif(WITH_MLU)
detection_library(iou_similarity_op SRCS iou_similarity_op.cc
iou_similarity_op_mlu.cc)
detection_library(prior_box_op SRCS prior_box_op.cc prior_box_op_mlu.cc)
detection_library(yolo_box_op SRCS yolo_box_op.cc yolo_box_op_mlu.cc)
else()
detection_library(iou_similarity_op SRCS iou_similarity_op.cc
iou_similarity_op.cu)
......
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/detection/iou_similarity_op.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
template <typename T>
struct IouFunction {
public:
explicit IouFunction(const framework::ExecutionContext& ctx) : ctx(ctx) {
place = ctx.GetPlace();
}
void Transpose(const phi::DenseTensor* x,
phi::DenseTensor* y,
const std::vector<int>& axis) {
// y should be init first
TransposeFromMLUTensor<T>(ctx, axis, x, y, false /*need_reshape_or_alloc*/);
}
void Add(const phi::DenseTensor* x,
const phi::DenseTensor* y,
phi::DenseTensor* z) {
// z should be initialized first
MLUCnnlTensorDesc x_desc(*x);
MLUCnnlTensorDesc y_desc(*y);
MLUCnnlTensorDesc z_desc(*z);
MLUCnnlOpTensorDesc add_op_desc(
CNNL_OP_TENSOR_ADD, ToCnnlDataType<T>(), CNNL_NOT_PROPAGATE_NAN);
MLUCnnl::OpTensor(ctx,
add_op_desc.get(),
x_desc.get(),
GetBasePtr(x),
y_desc.get(),
GetBasePtr(y),
z_desc.get(),
GetBasePtr(z),
ToCnnlDataType<T>());
}
void Sub(const phi::DenseTensor* x,
const phi::DenseTensor* y,
phi::DenseTensor* z) {
// z should be initialized first
MLUCnnlTensorDesc x_desc(*x);
MLUCnnlTensorDesc y_desc(*y);
MLUCnnlTensorDesc z_desc(*z);
MLUCnnlOpTensorDesc sub_op_desc(
CNNL_OP_TENSOR_SUB, ToCnnlDataType<T>(), CNNL_NOT_PROPAGATE_NAN);
MLUCnnl::OpTensor(ctx,
sub_op_desc.get(),
x_desc.get(),
GetBasePtr(x),
y_desc.get(),
GetBasePtr(y),
z_desc.get(),
GetBasePtr(z),
ToCnnlDataType<T>());
}
void Mul(const phi::DenseTensor* x,
const phi::DenseTensor* y,
phi::DenseTensor* z) {
// z should be init first
MLUCnnlTensorDesc x_desc(*x);
MLUCnnlTensorDesc y_desc(*y);
MLUCnnlTensorDesc z_desc(*z);
MLUCnnlOpTensorDesc mul_op_desc(
CNNL_OP_TENSOR_MUL, ToCnnlDataType<T>(), CNNL_NOT_PROPAGATE_NAN);
MLUCnnl::OpTensor(ctx,
mul_op_desc.get(),
x_desc.get(),
GetBasePtr(x),
y_desc.get(),
GetBasePtr(y),
z_desc.get(),
GetBasePtr(z),
ToCnnlDataType<T>());
}
void DivNoNan(const phi::DenseTensor* x,
const phi::DenseTensor* y,
phi::DenseTensor* z) {
// z should be init first
MLUCnnlTensorDesc x_desc(*x);
MLUCnnlTensorDesc y_desc(*y);
MLUCnnlTensorDesc z_desc(*z);
cnnlComputationPreference_t prefer = CNNL_COMPUTATION_FAST;
MLUCnnl::DivNoNan(ctx,
prefer,
x_desc.get(),
GetBasePtr(x),
y_desc.get(),
GetBasePtr(y),
z_desc.get(),
GetBasePtr(z));
}
void Adds(const phi::DenseTensor* x, float scalar, phi::DenseTensor* y) {
// y should be init first
MLUCnnlTensorDesc x_desc(*x);
MLUCnnlTensorDesc y_desc(*y);
float alpha = 1.0;
float beta = scalar;
MLUCnnl::Transform(ctx,
&alpha,
&beta,
x_desc.get(),
GetBasePtr(x),
y_desc.get(),
GetBasePtr(y));
}
void Maximum(const phi::DenseTensor* x,
const phi::DenseTensor* y,
phi::DenseTensor* z) {
// z should be init first
MLUCnnlTensorDesc x_desc(*x);
MLUCnnlTensorDesc y_desc(*y);
MLUCnnlTensorDesc z_desc(*z);
MLUCnnl::Maximum(ctx,
x_desc.get(),
GetBasePtr(x),
y_desc.get(),
GetBasePtr(y),
z_desc.get(),
GetBasePtr(z));
}
void Minimum(const phi::DenseTensor* x,
const phi::DenseTensor* y,
phi::DenseTensor* z) {
// z should be init first
MLUCnnlTensorDesc x_desc(*x);
MLUCnnlTensorDesc y_desc(*y);
MLUCnnlTensorDesc z_desc(*z);
MLUCnnl::Minimum(ctx,
x_desc.get(),
GetBasePtr(x),
y_desc.get(),
GetBasePtr(y),
z_desc.get(),
GetBasePtr(z));
}
private:
platform::Place place;
const framework::ExecutionContext& ctx;
};
template <typename T>
class IouSimilarityMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* y = ctx.Input<phi::DenseTensor>("Y");
bool normalized = ctx.Attr<bool>("box_normalized");
auto* out = ctx.Output<phi::DenseTensor>("Out");
auto _type = x->dtype();
auto place = ctx.GetPlace();
IouFunction<T> F(ctx);
auto N = x->dims()[0];
auto M = y->dims()[0];
out->mutable_data<T>({N, M}, place);
phi::DenseTensor xt(_type);
phi::DenseTensor yt(_type);
xt.mutable_data<T>({4, N}, place);
yt.mutable_data<T>({4, M}, place);
std::vector<int> vec_trans = {1, 0};
F.Transpose(x, &xt, vec_trans);
F.Transpose(y, &yt, vec_trans);
phi::DenseTensor xmin1 = xt.Slice(0, 1);
phi::DenseTensor ymin1 = xt.Slice(1, 2);
phi::DenseTensor xmax1 = xt.Slice(2, 3);
phi::DenseTensor ymax1 = xt.Slice(3, 4);
phi::DenseTensor xmin2 = yt.Slice(0, 1);
phi::DenseTensor ymin2 = yt.Slice(1, 2);
phi::DenseTensor xmax2 = yt.Slice(2, 3);
phi::DenseTensor ymax2 = yt.Slice(3, 4);
xmin1.Resize({N, 1});
ymin1.Resize({N, 1});
xmax1.Resize({N, 1});
ymax1.Resize({N, 1});
xmin2.Resize({1, M});
ymin2.Resize({1, M});
xmax2.Resize({1, M});
ymax2.Resize({1, M});
phi::DenseTensor w1(_type);
phi::DenseTensor h1(_type);
phi::DenseTensor w2(_type);
phi::DenseTensor h2(_type);
phi::DenseTensor area1(_type);
phi::DenseTensor area2(_type);
w1.mutable_data<T>({N, 1}, place);
h1.mutable_data<T>({N, 1}, place);
w2.mutable_data<T>({1, M}, place);
h2.mutable_data<T>({1, M}, place);
area1.mutable_data<T>({N, 1}, place);
area2.mutable_data<T>({1, M}, place);
F.Sub(&xmax1, &xmin1, &w1);
F.Sub(&ymax1, &ymin1, &h1);
F.Sub(&xmax2, &xmin2, &w2);
F.Sub(&ymax2, &ymin2, &h2);
if (!normalized) {
F.Adds(&w1, 1.0f, &w1);
F.Adds(&h1, 1.0f, &h1);
F.Adds(&w2, 1.0f, &w2);
F.Adds(&h2, 1.0f, &h2);
}
F.Mul(&w1, &h1, &area1);
F.Mul(&w2, &h2, &area2);
phi::DenseTensor inter_xmax(_type);
phi::DenseTensor inter_ymax(_type);
phi::DenseTensor inter_xmin(_type);
phi::DenseTensor inter_ymin(_type);
inter_xmax.mutable_data<T>({N, M}, place);
inter_ymax.mutable_data<T>({N, M}, place);
inter_xmin.mutable_data<T>({N, M}, place);
inter_ymin.mutable_data<T>({N, M}, place);
F.Minimum(&xmax1, &xmax2, &inter_xmax);
F.Minimum(&ymax1, &ymax2, &inter_ymax);
F.Maximum(&xmin1, &xmin2, &inter_xmin);
F.Maximum(&ymin1, &ymin2, &inter_ymin);
phi::DenseTensor inter_w(_type);
phi::DenseTensor inter_h(_type);
inter_w.mutable_data<T>({N, M}, place);
inter_h.mutable_data<T>({N, M}, place);
F.Sub(&inter_xmax, &inter_xmin, &inter_w);
F.Sub(&inter_ymax, &inter_ymin, &inter_h);
if (!normalized) {
F.Adds(&inter_w, 1.0f, &inter_w);
F.Adds(&inter_h, 1.0f, &inter_h);
}
phi::DenseTensor zeros(_type);
zeros.mutable_data<T>({1}, place);
FillMLUTensorWithHostValue<T>(ctx, static_cast<T>(0), &zeros);
F.Maximum(&inter_w, &zeros, &inter_w);
F.Maximum(&inter_h, &zeros, &inter_h);
F.Mul(&inter_w, &inter_h, out);
phi::DenseTensor union_area(_type);
union_area.mutable_data<T>({N, M}, place);
F.Add(&area1, &area2, &union_area);
F.Sub(&union_area, out, &union_area);
F.DivNoNan(out, &union_area, out);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(iou_similarity,
ops::IouSimilarityMLUKernel<float>,
ops::IouSimilarityMLUKernel<plat::float16>);
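The kernel above assembles pairwise IoU from transpose/sub/mul/min/max/div-no-nan primitives; the math is easier to read in a direct standalone form (boxes as [xmin, ymin, xmax, ymax]; the +1 offset mirrors the non-normalized branch; this is a sketch, not the Paddle implementation):
#include <algorithm>
#include <array>
#include <vector>
// Pairwise IoU between N and M boxes, mirroring the deleted MLU kernel:
// areas, clamped intersection, union = a1 + a2 - inter, and a 0/0 -> 0 rule
// that plays the role of DivNoNan.
std::vector<std::vector<float>> IouSimilaritySketch(
    const std::vector<std::array<float, 4>>& x,
    const std::vector<std::array<float, 4>>& y,
    bool normalized) {
  const float off = normalized ? 0.f : 1.f;
  std::vector<std::vector<float>> out(x.size(),
                                      std::vector<float>(y.size(), 0.f));
  for (size_t i = 0; i < x.size(); ++i) {
    const float a1 = (x[i][2] - x[i][0] + off) * (x[i][3] - x[i][1] + off);
    for (size_t j = 0; j < y.size(); ++j) {
      const float a2 = (y[j][2] - y[j][0] + off) * (y[j][3] - y[j][1] + off);
      const float iw = std::max(
          0.f, std::min(x[i][2], y[j][2]) - std::max(x[i][0], y[j][0]) + off);
      const float ih = std::max(
          0.f, std::min(x[i][3], y[j][3]) - std::max(x[i][1], y[j][1]) + off);
      const float inter = iw * ih;
      const float uni = a1 + a2 - inter;
      out[i][j] = (uni == 0.f) ? 0.f : inter / uni;
    }
  }
  return out;
}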
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/detection/prior_box_op.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
template <typename T>
class PriorBoxMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* input = ctx.Input<phi::DenseTensor>("Input");
auto* image = ctx.Input<phi::DenseTensor>("Image");
auto* boxes = ctx.Output<phi::DenseTensor>("Boxes");
auto* variances = ctx.Output<phi::DenseTensor>("Variances");
float step_w = ctx.Attr<float>("step_w");
float step_h = ctx.Attr<float>("step_h");
float offset = ctx.Attr<float>("offset");
bool clip = ctx.Attr<bool>("clip");
bool min_max_aspect_ratios_order =
ctx.Attr<bool>("min_max_aspect_ratios_order");
int im_width = image->dims()[3];
int im_height = image->dims()[2];
int width = input->dims()[3];
int height = input->dims()[2];
auto aspect_ratios = ctx.Attr<std::vector<float>>("aspect_ratios");
bool flip = ctx.Attr<bool>("flip");
std::vector<float> new_aspect_ratios;
ExpandAspectRatios(aspect_ratios, flip, &new_aspect_ratios);
auto& dev_ctx = ctx.template device_context<platform::MLUDeviceContext>();
phi::DenseTensor ratios;
paddle::framework::TensorFromVector(new_aspect_ratios, dev_ctx, &ratios);
MLUOpTensorDesc new_aspect_ratios_desc(ratios);
auto min_sizes = ctx.Attr<std::vector<float>>("min_sizes");
phi::DenseTensor min;
paddle::framework::TensorFromVector(min_sizes, dev_ctx, &min);
MLUOpTensorDesc min_sizes_desc(min);
auto max_sizes = ctx.Attr<std::vector<float>>("max_sizes");
phi::DenseTensor max;
paddle::framework::TensorFromVector(max_sizes, dev_ctx, &max);
MLUOpTensorDesc max_sizes_desc(max);
auto variances_attr = ctx.Attr<std::vector<float>>("variances");
phi::DenseTensor var_tensor;
paddle::framework::TensorFromVector(variances_attr, dev_ctx, &var_tensor);
MLUOpTensorDesc variances_attr_desc(var_tensor);
auto place = ctx.GetPlace();
boxes->mutable_data<T>(place);
variances->mutable_data<T>(place);
MLUOpTensorDesc var_desc(*variances);
MLUOpTensorDesc output_desc(*boxes);
MLUOP::OpPriorBox(ctx,
min_sizes_desc.get(),
GetBasePtr(&min),
new_aspect_ratios_desc.get(),
GetBasePtr(&ratios),
variances_attr_desc.get(),
GetBasePtr(&var_tensor),
max_sizes_desc.get(),
GetBasePtr(&max),
height,
width,
im_height,
im_width,
step_h,
step_w,
offset,
clip,
min_max_aspect_ratios_order,
output_desc.get(),
GetBasePtr(boxes),
var_desc.get(),
GetBasePtr(variances));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(prior_box, ops::PriorBoxMLUKernel<float>);
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
template <typename T>
class YoloBoxMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* img_size = ctx.Input<phi::DenseTensor>("ImgSize");
auto* boxes = ctx.Output<phi::DenseTensor>("Boxes");
auto* scores = ctx.Output<phi::DenseTensor>("Scores");
const std::vector<int> anchors = ctx.Attr<std::vector<int>>("anchors");
auto class_num = ctx.Attr<int>("class_num");
auto conf_thresh = ctx.Attr<float>("conf_thresh");
auto downsample_ratio = ctx.Attr<int>("downsample_ratio");
auto clip_bbox = ctx.Attr<bool>("clip_bbox");
auto scale = ctx.Attr<float>("scale_x_y");
auto iou_aware = ctx.Attr<bool>("iou_aware");
auto iou_aware_factor = ctx.Attr<float>("iou_aware_factor");
int anchor_num = anchors.size() / 2;
int64_t size = anchors.size();
auto dim_x = x->dims();
int n = dim_x[0];
int s = anchor_num;
int h = dim_x[2];
int w = dim_x[3];
// The output of mluOpYoloBox: A 4-D tensor with shape [N, anchor_num, 4,
// H*W], the coordinates of boxes, and a 4-D tensor with shape [N,
// anchor_num, :attr:`class_num`, H*W], the classification scores of boxes.
std::vector<int64_t> boxes_dim_mluops({n, s, 4, h * w});
std::vector<int64_t> scores_dim_mluops({n, s, class_num, h * w});
// In Paddle framework: A 3-D tensor with shape [N, M, 4], the coordinates
// of boxes, and a 3-D tensor with shape [N, M, :attr:`class_num`], the
// classification scores of boxes.
std::vector<int64_t> boxes_out_dim({n, s, h * w, 4});
std::vector<int64_t> scores_out_dim({n, s, h * w, class_num});
auto& dev_ctx = ctx.template device_context<MLUDeviceContext>();
phi::DenseTensor boxes_tensor_mluops =
ctx.AllocateTmpTensor<T, MLUDeviceContext>({n, s, 4, h * w}, dev_ctx);
phi::DenseTensor scores_tensor_mluops =
ctx.AllocateTmpTensor<T, MLUDeviceContext>({n, s, class_num, h * w},
dev_ctx);
MLUOpTensorDesc boxes_trans_desc_mluops(
4, boxes_dim_mluops.data(), ToMluOpDataType<T>());
MLUCnnlTensorDesc boxes_trans_desc_cnnl(
4, boxes_dim_mluops.data(), ToCnnlDataType<T>());
MLUOpTensorDesc scores_trans_desc_mluops(
4, scores_dim_mluops.data(), ToMluOpDataType<T>());
MLUCnnlTensorDesc scores_trans_desc_cnnl(
4, scores_dim_mluops.data(), ToCnnlDataType<T>());
boxes->mutable_data<T>(ctx.GetPlace());
scores->mutable_data<T>(ctx.GetPlace());
FillMLUTensorWithHostValue(ctx, static_cast<T>(0), boxes);
FillMLUTensorWithHostValue(ctx, static_cast<T>(0), scores);
MLUOpTensorDesc x_desc(*x, MLUOP_LAYOUT_ARRAY, ToMluOpDataType<T>());
MLUOpTensorDesc img_size_desc(
*img_size, MLUOP_LAYOUT_ARRAY, ToMluOpDataType<int32_t>());
phi::DenseTensor anchors_temp(framework::TransToPhiDataType(VT::INT32));
anchors_temp.Resize({size});
paddle::framework::TensorFromVector(
anchors, ctx.device_context(), &anchors_temp);
MLUOpTensorDesc anchors_desc(anchors_temp);
MLUCnnlTensorDesc boxes_desc_cnnl(
4, boxes_out_dim.data(), ToCnnlDataType<T>());
MLUCnnlTensorDesc scores_desc_cnnl(
4, scores_out_dim.data(), ToCnnlDataType<T>());
MLUOP::OpYoloBox(ctx,
x_desc.get(),
GetBasePtr(x),
img_size_desc.get(),
GetBasePtr(img_size),
anchors_desc.get(),
GetBasePtr(&anchors_temp),
class_num,
conf_thresh,
downsample_ratio,
clip_bbox,
scale,
iou_aware,
iou_aware_factor,
boxes_trans_desc_mluops.get(),
GetBasePtr(&boxes_tensor_mluops),
scores_trans_desc_mluops.get(),
GetBasePtr(&scores_tensor_mluops));
const std::vector<int> perm = {0, 1, 3, 2};
// transpose the boxes from [N, S, 4, H*W] to [N, S, H*W, 4]
MLUCnnl::Transpose(ctx,
perm,
4,
boxes_trans_desc_cnnl.get(),
GetBasePtr(&boxes_tensor_mluops),
boxes_desc_cnnl.get(),
GetBasePtr(boxes));
// transpose the scores from [N, S, class_num, H*W] to [N, S, H*W,
// class_num]
MLUCnnl::Transpose(ctx,
perm,
4,
scores_trans_desc_cnnl.get(),
GetBasePtr(&scores_tensor_mluops),
scores_desc_cnnl.get(),
GetBasePtr(scores));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(yolo_box, ops::YoloBoxMLUKernel<float>);
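The final transposes above adapt the mluOpYoloBox layout [N, S, C, H*W] to Paddle's expected [N, S, H*W, C]; a standalone sketch of that last-two-axes swap on a flat buffer (an illustrative helper, not a Paddle or CNNL API):
#include <vector>
// Sketch of the post-processing step of the deleted yolo_box kernel: swap the
// last two axes (perm = {0, 1, 3, 2}) so [N, S, C, HW] becomes [N, S, HW, C].
std::vector<float> SwapLastTwoAxes(const std::vector<float>& in,
                                   int n, int s, int c, int hw) {
  std::vector<float> out(in.size());
  for (int i = 0; i < n; ++i)
    for (int j = 0; j < s; ++j)
      for (int k = 0; k < c; ++k)
        for (int l = 0; l < hw; ++l)
          out[((i * s + j) * hw + l) * c + k] =
              in[((i * s + j) * c + k) * hw + l];
  return out;
}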
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/elementwise/elementwise_mlu.h"
namespace paddle {
namespace operators {
template <typename T>
class ElementwiseAddMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
MLUOpTensorKernel<T>(ctx, CNNL_OP_TENSOR_ADD);
}
};
template <typename T>
class ElementwiseAddGradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto& dev_ctx =
ctx.template device_context<paddle::platform::MLUDeviceContext>();
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* y = ctx.Input<phi::DenseTensor>("Y");
auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<phi::DenseTensor>(framework::GradVarName("Y"));
int axis = ctx.Attr<int>("axis");
axis = (axis == -1 ? std::abs(x->dims().size() - y->dims().size()) : axis);
MLUCnnlTensorDesc dout_desc(*dout);
if (dx) {
dx->mutable_data<T>(ctx.GetPlace());
if (dx->dims() != dout->dims()) {
std::vector<int> dst_dims_vec;
std::vector<int> reduce_axes;
GetReduceAxesAndDstDims(
axis, dout->dims(), dx->dims(), &reduce_axes, &dst_dims_vec);
MLUCnnlReduceDesc reduction_desc(reduce_axes,
CNNL_REDUCE_ADD,
ToCnnlDataType<T>(),
CNNL_NOT_PROPAGATE_NAN,
CNNL_REDUCE_NO_INDICES,
CNNL_32BIT_INDICES);
MLUCnnlTensorDesc dx_desc(
dst_dims_vec.size(), dst_dims_vec.data(), ToCnnlDataType<T>());
MLUCnnl::Reduce(ctx,
true /*need_workspace*/,
reduction_desc.get(),
nullptr,
dout_desc.get(),
GetBasePtr(dout),
0,
nullptr,
nullptr,
dx_desc.get(),
GetBasePtr(dx));
} else {
framework::TensorCopy(*dout, ctx.GetPlace(), dev_ctx, dx);
}
}
if (dy) {
dy->mutable_data<T>(ctx.GetPlace());
if (dy->dims() != dout->dims()) {
std::vector<int> dst_dims_vec;
std::vector<int> reduce_axes;
GetReduceAxesAndDstDims(
axis, dout->dims(), dy->dims(), &reduce_axes, &dst_dims_vec);
MLUCnnlReduceDesc reduction_desc(reduce_axes,
CNNL_REDUCE_ADD,
ToCnnlDataType<T>(),
CNNL_NOT_PROPAGATE_NAN,
CNNL_REDUCE_NO_INDICES,
CNNL_32BIT_INDICES);
MLUCnnlTensorDesc dy_desc(
dst_dims_vec.size(), dst_dims_vec.data(), ToCnnlDataType<T>());
MLUCnnl::Reduce(ctx,
true /*need_workspace*/,
reduction_desc.get(),
nullptr,
dout_desc.get(),
GetBasePtr(dout),
0,
nullptr,
nullptr,
dy_desc.get(),
GetBasePtr(dy));
} else {
framework::TensorCopy(*dout, ctx.GetPlace(), dev_ctx, dy);
}
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(elementwise_add,
ops::ElementwiseAddMLUKernel<float>,
ops::ElementwiseAddMLUKernel<plat::float16>);
REGISTER_OP_MLU_KERNEL(elementwise_add_grad,
ops::ElementwiseAddGradMLUKernel<float>,
ops::ElementwiseAddGradMLUKernel<plat::float16>);
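The grad path above sums dout over the axes that were broadcast in the forward pass; that is what GetReduceAxesAndDstDims feeds to MLUCnnl::Reduce. A standalone sketch of which axes get reduced, assuming dx's dims align with dout's dims starting at `axis` (a simplification of the real helper, names are illustrative):
#include <vector>
// Sketch: given dout dims and the (smaller) dx dims aligned at `axis`, return
// the axes of dout that must be sum-reduced to produce a gradient of dx's
// shape: leading axes before `axis`, trailing axes past the end of dx, and
// any aligned axis where dx has extent 1 but dout does not.
std::vector<int> BroadcastGradReduceAxes(const std::vector<int>& dout_dims,
                                         const std::vector<int>& dx_dims,
                                         int axis) {
  std::vector<int> reduce_axes;
  const int start = axis;
  const int end = axis + static_cast<int>(dx_dims.size());
  for (int i = 0; i < static_cast<int>(dout_dims.size()); ++i) {
    if (i < start || i >= end ||
        (dx_dims[i - start] == 1 && dout_dims[i] != 1)) {
      reduce_axes.push_back(i);
    }
  }
  return reduce_axes;
}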
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <memory>
#include <string>
#include "paddle/fluid/operators/elementwise/elementwise_div_op.h"
#include "paddle/fluid/operators/elementwise/elementwise_mlu.h"
namespace paddle {
namespace operators {
template <typename T>
class ElementwiseDivMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
MLUBinaryOp<DIV, T>(ctx);
}
};
template <typename T>
class ElementwiseDivGradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* out = ctx.Input<phi::DenseTensor>("Out");
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* y = ctx.Input<phi::DenseTensor>("Y");
auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<phi::DenseTensor>(framework::GradVarName("Y"));
int axis = ctx.Attr<int>("axis");
const auto& x_dims = x->dims();
const auto& y_dims = y->dims();
axis = (axis < 0 ? (std::abs(x_dims.size() - y_dims.size()) + axis + 1)
: axis);
int max_dim = std::max(x_dims.size(), y_dims.size());
std::vector<int> x_dims_array(max_dim);
std::vector<int> y_dims_array(max_dim);
std::vector<int> out_dims_array(max_dim);
GetBroadcastDimsArrays(x_dims,
y_dims,
x_dims_array.data(),
y_dims_array.data(),
out_dims_array.data(),
max_dim,
axis);
MLUCnnlTensorDesc x_desc(max_dim, x_dims_array.data(), ToCnnlDataType<T>());
MLUCnnlTensorDesc y_desc(max_dim, y_dims_array.data(), ToCnnlDataType<T>());
MLUCnnlTensorDesc dout_desc(*dout);
MLUCnnlOpTensorDesc mul_op_desc(
CNNL_OP_TENSOR_MUL, ToCnnlDataType<T>(), CNNL_NOT_PROPAGATE_NAN);
// compute dout/y == 1/y * dout
phi::DenseTensor dout_div_y(dout->dtype());
dout_div_y.Resize(dout->dims());
dout_div_y.mutable_data<T>(ctx.GetPlace());
MLUBinary<DIV>(ctx,
CNNL_COMPUTATION_HIGH_PRECISION,
dout_desc.get(),
GetBasePtr(dout),
y_desc.get(),
GetBasePtr(y),
dout_desc.get(),
GetBasePtr(&dout_div_y));
if (dx) {
// compute dx = dout/y = 1/y * dout
if (dx->dims() != dout->dims()) {
dx->mutable_data<T>(ctx.GetPlace());
std::vector<int> reduce_axes;
GetReduceAxes(axis, dout_div_y.dims(), dx->dims(), &reduce_axes);
MLUCnnlReduceDesc reduction_desc(reduce_axes,
CNNL_REDUCE_ADD,
ToCnnlDataType<T>(),
CNNL_NOT_PROPAGATE_NAN,
CNNL_REDUCE_NO_INDICES,
CNNL_32BIT_INDICES);
MLUCnnlTensorDesc dx_desc(*dx);
MLUCnnl::Reduce(ctx,
true /*need_workspace*/,
reduction_desc.get(),
nullptr,
dout_desc.get(),
GetBasePtr(&dout_div_y),
0,
nullptr,
nullptr,
dx_desc.get(),
GetBasePtr(dx));
} else {
dx->ShareDataWith(dout_div_y);
}
}
if (dy) {
// compute dy = -out * (dout/y) = -out/y * dout
phi::DenseTensor neg_out(out->type());
neg_out.mutable_data<T>(out->dims(), ctx.GetPlace());
MLUCnnlTensorDesc out_desc(*out);
MLUUnary<NEG>(ctx,
CNNL_COMPUTATION_HIGH_PRECISION,
out_desc.get(),
GetBasePtr(out),
out_desc.get(),
GetBasePtr(&neg_out));
phi::DenseTensor dy_temp(y->dtype());
dy_temp.Resize(dout->dims());
dy_temp.mutable_data<T>(ctx.GetPlace());
MLUCnnl::OpTensor(ctx,
mul_op_desc.get(),
dout_desc.get(),
GetBasePtr(&neg_out),
dout_desc.get(),
GetBasePtr(&dout_div_y),
dout_desc.get(),
GetBasePtr(&dy_temp),
ToCnnlDataType<T>());
if (dy->dims() != dout->dims()) {
dy->mutable_data<T>(ctx.GetPlace());
std::vector<int> reduce_axes;
GetReduceAxes(axis, dy_temp.dims(), dy->dims(), &reduce_axes);
MLUCnnlReduceDesc reduction_desc(reduce_axes,
CNNL_REDUCE_ADD,
ToCnnlDataType<T>(),
CNNL_NOT_PROPAGATE_NAN,
CNNL_REDUCE_NO_INDICES,
CNNL_32BIT_INDICES);
MLUCnnlTensorDesc dy_desc(*dy);
MLUCnnl::Reduce(ctx,
true /*need_workspace*/,
reduction_desc.get(),
nullptr,
dout_desc.get(),
GetBasePtr(&dy_temp),
0,
nullptr,
nullptr,
dy_desc.get(),
GetBasePtr(dy));
} else {
dy->ShareDataWith(dy_temp);
}
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(elementwise_div,
ops::ElementwiseDivMLUKernel<int>,
ops::ElementwiseDivMLUKernel<float>,
ops::ElementwiseDivMLUKernel<plat::float16>);
REGISTER_OP_MLU_KERNEL(elementwise_div_grad,
ops::ElementwiseDivGradMLUKernel<int>,
ops::ElementwiseDivGradMLUKernel<float>,
ops::ElementwiseDivGradMLUKernel<plat::float16>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/operators/elementwise/elementwise_mlu.h"
#include "paddle/fluid/operators/elementwise/elementwise_op.h"
namespace paddle {
namespace operators {
template <typename T>
class ElementwiseMaxMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
MLUBinaryOp<MAXIMUM, T>(ctx);
}
};
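// The gradient routing for max/min is shared with elementwise_min and lives in
// MLUMinMaxGradHelper (see elementwise_mlu.h).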
template <typename T>
class ElementwiseMaxGradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
MLUMinMaxGradHelper<MAXIMUM_GRAD, T>(ctx);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_MLU_KERNEL(elementwise_max,
ops::ElementwiseMaxMLUKernel<int>,
ops::ElementwiseMaxMLUKernel<float>,
ops::ElementwiseMaxMLUKernel<paddle::platform::float16>);
REGISTER_OP_MLU_KERNEL(
elementwise_max_grad,
ops::ElementwiseMaxGradMLUKernel<int>,
ops::ElementwiseMaxGradMLUKernel<float>,
ops::ElementwiseMaxGradMLUKernel<paddle::platform::float16>);
#endif
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <memory>
#include <string>
#include "paddle/fluid/operators/elementwise/elementwise_mlu.h"
namespace paddle {
namespace operators {
template <typename T>
class ElementwiseMinMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
MLUBinaryOp<MINIMUM, T>(ctx);
}
};
template <typename T>
class ElementwiseMinGradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
MLUMinMaxGradHelper<MINIMUM_GRAD, T>(ctx);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(elementwise_min,
ops::ElementwiseMinMLUKernel<int>,
ops::ElementwiseMinMLUKernel<float>,
ops::ElementwiseMinMLUKernel<plat::float16>);
REGISTER_OP_MLU_KERNEL(elementwise_min_grad,
ops::ElementwiseMinGradMLUKernel<int>,
ops::ElementwiseMinGradMLUKernel<float>,
ops::ElementwiseMinGradMLUKernel<plat::float16>);
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#ifdef PADDLE_WITH_MLU
#include <vector>
#include "paddle/fluid/operators/elementwise/elementwise_op.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
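// Collect the axes along which `target_ddims` was broadcast up to `src_ddims`;
// a gradient computed at the broadcast shape must be reduce-summed over these
// axes before being written back to the smaller tensor.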
inline void GetReduceAxes(const int axis,
const framework::DDim& src_ddims,
const framework::DDim& target_ddims,
std::vector<int>* axes) {
int64_t src_dim_size = src_ddims.size();
int64_t target_dim_size = target_ddims.size();
for (int64_t i = 0; i < src_dim_size; ++i) {
if (i < axis || i >= target_dim_size + axis) {
axes->push_back(i);
continue;
}
if (src_ddims[i] > target_ddims[i - axis]) {
axes->push_back(i);
}
}
}
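// Like GetReduceAxes, but also returns the destination dims so the reduced
// result can be described directly.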
inline void GetReduceAxesAndDstDims(const int axis,
const framework::DDim& src_ddims,
const framework::DDim& target_ddims,
std::vector<int>* reduce_axes,
std::vector<int>* dst_dims_vec) {
int64_t src_dim_size = src_ddims.size();
int64_t target_dim_size = target_ddims.size();
int src_axis = (target_dim_size < src_dim_size ? axis : 0);
for (int ax = 0; ax < src_dim_size; ++ax) {
if ((ax < src_axis || ax >= src_axis + target_dim_size) ||
(src_ddims[ax] > 1 && target_ddims[ax - src_axis] == 1)) {
reduce_axes->push_back(ax);
} else {
dst_dims_vec->push_back(src_ddims[ax]);
}
}
if (dst_dims_vec->size() == 0) {
// target_var is scalar
dst_dims_vec->push_back(1);
}
}
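// Forward helper for the broadcastable ADD / SUB / MUL ops: aligns the two
// input shapes, builds CNNL descriptors and issues a single MLUCnnl::OpTensor call.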
template <typename T>
void MLUOpTensorKernel(const framework::ExecutionContext& ctx,
const cnnlOpTensorDesc_t op_tensor_op) {
PADDLE_ENFORCE_EQ(
platform::is_mlu_place(ctx.GetPlace()),
true,
platform::errors::Unavailable("This kernel only runs on MLU."));
PADDLE_ENFORCE_EQ((op_tensor_op == CNNL_OP_TENSOR_ADD) ||
(op_tensor_op == CNNL_OP_TENSOR_SUB) ||
(op_tensor_op == CNNL_OP_TENSOR_MUL),
true,
platform::errors::Unavailable(
"This kernel of MLU only support ADD, SUB, MUL."));
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* y = ctx.Input<phi::DenseTensor>("Y");
auto* out = ctx.Output<phi::DenseTensor>("Out");
out->mutable_data<T>(ctx.GetPlace());
int axis = ctx.Attr<int>("axis");
const auto& x_dims = x->dims();
const auto& y_dims = y->dims();
axis =
(axis < 0 ? (std::abs(x_dims.size() - y_dims.size()) + axis + 1) : axis);
int max_dim = std::max(x_dims.size(), y_dims.size());
std::vector<int> x_dims_array(max_dim);
std::vector<int> y_dims_array(max_dim);
std::vector<int> out_dims_array(max_dim);
GetBroadcastDimsArrays(x_dims,
y_dims,
x_dims_array.data(),
y_dims_array.data(),
out_dims_array.data(),
max_dim,
axis);
MLUCnnlTensorDesc x_desc(max_dim, x_dims_array.data(), ToCnnlDataType<T>());
MLUCnnlTensorDesc y_desc(max_dim, y_dims_array.data(), ToCnnlDataType<T>());
MLUCnnlTensorDesc out_desc(*out);
MLUCnnlOpTensorDesc op_tensor_desc(
op_tensor_op, ToCnnlDataType<T>(), CNNL_NOT_PROPAGATE_NAN);
MLUCnnl::OpTensor(ctx,
op_tensor_desc.get(),
x_desc.get(),
GetBasePtr(x),
y_desc.get(),
GetBasePtr(y),
out_desc.get(),
GetBasePtr(out),
ToCnnlDataType<T>());
}
// ------------------ BinaryOp -----------------
enum BINARY_FUNCTOR {
DIV,
DIVNONAN,
MAXIMUM,
MINIMUM,
POW,
};
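// Dispatch layer: each specialization below maps a BINARY_FUNCTOR value to the
// matching MLUCnnl wrapper (Div, Maximum, Minimum, Pow).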
template <BINARY_FUNCTOR func>
void MLUBinary(const framework::ExecutionContext& ctx,
cnnlComputationPreference_t prefer,
const cnnlTensorDescriptor_t x_desc,
const void* x,
const cnnlTensorDescriptor_t y_desc,
const void* y,
const cnnlTensorDescriptor_t out_desc,
void* out);
template <>
inline void MLUBinary<DIV>(const framework::ExecutionContext& ctx,
cnnlComputationPreference_t prefer,
const cnnlTensorDescriptor_t x_desc,
const void* x,
const cnnlTensorDescriptor_t y_desc,
const void* y,
const cnnlTensorDescriptor_t out_desc,
void* out) {
MLUCnnl::Div(ctx, prefer, x_desc, x, y_desc, y, out_desc, out);
}
template <>
inline void MLUBinary<MAXIMUM>(
const framework::ExecutionContext& ctx,
cnnlComputationPreference_t prefer,  // unused; kept only to match the binary-op signature
const cnnlTensorDescriptor_t x_desc,
const void* x,
const cnnlTensorDescriptor_t y_desc,
const void* y,
const cnnlTensorDescriptor_t out_desc,
void* out) {
MLUCnnl::Maximum(ctx, x_desc, x, y_desc, y, out_desc, out);
}
template <>
inline void MLUBinary<MINIMUM>(const framework::ExecutionContext& ctx,
cnnlComputationPreference_t prefer,
const cnnlTensorDescriptor_t in1_desc,
const void* in1,
const cnnlTensorDescriptor_t in2_desc,
const void* in2,
const cnnlTensorDescriptor_t out_desc,
void* out) {
MLUCnnl::Minimum(ctx, in1_desc, in1, in2_desc, in2, out_desc, out);
}
template <>
inline void MLUBinary<POW>(const framework::ExecutionContext& ctx,
cnnlComputationPreference_t prefer,
const cnnlTensorDescriptor_t x_desc,
const void* x,
const cnnlTensorDescriptor_t y_desc,
const void* y,
const cnnlTensorDescriptor_t out_desc,
void* out) {
MLUCnnl::Pow(ctx, prefer, x_desc, x, y_desc, y, out_desc, out);
}
template <BINARY_FUNCTOR Functor, typename T>
void MLUBinaryOp(const framework::ExecutionContext& ctx) {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* y = ctx.Input<phi::DenseTensor>("Y");
auto* out = ctx.Output<phi::DenseTensor>("Out");
out->mutable_data<T>(ctx.GetPlace());
int axis = ctx.Attr<int>("axis");
const auto& x_dims = x->dims();
const auto& y_dims = y->dims();
axis =
(axis < 0 ? (std::abs(x_dims.size() - y_dims.size()) + axis + 1) : axis);
int max_dim = std::max(x_dims.size(), y_dims.size());
std::vector<int> x_dims_array(max_dim);
std::vector<int> y_dims_array(max_dim);
std::vector<int> out_dims_array(max_dim);
GetBroadcastDimsArrays(x_dims,
y_dims,
x_dims_array.data(),
y_dims_array.data(),
out_dims_array.data(),
max_dim,
axis);
MLUCnnlTensorDesc x_desc(max_dim, x_dims_array.data(), ToCnnlDataType<T>());
MLUCnnlTensorDesc y_desc(max_dim, y_dims_array.data(), ToCnnlDataType<T>());
MLUCnnlTensorDesc out_desc(*out, CNNL_LAYOUT_ARRAY, ToCnnlDataType<T>());
cnnlComputationPreference_t prefer_type = CNNL_COMPUTATION_HIGH_PRECISION;
MLUBinary<Functor>(ctx,
prefer_type,
x_desc.get(),
GetBasePtr(x),
y_desc.get(),
GetBasePtr(y),
out_desc.get(),
GetBasePtr(out));
}
// ------------------ UnaryOp -----------------
enum UNARY_FUNCTOR {
NEG,
RECIPROCAL,
};
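// Unary counterpart: NEG -> MLUCnnl::Neg, RECIPROCAL -> MLUCnnl::Reciprocal.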
template <UNARY_FUNCTOR func>
void MLUUnary(const framework::ExecutionContext& ctx,
cnnlComputationPreference_t prefer,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output);
template <>
inline void MLUUnary<NEG>(const framework::ExecutionContext& ctx,
cnnlComputationPreference_t prefer,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output) {
MLUCnnl::Neg(ctx, input_desc, input, output_desc, output);
}
template <>
inline void MLUUnary<RECIPROCAL>(const framework::ExecutionContext& ctx,
cnnlComputationPreference_t prefer,
const cnnlTensorDescriptor_t input_desc,
const void* input,
const cnnlTensorDescriptor_t output_desc,
void* output) {
MLUCnnl::Reciprocal(ctx, input_desc, input, output_desc, output);
}
template <UNARY_FUNCTOR Functor, typename Tin, typename Tout = Tin>
void MLUUnaryOp(const framework::ExecutionContext& ctx) {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* out = ctx.Output<phi::DenseTensor>("Out");
out->mutable_data<Tout>(ctx.GetPlace());
MLUCnnlTensorDesc x_desc(*x, CNNL_LAYOUT_ARRAY, ToCnnlDataType<Tin>());
MLUCnnlTensorDesc out_desc(*out, CNNL_LAYOUT_ARRAY, ToCnnlDataType<Tout>());
cnnlComputationPreference_t prefer_type = CNNL_COMPUTATION_HIGH_PRECISION;
MLUUnary<Functor>(ctx,
prefer_type,
x_desc.get(),
GetBasePtr(x),
out_desc.get(),
GetBasePtr(out));
}
// ------------------ MLUElementwiseGradOp -----------------
enum MINMAX_GRAD_FUNCTOR {
MAXIMUM_GRAD,
MINIMUM_GRAD,
};
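// Gradient for z = max(x, y) (or min): build a boolean-style mask with
// Logic(x, y) (GE for MAXIMUM_GRAD, LE for MINIMUM_GRAD), then
//   dx = dout * mask
//   dy = dout - dx
// and reduce-sum over broadcast axes where an input is smaller than dout.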
template <MINMAX_GRAD_FUNCTOR Functor, typename Tin, typename Tout = Tin>
void MLUMinMaxGradHelper(const framework::ExecutionContext& ctx) {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* y = ctx.Input<phi::DenseTensor>("Y");
auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<phi::DenseTensor>(framework::GradVarName("Y"));
int axis = ctx.Attr<int>("axis");
const auto& x_dims = x->dims();
const auto& y_dims = y->dims();
axis =
(axis < 0 ? (std::abs(x_dims.size() - y_dims.size()) + axis + 1) : axis);
int max_dim = std::max(x_dims.size(), y_dims.size());
std::vector<int> x_dims_array(max_dim);
std::vector<int> y_dims_array(max_dim);
std::vector<int> out_dims_array(max_dim);
GetBroadcastDimsArrays(x_dims,
y_dims,
x_dims_array.data(),
y_dims_array.data(),
out_dims_array.data(),
max_dim,
axis);
// mask = Logic(x, y): GE for MAXIMUM_GRAD, LE for MINIMUM_GRAD
cnnlLogicOp_t logic =
Functor == MAXIMUM_GRAD ? CNNL_LOGIC_OP_GE : CNNL_LOGIC_OP_LE;
phi::DenseTensor mask(x->dtype());
mask.Resize(phi::make_ddim(out_dims_array));
mask.mutable_data<Tin>(ctx.GetPlace());
cnnlDataType_t data_type = ToCnnlDataType<Tin>();
MLUCnnlTensorDesc x_desc(max_dim, x_dims_array.data(), data_type);
MLUCnnlTensorDesc y_desc(max_dim, y_dims_array.data(), data_type);
MLUCnnlTensorDesc mask_desc(max_dim, out_dims_array.data(), data_type);
MLUCnnl::Logic(ctx,
logic,
x_desc.get(),
GetBasePtr(x),
y_desc.get(),
GetBasePtr(y),
mask_desc.get(),
GetBasePtr(&mask));
// dx = Mul(dz, mask)
phi::DenseTensor dx_temp(x->dtype());
dx_temp.Resize(dout->dims());
dx_temp.mutable_data<Tout>(ctx.GetPlace());
MLUCnnlTensorDesc dout_desc(*dout);
MLUCnnlOpTensorDesc mul_op_desc(
CNNL_OP_TENSOR_MUL, data_type, CNNL_NOT_PROPAGATE_NAN);
MLUCnnl::OpTensor(ctx,
mul_op_desc.get(),
dout_desc.get(),
GetBasePtr(dout),
dout_desc.get(),
GetBasePtr(&mask),
dout_desc.get(),
GetBasePtr(&dx_temp),
data_type);
// dy = Sub(dz, dx)
phi::DenseTensor dy_temp(y->dtype());
dy_temp.Resize(dout->dims());
dy_temp.mutable_data<Tout>(ctx.GetPlace());
MLUCnnlOpTensorDesc sub_op_desc(
CNNL_OP_TENSOR_SUB, data_type, CNNL_NOT_PROPAGATE_NAN);
MLUCnnl::OpTensor(ctx,
sub_op_desc.get(),
dout_desc.get(),
GetBasePtr(dout),
dout_desc.get(),
GetBasePtr(&dx_temp),
dout_desc.get(),
GetBasePtr(&dy_temp),
data_type);
if (dx) {
if (dx->dims() != dout->dims()) {
dx->mutable_data<Tout>(ctx.GetPlace());
std::vector<int> reduce_axes;
GetReduceAxes(axis, dx_temp.dims(), dx->dims(), &reduce_axes);
MLUCnnlReduceDesc reduction_desc(reduce_axes,
CNNL_REDUCE_ADD,
data_type,
CNNL_NOT_PROPAGATE_NAN,
CNNL_REDUCE_NO_INDICES,
CNNL_32BIT_INDICES);
MLUCnnlTensorDesc dx_desc(*dx);
MLUCnnl::Reduce(ctx,
true /*need_workspace*/,
reduction_desc.get(),
nullptr,
dout_desc.get(),
GetBasePtr(&dx_temp),
0,
nullptr,
nullptr,
dx_desc.get(),
GetBasePtr(dx));
} else {
dx->ShareDataWith(dx_temp);
}
}
if (dy) {
if (dy->dims() != dout->dims()) {
dy->mutable_data<Tout>(ctx.GetPlace());
std::vector<int> reduce_axes;
GetReduceAxes(axis, dy_temp.dims(), dy->dims(), &reduce_axes);
MLUCnnlReduceDesc reduction_desc(reduce_axes,
CNNL_REDUCE_ADD,
data_type,
CNNL_NOT_PROPAGATE_NAN,
CNNL_REDUCE_NO_INDICES,
CNNL_32BIT_INDICES);
MLUCnnlTensorDesc dy_desc(*dy);
MLUCnnl::Reduce(ctx,
true /*need_workspace*/,
reduction_desc.get(),
nullptr,
dout_desc.get(),
GetBasePtr(&dy_temp),
0,
nullptr,
nullptr,
dy_desc.get(),
GetBasePtr(dy));
} else {
dy->ShareDataWith(dy_temp);
}
}
}
} // namespace operators
} // namespace paddle
#endif
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/elementwise/elementwise_mlu.h"
namespace paddle {
namespace operators {
using MLUDeviceContext = platform::MLUDeviceContext;
template <typename T>
class ElementwiseMulMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
MLUOpTensorKernel<T>(ctx, CNNL_OP_TENSOR_MUL);
}
};
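// Gradient of z = x * y:
//   dx = dout * y,  dy = dout * x
// reduce-summed over broadcast axes when the input is smaller than dout.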
template <typename T>
class ElementwiseMulGradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* y = ctx.Input<phi::DenseTensor>("Y");
auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<phi::DenseTensor>(framework::GradVarName("Y"));
int axis = ctx.Attr<int>("axis");
const auto& x_dims = x->dims();
const auto& y_dims = y->dims();
axis = (axis < 0 ? (std::abs(x_dims.size() - y_dims.size()) + axis + 1)
: axis);
int max_dim = std::max(x_dims.size(), y_dims.size());
std::vector<int> x_dims_array(max_dim);
std::vector<int> y_dims_array(max_dim);
std::vector<int> out_dims_array(max_dim);
GetBroadcastDimsArrays(x_dims,
y_dims,
x_dims_array.data(),
y_dims_array.data(),
out_dims_array.data(),
max_dim,
axis);
MLUCnnlTensorDesc x_desc(max_dim, x_dims_array.data(), ToCnnlDataType<T>());
MLUCnnlTensorDesc y_desc(max_dim, y_dims_array.data(), ToCnnlDataType<T>());
MLUCnnlTensorDesc dout_desc(*dout);
MLUCnnlOpTensorDesc mul_op_desc(
CNNL_OP_TENSOR_MUL, ToCnnlDataType<T>(), CNNL_NOT_PROPAGATE_NAN);
if (dx) {
dx->mutable_data<T>(ctx.GetPlace());
if (dx->dims() == dout->dims()) {
MLUCnnl::OpTensor(ctx,
mul_op_desc.get(),
dout_desc.get(),
GetBasePtr(dout),
y_desc.get(),
GetBasePtr(y),
x_desc.get(),
GetBasePtr(dx),
ToCnnlDataType<T>());
} else {
phi::DenseTensor dx_temp(x->dtype());
dx_temp.Resize(dout->dims());
dx_temp.mutable_data<T>(ctx.GetPlace());
MLUCnnl::OpTensor(ctx,
mul_op_desc.get(),
dout_desc.get(),
GetBasePtr(dout),
y_desc.get(),
GetBasePtr(y),
dout_desc.get(),
GetBasePtr(&dx_temp),
ToCnnlDataType<T>());
std::vector<int> reduce_axes;
GetReduceAxes(axis, dx_temp.dims(), dx->dims(), &reduce_axes);
MLUCnnlReduceDesc reduction_desc(reduce_axes,
CNNL_REDUCE_ADD,
ToCnnlDataType<T>(),
CNNL_NOT_PROPAGATE_NAN,
CNNL_REDUCE_NO_INDICES,
CNNL_32BIT_INDICES);
MLUCnnlTensorDesc dx_desc(*dx);
MLUCnnl::Reduce(ctx,
true /*need_workspace*/,
reduction_desc.get(),
nullptr,
dout_desc.get(),
GetBasePtr(&dx_temp),
0,
nullptr,
nullptr,
dx_desc.get(),
GetBasePtr(dx));
}
}
if (dy) {
dy->mutable_data<T>(ctx.GetPlace());
if (dy->dims() == dout->dims()) {
MLUCnnl::OpTensor(ctx,
mul_op_desc.get(),
dout_desc.get(),
GetBasePtr(dout),
x_desc.get(),
GetBasePtr(x),
y_desc.get(),
GetBasePtr(dy),
ToCnnlDataType<T>());
} else {
phi::DenseTensor dy_temp(y->dtype());
dy_temp.Resize(dout->dims());
dy_temp.mutable_data<T>(ctx.GetPlace());
MLUCnnl::OpTensor(ctx,
mul_op_desc.get(),
dout_desc.get(),
GetBasePtr(dout),
x_desc.get(),
GetBasePtr(x),
dout_desc.get(),
GetBasePtr(&dy_temp),
ToCnnlDataType<T>());
std::vector<int> reduce_axes;
GetReduceAxes(axis, dy_temp.dims(), dy->dims(), &reduce_axes);
MLUCnnlReduceDesc reduction_desc(reduce_axes,
CNNL_REDUCE_ADD,
ToCnnlDataType<T>(),
CNNL_NOT_PROPAGATE_NAN,
CNNL_REDUCE_NO_INDICES,
CNNL_32BIT_INDICES);
MLUCnnlTensorDesc dy_desc(*dy);
MLUCnnl::Reduce(ctx,
true /*need_workspace*/,
reduction_desc.get(),
nullptr,
dout_desc.get(),
GetBasePtr(&dy_temp),
0,
nullptr,
nullptr,
dy_desc.get(),
GetBasePtr(dy));
}
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_MLU_KERNEL(elementwise_mul,
ops::ElementwiseMulMLUKernel<float>,
ops::ElementwiseMulMLUKernel<paddle::platform::float16>,
ops::ElementwiseMulMLUKernel<int>);
REGISTER_OP_MLU_KERNEL(
elementwise_mul_grad,
ops::ElementwiseMulGradMLUKernel<float>,
ops::ElementwiseMulGradMLUKernel<paddle::platform::float16>,
ops::ElementwiseMulGradMLUKernel<int>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/elementwise/elementwise_mlu.h"
#include "paddle/fluid/operators/elementwise/elementwise_op.h"
namespace paddle {
namespace operators {
template <typename T>
class ElementwisePowMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
MLUBinaryOp<POW, T>(ctx);
}
};
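// Gradient of z = x^y:
//   dx = dout * y * x^(y - 1)
//   dy = dout * ln(x) * x^y
// both reduce-summed over broadcast axes when needed.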
template <typename T>
class ElementwisePowGradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* y = ctx.Input<phi::DenseTensor>("Y");
auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<phi::DenseTensor>(framework::GradVarName("Y"));
int axis = ctx.Attr<int>("axis");
auto place = ctx.GetPlace();
auto x_dims = x->dims();
auto y_dims = y->dims();
axis =
(axis < 0 ? std::abs(x_dims.size() - y_dims.size()) + axis + 1 : axis);
int max_dim = std::max(x_dims.size(), y_dims.size());
std::vector<int> x_dims_array(max_dim);
std::vector<int> y_dims_array(max_dim);
std::vector<int> out_dims_array(max_dim);
GetBroadcastDimsArrays(x_dims,
y_dims,
x_dims_array.data(),
y_dims_array.data(),
out_dims_array.data(),
max_dim,
axis);
cnnlDataType_t data_type = ToCnnlDataType<T>();
MLUCnnlTensorDesc x_desc(max_dim, x_dims_array.data(), data_type);
MLUCnnlTensorDesc y_desc(max_dim, y_dims_array.data(), data_type);
MLUCnnlTensorDesc out_desc(max_dim, out_dims_array.data(), data_type);
auto dout_dims = dout->dims();
if (dx) {
// dx = dout * y * pow(x, y - 1);
phi::DenseTensor one_dx(y->type());
one_dx.mutable_data<T>(phi::make_ddim(y_dims_array), place);
FillMLUTensorWithHostValue(ctx, static_cast<T>(1), &one_dx);
phi::DenseTensor sub_dx(y->type());
sub_dx.mutable_data<T>(phi::make_ddim(y_dims_array), place);
MLUCnnlOpTensorDesc op_tensor_desc(
CNNL_OP_TENSOR_SUB, data_type, CNNL_NOT_PROPAGATE_NAN);
MLUCnnl::OpTensor(ctx,
op_tensor_desc.get(),
y_desc.get(),
GetBasePtr(y),
y_desc.get(),
GetBasePtr(&one_dx),
y_desc.get(),
GetBasePtr(&sub_dx),
data_type);
phi::DenseTensor tmp_dx(x->type());
tmp_dx.mutable_data<T>(phi::make_ddim(out_dims_array), place);
MLUCnnl::Pow(ctx,
CNNL_COMPUTATION_HIGH_PRECISION,
x_desc.get(),
GetBasePtr(x),
y_desc.get(),
GetBasePtr(&sub_dx),
out_desc.get(),
GetBasePtr(&tmp_dx));
MLUCnnl::MulAx(ctx,
y_desc.get(),
GetBasePtr(y),
out_desc.get(),
GetBasePtr(&tmp_dx));
MLUCnnl::MulAx(ctx,
out_desc.get(),
GetBasePtr(dout),
out_desc.get(),
GetBasePtr(&tmp_dx));
if (x_dims != dout_dims) {
dx->mutable_data<T>(place);
std::vector<int> reduce_axes;
GetReduceAxes(axis, dout_dims, x_dims, &reduce_axes);
if (!reduce_axes.empty()) {
MLUCnnlReduceDesc reduction_desc(reduce_axes,
CNNL_REDUCE_ADD,
data_type,
CNNL_NOT_PROPAGATE_NAN,
CNNL_REDUCE_NO_INDICES,
CNNL_32BIT_INDICES);
MLUCnnlTensorDesc dx_desc(*dx);
MLUCnnl::Reduce(ctx,
true /*need_workspace*/,
reduction_desc.get(),
nullptr,
out_desc.get(),
GetBasePtr(&tmp_dx),
0,
nullptr,
nullptr,
dx_desc.get(),
GetBasePtr(dx));
}
} else {
dx->ShareDataWith(tmp_dx);
}
}
if (dy) {
// dy = dout * log(x) * pow(x, y)
phi::DenseTensor tmp_dy(y->type());
tmp_dy.mutable_data<T>(phi::make_ddim(out_dims_array), place);
MLUCnnl::Pow(ctx,
CNNL_COMPUTATION_HIGH_PRECISION,
x_desc.get(),
GetBasePtr(x),
y_desc.get(),
GetBasePtr(y),
out_desc.get(),
GetBasePtr(&tmp_dy));
phi::DenseTensor log_x(x->type());
log_x.mutable_data<T>(x->dims(), place);
MLUCnnl::Log(ctx,
CNNL_COMPUTATION_HIGH_PRECISION,
CNNL_LOG_E,
x_desc.get(),
GetBasePtr(x),
x_desc.get(),
GetBasePtr(&log_x));
MLUCnnl::MulAx(ctx,
x_desc.get(),
GetBasePtr(&log_x),
out_desc.get(),
GetBasePtr(&tmp_dy));
MLUCnnl::MulAx(ctx,
out_desc.get(),
GetBasePtr(dout),
out_desc.get(),
GetBasePtr(&tmp_dy));
if (y_dims != dout_dims) {
dy->mutable_data<T>(place);
std::vector<int> reduce_axes;
GetReduceAxes(axis, dout_dims, y_dims, &reduce_axes);
if (!reduce_axes.empty()) {
MLUCnnlReduceDesc reduction_desc(reduce_axes,
CNNL_REDUCE_ADD,
data_type,
CNNL_NOT_PROPAGATE_NAN,
CNNL_REDUCE_NO_INDICES,
CNNL_32BIT_INDICES);
MLUCnnlTensorDesc dy_desc(*dy);
MLUCnnl::Reduce(ctx,
true /*need_workspace*/,
reduction_desc.get(),
nullptr,
out_desc.get(),
GetBasePtr(&tmp_dy),
0,
nullptr,
nullptr,
dy_desc.get(),
GetBasePtr(dy));
}
} else {
dy->ShareDataWith(tmp_dy);
}
}
if (!dx && !dy) {
PADDLE_THROW(platform::errors::Unavailable(
"Not support all outputs to be empty."));
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(elementwise_pow,
ops::ElementwisePowMLUKernel<plat::float16>,
ops::ElementwisePowMLUKernel<float>);
REGISTER_OP_MLU_KERNEL(elementwise_pow_grad,
ops::ElementwisePowGradMLUKernel<plat::float16>,
ops::ElementwisePowGradMLUKernel<float>);
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <memory>
#include <string>
#include "paddle/fluid/operators/elementwise/elementwise_mlu.h"
namespace paddle {
namespace operators {
template <typename T>
class ElementwiseSubMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
MLUOpTensorKernel<T>(ctx, CNNL_OP_TENSOR_SUB);
}
};
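// Gradient of z = x - y:
//   dx = dout,  dy = -dout
// each reduce-summed over the axes that were broadcast.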
template <typename T>
class ElementwiseSubGradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto& dev_ctx =
ctx.template device_context<paddle::platform::MLUDeviceContext>();
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* y = ctx.Input<phi::DenseTensor>("Y");
auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<phi::DenseTensor>(framework::GradVarName("Y"));
int axis = ctx.Attr<int>("axis");
axis = (axis == -1 ? std::abs(x->dims().size() - y->dims().size()) : axis);
MLUCnnlTensorDesc dout_desc(*dout);
if (dx) {
dx->mutable_data<T>(ctx.GetPlace());
if (dx->dims() != dout->dims()) {
std::vector<int> dst_dims_vec;
std::vector<int> reduce_axes;
GetReduceAxesAndDstDims(
axis, dout->dims(), dx->dims(), &reduce_axes, &dst_dims_vec);
MLUCnnlReduceDesc reduction_desc(reduce_axes,
CNNL_REDUCE_ADD,
ToCnnlDataType<T>(),
CNNL_NOT_PROPAGATE_NAN,
CNNL_REDUCE_NO_INDICES,
CNNL_32BIT_INDICES);
MLUCnnlTensorDesc dx_desc(
dst_dims_vec.size(), dst_dims_vec.data(), ToCnnlDataType<T>());
MLUCnnl::Reduce(ctx,
true /*need_workspace*/,
reduction_desc.get(),
nullptr,
dout_desc.get(),
GetBasePtr(dout),
0,
nullptr,
nullptr,
dx_desc.get(),
GetBasePtr(dx));
} else {
framework::TensorCopy(*dout, ctx.GetPlace(), dev_ctx, dx);
}
}
if (dy) {
dy->mutable_data<T>(ctx.GetPlace());
phi::DenseTensor* tmp_dout = const_cast<phi::DenseTensor*>(dout);
if (dy->dims() != dout->dims()) {
std::vector<int> dst_dims_vec;
std::vector<int> reduce_axes;
GetReduceAxesAndDstDims(
axis, dout->dims(), dy->dims(), &reduce_axes, &dst_dims_vec);
MLUCnnlReduceDesc reduction_desc(reduce_axes,
CNNL_REDUCE_ADD,
ToCnnlDataType<T>(),
CNNL_NOT_PROPAGATE_NAN,
CNNL_REDUCE_NO_INDICES,
CNNL_32BIT_INDICES);
MLUCnnlTensorDesc dy_desc(
dst_dims_vec.size(), dst_dims_vec.data(), ToCnnlDataType<T>());
MLUCnnl::Reduce(ctx,
true /*need_workspace*/,
reduction_desc.get(),
nullptr,
dout_desc.get(),
GetBasePtr(dout),
0,
nullptr,
nullptr,
dy_desc.get(),
GetBasePtr(dy));
tmp_dout = dy;
}
// call neg op, dy = -dout
MLUCnnlTensorDesc tmp_dout_desc(*tmp_dout);
MLUCnnlTensorDesc dy_desc(*dy);
MLUUnary<NEG>(ctx,
CNNL_COMPUTATION_HIGH_PRECISION,
tmp_dout_desc.get(),
GetBasePtr(tmp_dout),
dy_desc.get(),
GetBasePtr(dy));
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(elementwise_sub,
ops::ElementwiseSubMLUKernel<int>,
ops::ElementwiseSubMLUKernel<float>,
ops::ElementwiseSubMLUKernel<plat::float16>);
REGISTER_OP_MLU_KERNEL(elementwise_sub_grad,
ops::ElementwiseSubGradMLUKernel<int>,
ops::ElementwiseSubGradMLUKernel<float>,
ops::ElementwiseSubGradMLUKernel<plat::float16>);
......@@ -43,13 +43,6 @@ inline std::vector<int> get_expand_shape(
*shape_tensor, platform::CPUPlace(), &cpu_shape_tensor);
shape_data = cpu_shape_tensor.data<int>();
}
#endif
#ifdef PADDLE_WITH_MLU
if (platform::is_mlu_place(shape_tensor->place())) {
paddle::framework::TensorCopySync(
*shape_tensor, platform::CPUPlace(), &cpu_shape_tensor);
shape_data = cpu_shape_tensor.data<int>();
}
#endif
auto vec_shape =
std::vector<int>(shape_data, shape_data + shape_tensor->numel());
......@@ -74,13 +67,6 @@ inline std::vector<int> get_expand_shape(
paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), &temp);
vec_epxand_shape.push_back(*temp.data<int32_t>());
}
#endif
#ifdef PADDLE_WITH_MLU
else if (platform::is_mlu_place(tensor->place())) { // NOLINT
phi::DenseTensor temp;
paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), &temp);
vec_epxand_shape.push_back(*temp.data<int32_t>());
}
#endif
else { // NOLINT
vec_epxand_shape.push_back(*tensor->data<int32_t>());
......
......@@ -6,11 +6,7 @@ if(WITH_XPU)
endif()
# please add new math_library in alphabetical order
if(WITH_MLU)
math_library(concat_and_split DEPS concat_and_split_functor mlu_baseop)
else()
math_library(concat_and_split DEPS concat_and_split_functor)
endif()
math_library(concat_and_split DEPS concat_and_split_functor)
math_library(context_project DEPS im2col math_function)
math_library(cos_sim_functor)
math_library(depthwise_conv)
......
......@@ -17,9 +17,6 @@ limitations under the License. */
#include "paddle/phi/kernels/funcs/concat_and_split_functor.h"
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#endif
#include "paddle/phi/common/bfloat16.h"
#include "paddle/phi/common/float16.h"
......@@ -181,100 +178,6 @@ class SplitFunctor<platform::XPUDeviceContext, T> {
};
#endif
#ifdef PADDLE_WITH_MLU
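// MLU specializations of ConcatFunctor / SplitFunctor, implemented with the
// MLUCnnl::Concat and MLUCnnl::Split wrappers.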
template <typename T>
class ConcatFunctor<platform::MLUDeviceContext, T> {
public:
void operator()(const platform::MLUDeviceContext& context,
const std::vector<phi::DenseTensor>& input,
int axis,
phi::DenseTensor* output) {
int dev_id = context.GetPlace().GetDeviceId();
platform::MLUDeviceGuard guard(dev_id);
auto ins_size = input.size();
const int axis_t = axis;
const int ins_size_t = ins_size;
// build a CNNL tensor descriptor and a raw data pointer for every input
std::vector<const void*> inputs;
std::vector<MLUCnnlTensorDesc> input_descs;
std::vector<cnnlTensorDescriptor_t> desc_vector;
for (size_t i = 0; i < ins_size; i++) {
input_descs.emplace_back(MLUCnnlTensorDesc(
input[i], CNNL_LAYOUT_ARRAY, ToCnnlDataType(input[i].dtype())));
desc_vector.push_back(input_descs.back().get());
inputs.push_back(input[i].data());
}
// init out tensors
MLUCnnlTensorDesc output_desc(
*output, CNNL_LAYOUT_ARRAY, ToCnnlDataType(output->dtype()));
// concatenate all inputs along `axis_t` with a single MLUCnnl::Concat call
MLUCnnl::Concat(context,
ins_size_t,
axis_t,
desc_vector.data(),
inputs.data(),
output_desc.get(),
GetBasePtr(output));
}
};
template <typename T>
class SplitFunctor<platform::MLUDeviceContext, T> {
public:
void operator()(const platform::MLUDeviceContext& context,
const phi::DenseTensor& input,
const std::vector<const phi::DenseTensor*>& ref_inputs,
const int axis,
std::vector<phi::DenseTensor*>* outputs) {
if (input.numel() == 0) {
return;
}
int dev_id = context.GetPlace().GetDeviceId();
platform::MLUDeviceGuard guard(dev_id);
auto in_dims = input.dims();
auto out_size = outputs->size();
std::vector<framework::DDim> outs_dims(out_size, in_dims);
for (size_t i = 0; i < out_size; ++i) {
outs_dims[i][axis] = ref_inputs[i]->dims()[axis];
}
// init out tensors
std::vector<void*> vct_tensor;
std::vector<MLUCnnlTensorDesc> output_descs;
std::vector<cnnlTensorDescriptor_t> desc_vector;
for (size_t i = 0; i < out_size; i++) {
(*outputs)[i]->Resize(outs_dims[i]);
output_descs.emplace_back(
MLUCnnlTensorDesc(*(*outputs)[i],
CNNL_LAYOUT_ARRAY,
ToCnnlDataType((*outputs)[i]->dtype())));
desc_vector.push_back(output_descs.back().get());
vct_tensor.push_back(GetBasePtr((*outputs)[i]));
}
// init in tensors
MLUCnnlTensorDesc input_desc(
input, CNNL_LAYOUT_ARRAY, ToCnnlDataType(input.dtype()));
// split `input` along `axis` into the pre-sized output tensors
MLUCnnl::Split(context,
out_size,
axis,
input_desc.get(),
input.data(),
desc_vector.data(),
vct_tensor.data());
}
};
#endif
#define DEFINE_FUNCTOR(type) \
template class ConcatFunctor<phi::CPUContext, type>; \
template class SplitFunctor<phi::CPUContext, type>;
......@@ -289,20 +192,6 @@ FOR_ALL_TYPES(DEFINE_FUNCTOR);
DEFINE_XPU_FUNCTOR(float)
DEFINE_XPU_FUNCTOR(platform::float16)
#endif
#ifdef PADDLE_WITH_MLU
#define DEFINE_MLU_FUNCTOR(type) \
template class ConcatFunctor<platform::MLUDeviceContext, type>; \
template class SplitFunctor<platform::MLUDeviceContext, type>;
DEFINE_MLU_FUNCTOR(float)
DEFINE_MLU_FUNCTOR(platform::float16)
DEFINE_MLU_FUNCTOR(int64_t)
DEFINE_MLU_FUNCTOR(bool)
DEFINE_MLU_FUNCTOR(int)
DEFINE_MLU_FUNCTOR(int8_t)
DEFINE_MLU_FUNCTOR(int16_t)
DEFINE_MLU_FUNCTOR(uint8_t)
#endif
} // namespace math
} // namespace operators
} // namespace paddle
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
namespace paddle {
namespace operators {
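// Top-k accuracy: a sample counts as correct when any of its predicted
// indices equals the label; correct = sum_i max_k(indices[i][k] == label[i]),
// total = num_samples, accuracy = correct / total.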
template <typename T>
class AccuracyMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* indices = ctx.Input<phi::DenseTensor>("Indices");
auto* label = ctx.Input<phi::DenseTensor>("Label");
auto* accuracy = ctx.Output<phi::DenseTensor>("Accuracy");
auto* correct = ctx.Output<phi::DenseTensor>("Correct");
auto* total = ctx.Output<phi::DenseTensor>("Total");
int num_samples = indices->dims()[0];
if (num_samples == 0) {
return;
}
// cast `indices` or `label` if their type is not INT32
phi::DenseTensor indices_int32(framework::TransToPhiDataType(VT::INT32));
phi::DenseTensor label_int32(framework::TransToPhiDataType(VT::INT32));
auto indices_type = framework::TransToProtoVarType(indices->type());
if (indices_type != VT::INT32) {
PADDLE_ENFORCE_EQ(MLUSupportsCast(indices_type, VT::INT32),
true,
platform::errors::Unimplemented(
"In accuracy mlu kernel, cast indices from [%s] to "
"[%s] is not supported.",
framework::DataTypeToString(indices_type),
framework::DataTypeToString(VT::INT32)));
indices_int32.Resize(indices->dims());
indices_int32.mutable_data<int>(ctx.GetPlace());
MLUCnnlTensorDesc org_indices_desc(*indices);
MLUCnnlTensorDesc indices_int32_desc(indices_int32);
cnnlCastDataType_t cast_type = GetCastDataType(indices_type, VT::INT32);
MLUCnnl::Cast(ctx,
cast_type,
org_indices_desc.get(),
GetBasePtr(indices),
indices_int32_desc.get(),
GetBasePtr(&indices_int32));
} else {
indices_int32.ShareDataWith(*indices);
}
auto label_type = framework::TransToProtoVarType(label->type());
if (label_type != VT::INT32) {
PADDLE_ENFORCE_EQ(
MLUSupportsCast(label_type, VT::INT32),
true,
platform::errors::Unimplemented(
"In accuracy mlu kernel, cast label from [%s] to [%s] "
"is not supported.",
framework::DataTypeToString(label_type),
framework::DataTypeToString(VT::INT32)));
label_int32.Resize(label->dims());
label_int32.mutable_data<int>(ctx.GetPlace());
MLUCnnlTensorDesc org_label_desc(*label);
MLUCnnlTensorDesc label_int32_desc(label_int32);
cnnlCastDataType_t cast_type = GetCastDataType(label_type, VT::INT32);
MLUCnnl::Cast(ctx,
cast_type,
org_label_desc.get(),
GetBasePtr(label),
label_int32_desc.get(),
GetBasePtr(&label_int32));
} else {
label_int32.ShareDataWith(*label);
}
// equal
MLUCnnlTensorDesc indices_int32_desc(indices_int32);
MLUCnnlTensorDesc label_int32_desc(label_int32);
phi::DenseTensor equal_tensor(framework::TransToPhiDataType(VT::BOOL));
equal_tensor.Resize(indices->dims());
equal_tensor.mutable_data<bool>(ctx.GetPlace());
MLUCnnlTensorDesc equal_tensor_desc(equal_tensor);
MLUCnnl::Logic(ctx,
CNNL_LOGIC_OP_EQ,
indices_int32_desc.get(),
GetBasePtr(&indices_int32),
label_int32_desc.get(),
GetBasePtr(&label_int32),
equal_tensor_desc.get(),
GetBasePtr(&equal_tensor));
// cast equal
phi::DenseTensor equal_fp32(framework::TransToPhiDataType(VT::FP32));
equal_fp32.Resize(indices->dims());
equal_fp32.mutable_data<float>(ctx.GetPlace());
MLUCnnlTensorDesc equal_fp32_desc(equal_fp32);
cnnlCastDataType_t equal_cast_type = GetCastDataType(VT::BOOL, VT::FP32);
MLUCnnl::Cast(ctx,
equal_cast_type,
equal_tensor_desc.get(),
GetBasePtr(&equal_tensor),
equal_fp32_desc.get(),
GetBasePtr(&equal_fp32));
// [correct]
// reduce_max
phi::DenseTensor correct_max(framework::TransToPhiDataType(VT::FP32));
correct_max.Resize(phi::make_ddim({num_samples}));
correct_max.mutable_data<float>(ctx.GetPlace());
MLUCnnlTensorDesc correct_max_desc(correct_max);
MLUCnnlReduceDesc reduce_max_desc({1},
CNNL_REDUCE_MAX,
ToCnnlDataType<float>(),
CNNL_NOT_PROPAGATE_NAN,
CNNL_REDUCE_NO_INDICES,
CNNL_32BIT_INDICES);
MLUCnnl::Reduce(ctx,
true /*need_workspace*/,
reduce_max_desc.get(),
nullptr,
equal_fp32_desc.get(),
GetBasePtr(&equal_fp32),
0 /*indices_size*/,
nullptr,
nullptr,
correct_max_desc.get(),
GetBasePtr(&correct_max));
// reduce_sum
phi::DenseTensor correct_sum(framework::TransToPhiDataType(VT::FP32));
correct_sum.Resize(correct->dims());
correct_sum.mutable_data<float>(ctx.GetPlace());
MLUCnnlTensorDesc correct_sum_desc(correct_sum);
MLUCnnlReduceDesc reduce_sum_desc({0},
CNNL_REDUCE_ADD,
ToCnnlDataType<float>(),
CNNL_NOT_PROPAGATE_NAN,
CNNL_REDUCE_NO_INDICES,
CNNL_32BIT_INDICES);
MLUCnnl::Reduce(ctx,
true /*need_workspace*/,
reduce_sum_desc.get(),
nullptr,
correct_max_desc.get(),
GetBasePtr(&correct_max),
0 /*indices_size*/,
nullptr,
nullptr,
correct_sum_desc.get(),
GetBasePtr(&correct_sum));
// cast to int
correct->mutable_data<int>(ctx.GetPlace());
MLUCnnlTensorDesc correct_desc(*correct);
cnnlCastDataType_t correct_cast_type = GetCastDataType(VT::FP32, VT::INT32);
MLUCnnl::Cast(ctx,
correct_cast_type,
correct_sum_desc.get(),
GetBasePtr(&correct_sum),
correct_desc.get(),
GetBasePtr(correct));
// [total]
total->mutable_data<int>(ctx.GetPlace());
MLUCnnlTensorDesc total_desc(*total);
MLUCnnl::Fill(ctx,
CNNL_POINTER_MODE_HOST,
&num_samples,
total_desc.get(),
GetBasePtr(total));
// use `total` of type `float32` for calculating accuracy
phi::DenseTensor total_fp32(framework::TransToPhiDataType(VT::FP32));
total_fp32.Resize(total->dims());
total_fp32.mutable_data<float>(ctx.GetPlace());
MLUCnnlTensorDesc total_fp32_desc(total_fp32);
float num_samples_fp32 = static_cast<float>(num_samples);
MLUCnnl::Fill(ctx,
CNNL_POINTER_MODE_HOST,
&num_samples_fp32,
total_fp32_desc.get(),
GetBasePtr(&total_fp32));
// [accuracy]
accuracy->mutable_data<float>(ctx.GetPlace());
MLUCnnlTensorDesc accuracy_desc(*accuracy);
MLUCnnl::Div(ctx,
CNNL_COMPUTATION_HIGH_PRECISION,
correct_sum_desc.get(),
GetBasePtr(&correct_sum),
total_fp32_desc.get(),
GetBasePtr(&total_fp32),
accuracy_desc.get(),
GetBasePtr(accuracy));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_MLU_KERNEL(accuracy,
ops::AccuracyMLUKernel<float>,
ops::AccuracyMLUKernel<paddle::platform::float16>,
ops::AccuracyMLUKernel<int16_t>,
ops::AccuracyMLUKernel<int64_t>,
ops::AccuracyMLUKernel<uint8_t>,
ops::AccuracyMLUKernel<int>);
if(WITH_MLU)
cc_library(
mlu_baseop
SRCS mlu_baseop.cc
DEPS neuware_lib device_context)
cc_test(
activation_op_mlu_test
SRCS activation_op_mlu_test.cc
DEPS op_registry activation_op scope device_context executor)
endif()
if(WITH_MLU)
set(MLU_INFO mlu_info)
endif()
cc_library(
mlu_tracer
SRCS mlu_tracer.cc cnpapi_data_process.cc
DEPS workqueue_utils enforce glog ${MLU_INFO})