Unverified commit 7f8d5bc8, authored by Q qipengh and committed by GitHub

[MLU] Add mean and reduce_mean op (#38872)

* [MLU] Add mean and reduce_mean ops

* [MLU] Add MLU pytest dir in CMakeLists.txt

* [MLU] Fix tensor data

* [MLU] Fix TensorToPyArray and license
Parent 9e0686ed
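For orientation, a minimal usage sketch of what this commit enables (illustrative only, not part of the diff; assumes a Paddle build with WITH_MLU=ON and an attached MLU device 0):

import numpy as np
import paddle
import paddle.fluid as fluid

paddle.enable_static()  # the new kernels are exercised through the static graph, as in the tests below

main_prog = paddle.static.Program()
startup_prog = paddle.static.Program()
with paddle.static.program_guard(main_prog, startup_prog):
    x = paddle.static.data(name="x", shape=[5, 6, 10], dtype="float32")
    y = fluid.layers.mean(x)                  # "mean" op -> MeanMLUKernel
    z = fluid.layers.reduce_mean(x, dim=[0])  # "reduce_mean" op -> ReduceMeanMLUKernel

exe = paddle.static.Executor(paddle.MLUPlace(0))
exe.run(startup_prog)
x_np = np.random.random((5, 6, 10)).astype("float32")
y_res, z_res = exe.run(main_prog, feed={"x": x_np}, fetch_list=[y, z])
assert np.isclose(y_res[0], x_np.mean()) and np.allclose(z_res, x_np.mean(axis=0))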
@@ -396,7 +396,8 @@ void TensorCopyImpl(const TENSOR& src, const platform::Place& dst_place,
                     TENSOR* dst) {
   platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
   const platform::DeviceContext* dev_ctx;
-  if (platform::is_gpu_place(dst_place) || platform::is_npu_place(dst_place)) {
+  if (platform::is_gpu_place(dst_place) || platform::is_npu_place(dst_place) ||
+      platform::is_mlu_place(dst_place)) {
     dev_ctx = pool.Get(dst_place);
   } else {
     dev_ctx = pool.Get(src.place());
@@ -1048,6 +1049,29 @@ void TensorToStream(std::ostream& os, const Tensor& tensor,
 #else
     PADDLE_THROW(platform::errors::Unimplemented(
         "XPUPlace is not supported when not compiled with XPU"));
+#endif
+  } else if (platform::is_mlu_place(tensor.place())) {
+#ifdef PADDLE_WITH_MLU
+    constexpr size_t kBufSize = 1024 * 1024 * 64;  // 64MB
+    std::unique_ptr<char[]> buf(new char[kBufSize]);
+    auto& mlu_dev_ctx =
+        static_cast<const platform::MLUDeviceContext&>(dev_ctx);
+    platform::CPUPlace cpu;
+    uintptr_t data = reinterpret_cast<uintptr_t>(data_ptr);
+    while (size != 0) {
+      size_t size_to_write = std::min(kBufSize, static_cast<size_t>(size));
+      memory::Copy(cpu, buf.get(),
+                   BOOST_GET_CONST(platform::MLUPlace, tensor.place()),
+                   reinterpret_cast<const void*>(data), size_to_write,
+                   mlu_dev_ctx.stream());
+      mlu_dev_ctx.Wait();
+      os.write(buf.get(), size_to_write);
+      data += size_to_write;
+      size -= size_to_write;
+    }
+#else
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "MLUPlace is not supported when not compiled with MLU"));
 #endif
   } else if (platform::is_npu_place(tensor.place())) {
 #ifdef PADDLE_WITH_ASCEND_CL
@@ -1127,9 +1151,11 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
     size_t size = tensor->numel() * framework::SizeOfType(desc.data_type());
     if (platform::is_gpu_place(dev_ctx.GetPlace()) ||
         platform::is_xpu_place(dev_ctx.GetPlace()) ||
+        platform::is_mlu_place(dev_ctx.GetPlace()) ||
         platform::is_npu_place(dev_ctx.GetPlace())) {
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
-    defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_ASCEND_CL)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
+    defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_MLU) || \
+    defined(PADDLE_WITH_ASCEND_CL)
       Tensor cpu_tensor;
       cpu_tensor.Resize(framework::make_ddim(shape));
       framework::VisitDataType(
@@ -1148,6 +1174,9 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
     } else if (platform::is_xpu_place(dev_ctx.GetPlace())) {
       PADDLE_THROW(platform::errors::Unimplemented(
           "XPUPlace is not supported when not compiled with XPU"));
+    } else if (platform::is_mlu_place(dev_ctx.GetPlace())) {
+      PADDLE_THROW(platform::errors::Unimplemented(
+          "MLUPlace is not supported when not compiled with MLU"));
     } else {
       PADDLE_THROW(platform::errors::Unimplemented(
           "NPUPlace is not supported when not compiled with NPU"));
@@ -1192,9 +1221,11 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
     size_t size = tensor->numel() * framework::SizeOfType(desc.data_type());
     if (platform::is_gpu_place(dev_ctx.GetPlace()) ||
         platform::is_xpu_place(dev_ctx.GetPlace()) ||
+        platform::is_mlu_place(dev_ctx.GetPlace()) ||
         platform::is_npu_place(dev_ctx.GetPlace())) {
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
-    defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_ASCEND_CL)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
+    defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_MLU) || \
+    defined(PADDLE_WITH_ASCEND_CL)
       Tensor cpu_tensor;
       cpu_tensor.Resize(framework::make_ddim(dims));
       framework::VisitDataType(
@@ -1213,6 +1244,9 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
     } else if (platform::is_xpu_place(dev_ctx.GetPlace())) {
       PADDLE_THROW(platform::errors::Unimplemented(
           "XPUPlace is not supported when not compiled with XPU"));
+    } else if (platform::is_mlu_place(dev_ctx.GetPlace())) {
+      PADDLE_THROW(platform::errors::Unimplemented(
+          "MLUPlace is not supported when not compiled with MLU"));
     } else {
       PADDLE_THROW(platform::errors::Unimplemented(
           "NPUPlace is not supported when not compiled with NPU"));
...
@@ -231,9 +231,8 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool(
     allocate_bytes = DeviceAllocateSize(&platform::NPUInitAllocSize,
                                         &platform::NPUReallocSize, request_bytes);
 #elif defined(PADDLE_WITH_MLU)
-    allocate_bytes =
-        DeviceAllocateSize(&platform::MLUInitAllocSize(),
-                           &platform::MLUReallocSize(), request_bytes);
+    allocate_bytes = DeviceAllocateSize(&platform::MLUInitAllocSize,
+                                        &platform::MLUReallocSize, request_bytes);
 #endif
   // Allocate a new block
...
@@ -508,6 +508,9 @@ void Copy<platform::CPUPlace, platform::MLUPlace>(platform::CPUPlace dst_place,
     platform::RecordEvent record_event("MLUMemcpyD2HAsync:MLU->CPU");
     platform::MLUMemcpyD2HAsync(dst, src, num, stream);
   } else {
+    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+    static_cast<platform::MLUDeviceContext*>(pool.Get(src_place))->Wait();
+
     VLOG(4) << "Sync memory::Copy " << num << " Bytes from " << src_place
             << " to " << dst_place;
     platform::RecordEvent record_event("MLUMemcpyD2HSync:MLU->CPU");
@@ -530,6 +533,9 @@ void Copy<platform::MLUPlace, platform::CPUPlace>(platform::MLUPlace dst_place,
     platform::RecordEvent record_event("MLUMemcpyH2DAsync:CPU->MLU");
     platform::MLUMemcpyH2DAsync(dst, src, num, stream);
   } else {
+    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+    static_cast<platform::MLUDeviceContext*>(pool.Get(src_place))->Wait();
+
     VLOG(4) << "Sync memory::Copy " << num << " Bytes from " << src_place
             << " to " << dst_place;
     platform::RecordEvent record_event("MLUMemcpyH2DSync:CPU->MLU");
@@ -554,6 +560,10 @@ void Copy<platform::MLUPlace, platform::MLUPlace>(platform::MLUPlace dst_place,
                                        "MLUMemcpyD2DAsync(same_mlu):MLU->MLU");
     platform::MLUMemcpyD2DAsync(dst, src, num, stream);
   } else {
+    platform::DeviceContextPool& pool =
+        platform::DeviceContextPool::Instance();
+    static_cast<platform::MLUDeviceContext*>(pool.Get(src_place))->Wait();
+
     VLOG(4) << "Sync memory::Copy " << num << " Bytes from " << src_place
             << " to " << dst_place;
     platform::RecordEvent record_event("MLUMemcpyD2DSync(same_mlu):MLU->MLU");
...
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/mean_op.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#include "paddle/fluid/platform/device/mlu/device_context.h"
#include "paddle/fluid/platform/float16.h"
namespace paddle {
namespace operators {
template <typename T>
class MeanMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* input = context.Input<Tensor>("X");
auto* output = context.Output<Tensor>("Out");
const T* in_data = input->data<T>();
T* out_data = output->mutable_data<T>(context.GetPlace());
auto numel = input->numel();
auto rank = input->dims().size();
auto place = context.GetPlace();
auto stream = context.template device_context<MLUDeviceContext>().stream();
if (rank == 0) { // scalar
auto mlu_place = BOOST_GET(platform::MLUPlace, place);
memory::Copy(mlu_place, out_data, mlu_place, in_data, numel * sizeof(T),
stream);
return;
}
std::vector<int> reduce_dims;
reduce_dims.reserve(rank);
for (decltype(rank) i = 0; i < rank; ++i) {
reduce_dims.push_back(i);
}
MLUCnnlTensorDesc input_desc(*input, CNNL_LAYOUT_ARRAY,
ToCnnlDataType(input->type()));
MLUCnnlTensorDesc output_desc(*output, CNNL_LAYOUT_ARRAY,
ToCnnlDataType(output->type()));
MLUCnnlReduceDesc reduction_desc(
reduce_dims, CNNL_REDUCE_AVG, ToCnnlDataType<T>(),
CNNL_NOT_PROPAGATE_NAN, CNNL_REDUCE_NO_INDICES, CNNL_32BIT_INDICES);
MLUCnnl::Reduce(context, true /*need_workspace*/, reduction_desc.get(),
nullptr, input_desc.get(),
reinterpret_cast<const void*>(in_data), 0 /*indices_size*/,
nullptr, nullptr, output_desc.get(),
reinterpret_cast<void*>(out_data));
}
};
template <typename T>
class MeanMLUGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto output_grad = context.Input<Tensor>(framework::GradVarName("Out"));
PADDLE_ENFORCE_EQ(output_grad->numel(), 1,
platform::errors::InvalidArgument(
"Mean Gradient Input Tensor len should be 1. But "
"received Out@Grad's elements num is %d.",
output_grad->numel()));
auto input_grad = context.Output<Tensor>(framework::GradVarName("X"));
input_grad->mutable_data<T>(context.GetPlace());
auto in_data = output_grad->data<T>();
auto numel = input_grad->numel();
auto rank = input_grad->dims().size();
auto out_data = input_grad->data<T>();
auto place = context.GetPlace();
auto stream = context.template device_context<MLUDeviceContext>().stream();
if (rank == 0) { // scalar
auto mlu_place = BOOST_GET(platform::MLUPlace, place);
memory::Copy(mlu_place, out_data, mlu_place, in_data, numel * sizeof(T),
stream);
return;
}
// means
Tensor mean_var(output_grad->type());
mean_var.mutable_data<T>(input_grad->dims(), context.GetPlace());
MLUCnnlTensorDesc mean_var_desc(mean_var, CNNL_LAYOUT_ARRAY,
ToCnnlDataType(mean_var.type()));
auto value = static_cast<T>(1.0 / static_cast<float>(input_grad->numel()));
MLUCnnl::Fill(context, value, mean_var_desc.get(), GetBasePtr(&mean_var));
// means mul output_grad
MLUCnnlTensorDesc in_desc(*output_grad, CNNL_LAYOUT_ARRAY,
ToCnnlDataType(output_grad->type()));
MLUCnnlTensorDesc out_desc(*input_grad, CNNL_LAYOUT_ARRAY,
ToCnnlDataType(input_grad->type()));
MLUCnnlOpTensorDesc op_tensor_desc(CNNL_OP_TENSOR_MUL, ToCnnlDataType<T>(),
CNNL_NOT_PROPAGATE_NAN);
MLUCnnl::OpTensor(context, op_tensor_desc.get(), in_desc.get(),
reinterpret_cast<const void*>(in_data),
mean_var_desc.get(), GetBasePtr(&mean_var),
out_desc.get(), reinterpret_cast<void*>(out_data),
ToCnnlDataType<T>());
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(mean, ops::MeanMLUKernel<float>,
ops::MeanMLUKernel<plat::float16>);
REGISTER_OP_MLU_KERNEL(mean_grad, ops::MeanMLUGradKernel<float>,
ops::MeanMLUGradKernel<plat::float16>);
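For reference, the backward kernel above computes d(mean)/dx_i = 1/numel by filling a tensor with that constant (MLUCnnl::Fill) and multiplying it by Out@Grad (MLUCnnl::OpTensor with CNNL_OP_TENSOR_MUL). The same math as a numpy sketch (what the kernel computes, not the MLU code itself):

import numpy as np

def mean_grad(x_shape, out_grad):
    # every element of X contributes 1/numel to the scalar mean
    numel = int(np.prod(x_shape))
    return np.full(x_shape, 1.0 / numel, dtype=out_grad.dtype) * out_grad

g = mean_grad((3, 4), np.asarray(1.0, dtype=np.float32))
assert np.allclose(g, np.full((3, 4), 1.0 / 12, dtype=np.float32))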
@@ -45,12 +45,22 @@ enum MLULogicMethod {
   CNNL_LOGIC_OP_OR = 7,
 };
 
+inline const void* GetBasePtr(const Tensor* t) { return t->data(); }
+
+inline void* GetBasePtr(Tensor* t) { return t->data(); }
+
 template <typename T>
 inline cnnlDataType_t ToCnnlDataType(const T& t) {
   auto type = framework::ToDataType(t);
   return ToCnnlDataType(type);
 }
 
+template <typename T>
+inline cnnlDataType_t ToCnnlDataType() {
+  auto type = framework::ToDataType(std::type_index(typeid(T)));
+  return ToCnnlDataType(type);
+}
+
 template <>
 inline cnnlDataType_t ToCnnlDataType(const framework::proto::VarType::Type& t) {
   cnnlDataType_t type = CNNL_DTYPE_FLOAT;
@@ -89,11 +99,12 @@ NarrowT CheckedNarrowing(const WideT& wide) {
   return narrow;
 }
 
-static cnnlHandle_t GetHandleFromCTX(const ExecutionContext& ctx) {
+inline static cnnlHandle_t GetHandleFromCTX(const ExecutionContext& ctx) {
   return ctx.template device_context<MLUDeviceContext>().cnnl_handle();
 }
 
-static const MLUDeviceContext& GetDevCtxFromCTX(const ExecutionContext& ctx) {
+inline static const MLUDeviceContext& GetDevCtxFromCTX(
+    const ExecutionContext& ctx) {
   return ctx.template device_context<MLUDeviceContext>();
 }
...
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/reduce_ops/reduce_mean_op.h"
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#include "paddle/fluid/platform/device/mlu/device_context.h"
namespace paddle {
namespace operators {
template <typename T>
class ReduceMeanMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* input = context.Input<Tensor>("X");
auto* output = context.Output<Tensor>("Out");
output->mutable_data<T>(context.GetPlace());
bool reduce_all = context.Attr<bool>("reduce_all");
auto dims = context.Attr<std::vector<int>>("dim");
auto input_dims = framework::vectorize(input->dims());
const auto& input_dim_size = input->dims().size();
std::vector<int> reduce_dims;
if (reduce_all) {
for (size_t i = 0; i < input_dims.size(); i++) {
reduce_dims.push_back(static_cast<int>(i));
}
} else {
for (size_t i = 0; i < dims.size(); ++i) {
if (dims[i] < 0) {
reduce_dims.push_back(dims[i] + input_dim_size);
} else {
reduce_dims.push_back(dims[i]);
}
}
}
MLUCnnlTensorDesc input_desc(*input, CNNL_LAYOUT_ARRAY,
ToCnnlDataType(input->type()));
MLUCnnlTensorDesc output_desc(*output, CNNL_LAYOUT_ARRAY,
ToCnnlDataType(output->type()));
MLUCnnlReduceDesc reduction_desc(
reduce_dims, CNNL_REDUCE_AVG, ToCnnlDataType<T>(),
CNNL_NOT_PROPAGATE_NAN, CNNL_REDUCE_NO_INDICES, CNNL_32BIT_INDICES);
MLUCnnl::Reduce(context, true /*need_workspace*/, reduction_desc.get(),
nullptr, input_desc.get(), GetBasePtr(input),
0 /*indices_size*/, nullptr, nullptr, output_desc.get(),
GetBasePtr(output));
}
};
template <typename T>
class ReduceMeanGradMLUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* input = context.Input<Tensor>("X");
auto* output_grad = context.Input<Tensor>(framework::GradVarName("Out"));
auto* input_grad = context.Output<Tensor>(framework::GradVarName("X"));
input_grad->mutable_data<T>(context.GetPlace());
bool reduce_all = context.Attr<bool>("reduce_all");
auto reduce_dims = context.Attr<std::vector<int>>("dim");
auto input_dims = framework::vectorize(input->dims());
int reduce_numel = 1;
if (reduce_all) {
reduce_dims.clear();
for (size_t d = 0; d < input_dims.size(); ++d) {
reduce_dims.push_back(static_cast<int>(d));
}
}
for (auto& d : reduce_dims) {
if (d < 0) {
d = d + input_dims.size();
}
reduce_numel *= input_dims[d];
}
Tensor tmp_output_grad(output_grad->type());
auto tmp_output_dims = input_dims;
for (auto d : reduce_dims) {
tmp_output_dims[d] = 1;
}
tmp_output_grad.ShareDataWith(*output_grad);
tmp_output_grad.Resize(framework::make_ddim(tmp_output_dims));
MLUCnnlTensorDesc output_grad_desc(tmp_output_grad, CNNL_LAYOUT_ARRAY,
ToCnnlDataType(tmp_output_grad.type()));
MLUCnnlTensorDesc input_grad_desc(*input_grad, CNNL_LAYOUT_ARRAY,
ToCnnlDataType(input_grad->type()));
auto value = static_cast<T>(1.0 / static_cast<float>(reduce_numel));
MLUCnnl::Fill(context, value, input_grad_desc.get(),
GetBasePtr(input_grad));
MLUCnnlOpTensorDesc op_tensor_desc(CNNL_OP_TENSOR_MUL, ToCnnlDataType<T>(),
CNNL_NOT_PROPAGATE_NAN);
MLUCnnl::OpTensor(context, op_tensor_desc.get(), output_grad_desc.get(),
GetBasePtr(&tmp_output_grad), input_grad_desc.get(),
GetBasePtr(input_grad), input_grad_desc.get(),
GetBasePtr(input_grad), ToCnnlDataType<T>());
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_MLU_KERNEL(reduce_mean, ops::ReduceMeanMLUKernel<float>,
ops::ReduceMeanMLUKernel<plat::float16>);
REGISTER_OP_MLU_KERNEL(reduce_mean_grad, ops::ReduceMeanGradMLUKernel<float>,
ops::ReduceMeanGradMLUKernel<plat::float16>);
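The reduce_mean grad kernel reshapes Out@Grad so the reduced axes have extent 1, then broadcast-multiplies it by a tensor filled with 1/reduce_numel. Equivalent numpy (a sketch of the computation, not the MLU code):

import numpy as np

def reduce_mean_grad(x_shape, out_grad, dims):
    # normalize negative axes, as the kernel does with d + input_dims.size()
    dims = [d % len(x_shape) for d in dims]
    reduce_numel = int(np.prod([x_shape[d] for d in dims]))
    # give the reduced axes extent 1 so numpy broadcasting restores X's shape
    tmp_shape = [1 if i in dims else s for i, s in enumerate(x_shape)]
    return np.broadcast_to(out_grad.reshape(tmp_shape), x_shape) / reduce_numel

g = reduce_mean_grad((5, 6, 10), np.ones((6, 10), np.float32), dims=[0])
assert np.allclose(g, np.full((5, 6, 10), 1.0 / 5, dtype=np.float32))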
@@ -232,6 +232,13 @@ T TensorGetElement(const framework::Tensor &self, size_t offset) {
     auto p = BOOST_GET_CONST(platform::CUDAPlace, self.place());
     paddle::memory::Copy(platform::CPUPlace(), &b, p, a + offset, sizeof(T),
                          nullptr);
+#endif
+  } else if (platform::is_mlu_place(self.place())) {
+#ifdef PADDLE_WITH_MLU
+    const T *a = self.data<T>();
+    auto p = BOOST_GET_CONST(platform::MLUPlace, self.place());
+    paddle::memory::Copy(platform::CPUPlace(), &b, p, a + offset, sizeof(T),
+                         nullptr);
 #endif
   } else if (platform::is_npu_place(self.place())) {
 #if defined(PADDLE_WITH_ASCEND_CL)
@@ -267,6 +274,13 @@ void TensorSetElement(framework::Tensor *self, size_t offset, T elem) {
     T *a = self->mutable_data<T>(p);
     paddle::memory::Copy(p, a + offset, platform::CPUPlace(), &elem, sizeof(T),
                          nullptr);
+#endif
+  } else if (platform::is_mlu_place(self->place())) {
+#ifdef PADDLE_WITH_MLU
+    auto p = BOOST_GET_CONST(platform::MLUPlace, self->place());
+    T *a = self->mutable_data<T>(p);
+    paddle::memory::Copy(p, a + offset, platform::CPUPlace(), &elem, sizeof(T),
+                         nullptr);
 #endif
   } else if (platform::is_npu_place(self->place())) {
 #if defined(PADDLE_WITH_ASCEND_CL)
@@ -543,6 +557,11 @@ inline framework::Tensor *_getTensor(const framework::Tensor &self,
 #ifdef PADDLE_WITH_XPU
     output->mutable_data(BOOST_GET_CONST(platform::XPUPlace, place),
                          self.type());
+#endif
+  } else if (platform::is_mlu_place(place)) {
+#ifdef PADDLE_WITH_MLU
+    output->mutable_data(BOOST_GET_CONST(platform::MLUPlace, place),
+                         self.type());
 #endif
   } else {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
@@ -845,8 +864,13 @@ inline py::array TensorToPyArray(const framework::Tensor &tensor,
     size_t copy_bytes = sizeof_dtype * numel;
     auto p = BOOST_GET_CONST(platform::MLUPlace, tensor.place());
-    paddle::memory::Copy(platform::CPUPlace(), py_arr.mutable_data(), p,
-                         tensor_buf_ptr, copy_bytes, nullptr);
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &ctx = *pool.Get(tensor.place());
+    paddle::memory::Copy(
+        platform::CPUPlace(), py_arr.mutable_data(), p, tensor_buf_ptr,
+        copy_bytes,
+        reinterpret_cast<const platform::MLUDeviceContext &>(ctx).stream());
+    ctx.Wait();
     return py_arr;
 #else
     PADDLE_THROW(platform::errors::PermissionDenied(
...
@@ -803,6 +803,10 @@ if (WITH_MKLDNN)
   add_subdirectory(mkldnn)
 endif()
 
+if (WITH_MLU)
+  add_subdirectory(mlu)
+endif()
+
 add_subdirectory(asp)
 add_subdirectory(ir)
...
file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
if (WITH_MLU)
foreach(TEST_OP ${TEST_OPS})
py_test_modules(${TEST_OP} MODULES ${TEST_OP})
endforeach(TEST_OP)
endif()
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import numpy as np
import unittest
import sys
sys.path.append("..")
from op_test import OpTest
import paddle
import paddle.fluid as fluid
from paddle.fluid import core
paddle.enable_static()
SEED = 2021
class TestMean(OpTest):
def setUp(self):
self.set_mlu()
self.place = paddle.device.MLUPlace(0)
self.op_type = "mean"
self.init_dtype()
x = np.random.random([1, 100]).astype(self.dtype)
self.inputs = {'X': x}
self.attrs = {}
np_out = np.mean(x)
self.outputs = {'Out': np_out}
def set_mlu(self):
self.__class__.use_mlu = True
def init_dtype(self):
self.dtype = np.float32
def test_check_output(self):
self.check_output_with_place(self.place)
def test_check_grad(self):
self.check_grad_with_place(self.place, ['X'], 'Out')
class TestMeanFP16(OpTest):
def setUp(self):
self.set_mlu()
self.place = paddle.MLUPlace(0)
self.op_type = "mean"
self.init_dtype()
x = np.random.random([3, 200]).astype(self.dtype)
self.inputs = {'X': x}
self.attrs = {}
np_out = np.mean(x)
self.outputs = {'Out': np_out}
def set_mlu(self):
self.__class__.use_mlu = True
self.__class__.no_need_check_grad = True
def init_dtype(self):
self.dtype = np.float16
def test_check_output(self):
self.check_output_with_place(self.place)
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import numpy as np
import unittest
import sys
sys.path.append("..")
from op_test import OpTest
import paddle
import paddle.fluid as fluid
paddle.enable_static()
class TestMeanOp(OpTest):
def set_mlu(self):
self.__class__.use_mlu = True
self.place = paddle.device.MLUPlace(0)
def setUp(self):
self.set_mlu()
self.op_type = "reduce_mean"
self.inputs = {'X': np.random.random((5, 6, 10)).astype("float32")}
self.outputs = {'Out': self.inputs['X'].mean(axis=0)}
def test_check_output(self):
self.check_output_with_place(self.place)
def test_check_grad(self):
self.check_grad_with_place(self.place, ['X'], 'Out')
class TestMeanOp5D(TestMeanOp):
def setUp(self):
self.set_mlu()
self.op_type = "reduce_mean"
self.inputs = {
'X': np.random.random((1, 2, 5, 6, 10)).astype("float32")
}
self.outputs = {'Out': self.inputs['X'].mean(axis=0)}
class TestMeanOp6D(TestMeanOp):
def setUp(self):
self.set_mlu()
self.op_type = "reduce_mean"
self.inputs = {
'X': np.random.random((1, 1, 2, 5, 6, 10)).astype("float32")
}
self.outputs = {'Out': self.inputs['X'].mean(axis=0)}
class TestMeanOp8D(TestMeanOp):
def setUp(self):
self.set_mlu()
self.op_type = "reduce_mean"
self.inputs = {
'X': np.random.random((1, 3, 1, 2, 1, 4, 3, 10)).astype("float32")
}
self.attrs = {'dim': (0, 3)}
self.outputs = {'Out': self.inputs['X'].mean(axis=(0, 3))}
class Test1DReduce(TestMeanOp):
def setUp(self):
self.set_mlu()
self.op_type = "reduce_mean"
self.inputs = {'X': np.random.random(120).astype("float32")}
self.outputs = {'Out': self.inputs['X'].mean(axis=0)}
class Test2DReduce0(Test1DReduce):
def setUp(self):
self.set_mlu()
self.op_type = "reduce_mean"
self.attrs = {'dim': [0]}
self.inputs = {'X': np.random.random((20, 10)).astype("float32")}
self.outputs = {'Out': self.inputs['X'].mean(axis=0)}
class Test2DReduce1(Test1DReduce):
def setUp(self):
self.set_mlu()
self.op_type = "reduce_mean"
self.attrs = {'dim': [1]}
self.inputs = {'X': np.random.random((20, 10)).astype("float32")}
self.outputs = {
'Out': self.inputs['X'].mean(axis=tuple(self.attrs['dim']))
}
class Test3DReduce0(Test1DReduce):
def setUp(self):
self.set_mlu()
self.op_type = "reduce_mean"
self.attrs = {'dim': [1]}
self.inputs = {'X': np.random.random((5, 6, 7)).astype("float32")}
self.outputs = {
'Out': self.inputs['X'].mean(axis=tuple(self.attrs['dim']))
}
class Test3DReduce1(Test1DReduce):
def setUp(self):
self.set_mlu()
self.op_type = "reduce_mean"
self.attrs = {'dim': [2]}
self.inputs = {'X': np.random.random((5, 6, 7)).astype("float32")}
self.outputs = {
'Out': self.inputs['X'].mean(axis=tuple(self.attrs['dim']))
}
class Test3DReduce2(Test1DReduce):
def setUp(self):
self.set_mlu()
self.op_type = "reduce_mean"
self.attrs = {'dim': [-2]}
self.inputs = {'X': np.random.random((5, 6, 7)).astype("float32")}
self.outputs = {
'Out': self.inputs['X'].mean(axis=tuple(self.attrs['dim']))
}
class Test3DReduce3(Test1DReduce):
def setUp(self):
self.set_mlu()
self.op_type = "reduce_mean"
self.attrs = {'dim': [1, 2]}
self.inputs = {'X': np.random.random((5, 6, 7)).astype("float32")}
self.outputs = {
'Out': self.inputs['X'].mean(axis=tuple(self.attrs['dim']))
}
class TestKeepDimReduce(Test1DReduce):
def setUp(self):
self.set_mlu()
self.op_type = "reduce_mean"
self.inputs = {'X': np.random.random((5, 6, 10)).astype("float32")}
self.attrs = {'dim': [1], 'keep_dim': True}
self.outputs = {
'Out': self.inputs['X'].mean(
axis=tuple(self.attrs['dim']), keepdims=self.attrs['keep_dim'])
}
class TestKeepDim8DReduce(Test1DReduce):
def setUp(self):
self.set_mlu()
self.op_type = "reduce_mean"
self.inputs = {
'X': np.random.random((2, 5, 3, 2, 2, 3, 4, 2)).astype("float32")
}
self.attrs = {'dim': (3, 4, 5), 'keep_dim': True}
self.outputs = {
'Out': self.inputs['X'].mean(
axis=tuple(self.attrs['dim']), keepdims=self.attrs['keep_dim'])
}
class TestReduceAll(Test1DReduce):
def setUp(self):
self.set_mlu()
self.op_type = "reduce_mean"
self.inputs = {'X': np.random.random((5, 6, 2, 10)).astype("float32")}
self.attrs = {'reduce_all': True}
self.outputs = {'Out': self.inputs['X'].mean()}
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import numpy as np
import unittest
import sys
sys.path.append("..")
from op_test import OpTest
import paddle
import paddle.fluid as fluid
paddle.enable_static()
SEED = 2021
class TestRelu(OpTest):
def setUp(self):
self.set_mlu()
self.op_type = "relu"
self.place = paddle.MLUPlace(0)
self.init_dtype()
np.random.seed(SEED)
x = np.random.rand(3, 2).astype(self.dtype)
out = x
self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
self.attrs = {}
self.outputs = {'Out': out}
def set_mlu(self):
self.__class__.use_mlu = True
def init_dtype(self):
self.dtype = np.float32
def test_check_output(self):
self.check_output_with_place(self.place)
class TestReluFp16(OpTest):
def setUp(self):
self.set_mlu()
self.op_type = "relu"
self.place = paddle.MLUPlace(0)
self.init_dtype()
np.random.seed(SEED)
x = np.random.rand(3, 2).astype(self.dtype)
out = x
self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
self.attrs = {}
self.outputs = {'Out': out}
def set_mlu(self):
self.__class__.use_mlu = True
self.__class__.no_need_check_grad = True
def init_dtype(self):
self.dtype = np.float16
def test_check_output(self):
self.check_output_with_place(self.place, atol=1e-5)
class TestReluNeg(OpTest):
def setUp(self):
self.set_mlu()
self.op_type = "relu"
self.place = paddle.MLUPlace(0)
self.init_dtype()
np.random.seed(SEED)
x = np.array([0.1, -0.1, -1.0]).astype(self.dtype)
out = np.array([0.1, 0.0, 0.0]).astype(self.dtype)
self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
self.attrs = {}
self.outputs = {'Out': out}
def set_mlu(self):
self.__class__.use_mlu = True
def init_dtype(self):
self.dtype = np.float32
def test_check_output(self):
self.check_output_with_place(self.place)
class TestReluNet(unittest.TestCase):
def _test(self, run_mlu=True):
main_prog = paddle.static.Program()
startup_prog = paddle.static.Program()
main_prog.random_seed = SEED
startup_prog.random_seed = SEED
np.random.seed(SEED)
a_np = np.random.random(size=(32, 32)).astype('float32')
b_np = np.random.random(size=(32, 32)).astype('float32')
label_np = np.random.randint(2, size=(32, 1)).astype('int64')
with paddle.static.program_guard(main_prog, startup_prog):
a = paddle.static.data(name="a", shape=[32, 32], dtype='float32')
b = paddle.static.data(name="b", shape=[32, 32], dtype='float32')
label = paddle.static.data(
name="label", shape=[32, 1], dtype='int64')
sum = paddle.add(a, b)
z = paddle.nn.functional.relu(sum)
fc_1 = fluid.layers.fc(input=z, size=128)
prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax')
cost = fluid.layers.cross_entropy(input=prediction, label=label)
loss = fluid.layers.reduce_mean(cost)
sgd = fluid.optimizer.SGD(learning_rate=0.01)
sgd.minimize(loss)
if run_mlu:
place = paddle.MLUPlace(0)
else:
place = paddle.CPUPlace()
exe = paddle.static.Executor(place)
exe.run(startup_prog)
print("Start run on {}".format(place))
for epoch in range(100):
pred_res, loss_res = exe.run(
main_prog,
feed={"a": a_np,
"b": b_np,
"label": label_np},
fetch_list=[prediction, loss])
if epoch % 10 == 0:
print("Epoch {} | Prediction[0]: {}, Loss: {}".format(
epoch, pred_res[0], loss_res))
return pred_res, loss_res
def test_mlu(self):
cpu_pred, cpu_loss = self._test(False)
mlu_pred, mlu_loss = self._test(True)
self.assertTrue(np.allclose(mlu_pred, cpu_pred))
self.assertTrue(np.allclose(mlu_loss, cpu_loss))
if __name__ == '__main__':
unittest.main()
@@ -326,6 +326,9 @@ class OpTest(unittest.TestCase):
         def is_npu_op_test():
             return hasattr(cls, "use_npu") and cls.use_npu == True
 
+        def is_mlu_op_test():
+            return hasattr(cls, "use_mlu") and cls.use_mlu == True
+
         if not hasattr(cls, "op_type"):
             raise AssertionError(
                 "This test do not have op_type in class attrs, "
@@ -348,7 +351,8 @@ class OpTest(unittest.TestCase):
                     and not is_xpu_op_test() \
                     and not is_mkldnn_op_test() \
                     and not is_rocm_op_test() \
-                    and not is_npu_op_test():
+                    and not is_npu_op_test() \
+                    and not is_mlu_op_test():
                 raise AssertionError(
                     "This test of %s op needs check_grad with fp64 precision." %
                     cls.op_type)
@@ -1297,7 +1301,8 @@ class OpTest(unittest.TestCase):
         # No effect on original OpTest
         # Currently not support ParallelExecutor on XPUPlace.
         if not paddle.is_compiled_with_xpu(
-        ) and not paddle.is_compiled_with_npu():
+        ) and not paddle.is_compiled_with_npu(
+        ) and not paddle.is_compiled_with_mlu():
             self.check_inplace_output_with_place(
                 place, no_check_set=no_check_set, inplace_atol=inplace_atol)
@@ -1547,11 +1552,9 @@ class OpTest(unittest.TestCase):
                 delta=numeric_grad_delta,
                 in_place=in_place) for input_to_check in inputs_to_check
         ]
         analytic_grads = self._get_gradient(inputs_to_check, place,
                                             output_names, no_grad_set,
                                             user_defined_grad_outputs)
         # comparison of bf16 results will happen as fp32
         # loop over list of grads and convert bf16 to fp32
         fp32_analytic_grads = []
...