Unverified commit 599a201f, authored by jjyaoao, committed by GitHub

delete paddle/fluid/operators/elementwise/*_npu.* (#52675)

Parent 0f3bbe10
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <memory>
#include <string>
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/elementwise/elementwise_npu.h"
namespace paddle {
namespace operators {
template <typename T>
class ElementwiseAddNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto& dev_ctx =
ctx.template device_context<paddle::platform::NPUDeviceContext>();
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* y = ctx.Input<phi::DenseTensor>("Y");
auto* out = ctx.Output<phi::DenseTensor>("Out");
out->mutable_data<T>(ctx.GetPlace());
int axis = ctx.Attr<int>("axis");
bool direct_compute = false;
auto x_dims = x->dims();
auto y_dims = y->dims();
axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis);
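// When the rank of the smaller input plus `axis` equals the rank of the
// larger one, the inputs are assumed to align on the trailing dimensions and
// the CANN Add op broadcasts them directly; otherwise both inputs are first
// broadcast to a common shape via NpuElementWiseOpBroadcast.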
if (x_dims.size() == y_dims.size()) {
direct_compute = true;
} else if (x_dims.size() > y_dims.size()) {
direct_compute = x_dims.size() == (y_dims.size() + axis);
} else {
direct_compute = y_dims.size() == (x_dims.size() + axis);
}
if (direct_compute) {
const auto& runner = NpuOpRunner("Add", {*x, *y}, {*out}, {});
runner.Run(dev_ctx.stream());
} else {
phi::DenseTensor transformed_x, transformed_y;
NpuElementWiseOpBroadcast<T>(
dev_ctx, x, y, axis, &transformed_x, &transformed_y);
const auto& runner =
NpuOpRunner("Add", {transformed_x, transformed_y}, {*out}, {});
runner.Run(dev_ctx.stream());
}
}
};
template <typename T>
class ElementwiseAddGradNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto& dev_ctx =
ctx.template device_context<paddle::platform::NPUDeviceContext>();
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* y = ctx.Input<phi::DenseTensor>("Y");
auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<phi::DenseTensor>(framework::GradVarName("Y"));
int axis = ctx.Attr<int>("axis");
axis = (axis == -1 ? std::abs(x->dims().size() - y->dims().size()) : axis);
auto stream = dev_ctx.stream();
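// For addition, dx and dy are just dout; when an input was broadcast, its
// gradient is obtained by summing dout (ReduceSumD) over the dimensions that
// the broadcast introduced or expanded.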
if (dx) {
dx->mutable_data<T>(ctx.GetPlace());
if (dx->dims() != dout->dims()) {
std::vector<int> dst_dims_vec;
std::vector<int> reduce_axes;
auto src_dims = dx->dims();
auto dout_dims = dout->dims();
int src_axis = (src_dims.size() < dout_dims.size() ? axis : 0);
for (int ax = 0; ax < dout_dims.size(); ++ax) {
if ((ax < src_axis || ax >= src_axis + src_dims.size()) ||
(dout_dims[ax] > 1 && src_dims[ax - src_axis] == 1)) {
reduce_axes.push_back(ax);
} else {
dst_dims_vec.push_back(dout_dims[ax]);
}
}
if (!reduce_axes.empty()) {
phi::DenseTensor tmp;
tmp.ShareDataWith(*dx);
tmp.Resize(phi::make_ddim(dst_dims_vec));
const auto& runner =
NpuOpRunner("ReduceSumD",
{*dout},
{tmp},
{{"axes", reduce_axes}, {"keep_dims", false}});
runner.Run(stream);
}
} else {
framework::TensorCopy(*dout, ctx.GetPlace(), dev_ctx, dx);
}
}
if (dy) {
dy->mutable_data<T>(ctx.GetPlace());
if (dy->dims() != dout->dims()) {
std::vector<int> dst_dims_vec;
std::vector<int> reduce_axes;
auto src_dims = dy->dims();
auto dout_dims = dout->dims();
int src_axis = (src_dims.size() < dout_dims.size() ? axis : 0);
for (int ax = 0; ax < dout_dims.size(); ++ax) {
if ((ax < src_axis || ax >= src_axis + src_dims.size()) ||
(dout_dims[ax] > 1 && src_dims[ax - src_axis] == 1)) {
reduce_axes.push_back(ax);
} else {
dst_dims_vec.push_back(dout_dims[ax]);
}
}
if (!reduce_axes.empty()) {
phi::DenseTensor tmp;
tmp.ShareDataWith(*dy);
tmp.Resize(phi::make_ddim(dst_dims_vec));
const auto& runner =
NpuOpRunner("ReduceSumD",
{*dout},
{tmp},
{{"axes", reduce_axes}, {"keep_dims", false}});
runner.Run(stream);
}
} else {
framework::TensorCopy(*dout, ctx.GetPlace(), dev_ctx, dy);
}
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_NPU_KERNEL(elementwise_add,
ops::ElementwiseAddNPUKernel<float>,
#ifdef PADDLE_WITH_ASCEND_INT64
ops::ElementwiseAddNPUKernel<int64_t>,
#endif
ops::ElementwiseAddNPUKernel<plat::float16>);
REGISTER_OP_NPU_KERNEL(elementwise_add_grad,
ops::ElementwiseAddGradNPUKernel<float>,
ops::ElementwiseAddGradNPUKernel<plat::float16>);
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <memory>
#include <string>
#include "paddle/fluid/operators/elementwise/elementwise_div_op.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class ElementwiseDivNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* y = ctx.Input<phi::DenseTensor>("Y");
auto* out = ctx.Output<phi::DenseTensor>("Out");
auto place = ctx.GetPlace();
out->mutable_data<T>(place);
auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
const auto& runner = NpuOpRunner("Div", {*x, *y}, {*out}, {});
runner.Run(stream);
}
};
template <typename DeviceContext, typename T>
class ElementwiseDivGradNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* out = ctx.Input<phi::DenseTensor>("Out");
auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* y = ctx.Input<phi::DenseTensor>("Y");
auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<phi::DenseTensor>(framework::GradVarName("Y"));
auto place = ctx.GetPlace();
auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
if (dx) {
dx->mutable_data<T>(place);
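// The ops below compute dx = dout * (1 / y), with the factor forced to zero
// at positions where x == 0 (via the Equal / LogicalNot / Cast mask).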
phi::DenseTensor tensor_one(y->type());
tensor_one.mutable_data<float>({1}, place);
FillNpuTensorWithConstant<float>(&tensor_one, static_cast<float>(1.0));
// Use the `Div` CANN op to compute `1/y` instead of the `Power` CANN op,
// because `Power` causes precision overflow, i.e. `float_status` gets set
// to 1.
phi::DenseTensor y_div(y->type());
y_div.mutable_data<T>(y->dims(), place);
const auto& runner_one_div_y =
NpuOpRunner("Div", {tensor_one, *y}, {y_div}, {});
runner_one_div_y.Run(stream);
phi::DenseTensor tensor_zeros(x->type());
tensor_zeros.mutable_data<T>(x->dims(), place);
const auto& runner_tensor_zeros =
NpuOpRunner("ZerosLike", {*x}, {tensor_zeros}, {});
runner_tensor_zeros.Run(stream);
phi::DenseTensor x_zero(phi::DataType::BOOL);
x_zero.mutable_data<bool>(x->dims(), place);
const auto& runner_x_zero =
NpuOpRunner("Equal", {*x, tensor_zeros}, {x_zero}, {});
runner_x_zero.Run(stream);
phi::DenseTensor x_nozero(phi::DataType::BOOL);
x_nozero.mutable_data<bool>(x->dims(), place);
const auto& runner_x_nonzero =
NpuOpRunner("LogicalNot", {x_zero}, {x_nozero}, {});
runner_x_nonzero.Run(stream);
phi::DenseTensor x_nozero_f(x->type());
x_nozero_f.mutable_data<T>(x->dims(), place);
const auto& runner_x_nonzero_f =
NpuOpRunner("Cast",
{x_nozero},
{x_nozero_f},
{{"dst_type", static_cast<int32_t>(0)}});
runner_x_nonzero_f.Run(stream);
phi::DenseTensor x_grad_w(x->type());
x_grad_w.mutable_data<T>(x->dims(), place);
const auto& runner_x_grad_w =
NpuOpRunner("Mul", {x_nozero_f, y_div}, {x_grad_w}, {});
runner_x_grad_w.Run(stream);
const auto& runner_x_grad =
NpuOpRunner("Mul", {x_grad_w, *dout}, {*dx}, {});
runner_x_grad.Run(stream);
}
if (dy) {
dy->mutable_data<T>(place);
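// dy = -out * dout / y = -(x / y^2) * dout. When y was broadcast, the
// product -out * dout is first summed (ReduceSumD) back to y's shape and
// then divided by y.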
phi::DenseTensor neg_out(out->type());
neg_out.mutable_data<T>(out->dims(), place);
const auto& runner_neg_out = NpuOpRunner("Neg", {*out}, {neg_out}, {});
runner_neg_out.Run(stream);
phi::DenseTensor tmp_mul(out->type());
tmp_mul.mutable_data<T>(out->dims(), place);
const auto& runner_mul =
NpuOpRunner("Mul", {neg_out, *dout}, {tmp_mul}, {});
runner_mul.Run(stream);
if (dy->dims() != dout->dims()) {
phi::DenseTensor reduced_tmp_mul(y->type());
reduced_tmp_mul.mutable_data<T>(y->dims(), place);
std::vector<int64_t> axes;
int64_t diff = dout->dims().size() - dy->dims().size();
for (int64_t i = 0; i < dout->dims().size(); ++i) {
if (i < diff) {
axes.push_back(i);
continue;
}
if (dout->dims()[i] > dy->dims()[i - diff]) {
axes.push_back(i);
}
}
const auto& runner_reduce =
NpuOpRunner("ReduceSumD",
{tmp_mul},
{reduced_tmp_mul},
{{"axes", axes}, {"keep_dims", false}});
runner_reduce.Run(stream);
const auto& runner_y_grad =
NpuOpRunner("Div", {reduced_tmp_mul, *y}, {*dy}, {});
runner_y_grad.Run(stream);
} else {
const auto& runner_y_grad =
NpuOpRunner("Div", {tmp_mul, *y}, {*dy}, {});
runner_y_grad.Run(stream);
}
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_NPU_KERNEL(
elementwise_div,
ops::ElementwiseDivNPUKernel<paddle::platform::NPUDeviceContext, float>,
ops::ElementwiseDivNPUKernel<paddle::platform::NPUDeviceContext,
paddle::platform::float16>);
REGISTER_OP_NPU_KERNEL(
elementwise_div_grad,
ops::ElementwiseDivGradNPUKernel<paddle::platform::NPUDeviceContext, float>,
ops::ElementwiseDivGradNPUKernel<paddle::platform::NPUDeviceContext,
paddle::platform::float16>);
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <memory>
#include <string>
#include "paddle/fluid/operators/elementwise/elementwise_div_op.h"
namespace paddle {
namespace operators {
template <typename T>
class ElementwiseFloorDivNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* y = ctx.Input<phi::DenseTensor>("Y");
auto* out = ctx.Output<phi::DenseTensor>("Out");
out->mutable_data<T>(ctx.GetPlace());
auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
const auto& runner = NpuOpRunner("FloorDiv", {*x, *y}, {*out}, {});
runner.Run(stream);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_NPU_KERNEL(elementwise_floordiv,
ops::ElementwiseFloorDivNPUKernel<int>,
ops::ElementwiseFloorDivNPUKernel<int64_t>);
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/elementwise/elementwise_npu.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class ElementwiseMaxNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto& dev_ctx =
ctx.template device_context<paddle::platform::NPUDeviceContext>();
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* y = ctx.Input<phi::DenseTensor>("Y");
auto* out = ctx.Output<phi::DenseTensor>("Out");
out->mutable_data<T>(ctx.GetPlace());
int axis = ctx.Attr<int>("axis");
bool direct_compute = false;
auto x_dims = x->dims();
auto y_dims = y->dims();
axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis);
if (x_dims.size() >= y_dims.size()) {
direct_compute = y_dims == phi::slice_ddim(x_dims, axis, x_dims.size());
} else {
direct_compute = x_dims == phi::slice_ddim(y_dims, axis, y_dims.size());
}
auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
if (direct_compute) {
const auto& runner = NpuOpRunner("Maximum", {*x, *y}, {*out}, {});
runner.Run(stream);
} else {
phi::DenseTensor transformed_x, transformed_y;
NpuElementWiseOpBroadcast<T>(
dev_ctx, x, y, axis, &transformed_x, &transformed_y);
const auto& runner =
NpuOpRunner("Maximum", {transformed_x, transformed_y}, {*out}, {});
runner.Run(stream);
}
}
};
template <typename DeviceContext, typename T>
class ElementwiseMaxGradNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto& dev_ctx =
ctx.template device_context<paddle::platform::NPUDeviceContext>();
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* y = ctx.Input<phi::DenseTensor>("Y");
auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<phi::DenseTensor>(framework::GradVarName("Y"));
int axis = ctx.Attr<int>("axis");
// The Ascend elementwise_max_grad op only supports broadcast when axis is
// -1, and requires all inputs to have the same shape when axis is not -1.
// For convenience, first broadcast the original inputs x and y to
// transformed_x and transformed_y, then use tmp tensors to hold the op
// output, and finally reduce the tmp tensors to match the shape of the
// paddle outputs.
auto x_dims = x->dims();
auto y_dims = y->dims();
axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis);
phi::DenseTensor transformed_x, transformed_y;
NpuElementWiseOpBroadcast<T>(
dev_ctx, x, y, axis, &transformed_x, &transformed_y);
auto dout_dims = dout->dims();
auto stream = dev_ctx.stream();
framework::NPUAttributeMap attr_input = {{"grad_x", true},
{"grad_y", true}};
// Reshape info vector.
std::vector<int> reduce_axes;
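// MaximumGrad produces gradients at the broadcast (dout) shape; for each
// requested output, reduce_axes lists the dout dimensions introduced or
// expanded by broadcasting, which ReduceSumD then sums away.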
if (dx && dy) {
dx->mutable_data<T>(ctx.GetPlace());
dy->mutable_data<T>(ctx.GetPlace());
phi::DenseTensor tmp_dx;
tmp_dx.mutable_data<T>(dout_dims, ctx.GetPlace());
phi::DenseTensor tmp_dy;
tmp_dy.mutable_data<T>(dout_dims, ctx.GetPlace());
const auto& runner = NpuOpRunner("MaximumGrad",
{*dout, transformed_x, transformed_y},
{tmp_dx, tmp_dy},
attr_input);
runner.Run(stream);
if (x_dims != dout_dims) {
reduce_axes.clear();
int src_axis = (x_dims.size() < dout_dims.size() ? axis : 0);
for (int ax = 0; ax < dout_dims.size(); ++ax) {
if ((ax < src_axis || ax >= src_axis + x_dims.size()) ||
(dout_dims[ax] > 1 && x_dims[ax - src_axis] == 1)) {
reduce_axes.push_back(ax);
}
}
if (!reduce_axes.empty()) {
const auto& runner =
NpuOpRunner("ReduceSumD",
{tmp_dx},
{*dx},
{{"axes", reduce_axes}, {"keep_dims", false}});
runner.Run(stream);
}
} else {
framework::TensorCopy(tmp_dx, ctx.GetPlace(), dev_ctx, dx);
}
if (y_dims != dout_dims) {
reduce_axes.clear();
int src_axis = (y_dims.size() < dout_dims.size() ? axis : 0);
for (int ax = 0; ax < dout_dims.size(); ++ax) {
if ((ax < src_axis || ax >= src_axis + y_dims.size()) ||
(dout_dims[ax] > 1 && y_dims[ax - src_axis] == 1)) {
reduce_axes.push_back(ax);
}
}
if (!reduce_axes.empty()) {
const auto& runner =
NpuOpRunner("ReduceSumD",
{tmp_dy},
{*dy},
{{"axes", reduce_axes}, {"keep_dims", false}});
runner.Run(stream);
}
} else {
framework::TensorCopy(tmp_dy, ctx.GetPlace(), dev_ctx, dy);
}
} else if (dx) {
phi::DenseTensor zero_tensor(dout->type());
zero_tensor.mutable_data<T>(dout_dims, ctx.GetPlace());
FillNpuTensorWithConstant<T>(&zero_tensor, static_cast<T>(0));
dx->mutable_data<T>(ctx.GetPlace());
phi::DenseTensor tmp_dx;
tmp_dx.mutable_data<T>(dout_dims, ctx.GetPlace());
const auto& runner = NpuOpRunner("MaximumGrad",
{*dout, transformed_x, transformed_y},
{tmp_dx, zero_tensor},
attr_input);
runner.Run(stream);
if (x_dims != dout_dims) {
reduce_axes.clear();
int src_axis = (x_dims.size() < dout_dims.size() ? axis : 0);
for (int ax = 0; ax < dout_dims.size(); ++ax) {
if ((ax < src_axis || ax >= src_axis + x_dims.size()) ||
(dout_dims[ax] > 1 && x_dims[ax - src_axis] == 1)) {
reduce_axes.push_back(ax);
}
}
if (!reduce_axes.empty()) {
const auto& runner =
NpuOpRunner("ReduceSumD",
{tmp_dx},
{*dx},
{{"axes", reduce_axes}, {"keep_dims", false}});
runner.Run(stream);
}
} else {
framework::TensorCopy(tmp_dx, ctx.GetPlace(), dev_ctx, dx);
}
} else if (dy) {
phi::DenseTensor zero_tensor(dout->type());
zero_tensor.mutable_data<T>(dout_dims, ctx.GetPlace());
FillNpuTensorWithConstant<T>(&zero_tensor, static_cast<T>(0));
dy->mutable_data<T>(ctx.GetPlace());
phi::DenseTensor tmp_dy;
tmp_dy.mutable_data<T>(dout_dims, ctx.GetPlace());
const auto& runner = NpuOpRunner("MaximumGrad",
{*dout, transformed_x, transformed_y},
{zero_tensor, tmp_dy},
attr_input);
runner.Run(stream);
if (y_dims != dout_dims) {
reduce_axes.clear();
int src_axis = (y_dims.size() < dout_dims.size() ? axis : 0);
for (int ax = 0; ax < dout_dims.size(); ++ax) {
if ((ax < src_axis || ax >= src_axis + y_dims.size()) ||
(dout_dims[ax] > 1 && y_dims[ax - src_axis] == 1)) {
reduce_axes.push_back(ax);
}
}
if (!reduce_axes.empty()) {
const auto& runner =
NpuOpRunner("ReduceSumD",
{tmp_dy},
{*dy},
{{"axes", reduce_axes}, {"keep_dims", false}});
runner.Run(stream);
}
} else {
framework::TensorCopy(tmp_dy, ctx.GetPlace(), dev_ctx, dy);
}
} else {
PADDLE_THROW(platform::errors::Unavailable(
"Do not support all outputs to be empty."));
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_NPU_KERNEL(
elementwise_max,
ops::ElementwiseMaxNPUKernel<plat::NPUDeviceContext, plat::float16>,
ops::ElementwiseMaxNPUKernel<plat::NPUDeviceContext, float>,
ops::ElementwiseMaxNPUKernel<plat::NPUDeviceContext, double>,
ops::ElementwiseMaxNPUKernel<plat::NPUDeviceContext, int>,
ops::ElementwiseMaxNPUKernel<plat::NPUDeviceContext, int64_t>);
REGISTER_OP_NPU_KERNEL(
elementwise_max_grad,
ops::ElementwiseMaxGradNPUKernel<plat::NPUDeviceContext, plat::float16>,
ops::ElementwiseMaxGradNPUKernel<plat::NPUDeviceContext, float>,
ops::ElementwiseMaxGradNPUKernel<plat::NPUDeviceContext, double>,
ops::ElementwiseMaxGradNPUKernel<plat::NPUDeviceContext, int>);
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <memory>
#include <string>
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/elementwise/elementwise_npu.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class ElementwiseMinNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto& dev_ctx =
ctx.template device_context<paddle::platform::NPUDeviceContext>();
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* y = ctx.Input<phi::DenseTensor>("Y");
auto* out = ctx.Output<phi::DenseTensor>("Out");
auto place = ctx.GetPlace();
out->mutable_data<T>(place);
int axis = ctx.Attr<int>("axis");
bool direct_compute = false;
auto x_dims = x->dims();
auto y_dims = y->dims();
axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis);
if (x_dims.size() >= y_dims.size()) {
direct_compute = y_dims == phi::slice_ddim(x_dims, axis, x_dims.size());
} else {
direct_compute = x_dims == phi::slice_ddim(y_dims, axis, y_dims.size());
}
phi::DenseTensor transformed_x, transformed_y;
if (direct_compute) {
transformed_x.ShareDataWith(*x);
transformed_y.ShareDataWith(*y);
} else {
NpuElementWiseOpBroadcast<T>(
dev_ctx, x, y, axis, &transformed_x, &transformed_y);
}
const auto& runner =
NpuOpRunner("Minimum", {transformed_x, transformed_y}, {*out}, {});
auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
runner.Run(stream);
}
};
template <typename DeviceContext, typename T>
class ElementwiseMinGradNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto& dev_ctx =
ctx.template device_context<paddle::platform::NPUDeviceContext>();
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* y = ctx.Input<phi::DenseTensor>("Y");
auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<phi::DenseTensor>(framework::GradVarName("Y"));
int axis = ctx.Attr<int>("axis");
axis = (axis == -1 ? std::abs(x->dims().size() - y->dims().size()) : axis);
auto stream = dev_ctx.stream();
if (dx && dy) {
// dx
dx->mutable_data<T>(ctx.GetPlace());
phi::DenseTensor tmp_x;
tmp_x.ShareDataWith(*dx);
if (dx->dims() != dout->dims()) {
std::vector<int> dst_dims_vec_x;
std::vector<int> reduce_axes_x;
auto src_dims_x = dx->dims();
auto dout_dims = dout->dims();
int src_axis_x = (src_dims_x.size() < dout_dims.size() ? axis : 0);
for (int ax = 0; ax < dout_dims.size(); ++ax) {
if ((ax < src_axis_x || ax >= src_axis_x + src_dims_x.size()) ||
(dout_dims[ax] > 1 && src_dims_x[ax - src_axis_x] == 1)) {
reduce_axes_x.push_back(ax);
} else {
dst_dims_vec_x.push_back(dout_dims[ax]);
}
}
if (!reduce_axes_x.empty()) {
tmp_x.Resize(phi::make_ddim(dst_dims_vec_x));
}
}
// dy
dy->mutable_data<T>(ctx.GetPlace());
phi::DenseTensor tmp_y;
tmp_y.ShareDataWith(*dy);
if (dy->dims() != dout->dims()) {
std::vector<int> dst_dims_vec_y;
std::vector<int> reduce_axes_y;
auto src_dims_y = dy->dims();
auto dout_dims = dout->dims();
int src_axis_y = (src_dims_y.size() < dout_dims.size() ? axis : 0);
for (int ax = 0; ax < dout_dims.size(); ++ax) {
if ((ax < src_axis_y || ax >= src_axis_y + src_dims_y.size()) ||
(dout_dims[ax] > 1 && src_dims_y[ax - src_axis_y] == 1)) {
reduce_axes_y.push_back(ax);
} else {
dst_dims_vec_y.push_back(dout_dims[ax]);
}
}
if (!reduce_axes_y.empty()) {
tmp_y.Resize(phi::make_ddim(dst_dims_vec_y));
}
}
const auto& runner = NpuOpRunner("MinimumGrad",
{*dout, *x, *y},
{tmp_x, tmp_y},
{{"grad_x", true}, {"grad_y", true}});
runner.Run(stream);
} else if (dx) {
phi::DenseTensor zero_tensor(dout->type());
zero_tensor.mutable_data<T>(y->dims(), ctx.GetPlace());
FillNpuTensorWithConstant<T>(&zero_tensor, static_cast<T>(0));
// dx
dx->mutable_data<T>(ctx.GetPlace());
phi::DenseTensor tmp_x;
tmp_x.ShareDataWith(*dx);
if (dx->dims() != dout->dims()) {
std::vector<int> dst_dims_vec_x;
std::vector<int> reduce_axes_x;
auto src_dims_x = dx->dims();
auto dout_dims = dout->dims();
int src_axis_x = (src_dims_x.size() < dout_dims.size() ? axis : 0);
for (int ax = 0; ax < dout_dims.size(); ++ax) {
if ((ax < src_axis_x || ax >= src_axis_x + src_dims_x.size()) ||
(dout_dims[ax] > 1 && src_dims_x[ax - src_axis_x] == 1)) {
reduce_axes_x.push_back(ax);
} else {
dst_dims_vec_x.push_back(dout_dims[ax]);
}
}
if (!reduce_axes_x.empty()) {
tmp_x.Resize(phi::make_ddim(dst_dims_vec_x));
}
}
const auto& runner = NpuOpRunner("MinimumGrad",
{*dout, *x, *y},
{tmp_x, zero_tensor},
{{"grad_x", true}, {"grad_y", true}});
runner.Run(stream);
} else if (dy) {
phi::DenseTensor zero_tensor(dout->type());
zero_tensor.mutable_data<T>(x->dims(), ctx.GetPlace());
FillNpuTensorWithConstant<T>(&zero_tensor, static_cast<T>(0));
// dy
dy->mutable_data<T>(ctx.GetPlace());
phi::DenseTensor tmp_y;
tmp_y.ShareDataWith(*dy);
if (dy->dims() != dout->dims()) {
std::vector<int> dst_dims_vec_y;
std::vector<int> reduce_axes_y;
auto src_dims_y = dy->dims();
auto dout_dims = dout->dims();
int src_axis_y = (src_dims_y.size() < dout_dims.size() ? axis : 0);
for (int ax = 0; ax < dout_dims.size(); ++ax) {
if ((ax < src_axis_y || ax >= src_axis_y + src_dims_y.size()) ||
(dout_dims[ax] > 1 && src_dims_y[ax - src_axis_y] == 1)) {
reduce_axes_y.push_back(ax);
} else {
dst_dims_vec_y.push_back(dout_dims[ax]);
}
}
if (!reduce_axes_y.empty()) {
tmp_y.Resize(phi::make_ddim(dst_dims_vec_y));
}
}
const auto& runner = NpuOpRunner("MinimumGrad",
{*dout, *x, *y},
{zero_tensor, tmp_y},
{{"grad_x", true}, {"grad_y", true}});
runner.Run(stream);
} else {
PADDLE_THROW(platform::errors::Unavailable(
"At least one output gradient (dx or dy) must be provided."));
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_NPU_KERNEL(
elementwise_min,
ops::ElementwiseMinNPUKernel<paddle::platform::NPUDeviceContext, float>,
ops::ElementwiseMinNPUKernel<paddle::platform::NPUDeviceContext,
paddle::platform::float16>);
REGISTER_OP_NPU_KERNEL(
elementwise_min_grad,
ops::ElementwiseMinGradNPUKernel<paddle::platform::NPUDeviceContext, float>,
ops::ElementwiseMinGradNPUKernel<paddle::platform::NPUDeviceContext,
paddle::platform::float16>);
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/elementwise/elementwise_npu.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class ElementwiseModNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto& dev_ctx =
ctx.template device_context<paddle::platform::NPUDeviceContext>();
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* y = ctx.Input<phi::DenseTensor>("Y");
auto* out = ctx.Output<phi::DenseTensor>("Out");
int axis = ctx.Attr<int>("axis");
auto x_dims = x->dims();
auto y_dims = y->dims();
axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis);
bool direct_compute = false;
if (x_dims.size() >= y_dims.size()) {
direct_compute = y_dims == phi::slice_ddim(x_dims, axis, x_dims.size());
} else {
direct_compute = x_dims == phi::slice_ddim(y_dims, axis, y_dims.size());
}
phi::DenseTensor transformed_x, transformed_y;
if (direct_compute) {
transformed_x.ShareDataWith(*x);
transformed_y.ShareDataWith(*y);
} else {
NpuElementWiseOpBroadcast<T>(
dev_ctx, x, y, axis, &transformed_x, &transformed_y);
}
out->mutable_data<T>(ctx.GetPlace());
const auto& runner =
NpuOpRunner("FloorMod", {transformed_x, transformed_y}, {*out}, {});
auto stream = dev_ctx.stream();
runner.Run(stream);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_NPU_KERNEL(
elementwise_mod,
ops::ElementwiseModNPUKernel<paddle::platform::NPUDeviceContext, float>,
ops::ElementwiseModNPUKernel<paddle::platform::NPUDeviceContext, double>,
ops::ElementwiseModNPUKernel<paddle::platform::NPUDeviceContext, int>,
ops::ElementwiseModNPUKernel<paddle::platform::NPUDeviceContext, int64_t>,
ops::ElementwiseModNPUKernel<paddle::platform::NPUDeviceContext,
paddle::platform::float16>);
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/elementwise/elementwise_mul_op.h"
#include "paddle/fluid/operators/elementwise/elementwise_npu.h"
namespace paddle {
namespace operators {
using NPUDeviceContext = platform::NPUDeviceContext;
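// ReduceDims sums `in`, whose shape is the broadcast shape `brd_ddims`, back
// to the original shape `ddims` by reducing every axis that the broadcast
// added or expanded, and writes the result to `out`.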
template <typename T>
static void ReduceDims(const framework::ExecutionContext& ctx,
const aclrtStream& stream,
const int axis,
const framework::DDim& ddims,
const framework::DDim& brd_ddims,
const phi::DenseTensor& in,
phi::DenseTensor* out) {
std::vector<int64_t> axes;
int64_t brd_size = brd_ddims.size();
int64_t org_size = ddims.size();
// int64_t diff = brd_dims.size() - dims.size();
for (int64_t i = 0; i < brd_size; ++i) {
if (i < axis || i >= org_size + axis) {
axes.push_back(i);
continue;
}
if (brd_ddims[i] > ddims[i - axis]) {
axes.push_back(i);
}
}
// LOG(INFO) << "axes = " << phi::make_ddim(axes).to_str();
out->mutable_data<T>(ctx.GetPlace());
const auto& runner = NpuOpRunner(
"ReduceSumD", {in}, {*out}, {{"axes", axes}, {"keep_dims", false}});
runner.Run(stream);
}
template <typename T>
class ElementwiseMulNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto& dev_ctx = ctx.template device_context<NPUDeviceContext>();
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* y = ctx.Input<phi::DenseTensor>("Y");
auto* out = ctx.Output<phi::DenseTensor>("Out");
out->mutable_data<T>(ctx.GetPlace());
int axis = ctx.Attr<int>("axis");
bool direct_compute = false;
auto x_dims = x->dims();
auto y_dims = y->dims();
axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis);
if (x_dims.size() >= y_dims.size()) {
direct_compute = x_dims.size() == (y_dims.size() + axis);
} else {
direct_compute = y_dims.size() == (x_dims.size() + axis);
}
auto stream = ctx.template device_context<NPUDeviceContext>().stream();
if (direct_compute) {
const auto& runner = NpuOpRunner("Mul", {*x, *y}, {*out}, {});
runner.Run(stream);
} else {
phi::DenseTensor trans_x, trans_y;
NpuElementWiseOpBroadcast<T>(dev_ctx, x, y, axis, &trans_x, &trans_y);
const auto& runner = NpuOpRunner("Mul", {trans_x, trans_y}, {*out}, {});
runner.Run(stream);
}
}
};
template <typename T>
class ElementwiseMulGradNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto& dev_ctx = ctx.template device_context<NPUDeviceContext>();
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* y = ctx.Input<phi::DenseTensor>("Y");
auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<phi::DenseTensor>(framework::GradVarName("Y"));
int axis = ctx.Attr<int>("axis");
axis = (axis == -1 ? std::abs(x->dims().size() - y->dims().size()) : axis);
auto stream = ctx.template device_context<NPUDeviceContext>().stream();
phi::DenseTensor trans_x, trans_y;
NpuElementWiseOpBroadcast<T>(dev_ctx, x, y, axis, &trans_x, &trans_y);
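// dx = dout * y and dy = x * dout; when an input was broadcast, its gradient
// is computed at the broadcast shape and then summed back to the original
// shape with ReduceDims.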
if (dx) {
if (dx->dims() == dout->dims()) {
dx->mutable_data<T>(ctx.GetPlace());
const auto& runner_dx = NpuOpRunner("Mul", {*dout, trans_y}, {*dx}, {});
runner_dx.Run(stream);
} else {
phi::DenseTensor dx_temp(x->type());
dx_temp.Resize(trans_x.dims());
dx_temp.mutable_data<T>(ctx.GetPlace());
const auto& runner_dx =
NpuOpRunner("Mul", {*dout, trans_y}, {dx_temp}, {});
runner_dx.Run(stream);
ReduceDims<T>(
ctx, stream, axis, dx->dims(), trans_x.dims(), dx_temp, dx);
}
}
if (dy) {
if (dy->dims() == dout->dims()) {
dy->mutable_data<T>(ctx.GetPlace());
const auto& runner_dy = NpuOpRunner("Mul", {trans_x, *dout}, {*dy}, {});
runner_dy.Run(stream);
} else {
phi::DenseTensor dy_temp(y->type());
dy_temp.Resize(trans_y.dims());
dy_temp.mutable_data<T>(ctx.GetPlace());
const auto& runner_dy =
NpuOpRunner("Mul", {trans_x, *dout}, {dy_temp}, {});
runner_dy.Run(stream);
ReduceDims<T>(
ctx, stream, axis, dy->dims(), trans_y.dims(), dy_temp, dy);
}
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_NPU_KERNEL(elementwise_mul,
ops::ElementwiseMulNPUKernel<float>,
ops::ElementwiseMulNPUKernel<paddle::platform::float16>,
#ifdef PADDLE_WITH_ASCEND_INT64
ops::ElementwiseMulNPUKernel<int64_t>,
#endif
ops::ElementwiseMulNPUKernel<int>);
REGISTER_OP_NPU_KERNEL(
elementwise_mul_grad,
ops::ElementwiseMulGradNPUKernel<float>,
ops::ElementwiseMulGradNPUKernel<paddle::platform::float16>,
#ifdef PADDLE_WITH_ASCEND_INT64
ops::ElementwiseMulGradNPUKernel<int64_t>,
#endif
ops::ElementwiseMulGradNPUKernel<int>);
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/elementwise/elementwise_op.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
namespace paddle {
namespace operators {
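// NpuBroadcast expands `src` to `dst_dims` in three stages, e.g. for
// src = [3, 1], dst_dims = [2, 3, 4, 5], axis = 1:
// 1. tile the size-1 axes of src that must grow:  [3, 1] -> [3, 4]
// 2. expand the leading axes before `axis`:       [3, 4] -> [2, 3, 4]
// 3. tile the trailing axes after src's last dim: [2, 3, 4] -> [2, 3, 4, 5]
// The broadcast result is copied into `transformed_src`.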
template <typename T>
void NpuBroadcast(const platform::NPUDeviceContext& dev_ctx,
const phi::DenseTensor* src,
int axis,
const framework::DDim& dst_dims,
phi::DenseTensor* transformed_src) {
auto stream = dev_ctx.stream();
// 1. expand the axes whose dim is 1
auto src_dims = src->dims();
phi::DenseTensor tmp_src;
tmp_src.ShareDataWith(*src);
tmp_src.Resize(src_dims);
for (int i = 0; i < src_dims.size(); ++i) {
if (src_dims[i] == 1 && dst_dims[i + axis] > 1) {
phi::DenseTensor tmp_tensor;
auto tmp_tensor_dims = tmp_src.dims();
tmp_tensor_dims[i] = dst_dims[i + axis];
tmp_tensor.mutable_data<T>(tmp_tensor_dims, dev_ctx.GetPlace());
const auto& runner =
NpuOpRunner("TileWithAxis",
{tmp_src},
{tmp_tensor},
{{"axis", static_cast<int64_t>(i)},
{"tiles", static_cast<int64_t>(dst_dims[i + axis])}});
runner.Run(stream);
tmp_src.ShareDataWith(tmp_tensor);
tmp_src.Resize(tmp_tensor_dims);
}
}
// 2. expand the leading axes (before `axis`)
auto prev = phi::product(phi::slice_ddim(dst_dims, 0, axis));
if (prev > 1) {
phi::DenseTensor tmp_tensor;
auto tmp_tensor_dims = phi::slice_ddim(dst_dims, 0, axis + src_dims.size());
tmp_tensor.mutable_data<T>(tmp_tensor_dims, dev_ctx.GetPlace());
const auto& runner =
NpuOpRunner("ExpandD",
{tmp_src},
{tmp_tensor},
{{"shape", phi::vectorize<int64_t>(tmp_tensor_dims)}});
runner.Run(stream);
tmp_src.ShareDataWith(tmp_tensor);
tmp_src.Resize(tmp_tensor_dims);
} else {
tmp_src.Resize(phi::slice_ddim(dst_dims, 0, axis + src_dims.size()));
}
// 3. expand the trailing axes (after the last axis of src)
auto post = phi::product(
phi::slice_ddim(dst_dims, axis + src_dims.size(), dst_dims.size()));
if (post > 1) {
auto src_dims_vec = phi::vectorize<int>(tmp_src.dims());
src_dims_vec.push_back(1);
tmp_src.Resize(phi::make_ddim(src_dims_vec));
phi::DenseTensor tmp_tensor;
tmp_tensor.mutable_data<T>(dst_dims, dev_ctx.GetPlace());
const auto& runner =
NpuOpRunner("TileWithAxis",
{tmp_src},
{tmp_tensor},
{{"axis", static_cast<int64_t>(axis + src_dims.size())},
{"tiles", static_cast<int64_t>(post)}});
runner.Run(stream);
tmp_src.ShareDataWith(tmp_tensor);
}
tmp_src.Resize(dst_dims);
framework::TensorCopy(tmp_src, dev_ctx.GetPlace(), transformed_src);
}
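// NpuElementWiseOpBroadcast computes the common broadcast shape of x and y
// under `axis` and broadcasts both inputs to it, e.g. x = [2, 3, 4, 5],
// y = [3, 4], axis = 1 gives transformed_x = transformed_y = [2, 3, 4, 5].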
template <typename T>
void NpuElementWiseOpBroadcast(const platform::NPUDeviceContext& dev_ctx,
const phi::DenseTensor* x,
const phi::DenseTensor* y,
int axis,
phi::DenseTensor* transformed_x,
phi::DenseTensor* transformed_y) {
auto x_dims = x->dims();
auto y_dims = y->dims();
bool is_xsize_larger = true;
int max_dim = x_dims.size();
std::vector<int> dst_dims_vec = phi::vectorize<int>(x_dims);
if (x_dims.size() < y_dims.size()) {
is_xsize_larger = false;
max_dim = y_dims.size();
dst_dims_vec = phi::vectorize<int>(y_dims);
}
axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis);
int x_axis = is_xsize_larger ? 0 : axis;
int y_axis = is_xsize_larger ? axis : 0;
PADDLE_ENFORCE_GE(
axis,
0,
platform::errors::InvalidArgument(
"Axis should be great than or equal to 0, but received axis is %d.",
axis));
PADDLE_ENFORCE_LE(
axis,
max_dim,
platform::errors::InvalidArgument(
"Axis should be less than or equal to %d, but received axis is %d.",
max_dim,
axis));
for (int i = 0; i < x_dims.size(); ++i) {
dst_dims_vec[i + x_axis] =
std::max(dst_dims_vec[i + x_axis], static_cast<int>(x_dims[i]));
}
for (int i = 0; i < y_dims.size(); ++i) {
dst_dims_vec[i + y_axis] =
std::max(dst_dims_vec[i + y_axis], static_cast<int>(y_dims[i]));
}
auto dst_dims = phi::make_ddim(dst_dims_vec);
NpuBroadcast<T>(dev_ctx, x, x_axis, dst_dims, transformed_x);
NpuBroadcast<T>(dev_ctx, y, y_axis, dst_dims, transformed_y);
}
} // namespace operators
} // namespace paddle
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef _WIN32
#include <unistd.h>
#endif
#include <string>
#include <thread> // NOLINT
#include <vector>
#include "gtest/gtest.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/string/printf.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace f = paddle::framework;
namespace p = paddle::platform;
USE_OP_ITSELF(elementwise_add);
USE_OP_DEVICE_KERNEL(elementwise_add, NPU);
USE_OP_ITSELF(elementwise_sub);
USE_OP_DEVICE_KERNEL(elementwise_sub, NPU);
template <typename T>
void Compare(f::Scope *scope,
const p::DeviceContext &ctx,
std::string op_type) {
// init
auto x = scope->Var("X");
auto tensor_x = x->GetMutable<phi::DenseTensor>();
auto y = scope->Var("Y");
auto tensor_y = y->GetMutable<phi::DenseTensor>();
std::vector<T> init_x;
for (int64_t i = 0; i < 10 * 10; ++i) {
init_x.push_back(static_cast<T>(1.0));
}
std::vector<T> init_y;
for (int64_t i = 0; i < 10 * 10; ++i) {
init_y.push_back(static_cast<T>(2.0));
}
paddle::framework::TensorFromVector(init_x, ctx, tensor_x);
tensor_x->Resize({10, 10});
paddle::framework::TensorFromVector(init_y, ctx, tensor_y);
tensor_y->Resize({10, 10});
auto place = ctx.GetPlace();
auto out = scope->Var("Out");
auto tensor_out = out->GetMutable<phi::DenseTensor>();
// run
f::AttributeMap attrs;
auto op = f::OpRegistry::CreateOp(
op_type, {{"X", {"X"}}, {"Y", {"Y"}}}, {{"Out", {"Out"}}}, attrs);
op->Run(*scope, place);
std::vector<T> out_vec;
paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec);
ctx.Wait();
float expected = 0.0;
if (op_type == "elementwise_add") {
expected = 3.0;
} else if (op_type == "elementwise_sub") {
expected = -1.0;
}
EXPECT_EQ(out_vec.size(), init_x.size());
for (uint32_t i = 0; i < out_vec.size(); i++) {
EXPECT_EQ(out_vec[i], static_cast<T>(expected));
}
}
template <typename T>
void CompareGrad(f::Scope *scope,
const p::DeviceContext &ctx,
std::string op_type) {
// init
auto dout = scope->Var("DOut");
auto tensor_dout = dout->GetMutable<phi::DenseTensor>();
tensor_dout->Resize({2, 3, 5});
auto x = scope->Var("X");
auto tensor_x = x->GetMutable<phi::DenseTensor>();
tensor_x->Resize({2, 3, 5});
auto y = scope->Var("Y");
auto tensor_y = y->GetMutable<phi::DenseTensor>();
tensor_y->Resize({1, 5});
auto dx = scope->Var("DX");
auto tensor_dx = dx->GetMutable<phi::DenseTensor>();
auto dy = scope->Var("DY");
auto tensor_dy = dy->GetMutable<phi::DenseTensor>();
std::vector<T> init_dout;
for (int64_t i = 0; i < tensor_dout->numel(); ++i) {
init_dout.push_back(static_cast<T>(1.0));
}
paddle::framework::TensorFromVector(init_dout, ctx, tensor_dout);
tensor_dout->Resize({2, 3, 5});
// run
f::AttributeMap attrs;
auto op = f::OpRegistry::CreateOp(
op_type,
{{"Out@GRAD", {"DOut"}}, {"X", {"X"}}, {"Y", {"Y"}}},
{{"X@GRAD", {"DX"}}, {"Y@GRAD", {"DY"}}},
attrs);
auto place = ctx.GetPlace();
op->Run(*scope, place);
std::vector<T> dx_vec;
paddle::framework::TensorToVector(*tensor_dx, ctx, &dx_vec);
std::vector<T> dy_vec;
paddle::framework::TensorToVector(*tensor_dy, ctx, &dy_vec);
ctx.Wait();
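// With dout filled with 1.0 and y broadcast from [1, 5] to [2, 3, 5], dx
// matches dout elementwise while dy sums dout over the 2 * 3 broadcast
// positions, so |dy| == 6 (negated for elementwise_sub_grad).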
float expected_x = 0, expected_y = 0;
if (op_type == "elementwise_add_grad") {
expected_x = 1.0;
expected_y = 6.0;
} else if (op_type == "elementwise_sub_grad") {
expected_x = 1.0;
expected_y = -6.0;
}
for (uint32_t i = 0; i < dx_vec.size(); i++) {
EXPECT_EQ(dx_vec[i], static_cast<T>(expected_x));
}
for (uint32_t i = 0; i < dy_vec.size(); i++) {
EXPECT_EQ(dy_vec[i], static_cast<T>(expected_y));
}
}
TEST(elementwise_add, NPU_fp32) {
f::Scope scope;
auto *ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0));
Compare<float>(&scope, *ctx, "elementwise_add");
}
TEST(elementwise_sub, NPU_fp32) {
f::Scope scope;
auto *ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0));
Compare<float>(&scope, *ctx, "elementwise_sub");
}
TEST(elementwise_sub, NPU_fp16) {
f::Scope scope;
auto *ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0));
Compare<p::float16>(&scope, *ctx, "elementwise_sub");
}
TEST(elementwise_sub_grad, NPU) {
f::Scope scope;
auto *ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0));
CompareGrad<float>(&scope, *ctx, "elementwise_sub_grad");
}
TEST(elementwise_add_grad, NPU) {
f::Scope scope;
auto *ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0));
CompareGrad<float>(&scope, *ctx, "elementwise_add_grad");
}
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <memory>
#include <string>
#include "paddle/fluid/operators/elementwise/elementwise_npu.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class ElementwisePowNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto& dev_ctx =
ctx.template device_context<paddle::platform::NPUDeviceContext>();
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* y = ctx.Input<phi::DenseTensor>("Y");
auto* out = ctx.Output<phi::DenseTensor>("Out");
auto place = ctx.GetPlace();
int axis = ctx.Attr<int>("axis");
out->mutable_data<T>(place);
bool direct_compute = false;
auto x_dims = x->dims();
auto y_dims = y->dims();
axis =
(axis < 0 ? std::abs(x_dims.size() - y_dims.size()) + axis + 1 : axis);
if (x_dims.size() >= y_dims.size()) {
direct_compute = y_dims == phi::slice_ddim(x_dims, axis, x_dims.size());
} else {
direct_compute = x_dims == phi::slice_ddim(y_dims, axis, y_dims.size());
}
auto stream = dev_ctx.stream();
if (direct_compute) {
const auto& runner = NpuOpRunner("Pow", {*x, *y}, {*out}, {});
runner.Run(stream);
} else {
phi::DenseTensor transformed_x, transformed_y;
NpuElementWiseOpBroadcast<T>(
dev_ctx, x, y, axis, &transformed_x, &transformed_y);
const auto& runner =
NpuOpRunner("Pow", {transformed_x, transformed_y}, {*out}, {});
runner.Run(stream);
}
}
};
template <typename DeviceContext, typename T>
class ElementwisePowGradNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto& dev_ctx =
ctx.template device_context<paddle::platform::NPUDeviceContext>();
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* y = ctx.Input<phi::DenseTensor>("Y");
auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<phi::DenseTensor>(framework::GradVarName("Y"));
int axis = ctx.Attr<int>("axis");
auto place = ctx.GetPlace();
auto x_dims = x->dims();
auto y_dims = y->dims();
axis =
(axis < 0 ? std::abs(x_dims.size() - y_dims.size()) + axis + 1 : axis);
phi::DenseTensor transformed_x, transformed_y;
NpuElementWiseOpBroadcast<T>(
dev_ctx, x, y, axis, &transformed_x, &transformed_y);
auto dout_dims = dout->dims();
auto stream = dev_ctx.stream();
// Reshape info vector.
std::vector<int> reduce_axes;
if (dx) {
phi::DenseTensor zero_tensor(dout->type());
zero_tensor.mutable_data<T>(dout_dims, place);
FillNpuTensorWithConstant<T>(&zero_tensor, static_cast<T>(0));
dx->mutable_data<T>(place);
phi::DenseTensor tmp_dx;
tmp_dx.mutable_data<T>(dout_dims, place);
// dx = dout * y * pow(x, y - 1);
phi::DenseTensor PowGrad_dx_temp1(dout->type());
PowGrad_dx_temp1.mutable_data<T>(dout->dims(), place);
const auto& runner_PowGrad_dx_temp1 =
NpuOpRunner("Mul", {*dout, transformed_y}, {PowGrad_dx_temp1}, {});
runner_PowGrad_dx_temp1.Run(stream);
phi::DenseTensor one_dx(transformed_y.type());
one_dx.mutable_data<T>(transformed_y.dims(), place);
const auto& runner_one_dx =
NpuOpRunner("OnesLike", {transformed_y}, {one_dx}, {});
runner_one_dx.Run(stream);
phi::DenseTensor sub_dx(transformed_y.type());
sub_dx.mutable_data<T>(transformed_y.dims(), place);
const auto& runner_sub_dx =
NpuOpRunner("Sub", {transformed_y, one_dx}, {sub_dx}, {});
runner_sub_dx.Run(stream);
phi::DenseTensor PowGrad_dx_temp2(transformed_x.type());
PowGrad_dx_temp2.mutable_data<T>(transformed_x.dims(), place);
const auto& runner_PowGrad_dx_temp2 =
NpuOpRunner("Pow", {transformed_x, sub_dx}, {PowGrad_dx_temp2}, {});
runner_PowGrad_dx_temp2.Run(stream);
const auto& runner_dx = NpuOpRunner(
"Mul", {PowGrad_dx_temp1, PowGrad_dx_temp2}, {tmp_dx}, {});
runner_dx.Run(stream);
if (x_dims != dout_dims) {
reduce_axes.clear();
int src_axis = (x_dims.size() < dout_dims.size() ? axis : 0);
for (int ax = 0; ax < dout_dims.size(); ++ax) {
if ((ax < src_axis || ax >= src_axis + x_dims.size()) ||
(dout_dims[ax] > 1 && x_dims[ax - src_axis] == 1)) {
reduce_axes.push_back(ax);
}
}
if (!reduce_axes.empty()) {
const auto& runner =
NpuOpRunner("ReduceSumD",
{tmp_dx},
{*dx},
{{"axes", reduce_axes}, {"keep_dims", false}});
runner.Run(stream);
}
} else {
framework::TensorCopy(tmp_dx, place, dev_ctx, dx);
}
}
if (dy) {
phi::DenseTensor zero_tensor(dout->type());
zero_tensor.mutable_data<T>(dout_dims, place);
FillNpuTensorWithConstant<T>(&zero_tensor, static_cast<T>(0));
dy->mutable_data<T>(place);
phi::DenseTensor tmp_dy;
tmp_dy.mutable_data<T>(dout_dims, place);
// dy = dout * log(x) * pow(x, y)
phi::DenseTensor PowGrad_dy_temp1(transformed_x.type());
PowGrad_dy_temp1.mutable_data<T>(transformed_x.dims(), place);
const auto& runner_PowGrad_dy_temp1 = NpuOpRunner(
"Pow", {transformed_x, transformed_y}, {PowGrad_dy_temp1}, {});
runner_PowGrad_dy_temp1.Run(stream);
phi::DenseTensor one_dy(transformed_x.type());
one_dy.mutable_data<T>(transformed_x.dims(), place);
const auto& runner_one_dy =
NpuOpRunner("OnesLike", {transformed_x}, {one_dy}, {});
runner_one_dy.Run(stream);
phi::DenseTensor sub_dy(transformed_x.type());
sub_dy.mutable_data<T>(transformed_x.dims(), place);
const auto& runner_sub_dy =
NpuOpRunner("Sub", {transformed_x, one_dy}, {sub_dy}, {});
runner_sub_dy.Run(stream);
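// log(x) is obtained as Log1p(x - 1), using sub_dy = x - 1 computed above.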
phi::DenseTensor log_dy(transformed_x.type());
log_dy.mutable_data<T>(transformed_x.dims(), place);
const auto& runner_log_dy = NpuOpRunner("Log1p", {sub_dy}, {log_dy}, {});
runner_log_dy.Run(stream);
phi::DenseTensor PowGrad_dy_temp2(transformed_x.type());
PowGrad_dy_temp2.mutable_data<T>(transformed_x.dims(), place);
const auto& runner_PowGrad_dy_temp2 = NpuOpRunner(
"Mul", {log_dy, PowGrad_dy_temp1}, {PowGrad_dy_temp2}, {});
runner_PowGrad_dy_temp2.Run(stream);
const auto& runner_dy =
NpuOpRunner("Mul", {*dout, PowGrad_dy_temp2}, {tmp_dy}, {});
runner_dy.Run(stream);
if (y_dims != dout_dims) {
reduce_axes.clear();
int src_axis = (y_dims.size() < dout_dims.size() ? axis : 0);
for (int ax = 0; ax < dout_dims.size(); ++ax) {
if ((ax < src_axis || ax >= src_axis + y_dims.size()) ||
(dout_dims[ax] > 1 && y_dims[ax - src_axis] == 1)) {
reduce_axes.push_back(ax);
}
}
if (!reduce_axes.empty()) {
const auto& runner =
NpuOpRunner("ReduceSumD",
{tmp_dy},
{*dy},
{{"axes", reduce_axes}, {"keep_dims", false}});
runner.Run(stream);
}
} else {
framework::TensorCopy(tmp_dy, place, dev_ctx, dy);
}
}
if (!dx && !dy) {
PADDLE_THROW(platform::errors::Unavailable(
"Not support all outputs to be empty."));
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_NPU_KERNEL(
elementwise_pow,
ops::ElementwisePowNPUKernel<plat::NPUDeviceContext, plat::float16>,
ops::ElementwisePowNPUKernel<plat::NPUDeviceContext, float>,
ops::ElementwisePowNPUKernel<plat::NPUDeviceContext, double>,
ops::ElementwisePowNPUKernel<plat::NPUDeviceContext, int>);
REGISTER_OP_NPU_KERNEL(
elementwise_pow_grad,
ops::ElementwisePowGradNPUKernel<plat::NPUDeviceContext, plat::float16>,
ops::ElementwisePowGradNPUKernel<plat::NPUDeviceContext, float>,
ops::ElementwisePowGradNPUKernel<plat::NPUDeviceContext, double>,
ops::ElementwisePowGradNPUKernel<plat::NPUDeviceContext, int>);
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <memory>
#include <string>
#include "paddle/fluid/operators/elementwise/elementwise_op.h"
namespace paddle {
namespace operators {
template <typename T>
class ElementwiseSubNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<phi::DenseTensor>("X");
auto* y = ctx.Input<phi::DenseTensor>("Y");
auto* out = ctx.Output<phi::DenseTensor>("Out");
out->mutable_data<T>(ctx.GetPlace());
const auto& runner = NpuOpRunner("Sub", {*x, *y}, {*out}, {});
auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
runner.Run(stream);
}
};
template <typename T>
class ElementwiseSubGradNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<phi::DenseTensor>(framework::GradVarName("Y"));
auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
// NOTE(zhiqiu): The Ascend Sub op appears to follow broadcast semantics
// with a default axis of -1, so sub_grad should do the reduce if needed.
// For example, the shape of each variable in elementwise_sub:
// x, dx: [2, 3, 5]
// y, dy: [1, 5]
// out, dout: [2, 3, 5]
// Then, out = x - y => dx = dout, dy = -dout
// And, the shape of dy can be computed by two stages reduce,
// 1. [2, 3, 5] => [3, 5], ReduceSumD on axis = 0, keep_dims = false.
// 2. [3, 5] => [1, 5], ReduceSumD on axis = 0, keep_dims = true.
if (dx) {
dx->mutable_data<T>(ctx.GetPlace());
// For dx
// stage 1
auto reduce_ndim = dout->dims().size() - dx->dims().size();
std::vector<int> axes;
for (auto i = 0; i < reduce_ndim; ++i) {
axes.push_back(i);
}
phi::DenseTensor* tmp_dout = const_cast<phi::DenseTensor*>(dout);
phi::DenseTensor reduced_dout(dx->type());
if (axes.size() != 0) {
std::vector<int64_t> reduced_dout_dims;
for (auto i = reduce_ndim; i < dout->dims().size(); ++i) {
reduced_dout_dims.push_back(dout->dims()[i]);
}
reduced_dout.Resize(phi::make_ddim(reduced_dout_dims));
reduced_dout.mutable_data<T>(ctx.GetPlace());
const auto& runner =
NpuOpRunner("ReduceSumD",
{*dout},
{reduced_dout},
{{"axes", axes}, {"keep_dims", false}});
runner.Run(stream);
tmp_dout = &reduced_dout;
}
// stage 2
axes.clear();
for (auto i = 0; i < dx->dims().size(); ++i) {
if (dx->dims()[i] == 1) {
axes.push_back(i);
}
}
if (axes.size() != 0) {
const auto& runner = NpuOpRunner("ReduceSumD",
{*tmp_dout},
{*dx},
{{"axes", axes}, {"keep_dims", true}});
runner.Run(stream);
} else {
framework::TensorCopy(
*tmp_dout,
ctx.GetPlace(),
ctx.template device_context<platform::DeviceContext>(),
dx);
}
}
if (dy) {
dy->mutable_data<T>(ctx.GetPlace());
// For dy
// stage 1
auto reduce_ndim = dout->dims().size() - dy->dims().size();
std::vector<int> axes;
for (auto i = 0; i < reduce_ndim; ++i) {
axes.push_back(i);
}
phi::DenseTensor* tmp_dout = const_cast<phi::DenseTensor*>(dout);
phi::DenseTensor reduced_dy(dy->type());
phi::DenseTensor reduced_dout(dy->type());
if (axes.size() != 0) {
std::vector<int64_t> reduced_dout_dims;
for (auto i = reduce_ndim; i < dout->dims().size(); ++i) {
reduced_dout_dims.push_back(dout->dims()[i]);
}
reduced_dout.Resize(phi::make_ddim(reduced_dout_dims));
reduced_dout.mutable_data<T>(ctx.GetPlace());
const auto& runner =
NpuOpRunner("ReduceSumD",
{*dout},
{reduced_dout},
{{"axes", axes}, {"keep_dims", false}});
runner.Run(stream);
tmp_dout = &reduced_dout;
}
// stage 2
axes.clear();
phi::DenseTensor* tmp_dy = tmp_dout;
for (auto i = 0; i < dy->dims().size(); ++i) {
if (dy->dims()[i] == 1) {
axes.push_back(i);
}
}
if (axes.size() != 0) {
reduced_dy.Resize(dy->dims());
reduced_dy.mutable_data<T>(ctx.GetPlace());
const auto& runner = NpuOpRunner("ReduceSumD",
{*tmp_dout},
{reduced_dy},
{{"axes", axes}, {"keep_dims", true}});
runner.Run(stream);
tmp_dy = &reduced_dy;
}
// stage 3, negative
const auto& runner = NpuOpRunner("Neg", {*tmp_dy}, {*dy}, {});
runner.Run(stream);
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_NPU_KERNEL(elementwise_sub,
ops::ElementwiseSubNPUKernel<int>,
#ifdef PADDLE_WITH_ASCEND_INT64
ops::ElementwiseSubNPUKernel<int64_t>,
#endif
ops::ElementwiseSubNPUKernel<float>,
ops::ElementwiseSubNPUKernel<plat::float16>);
REGISTER_OP_NPU_KERNEL(elementwise_sub_grad,
ops::ElementwiseSubGradNPUKernel<int>,
#ifdef PADDLE_WITH_ASCEND_INT64
ops::ElementwiseSubGradNPUKernel<int64_t>,
#endif
ops::ElementwiseSubGradNPUKernel<float>,
ops::ElementwiseSubGradNPUKernel<plat::float16>);