Unverified commit 09778f46, authored by Qi Li, committed by GitHub

[NPU] fix elementwise_mul to support broadcast, test=develop (#36258)

* [NPU] fix elementwise_mul to support broadcast, test=develop

* remove debug files, test=develop

* add axis support, test=develop
Parent commit: b3f6eedb
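For context, the change means an elementwise multiply whose inputs have different ranks can now run on the NPU kernel with broadcasting. The snippet below is not part of the commit; it is a minimal sketch assuming a Paddle build with Ascend NPU support (paddle.is_compiled_with_npu() is True) and an attached NPU device; shapes and variable names are illustrative.

import numpy as np
import paddle

paddle.enable_static()

main_prog = paddle.static.Program()
startup_prog = paddle.static.Program()
with paddle.static.program_guard(main_prog, startup_prog):
    # X and Y have different ranks; Y is broadcast over X's trailing axis.
    x = paddle.static.data(name="x", shape=[2, 100, 3], dtype="float32")
    y = paddle.static.data(name="y", shape=[3], dtype="float32")
    out = paddle.multiply(x, y)

place = paddle.NPUPlace(0)  # assumes an NPU-enabled build and device
exe = paddle.static.Executor(place)
exe.run(startup_prog)
result, = exe.run(main_prog,
                  feed={
                      "x": np.random.rand(2, 100, 3).astype("float32"),
                      "y": np.random.rand(3).astype("float32")
                  },
                  fetch_list=[out])
print(result.shape)  # (2, 100, 3)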
The first changed file is the NPU elementwise_mul kernel (C++). The previous kernel, guarded by PADDLE_WITH_ASCEND_CL and templated on DeviceContext, ran a single "Mul" on the raw inputs and had no axis or broadcast handling in either the forward or the gradient kernel; this commit replaces it with the broadcast-aware implementation below.

@@ -12,67 +12,127 @@
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/elementwise/elementwise_mul_op.h"
#include "paddle/fluid/operators/elementwise/elementwise_npu.h"
#include "paddle/fluid/operators/npu_op_runner.h"

namespace paddle {
namespace operators {

using Tensor = framework::Tensor;
using NPUDeviceContext = platform::NPUDeviceContext;

template <typename T>
static void ReduceDims(const framework::ExecutionContext& ctx,
                       const aclrtStream& stream, const int axis,
                       const framework::DDim& ddims,
                       const framework::DDim& brd_ddims, const Tensor& in,
                       Tensor* out) {
  std::vector<int64_t> axes;
  int64_t brd_size = brd_ddims.size();
  int64_t org_size = ddims.size();
  // Reduce every axis that was added or expanded by broadcasting.
  for (int64_t i = 0; i < brd_size; ++i) {
    if (i < axis || i >= org_size + axis) {
      axes.push_back(i);
      continue;
    }
    if (brd_ddims[i] > ddims[i - axis]) {
      axes.push_back(i);
    }
  }
  out->mutable_data<T>(ctx.GetPlace());
  const auto& runner = NpuOpRunner("ReduceSumD", {in}, {*out},
                                   {{"axes", axes}, {"keep_dims", false}});
  runner.Run(stream);
}

template <typename T>
class ElementwiseMulNPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto& dev_ctx = ctx.template device_context<NPUDeviceContext>();
    auto* x = ctx.Input<Tensor>("X");
    auto* y = ctx.Input<Tensor>("Y");
    auto* out = ctx.Output<Tensor>("Out");
    out->mutable_data<T>(ctx.GetPlace());

    int axis = ctx.Attr<int>("axis");

    bool direct_compute = false;
    auto x_dims = x->dims();
    auto y_dims = y->dims();
    axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis);
    if (x_dims.size() >= y_dims.size()) {
      direct_compute = x_dims.size() == (y_dims.size() + axis);
    } else {
      direct_compute = y_dims.size() == (x_dims.size() + axis);
    }

    auto stream = ctx.template device_context<NPUDeviceContext>().stream();

    if (direct_compute) {
      const auto& runner = NpuOpRunner("Mul", {*x, *y}, {*out}, {});
      runner.Run(stream);
    } else {
      Tensor trans_x, trans_y;
      NpuElementWiseOpBroadcast<T>(dev_ctx, x, y, axis, &trans_x, &trans_y);
      const auto& runner = NpuOpRunner("Mul", {trans_x, trans_y}, {*out}, {});
      runner.Run(stream);
    }
  }
};

template <typename T>
class ElementwiseMulGradNPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto& dev_ctx = ctx.template device_context<NPUDeviceContext>();
    auto* x = ctx.Input<Tensor>("X");
    auto* y = ctx.Input<Tensor>("Y");
    auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
    auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
    auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
    int axis = ctx.Attr<int>("axis");

    axis = (axis == -1 ? std::abs(x->dims().size() - y->dims().size()) : axis);

    auto stream = ctx.template device_context<NPUDeviceContext>().stream();

    Tensor trans_x, trans_y;
    NpuElementWiseOpBroadcast<T>(dev_ctx, x, y, axis, &trans_x, &trans_y);

    if (dx) {
      if (dx->dims() == dout->dims()) {
        dx->mutable_data<T>(ctx.GetPlace());
        const auto& runner_dx = NpuOpRunner("Mul", {*dout, trans_y}, {*dx}, {});
        runner_dx.Run(stream);
      } else {
        Tensor dx_temp(x->type());
        dx_temp.Resize(trans_x.dims());
        dx_temp.mutable_data<T>(ctx.GetPlace());
        const auto& runner_dx =
            NpuOpRunner("Mul", {*dout, trans_y}, {dx_temp}, {});
        runner_dx.Run(stream);
        ReduceDims<T>(ctx, stream, axis, dx->dims(), trans_x.dims(), dx_temp,
                      dx);
      }
    }
    if (dy) {
      if (dy->dims() == dout->dims()) {
        dy->mutable_data<T>(ctx.GetPlace());
        const auto& runner_dy = NpuOpRunner("Mul", {trans_x, *dout}, {*dy}, {});
        runner_dy.Run(stream);
      } else {
        Tensor dy_temp(y->type());
        dy_temp.Resize(trans_y.dims());
        dy_temp.mutable_data<T>(ctx.GetPlace());
        const auto& runner_dy =
            NpuOpRunner("Mul", {trans_x, *dout}, {dy_temp}, {});
        runner_dy.Run(stream);
        ReduceDims<T>(ctx, stream, axis, dy->dims(), trans_y.dims(), dy_temp,
                      dy);
      }
    }
  }
};

@@ -82,15 +142,9 @@ class ElementwiseMulGradNPUKernel : public framework::OpKernel<T> {

namespace ops = paddle::operators;

REGISTER_OP_NPU_KERNEL(elementwise_mul, ops::ElementwiseMulNPUKernel<float>,
                       ops::ElementwiseMulNPUKernel<paddle::platform::float16>);

REGISTER_OP_NPU_KERNEL(
    elementwise_mul_grad, ops::ElementwiseMulGradNPUKernel<float>,
    ops::ElementwiseMulGradNPUKernel<paddle::platform::float16>);
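A note on the gradient path above: when an input was broadcast to the output shape, its gradient has to be summed back over the broadcast axes, which is what ReduceDims selects before running ReduceSumD. The following is a hedged numpy sketch, not part of the commit (reduce_axes is an illustrative name), of the same axis-selection rule and of why the reduced result matches the original input shape.

import numpy as np


def reduce_axes(axis, dims, brd_dims):
    """Mirror of the ReduceDims axis selection: reduce every axis of the
    broadcast shape that lies outside [axis, axis + len(dims)) or whose
    original extent was expanded by broadcasting."""
    axes = []
    for i in range(len(brd_dims)):
        if i < axis or i >= len(dims) + axis:
            axes.append(i)
        elif brd_dims[i] > dims[i - axis]:
            axes.append(i)
    return tuple(axes)


# Shapes follow TestElementwiseMulOp_broadcast_1: X is (2, 100, 3) and
# Y is (100,) with axis=1, so Y is broadcast to X's shape.
x = np.random.rand(2, 100, 3).astype(np.float32)
y = np.random.rand(100).astype(np.float32)
dout = np.ones_like(x)  # upstream gradient, same shape as Out

axes = reduce_axes(axis=1, dims=y.shape, brd_dims=x.shape)  # -> (0, 2)
dy = (dout * x).sum(axis=axes)  # sum the broadcast axes back out
assert dy.shape == y.shape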
The second changed file is the NPU unit test for elementwise_mul (Python). The previous TestElementwiseMul, TestElementwiseMulFp16, and TestElementwiseMulNet (a small CPU-vs-NPU training comparison) are replaced by a broadcast-oriented suite with gradient checks.

@@ -18,147 +18,203 @@
import numpy as np
import unittest
import sys
sys.path.append("..")
from op_test import OpTest, skip_check_grad_ci
import paddle
import paddle.fluid as fluid

paddle.enable_static()


class ElementwiseMulOp(OpTest):
    def set_npu(self):
        self.__class__.use_npu = True
        self.place = paddle.NPUPlace(0)

    def setUp(self):
        self.set_npu()
        self.op_type = "elementwise_mul"
        self.dtype = np.float32
        self.axis = -1
        self.init_dtype()
        self.init_input_output()
        self.init_axis()

        self.inputs = {
            'X': OpTest.np_dtype_to_fluid_dtype(self.x),
            'Y': OpTest.np_dtype_to_fluid_dtype(self.y)
        }
        self.outputs = {'Out': self.out}
        self.attrs = {'axis': self.axis}

    def test_check_output(self):
        self.check_output_with_place(self.place)

    def test_check_grad_normal(self):
        self.check_grad_with_place(self.place, ['X', 'Y'], 'Out')

    def test_check_grad_ingore_x(self):
        self.check_grad_with_place(
            self.place, ['Y'], 'Out', no_grad_set=set("X"))

    def test_check_grad_ingore_y(self):
        self.check_grad_with_place(
            self.place, ['X'], 'Out', no_grad_set=set('Y'))

    def init_input_output(self):
        self.x = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype)
        self.y = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype)
        self.out = np.multiply(self.x, self.y)

    def init_dtype(self):
        pass

    def init_axis(self):
        pass


@skip_check_grad_ci(
    reason="[skip shape check] Use y_shape(1) to test broadcast.")
class TestElementwiseMulOp_scalar(ElementwiseMulOp):
    def setUp(self):
        self.set_npu()
        self.op_type = "elementwise_mul"
        self.inputs = {
            'X': np.random.rand(10, 3, 4).astype(np.float32),
            'Y': np.random.rand(1).astype(np.float32)
        }
        self.outputs = {'Out': self.inputs['X'] * self.inputs['Y']}


class TestElementwiseMulOp_Vector(ElementwiseMulOp):
    def setUp(self):
        self.set_npu()
        self.op_type = "elementwise_mul"
        self.inputs = {
            'X': np.random.random((100, )).astype("float32"),
            'Y': np.random.random((100, )).astype("float32")
        }
        self.outputs = {'Out': np.multiply(self.inputs['X'], self.inputs['Y'])}


class TestElementwiseMulOp_broadcast_0(ElementwiseMulOp):
    def init_input_output(self):
        self.x = np.random.rand(100, 2, 3).astype(self.dtype)
        self.y = np.random.rand(100).astype(self.dtype)
        self.out = self.x * self.y.reshape(100, 1, 1)

    def init_axis(self):
        self.axis = 0


class TestElementwiseMulOp_broadcast_1(ElementwiseMulOp):
    def setUp(self):
        self.set_npu()
        self.op_type = "elementwise_mul"
        self.inputs = {
            'X': np.random.rand(2, 100, 3).astype(np.float32),
            'Y': np.random.rand(100).astype(np.float32)
        }
        self.attrs = {'axis': 1}
        self.outputs = {
            'Out': self.inputs['X'] * self.inputs['Y'].reshape(1, 100, 1)
        }


class TestElementwiseMulOp_broadcast_2(ElementwiseMulOp):
    def setUp(self):
        self.set_npu()
        self.op_type = "elementwise_mul"
        self.inputs = {
            'X': np.random.rand(2, 3, 100).astype(np.float32),
            'Y': np.random.rand(100).astype(np.float32)
        }
        self.outputs = {
            'Out': self.inputs['X'] * self.inputs['Y'].reshape(1, 1, 100)
        }


class TestElementwiseMulOp_broadcast_3(ElementwiseMulOp):
    def setUp(self):
        self.set_npu()
        self.op_type = "elementwise_mul"
        self.inputs = {
            'X': np.random.rand(2, 10, 12, 3).astype(np.float32),
            'Y': np.random.rand(10, 12).astype(np.float32)
        }
        self.attrs = {'axis': 1}
        self.outputs = {
            'Out': self.inputs['X'] * self.inputs['Y'].reshape(1, 10, 12, 1)
        }


class TestElementwiseMulOp_broadcast_4(ElementwiseMulOp):
    def setUp(self):
        self.set_npu()
        self.op_type = "elementwise_mul"
        self.inputs = {
            'X': np.random.rand(10, 2, 11).astype(np.float32),
            'Y': np.random.rand(10, 1, 11).astype(np.float32)
        }
        self.outputs = {'Out': self.inputs['X'] * self.inputs['Y']}


class TestElementwiseMulOp_broadcast_5(ElementwiseMulOp):
    def setUp(self):
        self.set_npu()
        self.op_type = "elementwise_mul"
        self.inputs = {
            'X': np.random.rand(10, 4, 2, 3).astype(np.float32),
            'Y': np.random.rand(10, 4, 1, 3).astype(np.float32)
        }
        self.outputs = {'Out': self.inputs['X'] * self.inputs['Y']}


@unittest.skipIf(not paddle.is_compiled_with_npu(),
                 "paddle is not compiled with NPU")
class TestElementwiseMulOpFp16(ElementwiseMulOp):
    def init_dtype(self):
        self.dtype = np.float16


class TestElementwiseMulOp_commonuse_1(ElementwiseMulOp):
    def setUp(self):
        self.set_npu()
        self.op_type = "elementwise_mul"
        self.inputs = {
            'X': np.random.rand(2, 3, 100).astype(np.float32),
            'Y': np.random.rand(1, 1, 100).astype(np.float32)
        }
        self.outputs = {'Out': self.inputs['X'] * self.inputs['Y']}


class TestElementwiseMulOp_commonuse_2(ElementwiseMulOp):
    def setUp(self):
        self.set_npu()
        self.op_type = "elementwise_mul"
        self.inputs = {
            'X': np.random.rand(30, 3, 1, 5).astype(np.float32),
            'Y': np.random.rand(30, 1, 4, 1).astype(np.float32)
        }
        self.outputs = {'Out': self.inputs['X'] * self.inputs['Y']}


class TestElementwiseMulOp_xsize_lessthan_ysize(ElementwiseMulOp):
    def setUp(self):
        self.set_npu()
        self.op_type = "elementwise_mul"
        self.inputs = {
            'X': np.random.rand(10, 10).astype(np.float32),
            'Y': np.random.rand(2, 2, 10, 10).astype(np.float32)
        }
        self.attrs = {'axis': 2}
        self.outputs = {
            'Out': self.inputs['X'].reshape(1, 1, 10, 10) * self.inputs['Y']
        }


if __name__ == '__main__':
......
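Finally, on the axis attribute exercised by the tests above: axis is the index in the higher-rank input at which the lower-rank input's dimensions start to align, which is equivalent to viewing the lower-rank input with singleton dimensions on both sides. A short numpy sketch, not part of the commit, with shapes taken from the tests above:

import numpy as np

# axis=1 with a higher-rank X (TestElementwiseMulOp_broadcast_3): Y's dims
# align with X's dims 1..2, i.e. Y is viewed as (1, 10, 12, 1).
x = np.random.rand(2, 10, 12, 3).astype(np.float32)
y = np.random.rand(10, 12).astype(np.float32)
out = x * y.reshape(1, 10, 12, 1)

# axis=2 with a higher-rank Y (TestElementwiseMulOp_xsize_lessthan_ysize):
# X aligns with Y's dims 2..3, i.e. X is viewed as (1, 1, 10, 10).
x2 = np.random.rand(10, 10).astype(np.float32)
y2 = np.random.rand(2, 2, 10, 10).astype(np.float32)
out2 = x2.reshape(1, 1, 10, 10) * y2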