diff --git a/paddle/fluid/operators/matmul_v2_op_npu.cc b/paddle/fluid/operators/matmul_v2_op_npu.cc index b23b408e9c59a74fb9b426be0ccd4a54465a9082..6d7e8f3478c848f5b13ec440ce3ff52cc71e6218 100644 --- a/paddle/fluid/operators/matmul_v2_op_npu.cc +++ b/paddle/fluid/operators/matmul_v2_op_npu.cc @@ -21,166 +21,387 @@ limitations under the License. */ namespace paddle { namespace operators { -template +using Tensor = framework::Tensor; +using NPUDeviceContext = platform::NPUDeviceContext; + +template +static void MatMul2D(const framework::ExecutionContext& ctx, + const aclrtStream& stream, const Tensor& X, + const Tensor& Y, Tensor* Out, const bool trans_x, + const bool trans_y) { + Out->mutable_data(ctx.GetPlace()); + const auto& runner = + NpuOpRunner("MatMul", {X, Y}, {*Out}, + {{"transpose_x1", trans_x}, {"transpose_x2", trans_y}}); + runner.Run(stream); +} + +template +static void MatMulND(const framework::ExecutionContext& ctx, + const aclrtStream& stream, const Tensor& X, + const Tensor& Y, Tensor* Out, const bool trans_x, + const bool trans_y) { + Out->mutable_data(ctx.GetPlace()); + const auto& runner = NpuOpRunner("BatchMatMul", {X, Y}, {*Out}, + {{"adj_x1", trans_x}, {"adj_x2", trans_y}}); + runner.Run(stream); +} + +template +static void ReduceDims(const framework::ExecutionContext& ctx, + const aclrtStream& stream, + const std::vector& dims, + const std::vector& brd_dims, const Tensor& in, + Tensor* out) { + std::vector axes; + int64_t size = brd_dims.size(); + int64_t diff = brd_dims.size() - dims.size(); + for (int64_t i = 0; i < size; ++i) { + if (i < diff) { + axes.push_back(i); + continue; + } + if (brd_dims[i] > dims[i - diff]) { + axes.push_back(i); + } + } + out->mutable_data(ctx.GetPlace()); + const auto& runner = NpuOpRunner("ReduceSumD", {in}, {*out}, + {{"axes", axes}, {"keep_dims", false}}); + runner.Run(stream); +} + +template class MatMulV2NPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* out = ctx.Output("Out"); - bool transpose_x = ctx.Attr("trans_x"); - bool transpose_y = ctx.Attr("trans_y"); - - if (x->dims().size() == 2) { - out->mutable_data(ctx.GetPlace()); - - const auto& runner = NpuOpRunner( - "MatMul", {*x, *y}, {*out}, - {{"transpose_x1", transpose_x}, {"transpose_x2", transpose_y}}); - - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); + auto* X = ctx.Input("X"); + auto* Y = ctx.Input("Y"); + auto* Out = ctx.Output("Out"); + const bool trans_x = ctx.Attr("trans_x"); + const bool trans_y = ctx.Attr("trans_y"); + + std::vector x_dims = framework::vectorize(X->dims()); + std::vector y_dims = framework::vectorize(Y->dims()); + std::vector out_dims = framework::vectorize(Out->dims()); + int x_ndim = x_dims.size(); + int y_ndim = y_dims.size(); + int out_ndim = out_dims.size(); - } else if (x->dims().size() > 2) { - out->mutable_data(ctx.GetPlace()); + auto stream = ctx.template device_context().stream(); - const auto& runner = - NpuOpRunner("BatchMatMul", {*x, *y}, {*out}, - {{"adj_x1", transpose_x}, {"adj_x2", transpose_y}}); + // Case 1: [K] x [K] = [1] + if (x_ndim == 1 && y_ndim == 1) { + PADDLE_ENFORCE_EQ( + X->numel(), Y->numel(), + platform::errors::InvalidArgument( + "X's numbers must be equal to Y's numbers," + "when X/Y's dims =1. But received X has [%d] elements," + "received Y has [%d] elements", + X->numel(), Y->numel())); + Out->Resize({1}); + Out->mutable_data(ctx.GetPlace()); - auto stream = - ctx.template device_context() - .stream(); + const auto& runner = NpuOpRunner("Dot", {*X, *Y}, {*Out}); runner.Run(stream); + return; + } + + // Resize dim 1 to 2 + Tensor x_temp, y_temp; + x_temp.ShareDataWith(*X); + y_temp.ShareDataWith(*Y); + if (x_ndim == 1) { + x_dims.insert(x_dims.begin(), 1); + out_dims.insert(out_dims.end() - 1, 1); + x_temp.Resize(framework::make_ddim(x_dims)); + x_ndim = 2; + out_ndim += 1; + } + if (y_ndim == 1) { + y_dims.push_back(1); + out_dims.push_back(1); + y_temp.Resize(framework::make_ddim(y_dims)); + y_ndim = 2; + out_ndim += 1; + } + + const int K = trans_x ? x_dims[x_ndim - 2] : x_dims[x_ndim - 1]; + if (trans_y) { + PADDLE_ENFORCE_EQ(y_dims[y_ndim - 1], K, + platform::errors::InvalidArgument( + "Input(Y) has error dim." + "Y'dims[%d] must be equal to %d" + "But received Y'dims[%d] is %d", + y_ndim - 1, K, y_ndim - 1, y_dims[y_ndim - 1])); + } else { + PADDLE_ENFORCE_EQ(y_dims[y_ndim - 2], K, + platform::errors::InvalidArgument( + "Input(Y) has error dim." + "Y'dims[%d] must be equal to %d" + "But received Y'dims[%d] is %d", + y_ndim - 2, K, y_ndim - 2, y_dims[y_ndim - 2])); } + + // Case 2: [M, K] x [K, N] = [M, N] + if (x_ndim == 2 && y_ndim == 2) { + MatMul2D(ctx, stream, x_temp, y_temp, Out, trans_x, trans_y); + return; + } + + // Case 3: [B, M, K] x [K, N] = [B, M, N], when trans_x = false + // Equal: [B * M, K] x [K, N] = [B * M, N] => [B, M, N] + if (trans_x == false && y_ndim == 2) { + std::vector vec_dim = {x_temp.numel() / K, K}; + x_temp.Resize(framework::make_ddim(vec_dim)); + MatMul2D(ctx, stream, x_temp, y_temp, Out, trans_x, trans_y); + return; + } + + // Case 4: [B, M, K] x [B, K, N] = [B, M, N] + std::vector x_broadcast_dims(out_ndim, 1); + std::vector y_broadcast_dims(out_ndim, 1); + std::copy(out_dims.begin(), out_dims.end() - 2, x_broadcast_dims.begin()); + std::copy(out_dims.begin(), out_dims.end() - 2, y_broadcast_dims.begin()); + std::copy(x_dims.end() - 2, x_dims.end(), x_broadcast_dims.end() - 2); + std::copy(y_dims.end() - 2, y_dims.end(), y_broadcast_dims.end() - 2); + + Tensor x_temp_brd(X->type()); + if (x_dims == x_broadcast_dims) { + x_temp_brd.ShareDataWith(*X); + x_temp_brd.Resize(framework::make_ddim(x_broadcast_dims)); + } else { + x_temp_brd.Resize(framework::make_ddim(x_broadcast_dims)); + x_temp_brd.mutable_data(ctx.GetPlace()); + NpuOpRunner runner_brd; + runner_brd.SetType("BroadcastTo") + .AddInput(x_temp) + .AddInput(std::move(x_broadcast_dims)) + .AddOutput(x_temp_brd) + .Run(stream); + } + + Tensor y_temp_brd(Y->type()); + if (y_dims == y_broadcast_dims) { + y_temp_brd.ShareDataWith(*Y); + y_temp_brd.Resize(framework::make_ddim(y_broadcast_dims)); + } else { + y_temp_brd.Resize(framework::make_ddim(y_broadcast_dims)); + y_temp_brd.mutable_data(ctx.GetPlace()); + NpuOpRunner runner_brd; + runner_brd.SetType("BroadcastTo") + .AddInput(y_temp) + .AddInput(std::move(y_broadcast_dims)) + .AddOutput(y_temp_brd) + .Run(stream); + } + MatMulND(ctx, stream, x_temp_brd, y_temp_brd, Out, trans_x, trans_y); } }; -template +template class MatMulV2GradNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - bool transpose_y = ctx.Attr("trans_y"); - auto stream = - ctx.template device_context() - .stream(); - - if (x->dims().size() == 2) { - if (transpose_y) { - if (dx) { - dx->mutable_data(ctx.GetPlace()); - const auto& runner_dx = - NpuOpRunner("MatMul", {*dout, *y}, {*dx}, - {{"transpose_x1", false}, {"transpose_x2", false}}); - - runner_dx.Run(stream); - } - if (dy) { - dy->mutable_data(ctx.GetPlace()); - const auto& runner_dy = - NpuOpRunner("MatMul", {*dout, *x}, {*dy}, - {{"transpose_x1", true}, {"transpose_x2", false}}); + auto* X = ctx.Input("X"); + auto* Y = ctx.Input("Y"); + auto* dOut = ctx.Input(framework::GradVarName("Out")); + auto* dX = ctx.Output(framework::GradVarName("X")); + auto* dY = ctx.Output(framework::GradVarName("Y")); + const bool trans_x = ctx.Attr("trans_x"); + const bool trans_y = ctx.Attr("trans_y"); - runner_dy.Run(stream); - } + std::vector x_dims = framework::vectorize(X->dims()); + std::vector y_dims = framework::vectorize(Y->dims()); + std::vector out_dims = framework::vectorize(dOut->dims()); + int x_ndim = x_dims.size(); + int y_ndim = y_dims.size(); + int out_ndim = out_dims.size(); - } else { - if (dx) { - dx->mutable_data(ctx.GetPlace()); - const auto& runner_dx = - NpuOpRunner("MatMul", {*dout, *y}, {*dx}, - {{"transpose_x1", false}, {"transpose_x2", true}}); + auto stream = ctx.template device_context().stream(); - runner_dx.Run(stream); - } - if (dy) { - dy->mutable_data(ctx.GetPlace()); - const auto& runner_dy = - NpuOpRunner("MatMul", {*x, *dout}, {*dy}, - {{"transpose_x1", true}, {"transpose_x2", false}}); + // Case 1: [K] x [K] = [1] + if (x_ndim == 1 && y_ndim == 1) { + Tensor dout_temp(dOut->type()); + dout_temp.Resize(X->dims()); + dout_temp.mutable_data(ctx.GetPlace()); + NpuOpRunner runner; + runner.SetType("BroadcastTo") + .AddInput(*dOut) + .AddInput(std::move(x_dims)) + .AddOutput(dout_temp) + .Run(stream); - runner_dy.Run(stream); + if (dX) { + dX->mutable_data(ctx.GetPlace()); + const auto& runner_dx = NpuOpRunner("Mul", {dout_temp, *Y}, {*dX}, {}); + runner_dx.Run(stream); + } + if (dY) { + dY->mutable_data(ctx.GetPlace()); + const auto& runner_dy = NpuOpRunner("Mul", {dout_temp, *X}, {*dY}, {}); + runner_dy.Run(stream); + } + return; + } + + // Resize dim 1 to 2 + Tensor x_temp, y_temp, dout_temp; + x_temp.ShareDataWith(*X); + y_temp.ShareDataWith(*Y); + dout_temp.ShareDataWith(*dOut); + if (x_ndim == 1) { + x_dims.insert(x_dims.begin(), 1); + out_dims.insert(out_dims.end() - 1, 1); + x_temp.Resize(framework::make_ddim(x_dims)); + dout_temp.Resize(framework::make_ddim(out_dims)); + x_ndim = 2; + out_ndim += 1; + } + if (y_ndim == 1) { + y_dims.push_back(1); + out_dims.push_back(1); + y_temp.Resize(framework::make_ddim(y_dims)); + dout_temp.Resize(framework::make_ddim(out_dims)); + y_ndim = 2; + out_ndim += 1; + } + + // Case 2: [M, K] x [K, N] = [M, N] + if (out_ndim == 2) { + if (dX) { + dX->Resize(framework::make_ddim(x_dims)); + if (trans_x) { + MatMul2D(ctx, stream, y_temp, dout_temp, dX, trans_y, true); + } else { + MatMul2D(ctx, stream, dout_temp, y_temp, dX, false, !trans_y); } + dX->Resize(X->dims()); } - } else if (x->dims().size() > 2) { - if (transpose_y) { - if (dx) { - dx->mutable_data(ctx.GetPlace()); - const auto& runner_dx = - NpuOpRunner("BatchMatMul", {*dout, *y}, {*dx}, - {{"adj_x1", false}, {"adj_x2", false}}); - - runner_dx.Run(stream); + if (dY) { + dY->Resize(framework::make_ddim(y_dims)); + if (trans_y) { + MatMul2D(ctx, stream, dout_temp, x_temp, dY, true, trans_x); + } else { + MatMul2D(ctx, stream, x_temp, dout_temp, dY, !trans_x, false); } - if (dy) { - dy->mutable_data(ctx.GetPlace()); - const auto& runner_dy = - NpuOpRunner("BatchMatMul", {*dout, *x}, {*dy}, - {{"adj_x1", true}, {"adj_x2", false}}); + dY->Resize(Y->dims()); + } + return; + } + + const int K = trans_x ? x_dims[x_ndim - 2] : x_dims[x_ndim - 1]; + const int N = trans_y ? y_dims[y_ndim - 2] : y_dims[y_ndim - 1]; - runner_dy.Run(stream); + // Case 3: [B, M, K] x [K, N] = [B, M, N], when trans_x = false + // Equal: [B * M, K] x [K, N] = [B * M, N] => [B, M, N] + if (trans_x == false && y_ndim == 2) { + std::vector x_vec_dim = {x_temp.numel() / K, K}; + dout_temp.Resize( + framework::make_ddim(std::vector{dout_temp.numel() / N, N})); + if (dX) { + dX->Resize(framework::make_ddim(x_vec_dim)); + MatMul2D(ctx, stream, dout_temp, y_temp, dX, false, !trans_y); + dX->Resize(X->dims()); + } + if (dY) { + x_temp.Resize(framework::make_ddim(x_vec_dim)); + if (trans_y) { + MatMul2D(ctx, stream, dout_temp, x_temp, dY, true, false); + } else { + MatMul2D(ctx, stream, x_temp, dout_temp, dY, true, false); } - } else { - if (dx) { - dx->mutable_data(ctx.GetPlace()); - const auto& runner_dx = - NpuOpRunner("BatchMatMul", {*dout, *y}, {*dx}, - {{"adj_x1", false}, {"adj_x2", true}}); + } + return; + } + + // Case 4: [B, M, K] x [B, K, N] = [B, M, N] + std::vector x_broadcast_dims(out_ndim, 1); + std::vector y_broadcast_dims(out_ndim, 1); + std::copy(out_dims.begin(), out_dims.end() - 2, x_broadcast_dims.begin()); + std::copy(out_dims.begin(), out_dims.end() - 2, y_broadcast_dims.begin()); + std::copy(x_dims.end() - 2, x_dims.end(), x_broadcast_dims.end() - 2); + std::copy(y_dims.end() - 2, y_dims.end(), y_broadcast_dims.end() - 2); + + Tensor x_temp_brd(X->type()); + if (x_dims == x_broadcast_dims) { + x_temp_brd.ShareDataWith(*X); + x_temp_brd.Resize(framework::make_ddim(x_broadcast_dims)); + } else { + x_temp_brd.Resize(framework::make_ddim(x_broadcast_dims)); + x_temp_brd.mutable_data(ctx.GetPlace()); + NpuOpRunner runner_brd; + runner_brd.SetType("BroadcastTo") + .AddInput(x_temp) + .AddInput(std::move(x_broadcast_dims)) + .AddOutput(x_temp_brd) + .Run(stream); + } - runner_dx.Run(stream); + Tensor y_temp_brd(Y->type()); + if (y_dims == y_broadcast_dims) { + y_temp_brd.ShareDataWith(*Y); + y_temp_brd.Resize(framework::make_ddim(y_broadcast_dims)); + } else { + y_temp_brd.Resize(framework::make_ddim(y_broadcast_dims)); + y_temp_brd.mutable_data(ctx.GetPlace()); + NpuOpRunner runner_brd; + runner_brd.SetType("BroadcastTo") + .AddInput(y_temp) + .AddInput(std::move(y_broadcast_dims)) + .AddOutput(y_temp_brd) + .Run(stream); + } + + if (dX) { + if (x_dims == x_broadcast_dims) { + if (trans_x) { + MatMulND(ctx, stream, y_temp_brd, dout_temp, dX, trans_y, true); + } else { + MatMulND(ctx, stream, dout_temp, y_temp_brd, dX, false, !trans_y); } - if (dy) { - dy->mutable_data(ctx.GetPlace()); - if ((x->dims().size() == 3) && (dout->dims().size() == 3) && - (dy->dims().size() == 2)) { - framework::Tensor dout_tmp; - dout_tmp.ShareDataWith(*dout); - std::vector vec_dim = - framework::vectorize(dout_tmp.dims()); - std::vector vec_dim_v{vec_dim[0] * vec_dim[1], vec_dim[2]}; - dout_tmp.Resize(framework::make_ddim(vec_dim_v)); - - framework::Tensor x_tmp; - x_tmp.ShareDataWith(*x); - std::vector vec_dim_x = - framework::vectorize(x_tmp.dims()); - std::vector vec_dim_x_v{vec_dim_x[0] * vec_dim_x[1], - vec_dim_x[2]}; - x_tmp.Resize(framework::make_ddim(vec_dim_x_v)); - const auto& runner_dy = - NpuOpRunner("MatMul", {x_tmp, dout_tmp}, {*dy}, - {{"transpose_x1", true}, {"transpose_x2", false}}); - runner_dy.Run(stream); - } else { - const auto& runner_dy = - NpuOpRunner("BatchMatMul", {*x, *dout}, {*dy}, - {{"adj_x1", true}, {"adj_x2", false}}); - runner_dy.Run(stream); - } + } else { + Tensor dx_temp(X->type()); + dx_temp.Resize(framework::make_ddim(x_broadcast_dims)); + if (trans_x) { + MatMulND(ctx, stream, y_temp_brd, dout_temp, &dx_temp, trans_y, + true); + } else { + MatMulND(ctx, stream, dout_temp, y_temp_brd, &dx_temp, false, + !trans_y); } + ReduceDims(ctx, stream, x_dims, x_broadcast_dims, dx_temp, dX); + } + } + if (dY) { + if (y_dims == y_broadcast_dims) { + if (trans_y) { + MatMulND(ctx, stream, dout_temp, x_temp_brd, dY, true, trans_x); + } else { + MatMulND(ctx, stream, x_temp_brd, dout_temp, dY, !trans_x, false); + } + } else { + Tensor dy_temp(Y->type()); + dy_temp.Resize(framework::make_ddim(y_broadcast_dims)); + if (trans_y) { + MatMulND(ctx, stream, dout_temp, x_temp_brd, &dy_temp, true, + trans_x); + } else { + MatMulND(ctx, stream, x_temp_brd, dout_temp, &dy_temp, !trans_x, + false); + } + ReduceDims(ctx, stream, y_dims, y_broadcast_dims, dy_temp, dY); } } } }; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_NPU_KERNEL( - matmul_v2, - ops::MatMulV2NPUKernel, - ops::MatMulV2NPUKernel); -REGISTER_OP_NPU_KERNEL( - matmul_v2_grad, - ops::MatMulV2GradNPUKernel, - ops::MatMulV2GradNPUKernel); +REGISTER_OP_NPU_KERNEL(matmul_v2, ops::MatMulV2NPUKernel, + ops::MatMulV2NPUKernel); +REGISTER_OP_NPU_KERNEL(matmul_v2_grad, ops::MatMulV2GradNPUKernel, + ops::MatMulV2GradNPUKernel); diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 7f2937b9af7643b5dbf21fb940413a507b0e694e..4d90b9159470eb82e87b7c1ac0695130c45ee75a 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -55,6 +55,7 @@ __all__ = [ 'is_compiled_with_cuda', 'is_compiled_with_rocm', 'is_compiled_with_xpu', + 'is_compiled_with_npu', 'Variable', 'require_version', 'device_guard', @@ -380,6 +381,15 @@ def _xpu_ids(): return device_ids +def _npu_ids(): + npus_env = os.getenv("FLAGS_selected_npus") + if npus_env: + device_ids = [int(s) for s in npus_env.split(",")] + else: + device_ids = six.moves.range(core.get_npu_device_count()) + return device_ids + + def is_compiled_with_xpu(): """ Whether this whl package can be used to run the model on XPU. @@ -395,6 +405,21 @@ def is_compiled_with_xpu(): return core.is_compiled_with_xpu() +def is_compiled_with_npu(): + """ + Whether this whl package can be used to run the model on NPU. + + Returns (bool): support npu or not. + + Examples: + .. code-block:: python + + import paddle.fluid as fluid + support_npu = fluid.is_compiled_with_npu() + """ + return core.is_compiled_with_npu() + + def disable_signal_handler(): """ Reset signal handler registered by Paddle. @@ -538,6 +563,47 @@ def xpu_places(device_ids=None): return [core.XPUPlace(dev_id) for dev_id in device_ids] +def npu_places(device_ids=None): + """ + **Note**: + For multi-card tasks, please use `FLAGS_selected_npus` environment variable to set the visible NPU device. + + This function creates a list of :code:`paddle.NPUPlace` objects. + If :code:`device_ids` is None, environment variable of + :code:`FLAGS_selected_npus` would be checked first. For example, if + :code:`FLAGS_selected_npus=0,1,2`, the returned list would + be [paddle.NPUPlace(0), paddle.NPUPlace(1), paddle.NPUPlace(2)]. + If :code:`FLAGS_selected_npus` is not set, all visible + npu places would be returned. + If :code:`device_ids` is not None, it should be the device + ids of NPUs. For example, if :code:`device_ids=[0,1,2]`, + the returned list would be + [paddle.NPUPlace(0), paddle.NPUPlace(1), paddle.NPUPlace(2)]. + + Parameters: + device_ids (list or tuple of int, optional): list of NPU device ids. + Returns: + list of paddle.NPUPlace: Created NPU place list. + Examples: + .. code-block:: python + + # required: npu + + import paddle + import paddle.static as static + + paddle.enable_static() + npu_places = static.npu_places() + """ + assert core.is_compiled_with_npu(), \ + "Not compiled with NPU" + if device_ids is None: + device_ids = _npu_ids() + elif not isinstance(device_ids, (list, tuple)): + device_ids = [device_ids] + return [core.NPUPlace(dev_id) for dev_id in device_ids] + + def cpu_places(device_count=None): """ This function creates a list of :code:`paddle.CPUPlace` objects, and returns the created list. @@ -1927,6 +1993,10 @@ class Variable(object): p = core.Place() p.set_place(t._place()) place = core.XPUPlace(p.xpu_device_id()) + elif p.is_npu_place(): + p = core.Place() + p.set_place(t._place()) + place = core.NPUPlace(p.npu_device_id()) else: p = core.Place() p.set_place(t._place()) diff --git a/python/paddle/fluid/tests/unittests/npu/CMakeLists.txt b/python/paddle/fluid/tests/unittests/npu/CMakeLists.txt index 44b3c6764a7cfa6e84141886537ead3712d4a44a..4e81bb9544ceb9161046d978fb7b48ca8144ea31 100644 --- a/python/paddle/fluid/tests/unittests/npu/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/npu/CMakeLists.txt @@ -20,4 +20,5 @@ if (WITH_ASCEND_CL) set_tests_properties(test_stack_op_npu PROPERTIES TIMEOUT 300) set_tests_properties(test_conv2d_transpose_op_npu PROPERTIES TIMEOUT 200) set_tests_properties(test_conv2d_op_npu PROPERTIES TIMEOUT 300) + set_tests_properties(test_matmulv2_op_npu PROPERTIES TIMEOUT 300) endif() diff --git a/python/paddle/fluid/tests/unittests/npu/test_matmulv2_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_matmulv2_op_npu.py index 53766c5eb61b7a429636551b5e7b1c926f9b38e4..882043ef6eb911f6163d516e9929658f38810ade 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_matmulv2_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_matmulv2_op_npu.py @@ -21,56 +21,35 @@ sys.path.append("..") from op_test import OpTest import paddle import paddle.fluid as fluid +from test_matmul_v2_op import reference_matmul paddle.enable_static() SEED = 2021 -def reference_matmul(X, Y, transpose_X=False, transpose_Y=False): - """Reference forward implementation using np.matmul.""" - # np.matmul does not support the transpose flags, so we manually - # transpose X and Y appropriately. - if transpose_X: - if X.ndim == 1: - X = X.reshape((X.size)) - elif X.ndim == 2: - X = X.T - else: - dim = [i for i in range(len(X.shape))] - dim[-1], dim[len(X.shape) - 2] = dim[len(X.shape) - 2], dim[-1] - X = np.transpose(X, tuple(dim)) - if transpose_Y: - if Y.ndim == 1: - Y = Y.reshape((Y.size)) - else: - dim = [i for i in range(len(Y.shape))] - dim[-1], dim[len(Y.shape) - 2] = dim[len(Y.shape) - 2], dim[-1] - Y = np.transpose(Y, tuple(dim)) - - Out = np.matmul(X, Y) - if not Out.shape: - # We do not support 0-dimensional Tensors (scalars). So where - # np.matmul outputs a scalar, we must convert to a Tensor of - # shape (1) instead. - # Everywhere else, we are compatible with np.matmul. - Out = np.array([Out], dtype="float64") - return Out - - -class TestMatMul(OpTest): +class TestMatMulV2Op(OpTest): + """ + case 1 + """ + + def set_npu(self): + self.__class__.use_npu = True + self.place = paddle.NPUPlace(0) + def config(self): - self.x_shape = (100, 24) - self.y_shape = (24, 100) + self.x_shape = (100, ) + self.y_shape = (100, ) self.trans_x = False self.trans_y = False + def init_kernel_type(self): + self.dtype = "float32" + def setUp(self): self.set_npu() - self.op_type = "matmul_v2" - self.place = paddle.NPUPlace(0) - self.init_dtype() + self.init_kernel_type() self.config() - np.random.seed(SEED) + self.op_type = "matmul_v2" x = np.random.random(self.x_shape).astype(self.dtype) y = np.random.random(self.y_shape).astype(self.dtype) # -0.1 ~ 0.1 @@ -85,201 +64,314 @@ class TestMatMul(OpTest): self.attrs = {'trans_x': self.trans_x, 'trans_y': self.trans_y} self.outputs = {'Out': result} - def set_npu(self): - self.__class__.use_npu = True - self.__class__.no_need_check_grad = True - - def init_dtype(self): - self.dtype = np.float32 - def test_check_output(self): - self.check_output_with_place(self.place, atol=1e-5) + self.check_output_with_place(self.place, atol=1e-7) + + def test_check_grad(self): + self.check_grad_with_place(self.place, ['X', 'Y'], 'Out') - # TODO(ascendrc): Add grad test - # def test_check_grad(self): - # if self.dtype == np.float16: - # return - # self.check_grad(['X'], 'Out') - # -class TestMatMul2(TestMatMul): +class TestMatMuklOp2(TestMatMulV2Op): """ case 2 """ def config(self): - self.x_shape = (32, 24) - self.y_shape = (32, 24) + self.x_shape = (100, ) + self.y_shape = (1, 3, 2, 100) self.trans_x = False self.trans_y = True -class TestMatMul3(TestMatMul): +class TestMatMuklOp3(TestMatMulV2Op): """ case 3 """ - def init_dtype(self): - self.dtype = np.float16 + def config(self): + self.x_shape = (100, ) + self.y_shape = (1, 1, 100, 2) + self.trans_x = False + self.trans_y = False -class TestMatMul4(TestMatMul): +class TestMatMuklOp4(TestMatMulV2Op): """ - case 4 dim=3 + case 4 """ def config(self): - self.x_shape = (2, 3, 4) - self.y_shape = (2, 4, 3) + self.x_shape = (100, ) + self.y_shape = (1, 2, 100, 2) self.trans_x = False self.trans_y = False -class TestMatMulNet(unittest.TestCase): - def _test(self, run_npu=True): - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = SEED - startup_prog.random_seed = SEED - np.random.seed(SEED) - - a_np = np.random.random(size=(2, 3)).astype('float32') - b_np = np.random.random(size=(2, 3)).astype('float32') - c_np = np.random.random(size=(3, 2)).astype('float32') - d_np = np.random.random(size=(3, 2)).astype('float32') - label_np = np.random.randint(2, size=(2, 1)).astype('int64') - - with paddle.static.program_guard(main_prog, startup_prog): - a = paddle.static.data(name="a", shape=[2, 3], dtype='float32') - b = paddle.static.data(name="b", shape=[2, 3], dtype='float32') - c = paddle.static.data(name="c", shape=[3, 2], dtype='float32') - d = paddle.static.data(name="d", shape=[3, 2], dtype='float32') - label = paddle.static.data( - name="label", shape=[2, 1], dtype='int64') - - sum_1 = paddle.add(a, b) - sum_2 = paddle.add(c, d) - result = paddle.matmul(sum_1, sum_2) - - fc_1 = fluid.layers.fc(input=result, size=8) - prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax') - - cost = fluid.layers.cross_entropy(input=prediction, label=label) - loss = fluid.layers.reduce_mean(cost) - sgd = fluid.optimizer.SGD(learning_rate=0.01) - sgd.minimize(loss) - - if run_npu: - place = paddle.NPUPlace(0) - else: - place = paddle.CPUPlace() - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - print("Start run on {}".format(place)) - for epoch in range(100): - - pred_res, loss_res = exe.run(main_prog, - feed={ - "a": a_np, - "b": b_np, - "c": c_np, - "d": d_np, - "label": label_np - }, - fetch_list=[prediction, loss]) - if epoch % 10 == 0: - print("Epoch {} | Prediction[0]: {}, Loss: {}".format( - epoch, pred_res[0], loss_res)) - - return pred_res, loss_res - - def test_npu(self): - cpu_pred, cpu_loss = self._test(False) - npu_pred, npu_loss = self._test(True) - - self.assertTrue(np.allclose(npu_pred, cpu_pred)) - self.assertTrue(np.allclose(npu_loss, cpu_loss)) - - -# The precision is aligned in NPU and GPU separately, which is only used for the usage method. - - -class TestMatMulNet3_2(unittest.TestCase): - def _test(self, run_npu=True): - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = SEED - startup_prog.random_seed = SEED - np.random.seed(SEED) - self._dtype = "float32" - - a_np = np.random.random(size=(2, 1, 3)).astype(self._dtype) - b_np = np.random.random(size=(2, 1, 3)).astype(self._dtype) - c_np = np.random.random(size=(3, 2)).astype(self._dtype) - d_np = np.random.random(size=(3, 2)).astype(self._dtype) - label_np = np.random.randint(2, size=(2, 1)).astype('int64') - - with paddle.static.program_guard(main_prog, startup_prog): - a = paddle.static.data(name="a", shape=[2, 1, 3], dtype=self._dtype) - b = paddle.static.data(name="b", shape=[2, 1, 3], dtype=self._dtype) - c = paddle.static.data(name="c", shape=[3, 2], dtype=self._dtype) - d = paddle.static.data(name="d", shape=[3, 2], dtype=self._dtype) - label = paddle.static.data( - name="label", shape=[2, 1], dtype='int64') - - sum_1 = paddle.add(a, b) - sum_2 = paddle.add(c, d) - sum_1 = paddle.cast(sum_1, 'float16') - sum_2 = paddle.cast(sum_2, 'float16') - if not run_npu: - sum_1 = paddle.cast(sum_1, 'float32') - sum_2 = paddle.cast(sum_2, 'float32') - - result = paddle.matmul(sum_1, sum_2) - if run_npu: - result = paddle.cast(result, 'float32') - - result = paddle.reshape(result, shape=[2, 2]) - fc_1 = fluid.layers.fc(input=result, size=8) - prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax') - - cost = fluid.layers.cross_entropy(input=prediction, label=label) - loss = fluid.layers.reduce_mean(cost) - sgd = fluid.optimizer.SGD(learning_rate=0.01) - sgd.minimize(loss) - - if run_npu: +class TestMatMuklOp5(TestMatMulV2Op): + """ + case 5 + """ + + def config(self): + self.x_shape = (1, 1, 100, 1) + self.y_shape = (100, ) + self.trans_x = True + self.trans_y = False + + +class TestMatMuklOp6(TestMatMulV2Op): + """ + case 6 + """ + + def config(self): + self.x_shape = (1, 2, 102, 1) + self.y_shape = (102, ) + self.trans_x = True + self.trans_y = False + + +class TestMatMuklOp7(TestMatMulV2Op): + """ + case 7 + """ + + def config(self): + self.x_shape = (1, 2, 1, 100) + self.y_shape = (100, ) + self.trans_x = False + self.trans_y = False + + +class TestMatMuklOp8(TestMatMulV2Op): + """ + case 8 + """ + + def config(self): + self.x_shape = (1, 1, 2, 100) + self.y_shape = (1, 1, 100, 2) + self.trans_x = False + self.trans_y = False + + +class TestMatMuklOp9(TestMatMulV2Op): + """ + case 9 + """ + + def config(self): + self.x_shape = (1, 1, 1, 100) + self.y_shape = (2, 1, 2, 100) + self.trans_x = False + self.trans_y = True + + +class TestMatMuklOp10(TestMatMulV2Op): + """ + case 10 + """ + + def config(self): + self.x_shape = (1, 1, 25, 4) + self.y_shape = (1, 2, 4, 25) + self.trans_x = False + self.trans_y = False + + +class TestMatMuklOp11(TestMatMulV2Op): + """ + case 11 + """ + + def config(self): + self.x_shape = (2, 1, 2, 100) + self.y_shape = (1, 1, 100, 2) + self.trans_x = False + self.trans_y = False + + +class TestMatMuklOp12(TestMatMulV2Op): + """ + case 12 + """ + + def config(self): + self.x_shape = (2, 1, 4, 25) + self.y_shape = (1, 1, 4, 25) + self.trans_x = True + self.trans_y = False + + +class TestMatMuklOp13(TestMatMulV2Op): + """ + case 13 + """ + + def config(self): + self.x_shape = (2, 2, 10, 10) + self.y_shape = (2, 2, 10, 10) + self.trans_x = True + self.trans_y = False + + +class TestMatMuklOp14(TestMatMulV2Op): + """ + case 14_1 + """ + + def config(self): + self.x_shape = (3, 1, 6, 6) + self.y_shape = (1, 2, 6, 9) + self.trans_x = True + self.trans_y = False + + +class TestMatMuklOp15(TestMatMulV2Op): + """ + case 14_2 + """ + + def config(self): + self.x_shape = (3, 1, 6, 6) + self.y_shape = (1, 2, 6, 9) + self.trans_x = False + self.trans_y = False + + +class TestMatMuklOp16(TestMatMulV2Op): + """ + case 16 : to check the gradient for special case + """ + + def config(self): + self.x_shape = (100) + self.y_shape = (1, 2, 2, 100, 2) + self.trans_x = False + self.trans_y = False + + +class TestMatMuklOp17(TestMatMulV2Op): + """ + case 17 : to check the gradient for special case + """ + + def config(self): + self.x_shape = (2, 1, 100) + self.y_shape = (100) + self.trans_x = False + self.trans_y = False + + +class TestMatMuklOpBroadcast1(TestMatMulV2Op): + """ + case 14_3 + """ + + def config(self): + self.x_shape = (3, 1, 10, 10) + self.y_shape = (1, 2, 10, 10) + self.trans_x = True + self.trans_y = True + + +class TestMatMuklOpBroadcast2(TestMatMulV2Op): + """ + case 14_4 + """ + + def config(self): + self.x_shape = (3, 1, 10, 10) + self.y_shape = (1, 2, 10, 10) + self.trans_x = False + self.trans_y = True + + +#--------------------test matmul fp16-------------------- + + +def create_test_fp16_class(parent, atol=0.001, max_relative_error=2.5): + class TestMatMulOpFp16Case(parent): + def init_kernel_type(self): + self.dtype = np.float16 + + def test_check_output(self): + self.check_output_with_place(self.place, atol=atol) + + def test_check_grad(self): + self.check_grad_with_place( + self.place, ['X', 'Y'], + 'Out', + max_relative_error=max_relative_error) + + cls_name = "{0}_{1}".format(parent.__name__, "Fp16") + TestMatMulOpFp16Case.__name__ = cls_name + globals()[cls_name] = TestMatMulOpFp16Case + + +create_test_fp16_class(TestMatMulV2Op) +create_test_fp16_class(TestMatMuklOp2) +create_test_fp16_class(TestMatMuklOp3) +create_test_fp16_class(TestMatMuklOp4) +create_test_fp16_class(TestMatMuklOp5) +create_test_fp16_class(TestMatMuklOp6) +create_test_fp16_class(TestMatMuklOp7) +create_test_fp16_class(TestMatMuklOp8) +create_test_fp16_class(TestMatMuklOp9) +create_test_fp16_class(TestMatMuklOp10) +create_test_fp16_class(TestMatMuklOp11) +create_test_fp16_class(TestMatMuklOp12) +create_test_fp16_class(TestMatMuklOp13) +create_test_fp16_class(TestMatMuklOp14) +create_test_fp16_class(TestMatMuklOp15) +create_test_fp16_class(TestMatMuklOp16) +create_test_fp16_class(TestMatMuklOp17) + + +class TestMatMulV2API(unittest.TestCase): + def setUp(self): + self.places = [paddle.CPUPlace()] + if paddle.is_compiled_with_npu(): + self.places.append(paddle.NPUPlace(0)) + + def check_static_result(self, place): + with fluid.program_guard(fluid.Program(), fluid.Program()): + input_x = fluid.data(name="input_x", shape=[4, 3], dtype="float32") + input_y = fluid.data(name="input_y", shape=[3, 4], dtype="float32") + + result = paddle.matmul(input_x, input_y) + + x_np = np.random.random([4, 3]).astype("float32") + y_np = np.random.random([3, 4]).astype("float32") + + exe = fluid.Executor(place) + fetches = exe.run(fluid.default_main_program(), + feed={"input_x": x_np, + "input_y": y_np}, + fetch_list=[result]) + + def test_static(self): + for place in self.places: + self.check_static_result(place=place) + + def test_dygraph(self): + for place in self.places: + with fluid.dygraph.guard(place): + input_x = np.random.random([4, 3]).astype("float32") + input_y = np.random.random([3, 4]).astype("float32") + x = paddle.to_tensor(input_x) + y = paddle.to_tensor(input_y) + result = paddle.matmul(x, y) + + def test_dygraph_fp16(self): + if paddle.is_compiled_with_npu(): place = paddle.NPUPlace(0) - else: - place = paddle.CPUPlace() - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - print("Start run on {}".format(place)) - for epoch in range(100): - - pred_res, loss_res = exe.run(main_prog, - feed={ - "a": a_np, - "b": b_np, - "c": c_np, - "d": d_np, - "label": label_np - }, - fetch_list=[prediction, loss]) - if epoch % 10 == 0: - print("Epoch {} | Prediction[0]: {}, Loss: {}".format( - epoch, pred_res[0], loss_res)) - - return pred_res, loss_res - - def test_npu(self): - cpu_pred, cpu_loss = self._test(False) - npu_pred, npu_loss = self._test(True) - - self.assertTrue(np.allclose(npu_pred, cpu_pred, atol=1e-4)) - self.assertTrue(np.allclose(npu_loss, cpu_loss, atol=1e-4)) + with fluid.dygraph.guard(place): + input_x = np.random.random([4, 3]).astype("float16") + input_y = np.random.random([3, 4]).astype("float16") + x = paddle.to_tensor(input_x) + y = paddle.to_tensor(input_y) + result = paddle.matmul(x, y) if __name__ == '__main__': diff --git a/python/paddle/static/__init__.py b/python/paddle/static/__init__.py index 0f463b0c7d9418a2a847edc71ff4d5eec93d0781..20af4158df48fdc022d8a1923ad2e18e1a54b96d 100644 --- a/python/paddle/static/__init__.py +++ b/python/paddle/static/__init__.py @@ -43,6 +43,7 @@ from ..fluid.framework import program_guard # noqa: F401 from ..fluid.framework import cpu_places # noqa: F401 from ..fluid.framework import cuda_places # noqa: F401 from ..fluid.framework import xpu_places # noqa: F401 +from ..fluid.framework import npu_places # noqa: F401 from ..fluid.framework import Variable # noqa: F401 from ..fluid.layers.control_flow import Print # noqa: F401 from ..fluid.layers.nn import py_func # noqa: F401 @@ -99,6 +100,7 @@ __all__ = [ #noqa 'cpu_places', 'cuda_places', 'xpu_places', + 'npu_places', 'Variable', 'create_global_var', 'accuracy', diff --git a/python/paddle/utils/install_check.py b/python/paddle/utils/install_check.py index 69baa4facfa96c3d64561697ba001be30319781d..efdc6847f0056183e641141494d2c9849245589d 100644 --- a/python/paddle/utils/install_check.py +++ b/python/paddle/utils/install_check.py @@ -74,7 +74,22 @@ def _is_cuda_available(): return False -def _run_dygraph_single(use_cuda): +def _is_npu_available(): + """ + Check whether NPU is avaiable. + """ + try: + assert len(paddle.static.npu_places()) > 0 + return True + except Exception as e: + logging.warning( + "You are using NPU version PaddlePaddle, but there is no NPU " + "detected on your machine. Maybe NPU devices is not set properly." + "\n Original Error is {}".format(e)) + return False + + +def _run_dygraph_single(use_cuda, use_npu): """ Testing the simple network in dygraph mode using one CPU/GPU. @@ -84,6 +99,8 @@ def _run_dygraph_single(use_cuda): paddle.disable_static() if use_cuda: paddle.set_device('gpu') + elif use_npu: + paddle.set_device('npu') else: paddle.set_device('cpu') weight_attr = paddle.ParamAttr( @@ -102,7 +119,7 @@ def _run_dygraph_single(use_cuda): opt.step() -def _run_static_single(use_cuda): +def _run_static_single(use_cuda, use_npu): """ Testing the simple network with executor running directly, using one CPU/GPU. @@ -119,8 +136,14 @@ def _run_static_single(use_cuda): param_grads = paddle.static.append_backward( out, parameter_list=[weight.name])[0] - exe = paddle.static.Executor( - paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()) + if use_cuda: + place = paddle.CUDAPlace(0) + elif use_npu: + place = paddle.NPUPlace(0) + else: + place = paddle.CPUPlace() + + exe = paddle.static.Executor(place) exe.run(startup_prog) exe.run(train_prog, feed={input.name: _prepare_data(1)}, @@ -128,7 +151,7 @@ def _run_static_single(use_cuda): paddle.disable_static() -def _run_static_parallel(use_cuda, device_list): +def _run_static_parallel(use_cuda, use_npu, device_list): """ Testing the simple network in data parallel mode, using multiple CPU/GPU. @@ -150,8 +173,15 @@ def _run_static_parallel(use_cuda, device_list): train_prog).with_data_parallel( loss_name=loss.name, places=device_list) - exe = paddle.static.Executor( - paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()) + if use_cuda: + place = paddle.CUDAPlace(0) + elif use_npu: + place = paddle.NPUPlace(0) + compiled_prog = train_prog + else: + place = paddle.CPUPlace() + + exe = paddle.static.Executor(place) exe.run(startup_prog) exe.run(compiled_prog, feed={input.name: _prepare_data(len(device_list))}, @@ -182,23 +212,31 @@ def run_check(): if paddle.is_compiled_with_cuda(): use_cuda = _is_cuda_available() + use_npu = False + elif paddle.is_compiled_with_npu(): + use_npu = _is_npu_available() + use_cuda = False else: + use_npu = False use_cuda = False if use_cuda: device_str = "GPU" device_list = paddle.static.cuda_places() + elif use_npu: + device_str = "NPU" + device_list = paddle.static.npu_places() else: device_str = "CPU" device_list = paddle.static.cpu_places(device_count=2) device_count = len(device_list) - _run_static_single(use_cuda) - _run_dygraph_single(use_cuda) + _run_static_single(use_cuda, use_npu) + _run_dygraph_single(use_cuda, use_npu) print("PaddlePaddle works well on 1 {}.".format(device_str)) try: - _run_static_parallel(use_cuda, device_list) + _run_static_parallel(use_cuda, use_npu, device_list) print("PaddlePaddle works well on {} {}s.".format(device_count, device_str)) print(