From 9bf00cd59319ed1ebd82eddb5c0c4d2f3aa12f16 Mon Sep 17 00:00:00 2001 From: Baibaifan <39549453+Baibaifan@users.noreply.github.com> Date: Wed, 23 Jun 2021 15:22:37 +0800 Subject: [PATCH] repair npu matmul_grad and comm_init_hccl (#33719) --- .../collective/c_comm_init_hccl_op.cc | 29 +++++++ paddle/fluid/operators/matmul_v2_op_npu.cc | 32 +++++++- .../unittests/npu/test_matmulv2_op_npu.py | 80 +++++++++++++++++++ 3 files changed, 137 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/operators/collective/c_comm_init_hccl_op.cc b/paddle/fluid/operators/collective/c_comm_init_hccl_op.cc index 7817f19bac..5f765d9544 100644 --- a/paddle/fluid/operators/collective/c_comm_init_hccl_op.cc +++ b/paddle/fluid/operators/collective/c_comm_init_hccl_op.cc @@ -22,7 +22,11 @@ class Scope; } // namespace framework } // namespace paddle #if defined(PADDLE_WITH_ASCEND_CL) +#include "acl/acl.h" +#include "hccl/hccl.h" +#include "hccl/hccl_types.h" #include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" #endif namespace paddle { @@ -57,6 +61,31 @@ class CCommInitOpAscend : public framework::OperatorBase { } platform::HCCLCommContext::Instance().CreateHCCLComm( hccl_id, rank_ids, rank_id, device_id, rid); + + // Build comm + float* buff; + int32_t size = 20; + std::vector input(size, 0); + for (int32_t idx = 0; idx < size; idx++) { + input[idx] = 1.0; + } + aclrtMalloc(reinterpret_cast(&buff), size * sizeof(float), + ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMemcpy(reinterpret_cast(buff), size * sizeof(float), + input.data(), size * sizeof(float), ACL_MEMCPY_HOST_TO_DEVICE); + VLOG(3) << "Build buff data successful."; + + aclrtStream stream = nullptr; + auto comm = paddle::platform::HCCLCommContext::Instance().Get(rid, place); + if (rank_id == 0) { + stream = comm->stream(); + } else { + auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); + stream = static_cast(dev_ctx)->stream(); + } + platform::dynload::HcclBroadcast(buff, size, HCCL_DATA_TYPE_FP32, 0, + comm->comm(), stream); + VLOG(3) << "Build connection successful."; #else PADDLE_THROW(platform::errors::PreconditionNotMet( "PaddlePaddle should compile with NPU.")); diff --git a/paddle/fluid/operators/matmul_v2_op_npu.cc b/paddle/fluid/operators/matmul_v2_op_npu.cc index f499c24ea3..5ec48d3bf0 100644 --- a/paddle/fluid/operators/matmul_v2_op_npu.cc +++ b/paddle/fluid/operators/matmul_v2_op_npu.cc @@ -138,10 +138,34 @@ class MatMulV2GradNPUKernel : public framework::OpKernel { } if (dy) { dy->mutable_data(ctx.GetPlace()); - const auto& runner_dy = - NpuOpRunner("BatchMatMul", {*x, *dout}, {*dy}, - {{"adj_x1", true}, {"adj_x2", false}}); - runner_dy.Run(stream); + if ((x->dims().size() == 3) && (dout->dims().size() == 3) && + (dy->dims().size() == 2)) { + framework::Tensor dout_; + TensorCopy(*dout, ctx.GetPlace(), &dout_); + ctx.template device_context() + .Wait(); + std::vector vec_dim = framework::vectorize(dout_.dims()); + std::vector vec_dim_v{vec_dim[0] * vec_dim[1], vec_dim[2]}; + dout_.Resize(framework::make_ddim(vec_dim_v)); + + framework::Tensor x_; + TensorCopy(*x, ctx.GetPlace(), &x_); + ctx.template device_context() + .Wait(); + std::vector vec_dim_x = framework::vectorize(x_.dims()); + std::vector vec_dim_x_v{vec_dim_x[0] * vec_dim_x[1], + vec_dim_x[2]}; + x_.Resize(framework::make_ddim(vec_dim_x_v)); + const auto& runner_dy = + NpuOpRunner("MatMul", {x_, dout_}, {*dy}, + {{"transpose_x1", true}, {"transpose_x2", false}}); + runner_dy.Run(stream); + } else { + const auto& runner_dy = + NpuOpRunner("BatchMatMul", {*x, *dout}, {*dy}, + {{"adj_x1", true}, {"adj_x2", false}}); + runner_dy.Run(stream); + } } } } diff --git a/python/paddle/fluid/tests/unittests/npu/test_matmulv2_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_matmulv2_op_npu.py index b27b9c0b97..b093fa4f2c 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_matmulv2_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_matmulv2_op_npu.py @@ -206,5 +206,85 @@ class TestMatMulNet(unittest.TestCase): self.assertTrue(np.allclose(npu_loss, cpu_loss)) +# The precision is aligned in NPU and GPU separately, which is only used for the usage method. +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestMatMulNet3_2(unittest.TestCase): + def _test(self, run_npu=True): + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + main_prog.random_seed = SEED + startup_prog.random_seed = SEED + np.random.seed(SEED) + self._dtype = "float32" + + a_np = np.random.random(size=(2, 1, 3)).astype(self._dtype) + b_np = np.random.random(size=(2, 1, 3)).astype(self._dtype) + c_np = np.random.random(size=(3, 2)).astype(self._dtype) + d_np = np.random.random(size=(3, 2)).astype(self._dtype) + label_np = np.random.randint(2, size=(2, 1)).astype('int64') + + with paddle.static.program_guard(main_prog, startup_prog): + a = paddle.static.data(name="a", shape=[2, 1, 3], dtype=self._dtype) + b = paddle.static.data(name="b", shape=[2, 1, 3], dtype=self._dtype) + c = paddle.static.data(name="c", shape=[3, 2], dtype=self._dtype) + d = paddle.static.data(name="d", shape=[3, 2], dtype=self._dtype) + label = paddle.static.data( + name="label", shape=[2, 1], dtype='int64') + + sum_1 = paddle.add(a, b) + sum_2 = paddle.add(c, d) + sum_1 = paddle.cast(sum_1, 'float16') + sum_2 = paddle.cast(sum_2, 'float16') + if not run_npu: + sum_1 = paddle.cast(sum_1, 'float32') + sum_2 = paddle.cast(sum_2, 'float32') + + result = paddle.matmul(sum_1, sum_2) + if run_npu: + result = paddle.cast(result, 'float32') + + result = paddle.reshape(result, shape=[2, 2]) + fc_1 = fluid.layers.fc(input=result, size=8) + prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax') + + cost = fluid.layers.cross_entropy(input=prediction, label=label) + loss = fluid.layers.reduce_mean(cost) + sgd = fluid.optimizer.SGD(learning_rate=0.01) + sgd.minimize(loss) + + if run_npu: + place = paddle.NPUPlace(0) + else: + place = paddle.CPUPlace() + exe = paddle.static.Executor(place) + exe.run(startup_prog) + + print("Start run on {}".format(place)) + for epoch in range(100): + + pred_res, loss_res = exe.run(main_prog, + feed={ + "a": a_np, + "b": b_np, + "c": c_np, + "d": d_np, + "label": label_np + }, + fetch_list=[prediction, loss]) + if epoch % 10 == 0: + print("Epoch {} | Prediction[0]: {}, Loss: {}".format( + epoch, pred_res[0], loss_res)) + + return pred_res, loss_res + + def test_npu(self): + cpu_pred, cpu_loss = self._test(False) + npu_pred, npu_loss = self._test(True) + + self.assertTrue(np.allclose(npu_pred, cpu_pred, atol=1e-4)) + self.assertTrue(np.allclose(npu_loss, cpu_loss, atol=1e-4)) + + if __name__ == '__main__': unittest.main() -- GitLab