diff --git a/paddle/fluid/operators/matmul_v2_op_npu.cc b/paddle/fluid/operators/matmul_v2_op_npu.cc
--- a/paddle/fluid/operators/matmul_v2_op_npu.cc
+++ b/paddle/fluid/operators/matmul_v2_op_npu.cc
@@ -135,8 +135,32 @@ class MatMulV2GradNPUKernel : public framework::OpKernel<T> {
         }
         if (dy) {
           dy->mutable_data<T>(ctx.GetPlace());
-          auto runner_dy = NpuOpRunner("BatchMatMul", {*x, *dout}, {*dy},
-                                       {{"adj_x1", true}, {"adj_x2", false}});
-          runner_dy.Run(stream);
+          if (dy->dims().size() == 2) {
+            // dy is 2-D: the gradient must be accumulated over every
+            // leading dimension of x/dout. Folding those dimensions into
+            // the row axis lets one MatMul (x_2d^T * dout_2d) do that.
+            // NOTE(review): assumes dy already has Y's shape here.
+            // ShareDataWith + Resize only re-describe the shape of the
+            // shared buffer, avoiding synchronous copies of x and dout.
+            framework::Tensor x_2d;
+            x_2d.ShareDataWith(*x);
+            x_2d.Resize(
+                framework::flatten_to_2d(x->dims(), x->dims().size() - 1));
+            framework::Tensor dout_2d;
+            dout_2d.ShareDataWith(*dout);
+            dout_2d.Resize(framework::flatten_to_2d(dout->dims(),
+                                                    dout->dims().size() - 1));
+            auto runner_dy = NpuOpRunner(
+                "MatMul", {x_2d, dout_2d}, {*dy},
+                {{"transpose_x1", true}, {"transpose_x2", false}});
+            runner_dy.Run(stream);
+          } else {
+            // dy keeps a batch dimension, so compute it per batch entry
+            // with the batched product instead of collapsing the batch.
+            auto runner_dy =
+                NpuOpRunner("BatchMatMul", {*x, *dout}, {*dy},
+                            {{"adj_x1", true}, {"adj_x2", false}});
+            runner_dy.Run(stream);
+          }
         }
       }
diff --git a/python/paddle/distributed/collective.py b/python/paddle/distributed/collective.py
--- a/python/paddle/distributed/collective.py
+++ b/python/paddle/distributed/collective.py
@@ -927,6 +927,10 @@ def _linear(x, weight, bias=None, name=None):
     else:
         helper = LayerHelper('linear', **locals())
         dtype = x.dtype
+        # Inputs with more than three dimensions are not supported yet;
+        # fail early with a readable message.
+        assert x.ndim < 4, (
+            "X with more than 3 dimensions is not supported now.")

         check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'],
                                  'linear')