[hybrid performance] Optimize tensor parallel plus pipeline parallel send recv size (#34110)

348d043e · WangXi · GitHub · 651aad06 · 348d043e · 348d043e
12 changed file
--- a/paddle/fluid/operators/collective/partial_allgather_op.cc
+++ b/paddle/fluid/operators/collective/partial_allgather_op.cc
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/fluid/operators/collective/partial_allgather_op.h"
+namespace paddle {
+namespace operators {
+class PartialAllGatherOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "PartialAllGather");
+    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Input", "Out", "PartialAllGather");
+    int nranks = ctx->Attrs().Get<int>("nranks");
+    int rank = ctx->Attrs().Get<int>("rank");
+    PADDLE_ENFORCE_GE(nranks, 2, platform::errors::InvalidArgument(
+                                     "The value of nranks should be >=2."));
+    PADDLE_ENFORCE_EQ(
+        (rank >= 0 && rank < nranks), true,
+        platform::errors::InvalidArgument(
+            "The rank (%d) for partial_allgather op must >=0 and <nranks (%d)",
+            rank, nranks));
+    framework::DDim dim = ctx->GetInputDim("X");
+    ctx->SetOutputDim("Out", dim);
+  }
+};
+class PartialAllGatherOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() {
+    AddInput("X", "(Tensor) tensor to be partial allgather");
+    AddOutput("Out", "(Tensor) the allgather result");
+    AddAttr<int>("ring_id", "(int default 0) communication ring id.")
+        .SetDefault(0);
+#if defined(PADDLE_WITH_ASCEND_CL)
+    AddAttr<std::string>("tag", "(string default tag) tag for all gather.")
+        .SetDefault("tag");
+#endif
+    AddAttr<bool>(
+        "use_calc_stream",
+        "(bool default false) eject CUDA operations to calculation stream.")
+        .SetDefault(false);
+    AddAttr<int>("nranks",
+                 "Total trainer count of the distributed training job");
+    AddAttr<int>("rank", "Rand of the distributed training job");
+    AddComment(R"DOC(
+PartialAllGather Operator.
+Divide the Input into nranks copies and only use the rank part.
+Each rank receives the aggregation of data from all ranks in the order of the ranks.
+reference: https://docs.nvidia.com/deeplearning/sdk/nccl-developer-guide/docs/usage/operations.html#allgather
+)DOC");
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+REGISTER_OP_WITHOUT_GRADIENT(partial_allgather, ops::PartialAllGatherOp,
+                             ops::PartialAllGatherOpMaker);
+REGISTER_OP_CPU_KERNEL(partial_allgather,
+                       ops::PartialAllGatherOpCPUKernel<float>,
+                       ops::PartialAllGatherOpCPUKernel<double>,
+                       ops::PartialAllGatherOpCPUKernel<int>,
+                       ops::PartialAllGatherOpCPUKernel<int64_t>,
+                       ops::PartialAllGatherOpCPUKernel<plat::float16>);
--- a/paddle/fluid/operators/collective/partial_allgather_op.cu.cc
+++ b/paddle/fluid/operators/collective/partial_allgather_op.cu.cc
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/fluid/operators/collective/partial_allgather_op.h"
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
+#include "paddle/fluid/platform/collective_helper.h"
+#include "paddle/fluid/platform/nccl_helper.h"
+#endif
+namespace paddle {
+namespace operators {
+template <typename T>
+class PartialAllGatherOpCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
+    auto in = ctx.Input<framework::Tensor>("X");
+    auto out = ctx.Output<framework::Tensor>("Out");
+    int64_t numel = in->numel();
+    ncclDataType_t dtype = platform::ToNCCLDataType(in->type());
+    int nranks = ctx.Attr<int>("nranks");
+    int rank = ctx.Attr<int>("rank");
+    int rid = ctx.Attr<int>("ring_id");
+    auto place = ctx.GetPlace();
+    auto comm = platform::NCCLCommContext::Instance().Get(rid, place);
+    PADDLE_ENFORCE_EQ(
+        nranks, comm->nranks(),
+        platform::errors::InvalidArgument("nranks: %s should equal to %s",
+                                          nranks, comm->nranks()));
+    PADDLE_ENFORCE_EQ(rank, comm->rank(),
+                      platform::errors::InvalidArgument(
+                          "rank: %s should equal to %s", rank, comm->rank()));
+    PADDLE_ENFORCE_EQ(
+        (numel % nranks), 0,
+        platform::errors::InvalidArgument(
+            "The input numel (%d) must be divisible by nranks(%d)", numel,
+            nranks));
+    framework::DDim dims = in->dims();
+    out->mutable_data<T>(dims, place);
+    int64_t send_numel = numel / nranks;
+    int offset = send_numel * rank;
+    const T* send_buff = in->data<T>() + offset;
+    T* recv_buff = out->data<T>();
+    gpuStream_t stream = nullptr;
+    if (ctx.Attr<bool>("use_calc_stream")) {
+      auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
+      stream = static_cast<platform::CUDADeviceContext*>(dev_ctx)->stream();
+    } else {
+      stream = comm->stream();
+    }
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllGather(
+        send_buff, recv_buff, send_numel, static_cast<ncclDataType_t>(dtype),
+        comm->comm(), stream));
+#else
+    PADDLE_THROW(platform::errors::PreconditionNotMet(
+        "PaddlePaddle should compile with GPU."));
+#endif
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+REGISTER_OP_CUDA_KERNEL(partial_allgather,
+                        ops::PartialAllGatherOpCUDAKernel<float>,
+                        ops::PartialAllGatherOpCUDAKernel<double>,
+                        ops::PartialAllGatherOpCUDAKernel<int>,
+                        ops::PartialAllGatherOpCUDAKernel<int64_t>,
+                        ops::PartialAllGatherOpCUDAKernel<plat::float16>);
--- a/paddle/fluid/operators/collective/partial_allgather_op.h
+++ b/paddle/fluid/operators/collective/partial_allgather_op.h
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include <algorithm>
+#include <utility>
+#include <vector>
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/ddim.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_registry.h"
+namespace paddle {
+namespace operators {
+template <typename T>
+class PartialAllGatherOpCPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    PADDLE_THROW(platform::errors::Unavailable(
+        "Do not support partial_allgather for cpu kernel now."));
+  }
+};
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/fluid/operators/collective/partial_recv_op.cc
+++ b/paddle/fluid/operators/collective/partial_recv_op.cc
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/fluid/operators/collective/partial_recv_op.h"
+#include <string>
+namespace paddle {
+namespace operators {
+class PartialRecvOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "PartialRecv");
+    int peer = ctx->Attrs().Get<int>("peer");
+    int ring_id = ctx->Attrs().Get<int>("ring_id");
+    int num = ctx->Attrs().Get<int>("num");
+    int id = ctx->Attrs().Get<int>("id");
+    auto out_shape = ctx->Attrs().Get<std::vector<int>>("out_shape");
+    PADDLE_ENFORCE_GE(
+        peer, 0,
+        platform::errors::InvalidArgument(
+            "The peer (%d) for partial_recv op must be non-negative.", peer));
+    PADDLE_ENFORCE_GE(
+        ring_id, 0,
+        platform::errors::InvalidArgument(
+            "The ring_id (%d) for partial_recv op must be non-negative.",
+            ring_id));
+    PADDLE_ENFORCE_GE(num, 1,
+                      platform::errors::InvalidArgument(
+                          "The num (%d) for partial_send op must >=1", num));
+    PADDLE_ENFORCE_EQ(
+        (id >= 0 && id < num), true,
+        platform::errors::InvalidArgument(
+            "The id (%d) for partial_send op must >=0 and <num (%d)", id, num));
+    PADDLE_ENFORCE_GE(out_shape.size(), 1,
+                      platform::errors::InvalidArgument(
+                          "The size of the output shape must be greater than 0 "
+                          "but the value given is %d.",
+                          out_shape.size()));
+    for (size_t i = 0; i < out_shape.size(); ++i) {
+      PADDLE_ENFORCE_GE(out_shape[i], 1,
+                        platform::errors::InvalidArgument(
+                            "The shape attribute for partial_recv must be set "
+                            "explicitly, but the %dth element is %d which "
+                            "is less than 1.",
+                            i, out_shape[i]));
+    }
+    auto out_dims = framework::make_ddim(out_shape);
+    int numel = framework::product(out_dims);
+    PADDLE_ENFORCE_EQ(
+        (numel % num), 0,
+        platform::errors::InvalidArgument(
+            "The output numel (%d) must be divisible by num(%d)", numel, num));
+    ctx->SetOutputDim("Out", framework::make_ddim(out_shape));
+  }
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    int dtype = ctx.Attr<int>("dtype");
+    framework::proto::VarType::Type type =
+        framework::proto::VarType::Type(dtype);
+    return framework::OpKernelType(type, ctx.GetPlace());
+  }
+};
+class PartialRecvOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() {
+    AddOutput("Out", "(Tensor) tensor to receive.");
+    AddAttr<int>("ring_id", "(int default 0) nccl communication ring id.")
+        .SetDefault(0);
+    AddAttr<int>("peer", "(int default 0) rank id for sender.").SetDefault(0);
+    AddAttr<int>("dtype", "(int default 5('float32')) data type of tensor.")
+        .SetDefault(5);
+#if defined(PADDLE_WITH_ASCEND_CL)
+    AddAttr<std::string>("tag", "(string default tag) tag for broadcasting.")
+        .SetDefault("tag");
+    AddAttr<int>("srTag", "(string default tag) tag for broadcasting.")
+        .SetDefault(0);
+#endif
+    AddAttr<std::vector<int>>("out_shape", "shape of the output tensor.")
+        .SetDefault(std::vector<int>());
+    AddAttr<bool>(
+        "use_calc_stream",
+        "(bool default false) eject CUDA operations to calculation stream.")
+        .SetDefault(false);
+    AddAttr<int>("num", "(int default 1) The number of Output to be cut.")
+        .SetDefault(1);
+    AddAttr<int>("id",
+                 "(int default 0) ID of the part to be recv after Output cut.")
+        .SetDefault(0);
+    AddComment(R"DOC(
+Recv Operator.
+Divide the Output into num copies and only recv the id part.
+Reference: https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/usage/p2p.html#sendrecv
+)DOC");
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+REGISTER_OP_WITHOUT_GRADIENT(partial_recv, ops::PartialRecvOp,
+                             ops::PartialRecvOpMaker);
+REGISTER_OP_CPU_KERNEL(partial_recv, ops::PartialRecvOpCPUKernel<float>,
+                       ops::PartialRecvOpCPUKernel<double>,
+                       ops::PartialRecvOpCPUKernel<int>,
+                       ops::PartialRecvOpCPUKernel<int64_t>,
+                       ops::PartialRecvOpCPUKernel<plat::float16>);
--- a/paddle/fluid/operators/collective/partial_recv_op.cu.cc
+++ b/paddle/fluid/operators/collective/partial_recv_op.cu.cc
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/fluid/operators/collective/partial_recv_op.h"
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
+#include "paddle/fluid/platform/collective_helper.h"
+#include "paddle/fluid/platform/nccl_helper.h"
+#endif
+namespace paddle {
+namespace operators {
+template <typename T>
+class PartialRecvOpCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+#if (defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_NCCL)) && \
+    NCCL_VERSION_CODE >= 2703
+    auto out = ctx.Output<framework::LoDTensor>("Out");
+    auto out_dims = out->dims();
+    auto numel = out->numel();
+    int rid = ctx.Attr<int>("ring_id");
+    int peer = ctx.Attr<int>("peer");
+    int data_type = ctx.Attr<int>("dtype");
+    int num = ctx.Attr<int>("num");
+    int id = ctx.Attr<int>("id");
+    framework::proto::VarType::Type type =
+        framework::proto::VarType::Type(data_type);
+    PADDLE_ENFORCE_GE(
+        rid, 0,
+        platform::errors::InvalidArgument(
+            "The ring_id (%d) for partial_recv op must be non-negative.", rid));
+    PADDLE_ENFORCE_GE(
+        peer, 0,
+        platform::errors::InvalidArgument(
+            "The peer (%d) for partial_recv op must be non-negative.", peer));
+    PADDLE_ENFORCE_GE(num, 1,
+                      platform::errors::InvalidArgument(
+                          "The num (%d) for partial_recv op must >=1", num));
+    PADDLE_ENFORCE_EQ(
+        (id >= 0 && id < num), true,
+        platform::errors::InvalidArgument(
+            "The id (%d) for partial_recv op must >=0 and <num (%d)", id, num));
+    PADDLE_ENFORCE_EQ(
+        (numel % num), 0,
+        platform::errors::InvalidArgument(
+            "The input numel (%d) must be divisible by num(%d)", numel, num));
+    gpuStream_t stream = nullptr;
+    auto place = ctx.GetPlace();
+    auto comm = platform::NCCLCommContext::Instance().Get(rid, place);
+    if (ctx.Attr<bool>("use_calc_stream")) {
+      auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
+      stream = static_cast<platform::CUDADeviceContext *>(dev_ctx)->stream();
+    } else {
+      stream = comm->stream();
+    }
+    PADDLE_ENFORCE_LT(
+        peer, comm->nranks(),
+        platform::errors::InvalidArgument("The value of peer (%d) you set must "
+                                          "be less than comm->nranks (%d).",
+                                          peer, comm->nranks()));
+    out->mutable_data<T>(out_dims, place);
+    ncclDataType_t dtype = platform::ToNCCLDataType(type);
+    int recv_numel = numel / num;
+    int offset = recv_numel * id;
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        platform::dynload::ncclRecv(out->data<T>() + offset, recv_numel, dtype,
+                                    peer, comm->comm(), stream));
+    VLOG(3) << "rank " << comm->rank() << " recv " << recv_numel
+            << " from offset[" << offset << "] from " << peer;
+#else
+    PADDLE_THROW(platform::errors::Unavailable(
+        "PaddlePaddle should be compiled with NCCL and "
+        "NCCL version >= 2.7.3 is needed."));
+#endif
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+REGISTER_OP_CUDA_KERNEL(partial_recv, ops::PartialRecvOpCUDAKernel<float>,
+                        ops::PartialRecvOpCUDAKernel<double>,
+                        ops::PartialRecvOpCUDAKernel<int>,
+                        ops::PartialRecvOpCUDAKernel<int64_t>,
+                        ops::PartialRecvOpCUDAKernel<plat::float16>);
--- a/paddle/fluid/operators/collective/partial_recv_op.h
+++ b/paddle/fluid/operators/collective/partial_recv_op.h
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include <algorithm>
+#include <utility>
+#include <vector>
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_registry.h"
+namespace paddle {
+namespace operators {
+template <typename T>
+class PartialRecvOpCPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    PADDLE_THROW(platform::errors::Unavailable(
+        "Do not support partial_recv for cpu kernel now."));
+  }
+};
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/fluid/operators/collective/partial_send_op.cc
+++ b/paddle/fluid/operators/collective/partial_send_op.cc
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/fluid/operators/collective/partial_send_op.h"
+namespace paddle {
+namespace operators {
+class PartialSendOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "PartialSend");
+    int peer = ctx->Attrs().Get<int>("peer");
+    int ring_id = ctx->Attrs().Get<int>("ring_id");
+    int num = ctx->Attrs().Get<int>("num");
+    int id = ctx->Attrs().Get<int>("id");
+    PADDLE_ENFORCE_GE(
+        peer, 0,
+        platform::errors::InvalidArgument(
+            "The peer (%d) for partial_send op must be non-negative.", peer));
+    PADDLE_ENFORCE_GE(
+        ring_id, 0,
+        platform::errors::InvalidArgument(
+            "The ring_id (%d) for partial_send op must be non-negative.",
+            ring_id));
+    PADDLE_ENFORCE_GE(num, 1,
+                      platform::errors::InvalidArgument(
+                          "The num (%d) for partial_send op must >=1", num));
+    PADDLE_ENFORCE_EQ(
+        (id >= 0 && id < num), true,
+        platform::errors::InvalidArgument(
+            "The id (%d) for partial_send op must >=0 and <num (%d)", id, num));
+  }
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace());
+  }
+};
+class PartialSendMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() {
+    AddInput("X", "(Tensor) tensor to be sent.");
+    AddAttr<int>("ring_id", "(int default 0) nccl communication ring id.")
+        .SetDefault(0);
+    AddAttr<int>("peer", "(int default 0) rank id for receiver.").SetDefault(0);
+#if defined(PADDLE_WITH_ASCEND_CL)
+    AddAttr<std::string>("tag", "(string default tag) tag for broadcasting.")
+        .SetDefault("tag");
+    AddAttr<int>("srTag", "(string default tag) tag for broadcasting.")
+        .SetDefault(0);
+#endif
+    AddAttr<bool>(
+        "use_calc_stream",
+        "(bool default false) eject CUDA operations to calculation stream.")
+        .SetDefault(false);
+    AddAttr<int>("num", "(int default 1) The number of Input to be cut.")
+        .SetDefault(1);
+    AddAttr<int>("id",
+                 "(int default 0) ID of the part to be sent after Input cut.")
+        .SetDefault(0);
+    AddComment(R"DOC(
+PartialSend Operator.
+Divide the Input into num copies and only send the id part.
+Reference: https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/usage/p2p.html#sendrecv
+)DOC");
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+REGISTER_OP_WITHOUT_GRADIENT(partial_send, ops::PartialSendOp,
+                             ops::PartialSendMaker);
+REGISTER_OP_CPU_KERNEL(partial_send, ops::PartialSendOpCPUKernel<float>,
+                       ops::PartialSendOpCPUKernel<double>,
+                       ops::PartialSendOpCPUKernel<int>,
+                       ops::PartialSendOpCPUKernel<int64_t>,
+                       ops::PartialSendOpCPUKernel<plat::float16>);
--- a/paddle/fluid/operators/collective/partial_send_op.cu.cc
+++ b/paddle/fluid/operators/collective/partial_send_op.cu.cc
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/fluid/operators/collective/partial_send_op.h"
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
+#include "paddle/fluid/platform/collective_helper.h"
+#include "paddle/fluid/platform/nccl_helper.h"
+#endif
+namespace paddle {
+namespace operators {
+template <typename T>
+class PartialSendCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+#if (defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_NCCL)) && \
+    NCCL_VERSION_CODE >= 2703
+    auto x = ctx.Input<framework::LoDTensor>("X");
+    int numel = x->numel();
+    int rid = ctx.Attr<int>("ring_id");
+    int peer = ctx.Attr<int>("peer");
+    int num = ctx.Attr<int>("num");
+    int id = ctx.Attr<int>("id");
+    PADDLE_ENFORCE_GE(
+        rid, 0,
+        platform::errors::InvalidArgument(
+            "The ring_id (%d) for partial_send op must be non-negative.", rid));
+    PADDLE_ENFORCE_GE(
+        peer, 0,
+        platform::errors::InvalidArgument(
+            "The peer (%d) for partial_send op must be non-negative.", peer));
+    PADDLE_ENFORCE_GE(num, 1,
+                      platform::errors::InvalidArgument(
+                          "The num (%d) for partial_send op must >=1", num));
+    PADDLE_ENFORCE_EQ(
+        (id >= 0 && id < num), true,
+        platform::errors::InvalidArgument(
+            "The id (%d) for partial_send op must >=0 and <num (%d)", id, num));
+    PADDLE_ENFORCE_EQ(
+        (numel % num), 0,
+        platform::errors::InvalidArgument(
+            "The input numel (%d) must be divisible by num(%d)", numel, num));
+    gpuStream_t stream = nullptr;
+    auto place = ctx.GetPlace();
+    auto comm = platform::NCCLCommContext::Instance().Get(rid, place);
+    if (ctx.Attr<bool>("use_calc_stream")) {
+      auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
+      stream = static_cast<platform::CUDADeviceContext*>(dev_ctx)->stream();
+    } else {
+      stream = comm->stream();
+    }
+    PADDLE_ENFORCE_LT(
+        peer, comm->nranks(),
+        platform::errors::InvalidArgument("The value of peer (%d) you set must "
+                                          "be less than comm->nranks (%d).",
+                                          peer, comm->nranks()));
+    ncclDataType_t dtype = platform::ToNCCLDataType(x->type());
+    int send_numel = numel / num;
+    int offset = send_numel * id;
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclSend(
+        x->data<T>() + offset, send_numel, dtype, peer, comm->comm(), stream));
+    VLOG(3) << "rank " << comm->rank() << " send " << send_numel
+            << " from offset[" << offset << "] to " << peer;
+#else
+    PADDLE_THROW(platform::errors::Unavailable(
+        "PaddlePaddle should be compiled with NCCL "
+        "and NCCL version >= 2.7.3 is needed."));
+#endif
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+REGISTER_OP_CUDA_KERNEL(partial_send, ops::PartialSendCUDAKernel<float>,
+                        ops::PartialSendCUDAKernel<double>,
+                        ops::PartialSendCUDAKernel<int>,
+                        ops::PartialSendCUDAKernel<int64_t>,
+                        ops::PartialSendCUDAKernel<plat::float16>);
--- a/paddle/fluid/operators/collective/partial_send_op.h
+++ b/paddle/fluid/operators/collective/partial_send_op.h
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include <algorithm>
+#include <utility>
+#include <vector>
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_registry.h"
+namespace paddle {
+namespace operators {
+template <typename T>
+class PartialSendOpCPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    PADDLE_THROW(platform::errors::Unavailable(
+        "Do not support partial_send for cpu kernel now."));
+  }
+};
+}  // namespace operators
+}  // namespace paddle
--- a/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py
@@ -183,6 +183,8 @@ class PipelineOptimizer(MetaOptimizerBase):
        program._pipeline_opt['micro_batch_size'] = self.micro_batch_size
        program._pipeline_opt['schedule_mode'] = self.schedule_mode
        program._pipeline_opt['use_sharding'] = False
+        program._pipeline_opt['mp_degree'] = 1
+        program._pipeline_opt['mp_rank'] = 0
        optimize_ops, params_grads, prog_list, pp_pair, ring_map = self.wrapped_opt.minimize(
            loss, startup_program, parameter_list, no_grad_set)
        self.startup_program = orig_startup_program._pipeline_opt[

--- a/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py
@@ -195,23 +195,28 @@ class ShardingOptimizer(MetaOptimizerBase):
        if self.pp_degree > 1:
            pp_optimizer = fluid.optimizer.PipelineOptimizer(
                self.inner_opt, self._gradient_merge_acc_step)
-            main_program = loss.block.program
-            main_program._pipeline_opt = dict()
+            strategy = self.user_defined_strategy
-            self.schedule_mode = self.user_defined_strategy.pipeline_configs[
+            self.schedule_mode = strategy.pipeline_configs['schedule_mode']
-                'schedule_mode']
-            main_program._pipeline_opt['schedule_mode'] = self.schedule_mode
-            main_program._pipeline_opt[
-                'micro_batch_size'] = self.user_defined_strategy.pipeline_configs[
-                    'micro_batch_size']
            self.pp_rank_ = self.role_maker._worker_index() // (
                self.sharding_degree * self.mp_degree) % self.pp_degree
-            main_program._pipeline_opt['local_rank'] = self.pp_rank_
-            main_program._pipeline_opt[
+            pipeline_opt = dict()
-                'global_rank'] = self.role_maker._worker_index()
+            pipeline_opt['schedule_mode'] = self.schedule_mode
-            main_program._pipeline_opt['use_sharding'] = True
+            pipeline_opt['micro_batch_size'] = strategy.pipeline_configs[
+                'micro_batch_size']
+            pipeline_opt['local_rank'] = self.pp_rank_
+            pipeline_opt['global_rank'] = self.role_maker._worker_index()
+            pipeline_opt['use_sharding'] = True
            # TODO (JZ-LIANG) should revise here for support mix parallelism with pipeline
-            main_program._pipeline_opt['ring_id'] = 20
+            pipeline_opt['ring_id'] = 20
-            main_program._pipeline_opt['global_ring_id'] = 3
+            pipeline_opt['global_ring_id'] = 3
+            pipeline_opt['mp_degree'] = self.mp_degree
+            pipeline_opt['mp_rank'] = self.role_maker._worker_index(
+            ) % self.mp_degree
+            main_program = loss.block.program
+            main_program._pipeline_opt = pipeline_opt
            optimize_ops, params_grads, program_list, self.pipeline_pair, self.pp_ring_map = pp_optimizer.minimize(
                loss, startup_program, parameter_list, no_grad_set)

--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -4709,9 +4709,11 @@ class PipelineOptimizer(object):
                    if interval == -1:
                        decrease_flag = True
                    if interval == 1:
-                        assert decrease_flag is False, \
+                        # FIXME(wangxi): recompute failed
-                            "Pipeline stage must be in order, " \
+                        pass
-                            "please check the stage of op={}".format(op)
+                        #assert decrease_flag is False, \
+                        #    "Pipeline stage must be in order, " \
+                        #    "please check the stage of op={}".format(op)
                pre_stage_id = stage_id
        return device_list
@@ -4844,7 +4846,8 @@ class PipelineOptimizer(object):
                        extra_index_info['index'] += 1
                        block._insert_op_without_sync(
                            index=index + extra_index_info['index'],
-                            type='send_v2',
+                            type='send_v2'
+                            if self.mp_degree == 1 else 'partial_send',
                            inputs={'X': var},
                            attrs={
                                self._op_device_key: prev_dev,
@@ -4852,6 +4855,9 @@ class PipelineOptimizer(object):
                                'use_calc_stream': False,
                                'ring_id': ring_id,
                                'peer': 1,
+                                # if send_v2, num&id attr is not in op_attrs, will not insert
+                                'num': self.mp_degree,
+                                'id': self.mp_rank,
                            })
                        extra_index_info['index'] += 1
                        insert_index = None
@@ -4882,9 +4888,14 @@ class PipelineOptimizer(object):
                        var_shape = list(var.shape)
                        var_shape[0] = self.micro_batch_size if var_shape[
                            0] < 0 else var_shape[0]
+                        numel = np.prod(var.shape)
+                        assert numel % self.mp_degree == 0, \
+                            "The numel={} must be divisible by mp_degree={}".format(numel, self.mp_degree)
                        block._insert_op_without_sync(
                            index=index + extra_index_info['index'],
-                            type='recv_v2',
+                            type='recv_v2'
+                            if self.mp_degree == 1 else 'partial_recv',
                            outputs={'Out': [var]},
                            attrs={
                                'out_shape': var_shape,
@@ -4893,9 +4904,28 @@ class PipelineOptimizer(object):
                                self._op_role_key: op_role,
                                'use_calc_stream': True,
                                'peer': 0,
-                                'ring_id': ring_id
+                                'ring_id': ring_id,
+                                # if recv_v2, num&id attr is not in op_attrs, will not insert
+                                'num': self.mp_degree,
+                                'id': self.mp_rank,
                            })
                        extra_index_info['index'] += 1
+                        if self.mp_degree > 1:
+                            block._insert_op_without_sync(
+                                index=index + extra_index_info['index'],
+                                type='partial_allgather',
+                                inputs={'X': [var]},
+                                outputs={'Out': [var]},
+                                attrs={
+                                    self._op_device_key: cur_dev,
+                                    self._op_role_key: op_role,
+                                    'use_calc_stream': True,
+                                    'ring_id': 0,
+                                    # if recv_v2, num&id attr is not in op_attrs, will not insert
+                                    'nranks': self.mp_degree,
+                                    'rank': self.mp_rank,
+                                })
+                            extra_index_info['index'] += 1
                    else:
                        raise ValueError(
                            "Now only 'F-then-B' and '1F1B' are supported."
@@ -5207,9 +5237,10 @@ class PipelineOptimizer(object):
        block = program.block(0)
+        recv_type = 'recv_v2' if self.mp_degree == 1 else 'partial_recv'
        backward_recv_index = None
        for index, op in enumerate(block.ops):
-            if op.type == 'recv_v2' and self._is_backward_op(op):
+            if op.type == recv_type and self._is_backward_op(op):
                backward_recv_index = index
                break
@@ -5248,7 +5279,8 @@ class PipelineOptimizer(object):
        if startup_program is None:
            startup_program = default_startup_program()
-        assert main_program._pipeline_opt, 'Please use pipeline with fleet.'
+        pipeline_opt = main_program._pipeline_opt
+        assert pipeline_opt, 'Please use pipeline with fleet.'
        required_keys = [
            'local_rank',
            'schedule_mode',
@@ -5256,17 +5288,22 @@ class PipelineOptimizer(object):
            'ring_id',
            'global_ring_id',
            'use_sharding',
+            'mp_degree',
+            'mp_rank',
        ]
        for key in required_keys:
-            assert key in main_program._pipeline_opt, \
+            assert key in pipeline_opt, \
                'Please use pipeline with fleet to use {}.'.format(key)
-        self.local_rank = main_block.program._pipeline_opt['local_rank']
+        self.local_rank = pipeline_opt['local_rank']
-        self.schedule_mode = main_block.program._pipeline_opt['schedule_mode']
+        self.schedule_mode = pipeline_opt['schedule_mode']
-        self.micro_batch_size = main_block.program._pipeline_opt[
+        self.micro_batch_size = pipeline_opt['micro_batch_size']
-            'micro_batch_size']
+        self.use_sharding = pipeline_opt['use_sharding']
-        self.use_sharding = main_block.program._pipeline_opt['use_sharding']
+        self.ring_id = pipeline_opt['ring_id']
-        self.ring_id = main_block.program._pipeline_opt['ring_id']
+        self.global_ring_id = pipeline_opt['global_ring_id']
-        self.global_ring_id = main_block.program._pipeline_opt['global_ring_id']
+        self.mp_degree = pipeline_opt['mp_degree']
+        self.mp_rank = pipeline_opt['mp_rank']
+        assert self.mp_degree >= 1
+        assert 0 <= self.mp_rank < self.mp_degree
        optimize_ops, params_grads = self._optimizer.minimize(
            loss, startup_program, parameter_list, no_grad_set)