Unverified  Commit f6ff2221  Authored by: Yuang Liu  Committed by: GitHub

fix fused attention, ffn, fm under new process group (#44259)

Parent c5c6026e
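All three hunks below patch the same file-local AllReduce helper, one copy per fused op (attention, FFN, and the multi-transformer op the title abbreviates as "fm"). The fix routes the reduction through the new ProcessGroup API whenever a group has been registered for the op's ring_id, and only falls back to the raw NCCL communicator otherwise. Condensed into one place, the pattern is the sketch below; it is assembled from the hunks that follow, with the NCCL/RCCL compile guards and the non-NCCL PADDLE_THROW branch elided.

// Sketch of the dispatch pattern added in this commit (condensed from the
// hunks below; compile guards and error branch elided).
template <typename T>
static void AllReduce(framework::Tensor &tensor,  // NOLINT
                      const int ring_id,
                      const platform::CUDADeviceContext &ctx) {
  if (ring_id == -1) return;  // no tensor parallelism, nothing to reduce
  auto map = paddle::distributed::ProcessGroupMapFromGid::getInstance();
  if (map->has(ring_id)) {
    // New path: a ProcessGroup is registered under this ring id, so issue
    // the collective through it and block until the task completes.
    paddle::distributed::ProcessGroup *pg = map->get(ring_id);
    std::vector<phi::DenseTensor> in_tensor{tensor};
    std::vector<phi::DenseTensor> out_tensor{tensor};
    paddle::distributed::AllreduceOptions opts;
    opts.reduce_op = distributed::ReduceOp::SUM;
    pg->AllReduce(in_tensor, out_tensor, opts)->Wait();
  } else {
    // Legacy path: in-place ncclAllReduce on the raw communicator that
    // NCCLCommContext keeps for this ring id, on the op's compute stream.
    auto dtype = platform::ToNCCLDataType(
        framework::TransToProtoVarType(tensor.dtype()));
    auto place = ctx.GetPlace();
    const void *sendbuff = tensor.data<T>();
    void *recvbuff = tensor.mutable_data<T>(place);
    auto comm = platform::NCCLCommContext::Instance().Get(ring_id, place);
    PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce(
        sendbuff, recvbuff, tensor.numel(), dtype, ncclSum, comm->comm(),
        ctx.stream()));
  }
}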
@@ -24,11 +24,13 @@ limitations under the License. */
 #include "paddle/fluid/operators/fused/fused_dropout_helper.h"
 #include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
 #include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
+#include "paddle/phi/api/include/tensor.h"
 #include "paddle/phi/kernels/funcs/broadcast_function.h"
 #include "paddle/phi/kernels/funcs/elementwise_functor.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
+#include "paddle/fluid/distributed/collective/ProcessGroup.h"
 #include "paddle/fluid/platform/collective_helper.h"
 #include "paddle/fluid/platform/device/gpu/nccl_helper.h"
 #endif
@@ -44,8 +46,21 @@ static void AllReduce(framework::Tensor &tensor,  // NOLINT
                       const platform::CUDADeviceContext &ctx) {
   if (ring_id == -1) return;
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
-  auto dtype =
-      platform::ToNCCLDataType(framework::TransToProtoVarType(tensor.dtype()));
+  auto map = paddle::distributed::ProcessGroupMapFromGid::getInstance();
+
+  if (map->has(ring_id)) {
+    paddle::distributed::ProcessGroup *pg = map->get(ring_id);
+    std::vector<phi::DenseTensor> in_tensor;
+    std::vector<phi::DenseTensor> out_tensor;
+    in_tensor.push_back(tensor);
+    out_tensor.push_back(tensor);
+    paddle::distributed::AllreduceOptions opts;
+    opts.reduce_op = distributed::ReduceOp::SUM;
+    auto task = pg->AllReduce(in_tensor, out_tensor, opts);
+    task->Wait();
+  } else {
+    auto dtype = platform::ToNCCLDataType(
+        framework::TransToProtoVarType(tensor.dtype()));
   int64_t numel = tensor.numel();
   const void *sendbuff = tensor.data<T>();
   auto place = ctx.GetPlace();
@@ -54,6 +69,7 @@ static void AllReduce(framework::Tensor &tensor,  // NOLINT
   auto stream = ctx.stream();
   PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce(
       sendbuff, recvbuff, numel, dtype, ncclSum, comm->comm(), stream));
+  }
 #else
   PADDLE_THROW(platform::errors::Unimplemented(
       "PaddlePaddle should compile with NCCL or RCCL when used tensor model "
......
@@ -17,11 +17,13 @@ limitations under the License. */
 #include "paddle/fluid/operators/fused/fused_dropout_helper.h"
 #include "paddle/fluid/operators/layer_norm_kernel.cu.h"
 #include "paddle/fluid/operators/matmul_v2_op.h"
+#include "paddle/phi/api/include/tensor.h"
 #include "paddle/phi/kernels/funcs/blas/blas.h"
 #include "paddle/phi/kernels/funcs/broadcast_function.h"
 #include "paddle/phi/kernels/funcs/elementwise_functor.h"
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
+#include "paddle/fluid/distributed/collective/ProcessGroup.h"
 #include "paddle/fluid/platform/collective_helper.h"
 #include "paddle/fluid/platform/device/gpu/nccl_helper.h"
 #endif
@@ -37,8 +39,21 @@ static void AllReduce(framework::Tensor& tensor,  // NOLINT
                       const platform::CUDADeviceContext& ctx) {
   if (ring_id == -1) return;
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
-  auto dtype =
-      platform::ToNCCLDataType(framework::TransToProtoVarType(tensor.dtype()));
+  auto map = paddle::distributed::ProcessGroupMapFromGid::getInstance();
+
+  if (map->has(ring_id)) {
+    paddle::distributed::ProcessGroup* pg = map->get(ring_id);
+    std::vector<phi::DenseTensor> in_tensor;
+    std::vector<phi::DenseTensor> out_tensor;
+    in_tensor.push_back(tensor);
+    out_tensor.push_back(tensor);
+    paddle::distributed::AllreduceOptions opts;
+    opts.reduce_op = distributed::ReduceOp::SUM;
+    auto task = pg->AllReduce(in_tensor, out_tensor, opts);
+    task->Wait();
+  } else {
+    auto dtype = platform::ToNCCLDataType(
+        framework::TransToProtoVarType(tensor.dtype()));
   int64_t numel = tensor.numel();
   const void* sendbuff = tensor.data<T>();
   auto place = ctx.GetPlace();
@@ -47,6 +62,7 @@ static void AllReduce(framework::Tensor& tensor,  // NOLINT
   auto stream = ctx.stream();
   PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce(
       sendbuff, recvbuff, numel, dtype, ncclSum, comm->comm(), stream));
+  }
 #else
   PADDLE_THROW(platform::errors::Unimplemented(
       "PaddlePaddle should compile with NCCL or RCCL when used tensor model "
......
@@ -29,9 +29,11 @@ limitations under the License. */
 #include "paddle/fluid/operators/fused/fused_dropout_helper.h"
 #include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
 #include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
+#include "paddle/phi/api/include/tensor.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
+#include "paddle/fluid/distributed/collective/ProcessGroup.h"
 #include "paddle/fluid/platform/collective_helper.h"
 #include "paddle/fluid/platform/device/gpu/nccl_helper.h"
 #endif
@@ -50,8 +52,21 @@ static void AllReduce(framework::Tensor &tensor,  // NOLINT
                       const platform::CUDADeviceContext &ctx) {
   if (ring_id == -1) return;
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
-  auto dtype =
-      platform::ToNCCLDataType(framework::TransToProtoVarType(tensor.dtype()));
+  auto map = paddle::distributed::ProcessGroupMapFromGid::getInstance();
+
+  if (map->has(ring_id)) {
+    paddle::distributed::ProcessGroup *pg = map->get(ring_id);
+    std::vector<phi::DenseTensor> in_tensor;
+    std::vector<phi::DenseTensor> out_tensor;
+    in_tensor.push_back(tensor);
+    out_tensor.push_back(tensor);
+    paddle::distributed::AllreduceOptions opts;
+    opts.reduce_op = distributed::ReduceOp::SUM;
+    auto task = pg->AllReduce(in_tensor, out_tensor, opts);
+    task->Wait();
+  } else {
+    auto dtype = platform::ToNCCLDataType(
+        framework::TransToProtoVarType(tensor.dtype()));
   int64_t numel = tensor.numel();
   const void *sendbuff = tensor.data<T>();
   auto place = ctx.GetPlace();
@@ -60,6 +75,7 @@ static void AllReduce(framework::Tensor &tensor,  // NOLINT
   auto stream = ctx.stream();
   PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce(
       sendbuff, recvbuff, numel, dtype, ncclSum, comm->comm(), stream));
+  }
 #else
   PADDLE_THROW(platform::errors::Unimplemented(
       "PaddlePaddle should compile with NCCL or RCCL when used tensor model "
......
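One behavioral note on the two branches: the ProcessGroup path pushes the tensor into single-element in/out vectors and waits on the returned task, so it blocks the calling thread until the reduction finishes, whereas the legacy path enqueues ncclAllReduce asynchronously on ctx.stream(). Call sites in the three ops are unchanged; each kernel still invokes the helper with its ring_id attribute, roughly as in this hypothetical example (variable names are illustrative, not from the diff):

// Hypothetical call site inside a fused op kernel: sum the row-parallel
// partial result across the tensor-parallel group selected by ring_id.
AllReduce<T>(*partial_out, ring_id, dev_ctx);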