// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/distributed/collective/check.h"

#include "paddle/fluid/distributed/collective/NCCLTools.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/errors.h"

#ifdef PADDLE_WITH_HIP
#define gpuMalloc hipMalloc
#define gpuMemcpy hipMemcpy
#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
#define gpuMemcpyHostToDevice hipMemcpyHostToDevice
#define gpuFree hipFree
#else
#define gpuMalloc cudaMalloc
#define gpuMemcpy cudaMemcpy
#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
#define gpuFree cudaFree
#endif

namespace paddle {
namespace distributed {

// static checks
void CommStaticCheck::CheckRank(int rank, int world_size) {
  PADDLE_ENFORCE_GE(rank,
                    0,
                    phi::errors::InvalidArgument(
                        "Rank should be greater than or equal to 0."));
  PADDLE_ENFORCE_LT(
      rank,
      world_size,
      phi::errors::InvalidArgument("Rank is out of the process group."));
}

void CommStaticCheck::CheckPlace(const phi::DenseTensor& tensor) {
  PADDLE_ENFORCE_EQ(
      platform::is_gpu_place(tensor.place()),
      true,
      platform::errors::InvalidArgument("Tensor should be in GPU place."));
}

void CommStaticCheck::CheckPlace(const phi::DenseTensor& out_tensor,
                                 const phi::DenseTensor& in_tensor) {
  CheckPlace(out_tensor);
  CheckPlace(in_tensor);
  PADDLE_ENFORCE_EQ(
      out_tensor.place(),
      in_tensor.place(),
      phi::errors::InvalidArgument(
          "Input and output tensors should be on the same place."));
}

void CommStaticCheck::CheckDataType(const phi::DenseTensor& out_tensor,
                                    const phi::DenseTensor& in_tensor) {
  PADDLE_ENFORCE_EQ(
      out_tensor.dtype(),
      in_tensor.dtype(),
      phi::errors::InvalidArgument(
          "Input and output tensors should have the same data type."));
}

void CommStaticCheck::CheckShape(const phi::DenseTensor& tensor) {
  PADDLE_ENFORCE_GT(tensor.numel(),
                    0,
                    phi::errors::InvalidArgument(
                        "Size of tensor should be greater than 0."));
}

void CommStaticCheck::CheckShape(const phi::DenseTensor& out_tensor,
                                 const phi::DenseTensor& in_tensor,
                                 int out_size_factor,
                                 int in_size_factor) {
  CheckShape(out_tensor);
  CheckShape(in_tensor);
  int64_t out_size = out_tensor.numel(), in_size = in_tensor.numel();
  PADDLE_ENFORCE_EQ(
      out_size * out_size_factor,
      in_size * in_size_factor,
      phi::errors::InvalidArgument(
          "Input and output tensors should have matching sizes."));
}

void CommStaticCheck::CheckShape(const phi::DenseTensor& out_tensor,
                                 const phi::DenseTensor& in_tensor,
                                 int dst_rank,
                                 int cur_rank,
                                 int world_size,
                                 int out_size_factor,
                                 int in_size_factor) {
  CheckRank(dst_rank, world_size);
  CheckRank(cur_rank, world_size);

  CheckPlace(out_tensor, in_tensor);
  CheckDataType(out_tensor, in_tensor);

  if (dst_rank == cur_rank) {
    CheckShape(out_tensor, in_tensor, out_size_factor, in_size_factor);
  } else {
    CheckShape(out_tensor);
    CheckShape(in_tensor);
  }
}
void CommStaticCheck::CheckShape(const phi::DenseTensor& tensor,
                                 int rank,
                                 int world_size) {
  CheckPlace(tensor);
  CheckRank(rank, world_size);
}

void CommStaticCheck::SameShape(const phi::DenseTensor& out_tensor,
                                const phi::DenseTensor& in_tensor,
                                int dst_rank,
                                int cur_rank,
                                int world_size) {
  CheckShape(out_tensor,
             in_tensor,
             dst_rank,
             cur_rank,
             world_size,
             /*out_size_factor*/ 1,
             /*in_size_factor*/ 1);
}

void CommStaticCheck::ScatterLikeShape(const phi::DenseTensor& out_tensor,
                                       const phi::DenseTensor& in_tensor,
                                       int dst_rank,
                                       int cur_rank,
                                       int world_size) {
  CheckShape(out_tensor,
             in_tensor,
             dst_rank,
             cur_rank,
             world_size,
             /*out_size_factor*/ world_size,
             /*in_size_factor*/ 1);
}

void CommStaticCheck::GatherLikeShape(const phi::DenseTensor& out_tensor,
                                      const phi::DenseTensor& in_tensor,
                                      int dst_rank,
                                      int cur_rank,
                                      int world_size) {
  CheckShape(out_tensor,
             in_tensor,
             dst_rank,
             cur_rank,
             world_size,
             /*out_size_factor*/ 1,
             /*in_size_factor*/ world_size);
}

// dynamic checks
void CommDynamicCheck::CheckDataType(const phi::DenseTensor& tensor,
                                     int64_t dtype) {
  PADDLE_ENFORCE_EQ(
      static_cast<int64_t>(tensor.dtype()),
      dtype,
      phi::errors::InvalidArgument(
          "Tensors in communication are expected to have the same data "
          "type."));
}

void CommDynamicCheck::CheckDataType(const phi::DenseTensor& tensor,
                                     int root_rank,
                                     int cur_rank,
                                     ncclComm_t comm) {
  constexpr int kSize = sizeof(int64_t);
  int64_t dtype_host = static_cast<int64_t>(tensor.dtype());
  int64_t* dtype_device;
  PADDLE_ENFORCE_GPU_SUCCESS(gpuMalloc(&dtype_device, kSize));
  PADDLE_ENFORCE_GPU_SUCCESS(
      gpuMemcpy(dtype_device, &dtype_host, kSize, gpuMemcpyHostToDevice));

  // Broadcast the root rank's dtype, then let every other rank compare it
  // against its own tensor.
  NCCL_CHECK(phi::dynload::ncclBroadcast(dtype_device,
                                         dtype_device,
                                         kSize,
                                         ncclInt64,
                                         root_rank,
                                         comm,
                                         kDefaultStream));

  if (root_rank == cur_rank) {
    VLOG(3) << "Dynamic check broadcast metadata, dtype: " << dtype_host;
  } else {
    PADDLE_ENFORCE_GPU_SUCCESS(
        gpuMemcpy(&dtype_host, dtype_device, kSize, gpuMemcpyDeviceToHost));
    VLOG(3) << "Dynamic check recv metadata, dtype: " << dtype_host;
    CheckDataType(tensor, dtype_host);
  }
  PADDLE_ENFORCE_GPU_SUCCESS(gpuFree(dtype_device));
}

void CommDynamicCheck::CheckShape(const phi::DenseTensor& tensor,
                                  int64_t shape) {
  PADDLE_ENFORCE_EQ(
      tensor.numel(),
      shape,
      phi::errors::InvalidArgument(
          "Tensors in communication are expected to have matching sizes."));
}

void CommDynamicCheck::CheckShape(const phi::DenseTensor& tensor,
                                  int root_rank,
                                  int cur_rank,
                                  ncclComm_t comm) {
  CheckDataType(tensor, root_rank, cur_rank, comm);

  constexpr int kSize = sizeof(int64_t);
  int64_t shape_host = tensor.numel();
  int64_t* shape_device;
  PADDLE_ENFORCE_GPU_SUCCESS(gpuMalloc(&shape_device, kSize));
  PADDLE_ENFORCE_GPU_SUCCESS(
      gpuMemcpy(shape_device, &shape_host, kSize, gpuMemcpyHostToDevice));

  // Broadcast the root rank's element count, then let every other rank
  // compare it against its own tensor.
  NCCL_CHECK(phi::dynload::ncclBroadcast(shape_device,
                                         shape_device,
                                         kSize,
                                         ncclInt64,
                                         root_rank,
                                         comm,
                                         kDefaultStream));

  if (root_rank == cur_rank) {
    VLOG(3) << "Dynamic check broadcast metadata, shape: " << shape_host;
  } else {
    PADDLE_ENFORCE_GPU_SUCCESS(
        gpuMemcpy(&shape_host, shape_device, kSize, gpuMemcpyDeviceToHost));
    VLOG(3) << "Dynamic check recv metadata, shape: " << shape_host;
    CheckShape(tensor, shape_host);
  }
  PADDLE_ENFORCE_GPU_SUCCESS(gpuFree(shape_device));
}

void CommDynamicCheck::CheckShape(const phi::DenseTensor& out_tensor,
                                  const phi::DenseTensor& in_tensor,
                                  const std::vector<int64_t>& in_size_each_rank,
                                  int cur_rank,
                                  int world_size,
                                  ncclComm_t comm) {
  CheckDataType(out_tensor, /*root_rank*/ 0, cur_rank, comm);
  CheckDataType(in_tensor, /*root_rank*/ 0, cur_rank, comm);

  constexpr int kSize = sizeof(int64_t);
  int64_t in_row_size = in_tensor.numel() / in_tensor.dims()[0];

  // For each destination rank, sum (via ncclReduce) the number of elements
  // every rank will send to it, and compare the total against that rank's
  // output size.
  for (int rank = 0; rank < world_size; ++rank) {
    int64_t in_shape_host = in_size_each_rank[rank] * in_row_size;
    int64_t* in_shape_device;
    PADDLE_ENFORCE_GPU_SUCCESS(gpuMalloc(&in_shape_device, kSize));
    PADDLE_ENFORCE_GPU_SUCCESS(gpuMemcpy(
        in_shape_device, &in_shape_host, kSize, gpuMemcpyHostToDevice));

    NCCL_CHECK(phi::dynload::ncclReduce(in_shape_device,
                                        in_shape_device,
                                        kSize,
                                        ncclInt64,
                                        ncclSum,
                                        rank,
                                        comm,
                                        kDefaultStream));
    if (rank == cur_rank) {
      PADDLE_ENFORCE_GPU_SUCCESS(gpuMemcpy(
          &in_shape_host, in_shape_device, kSize, gpuMemcpyDeviceToHost));
      VLOG(3) << "Dynamic check recv metadata, shape: " << in_shape_host;
      CheckShape(out_tensor, in_shape_host);
    }
    PADDLE_ENFORCE_GPU_SUCCESS(gpuFree(in_shape_device));
  }
}

}  // namespace distributed
}  // namespace paddle
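
// Illustrative usage (hypothetical caller; the variable names below are
// assumed for the sketch, not taken from this file): a collective op would
// typically run the static checks on the host before enqueueing the NCCL
// kernel, and run the dynamic checks over the communicator when cross-rank
// verification is wanted, e.g.:
//
//   // all_reduce-like op: output and input sizes must match one-to-one
//   CommStaticCheck::SameShape(out_tensor, in_tensor,
//                              /*dst_rank*/ cur_rank, cur_rank, world_size);
//   // verify shape/dtype against rank 0's metadata across the communicator
//   CommDynamicCheck::CheckShape(out_tensor, /*root_rank*/ 0, cur_rank, comm);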