From 28b356b9a22373285f42499f38b590c8733fcc9b Mon Sep 17 00:00:00 2001
From: Qi Li
Date: Fri, 26 Feb 2021 10:22:21 +0800
Subject: [PATCH] [ROCM] update fluid framework for rocm (part6), test=develop
 (#31015)

---
 paddle/fluid/framework/tensor_test.cc         |  6 +-
 paddle/fluid/framework/tensor_util.cc         | 18 ++---
 paddle/fluid/framework/tensor_util.h          |  6 +-
 paddle/fluid/framework/tensor_util_test.cc    | 10 +--
 paddle/fluid/framework/tensor_util_test.cu    | 71 +++++++++++++++++++
 paddle/fluid/framework/trainer.h              | 33 +++++----
 paddle/fluid/framework/trainer_factory.cc     |  8 ++-
 paddle/fluid/framework/var_type_traits.cc     |  8 +++
 paddle/fluid/framework/var_type_traits.h      | 14 ++--
 .../fluid/framework/var_type_traits_test.cc   |  8 +++
 paddle/fluid/operators/nccl/CMakeLists.txt    | 14 +++-
 paddle/fluid/operators/nccl/nccl_gpu_common.h |  4 ++
 paddle/fluid/pybind/CMakeLists.txt            | 40 +++++------
 .../pybind/global_value_getter_setter.cc      |  4 +-
 paddle/fluid/pybind/imperative.cc             |  9 +--
 paddle/fluid/pybind/ps_gpu_wrapper_py.cc      |  3 +-
 paddle/fluid/pybind/ps_gpu_wrapper_py.h       |  3 +-
 paddle/fluid/pybind/pybind.cc                 | 50 ++++++-----
 paddle/fluid/pybind/tensor_py.h               | 17 +++--
 19 files changed, 229 insertions(+), 97 deletions(-)

diff --git a/paddle/fluid/framework/tensor_test.cc b/paddle/fluid/framework/tensor_test.cc
index 92a29d5165c..54f77981306 100644
--- a/paddle/fluid/framework/tensor_test.cc
+++ b/paddle/fluid/framework/tensor_test.cc
@@ -118,7 +118,7 @@ TEST(Tensor, MutableData) {
     EXPECT_EQ(static_cast(p2[0]), 1);
   }

-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   {
     framework::Tensor src_tensor;
     float* p1 = nullptr;
@@ -174,7 +174,7 @@ TEST(Tensor, ShareDataWith) {
     ASSERT_EQ(src_tensor.data(), dst_tensor.data());
   }

-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   {
     framework::Tensor src_tensor;
     framework::Tensor dst_tensor;
@@ -212,7 +212,7 @@ TEST(Tensor, Slice) {
   EXPECT_EQ(src_data_address + 3 * 4 * 1 * sizeof(int), slice_data_address);
 }

-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   {
     framework::Tensor src_tensor;
     src_tensor.mutable_data(framework::make_ddim({6, 9}),
diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc
index 1ad321df216..c6ac30a3698 100644
--- a/paddle/fluid/framework/tensor_util.cc
+++ b/paddle/fluid/framework/tensor_util.cc
@@ -97,7 +97,7 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
         "Copy from %s to %s is not supported.", src_place, dst_place));
   }
 #endif
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   else if (platform::is_cuda_pinned_place(src_place) &&  // NOLINT
            platform::is_cuda_pinned_place(dst_place)) {
     memory::Copy(BOOST_GET_CONST(platform::CUDAPinnedPlace, dst_place), dst_ptr,
@@ -304,7 +304,7 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
         "Copy from %s to %s is not supported.", src_place, dst_place));
   }
 #endif
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   else if (platform::is_cuda_pinned_place(src_place) &&  // NOLINT
            platform::is_cuda_pinned_place(dst_place)) {
     memory::Copy(BOOST_GET_CONST(platform::CUDAPinnedPlace, dst_place), dst_ptr,
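Note: the hunks above, like most of this patch, apply one mechanical change: the CUDA-only preprocessor guard becomes a guard that is also true for the ROCm (HIP) build. As an illustration only, not part of the patch, the pattern is:

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  // GPU path: compiled by NVCC on CUDA builds and by HIP-Clang on ROCm
  // builds, so a single guard now covers both runtimes.
#else
  // CPU-only build: the GPU branch is compiled out entirely.
#endif

This keeps one code path wherever the CUDA and HIP APIs line up; later hunks handle the places where they do not (kernel launches, stream/event types, memcpy direction enums).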
@@ -595,7 +595,7 @@ bool TensorIsfinite(const framework::Tensor& tensor) {
   return !Any(tensor, pred_inf) && !Any(tensor, pred_nan);
 }

-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 template <typename T>
 static inline void __global__ BothFalse(const T* cmp, T* out,
                                         int element_num) {
   CUDA_KERNEL_LOOP(i, element_num) { out[i] = (!cmp[i]) && (!out[i]); }
@@ -618,7 +618,7 @@ struct BothFalseVisitor : public boost::static_visitor<> {
   }

   void VisitorImpl(const platform::CUDAPlace& gpu) const {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     auto* ctx = platform::DeviceContextPool::Instance().GetByPlace(gpu);
     constexpr int MAX_BLOCK_DIM = 512;
     const int MAX_GRID_DIM = ctx->GetMaxPhysicalThreadCount() / MAX_BLOCK_DIM;
@@ -703,7 +703,7 @@ void TensorToStream(std::ostream& os, const Tensor& tensor,
                     platform::errors::ResourceExhausted(
                         "tensor size %d overflow when writing tensor", size));
   if (platform::is_gpu_place(tensor.place())) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     constexpr size_t kBufSize = 1024 * 1024 * 64;  // 64MB
     std::unique_ptr<char[]> buf(new char[kBufSize]);
     auto& gpu_dev_ctx =
@@ -802,7 +802,8 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
     size_t size = tensor->numel() * framework::SizeOfType(desc.data_type());
     if (platform::is_gpu_place(dev_ctx.GetPlace()) ||
         platform::is_xpu_place(dev_ctx.GetPlace())) {
-#if defined PADDLE_WITH_CUDA || defined PADDLE_WITH_XPU
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
+    defined(PADDLE_WITH_XPU)
       Tensor cpu_tensor;
       cpu_tensor.Resize(framework::make_ddim(shape));
       framework::VisitDataType(
@@ -859,7 +860,8 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
     size_t size = tensor->numel() * framework::SizeOfType(desc.data_type());
     if (platform::is_gpu_place(dev_ctx.GetPlace()) ||
         platform::is_xpu_place(dev_ctx.GetPlace())) {
-#if defined PADDLE_WITH_CUDA || defined PADDLE_WITH_XPU
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
+    defined(PADDLE_WITH_XPU)
       Tensor cpu_tensor;
       cpu_tensor.Resize(framework::make_ddim(dims));
       framework::VisitDataType(
@@ -954,7 +956,7 @@ void TensorFromDLPack(const ::DLTensor& dl_tensor, framework::Tensor* dst) {
   if (dl_tensor.ctx.device_type == kDLCPU) {
     memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
   }
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   if (dl_tensor.ctx.device_type == kDLGPU) {
     platform::CUDAPlace dst_place =
         platform::CUDAPlace(dl_tensor.ctx.device_id);
diff --git a/paddle/fluid/framework/tensor_util.h b/paddle/fluid/framework/tensor_util.h
index 50644370bc6..8a127e0ed59 100644
--- a/paddle/fluid/framework/tensor_util.h
+++ b/paddle/fluid/framework/tensor_util.h
@@ -127,7 +127,7 @@ void TensorFromArray(const T* src, const size_t& array_size,
     memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr,
                  src_place, src_ptr, size);
   }
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   else if (platform::is_gpu_place(dst_place)) {  // NOLINT
     memory::Copy(
         BOOST_GET_CONST(platform::CUDAPlace, dst_place), dst_ptr, src_place,
@@ -150,7 +150,7 @@ void TensorFromVector(const std::vector<T>& src,
     memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr,
                  src_place, src_ptr, size);
   }
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   else if (platform::is_gpu_place(dst_place)) {  // NOLINT
     memory::Copy(
         BOOST_GET_CONST(platform::CUDAPlace, dst_place), dst_ptr, src_place,
@@ -187,7 +187,7 @@ void TensorToVector(const Tensor& src, const platform::DeviceContext& ctx,
                  BOOST_GET_CONST(platform::CPUPlace, src.place()), src_ptr,
                  size);
   }
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  else if (platform::is_gpu_place(src.place())) {  // NOLINT
    memory::Copy(
        dst_place, dst_ptr, BOOST_GET_CONST(platform::CUDAPlace, src.place()),
diff --git a/paddle/fluid/framework/tensor_util_test.cc b/paddle/fluid/framework/tensor_util_test.cc
index e389cb34679..c32efd0a470 100644
--- a/paddle/fluid/framework/tensor_util_test.cc
+++ b/paddle/fluid/framework/tensor_util_test.cc
@@ -58,7 +58,7 @@ TEST(TensorCopy, Tensor) {
   }
   EXPECT_TRUE(dst_tensor.layout() == src_tensor.layout());

-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   {
     Tensor src_tensor;
     Tensor gpu_tensor;
@@ -149,7 +149,7 @@ TEST(TensorFromVector, Tensor) {
     delete cpu_place;
   }

-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   {
     std::vector src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9};
     paddle::framework::Tensor cpu_tensor;
@@ -224,7 +224,7 @@ TEST(TensorToVector, Tensor) {
     EXPECT_EQ(src_ptr[i], dst[i]);
   }
 }
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 {
   std::vector src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9};
   paddle::framework::Tensor gpu_tensor;
@@ -264,7 +264,7 @@ TEST(TensorFromDLPack, Tensor) {
   }
 }

-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 {
   std::vector src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9};
   paddle::framework::Tensor cpu_tensor;
@@ -430,7 +430,7 @@ TEST(Tensor, FromAndToStream) {
     EXPECT_EQ(dst_tensor.dims(), src_tensor.dims());
     delete place;
   }
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   {
     Tensor gpu_tensor;
     gpu_tensor.Resize({2, 3});
diff --git a/paddle/fluid/framework/tensor_util_test.cu b/paddle/fluid/framework/tensor_util_test.cu
index a51f74199e7..4517726a5c0 100644
--- a/paddle/fluid/framework/tensor_util_test.cu
+++ b/paddle/fluid/framework/tensor_util_test.cu
@@ -63,7 +63,11 @@ TEST(TensorContainsNAN, GPU) {
   {
     Tensor tensor;
     float* buf = tensor.mutable_data({3}, gpu);
+#ifdef PADDLE_WITH_HIP
+    hipLaunchKernelGGL(FillNAN, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
+#else
     FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+#endif
     cuda_ctx->Wait();
     ASSERT_TRUE(TensorContainsNAN(tensor));
   }
@@ -71,7 +75,11 @@
     Tensor tensor;
     paddle::platform::float16* buf =
         tensor.mutable_data({3}, gpu);
+#ifdef PADDLE_WITH_HIP
+    hipLaunchKernelGGL(FillNAN, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
+#else
     FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+#endif
     cuda_ctx->Wait();
     ASSERT_TRUE(TensorContainsNAN(tensor));
   }
@@ -84,7 +92,11 @@ TEST(TensorContainsInf, GPU) {
   {
     Tensor tensor;
     float* buf = tensor.mutable_data({3}, gpu);
+#ifdef PADDLE_WITH_HIP
+    hipLaunchKernelGGL(FillInf, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
+#else
     FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+#endif
     cuda_ctx->Wait();
     ASSERT_TRUE(TensorContainsInf(tensor));
   }
@@ -92,7 +104,11 @@
     Tensor tensor;
     paddle::platform::float16* buf =
         tensor.mutable_data({3}, gpu);
+#ifdef PADDLE_WITH_HIP
+    hipLaunchKernelGGL(FillInf, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
+#else
     FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+#endif
     cuda_ctx->Wait();
     ASSERT_TRUE(TensorContainsInf(tensor));
   }
@@ -107,14 +123,22 @@ TEST(TensorIsfinite, GPU) {
   {
     Tensor tensor;
     float* buf = tensor.mutable_data({3}, gpu);
+#ifdef PADDLE_WITH_HIP
+    hipLaunchKernelGGL(FillInf, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
+#else
     FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+#endif
     cuda_ctx->Wait();
     EXPECT_TRUE(!TensorIsfinite(tensor));
   }
   {
     Tensor tensor;
     float16* buf = tensor.mutable_data({3}, gpu);
+#ifdef PADDLE_WITH_HIP
+    hipLaunchKernelGGL(FillInf, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
+#else
     FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+#endif
     cuda_ctx->Wait();
     EXPECT_TRUE(!TensorIsfinite(tensor));
   }
@@ -123,14 +147,22 @@ TEST(TensorIsfinite, GPU) {
   {
     Tensor tensor;
     float* buf = tensor.mutable_data({3}, gpu);
+#ifdef PADDLE_WITH_HIP
+    hipLaunchKernelGGL(FillNAN, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
+#else
     FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+#endif
     cuda_ctx->Wait();
     EXPECT_TRUE(!TensorIsfinite(tensor));
   }
   {
     Tensor tensor;
     float16* buf = tensor.mutable_data({3}, gpu);
+#ifdef PADDLE_WITH_HIP
+    hipLaunchKernelGGL(FillNAN, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
+#else
     FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+#endif
     cuda_ctx->Wait();
     EXPECT_TRUE(!TensorIsfinite(tensor));
   }
@@ -139,14 +171,24 @@ TEST(TensorIsfinite, GPU) {
   {
     Tensor tensor;
     float* buf = tensor.mutable_data({3}, gpu);
+#ifdef PADDLE_WITH_HIP
+    hipLaunchKernelGGL(FillFinite, dim3(1), dim3(1), 0, cuda_ctx->stream(),
+                       buf);
+#else
     FillFinite<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+#endif
     cuda_ctx->Wait();
     EXPECT_TRUE(TensorIsfinite(tensor));
   }
   {
     Tensor tensor;
     float16* buf = tensor.mutable_data({3}, gpu);
+#ifdef PADDLE_WITH_HIP
+    hipLaunchKernelGGL(FillFinite, dim3(1), dim3(1), 0, cuda_ctx->stream(),
+                       buf);
+#else
     FillFinite<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+#endif
     cuda_ctx->Wait();
     EXPECT_TRUE(TensorIsfinite(tensor));
   }
@@ -159,7 +201,11 @@ TEST(TensorContainsInf, GPUWithoutWait) {
   {
     Tensor tensor, out;
     float* buf = tensor.mutable_data({3}, gpu);
+#ifdef PADDLE_WITH_HIP
+    hipLaunchKernelGGL(FillInf, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
+#else
     FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+#endif
     cuda_ctx->Wait();
     TensorContainsInf(tensor, &out);
     platform::CPUPlace cpu;
@@ -172,7 +218,11 @@
     Tensor tensor, out;
     paddle::platform::float16* buf =
         tensor.mutable_data({3}, gpu);
+#ifdef PADDLE_WITH_HIP
+    hipLaunchKernelGGL(FillInf, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
+#else
     FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+#endif
     cuda_ctx->Wait();
     TensorContainsInf(tensor, &out);
     platform::CPUPlace cpu;
@@ -190,7 +240,11 @@ TEST(TensorContainsNAN, GPUWithoutWait) {
   {
     Tensor tensor, out;
     float* buf = tensor.mutable_data({3}, gpu);
+#ifdef PADDLE_WITH_HIP
+    hipLaunchKernelGGL(FillNAN, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
+#else
     FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+#endif
     cuda_ctx->Wait();
     TensorContainsNAN(tensor, &out);
     platform::CPUPlace cpu;
@@ -203,7 +257,11 @@
     Tensor tensor, out;
     paddle::platform::float16* buf =
         tensor.mutable_data({3}, gpu);
+#ifdef PADDLE_WITH_HIP
+    hipLaunchKernelGGL(FillNAN, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
+#else
     FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+#endif
     cuda_ctx->Wait();
     TensorContainsNAN(tensor, &out);
     platform::CPUPlace cpu;
@@ -221,7 +279,11 @@ TEST(TensorIsfinite, GPUWithoutWait) {
   {
     Tensor tensor, out;
     float* buf = tensor.mutable_data({3}, gpu);
+#ifdef PADDLE_WITH_HIP
+    hipLaunchKernelGGL(FillInf, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
+#else
     FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+#endif
     cuda_ctx->Wait();
     TensorIsfinite(tensor, &out);
     platform::CPUPlace cpu;
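The tensor_util_test.cu hunks all make the same substitution: hipLaunchKernelGGL is HIP's portable spelling of the CUDA triple-chevron launch, with the launch configuration moved into the argument list as (kernel, grid, block, shared-memory bytes, stream, kernel arguments...). A minimal standalone sketch, using a hypothetical FillOnes kernel in place of the tests' FillNAN/FillInf/FillFinite:

__global__ void FillOnes(float* buf) { buf[threadIdx.x] = 1.0f; }

void LaunchFillOnes(float* buf, gpuStream_t stream) {
#ifdef PADDLE_WITH_HIP
  // hipLaunchKernelGGL(kernel, grid_dim, block_dim, shared_mem_bytes, stream,
  //                    kernel arguments...)
  hipLaunchKernelGGL(FillOnes, dim3(1), dim3(3), 0, stream, buf);
#else
  FillOnes<<<1, 3, 0, stream>>>(buf);
#endif
}

(gpuStream_t is the per-runtime stream alias this patch also adopts in trainer.h; see the sketch after that diff.)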
@@ -233,7 +295,11 @@
   {
     Tensor tensor, out;
     float* buf = tensor.mutable_data({3}, gpu);
+#ifdef PADDLE_WITH_HIP
+    hipLaunchKernelGGL(FillNAN, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
+#else
     FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+#endif
     cuda_ctx->Wait();
     TensorIsfinite(tensor, &out);
     platform::CPUPlace cpu;
@@ -245,7 +311,12 @@
   {
     Tensor tensor, out;
     float* buf = tensor.mutable_data({3}, gpu);
+#ifdef PADDLE_WITH_HIP
+    hipLaunchKernelGGL(FillFinite, dim3(1), dim3(1), 0, cuda_ctx->stream(),
+                       buf);
+#else
     FillFinite<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+#endif
     cuda_ctx->Wait();
     TensorIsfinite(tensor, &out);
     platform::CPUPlace cpu;
diff --git a/paddle/fluid/framework/trainer.h b/paddle/fluid/framework/trainer.h
index d949ba2bffe..ca290a50b42 100644
--- a/paddle/fluid/framework/trainer.h
+++ b/paddle/fluid/framework/trainer.h
@@ -141,7 +141,8 @@ class DistMultiTrainer : public MultiTrainer {
   std::shared_ptr pull_dense_worker_;
 };

-#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_XPU) && \
+#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_HIP || \
+     defined PADDLE_WITH_XPU) && \
     (defined PADDLE_WITH_PSLIB)
 class HeterServiceContext {
  public:
@@ -155,8 +156,9 @@ class HeterServiceContext {
   void Reset() { push_dense_status_.clear(); }
   int place_num_;
   Scope* scope_{nullptr};
-#ifdef PADDLE_WITH_CUDA
-  cudaEvent_t event_;
+
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  gpuEvent_t event_;
 #endif
   std::vector ops_;
   std::vector<::std::future> push_dense_status_;
@@ -187,10 +189,10 @@ class HeterXpuTrainer : public TrainerBase {
   virtual std::string GetDumpPath(int tid) { return ""; }
   virtual void InitDumpEnv() {}
   template <typename T>
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   void HeterMemCpy(LoDTensor* tensor, LoDTensor* root_tensor,
                    const paddle::platform::Place& thread_place,
-                   cudaStream_t stream);
+                   gpuStream_t stream);
 #endif
 #ifdef PADDLE_WITH_XPU
   void HeterMemCpy(LoDTensor* thread_tensor, LoDTensor* root_tensor,
@@ -222,9 +224,9 @@
   std::vector place_scopes_;
   BtObjectPool object_pool_;
   std::vector places_;
-#ifdef PADDLE_WITH_CUDA
-  std::vector<cudaStream_t> copy_streams_;
-  std::vector<cudaEvent_t> events_;
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  std::vector<gpuStream_t> copy_streams_;
+  std::vector<gpuEvent_t> events_;
 #endif
 };

@@ -247,10 +249,10 @@ class HeterBoxTrainer : public TrainerBase {
   virtual std::string GetDumpPath(int tid) { return ""; }
   virtual void InitDumpEnv() {}
   template <typename T>
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   void HeterMemCpy(LoDTensor* tensor, LoDTensor* root_tensor,
                    const paddle::platform::Place& thread_place,
-                   cudaStream_t stream);
+                   gpuStream_t stream);
 #endif
   void CreateThreadParam(const ProgramDesc& program, int num);
   template
@@ -272,14 +274,15 @@
   std::vector threads_;
   int use_ps_gpu_;
   int thread_num_;
-#ifdef PADDLE_WITH_CUDA
-  std::vector<cudaStream_t> copy_streams_;
-  std::vector<cudaEvent_t> events_;
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  std::vector<gpuStream_t> copy_streams_;
+  std::vector<gpuEvent_t> events_;
 #endif
 };
 #endif

-#if (defined PADDLE_WITH_NCCL) && (defined PADDLE_WITH_PSLIB)
+#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \
+    (defined PADDLE_WITH_PSLIB)
 class PSGPUTrainer : public TrainerBase {
  public:
   PSGPUTrainer() {}
@@ -321,7 +324,7 @@ class PSGPUTrainer : public TrainerBase {
 };
 #endif

-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 class PipelineTrainer : public TrainerBase {
  public:
   PipelineTrainer() {}
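The trainer.h diff above replaces cudaEvent_t/cudaStream_t members with gpuEvent_t/gpuStream_t. Those aliases are not defined in this patch; presumably Paddle's platform headers provide something along these lines (a minimal sketch, assuming exactly that):

#ifdef PADDLE_WITH_HIP
using gpuStream_t = hipStream_t;  // assumption: alias resolves per runtime
using gpuEvent_t = hipEvent_t;
#else
using gpuStream_t = cudaStream_t;
using gpuEvent_t = cudaEvent_t;
#endif

With the members declared against the alias, classes such as HeterServiceContext, HeterXpuTrainer, and HeterBoxTrainer compile unchanged for either runtime.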
diff --git a/paddle/fluid/framework/trainer_factory.cc b/paddle/fluid/framework/trainer_factory.cc
index 764338a8cc6..6b9dbece897 100644
--- a/paddle/fluid/framework/trainer_factory.cc
+++ b/paddle/fluid/framework/trainer_factory.cc
@@ -66,15 +66,17 @@ std::shared_ptr TrainerFactory::CreateTrainer(
 REGISTER_TRAINER_CLASS(MultiTrainer);
 REGISTER_TRAINER_CLASS(DistMultiTrainer);
-#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_XPU) && \
+#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_HIP || \
+     defined PADDLE_WITH_XPU) && \
     (defined PADDLE_WITH_PSLIB)
 REGISTER_TRAINER_CLASS(HeterXpuTrainer);
 REGISTER_TRAINER_CLASS(HeterBoxTrainer);
 #endif
-#if (defined PADDLE_WITH_NCCL) && (defined PADDLE_WITH_PSLIB)
+#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \
+    (defined PADDLE_WITH_PSLIB)
 REGISTER_TRAINER_CLASS(PSGPUTrainer);
 #endif
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 REGISTER_TRAINER_CLASS(PipelineTrainer);
 #endif
 }  // namespace framework
diff --git a/paddle/fluid/framework/var_type_traits.cc b/paddle/fluid/framework/var_type_traits.cc
index 81c7d0d0c88..886d00e562b 100644
--- a/paddle/fluid/framework/var_type_traits.cc
+++ b/paddle/fluid/framework/var_type_traits.cc
@@ -28,6 +28,14 @@
 #include "paddle/fluid/operators/conv_cudnn_op_cache.h"
 #include "paddle/fluid/operators/cudnn_rnn_cache.h"
 #endif
+#ifdef PADDLE_WITH_HIP
+#if defined(PADDLE_WITH_RCCL)
+#include "paddle/fluid/operators/nccl/nccl_gpu_common.h"  // NOLINT
+#include "paddle/fluid/platform/nccl_helper.h"            // NOLINT
+#endif
+#include "paddle/fluid/operators/conv_cudnn_op_cache.h"  // NOLINT
+#include "paddle/fluid/operators/miopen_rnn_cache.h"
+#endif

 #if defined(PADDLE_WITH_XPU_BKCL)
 #include "paddle/fluid/platform/bkcl_helper.h"
diff --git a/paddle/fluid/framework/var_type_traits.h b/paddle/fluid/framework/var_type_traits.h
index 2fd4de5cfcb..b0d8f43a90f 100644
--- a/paddle/fluid/framework/var_type_traits.h
+++ b/paddle/fluid/framework/var_type_traits.h
@@ -30,6 +30,12 @@
 #include
 #endif
 #endif
+#ifdef PADDLE_WITH_HIP
+#include
+#ifdef PADDLE_WITH_RCCL
+#include
+#endif
+#endif

 #if defined(PADDLE_WITH_XPU_BKCL)
 #include "xpu/bkcl.h"
@@ -39,8 +45,8 @@
 namespace paddle {
 namespace platform {
-#ifdef PADDLE_WITH_CUDA
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 class Communicator;
 class NCCLCommunicator;
 #endif
@@ -151,8 +157,8 @@ using VarTypeRegistry = detail::VarTypeRegistryImpl<
     LoDTensorArray, platform::PlaceList, ReaderHolder, std::string, Scope *,
     operators::reader::LoDTensorBlockingQueueHolder, FetchList,
     operators::reader::OrderedMultiDeviceLoDTensorBlockingQueueHolder,
-#ifdef PADDLE_WITH_CUDA
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
     ncclUniqueId, platform::Communicator, platform::NCCLCommunicator,
 #endif
     operators::CudnnRNNCache,
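The var_type_traits changes can keep registering the NCCL-spelled types (ncclUniqueId, platform::Communicator, platform::NCCLCommunicator) under RCCL because RCCL deliberately mirrors the NCCL API; only the header and the loaded library differ. An illustrative sketch, not part of the patch, and assuming the usual RCCL header name:

// The same NCCL-spelled symbols compile against either collective library:
// ncclUniqueId, ncclComm_t, ncclAllReduce(...), etc.
#ifdef PADDLE_WITH_RCCL
#include <rccl.h>  // AMD's NCCL-compatible port
#else
#include <nccl.h>
#endif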
"paddle/fluid/platform/nccl_helper.h" // NOLINT +#endif +#include "paddle/fluid/operators/conv_cudnn_op_cache.h" // NOLINT +#include "paddle/fluid/operators/miopen_rnn_cache.h" +#endif #if defined(PADDLE_WITH_XPU_BKCL) #include "paddle/fluid/platform/bkcl_helper.h" #endif diff --git a/paddle/fluid/operators/nccl/CMakeLists.txt b/paddle/fluid/operators/nccl/CMakeLists.txt index 4f1fe372f58..9a412228255 100644 --- a/paddle/fluid/operators/nccl/CMakeLists.txt +++ b/paddle/fluid/operators/nccl/CMakeLists.txt @@ -1,4 +1,4 @@ -if (NOT WITH_NCCL) +if (NOT (WITH_NCCL OR WITH_RCCL)) return() endif() @@ -6,12 +6,20 @@ if(WITH_GPU AND NOT WIN32) nv_library(nccl_common SRCS nccl_gpu_common.cc DEPS device_context operator ) endif() -if(WITH_GPU) +if(WITH_ROCM AND NOT WIN32) + hip_library(nccl_common SRCS nccl_gpu_common.cc DEPS device_context operator ) +endif() + +if(WITH_GPU OR WITH_ROCM) op_library(nccl_op DEPS nccl_common) file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(ncclAllReduce);\n") set(OPERATOR_DEPS ${OPERATOR_DEPS} nccl_common PARENT_SCOPE) endif() -if(NOT WIN32) +if(WITH_GPU AND NOT WIN32) nv_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context) endif() + +if(WITH_ROCM AND NOT WIN32) + hip_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context) +endif() diff --git a/paddle/fluid/operators/nccl/nccl_gpu_common.h b/paddle/fluid/operators/nccl/nccl_gpu_common.h index 558ff4cc096..01905d8ca84 100644 --- a/paddle/fluid/operators/nccl/nccl_gpu_common.h +++ b/paddle/fluid/operators/nccl/nccl_gpu_common.h @@ -23,7 +23,11 @@ limitations under the License. */ #include #include "paddle/fluid/platform/device_context.h" +#ifdef PADDLE_WITH_RCCL +#include "paddle/fluid/platform/dynload/rccl.h" +#else #include "paddle/fluid/platform/dynload/nccl.h" +#endif #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/macros.h" diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index bdf018db6f8..ccf589e8588 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -3,12 +3,12 @@ set(PYBIND_DEPS pybind python proto_desc memory executor fleet_wrapper box_wrapp analysis_predictor imperative_profiler imperative_flag save_load_util dlpack_tensor device_context gloo_wrapper infer_io_utils heter_wrapper generator op_version_registry ps_gpu_wrapper custom_operator) -if (WITH_GPU) +if (WITH_GPU OR WITH_ROCM) set(PYBIND_DEPS ${PYBIND_DEPS} dynload_cuda) set(PYBIND_DEPS ${PYBIND_DEPS} cuda_device_guard) endif() -if (WITH_NCCL) +if (WITH_NCCL OR WITH_RCCL) set(PYBIND_DEPS ${PYBIND_DEPS} nccl_wrapper) set(PYBIND_DEPS ${PYBIND_DEPS} reducer) endif() @@ -21,7 +21,7 @@ endif() if(NOT WIN32) set(PYBIND_DEPS ${PYBIND_DEPS} data_loader) set(PYBIND_DEPS ${PYBIND_DEPS} mmap_allocator) - if (WITH_NCCL) + if (WITH_NCCL OR WITH_RCCL) set(PYBIND_DEPS ${PYBIND_DEPS} nccl_context) endif() endif(NOT WIN32) @@ -71,7 +71,7 @@ if (WITH_PSCORE) list(APPEND PYBIND_SRCS fleet_py.cc) endif() -if (WITH_NCCL) +if (WITH_NCCL OR WITH_RCCL) list(APPEND PYBIND_SRCS nccl_wrapper_py.cc) endif() @@ -81,9 +81,9 @@ if(WITH_PYTHON) list(APPEND OP_FUNCTION_GENERETOR_DEPS ${GLOB_OP_LIB}) list(APPEND OP_FUNCTION_GENERETOR_DEPS ${GLOB_OPERATOR_DEPS}) - if(WITH_NCCL) + if (WITH_NCCL OR WITH_RCCL) list(APPEND OP_FUNCTION_GENERETOR_DEPS nccl_context) - endif(WITH_NCCL) + endif() if(WITH_XPU_BKCL) list(APPEND OP_FUNCTION_GENERETOR_DEPS bkcl_context) @@ -93,6 +93,9 @@ if(WITH_PYTHON) target_link_libraries(op_function_generator 
   get_property (os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
   target_link_libraries(op_function_generator ${os_dependency_modules})
+  if(WITH_ROCM)
+    target_link_libraries(op_function_generator ${ROCM_HIPRTC_LIB})
+  endif()

   set(impl_file ${CMAKE_SOURCE_DIR}/paddle/fluid/pybind/op_function_impl.h)
   set(tmp_impl_file ${impl_file}.tmp)
@@ -164,20 +167,17 @@ if(WITH_PYTHON)
     endif(WITH_MKLDNN)
   endif(WIN32)

-  if(WITH_ROCM_PLATFORM)
-    cc_library(paddle_pybind SHARED
-      SRCS ${PYBIND_SRCS}
-      DEPS ${PYBIND_DEPS}
-      ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS})
-  else()
-    cc_library(paddle_pybind SHARED
-      SRCS ${PYBIND_SRCS}
-      DEPS ${PYBIND_DEPS}
-      ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS})
-    if(NOT APPLE AND NOT WIN32)
-      target_link_libraries(paddle_pybind rt)
-    endif(NOT APPLE AND NOT WIN32)
-  endif(WITH_ROCM_PLATFORM)
+  cc_library(paddle_pybind SHARED
+    SRCS ${PYBIND_SRCS}
+    DEPS ${PYBIND_DEPS}
+    ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS})
+  if(NOT APPLE AND NOT WIN32)
+    target_link_libraries(paddle_pybind rt)
+  endif(NOT APPLE AND NOT WIN32)
+
+  if(WITH_ROCM)
+    target_link_libraries(paddle_pybind ${ROCM_HIPRTC_LIB})
+  endif()

   get_property (os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
   target_link_libraries(paddle_pybind ${os_dependency_modules})
diff --git a/paddle/fluid/pybind/global_value_getter_setter.cc b/paddle/fluid/pybind/global_value_getter_setter.cc
index fa44eeb485c..1732cf5bfde 100644
--- a/paddle/fluid/pybind/global_value_getter_setter.cc
+++ b/paddle/fluid/pybind/global_value_getter_setter.cc
@@ -66,7 +66,7 @@ DECLARE_bool(benchmark);
 DECLARE_int32(inner_op_parallelism);
 DECLARE_int32(max_inplace_grad_add);
 DECLARE_string(tracer_profile_fname);
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 // cudnn
 DECLARE_uint64(conv_workspace_size_limit);
 DECLARE_bool(cudnn_batchnorm_spatial_persistent);
@@ -354,7 +354,7 @@ static void RegisterGlobalVarGetterSetter() {
       FLAGS_paddle_num_threads, FLAGS_use_mkldnn, FLAGS_max_inplace_grad_add,
       FLAGS_tracer_mkldnn_ops_on, FLAGS_tracer_mkldnn_ops_off);

-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   REGISTER_PUBLIC_GLOBAL_VAR(
       FLAGS_gpu_memory_limit_mb, FLAGS_cudnn_deterministic,
       FLAGS_conv_workspace_size_limit, FLAGS_cudnn_batchnorm_spatial_persistent,
diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc
index 8e894fc07a3..21088e06a23 100644
--- a/paddle/fluid/pybind/imperative.cc
+++ b/paddle/fluid/pybind/imperative.cc
@@ -966,7 +966,7 @@ void BindImperative(py::module *m_ptr) {
           [](imperative::VarBase &self,
              const imperative::ParallelStrategy &strategy) {
             if (strategy.nranks_ > 1) {
-#ifdef PADDLE_WITH_NCCL
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 #if NCCL_VERSION_CODE >= 2212
               imperative::AllReduce(self.Var(), self.MutableVar(), strategy);
 #else
@@ -1016,7 +1016,7 @@
        )DOC")
       .def("pin_memory",
            [](const std::shared_ptr &self) {
-#ifndef PADDLE_WITH_CUDA
+#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP)
              PADDLE_THROW(platform::errors::PermissionDenied(
                  "Cannot copy this Tensor to pinned memory in CPU version "
                  "Paddle, "
@@ -1050,7 +1050,7 @@
       .def("cuda",
            [](const std::shared_ptr &self, int device_id,
               bool blocking) {
-#ifndef PADDLE_WITH_CUDA
+#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP)
              PADDLE_THROW(platform::errors::PermissionDenied(
                  "Cannot copy this Tensor to GPU in CPU version Paddle, "
"Please recompile or reinstall Paddle with CUDA support.")); @@ -1412,7 +1412,8 @@ void BindImperative(py::module *m_ptr) { }, py::call_guard()); -#if (defined PADDLE_WITH_NCCL) || (defined PADDLE_WITH_XPU_BKCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ + defined(PADDLE_WITH_XPU_BKCL) py::class_>(m, "ParallelContext"); diff --git a/paddle/fluid/pybind/ps_gpu_wrapper_py.cc b/paddle/fluid/pybind/ps_gpu_wrapper_py.cc index 96acfd7bc04..5bff9178fdf 100644 --- a/paddle/fluid/pybind/ps_gpu_wrapper_py.cc +++ b/paddle/fluid/pybind/ps_gpu_wrapper_py.cc @@ -32,7 +32,8 @@ namespace py = pybind11; namespace paddle { namespace pybind { -#if (defined PADDLE_WITH_NCCL) && (defined PADDLE_WITH_PSLIB) +#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \ + (defined PADDLE_WITH_PSLIB) void BindPSGPUWrapper(py::module* m) { py::class_>( *m, "PSGPU") diff --git a/paddle/fluid/pybind/ps_gpu_wrapper_py.h b/paddle/fluid/pybind/ps_gpu_wrapper_py.h index 4048e88a55a..8bd6ee13cf5 100644 --- a/paddle/fluid/pybind/ps_gpu_wrapper_py.h +++ b/paddle/fluid/pybind/ps_gpu_wrapper_py.h @@ -22,7 +22,8 @@ namespace py = pybind11; namespace paddle { namespace pybind { -#if (defined PADDLE_WITH_NCCL) && (defined PADDLE_WITH_PSLIB) +#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \ + (defined PADDLE_WITH_PSLIB) void BindPSGPUWrapper(py::module* m); #endif } // namespace pybind diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 750fb6e2258..d11f3c005ee 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -86,7 +86,7 @@ limitations under the License. */ #include "paddle/fluid/pybind/ps_gpu_wrapper_py.h" #include "paddle/fluid/pybind/pybind_boost_headers.h" -#ifdef PADDLE_WITH_NCCL +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/pybind/nccl_wrapper_py.h" #endif #include "paddle/fluid/framework/data_type.h" @@ -95,11 +95,13 @@ limitations under the License. 
*/ #include "paddle/fluid/pybind/reader_py.h" #include "paddle/fluid/pybind/tensor_py.h" #include "paddle/fluid/string/to_string.h" -#ifdef PADDLE_WITH_CUDA -#ifdef PADDLE_WITH_NCCL +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/operators/nccl/nccl_gpu_common.h" #endif +#ifndef PADDLE_WITH_HIP #include "paddle/fluid/platform/cuda_profiler.h" +#endif #include "paddle/fluid/platform/gpu_info.h" #endif @@ -128,7 +130,15 @@ PYBIND11_MAKE_OPAQUE(paddle::framework::FetchType); namespace paddle { namespace pybind { bool IsCompiledWithCUDA() { -#ifndef PADDLE_WITH_CUDA +#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) + return false; +#else + return true; +#endif +} + +bool IsCompiledWithROCM() { +#ifndef PADDLE_WITH_HIP return false; #else return true; @@ -389,7 +399,7 @@ PYBIND11_MODULE(core_noavx, m) { m.def("set_num_threads", &platform::SetNumThreads); -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) m.def("cudnn_version", &platform::CudnnVersion); #endif @@ -403,7 +413,7 @@ PYBIND11_MODULE(core_noavx, m) { if (dl.ctx.device_type == kDLCPU) { paddle::framework::TensorFromDLPack(dl, &tensor); } -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (dl.ctx.device_type == kDLGPU) { paddle::framework::TensorFromDLPack(dl, &tensor); } @@ -1060,7 +1070,7 @@ PYBIND11_MODULE(core_noavx, m) { .def("height", &SelectedRows::height) .def("set_rows", [](SelectedRows &self, std::vector rows) { -#ifndef PADDLE_WITH_CUDA +#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) self.set_rows(rows); #else Vector new_rows(rows); @@ -1354,7 +1364,7 @@ All parameter, weight, gradient are variables in Paddle. .def_static("create", [](paddle::platform::CUDAPlace& place) -> paddle::platform::DeviceContext* { -#ifndef PADDLE_WITH_CUDA +#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) PADDLE_THROW( platform::errors::PermissionDenied( "Cannot use CUDAPlace in CPU only version, " @@ -1366,7 +1376,7 @@ All parameter, weight, gradient are variables in Paddle. .def_static("create", [](paddle::platform::CUDAPinnedPlace& place) -> paddle::platform::DeviceContext* { -#ifndef PADDLE_WITH_CUDA +#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) PADDLE_THROW( platform::errors::PermissionDenied( "Cannot use CUDAPinnedPlace in CPU only version, " @@ -1376,7 +1386,7 @@ All parameter, weight, gradient are variables in Paddle. #endif });; // clang-format on -#if defined(PADDLE_WITH_NCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) py::class_(m, "Communicator").def(py::init<>()); #endif py::class_(m, "CUDAPlace", R"DOC( @@ -1405,7 +1415,7 @@ All parameter, weight, gradient are variables in Paddle. )DOC") .def("__init__", [](platform::CUDAPlace &self, int dev_id) { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (UNLIKELY(dev_id < 0)) { LOG(ERROR) << string::Sprintf( "Invalid CUDAPlace(%d), device id must be 0 or " @@ -1443,7 +1453,7 @@ All parameter, weight, gradient are variables in Paddle. std::exit(-1); #endif }) -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) .def("get_device_id", [](const platform::CUDAPlace &self) { return self.GetDeviceId(); }) .def("_type", &PlaceIndex) @@ -1559,7 +1569,7 @@ All parameter, weight, gradient are variables in Paddle. 
)DOC") .def("__init__", [](platform::CUDAPinnedPlace &self) { -#ifndef PADDLE_WITH_CUDA +#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) PADDLE_THROW(platform::errors::PermissionDenied( "Cannot use CUDAPinnedPlace in CPU only version, " "Please recompile or reinstall Paddle with CUDA support.")); @@ -1749,6 +1759,7 @@ All parameter, weight, gradient are variables in Paddle. m.def("init_devices", []() { framework::InitDevices(); }); m.def("is_compiled_with_cuda", IsCompiledWithCUDA); + m.def("is_compiled_with_rocm", IsCompiledWithROCM); m.def("is_compiled_with_xpu", IsCompiledWithXPU); m.def("is_compiled_with_mkldnn", IsCompiledWithMKLDNN); m.def("supports_bfloat16", SupportsBfloat16); @@ -1793,7 +1804,7 @@ All parameter, weight, gradient are variables in Paddle. py::arg("cmd"), py::arg("time_out") = 0, py::arg("sleep_inter") = 0, py::arg("redirect_stderr") = false); -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) m.def("is_float16_supported", [](const platform::CUDAPlace &place) -> bool { // Only GPUs with Compute Capability >= 53 support float16 return platform::GetCUDAComputeCapability(place.device) >= 53; @@ -1967,10 +1978,10 @@ All parameter, weight, gradient are variables in Paddle. py::return_value_policy::take_ownership); m.def("op_support_gpu", OpSupportGPU); -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) m.def("get_cuda_device_count", platform::GetCUDADeviceCount); -#ifndef _WIN32 +#if !defined(PADDLE_WITH_HIP) && !defined(_WIN32) m.def("nvprof_init", platform::CudaProfilerInit); m.def("nvprof_start", platform::CudaProfilerStart); m.def("nvprof_stop", platform::CudaProfilerStop); @@ -2015,7 +2026,7 @@ All parameter, weight, gradient are variables in Paddle. m.def("size_of_dtype", framework::SizeOfType); -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) m.def("set_cublas_switch", platform::SetAllowTF32Cublas); m.def("get_cublas_switch", platform::AllowTF32Cublas); m.def("set_cudnn_switch", platform::SetAllowTF32Cudnn); @@ -2847,7 +2858,8 @@ All parameter, weight, gradient are variables in Paddle. #ifdef PADDLE_WITH_PSLIB BindHeterWrapper(&m); #endif -#if (defined PADDLE_WITH_NCCL) && (defined PADDLE_WITH_PSLIB) +#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \ + (defined PADDLE_WITH_PSLIB) BindPSGPUWrapper(&m); #endif BindGlooWrapper(&m); @@ -2855,7 +2867,7 @@ All parameter, weight, gradient are variables in Paddle. #ifdef PADDLE_WITH_BOX_PS BindBoxWrapper(&m); #endif -#ifdef PADDLE_WITH_NCCL +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) BindNCCLWrapper(&m); #endif #ifdef PADDLE_WITH_GLOO diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h index 5ddb498980d..5f252170070 100644 --- a/paddle/fluid/pybind/tensor_py.h +++ b/paddle/fluid/pybind/tensor_py.h @@ -27,7 +27,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/math/concat_and_split.h" #include "paddle/fluid/operators/strided_memcpy.h" #include "paddle/fluid/platform/bfloat16.h" -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/platform/cuda_device_guard.h" #endif #include "paddle/fluid/platform/device_context.h" @@ -226,7 +226,7 @@ T TensorGetElement(const framework::Tensor &self, size_t offset) { paddle::memory::Copy(platform::CPUPlace(), &b, p, a + offset, sizeof(T)); #endif } else if (platform::is_gpu_place(self.place())) { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) const T *a = self.data(); auto p = BOOST_GET_CONST(platform::CUDAPlace, self.place()); paddle::memory::Copy(platform::CPUPlace(), &b, p, a + offset, sizeof(T), @@ -250,7 +250,7 @@ void TensorSetElement(framework::Tensor *self, size_t offset, T elem) { paddle::memory::Copy(p, a + offset, platform::CPUPlace(), &elem, sizeof(T)); #endif } else if (platform::is_gpu_place(self->place())) { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) auto p = BOOST_GET_CONST(platform::CUDAPlace, self->place()); T *a = self->mutable_data(p); paddle::memory::Copy(p, a + offset, platform::CPUPlace(), &elem, sizeof(T), @@ -296,7 +296,7 @@ void SetTensorFromPyArrayT( "Please recompile or reinstall Paddle with XPU support.")); #endif } else { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (paddle::platform::is_gpu_place(place)) { // NOTE(wangxi): When copying data to the accelerator card, // we need set_device(dev_id) first. @@ -304,8 +304,13 @@ void SetTensorFromPyArrayT( platform::CUDADeviceGuard guard( BOOST_GET_CONST(platform::CUDAPlace, tmp_place).device); auto dst = self->mutable_data(place); +#ifdef PADDLE_WITH_HIP + paddle::platform::GpuMemcpySync(dst, array.data(), array.nbytes(), + hipMemcpyHostToDevice); +#else paddle::platform::GpuMemcpySync(dst, array.data(), array.nbytes(), cudaMemcpyHostToDevice); +#endif } else if (paddle::platform::is_cuda_pinned_place(place)) { auto dst = self->mutable_data(place); @@ -474,7 +479,7 @@ inline framework::Tensor *_getTensor(const framework::Tensor &self, self.type()); #endif } else { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::is_cuda_pinned_place(place)) { output->mutable_data(BOOST_GET_CONST(platform::CUDAPinnedPlace, place), self.type()); @@ -707,7 +712,7 @@ inline py::array TensorToPyArray(const framework::Tensor &tensor, "Please recompile or reinstall Paddle with XPU support.")); #endif } else if (is_gpu_tensor) { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) py::array py_arr(py::dtype(py_dtype_str.c_str()), py_dims, py_strides); PADDLE_ENFORCE_EQ(py_arr.writeable(), true, platform::errors::InvalidArgument( -- GitLab