未验证 提交 28b356b9 编写于 作者: Q Qi Li 提交者: GitHub

[ROCM] update fluid framework for rocm (part6), test=develop (#31015)

上级 c8fac5ee
...@@ -118,7 +118,7 @@ TEST(Tensor, MutableData) { ...@@ -118,7 +118,7 @@ TEST(Tensor, MutableData) {
EXPECT_EQ(static_cast<int>(p2[0]), 1); EXPECT_EQ(static_cast<int>(p2[0]), 1);
} }
#ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
{ {
framework::Tensor src_tensor; framework::Tensor src_tensor;
float* p1 = nullptr; float* p1 = nullptr;
...@@ -174,7 +174,7 @@ TEST(Tensor, ShareDataWith) { ...@@ -174,7 +174,7 @@ TEST(Tensor, ShareDataWith) {
ASSERT_EQ(src_tensor.data<int>(), dst_tensor.data<int>()); ASSERT_EQ(src_tensor.data<int>(), dst_tensor.data<int>());
} }
#ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
{ {
framework::Tensor src_tensor; framework::Tensor src_tensor;
framework::Tensor dst_tensor; framework::Tensor dst_tensor;
...@@ -212,7 +212,7 @@ TEST(Tensor, Slice) { ...@@ -212,7 +212,7 @@ TEST(Tensor, Slice) {
EXPECT_EQ(src_data_address + 3 * 4 * 1 * sizeof(int), slice_data_address); EXPECT_EQ(src_data_address + 3 * 4 * 1 * sizeof(int), slice_data_address);
} }
#ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
{ {
framework::Tensor src_tensor; framework::Tensor src_tensor;
src_tensor.mutable_data<double>(framework::make_ddim({6, 9}), src_tensor.mutable_data<double>(framework::make_ddim({6, 9}),
......
...@@ -97,7 +97,7 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place, ...@@ -97,7 +97,7 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
"Copy from %s to %s is not supported.", src_place, dst_place)); "Copy from %s to %s is not supported.", src_place, dst_place));
} }
#endif #endif
#ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
else if (platform::is_cuda_pinned_place(src_place) && // NOLINT else if (platform::is_cuda_pinned_place(src_place) && // NOLINT
platform::is_cuda_pinned_place(dst_place)) { platform::is_cuda_pinned_place(dst_place)) {
memory::Copy(BOOST_GET_CONST(platform::CUDAPinnedPlace, dst_place), dst_ptr, memory::Copy(BOOST_GET_CONST(platform::CUDAPinnedPlace, dst_place), dst_ptr,
...@@ -304,7 +304,7 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place, ...@@ -304,7 +304,7 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
"Copy from %s to %s is not supported.", src_place, dst_place)); "Copy from %s to %s is not supported.", src_place, dst_place));
} }
#endif #endif
#ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
else if (platform::is_cuda_pinned_place(src_place) && // NOLINT else if (platform::is_cuda_pinned_place(src_place) && // NOLINT
platform::is_cuda_pinned_place(dst_place)) { platform::is_cuda_pinned_place(dst_place)) {
memory::Copy(BOOST_GET_CONST(platform::CUDAPinnedPlace, dst_place), dst_ptr, memory::Copy(BOOST_GET_CONST(platform::CUDAPinnedPlace, dst_place), dst_ptr,
...@@ -595,7 +595,7 @@ bool TensorIsfinite(const framework::Tensor& tensor) { ...@@ -595,7 +595,7 @@ bool TensorIsfinite(const framework::Tensor& tensor) {
return !Any(tensor, pred_inf) && !Any(tensor, pred_nan); return !Any(tensor, pred_inf) && !Any(tensor, pred_nan);
} }
#ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
template <typename T> template <typename T>
static inline void __global__ BothFalse(const T* cmp, T* out, int element_num) { static inline void __global__ BothFalse(const T* cmp, T* out, int element_num) {
CUDA_KERNEL_LOOP(i, element_num) { out[i] = (!cmp[i]) && (!out[i]); } CUDA_KERNEL_LOOP(i, element_num) { out[i] = (!cmp[i]) && (!out[i]); }
...@@ -618,7 +618,7 @@ struct BothFalseVisitor : public boost::static_visitor<> { ...@@ -618,7 +618,7 @@ struct BothFalseVisitor : public boost::static_visitor<> {
} }
void VisitorImpl(const platform::CUDAPlace& gpu) const { void VisitorImpl(const platform::CUDAPlace& gpu) const {
#ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
auto* ctx = platform::DeviceContextPool::Instance().GetByPlace(gpu); auto* ctx = platform::DeviceContextPool::Instance().GetByPlace(gpu);
constexpr int MAX_BLOCK_DIM = 512; constexpr int MAX_BLOCK_DIM = 512;
const int MAX_GRID_DIM = ctx->GetMaxPhysicalThreadCount() / MAX_BLOCK_DIM; const int MAX_GRID_DIM = ctx->GetMaxPhysicalThreadCount() / MAX_BLOCK_DIM;
...@@ -703,7 +703,7 @@ void TensorToStream(std::ostream& os, const Tensor& tensor, ...@@ -703,7 +703,7 @@ void TensorToStream(std::ostream& os, const Tensor& tensor,
platform::errors::ResourceExhausted( platform::errors::ResourceExhausted(
"tensor size %d overflow when writing tensor", size)); "tensor size %d overflow when writing tensor", size));
if (platform::is_gpu_place(tensor.place())) { if (platform::is_gpu_place(tensor.place())) {
#ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
constexpr size_t kBufSize = 1024 * 1024 * 64; // 64MB constexpr size_t kBufSize = 1024 * 1024 * 64; // 64MB
std::unique_ptr<char[]> buf(new char[kBufSize]); std::unique_ptr<char[]> buf(new char[kBufSize]);
auto& gpu_dev_ctx = auto& gpu_dev_ctx =
...@@ -802,7 +802,8 @@ void TensorFromStream(std::istream& is, Tensor* tensor, ...@@ -802,7 +802,8 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
size_t size = tensor->numel() * framework::SizeOfType(desc.data_type()); size_t size = tensor->numel() * framework::SizeOfType(desc.data_type());
if (platform::is_gpu_place(dev_ctx.GetPlace()) || if (platform::is_gpu_place(dev_ctx.GetPlace()) ||
platform::is_xpu_place(dev_ctx.GetPlace())) { platform::is_xpu_place(dev_ctx.GetPlace())) {
#if defined PADDLE_WITH_CUDA || defined PADDLE_WITH_XPU #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
defined(PADDLE_WITH_XPU)
Tensor cpu_tensor; Tensor cpu_tensor;
cpu_tensor.Resize(framework::make_ddim(shape)); cpu_tensor.Resize(framework::make_ddim(shape));
framework::VisitDataType( framework::VisitDataType(
...@@ -859,7 +860,8 @@ void TensorFromStream(std::istream& is, Tensor* tensor, ...@@ -859,7 +860,8 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
size_t size = tensor->numel() * framework::SizeOfType(desc.data_type()); size_t size = tensor->numel() * framework::SizeOfType(desc.data_type());
if (platform::is_gpu_place(dev_ctx.GetPlace()) || if (platform::is_gpu_place(dev_ctx.GetPlace()) ||
platform::is_xpu_place(dev_ctx.GetPlace())) { platform::is_xpu_place(dev_ctx.GetPlace())) {
#if defined PADDLE_WITH_CUDA || defined PADDLE_WITH_XPU #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
defined(PADDLE_WITH_XPU)
Tensor cpu_tensor; Tensor cpu_tensor;
cpu_tensor.Resize(framework::make_ddim(dims)); cpu_tensor.Resize(framework::make_ddim(dims));
framework::VisitDataType( framework::VisitDataType(
...@@ -954,7 +956,7 @@ void TensorFromDLPack(const ::DLTensor& dl_tensor, framework::Tensor* dst) { ...@@ -954,7 +956,7 @@ void TensorFromDLPack(const ::DLTensor& dl_tensor, framework::Tensor* dst) {
if (dl_tensor.ctx.device_type == kDLCPU) { if (dl_tensor.ctx.device_type == kDLCPU) {
memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
} }
#ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (dl_tensor.ctx.device_type == kDLGPU) { if (dl_tensor.ctx.device_type == kDLGPU) {
platform::CUDAPlace dst_place = platform::CUDAPlace dst_place =
platform::CUDAPlace(dl_tensor.ctx.device_id); platform::CUDAPlace(dl_tensor.ctx.device_id);
......
...@@ -127,7 +127,7 @@ void TensorFromArray(const T* src, const size_t& array_size, ...@@ -127,7 +127,7 @@ void TensorFromArray(const T* src, const size_t& array_size,
memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr, memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr,
src_place, src_ptr, size); src_place, src_ptr, size);
} }
#ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
else if (platform::is_gpu_place(dst_place)) { // NOLINT else if (platform::is_gpu_place(dst_place)) { // NOLINT
memory::Copy( memory::Copy(
BOOST_GET_CONST(platform::CUDAPlace, dst_place), dst_ptr, src_place, BOOST_GET_CONST(platform::CUDAPlace, dst_place), dst_ptr, src_place,
...@@ -150,7 +150,7 @@ void TensorFromVector(const std::vector<T>& src, ...@@ -150,7 +150,7 @@ void TensorFromVector(const std::vector<T>& src,
memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr, memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr,
src_place, src_ptr, size); src_place, src_ptr, size);
} }
#ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
else if (platform::is_gpu_place(dst_place)) { // NOLINT else if (platform::is_gpu_place(dst_place)) { // NOLINT
memory::Copy( memory::Copy(
BOOST_GET_CONST(platform::CUDAPlace, dst_place), dst_ptr, src_place, BOOST_GET_CONST(platform::CUDAPlace, dst_place), dst_ptr, src_place,
...@@ -187,7 +187,7 @@ void TensorToVector(const Tensor& src, const platform::DeviceContext& ctx, ...@@ -187,7 +187,7 @@ void TensorToVector(const Tensor& src, const platform::DeviceContext& ctx,
BOOST_GET_CONST(platform::CPUPlace, src.place()), src_ptr, BOOST_GET_CONST(platform::CPUPlace, src.place()), src_ptr,
size); size);
} }
#ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
else if (platform::is_gpu_place(src.place())) { // NOLINT else if (platform::is_gpu_place(src.place())) { // NOLINT
memory::Copy( memory::Copy(
dst_place, dst_ptr, BOOST_GET_CONST(platform::CUDAPlace, src.place()), dst_place, dst_ptr, BOOST_GET_CONST(platform::CUDAPlace, src.place()),
......
...@@ -58,7 +58,7 @@ TEST(TensorCopy, Tensor) { ...@@ -58,7 +58,7 @@ TEST(TensorCopy, Tensor) {
} }
EXPECT_TRUE(dst_tensor.layout() == src_tensor.layout()); EXPECT_TRUE(dst_tensor.layout() == src_tensor.layout());
#ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
{ {
Tensor src_tensor; Tensor src_tensor;
Tensor gpu_tensor; Tensor gpu_tensor;
...@@ -149,7 +149,7 @@ TEST(TensorFromVector, Tensor) { ...@@ -149,7 +149,7 @@ TEST(TensorFromVector, Tensor) {
delete cpu_place; delete cpu_place;
} }
#ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
{ {
std::vector<int> src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9}; std::vector<int> src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9};
paddle::framework::Tensor cpu_tensor; paddle::framework::Tensor cpu_tensor;
...@@ -224,7 +224,7 @@ TEST(TensorToVector, Tensor) { ...@@ -224,7 +224,7 @@ TEST(TensorToVector, Tensor) {
EXPECT_EQ(src_ptr[i], dst[i]); EXPECT_EQ(src_ptr[i], dst[i]);
} }
} }
#ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
{ {
std::vector<int> src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9}; std::vector<int> src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9};
paddle::framework::Tensor gpu_tensor; paddle::framework::Tensor gpu_tensor;
...@@ -264,7 +264,7 @@ TEST(TensorFromDLPack, Tensor) { ...@@ -264,7 +264,7 @@ TEST(TensorFromDLPack, Tensor) {
} }
} }
#ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
{ {
std::vector<int> src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9}; std::vector<int> src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9};
paddle::framework::Tensor cpu_tensor; paddle::framework::Tensor cpu_tensor;
...@@ -430,7 +430,7 @@ TEST(Tensor, FromAndToStream) { ...@@ -430,7 +430,7 @@ TEST(Tensor, FromAndToStream) {
EXPECT_EQ(dst_tensor.dims(), src_tensor.dims()); EXPECT_EQ(dst_tensor.dims(), src_tensor.dims());
delete place; delete place;
} }
#ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
{ {
Tensor gpu_tensor; Tensor gpu_tensor;
gpu_tensor.Resize({2, 3}); gpu_tensor.Resize({2, 3});
......
...@@ -63,7 +63,11 @@ TEST(TensorContainsNAN, GPU) { ...@@ -63,7 +63,11 @@ TEST(TensorContainsNAN, GPU) {
{ {
Tensor tensor; Tensor tensor;
float* buf = tensor.mutable_data<float>({3}, gpu); float* buf = tensor.mutable_data<float>({3}, gpu);
#ifdef PADDLE_WITH_HIP
hipLaunchKernelGGL(FillNAN, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
#else
FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf); FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
#endif
cuda_ctx->Wait(); cuda_ctx->Wait();
ASSERT_TRUE(TensorContainsNAN(tensor)); ASSERT_TRUE(TensorContainsNAN(tensor));
} }
...@@ -71,7 +75,11 @@ TEST(TensorContainsNAN, GPU) { ...@@ -71,7 +75,11 @@ TEST(TensorContainsNAN, GPU) {
Tensor tensor; Tensor tensor;
paddle::platform::float16* buf = paddle::platform::float16* buf =
tensor.mutable_data<paddle::platform::float16>({3}, gpu); tensor.mutable_data<paddle::platform::float16>({3}, gpu);
#ifdef PADDLE_WITH_HIP
hipLaunchKernelGGL(FillNAN, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
#else
FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf); FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
#endif
cuda_ctx->Wait(); cuda_ctx->Wait();
ASSERT_TRUE(TensorContainsNAN(tensor)); ASSERT_TRUE(TensorContainsNAN(tensor));
} }
...@@ -84,7 +92,11 @@ TEST(TensorContainsInf, GPU) { ...@@ -84,7 +92,11 @@ TEST(TensorContainsInf, GPU) {
{ {
Tensor tensor; Tensor tensor;
float* buf = tensor.mutable_data<float>({3}, gpu); float* buf = tensor.mutable_data<float>({3}, gpu);
#ifdef PADDLE_WITH_HIP
hipLaunchKernelGGL(FillInf, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
#else
FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf); FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
#endif
cuda_ctx->Wait(); cuda_ctx->Wait();
ASSERT_TRUE(TensorContainsInf(tensor)); ASSERT_TRUE(TensorContainsInf(tensor));
} }
...@@ -92,7 +104,11 @@ TEST(TensorContainsInf, GPU) { ...@@ -92,7 +104,11 @@ TEST(TensorContainsInf, GPU) {
Tensor tensor; Tensor tensor;
paddle::platform::float16* buf = paddle::platform::float16* buf =
tensor.mutable_data<paddle::platform::float16>({3}, gpu); tensor.mutable_data<paddle::platform::float16>({3}, gpu);
#ifdef PADDLE_WITH_HIP
hipLaunchKernelGGL(FillInf, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
#else
FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf); FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
#endif
cuda_ctx->Wait(); cuda_ctx->Wait();
ASSERT_TRUE(TensorContainsInf(tensor)); ASSERT_TRUE(TensorContainsInf(tensor));
} }
...@@ -107,14 +123,22 @@ TEST(TensorIsfinite, GPU) { ...@@ -107,14 +123,22 @@ TEST(TensorIsfinite, GPU) {
{ {
Tensor tensor; Tensor tensor;
float* buf = tensor.mutable_data<float>({3}, gpu); float* buf = tensor.mutable_data<float>({3}, gpu);
#ifdef PADDLE_WITH_HIP
hipLaunchKernelGGL(FillInf, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
#else
FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf); FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
#endif
cuda_ctx->Wait(); cuda_ctx->Wait();
EXPECT_TRUE(!TensorIsfinite(tensor)); EXPECT_TRUE(!TensorIsfinite(tensor));
} }
{ {
Tensor tensor; Tensor tensor;
float16* buf = tensor.mutable_data<float16>({3}, gpu); float16* buf = tensor.mutable_data<float16>({3}, gpu);
#ifdef PADDLE_WITH_HIP
hipLaunchKernelGGL(FillInf, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
#else
FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf); FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
#endif
cuda_ctx->Wait(); cuda_ctx->Wait();
EXPECT_TRUE(!TensorIsfinite(tensor)); EXPECT_TRUE(!TensorIsfinite(tensor));
} }
...@@ -123,14 +147,22 @@ TEST(TensorIsfinite, GPU) { ...@@ -123,14 +147,22 @@ TEST(TensorIsfinite, GPU) {
{ {
Tensor tensor; Tensor tensor;
float* buf = tensor.mutable_data<float>({3}, gpu); float* buf = tensor.mutable_data<float>({3}, gpu);
#ifdef PADDLE_WITH_HIP
hipLaunchKernelGGL(FillNAN, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
#else
FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf); FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
#endif
cuda_ctx->Wait(); cuda_ctx->Wait();
EXPECT_TRUE(!TensorIsfinite(tensor)); EXPECT_TRUE(!TensorIsfinite(tensor));
} }
{ {
Tensor tensor; Tensor tensor;
float16* buf = tensor.mutable_data<float16>({3}, gpu); float16* buf = tensor.mutable_data<float16>({3}, gpu);
#ifdef PADDLE_WITH_HIP
hipLaunchKernelGGL(FillNAN, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
#else
FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf); FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
#endif
cuda_ctx->Wait(); cuda_ctx->Wait();
EXPECT_TRUE(!TensorIsfinite(tensor)); EXPECT_TRUE(!TensorIsfinite(tensor));
} }
...@@ -139,14 +171,24 @@ TEST(TensorIsfinite, GPU) { ...@@ -139,14 +171,24 @@ TEST(TensorIsfinite, GPU) {
{ {
Tensor tensor; Tensor tensor;
float* buf = tensor.mutable_data<float>({3}, gpu); float* buf = tensor.mutable_data<float>({3}, gpu);
#ifdef PADDLE_WITH_HIP
hipLaunchKernelGGL(FillFinite, dim3(1), dim3(1), 0, cuda_ctx->stream(),
buf);
#else
FillFinite<<<1, 1, 0, cuda_ctx->stream()>>>(buf); FillFinite<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
#endif
cuda_ctx->Wait(); cuda_ctx->Wait();
EXPECT_TRUE(TensorIsfinite(tensor)); EXPECT_TRUE(TensorIsfinite(tensor));
} }
{ {
Tensor tensor; Tensor tensor;
float16* buf = tensor.mutable_data<float16>({3}, gpu); float16* buf = tensor.mutable_data<float16>({3}, gpu);
#ifdef PADDLE_WITH_HIP
hipLaunchKernelGGL(FillFinite, dim3(1), dim3(1), 0, cuda_ctx->stream(),
buf);
#else
FillFinite<<<1, 1, 0, cuda_ctx->stream()>>>(buf); FillFinite<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
#endif
cuda_ctx->Wait(); cuda_ctx->Wait();
EXPECT_TRUE(TensorIsfinite(tensor)); EXPECT_TRUE(TensorIsfinite(tensor));
} }
...@@ -159,7 +201,11 @@ TEST(TensorContainsInf, GPUWithoutWait) { ...@@ -159,7 +201,11 @@ TEST(TensorContainsInf, GPUWithoutWait) {
{ {
Tensor tensor, out; Tensor tensor, out;
float* buf = tensor.mutable_data<float>({3}, gpu); float* buf = tensor.mutable_data<float>({3}, gpu);
#ifdef PADDLE_WITH_HIP
hipLaunchKernelGGL(FillInf, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
#else
FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf); FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
#endif
cuda_ctx->Wait(); cuda_ctx->Wait();
TensorContainsInf(tensor, &out); TensorContainsInf(tensor, &out);
platform::CPUPlace cpu; platform::CPUPlace cpu;
...@@ -172,7 +218,11 @@ TEST(TensorContainsInf, GPUWithoutWait) { ...@@ -172,7 +218,11 @@ TEST(TensorContainsInf, GPUWithoutWait) {
Tensor tensor, out; Tensor tensor, out;
paddle::platform::float16* buf = paddle::platform::float16* buf =
tensor.mutable_data<paddle::platform::float16>({3}, gpu); tensor.mutable_data<paddle::platform::float16>({3}, gpu);
#ifdef PADDLE_WITH_HIP
hipLaunchKernelGGL(FillInf, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
#else
FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf); FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
#endif
cuda_ctx->Wait(); cuda_ctx->Wait();
TensorContainsInf(tensor, &out); TensorContainsInf(tensor, &out);
platform::CPUPlace cpu; platform::CPUPlace cpu;
...@@ -190,7 +240,11 @@ TEST(TensorContainsNAN, GPUWithoutWait) { ...@@ -190,7 +240,11 @@ TEST(TensorContainsNAN, GPUWithoutWait) {
{ {
Tensor tensor, out; Tensor tensor, out;
float* buf = tensor.mutable_data<float>({3}, gpu); float* buf = tensor.mutable_data<float>({3}, gpu);
#ifdef PADDLE_WITH_HIP
hipLaunchKernelGGL(FillNAN, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
#else
FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf); FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
#endif
cuda_ctx->Wait(); cuda_ctx->Wait();
TensorContainsNAN(tensor, &out); TensorContainsNAN(tensor, &out);
platform::CPUPlace cpu; platform::CPUPlace cpu;
...@@ -203,7 +257,11 @@ TEST(TensorContainsNAN, GPUWithoutWait) { ...@@ -203,7 +257,11 @@ TEST(TensorContainsNAN, GPUWithoutWait) {
Tensor tensor, out; Tensor tensor, out;
paddle::platform::float16* buf = paddle::platform::float16* buf =
tensor.mutable_data<paddle::platform::float16>({3}, gpu); tensor.mutable_data<paddle::platform::float16>({3}, gpu);
#ifdef PADDLE_WITH_HIP
hipLaunchKernelGGL(FillNAN, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
#else
FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf); FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
#endif
cuda_ctx->Wait(); cuda_ctx->Wait();
TensorContainsNAN(tensor, &out); TensorContainsNAN(tensor, &out);
platform::CPUPlace cpu; platform::CPUPlace cpu;
...@@ -221,7 +279,11 @@ TEST(TensorIsfinite, GPUWithoutWait) { ...@@ -221,7 +279,11 @@ TEST(TensorIsfinite, GPUWithoutWait) {
{ {
Tensor tensor, out; Tensor tensor, out;
float* buf = tensor.mutable_data<float>({3}, gpu); float* buf = tensor.mutable_data<float>({3}, gpu);
#ifdef PADDLE_WITH_HIP
hipLaunchKernelGGL(FillInf, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
#else
FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf); FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
#endif
cuda_ctx->Wait(); cuda_ctx->Wait();
TensorIsfinite(tensor, &out); TensorIsfinite(tensor, &out);
platform::CPUPlace cpu; platform::CPUPlace cpu;
...@@ -233,7 +295,11 @@ TEST(TensorIsfinite, GPUWithoutWait) { ...@@ -233,7 +295,11 @@ TEST(TensorIsfinite, GPUWithoutWait) {
{ {
Tensor tensor, out; Tensor tensor, out;
float* buf = tensor.mutable_data<float>({3}, gpu); float* buf = tensor.mutable_data<float>({3}, gpu);
#ifdef PADDLE_WITH_HIP
hipLaunchKernelGGL(FillNAN, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
#else
FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf); FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
#endif
cuda_ctx->Wait(); cuda_ctx->Wait();
TensorIsfinite(tensor, &out); TensorIsfinite(tensor, &out);
platform::CPUPlace cpu; platform::CPUPlace cpu;
...@@ -245,7 +311,12 @@ TEST(TensorIsfinite, GPUWithoutWait) { ...@@ -245,7 +311,12 @@ TEST(TensorIsfinite, GPUWithoutWait) {
{ {
Tensor tensor, out; Tensor tensor, out;
float* buf = tensor.mutable_data<float>({3}, gpu); float* buf = tensor.mutable_data<float>({3}, gpu);
#ifdef PADDLE_WITH_HIP
hipLaunchKernelGGL(FillFinite, dim3(1), dim3(1), 0, cuda_ctx->stream(),
buf);
#else
FillFinite<<<1, 1, 0, cuda_ctx->stream()>>>(buf); FillFinite<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
#endif
cuda_ctx->Wait(); cuda_ctx->Wait();
TensorIsfinite(tensor, &out); TensorIsfinite(tensor, &out);
platform::CPUPlace cpu; platform::CPUPlace cpu;
......
...@@ -141,7 +141,8 @@ class DistMultiTrainer : public MultiTrainer { ...@@ -141,7 +141,8 @@ class DistMultiTrainer : public MultiTrainer {
std::shared_ptr<paddle::framework::PullDenseWorker> pull_dense_worker_; std::shared_ptr<paddle::framework::PullDenseWorker> pull_dense_worker_;
}; };
#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_XPU) && \ #if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_HIP || \
defined PADDLE_WITH_XPU) && \
(defined PADDLE_WITH_PSLIB) (defined PADDLE_WITH_PSLIB)
class HeterServiceContext { class HeterServiceContext {
public: public:
...@@ -155,8 +156,9 @@ class HeterServiceContext { ...@@ -155,8 +156,9 @@ class HeterServiceContext {
void Reset() { push_dense_status_.clear(); } void Reset() { push_dense_status_.clear(); }
int place_num_; int place_num_;
Scope* scope_{nullptr}; Scope* scope_{nullptr};
#ifdef PADDLE_WITH_CUDA
cudaEvent_t event_; #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
gpuEvent_t event_;
#endif #endif
std::vector<OperatorBase*> ops_; std::vector<OperatorBase*> ops_;
std::vector<::std::future<int32_t>> push_dense_status_; std::vector<::std::future<int32_t>> push_dense_status_;
...@@ -187,10 +189,10 @@ class HeterXpuTrainer : public TrainerBase { ...@@ -187,10 +189,10 @@ class HeterXpuTrainer : public TrainerBase {
virtual std::string GetDumpPath(int tid) { return ""; } virtual std::string GetDumpPath(int tid) { return ""; }
virtual void InitDumpEnv() {} virtual void InitDumpEnv() {}
template <typename T> template <typename T>
#ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
void HeterMemCpy(LoDTensor* tensor, LoDTensor* root_tensor, void HeterMemCpy(LoDTensor* tensor, LoDTensor* root_tensor,
const paddle::platform::Place& thread_place, const paddle::platform::Place& thread_place,
cudaStream_t stream); gpuStream_t stream);
#endif #endif
#ifdef PADDLE_WITH_XPU #ifdef PADDLE_WITH_XPU
void HeterMemCpy(LoDTensor* thread_tensor, LoDTensor* root_tensor, void HeterMemCpy(LoDTensor* thread_tensor, LoDTensor* root_tensor,
...@@ -222,9 +224,9 @@ class HeterXpuTrainer : public TrainerBase { ...@@ -222,9 +224,9 @@ class HeterXpuTrainer : public TrainerBase {
std::vector<Scope*> place_scopes_; std::vector<Scope*> place_scopes_;
BtObjectPool<HeterServiceContext> object_pool_; BtObjectPool<HeterServiceContext> object_pool_;
std::vector<platform::Place> places_; std::vector<platform::Place> places_;
#ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
std::vector<cudaStream_t> copy_streams_; std::vector<gpuStream_t> copy_streams_;
std::vector<cudaEvent_t> events_; std::vector<gpuEvent_t> events_;
#endif #endif
}; };
...@@ -247,10 +249,10 @@ class HeterBoxTrainer : public TrainerBase { ...@@ -247,10 +249,10 @@ class HeterBoxTrainer : public TrainerBase {
virtual std::string GetDumpPath(int tid) { return ""; } virtual std::string GetDumpPath(int tid) { return ""; }
virtual void InitDumpEnv() {} virtual void InitDumpEnv() {}
template <typename T> template <typename T>
#ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
void HeterMemCpy(LoDTensor* tensor, LoDTensor* root_tensor, void HeterMemCpy(LoDTensor* tensor, LoDTensor* root_tensor,
const paddle::platform::Place& thread_place, const paddle::platform::Place& thread_place,
cudaStream_t stream); gpuStream_t stream);
#endif #endif
void CreateThreadParam(const ProgramDesc& program, int num); void CreateThreadParam(const ProgramDesc& program, int num);
template <typename T> template <typename T>
...@@ -272,14 +274,15 @@ class HeterBoxTrainer : public TrainerBase { ...@@ -272,14 +274,15 @@ class HeterBoxTrainer : public TrainerBase {
std::vector<std::thread> threads_; std::vector<std::thread> threads_;
int use_ps_gpu_; int use_ps_gpu_;
int thread_num_; int thread_num_;
#ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
std::vector<cudaStream_t> copy_streams_; std::vector<gpuStream_t> copy_streams_;
std::vector<cudaEvent_t> events_; std::vector<gpuEvent_t> events_;
#endif #endif
}; };
#endif #endif
#if (defined PADDLE_WITH_NCCL) && (defined PADDLE_WITH_PSLIB) #if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \
(defined PADDLE_WITH_PSLIB)
class PSGPUTrainer : public TrainerBase { class PSGPUTrainer : public TrainerBase {
public: public:
PSGPUTrainer() {} PSGPUTrainer() {}
...@@ -321,7 +324,7 @@ class PSGPUTrainer : public TrainerBase { ...@@ -321,7 +324,7 @@ class PSGPUTrainer : public TrainerBase {
}; };
#endif #endif
#if defined(PADDLE_WITH_NCCL) #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
class PipelineTrainer : public TrainerBase { class PipelineTrainer : public TrainerBase {
public: public:
PipelineTrainer() {} PipelineTrainer() {}
......
...@@ -66,15 +66,17 @@ std::shared_ptr<TrainerBase> TrainerFactory::CreateTrainer( ...@@ -66,15 +66,17 @@ std::shared_ptr<TrainerBase> TrainerFactory::CreateTrainer(
REGISTER_TRAINER_CLASS(MultiTrainer); REGISTER_TRAINER_CLASS(MultiTrainer);
REGISTER_TRAINER_CLASS(DistMultiTrainer); REGISTER_TRAINER_CLASS(DistMultiTrainer);
#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_XPU) && \ #if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_HIP || \
defined PADDLE_WITH_XPU) && \
(defined PADDLE_WITH_PSLIB) (defined PADDLE_WITH_PSLIB)
REGISTER_TRAINER_CLASS(HeterXpuTrainer); REGISTER_TRAINER_CLASS(HeterXpuTrainer);
REGISTER_TRAINER_CLASS(HeterBoxTrainer); REGISTER_TRAINER_CLASS(HeterBoxTrainer);
#endif #endif
#if (defined PADDLE_WITH_NCCL) && (defined PADDLE_WITH_PSLIB) #if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \
(defined PADDLE_WITH_PSLIB)
REGISTER_TRAINER_CLASS(PSGPUTrainer); REGISTER_TRAINER_CLASS(PSGPUTrainer);
#endif #endif
#if defined(PADDLE_WITH_NCCL) #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
REGISTER_TRAINER_CLASS(PipelineTrainer); REGISTER_TRAINER_CLASS(PipelineTrainer);
#endif #endif
} // namespace framework } // namespace framework
......
...@@ -28,6 +28,14 @@ ...@@ -28,6 +28,14 @@
#include "paddle/fluid/operators/conv_cudnn_op_cache.h" #include "paddle/fluid/operators/conv_cudnn_op_cache.h"
#include "paddle/fluid/operators/cudnn_rnn_cache.h" #include "paddle/fluid/operators/cudnn_rnn_cache.h"
#endif #endif
#ifdef PADDLE_WITH_HIP
#if defined(PADDLE_WITH_RCCL)
#include "paddle/fluid/operators/nccl/nccl_gpu_common.h" // NOLINT
#include "paddle/fluid/platform/nccl_helper.h" // NOLINT
#endif
#include "paddle/fluid/operators/conv_cudnn_op_cache.h" // NOLINT
#include "paddle/fluid/operators/miopen_rnn_cache.h"
#endif
#if defined(PADDLE_WITH_XPU_BKCL) #if defined(PADDLE_WITH_XPU_BKCL)
#include "paddle/fluid/platform/bkcl_helper.h" #include "paddle/fluid/platform/bkcl_helper.h"
......
...@@ -30,6 +30,12 @@ ...@@ -30,6 +30,12 @@
#include <nccl.h> #include <nccl.h>
#endif #endif
#endif #endif
#ifdef PADDLE_WITH_HIP
#include <miopen/miopen.h>
#ifdef PADDLE_WITH_RCCL
#include <rccl.h>
#endif
#endif
#if defined(PADDLE_WITH_XPU_BKCL) #if defined(PADDLE_WITH_XPU_BKCL)
#include "xpu/bkcl.h" #include "xpu/bkcl.h"
...@@ -39,8 +45,8 @@ ...@@ -39,8 +45,8 @@
namespace paddle { namespace paddle {
namespace platform { namespace platform {
#ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#if defined(PADDLE_WITH_NCCL) #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
class Communicator; class Communicator;
class NCCLCommunicator; class NCCLCommunicator;
#endif #endif
...@@ -151,8 +157,8 @@ using VarTypeRegistry = detail::VarTypeRegistryImpl< ...@@ -151,8 +157,8 @@ using VarTypeRegistry = detail::VarTypeRegistryImpl<
LoDTensorArray, platform::PlaceList, ReaderHolder, std::string, Scope *, LoDTensorArray, platform::PlaceList, ReaderHolder, std::string, Scope *,
operators::reader::LoDTensorBlockingQueueHolder, FetchList, operators::reader::LoDTensorBlockingQueueHolder, FetchList,
operators::reader::OrderedMultiDeviceLoDTensorBlockingQueueHolder, operators::reader::OrderedMultiDeviceLoDTensorBlockingQueueHolder,
#ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#if defined(PADDLE_WITH_NCCL) #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
ncclUniqueId, platform::Communicator, platform::NCCLCommunicator, ncclUniqueId, platform::Communicator, platform::NCCLCommunicator,
#endif #endif
operators::CudnnRNNCache, operators::CudnnRNNCache,
......
...@@ -28,6 +28,14 @@ ...@@ -28,6 +28,14 @@
#include "paddle/fluid/operators/conv_cudnn_op_cache.h" #include "paddle/fluid/operators/conv_cudnn_op_cache.h"
#include "paddle/fluid/operators/cudnn_rnn_cache.h" #include "paddle/fluid/operators/cudnn_rnn_cache.h"
#endif #endif
#ifdef PADDLE_WITH_HIP
#if defined(PADDLE_WITH_RCCL)
#include "paddle/fluid/operators/nccl/nccl_gpu_common.h" // NOLINT
#include "paddle/fluid/platform/nccl_helper.h" // NOLINT
#endif
#include "paddle/fluid/operators/conv_cudnn_op_cache.h" // NOLINT
#include "paddle/fluid/operators/miopen_rnn_cache.h"
#endif
#if defined(PADDLE_WITH_XPU_BKCL) #if defined(PADDLE_WITH_XPU_BKCL)
#include "paddle/fluid/platform/bkcl_helper.h" #include "paddle/fluid/platform/bkcl_helper.h"
#endif #endif
......
if (NOT WITH_NCCL) if (NOT (WITH_NCCL OR WITH_RCCL))
return() return()
endif() endif()
...@@ -6,12 +6,20 @@ if(WITH_GPU AND NOT WIN32) ...@@ -6,12 +6,20 @@ if(WITH_GPU AND NOT WIN32)
nv_library(nccl_common SRCS nccl_gpu_common.cc DEPS device_context operator ) nv_library(nccl_common SRCS nccl_gpu_common.cc DEPS device_context operator )
endif() endif()
if(WITH_GPU) if(WITH_ROCM AND NOT WIN32)
hip_library(nccl_common SRCS nccl_gpu_common.cc DEPS device_context operator )
endif()
if(WITH_GPU OR WITH_ROCM)
op_library(nccl_op DEPS nccl_common) op_library(nccl_op DEPS nccl_common)
file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(ncclAllReduce);\n") file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(ncclAllReduce);\n")
set(OPERATOR_DEPS ${OPERATOR_DEPS} nccl_common PARENT_SCOPE) set(OPERATOR_DEPS ${OPERATOR_DEPS} nccl_common PARENT_SCOPE)
endif() endif()
if(NOT WIN32) if(WITH_GPU AND NOT WIN32)
nv_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context) nv_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context)
endif() endif()
if(WITH_ROCM AND NOT WIN32)
hip_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context)
endif()
...@@ -23,7 +23,11 @@ limitations under the License. */ ...@@ -23,7 +23,11 @@ limitations under the License. */
#include <vector> #include <vector>
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
#ifdef PADDLE_WITH_RCCL
#include "paddle/fluid/platform/dynload/rccl.h"
#else
#include "paddle/fluid/platform/dynload/nccl.h" #include "paddle/fluid/platform/dynload/nccl.h"
#endif
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/macros.h" #include "paddle/fluid/platform/macros.h"
......
...@@ -3,12 +3,12 @@ set(PYBIND_DEPS pybind python proto_desc memory executor fleet_wrapper box_wrapp ...@@ -3,12 +3,12 @@ set(PYBIND_DEPS pybind python proto_desc memory executor fleet_wrapper box_wrapp
analysis_predictor imperative_profiler imperative_flag save_load_util dlpack_tensor device_context analysis_predictor imperative_profiler imperative_flag save_load_util dlpack_tensor device_context
gloo_wrapper infer_io_utils heter_wrapper generator op_version_registry ps_gpu_wrapper custom_operator) gloo_wrapper infer_io_utils heter_wrapper generator op_version_registry ps_gpu_wrapper custom_operator)
if (WITH_GPU) if (WITH_GPU OR WITH_ROCM)
set(PYBIND_DEPS ${PYBIND_DEPS} dynload_cuda) set(PYBIND_DEPS ${PYBIND_DEPS} dynload_cuda)
set(PYBIND_DEPS ${PYBIND_DEPS} cuda_device_guard) set(PYBIND_DEPS ${PYBIND_DEPS} cuda_device_guard)
endif() endif()
if (WITH_NCCL) if (WITH_NCCL OR WITH_RCCL)
set(PYBIND_DEPS ${PYBIND_DEPS} nccl_wrapper) set(PYBIND_DEPS ${PYBIND_DEPS} nccl_wrapper)
set(PYBIND_DEPS ${PYBIND_DEPS} reducer) set(PYBIND_DEPS ${PYBIND_DEPS} reducer)
endif() endif()
...@@ -21,7 +21,7 @@ endif() ...@@ -21,7 +21,7 @@ endif()
if(NOT WIN32) if(NOT WIN32)
set(PYBIND_DEPS ${PYBIND_DEPS} data_loader) set(PYBIND_DEPS ${PYBIND_DEPS} data_loader)
set(PYBIND_DEPS ${PYBIND_DEPS} mmap_allocator) set(PYBIND_DEPS ${PYBIND_DEPS} mmap_allocator)
if (WITH_NCCL) if (WITH_NCCL OR WITH_RCCL)
set(PYBIND_DEPS ${PYBIND_DEPS} nccl_context) set(PYBIND_DEPS ${PYBIND_DEPS} nccl_context)
endif() endif()
endif(NOT WIN32) endif(NOT WIN32)
...@@ -71,7 +71,7 @@ if (WITH_PSCORE) ...@@ -71,7 +71,7 @@ if (WITH_PSCORE)
list(APPEND PYBIND_SRCS fleet_py.cc) list(APPEND PYBIND_SRCS fleet_py.cc)
endif() endif()
if (WITH_NCCL) if (WITH_NCCL OR WITH_RCCL)
list(APPEND PYBIND_SRCS nccl_wrapper_py.cc) list(APPEND PYBIND_SRCS nccl_wrapper_py.cc)
endif() endif()
...@@ -81,9 +81,9 @@ if(WITH_PYTHON) ...@@ -81,9 +81,9 @@ if(WITH_PYTHON)
list(APPEND OP_FUNCTION_GENERETOR_DEPS ${GLOB_OP_LIB}) list(APPEND OP_FUNCTION_GENERETOR_DEPS ${GLOB_OP_LIB})
list(APPEND OP_FUNCTION_GENERETOR_DEPS ${GLOB_OPERATOR_DEPS}) list(APPEND OP_FUNCTION_GENERETOR_DEPS ${GLOB_OPERATOR_DEPS})
if(WITH_NCCL) if (WITH_NCCL OR WITH_RCCL)
list(APPEND OP_FUNCTION_GENERETOR_DEPS nccl_context) list(APPEND OP_FUNCTION_GENERETOR_DEPS nccl_context)
endif(WITH_NCCL) endif()
if(WITH_XPU_BKCL) if(WITH_XPU_BKCL)
list(APPEND OP_FUNCTION_GENERETOR_DEPS bkcl_context) list(APPEND OP_FUNCTION_GENERETOR_DEPS bkcl_context)
...@@ -93,6 +93,9 @@ if(WITH_PYTHON) ...@@ -93,6 +93,9 @@ if(WITH_PYTHON)
target_link_libraries(op_function_generator ${OP_FUNCTION_GENERETOR_DEPS}) target_link_libraries(op_function_generator ${OP_FUNCTION_GENERETOR_DEPS})
get_property (os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) get_property (os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
target_link_libraries(op_function_generator ${os_dependency_modules}) target_link_libraries(op_function_generator ${os_dependency_modules})
if(WITH_ROCM)
target_link_libraries(op_function_generator ${ROCM_HIPRTC_LIB})
endif()
set(impl_file ${CMAKE_SOURCE_DIR}/paddle/fluid/pybind/op_function_impl.h) set(impl_file ${CMAKE_SOURCE_DIR}/paddle/fluid/pybind/op_function_impl.h)
set(tmp_impl_file ${impl_file}.tmp) set(tmp_impl_file ${impl_file}.tmp)
...@@ -164,20 +167,17 @@ if(WITH_PYTHON) ...@@ -164,20 +167,17 @@ if(WITH_PYTHON)
endif(WITH_MKLDNN) endif(WITH_MKLDNN)
endif(WIN32) endif(WIN32)
if(WITH_ROCM_PLATFORM) cc_library(paddle_pybind SHARED
cc_library(paddle_pybind SHARED SRCS ${PYBIND_SRCS}
SRCS ${PYBIND_SRCS} DEPS ${PYBIND_DEPS}
DEPS ${PYBIND_DEPS} ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS})
${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS}) if(NOT APPLE AND NOT WIN32)
else() target_link_libraries(paddle_pybind rt)
cc_library(paddle_pybind SHARED endif(NOT APPLE AND NOT WIN32)
SRCS ${PYBIND_SRCS}
DEPS ${PYBIND_DEPS} if(WITH_ROCM)
${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS}) target_link_libraries(paddle_pybind ${ROCM_HIPRTC_LIB})
if(NOT APPLE AND NOT WIN32) endif()
target_link_libraries(paddle_pybind rt)
endif(NOT APPLE AND NOT WIN32)
endif(WITH_ROCM_PLATFORM)
get_property (os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) get_property (os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
target_link_libraries(paddle_pybind ${os_dependency_modules}) target_link_libraries(paddle_pybind ${os_dependency_modules})
......
...@@ -66,7 +66,7 @@ DECLARE_bool(benchmark); ...@@ -66,7 +66,7 @@ DECLARE_bool(benchmark);
DECLARE_int32(inner_op_parallelism); DECLARE_int32(inner_op_parallelism);
DECLARE_int32(max_inplace_grad_add); DECLARE_int32(max_inplace_grad_add);
DECLARE_string(tracer_profile_fname); DECLARE_string(tracer_profile_fname);
#ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
// cudnn // cudnn
DECLARE_uint64(conv_workspace_size_limit); DECLARE_uint64(conv_workspace_size_limit);
DECLARE_bool(cudnn_batchnorm_spatial_persistent); DECLARE_bool(cudnn_batchnorm_spatial_persistent);
...@@ -354,7 +354,7 @@ static void RegisterGlobalVarGetterSetter() { ...@@ -354,7 +354,7 @@ static void RegisterGlobalVarGetterSetter() {
FLAGS_paddle_num_threads, FLAGS_use_mkldnn, FLAGS_max_inplace_grad_add, FLAGS_paddle_num_threads, FLAGS_use_mkldnn, FLAGS_max_inplace_grad_add,
FLAGS_tracer_mkldnn_ops_on, FLAGS_tracer_mkldnn_ops_off); FLAGS_tracer_mkldnn_ops_on, FLAGS_tracer_mkldnn_ops_off);
#ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
REGISTER_PUBLIC_GLOBAL_VAR( REGISTER_PUBLIC_GLOBAL_VAR(
FLAGS_gpu_memory_limit_mb, FLAGS_cudnn_deterministic, FLAGS_gpu_memory_limit_mb, FLAGS_cudnn_deterministic,
FLAGS_conv_workspace_size_limit, FLAGS_cudnn_batchnorm_spatial_persistent, FLAGS_conv_workspace_size_limit, FLAGS_cudnn_batchnorm_spatial_persistent,
......
...@@ -966,7 +966,7 @@ void BindImperative(py::module *m_ptr) { ...@@ -966,7 +966,7 @@ void BindImperative(py::module *m_ptr) {
[](imperative::VarBase &self, [](imperative::VarBase &self,
const imperative::ParallelStrategy &strategy) { const imperative::ParallelStrategy &strategy) {
if (strategy.nranks_ > 1) { if (strategy.nranks_ > 1) {
#ifdef PADDLE_WITH_NCCL #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
#if NCCL_VERSION_CODE >= 2212 #if NCCL_VERSION_CODE >= 2212
imperative::AllReduce(self.Var(), self.MutableVar(), strategy); imperative::AllReduce(self.Var(), self.MutableVar(), strategy);
#else #else
...@@ -1016,7 +1016,7 @@ void BindImperative(py::module *m_ptr) { ...@@ -1016,7 +1016,7 @@ void BindImperative(py::module *m_ptr) {
)DOC") )DOC")
.def("pin_memory", .def("pin_memory",
[](const std::shared_ptr<imperative::VarBase> &self) { [](const std::shared_ptr<imperative::VarBase> &self) {
#ifndef PADDLE_WITH_CUDA #if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP)
PADDLE_THROW(platform::errors::PermissionDenied( PADDLE_THROW(platform::errors::PermissionDenied(
"Cannot copy this Tensor to pinned memory in CPU version " "Cannot copy this Tensor to pinned memory in CPU version "
"Paddle, " "Paddle, "
...@@ -1050,7 +1050,7 @@ void BindImperative(py::module *m_ptr) { ...@@ -1050,7 +1050,7 @@ void BindImperative(py::module *m_ptr) {
.def("cuda", .def("cuda",
[](const std::shared_ptr<imperative::VarBase> &self, int device_id, [](const std::shared_ptr<imperative::VarBase> &self, int device_id,
bool blocking) { bool blocking) {
#ifndef PADDLE_WITH_CUDA #if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP)
PADDLE_THROW(platform::errors::PermissionDenied( PADDLE_THROW(platform::errors::PermissionDenied(
"Cannot copy this Tensor to GPU in CPU version Paddle, " "Cannot copy this Tensor to GPU in CPU version Paddle, "
"Please recompile or reinstall Paddle with CUDA support.")); "Please recompile or reinstall Paddle with CUDA support."));
...@@ -1412,7 +1412,8 @@ void BindImperative(py::module *m_ptr) { ...@@ -1412,7 +1412,8 @@ void BindImperative(py::module *m_ptr) {
}, },
py::call_guard<py::gil_scoped_release>()); py::call_guard<py::gil_scoped_release>());
#if (defined PADDLE_WITH_NCCL) || (defined PADDLE_WITH_XPU_BKCL) #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
defined(PADDLE_WITH_XPU_BKCL)
py::class_<imperative::ParallelContext, py::class_<imperative::ParallelContext,
std::shared_ptr<imperative::ParallelContext>>(m, std::shared_ptr<imperative::ParallelContext>>(m,
"ParallelContext"); "ParallelContext");
......
...@@ -32,7 +32,8 @@ namespace py = pybind11; ...@@ -32,7 +32,8 @@ namespace py = pybind11;
namespace paddle { namespace paddle {
namespace pybind { namespace pybind {
#if (defined PADDLE_WITH_NCCL) && (defined PADDLE_WITH_PSLIB) #if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \
(defined PADDLE_WITH_PSLIB)
void BindPSGPUWrapper(py::module* m) { void BindPSGPUWrapper(py::module* m) {
py::class_<framework::PSGPUWrapper, std::shared_ptr<framework::PSGPUWrapper>>( py::class_<framework::PSGPUWrapper, std::shared_ptr<framework::PSGPUWrapper>>(
*m, "PSGPU") *m, "PSGPU")
......
...@@ -22,7 +22,8 @@ namespace py = pybind11; ...@@ -22,7 +22,8 @@ namespace py = pybind11;
namespace paddle { namespace paddle {
namespace pybind { namespace pybind {
#if (defined PADDLE_WITH_NCCL) && (defined PADDLE_WITH_PSLIB) #if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \
(defined PADDLE_WITH_PSLIB)
void BindPSGPUWrapper(py::module* m); void BindPSGPUWrapper(py::module* m);
#endif #endif
} // namespace pybind } // namespace pybind
......
...@@ -86,7 +86,7 @@ limitations under the License. */ ...@@ -86,7 +86,7 @@ limitations under the License. */
#include "paddle/fluid/pybind/ps_gpu_wrapper_py.h" #include "paddle/fluid/pybind/ps_gpu_wrapper_py.h"
#include "paddle/fluid/pybind/pybind_boost_headers.h" #include "paddle/fluid/pybind/pybind_boost_headers.h"
#ifdef PADDLE_WITH_NCCL #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
#include "paddle/fluid/pybind/nccl_wrapper_py.h" #include "paddle/fluid/pybind/nccl_wrapper_py.h"
#endif #endif
#include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/data_type.h"
...@@ -95,11 +95,13 @@ limitations under the License. */ ...@@ -95,11 +95,13 @@ limitations under the License. */
#include "paddle/fluid/pybind/reader_py.h" #include "paddle/fluid/pybind/reader_py.h"
#include "paddle/fluid/pybind/tensor_py.h" #include "paddle/fluid/pybind/tensor_py.h"
#include "paddle/fluid/string/to_string.h" #include "paddle/fluid/string/to_string.h"
#ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#ifdef PADDLE_WITH_NCCL #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
#include "paddle/fluid/operators/nccl/nccl_gpu_common.h" #include "paddle/fluid/operators/nccl/nccl_gpu_common.h"
#endif #endif
#ifndef PADDLE_WITH_HIP
#include "paddle/fluid/platform/cuda_profiler.h" #include "paddle/fluid/platform/cuda_profiler.h"
#endif
#include "paddle/fluid/platform/gpu_info.h" #include "paddle/fluid/platform/gpu_info.h"
#endif #endif
...@@ -128,7 +130,15 @@ PYBIND11_MAKE_OPAQUE(paddle::framework::FetchType); ...@@ -128,7 +130,15 @@ PYBIND11_MAKE_OPAQUE(paddle::framework::FetchType);
namespace paddle { namespace paddle {
namespace pybind { namespace pybind {
bool IsCompiledWithCUDA() { bool IsCompiledWithCUDA() {
#ifndef PADDLE_WITH_CUDA #if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP)
return false;
#else
return true;
#endif
}
bool IsCompiledWithROCM() {
#ifndef PADDLE_WITH_HIP
return false; return false;
#else #else
return true; return true;
...@@ -389,7 +399,7 @@ PYBIND11_MODULE(core_noavx, m) { ...@@ -389,7 +399,7 @@ PYBIND11_MODULE(core_noavx, m) {
m.def("set_num_threads", &platform::SetNumThreads); m.def("set_num_threads", &platform::SetNumThreads);
#ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
m.def("cudnn_version", &platform::CudnnVersion); m.def("cudnn_version", &platform::CudnnVersion);
#endif #endif
...@@ -403,7 +413,7 @@ PYBIND11_MODULE(core_noavx, m) { ...@@ -403,7 +413,7 @@ PYBIND11_MODULE(core_noavx, m) {
if (dl.ctx.device_type == kDLCPU) { if (dl.ctx.device_type == kDLCPU) {
paddle::framework::TensorFromDLPack(dl, &tensor); paddle::framework::TensorFromDLPack(dl, &tensor);
} }
#ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (dl.ctx.device_type == kDLGPU) { if (dl.ctx.device_type == kDLGPU) {
paddle::framework::TensorFromDLPack(dl, &tensor); paddle::framework::TensorFromDLPack(dl, &tensor);
} }
...@@ -1060,7 +1070,7 @@ PYBIND11_MODULE(core_noavx, m) { ...@@ -1060,7 +1070,7 @@ PYBIND11_MODULE(core_noavx, m) {
.def("height", &SelectedRows::height) .def("height", &SelectedRows::height)
.def("set_rows", .def("set_rows",
[](SelectedRows &self, std::vector<int64_t> rows) { [](SelectedRows &self, std::vector<int64_t> rows) {
#ifndef PADDLE_WITH_CUDA #if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP)
self.set_rows(rows); self.set_rows(rows);
#else #else
Vector<int64_t> new_rows(rows); Vector<int64_t> new_rows(rows);
...@@ -1354,7 +1364,7 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -1354,7 +1364,7 @@ All parameter, weight, gradient are variables in Paddle.
.def_static("create", .def_static("create",
[](paddle::platform::CUDAPlace& place) [](paddle::platform::CUDAPlace& place)
-> paddle::platform::DeviceContext* { -> paddle::platform::DeviceContext* {
#ifndef PADDLE_WITH_CUDA #if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP)
PADDLE_THROW( PADDLE_THROW(
platform::errors::PermissionDenied( platform::errors::PermissionDenied(
"Cannot use CUDAPlace in CPU only version, " "Cannot use CUDAPlace in CPU only version, "
...@@ -1366,7 +1376,7 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -1366,7 +1376,7 @@ All parameter, weight, gradient are variables in Paddle.
.def_static("create", .def_static("create",
[](paddle::platform::CUDAPinnedPlace& place) [](paddle::platform::CUDAPinnedPlace& place)
-> paddle::platform::DeviceContext* { -> paddle::platform::DeviceContext* {
#ifndef PADDLE_WITH_CUDA #if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP)
PADDLE_THROW( PADDLE_THROW(
platform::errors::PermissionDenied( platform::errors::PermissionDenied(
"Cannot use CUDAPinnedPlace in CPU only version, " "Cannot use CUDAPinnedPlace in CPU only version, "
...@@ -1376,7 +1386,7 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -1376,7 +1386,7 @@ All parameter, weight, gradient are variables in Paddle.
#endif #endif
});; });;
// clang-format on // clang-format on
#if defined(PADDLE_WITH_NCCL) #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
py::class_<platform::Communicator>(m, "Communicator").def(py::init<>()); py::class_<platform::Communicator>(m, "Communicator").def(py::init<>());
#endif #endif
py::class_<platform::CUDAPlace>(m, "CUDAPlace", R"DOC( py::class_<platform::CUDAPlace>(m, "CUDAPlace", R"DOC(
...@@ -1405,7 +1415,7 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -1405,7 +1415,7 @@ All parameter, weight, gradient are variables in Paddle.
)DOC") )DOC")
.def("__init__", .def("__init__",
[](platform::CUDAPlace &self, int dev_id) { [](platform::CUDAPlace &self, int dev_id) {
#ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (UNLIKELY(dev_id < 0)) { if (UNLIKELY(dev_id < 0)) {
LOG(ERROR) << string::Sprintf( LOG(ERROR) << string::Sprintf(
"Invalid CUDAPlace(%d), device id must be 0 or " "Invalid CUDAPlace(%d), device id must be 0 or "
...@@ -1443,7 +1453,7 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -1443,7 +1453,7 @@ All parameter, weight, gradient are variables in Paddle.
std::exit(-1); std::exit(-1);
#endif #endif
}) })
#ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
.def("get_device_id", .def("get_device_id",
[](const platform::CUDAPlace &self) { return self.GetDeviceId(); }) [](const platform::CUDAPlace &self) { return self.GetDeviceId(); })
.def("_type", &PlaceIndex<platform::CUDAPlace>) .def("_type", &PlaceIndex<platform::CUDAPlace>)
...@@ -1559,7 +1569,7 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -1559,7 +1569,7 @@ All parameter, weight, gradient are variables in Paddle.
)DOC") )DOC")
.def("__init__", .def("__init__",
[](platform::CUDAPinnedPlace &self) { [](platform::CUDAPinnedPlace &self) {
#ifndef PADDLE_WITH_CUDA #if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP)
PADDLE_THROW(platform::errors::PermissionDenied( PADDLE_THROW(platform::errors::PermissionDenied(
"Cannot use CUDAPinnedPlace in CPU only version, " "Cannot use CUDAPinnedPlace in CPU only version, "
"Please recompile or reinstall Paddle with CUDA support.")); "Please recompile or reinstall Paddle with CUDA support."));
...@@ -1749,6 +1759,7 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -1749,6 +1759,7 @@ All parameter, weight, gradient are variables in Paddle.
m.def("init_devices", []() { framework::InitDevices(); }); m.def("init_devices", []() { framework::InitDevices(); });
m.def("is_compiled_with_cuda", IsCompiledWithCUDA); m.def("is_compiled_with_cuda", IsCompiledWithCUDA);
m.def("is_compiled_with_rocm", IsCompiledWithROCM);
m.def("is_compiled_with_xpu", IsCompiledWithXPU); m.def("is_compiled_with_xpu", IsCompiledWithXPU);
m.def("is_compiled_with_mkldnn", IsCompiledWithMKLDNN); m.def("is_compiled_with_mkldnn", IsCompiledWithMKLDNN);
m.def("supports_bfloat16", SupportsBfloat16); m.def("supports_bfloat16", SupportsBfloat16);
...@@ -1793,7 +1804,7 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -1793,7 +1804,7 @@ All parameter, weight, gradient are variables in Paddle.
py::arg("cmd"), py::arg("time_out") = 0, py::arg("sleep_inter") = 0, py::arg("cmd"), py::arg("time_out") = 0, py::arg("sleep_inter") = 0,
py::arg("redirect_stderr") = false); py::arg("redirect_stderr") = false);
#ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
m.def("is_float16_supported", [](const platform::CUDAPlace &place) -> bool { m.def("is_float16_supported", [](const platform::CUDAPlace &place) -> bool {
// Only GPUs with Compute Capability >= 53 support float16 // Only GPUs with Compute Capability >= 53 support float16
return platform::GetCUDAComputeCapability(place.device) >= 53; return platform::GetCUDAComputeCapability(place.device) >= 53;
...@@ -1967,10 +1978,10 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -1967,10 +1978,10 @@ All parameter, weight, gradient are variables in Paddle.
py::return_value_policy::take_ownership); py::return_value_policy::take_ownership);
m.def("op_support_gpu", OpSupportGPU); m.def("op_support_gpu", OpSupportGPU);
#ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
m.def("get_cuda_device_count", platform::GetCUDADeviceCount); m.def("get_cuda_device_count", platform::GetCUDADeviceCount);
#ifndef _WIN32 #if !defined(PADDLE_WITH_HIP) && !defined(_WIN32)
m.def("nvprof_init", platform::CudaProfilerInit); m.def("nvprof_init", platform::CudaProfilerInit);
m.def("nvprof_start", platform::CudaProfilerStart); m.def("nvprof_start", platform::CudaProfilerStart);
m.def("nvprof_stop", platform::CudaProfilerStop); m.def("nvprof_stop", platform::CudaProfilerStop);
...@@ -2015,7 +2026,7 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -2015,7 +2026,7 @@ All parameter, weight, gradient are variables in Paddle.
m.def("size_of_dtype", framework::SizeOfType); m.def("size_of_dtype", framework::SizeOfType);
#ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
m.def("set_cublas_switch", platform::SetAllowTF32Cublas); m.def("set_cublas_switch", platform::SetAllowTF32Cublas);
m.def("get_cublas_switch", platform::AllowTF32Cublas); m.def("get_cublas_switch", platform::AllowTF32Cublas);
m.def("set_cudnn_switch", platform::SetAllowTF32Cudnn); m.def("set_cudnn_switch", platform::SetAllowTF32Cudnn);
...@@ -2847,7 +2858,8 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -2847,7 +2858,8 @@ All parameter, weight, gradient are variables in Paddle.
#ifdef PADDLE_WITH_PSLIB #ifdef PADDLE_WITH_PSLIB
BindHeterWrapper(&m); BindHeterWrapper(&m);
#endif #endif
#if (defined PADDLE_WITH_NCCL) && (defined PADDLE_WITH_PSLIB) #if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \
(defined PADDLE_WITH_PSLIB)
BindPSGPUWrapper(&m); BindPSGPUWrapper(&m);
#endif #endif
BindGlooWrapper(&m); BindGlooWrapper(&m);
...@@ -2855,7 +2867,7 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -2855,7 +2867,7 @@ All parameter, weight, gradient are variables in Paddle.
#ifdef PADDLE_WITH_BOX_PS #ifdef PADDLE_WITH_BOX_PS
BindBoxWrapper(&m); BindBoxWrapper(&m);
#endif #endif
#ifdef PADDLE_WITH_NCCL #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
BindNCCLWrapper(&m); BindNCCLWrapper(&m);
#endif #endif
#ifdef PADDLE_WITH_GLOO #ifdef PADDLE_WITH_GLOO
......
...@@ -27,7 +27,7 @@ limitations under the License. */ ...@@ -27,7 +27,7 @@ limitations under the License. */
#include "paddle/fluid/operators/math/concat_and_split.h" #include "paddle/fluid/operators/math/concat_and_split.h"
#include "paddle/fluid/operators/strided_memcpy.h" #include "paddle/fluid/operators/strided_memcpy.h"
#include "paddle/fluid/platform/bfloat16.h" #include "paddle/fluid/platform/bfloat16.h"
#ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/fluid/platform/cuda_device_guard.h" #include "paddle/fluid/platform/cuda_device_guard.h"
#endif #endif
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
...@@ -226,7 +226,7 @@ T TensorGetElement(const framework::Tensor &self, size_t offset) { ...@@ -226,7 +226,7 @@ T TensorGetElement(const framework::Tensor &self, size_t offset) {
paddle::memory::Copy(platform::CPUPlace(), &b, p, a + offset, sizeof(T)); paddle::memory::Copy(platform::CPUPlace(), &b, p, a + offset, sizeof(T));
#endif #endif
} else if (platform::is_gpu_place(self.place())) { } else if (platform::is_gpu_place(self.place())) {
#ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
const T *a = self.data<T>(); const T *a = self.data<T>();
auto p = BOOST_GET_CONST(platform::CUDAPlace, self.place()); auto p = BOOST_GET_CONST(platform::CUDAPlace, self.place());
paddle::memory::Copy(platform::CPUPlace(), &b, p, a + offset, sizeof(T), paddle::memory::Copy(platform::CPUPlace(), &b, p, a + offset, sizeof(T),
...@@ -250,7 +250,7 @@ void TensorSetElement(framework::Tensor *self, size_t offset, T elem) { ...@@ -250,7 +250,7 @@ void TensorSetElement(framework::Tensor *self, size_t offset, T elem) {
paddle::memory::Copy(p, a + offset, platform::CPUPlace(), &elem, sizeof(T)); paddle::memory::Copy(p, a + offset, platform::CPUPlace(), &elem, sizeof(T));
#endif #endif
} else if (platform::is_gpu_place(self->place())) { } else if (platform::is_gpu_place(self->place())) {
#ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
auto p = BOOST_GET_CONST(platform::CUDAPlace, self->place()); auto p = BOOST_GET_CONST(platform::CUDAPlace, self->place());
T *a = self->mutable_data<T>(p); T *a = self->mutable_data<T>(p);
paddle::memory::Copy(p, a + offset, platform::CPUPlace(), &elem, sizeof(T), paddle::memory::Copy(p, a + offset, platform::CPUPlace(), &elem, sizeof(T),
...@@ -296,7 +296,7 @@ void SetTensorFromPyArrayT( ...@@ -296,7 +296,7 @@ void SetTensorFromPyArrayT(
"Please recompile or reinstall Paddle with XPU support.")); "Please recompile or reinstall Paddle with XPU support."));
#endif #endif
} else { } else {
#ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (paddle::platform::is_gpu_place(place)) { if (paddle::platform::is_gpu_place(place)) {
// NOTE(wangxi): When copying data to the accelerator card, // NOTE(wangxi): When copying data to the accelerator card,
// we need set_device(dev_id) first. // we need set_device(dev_id) first.
...@@ -304,8 +304,13 @@ void SetTensorFromPyArrayT( ...@@ -304,8 +304,13 @@ void SetTensorFromPyArrayT(
platform::CUDADeviceGuard guard( platform::CUDADeviceGuard guard(
BOOST_GET_CONST(platform::CUDAPlace, tmp_place).device); BOOST_GET_CONST(platform::CUDAPlace, tmp_place).device);
auto dst = self->mutable_data<T>(place); auto dst = self->mutable_data<T>(place);
#ifdef PADDLE_WITH_HIP
paddle::platform::GpuMemcpySync(dst, array.data(), array.nbytes(),
hipMemcpyHostToDevice);
#else
paddle::platform::GpuMemcpySync(dst, array.data(), array.nbytes(), paddle::platform::GpuMemcpySync(dst, array.data(), array.nbytes(),
cudaMemcpyHostToDevice); cudaMemcpyHostToDevice);
#endif
} else if (paddle::platform::is_cuda_pinned_place(place)) { } else if (paddle::platform::is_cuda_pinned_place(place)) {
auto dst = self->mutable_data<T>(place); auto dst = self->mutable_data<T>(place);
...@@ -474,7 +479,7 @@ inline framework::Tensor *_getTensor(const framework::Tensor &self, ...@@ -474,7 +479,7 @@ inline framework::Tensor *_getTensor(const framework::Tensor &self,
self.type()); self.type());
#endif #endif
} else { } else {
#ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (platform::is_cuda_pinned_place(place)) { if (platform::is_cuda_pinned_place(place)) {
output->mutable_data(BOOST_GET_CONST(platform::CUDAPinnedPlace, place), output->mutable_data(BOOST_GET_CONST(platform::CUDAPinnedPlace, place),
self.type()); self.type());
...@@ -707,7 +712,7 @@ inline py::array TensorToPyArray(const framework::Tensor &tensor, ...@@ -707,7 +712,7 @@ inline py::array TensorToPyArray(const framework::Tensor &tensor,
"Please recompile or reinstall Paddle with XPU support.")); "Please recompile or reinstall Paddle with XPU support."));
#endif #endif
} else if (is_gpu_tensor) { } else if (is_gpu_tensor) {
#ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
py::array py_arr(py::dtype(py_dtype_str.c_str()), py_dims, py_strides); py::array py_arr(py::dtype(py_dtype_str.c_str()), py_dims, py_strides);
PADDLE_ENFORCE_EQ(py_arr.writeable(), true, PADDLE_ENFORCE_EQ(py_arr.writeable(), true,
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册