Unverified commit 28b356b9, authored by Qi Li, committed by GitHub

[ROCM] update fluid framework for rocm (part6), test=develop (#31015)

Parent c8fac5ee
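The diff below applies one pattern throughout the fluid framework: every `#ifdef PADDLE_WITH_CUDA` guard that should also cover ROCm builds becomes `#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)`, CUDA-specific `cudaStream_t`/`cudaEvent_t` members become the platform-neutral `gpuStream_t`/`gpuEvent_t`, NCCL-only guards gain `PADDLE_WITH_RCCL`, and triple-chevron kernel launches gain a `hipLaunchKernelGGL` branch for HIP. The following minimal, self-contained sketch illustrates that pattern; the `FillNAN` kernel, the `LaunchFillNAN` helper, and the local alias definitions are illustrative assumptions (Paddle defines the `gpu*` aliases in its own platform headers), not code taken from this patch.

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)

#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
using gpuStream_t = hipStream_t;   // ROCm build
using gpuEvent_t = hipEvent_t;
#else
#include <cuda_runtime.h>
using gpuStream_t = cudaStream_t;  // CUDA build
using gpuEvent_t = cudaEvent_t;
#endif

// Illustrative kernel: writes a NaN so tests like TensorContainsNAN have
// something to detect.
__global__ void FillNAN(float* buf) { buf[0] = nanf(""); }

// A single launch helper instead of repeating the #ifdef at every call site;
// the patch itself inlines this choice at each launch in the tests.
inline void LaunchFillNAN(float* buf, gpuStream_t stream) {
#ifdef PADDLE_WITH_HIP
  hipLaunchKernelGGL(FillNAN, dim3(1), dim3(1), 0, stream, buf);
#else
  FillNAN<<<1, 1, 0, stream>>>(buf);
#endif
}

#endif  // PADDLE_WITH_CUDA || PADDLE_WITH_HIP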
......@@ -118,7 +118,7 @@ TEST(Tensor, MutableData) {
EXPECT_EQ(static_cast<int>(p2[0]), 1);
}
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
{
framework::Tensor src_tensor;
float* p1 = nullptr;
......@@ -174,7 +174,7 @@ TEST(Tensor, ShareDataWith) {
ASSERT_EQ(src_tensor.data<int>(), dst_tensor.data<int>());
}
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
{
framework::Tensor src_tensor;
framework::Tensor dst_tensor;
......@@ -212,7 +212,7 @@ TEST(Tensor, Slice) {
EXPECT_EQ(src_data_address + 3 * 4 * 1 * sizeof(int), slice_data_address);
}
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
{
framework::Tensor src_tensor;
src_tensor.mutable_data<double>(framework::make_ddim({6, 9}),
......
......@@ -97,7 +97,7 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
"Copy from %s to %s is not supported.", src_place, dst_place));
}
#endif
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
else if (platform::is_cuda_pinned_place(src_place) && // NOLINT
platform::is_cuda_pinned_place(dst_place)) {
memory::Copy(BOOST_GET_CONST(platform::CUDAPinnedPlace, dst_place), dst_ptr,
......@@ -304,7 +304,7 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
"Copy from %s to %s is not supported.", src_place, dst_place));
}
#endif
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
else if (platform::is_cuda_pinned_place(src_place) && // NOLINT
platform::is_cuda_pinned_place(dst_place)) {
memory::Copy(BOOST_GET_CONST(platform::CUDAPinnedPlace, dst_place), dst_ptr,
......@@ -595,7 +595,7 @@ bool TensorIsfinite(const framework::Tensor& tensor) {
return !Any(tensor, pred_inf) && !Any(tensor, pred_nan);
}
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
template <typename T>
static inline void __global__ BothFalse(const T* cmp, T* out, int element_num) {
CUDA_KERNEL_LOOP(i, element_num) { out[i] = (!cmp[i]) && (!out[i]); }
......@@ -618,7 +618,7 @@ struct BothFalseVisitor : public boost::static_visitor<> {
}
void VisitorImpl(const platform::CUDAPlace& gpu) const {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
auto* ctx = platform::DeviceContextPool::Instance().GetByPlace(gpu);
constexpr int MAX_BLOCK_DIM = 512;
const int MAX_GRID_DIM = ctx->GetMaxPhysicalThreadCount() / MAX_BLOCK_DIM;
......@@ -703,7 +703,7 @@ void TensorToStream(std::ostream& os, const Tensor& tensor,
platform::errors::ResourceExhausted(
"tensor size %d overflow when writing tensor", size));
if (platform::is_gpu_place(tensor.place())) {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
constexpr size_t kBufSize = 1024 * 1024 * 64; // 64MB
std::unique_ptr<char[]> buf(new char[kBufSize]);
auto& gpu_dev_ctx =
......@@ -802,7 +802,8 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
size_t size = tensor->numel() * framework::SizeOfType(desc.data_type());
if (platform::is_gpu_place(dev_ctx.GetPlace()) ||
platform::is_xpu_place(dev_ctx.GetPlace())) {
#if defined PADDLE_WITH_CUDA || defined PADDLE_WITH_XPU
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
defined(PADDLE_WITH_XPU)
Tensor cpu_tensor;
cpu_tensor.Resize(framework::make_ddim(shape));
framework::VisitDataType(
......@@ -859,7 +860,8 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
size_t size = tensor->numel() * framework::SizeOfType(desc.data_type());
if (platform::is_gpu_place(dev_ctx.GetPlace()) ||
platform::is_xpu_place(dev_ctx.GetPlace())) {
#if defined PADDLE_WITH_CUDA || defined PADDLE_WITH_XPU
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
defined(PADDLE_WITH_XPU)
Tensor cpu_tensor;
cpu_tensor.Resize(framework::make_ddim(dims));
framework::VisitDataType(
......@@ -954,7 +956,7 @@ void TensorFromDLPack(const ::DLTensor& dl_tensor, framework::Tensor* dst) {
if (dl_tensor.ctx.device_type == kDLCPU) {
memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
}
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (dl_tensor.ctx.device_type == kDLGPU) {
platform::CUDAPlace dst_place =
platform::CUDAPlace(dl_tensor.ctx.device_id);
......
......@@ -127,7 +127,7 @@ void TensorFromArray(const T* src, const size_t& array_size,
memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr,
src_place, src_ptr, size);
}
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
else if (platform::is_gpu_place(dst_place)) { // NOLINT
memory::Copy(
BOOST_GET_CONST(platform::CUDAPlace, dst_place), dst_ptr, src_place,
......@@ -150,7 +150,7 @@ void TensorFromVector(const std::vector<T>& src,
memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr,
src_place, src_ptr, size);
}
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
else if (platform::is_gpu_place(dst_place)) { // NOLINT
memory::Copy(
BOOST_GET_CONST(platform::CUDAPlace, dst_place), dst_ptr, src_place,
......@@ -187,7 +187,7 @@ void TensorToVector(const Tensor& src, const platform::DeviceContext& ctx,
BOOST_GET_CONST(platform::CPUPlace, src.place()), src_ptr,
size);
}
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
else if (platform::is_gpu_place(src.place())) { // NOLINT
memory::Copy(
dst_place, dst_ptr, BOOST_GET_CONST(platform::CUDAPlace, src.place()),
......
......@@ -58,7 +58,7 @@ TEST(TensorCopy, Tensor) {
}
EXPECT_TRUE(dst_tensor.layout() == src_tensor.layout());
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
{
Tensor src_tensor;
Tensor gpu_tensor;
......@@ -149,7 +149,7 @@ TEST(TensorFromVector, Tensor) {
delete cpu_place;
}
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
{
std::vector<int> src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9};
paddle::framework::Tensor cpu_tensor;
......@@ -224,7 +224,7 @@ TEST(TensorToVector, Tensor) {
EXPECT_EQ(src_ptr[i], dst[i]);
}
}
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
{
std::vector<int> src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9};
paddle::framework::Tensor gpu_tensor;
......@@ -264,7 +264,7 @@ TEST(TensorFromDLPack, Tensor) {
}
}
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
{
std::vector<int> src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9};
paddle::framework::Tensor cpu_tensor;
......@@ -430,7 +430,7 @@ TEST(Tensor, FromAndToStream) {
EXPECT_EQ(dst_tensor.dims(), src_tensor.dims());
delete place;
}
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
{
Tensor gpu_tensor;
gpu_tensor.Resize({2, 3});
......
......@@ -63,7 +63,11 @@ TEST(TensorContainsNAN, GPU) {
{
Tensor tensor;
float* buf = tensor.mutable_data<float>({3}, gpu);
#ifdef PADDLE_WITH_HIP
hipLaunchKernelGGL(FillNAN, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
#else
FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
#endif
cuda_ctx->Wait();
ASSERT_TRUE(TensorContainsNAN(tensor));
}
......@@ -71,7 +75,11 @@ TEST(TensorContainsNAN, GPU) {
Tensor tensor;
paddle::platform::float16* buf =
tensor.mutable_data<paddle::platform::float16>({3}, gpu);
#ifdef PADDLE_WITH_HIP
hipLaunchKernelGGL(FillNAN, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
#else
FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
#endif
cuda_ctx->Wait();
ASSERT_TRUE(TensorContainsNAN(tensor));
}
......@@ -84,7 +92,11 @@ TEST(TensorContainsInf, GPU) {
{
Tensor tensor;
float* buf = tensor.mutable_data<float>({3}, gpu);
#ifdef PADDLE_WITH_HIP
hipLaunchKernelGGL(FillInf, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
#else
FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
#endif
cuda_ctx->Wait();
ASSERT_TRUE(TensorContainsInf(tensor));
}
......@@ -92,7 +104,11 @@ TEST(TensorContainsInf, GPU) {
Tensor tensor;
paddle::platform::float16* buf =
tensor.mutable_data<paddle::platform::float16>({3}, gpu);
#ifdef PADDLE_WITH_HIP
hipLaunchKernelGGL(FillInf, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
#else
FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
#endif
cuda_ctx->Wait();
ASSERT_TRUE(TensorContainsInf(tensor));
}
......@@ -107,14 +123,22 @@ TEST(TensorIsfinite, GPU) {
{
Tensor tensor;
float* buf = tensor.mutable_data<float>({3}, gpu);
#ifdef PADDLE_WITH_HIP
hipLaunchKernelGGL(FillInf, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
#else
FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
#endif
cuda_ctx->Wait();
EXPECT_TRUE(!TensorIsfinite(tensor));
}
{
Tensor tensor;
float16* buf = tensor.mutable_data<float16>({3}, gpu);
#ifdef PADDLE_WITH_HIP
hipLaunchKernelGGL(FillInf, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
#else
FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
#endif
cuda_ctx->Wait();
EXPECT_TRUE(!TensorIsfinite(tensor));
}
......@@ -123,14 +147,22 @@ TEST(TensorIsfinite, GPU) {
{
Tensor tensor;
float* buf = tensor.mutable_data<float>({3}, gpu);
#ifdef PADDLE_WITH_HIP
hipLaunchKernelGGL(FillNAN, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
#else
FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
#endif
cuda_ctx->Wait();
EXPECT_TRUE(!TensorIsfinite(tensor));
}
{
Tensor tensor;
float16* buf = tensor.mutable_data<float16>({3}, gpu);
#ifdef PADDLE_WITH_HIP
hipLaunchKernelGGL(FillNAN, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
#else
FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
#endif
cuda_ctx->Wait();
EXPECT_TRUE(!TensorIsfinite(tensor));
}
......@@ -139,14 +171,24 @@ TEST(TensorIsfinite, GPU) {
{
Tensor tensor;
float* buf = tensor.mutable_data<float>({3}, gpu);
#ifdef PADDLE_WITH_HIP
hipLaunchKernelGGL(FillFinite, dim3(1), dim3(1), 0, cuda_ctx->stream(),
buf);
#else
FillFinite<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
#endif
cuda_ctx->Wait();
EXPECT_TRUE(TensorIsfinite(tensor));
}
{
Tensor tensor;
float16* buf = tensor.mutable_data<float16>({3}, gpu);
#ifdef PADDLE_WITH_HIP
hipLaunchKernelGGL(FillFinite, dim3(1), dim3(1), 0, cuda_ctx->stream(),
buf);
#else
FillFinite<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
#endif
cuda_ctx->Wait();
EXPECT_TRUE(TensorIsfinite(tensor));
}
......@@ -159,7 +201,11 @@ TEST(TensorContainsInf, GPUWithoutWait) {
{
Tensor tensor, out;
float* buf = tensor.mutable_data<float>({3}, gpu);
#ifdef PADDLE_WITH_HIP
hipLaunchKernelGGL(FillInf, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
#else
FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
#endif
cuda_ctx->Wait();
TensorContainsInf(tensor, &out);
platform::CPUPlace cpu;
......@@ -172,7 +218,11 @@ TEST(TensorContainsInf, GPUWithoutWait) {
Tensor tensor, out;
paddle::platform::float16* buf =
tensor.mutable_data<paddle::platform::float16>({3}, gpu);
#ifdef PADDLE_WITH_HIP
hipLaunchKernelGGL(FillInf, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
#else
FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
#endif
cuda_ctx->Wait();
TensorContainsInf(tensor, &out);
platform::CPUPlace cpu;
......@@ -190,7 +240,11 @@ TEST(TensorContainsNAN, GPUWithoutWait) {
{
Tensor tensor, out;
float* buf = tensor.mutable_data<float>({3}, gpu);
#ifdef PADDLE_WITH_HIP
hipLaunchKernelGGL(FillNAN, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
#else
FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
#endif
cuda_ctx->Wait();
TensorContainsNAN(tensor, &out);
platform::CPUPlace cpu;
......@@ -203,7 +257,11 @@ TEST(TensorContainsNAN, GPUWithoutWait) {
Tensor tensor, out;
paddle::platform::float16* buf =
tensor.mutable_data<paddle::platform::float16>({3}, gpu);
#ifdef PADDLE_WITH_HIP
hipLaunchKernelGGL(FillNAN, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
#else
FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
#endif
cuda_ctx->Wait();
TensorContainsNAN(tensor, &out);
platform::CPUPlace cpu;
......@@ -221,7 +279,11 @@ TEST(TensorIsfinite, GPUWithoutWait) {
{
Tensor tensor, out;
float* buf = tensor.mutable_data<float>({3}, gpu);
#ifdef PADDLE_WITH_HIP
hipLaunchKernelGGL(FillInf, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
#else
FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
#endif
cuda_ctx->Wait();
TensorIsfinite(tensor, &out);
platform::CPUPlace cpu;
......@@ -233,7 +295,11 @@ TEST(TensorIsfinite, GPUWithoutWait) {
{
Tensor tensor, out;
float* buf = tensor.mutable_data<float>({3}, gpu);
#ifdef PADDLE_WITH_HIP
hipLaunchKernelGGL(FillNAN, dim3(1), dim3(1), 0, cuda_ctx->stream(), buf);
#else
FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
#endif
cuda_ctx->Wait();
TensorIsfinite(tensor, &out);
platform::CPUPlace cpu;
......@@ -245,7 +311,12 @@ TEST(TensorIsfinite, GPUWithoutWait) {
{
Tensor tensor, out;
float* buf = tensor.mutable_data<float>({3}, gpu);
#ifdef PADDLE_WITH_HIP
hipLaunchKernelGGL(FillFinite, dim3(1), dim3(1), 0, cuda_ctx->stream(),
buf);
#else
FillFinite<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
#endif
cuda_ctx->Wait();
TensorIsfinite(tensor, &out);
platform::CPUPlace cpu;
......
......@@ -141,7 +141,8 @@ class DistMultiTrainer : public MultiTrainer {
std::shared_ptr<paddle::framework::PullDenseWorker> pull_dense_worker_;
};
#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_XPU) && \
#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_HIP || \
defined PADDLE_WITH_XPU) && \
(defined PADDLE_WITH_PSLIB)
class HeterServiceContext {
public:
......@@ -155,8 +156,9 @@ class HeterServiceContext {
void Reset() { push_dense_status_.clear(); }
int place_num_;
Scope* scope_{nullptr};
#ifdef PADDLE_WITH_CUDA
cudaEvent_t event_;
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
gpuEvent_t event_;
#endif
std::vector<OperatorBase*> ops_;
std::vector<::std::future<int32_t>> push_dense_status_;
......@@ -187,10 +189,10 @@ class HeterXpuTrainer : public TrainerBase {
virtual std::string GetDumpPath(int tid) { return ""; }
virtual void InitDumpEnv() {}
template <typename T>
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
void HeterMemCpy(LoDTensor* tensor, LoDTensor* root_tensor,
const paddle::platform::Place& thread_place,
cudaStream_t stream);
gpuStream_t stream);
#endif
#ifdef PADDLE_WITH_XPU
void HeterMemCpy(LoDTensor* thread_tensor, LoDTensor* root_tensor,
......@@ -222,9 +224,9 @@ class HeterXpuTrainer : public TrainerBase {
std::vector<Scope*> place_scopes_;
BtObjectPool<HeterServiceContext> object_pool_;
std::vector<platform::Place> places_;
#ifdef PADDLE_WITH_CUDA
std::vector<cudaStream_t> copy_streams_;
std::vector<cudaEvent_t> events_;
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
std::vector<gpuStream_t> copy_streams_;
std::vector<gpuEvent_t> events_;
#endif
};
......@@ -247,10 +249,10 @@ class HeterBoxTrainer : public TrainerBase {
virtual std::string GetDumpPath(int tid) { return ""; }
virtual void InitDumpEnv() {}
template <typename T>
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
void HeterMemCpy(LoDTensor* tensor, LoDTensor* root_tensor,
const paddle::platform::Place& thread_place,
cudaStream_t stream);
gpuStream_t stream);
#endif
void CreateThreadParam(const ProgramDesc& program, int num);
template <typename T>
......@@ -272,14 +274,15 @@ class HeterBoxTrainer : public TrainerBase {
std::vector<std::thread> threads_;
int use_ps_gpu_;
int thread_num_;
#ifdef PADDLE_WITH_CUDA
std::vector<cudaStream_t> copy_streams_;
std::vector<cudaEvent_t> events_;
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
std::vector<gpuStream_t> copy_streams_;
std::vector<gpuEvent_t> events_;
#endif
};
#endif
#if (defined PADDLE_WITH_NCCL) && (defined PADDLE_WITH_PSLIB)
#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \
(defined PADDLE_WITH_PSLIB)
class PSGPUTrainer : public TrainerBase {
public:
PSGPUTrainer() {}
......@@ -321,7 +324,7 @@ class PSGPUTrainer : public TrainerBase {
};
#endif
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
class PipelineTrainer : public TrainerBase {
public:
PipelineTrainer() {}
......
......@@ -66,15 +66,17 @@ std::shared_ptr<TrainerBase> TrainerFactory::CreateTrainer(
REGISTER_TRAINER_CLASS(MultiTrainer);
REGISTER_TRAINER_CLASS(DistMultiTrainer);
#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_XPU) && \
#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_HIP || \
defined PADDLE_WITH_XPU) && \
(defined PADDLE_WITH_PSLIB)
REGISTER_TRAINER_CLASS(HeterXpuTrainer);
REGISTER_TRAINER_CLASS(HeterBoxTrainer);
#endif
#if (defined PADDLE_WITH_NCCL) && (defined PADDLE_WITH_PSLIB)
#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \
(defined PADDLE_WITH_PSLIB)
REGISTER_TRAINER_CLASS(PSGPUTrainer);
#endif
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
REGISTER_TRAINER_CLASS(PipelineTrainer);
#endif
} // namespace framework
......
......@@ -28,6 +28,14 @@
#include "paddle/fluid/operators/conv_cudnn_op_cache.h"
#include "paddle/fluid/operators/cudnn_rnn_cache.h"
#endif
#ifdef PADDLE_WITH_HIP
#if defined(PADDLE_WITH_RCCL)
#include "paddle/fluid/operators/nccl/nccl_gpu_common.h" // NOLINT
#include "paddle/fluid/platform/nccl_helper.h" // NOLINT
#endif
#include "paddle/fluid/operators/conv_cudnn_op_cache.h" // NOLINT
#include "paddle/fluid/operators/miopen_rnn_cache.h"
#endif
#if defined(PADDLE_WITH_XPU_BKCL)
#include "paddle/fluid/platform/bkcl_helper.h"
......
......@@ -30,6 +30,12 @@
#include <nccl.h>
#endif
#endif
#ifdef PADDLE_WITH_HIP
#include <miopen/miopen.h>
#ifdef PADDLE_WITH_RCCL
#include <rccl.h>
#endif
#endif
#if defined(PADDLE_WITH_XPU_BKCL)
#include "xpu/bkcl.h"
......@@ -39,8 +45,8 @@
namespace paddle {
namespace platform {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
class Communicator;
class NCCLCommunicator;
#endif
......@@ -151,8 +157,8 @@ using VarTypeRegistry = detail::VarTypeRegistryImpl<
LoDTensorArray, platform::PlaceList, ReaderHolder, std::string, Scope *,
operators::reader::LoDTensorBlockingQueueHolder, FetchList,
operators::reader::OrderedMultiDeviceLoDTensorBlockingQueueHolder,
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
ncclUniqueId, platform::Communicator, platform::NCCLCommunicator,
#endif
operators::CudnnRNNCache,
......
......@@ -28,6 +28,14 @@
#include "paddle/fluid/operators/conv_cudnn_op_cache.h"
#include "paddle/fluid/operators/cudnn_rnn_cache.h"
#endif
#ifdef PADDLE_WITH_HIP
#if defined(PADDLE_WITH_RCCL)
#include "paddle/fluid/operators/nccl/nccl_gpu_common.h" // NOLINT
#include "paddle/fluid/platform/nccl_helper.h" // NOLINT
#endif
#include "paddle/fluid/operators/conv_cudnn_op_cache.h" // NOLINT
#include "paddle/fluid/operators/miopen_rnn_cache.h"
#endif
#if defined(PADDLE_WITH_XPU_BKCL)
#include "paddle/fluid/platform/bkcl_helper.h"
#endif
......
if (NOT WITH_NCCL)
if (NOT (WITH_NCCL OR WITH_RCCL))
return()
endif()
......@@ -6,12 +6,20 @@ if(WITH_GPU AND NOT WIN32)
nv_library(nccl_common SRCS nccl_gpu_common.cc DEPS device_context operator )
endif()
if(WITH_GPU)
if(WITH_ROCM AND NOT WIN32)
hip_library(nccl_common SRCS nccl_gpu_common.cc DEPS device_context operator )
endif()
if(WITH_GPU OR WITH_ROCM)
op_library(nccl_op DEPS nccl_common)
file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(ncclAllReduce);\n")
set(OPERATOR_DEPS ${OPERATOR_DEPS} nccl_common PARENT_SCOPE)
endif()
if(NOT WIN32)
if(WITH_GPU AND NOT WIN32)
nv_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context)
endif()
if(WITH_ROCM AND NOT WIN32)
hip_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context)
endif()
......@@ -23,7 +23,11 @@ limitations under the License. */
#include <vector>
#include "paddle/fluid/platform/device_context.h"
#ifdef PADDLE_WITH_RCCL
#include "paddle/fluid/platform/dynload/rccl.h"
#else
#include "paddle/fluid/platform/dynload/nccl.h"
#endif
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/macros.h"
......
......@@ -3,12 +3,12 @@ set(PYBIND_DEPS pybind python proto_desc memory executor fleet_wrapper box_wrapp
analysis_predictor imperative_profiler imperative_flag save_load_util dlpack_tensor device_context
gloo_wrapper infer_io_utils heter_wrapper generator op_version_registry ps_gpu_wrapper custom_operator)
if (WITH_GPU)
if (WITH_GPU OR WITH_ROCM)
set(PYBIND_DEPS ${PYBIND_DEPS} dynload_cuda)
set(PYBIND_DEPS ${PYBIND_DEPS} cuda_device_guard)
endif()
if (WITH_NCCL)
if (WITH_NCCL OR WITH_RCCL)
set(PYBIND_DEPS ${PYBIND_DEPS} nccl_wrapper)
set(PYBIND_DEPS ${PYBIND_DEPS} reducer)
endif()
......@@ -21,7 +21,7 @@ endif()
if(NOT WIN32)
set(PYBIND_DEPS ${PYBIND_DEPS} data_loader)
set(PYBIND_DEPS ${PYBIND_DEPS} mmap_allocator)
if (WITH_NCCL)
if (WITH_NCCL OR WITH_RCCL)
set(PYBIND_DEPS ${PYBIND_DEPS} nccl_context)
endif()
endif(NOT WIN32)
......@@ -71,7 +71,7 @@ if (WITH_PSCORE)
list(APPEND PYBIND_SRCS fleet_py.cc)
endif()
if (WITH_NCCL)
if (WITH_NCCL OR WITH_RCCL)
list(APPEND PYBIND_SRCS nccl_wrapper_py.cc)
endif()
......@@ -81,9 +81,9 @@ if(WITH_PYTHON)
list(APPEND OP_FUNCTION_GENERETOR_DEPS ${GLOB_OP_LIB})
list(APPEND OP_FUNCTION_GENERETOR_DEPS ${GLOB_OPERATOR_DEPS})
if(WITH_NCCL)
if (WITH_NCCL OR WITH_RCCL)
list(APPEND OP_FUNCTION_GENERETOR_DEPS nccl_context)
endif(WITH_NCCL)
endif()
if(WITH_XPU_BKCL)
list(APPEND OP_FUNCTION_GENERETOR_DEPS bkcl_context)
......@@ -93,6 +93,9 @@ if(WITH_PYTHON)
target_link_libraries(op_function_generator ${OP_FUNCTION_GENERETOR_DEPS})
get_property (os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
target_link_libraries(op_function_generator ${os_dependency_modules})
if(WITH_ROCM)
target_link_libraries(op_function_generator ${ROCM_HIPRTC_LIB})
endif()
set(impl_file ${CMAKE_SOURCE_DIR}/paddle/fluid/pybind/op_function_impl.h)
set(tmp_impl_file ${impl_file}.tmp)
......@@ -164,12 +167,6 @@ if(WITH_PYTHON)
endif(WITH_MKLDNN)
endif(WIN32)
if(WITH_ROCM_PLATFORM)
cc_library(paddle_pybind SHARED
SRCS ${PYBIND_SRCS}
DEPS ${PYBIND_DEPS}
${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS})
else()
cc_library(paddle_pybind SHARED
SRCS ${PYBIND_SRCS}
DEPS ${PYBIND_DEPS}
......@@ -177,7 +174,10 @@ if(WITH_PYTHON)
if(NOT APPLE AND NOT WIN32)
target_link_libraries(paddle_pybind rt)
endif(NOT APPLE AND NOT WIN32)
endif(WITH_ROCM_PLATFORM)
if(WITH_ROCM)
target_link_libraries(paddle_pybind ${ROCM_HIPRTC_LIB})
endif()
get_property (os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
target_link_libraries(paddle_pybind ${os_dependency_modules})
......
......@@ -66,7 +66,7 @@ DECLARE_bool(benchmark);
DECLARE_int32(inner_op_parallelism);
DECLARE_int32(max_inplace_grad_add);
DECLARE_string(tracer_profile_fname);
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
// cudnn
DECLARE_uint64(conv_workspace_size_limit);
DECLARE_bool(cudnn_batchnorm_spatial_persistent);
......@@ -354,7 +354,7 @@ static void RegisterGlobalVarGetterSetter() {
FLAGS_paddle_num_threads, FLAGS_use_mkldnn, FLAGS_max_inplace_grad_add,
FLAGS_tracer_mkldnn_ops_on, FLAGS_tracer_mkldnn_ops_off);
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
REGISTER_PUBLIC_GLOBAL_VAR(
FLAGS_gpu_memory_limit_mb, FLAGS_cudnn_deterministic,
FLAGS_conv_workspace_size_limit, FLAGS_cudnn_batchnorm_spatial_persistent,
......
......@@ -966,7 +966,7 @@ void BindImperative(py::module *m_ptr) {
[](imperative::VarBase &self,
const imperative::ParallelStrategy &strategy) {
if (strategy.nranks_ > 1) {
#ifdef PADDLE_WITH_NCCL
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
#if NCCL_VERSION_CODE >= 2212
imperative::AllReduce(self.Var(), self.MutableVar(), strategy);
#else
......@@ -1016,7 +1016,7 @@ void BindImperative(py::module *m_ptr) {
)DOC")
.def("pin_memory",
[](const std::shared_ptr<imperative::VarBase> &self) {
#ifndef PADDLE_WITH_CUDA
#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP)
PADDLE_THROW(platform::errors::PermissionDenied(
"Cannot copy this Tensor to pinned memory in CPU version "
"Paddle, "
......@@ -1050,7 +1050,7 @@ void BindImperative(py::module *m_ptr) {
.def("cuda",
[](const std::shared_ptr<imperative::VarBase> &self, int device_id,
bool blocking) {
#ifndef PADDLE_WITH_CUDA
#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP)
PADDLE_THROW(platform::errors::PermissionDenied(
"Cannot copy this Tensor to GPU in CPU version Paddle, "
"Please recompile or reinstall Paddle with CUDA support."));
......@@ -1412,7 +1412,8 @@ void BindImperative(py::module *m_ptr) {
},
py::call_guard<py::gil_scoped_release>());
#if (defined PADDLE_WITH_NCCL) || (defined PADDLE_WITH_XPU_BKCL)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
defined(PADDLE_WITH_XPU_BKCL)
py::class_<imperative::ParallelContext,
std::shared_ptr<imperative::ParallelContext>>(m,
"ParallelContext");
......
......@@ -32,7 +32,8 @@ namespace py = pybind11;
namespace paddle {
namespace pybind {
#if (defined PADDLE_WITH_NCCL) && (defined PADDLE_WITH_PSLIB)
#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \
(defined PADDLE_WITH_PSLIB)
void BindPSGPUWrapper(py::module* m) {
py::class_<framework::PSGPUWrapper, std::shared_ptr<framework::PSGPUWrapper>>(
*m, "PSGPU")
......
......@@ -22,7 +22,8 @@ namespace py = pybind11;
namespace paddle {
namespace pybind {
#if (defined PADDLE_WITH_NCCL) && (defined PADDLE_WITH_PSLIB)
#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \
(defined PADDLE_WITH_PSLIB)
void BindPSGPUWrapper(py::module* m);
#endif
} // namespace pybind
......
......@@ -86,7 +86,7 @@ limitations under the License. */
#include "paddle/fluid/pybind/ps_gpu_wrapper_py.h"
#include "paddle/fluid/pybind/pybind_boost_headers.h"
#ifdef PADDLE_WITH_NCCL
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
#include "paddle/fluid/pybind/nccl_wrapper_py.h"
#endif
#include "paddle/fluid/framework/data_type.h"
......@@ -95,11 +95,13 @@ limitations under the License. */
#include "paddle/fluid/pybind/reader_py.h"
#include "paddle/fluid/pybind/tensor_py.h"
#include "paddle/fluid/string/to_string.h"
#ifdef PADDLE_WITH_CUDA
#ifdef PADDLE_WITH_NCCL
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
#include "paddle/fluid/operators/nccl/nccl_gpu_common.h"
#endif
#ifndef PADDLE_WITH_HIP
#include "paddle/fluid/platform/cuda_profiler.h"
#endif
#include "paddle/fluid/platform/gpu_info.h"
#endif
......@@ -128,7 +130,15 @@ PYBIND11_MAKE_OPAQUE(paddle::framework::FetchType);
namespace paddle {
namespace pybind {
bool IsCompiledWithCUDA() {
#ifndef PADDLE_WITH_CUDA
#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP)
return false;
#else
return true;
#endif
}
bool IsCompiledWithROCM() {
#ifndef PADDLE_WITH_HIP
return false;
#else
return true;
......@@ -389,7 +399,7 @@ PYBIND11_MODULE(core_noavx, m) {
m.def("set_num_threads", &platform::SetNumThreads);
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
m.def("cudnn_version", &platform::CudnnVersion);
#endif
......@@ -403,7 +413,7 @@ PYBIND11_MODULE(core_noavx, m) {
if (dl.ctx.device_type == kDLCPU) {
paddle::framework::TensorFromDLPack(dl, &tensor);
}
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (dl.ctx.device_type == kDLGPU) {
paddle::framework::TensorFromDLPack(dl, &tensor);
}
......@@ -1060,7 +1070,7 @@ PYBIND11_MODULE(core_noavx, m) {
.def("height", &SelectedRows::height)
.def("set_rows",
[](SelectedRows &self, std::vector<int64_t> rows) {
#ifndef PADDLE_WITH_CUDA
#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP)
self.set_rows(rows);
#else
Vector<int64_t> new_rows(rows);
......@@ -1354,7 +1364,7 @@ All parameter, weight, gradient are variables in Paddle.
.def_static("create",
[](paddle::platform::CUDAPlace& place)
-> paddle::platform::DeviceContext* {
#ifndef PADDLE_WITH_CUDA
#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP)
PADDLE_THROW(
platform::errors::PermissionDenied(
"Cannot use CUDAPlace in CPU only version, "
......@@ -1366,7 +1376,7 @@ All parameter, weight, gradient are variables in Paddle.
.def_static("create",
[](paddle::platform::CUDAPinnedPlace& place)
-> paddle::platform::DeviceContext* {
#ifndef PADDLE_WITH_CUDA
#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP)
PADDLE_THROW(
platform::errors::PermissionDenied(
"Cannot use CUDAPinnedPlace in CPU only version, "
......@@ -1376,7 +1386,7 @@ All parameter, weight, gradient are variables in Paddle.
#endif
});;
// clang-format on
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
py::class_<platform::Communicator>(m, "Communicator").def(py::init<>());
#endif
py::class_<platform::CUDAPlace>(m, "CUDAPlace", R"DOC(
......@@ -1405,7 +1415,7 @@ All parameter, weight, gradient are variables in Paddle.
)DOC")
.def("__init__",
[](platform::CUDAPlace &self, int dev_id) {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (UNLIKELY(dev_id < 0)) {
LOG(ERROR) << string::Sprintf(
"Invalid CUDAPlace(%d), device id must be 0 or "
......@@ -1443,7 +1453,7 @@ All parameter, weight, gradient are variables in Paddle.
std::exit(-1);
#endif
})
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
.def("get_device_id",
[](const platform::CUDAPlace &self) { return self.GetDeviceId(); })
.def("_type", &PlaceIndex<platform::CUDAPlace>)
......@@ -1559,7 +1569,7 @@ All parameter, weight, gradient are variables in Paddle.
)DOC")
.def("__init__",
[](platform::CUDAPinnedPlace &self) {
#ifndef PADDLE_WITH_CUDA
#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP)
PADDLE_THROW(platform::errors::PermissionDenied(
"Cannot use CUDAPinnedPlace in CPU only version, "
"Please recompile or reinstall Paddle with CUDA support."));
......@@ -1749,6 +1759,7 @@ All parameter, weight, gradient are variables in Paddle.
m.def("init_devices", []() { framework::InitDevices(); });
m.def("is_compiled_with_cuda", IsCompiledWithCUDA);
m.def("is_compiled_with_rocm", IsCompiledWithROCM);
m.def("is_compiled_with_xpu", IsCompiledWithXPU);
m.def("is_compiled_with_mkldnn", IsCompiledWithMKLDNN);
m.def("supports_bfloat16", SupportsBfloat16);
......@@ -1793,7 +1804,7 @@ All parameter, weight, gradient are variables in Paddle.
py::arg("cmd"), py::arg("time_out") = 0, py::arg("sleep_inter") = 0,
py::arg("redirect_stderr") = false);
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
m.def("is_float16_supported", [](const platform::CUDAPlace &place) -> bool {
// Only GPUs with Compute Capability >= 53 support float16
return platform::GetCUDAComputeCapability(place.device) >= 53;
......@@ -1967,10 +1978,10 @@ All parameter, weight, gradient are variables in Paddle.
py::return_value_policy::take_ownership);
m.def("op_support_gpu", OpSupportGPU);
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
m.def("get_cuda_device_count", platform::GetCUDADeviceCount);
#ifndef _WIN32
#if !defined(PADDLE_WITH_HIP) && !defined(_WIN32)
m.def("nvprof_init", platform::CudaProfilerInit);
m.def("nvprof_start", platform::CudaProfilerStart);
m.def("nvprof_stop", platform::CudaProfilerStop);
......@@ -2015,7 +2026,7 @@ All parameter, weight, gradient are variables in Paddle.
m.def("size_of_dtype", framework::SizeOfType);
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
m.def("set_cublas_switch", platform::SetAllowTF32Cublas);
m.def("get_cublas_switch", platform::AllowTF32Cublas);
m.def("set_cudnn_switch", platform::SetAllowTF32Cudnn);
......@@ -2847,7 +2858,8 @@ All parameter, weight, gradient are variables in Paddle.
#ifdef PADDLE_WITH_PSLIB
BindHeterWrapper(&m);
#endif
#if (defined PADDLE_WITH_NCCL) && (defined PADDLE_WITH_PSLIB)
#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \
(defined PADDLE_WITH_PSLIB)
BindPSGPUWrapper(&m);
#endif
BindGlooWrapper(&m);
......@@ -2855,7 +2867,7 @@ All parameter, weight, gradient are variables in Paddle.
#ifdef PADDLE_WITH_BOX_PS
BindBoxWrapper(&m);
#endif
#ifdef PADDLE_WITH_NCCL
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
BindNCCLWrapper(&m);
#endif
#ifdef PADDLE_WITH_GLOO
......
......@@ -27,7 +27,7 @@ limitations under the License. */
#include "paddle/fluid/operators/math/concat_and_split.h"
#include "paddle/fluid/operators/strided_memcpy.h"
#include "paddle/fluid/platform/bfloat16.h"
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/fluid/platform/cuda_device_guard.h"
#endif
#include "paddle/fluid/platform/device_context.h"
......@@ -226,7 +226,7 @@ T TensorGetElement(const framework::Tensor &self, size_t offset) {
paddle::memory::Copy(platform::CPUPlace(), &b, p, a + offset, sizeof(T));
#endif
} else if (platform::is_gpu_place(self.place())) {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
const T *a = self.data<T>();
auto p = BOOST_GET_CONST(platform::CUDAPlace, self.place());
paddle::memory::Copy(platform::CPUPlace(), &b, p, a + offset, sizeof(T),
......@@ -250,7 +250,7 @@ void TensorSetElement(framework::Tensor *self, size_t offset, T elem) {
paddle::memory::Copy(p, a + offset, platform::CPUPlace(), &elem, sizeof(T));
#endif
} else if (platform::is_gpu_place(self->place())) {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
auto p = BOOST_GET_CONST(platform::CUDAPlace, self->place());
T *a = self->mutable_data<T>(p);
paddle::memory::Copy(p, a + offset, platform::CPUPlace(), &elem, sizeof(T),
......@@ -296,7 +296,7 @@ void SetTensorFromPyArrayT(
"Please recompile or reinstall Paddle with XPU support."));
#endif
} else {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (paddle::platform::is_gpu_place(place)) {
// NOTE(wangxi): When copying data to the accelerator card,
// we need set_device(dev_id) first.
......@@ -304,8 +304,13 @@ void SetTensorFromPyArrayT(
platform::CUDADeviceGuard guard(
BOOST_GET_CONST(platform::CUDAPlace, tmp_place).device);
auto dst = self->mutable_data<T>(place);
#ifdef PADDLE_WITH_HIP
paddle::platform::GpuMemcpySync(dst, array.data(), array.nbytes(),
hipMemcpyHostToDevice);
#else
paddle::platform::GpuMemcpySync(dst, array.data(), array.nbytes(),
cudaMemcpyHostToDevice);
#endif
} else if (paddle::platform::is_cuda_pinned_place(place)) {
auto dst = self->mutable_data<T>(place);
......@@ -474,7 +479,7 @@ inline framework::Tensor *_getTensor(const framework::Tensor &self,
self.type());
#endif
} else {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (platform::is_cuda_pinned_place(place)) {
output->mutable_data(BOOST_GET_CONST(platform::CUDAPinnedPlace, place),
self.type());
......@@ -707,7 +712,7 @@ inline py::array TensorToPyArray(const framework::Tensor &tensor,
"Please recompile or reinstall Paddle with XPU support."));
#endif
} else if (is_gpu_tensor) {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
py::array py_arr(py::dtype(py_dtype_str.c_str()), py_dims, py_strides);
PADDLE_ENFORCE_EQ(py_arr.writeable(), true,
platform::errors::InvalidArgument(
......