Unverified commit 580447d0, authored by Qi Li and committed by GitHub

[ROCM] update fluid framework for rocm (part4), test=develop (#31013)

Parent 7d91974c
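The diff below repeats a small set of idioms: each CMake GPU target gains a WITH_ROCM branch that calls hip_library/hip_test instead of nv_library/nv_test, and each CUDA-only preprocessor guard gains a HIP counterpart. A minimal compilable sketch of the C++ guard side, assuming only the Paddle build macros that appear in the diff (the constants and helper below are illustrative, not Paddle code):

```cpp
// Illustration of the guard idioms applied throughout this commit.
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
constexpr bool kHasGpuBackend = true;   // code shared by CUDA and ROCm builds
#else
constexpr bool kHasGpuBackend = false;  // CPU-only build
#endif

#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
constexpr bool kHasCollectiveBackend = true;   // NCCL on CUDA, RCCL on ROCm
#else
constexpr bool kHasCollectiveBackend = false;
#endif

inline const char* BackendName() {
#if defined(PADDLE_WITH_HIP)
  return "ROCm/HIP";  // hipcc build; device-side guards check __HIPCC__
#elif defined(PADDLE_WITH_CUDA)
  return "CUDA";      // nvcc build; device-side guards check __CUDA_ARCH__
#else
  return "CPU";
#endif
}
```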
@@ -34,7 +34,11 @@ proto_library(trainer_desc_proto SRCS trainer_desc.proto DEPS framework_proto
cc_library(ddim SRCS ddim.cc DEPS eigen3 boost enforce)
cc_test(ddim_test SRCS ddim_test.cc DEPS ddim)
nv_test(dim_test SRCS dim_test.cu DEPS ddim)
if(WITH_GPU)
nv_test(dim_test SRCS dim_test.cu DEPS ddim)
elseif(WITH_ROCM)
hip_test(dim_test SRCS dim_test.cu DEPS ddim)
endif()
cc_test(unroll_array_ops_test SRCS unroll_array_ops_test.cc)
cc_library(data_type SRCS data_type.cc DEPS framework_proto ddim device_context)
cc_test(data_type_test SRCS data_type_test.cc DEPS data_type place tensor)
@@ -46,6 +50,8 @@ if(WITH_GPU)
else()
nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context profiler)
endif(WIN32)
elseif(WITH_ROCM)
hip_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context profiler)
else()
cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS place memory data_type device_context profiler)
endif()
@@ -53,6 +59,8 @@ endif()
cc_test(tensor_test SRCS tensor_test.cc DEPS tensor)
if(WITH_GPU)
nv_test(tensor_util_test SRCS tensor_util_test.cc tensor_util_test.cu DEPS tensor dlpack_tensor)
elseif(WITH_ROCM)
hip_test(tensor_util_test SRCS tensor_util_test.cc tensor_util_test.cu DEPS tensor dlpack_tensor)
else()
cc_test(tensor_util_test SRCS tensor_util_test.cc DEPS tensor dlpack_tensor)
endif()
@@ -63,13 +71,20 @@ cc_test(eigen_test SRCS eigen_test.cc DEPS tensor)
if(WITH_GPU)
nv_test(mixed_vector_test SRCS mixed_vector_test.cc mixed_vector_test.cu DEPS place memory device_context tensor)
elseif(WITH_ROCM)
hip_test(mixed_vector_test SRCS mixed_vector_test.cc mixed_vector_test.cu DEPS place memory device_context tensor)
else()
cc_test(mixed_vector_test SRCS mixed_vector_test.cc DEPS place memory device_context tensor)
endif()
cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto version)
cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor memory)
nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor)
if(WITH_GPU)
nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor)
elseif(WITH_ROCM)
hip_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor)
endif()
cc_library(garbage_collector SRCS garbage_collector.cc DEPS device_context memory gflags glog)
@@ -94,8 +109,13 @@ cc_test(scope_test SRCS scope_test.cc DEPS scope)
cc_test(variable_test SRCS variable_test.cc DEPS tensor var_type_traits)
cc_library(data_device_transform SRCS data_device_transform.cc DEPS tensor)
nv_test(data_device_transform_test SRCS data_device_transform_test.cu
if(WITH_GPU)
nv_test(data_device_transform_test SRCS data_device_transform_test.cu
DEPS operator op_registry device_context math_function scope)
elseif(WITH_ROCM)
hip_test(data_device_transform_test SRCS data_device_transform_test.cu
DEPS operator op_registry device_context math_function scope)
endif()
if(WITH_GPU)
if (WIN32)
@@ -108,6 +128,9 @@ if(WITH_GPU)
nv_library(data_type_transform SRCS data_type_transform.cu DEPS tensor)
endif(WIN32)
nv_test(data_type_transform_test SRCS data_type_transform_test.cc data_type_transform_test.cu DEPS data_type_transform)
elseif(WITH_ROCM)
hip_library(data_type_transform SRCS data_type_transform.cu DEPS tensor)
hip_test(data_type_transform_test SRCS data_type_transform_test.cc data_type_transform_test.cu DEPS data_type_transform)
else()
cc_library(data_type_transform SRCS data_type_transform.cc DEPS tensor)
cc_test(data_type_transform_test SRCS data_type_transform_test.cc DEPS data_type_transform)
@@ -156,8 +179,11 @@ cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator
cc_library(op_call_stack SRCS op_call_stack.cc DEPS op_proto_maker enforce)
cc_test(op_call_stack_test SRCS op_call_stack_test.cc DEPS op_call_stack)
nv_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry)
if(WITH_GPU)
nv_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry)
elseif(WITH_ROCM)
hip_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry)
endif()
if(WITH_PYTHON)
py_proto_compile(framework_py_proto SRCS framework.proto data_feed.proto)
......
@@ -54,7 +54,7 @@ class Array {
}
HOSTDEVICE inline T &at(size_t i) {
#ifndef __CUDA_ARCH__
#if !defined(__CUDA_ARCH__) && !defined(__HIPCC__)
PADDLE_ENFORCE_LT(
i, N, platform::errors::OutOfRange("Array index out of bounds."));
#endif
@@ -62,7 +62,7 @@ class Array {
}
HOSTDEVICE inline const T &at(size_t i) const {
#ifndef __CUDA_ARCH__
#if !defined(__CUDA_ARCH__) && !defined(__HIPCC__)
PADDLE_ENFORCE_LT(
i, N, platform::errors::OutOfRange("Array index out of bounds."));
#endif
@@ -103,7 +103,12 @@ class Array<T, 0> {
HOSTDEVICE inline T *GetMutable() { return nullptr; }
HOSTDEVICE inline T &operator[](size_t) {
#ifdef __CUDA_ARCH__
#if defined(__HIPCC__)
// HIP raises a compile error if "static T obj();" is used:
// "function declared in block scope cannot have 'static' storage class"
static T obj{};
return obj;
#elif defined(__CUDA_ARCH__)
static T obj();
return obj;
#else
@@ -112,7 +117,12 @@ class Array<T, 0> {
}
HOSTDEVICE inline const T &operator[](size_t) const {
#ifdef __CUDA_ARCH__
#if defined(__HIPCC__)
// HIP raises a compile error if "static const T obj();" is used:
// "function declared in block scope cannot have 'static' storage class"
static const T obj{};
return obj;
#elif defined(__CUDA_ARCH__)
static const T obj();
return obj;
#else
......
@@ -16,7 +16,12 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator_kernel_configs.h"
#ifdef PADDLE_WITH_HIP
#include "paddle/fluid/platform/miopen_helper.h"
#else
#include "paddle/fluid/platform/cudnn_helper.h"
#endif
namespace paddle {
namespace framework {
@@ -32,7 +37,20 @@ class ConvSearchCache {
static ConvSearchCache instance;
return instance;
}
#ifdef PADDLE_WITH_HIP
AlgorithmsCache<miopenConvFwdAlgorithm_t>* GetForward() {
return &forward_cache_;
}
AlgorithmsCache<miopenConvBwdDataAlgorithm_t>* GetBackwardData() {
return &backward_data_cache_;
}
AlgorithmsCache<miopenConvBwdWeightsAlgorithm_t>* GetBackwardFilter() {
return &backward_filter_cache_;
}
AlgorithmsCache<miopenConvFwdAlgorithm_t>* GetConvFusion() {
return &fusion_forward_cache_;
}
#else
AlgorithmsCache<cudnnConvolutionFwdAlgo_t>* GetForward() {
return &forward_cache_;
}
@@ -45,6 +63,7 @@ class ConvSearchCache {
AlgorithmsCache<cudnnConvolutionFwdAlgo_t>* GetConvFusion() {
return &fusion_forward_cache_;
}
#endif
private:
ConvSearchCache() {}
@@ -52,10 +71,17 @@ class ConvSearchCache {
ConvSearchCache(const ConvSearchCache&) {}
ConvSearchCache& operator=(const ConvSearchCache&) {}
#ifdef PADDLE_WITH_HIP
AlgorithmsCache<miopenConvFwdAlgorithm_t> forward_cache_;
AlgorithmsCache<miopenConvBwdDataAlgorithm_t> backward_data_cache_;
AlgorithmsCache<miopenConvBwdWeightsAlgorithm_t> backward_filter_cache_;
AlgorithmsCache<miopenConvFwdAlgorithm_t> fusion_forward_cache_;
#else
AlgorithmsCache<cudnnConvolutionFwdAlgo_t> forward_cache_;
AlgorithmsCache<cudnnConvolutionBwdDataAlgo_t> backward_data_cache_;
AlgorithmsCache<cudnnConvolutionBwdFilterAlgo_t> backward_filter_cache_;
AlgorithmsCache<cudnnConvolutionFwdAlgo_t> fusion_forward_cache_;
#endif
};
} // namespace framework
......
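The HIP branch of ConvSearchCache mirrors the cuDNN one member for member, only swapping the algorithm enum types. A hypothetical caller-side sketch under that assumption (the Instance() accessor and the alias name are assumptions, not shown in this diff):

```cpp
// Hypothetical sketch: pick the backend's forward-algorithm type once, so the
// rest of a conv kernel can stay backend-neutral.
#ifdef PADDLE_WITH_HIP
#include "paddle/fluid/platform/miopen_helper.h"  // miopen* algorithm enums
using ConvFwdAlgo = miopenConvFwdAlgorithm_t;
#else
#include "paddle/fluid/platform/cudnn_helper.h"   // cudnn* algorithm enums
using ConvFwdAlgo = cudnnConvolutionFwdAlgo_t;
#endif

// Assumed usage (not part of this diff):
//   auto* cache = paddle::framework::ConvSearchCache::Instance().GetForward();
//   // cache would be an AlgorithmsCache<ConvFwdAlgo>* in either build.
```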
@@ -31,7 +31,7 @@ namespace framework {
static std::vector<platform::Place> CreatePlaceList() {
std::vector<platform::Place> places;
places.emplace_back(platform::CPUPlace());
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
places.emplace_back(platform::CUDAPlace(0));
#endif
return places;
......
@@ -151,9 +151,12 @@ void DataFeed::CopyToFeedTensor(void* dst, const void* src, size_t size) {
} else {
#ifdef PADDLE_WITH_CUDA
cudaMemcpy(dst, src, size, cudaMemcpyHostToDevice);
#elif defined(PADDLE_WITH_HIP)
hipMemcpy(dst, src, size, hipMemcpyHostToDevice);
#else
PADDLE_THROW(platform::errors::Unimplemented(
"Not supported GPU, please compile with option WITH_GPU=ON."));
"Not supported GPU/ROCM, please compile with option WITH_GPU=ON or "
"WITH_ROCM=ON."));
#endif
}
}
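A stand-alone version of the dispatch above, as a hedged sketch (CopyHostToDevice is an illustrative name, not Paddle's API), shows how one call site serves either runtime:

```cpp
#include <cstddef>
#include <stdexcept>
#if defined(PADDLE_WITH_CUDA)
#include <cuda_runtime.h>
#elif defined(PADDLE_WITH_HIP)
#include <hip/hip_runtime.h>
#endif

// Host-to-device copy that compiles against whichever runtime is enabled.
inline void CopyHostToDevice(void* dst, const void* src, std::size_t size) {
#if defined(PADDLE_WITH_CUDA)
  cudaMemcpy(dst, src, size, cudaMemcpyHostToDevice);
#elif defined(PADDLE_WITH_HIP)
  hipMemcpy(dst, src, size, hipMemcpyHostToDevice);
#else
  (void)dst; (void)src; (void)size;
  throw std::runtime_error("built without WITH_GPU or WITH_ROCM");
#endif
}
```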
@@ -1157,7 +1160,7 @@ void MultiSlotInMemoryDataFeed::PutToFeedVec(
#endif
}
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && !defined(_WIN32)
template <typename T>
void PrivateInstantDataFeed<T>::PutToFeedVec() {
for (size_t i = 0; i < use_slots_.size(); ++i) {
......
@@ -716,7 +716,7 @@ class PaddleBoxDataFeed : public MultiSlotInMemoryDataFeed {
int pv_batch_size_;
};
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && !defined(_WIN32)
template <typename T>
class PrivateInstantDataFeed : public DataFeed {
public:
......
@@ -68,7 +68,7 @@ std::shared_ptr<DataFeed> DataFeedFactory::CreateDataFeed(
REGISTER_DATAFEED_CLASS(MultiSlotDataFeed);
REGISTER_DATAFEED_CLASS(MultiSlotInMemoryDataFeed);
REGISTER_DATAFEED_CLASS(PaddleBoxDataFeed);
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && !defined(_WIN32)
REGISTER_DATAFEED_CLASS(MultiSlotFileInstantDataFeed);
#endif
} // namespace framework
......
@@ -47,7 +47,7 @@ struct CastDataType {
auto* context = static_cast<const platform::CPUDeviceContext*>(ctx_);
trans(*context, in_begin, in_end, out_begin,
CastDataTypeFunctor<InType, OutType>());
#ifdef __NVCC__
#if defined(__NVCC__) || defined(__HIPCC__)
} else if (platform::is_gpu_place(in_.place())) {
platform::Transform<platform::CUDADeviceContext> trans;
auto* context = static_cast<const platform::CUDADeviceContext*>(ctx_);
......
@@ -81,7 +81,7 @@ void BroadcastOpHandle::BroadcastOneVar(
});
}
} else if (platform::is_gpu_place(in_tensor.place())) {
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
VarHandle *out_handle = nullptr;
int root_id =
BOOST_GET_CONST(platform::CUDAPlace, in_tensor.place()).device;
......
@@ -34,7 +34,7 @@ class Node;
} // namespace ir
} // namespace framework
namespace platform {
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
struct NCCLContextMap;
#endif
#if defined(PADDLE_WITH_XPU_BKCL)
@@ -43,7 +43,7 @@ struct BKCLContextMap;
} // namespace platform
} // namespace paddle
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
#include "paddle/fluid/platform/nccl_helper.h"
#elif defined(PADDLE_WITH_XPU_BKCL)
#include "paddle/fluid/platform/bkcl_helper.h"
@@ -55,7 +55,7 @@ namespace details {
struct BroadcastOpHandle : public OpHandleBase {
public:
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
BroadcastOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places,
const platform::NCCLContextMap *nccl_ctxs)
@@ -106,7 +106,7 @@ struct BroadcastOpHandle : public OpHandleBase {
std::vector<Scope *> local_scopes_;
std::vector<platform::Place> places_;
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
const platform::NCCLContextMap *nccl_ctxs_;
#elif defined(PADDLE_WITH_XPU_BKCL)
const platform::BKCLContextMap *bkcl_ctxs_;
......
@@ -36,7 +36,8 @@ TEST(BroadcastTester, TestCPUBroadcastTestSelectedRows) {
test_op.TestBroadcastSelectedRows(input_scope_idx);
}
#if defined(PADDLE_WITH_CUDA) && defined(PADDLE_WITH_NCCL)
#if (defined(PADDLE_WITH_CUDA) && defined(PADDLE_WITH_NCCL)) || \
(defined(PADDLE_WITH_HIP) && defined(PADDLE_WITH_RCCL))
TEST(BroadcastTester, TestGPUBroadcastTestLodTensor) {
TestBroadcastOpHandle test_op;
size_t input_scope_idx = 0;
......
@@ -48,7 +48,7 @@ struct TestBroadcastOpHandle {
std::vector<std::unique_ptr<ir::Node>> nodes_;
std::vector<p::Place> place_list_;
DeviceType use_device_;
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
std::unique_ptr<platform::NCCLContextMap> nccl_ctxs_;
#endif
@@ -60,7 +60,7 @@ struct TestBroadcastOpHandle {
for (size_t j = 0; j < ctxs_.size(); ++j) {
ctxs_[j]->Wait();
}
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
if (nccl_ctxs_) {
nccl_ctxs_->WaitAll();
}
@@ -94,7 +94,7 @@ struct TestBroadcastOpHandle {
platform::errors::PreconditionNotMet("Not compiled with BKCL."));
#endif
} else if (use_device_ == p::kCUDA) {
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
int count = p::GetCUDADeviceCount();
if (count <= 1) {
LOG(WARNING) << "Cannot test multi-gpu Broadcast, because the CUDA "
@@ -122,7 +122,7 @@ struct TestBroadcastOpHandle {
#if defined(PADDLE_WITH_XPU_BKCL)
bkcl_ctxs_.reset(nullptr);
#endif
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
nccl_ctxs_.reset(nullptr);
#endif
}
@@ -143,7 +143,7 @@ struct TestBroadcastOpHandle {
nodes_.emplace_back(
ir::CreateNodeForTest("node0", ir::Node::Type::kOperation));
if (use_device_ == p::kCUDA) {
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
op_handle_ = new BroadcastOpHandle(nodes_.back().get(), local_scopes_,
place_list_, nccl_ctxs_.get());
#else
......
@@ -52,7 +52,7 @@ class DeviceContext;
} // namespace platform
} // namespace paddle
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
#include "paddle/fluid/platform/nccl_helper.h"
#endif
@@ -73,11 +73,12 @@ class PullDenseWorker {
public:
virtual ~PullDenseWorker() {}
virtual void Initialize(const TrainerDesc& param);
#ifdef PADDLE_WITH_CUDA
void AddStream(const cudaStream_t stream) { copy_streams_.push_back(stream); }
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
void AddStream(const gpuStream_t stream) { copy_streams_.push_back(stream); }
#endif
#if (defined PADDLE_WITH_CUDA) || (defined PADDLE_WITH_XPU)
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
defined(PADDLE_WITH_XPU)
void AddPlace(const paddle::platform::Place place) {
places_.push_back(place);
}
@@ -137,8 +138,8 @@ class PullDenseWorker {
float total_batch_num_ = 0;
std::unordered_map<const Scope*, int> scope_to_thread_id_;
#ifdef PADDLE_WITH_CUDA
std::vector<cudaStream_t> copy_streams_;
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
std::vector<gpuStream_t> copy_streams_;
#endif
std::vector<paddle::platform::Place> places_;
std::vector<Scope*> thread_scopes_;
@@ -167,9 +168,9 @@ class DeviceWorker {
virtual void CacheProgram(const ProgramDesc& main_program) {}
virtual void ProduceTasks() {}
virtual void GetXpuOpIndex() {}
#ifdef PADDLE_WITH_CUDA
virtual void SetStream(const cudaStream_t stream) {}
virtual void SetEvent(const cudaEvent_t event) {}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
virtual void SetStream(const gpuStream_t stream) {}
virtual void SetEvent(const gpuEvent_t event) {}
#endif
virtual void SetNeedDumpField(bool need_dump_field) {
need_dump_field_ = need_dump_field;
@@ -437,7 +438,8 @@ class HeterCpuWorker : public HogwildWorker {
};
#endif
#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_XPU) && \
#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_HIP || \
defined PADDLE_WITH_XPU) && \
(defined PADDLE_WITH_PSLIB)
class HeterBoxWorker : public HogwildWorker {
public:
@@ -452,8 +454,8 @@ class HeterBoxWorker : public HogwildWorker {
new (&program_) ProgramDesc(main_program);
}
virtual void ProduceTasks() override;
virtual void SetStream(const cudaStream_t stream) { copy_stream_ = stream; }
virtual void SetEvent(const cudaEvent_t event) { event_ = event; }
virtual void SetStream(const gpuStream_t stream) { copy_stream_ = stream; }
virtual void SetEvent(const gpuEvent_t event) { event_ = event; }
virtual void TrainFilesWithProfiler() {}
void ResetStat();
@@ -515,8 +517,8 @@ class HeterBoxWorker : public HogwildWorker {
std::unordered_map<uint64_t, std::unordered_set<uint64_t>> feasign_set_;
paddle::framework::Channel<std::shared_ptr<HeterTask>> pull_queue_;
paddle::framework::Channel<std::shared_ptr<HeterTask>> push_queue_;
cudaEvent_t event_;
cudaStream_t copy_stream_;
gpuEvent_t event_;
gpuStream_t copy_stream_;
int batch_cnt_{0};
std::atomic<int> done_cnt_{0};
@@ -537,7 +539,8 @@ class HeterBoxWorker : public HogwildWorker {
};
#endif
#if (defined PADDLE_WITH_NCCL) && (defined PADDLE_WITH_PSLIB)
#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \
(defined PADDLE_WITH_PSLIB)
class PSGPUWorker : public HogwildWorker {
public:
PSGPUWorker() {}
@@ -551,8 +554,8 @@ class PSGPUWorker : public HogwildWorker {
new (&program_) ProgramDesc(main_program);
}
virtual void ProduceTasks() override;
virtual void SetStream(const cudaStream_t stream) { copy_stream_ = stream; }
virtual void SetEvent(const cudaEvent_t event) { event_ = event; }
virtual void SetStream(const gpuStream_t stream) { copy_stream_ = stream; }
virtual void SetEvent(const gpuEvent_t event) { event_ = event; }
virtual void TrainFilesWithProfiler() {}
void ResetStat();
@@ -611,8 +614,8 @@ class PSGPUWorker : public HogwildWorker {
std::unordered_map<uint64_t, std::unordered_set<uint64_t>> feasign_set_;
paddle::framework::Channel<std::shared_ptr<HeterTask>> pull_queue_;
paddle::framework::Channel<std::shared_ptr<HeterTask>> push_queue_;
cudaEvent_t event_;
cudaStream_t copy_stream_;
gpuEvent_t event_;
gpuStream_t copy_stream_;
int batch_cnt_{0};
std::atomic<int> done_cnt_{0};
@@ -633,7 +636,7 @@ class PSGPUWorker : public HogwildWorker {
};
#endif
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
class SectionWorker : public DeviceWorker {
public:
SectionWorker() {}
......
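The gpuStream_t and gpuEvent_t types used in the device_worker.h signatures above let one declaration serve both runtimes; presumably they are platform aliases defined outside this diff, roughly along these lines (illustrative sketch, not the actual Paddle header):

```cpp
#if defined(PADDLE_WITH_HIP)
#include <hip/hip_runtime.h>
using gpuStream_t = hipStream_t;   // ROCm build
using gpuEvent_t = hipEvent_t;
#elif defined(PADDLE_WITH_CUDA)
#include <cuda_runtime.h>
using gpuStream_t = cudaStream_t;  // CUDA build
using gpuEvent_t = cudaEvent_t;
#endif
```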
@@ -69,15 +69,17 @@ REGISTER_DEVICE_WORKER_CLASS(DownpourWorkerOpt);
REGISTER_DEVICE_WORKER_CLASS(HeterCpuWorker);
#endif
#if (defined PADDLE_WITH_NCCL) && (defined PADDLE_WITH_PSLIB)
#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \
(defined PADDLE_WITH_PSLIB)
REGISTER_DEVICE_WORKER_CLASS(HeterBoxWorker);
#endif
#if (defined PADDLE_WITH_NCCL) && (defined PADDLE_WITH_PSLIB)
#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \
(defined PADDLE_WITH_PSLIB)
REGISTER_DEVICE_WORKER_CLASS(PSGPUWorker);
#endif
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
REGISTER_DEVICE_WORKER_CLASS(SectionWorker);
#endif
} // namespace framework
......
@@ -34,7 +34,12 @@ TEST(Dim, Equality) {
// construct a Dim on the GPU
thrust::device_vector<paddle::framework::Dim<2>> t(2);
#ifdef PADDLE_WITH_HIP
hipLaunchKernelGGL(test, dim3(1), dim3(1), 0, 0,
thrust::raw_pointer_cast(t.data()));
#else
test<<<1, 1>>>(thrust::raw_pointer_cast(t.data()));
#endif
a = t[0];
EXPECT_EQ(a[0], 5);
EXPECT_EQ(a[1], 6);
@@ -55,7 +60,12 @@ TEST(Dim, Equality) {
// dynamic access on GPU
thrust::device_vector<int64_t> r(1);
#ifdef PADDLE_WITH_HIP
hipLaunchKernelGGL(dyn_idx_gpu, dim3(1), dim3(1), 0, 0,
thrust::raw_pointer_cast(r.data()));
#else
dyn_idx_gpu<<<1, 1>>>(thrust::raw_pointer_cast(r.data()));
#endif
int64_t res = r[0];
EXPECT_EQ(res, 6);
}
......
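The launch change in dim_test.cu is purely syntactic: hipLaunchKernelGGL passes the same grid, block, shared-memory, and stream arguments that the triple-chevron form encodes. A self-contained illustration under that assumption (scale_kernel and LaunchScale are made up, not part of the test):

```cpp
#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
#else
#include <cuda_runtime.h>
#endif

__global__ void scale_kernel(float* data, float factor, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) data[i] *= factor;
}

void LaunchScale(float* data, float factor, int n) {
  dim3 grid((n + 255) / 256), block(256);
#ifdef PADDLE_WITH_HIP
  // hipLaunchKernelGGL(kernel, grid, block, shared_mem_bytes, stream, args...)
  hipLaunchKernelGGL(scale_kernel, grid, block, 0, 0, data, factor, n);
#else
  scale_kernel<<<grid, block, 0, 0>>>(data, factor, n);
#endif
}
```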
@@ -83,7 +83,7 @@ struct DLContextVisitor : public boost::static_visitor<::DLContext> {
}
inline ::DLContext operator()(const platform::CUDAPlace &place) const {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
::DLContext ctx;
ctx.device_type = kDLGPU;
ctx.device_id = place.device;
@@ -95,7 +95,7 @@ struct DLContextVisitor : public boost::static_visitor<::DLContext> {
}
inline ::DLContext operator()(const platform::CUDAPinnedPlace &place) const {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
::DLContext ctx;
ctx.device_type = kDLCPUPinned;
ctx.device_id = 0;
......
@@ -103,7 +103,7 @@ void TestToCudfCompatibleDLManagedTensor(const platform::Place &place,
template <typename T>
void TestMainLoop() {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
std::vector<platform::Place> places{platform::CPUPlace(),
platform::CUDAPlace(0),
platform::CUDAPinnedPlace()};
......
@@ -431,7 +431,7 @@ void Executor::RunPartialPreparedContext(ExecutorPrepareContext* ctx,
std::unique_ptr<GarbageCollector> gc;
if (!ctx->force_disable_gc_ && max_memory_size >= 0) {
if (platform::is_gpu_place(place_)) {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (IsFastEagerDeletionModeEnabled()) {
gc.reset(new UnsafeFastGPUGarbageCollector(
BOOST_GET_CONST(platform::CUDAPlace, place_), max_memory_size));
......
@@ -25,7 +25,7 @@ namespace paddle {
namespace framework {
const std::shared_ptr<Generator>& GetDefaultCUDAGenerator(int64_t device_id) {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
static int64_t num_cuda_devices = -1;
static std::once_flag num_devices_init_flag;
@@ -157,7 +157,7 @@ uint64_t Generator::Random64() {
std::pair<uint64_t, uint64_t> Generator::IncrementOffset(
uint64_t increament_offset) {
uint64_t cur_offset = this->state_.thread_offset;
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
std::lock_guard<std::mutex> lock(this->mu_);
this->state_.thread_offset += increament_offset;
......