From 580447d019eef680da7cf1b007d08e296f38d930 Mon Sep 17 00:00:00 2001 From: Qi Li Date: Thu, 25 Feb 2021 21:24:28 +0800 Subject: [PATCH] [ROCM] update fluid framework for rocm (part4), test=develop (#31013) --- paddle/fluid/framework/CMakeLists.txt | 36 +++++++++++++--- paddle/fluid/framework/array.h | 18 ++++++-- paddle/fluid/framework/conv_search_cache.h | 28 +++++++++++- .../fluid/framework/copy_same_tensor_test.cc | 2 +- paddle/fluid/framework/data_feed.cc | 7 ++- paddle/fluid/framework/data_feed.h | 2 +- paddle/fluid/framework/data_feed_factory.cc | 2 +- paddle/fluid/framework/data_type_transform.cc | 2 +- .../framework/details/broadcast_op_handle.cc | 2 +- .../framework/details/broadcast_op_handle.h | 8 ++-- .../details/broadcast_op_handle_test.cc | 3 +- .../details/broadcast_op_handle_test.h | 10 ++--- paddle/fluid/framework/device_worker.h | 43 ++++++++++--------- .../fluid/framework/device_worker_factory.cc | 8 ++-- paddle/fluid/framework/dim_test.cu | 10 +++++ paddle/fluid/framework/dlpack_tensor.cc | 4 +- paddle/fluid/framework/dlpack_tensor_test.cc | 2 +- paddle/fluid/framework/executor.cc | 2 +- paddle/fluid/framework/generator.cc | 4 +- 19 files changed, 137 insertions(+), 56 deletions(-) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 6e282a2e91..4c92a06aed 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -34,7 +34,11 @@ proto_library(trainer_desc_proto SRCS trainer_desc.proto DEPS framework_proto cc_library(ddim SRCS ddim.cc DEPS eigen3 boost enforce) cc_test(ddim_test SRCS ddim_test.cc DEPS ddim) -nv_test(dim_test SRCS dim_test.cu DEPS ddim) +if(WITH_GPU) + nv_test(dim_test SRCS dim_test.cu DEPS ddim) +elseif(WITH_ROCM) + hip_test(dim_test SRCS dim_test.cu DEPS ddim) +endif() cc_test(unroll_array_ops_test SRCS unroll_array_ops_test.cc) cc_library(data_type SRCS data_type.cc DEPS framework_proto ddim device_context) cc_test(data_type_test SRCS data_type_test.cc DEPS data_type place tensor) @@ -46,6 +50,8 @@ if(WITH_GPU) else() nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context profiler) endif(WIN32) +elseif(WITH_ROCM) + hip_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context profiler) else() cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS place memory data_type device_context profiler) endif() @@ -53,6 +59,8 @@ endif() cc_test(tensor_test SRCS tensor_test.cc DEPS tensor) if(WITH_GPU) nv_test(tensor_util_test SRCS tensor_util_test.cc tensor_util_test.cu DEPS tensor dlpack_tensor) +elseif(WITH_ROCM) + hip_test(tensor_util_test SRCS tensor_util_test.cc tensor_util_test.cu DEPS tensor dlpack_tensor) else() cc_test(tensor_util_test SRCS tensor_util_test.cc DEPS tensor dlpack_tensor) endif() @@ -63,13 +71,20 @@ cc_test(eigen_test SRCS eigen_test.cc DEPS tensor) if(WITH_GPU) nv_test(mixed_vector_test SRCS mixed_vector_test.cc mixed_vector_test.cu DEPS place memory device_context tensor) +elseif(WITH_ROCM) + hip_test(mixed_vector_test SRCS mixed_vector_test.cc mixed_vector_test.cu DEPS place memory device_context tensor) else() cc_test(mixed_vector_test SRCS mixed_vector_test.cc DEPS place memory device_context tensor) endif() cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto version) cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor memory) -nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor) + +if(WITH_GPU) + nv_test(lod_tensor_gpu_test SRCS 
lod_tensor_test.cu DEPS lod_tensor) +elseif(WITH_ROCM) + hip_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor) +endif() cc_library(garbage_collector SRCS garbage_collector.cc DEPS device_context memory gflags glog) @@ -94,8 +109,13 @@ cc_test(scope_test SRCS scope_test.cc DEPS scope) cc_test(variable_test SRCS variable_test.cc DEPS tensor var_type_traits) cc_library(data_device_transform SRCS data_device_transform.cc DEPS tensor) -nv_test(data_device_transform_test SRCS data_device_transform_test.cu +if(WITH_GPU) + nv_test(data_device_transform_test SRCS data_device_transform_test.cu DEPS operator op_registry device_context math_function scope) +elseif(WITH_ROCM) + hip_test(data_device_transform_test SRCS data_device_transform_test.cu + DEPS operator op_registry device_context math_function scope) +endif() if(WITH_GPU) if (WIN32) @@ -108,6 +128,9 @@ if(WITH_GPU) nv_library(data_type_transform SRCS data_type_transform.cu DEPS tensor) endif(WIN32) nv_test(data_type_transform_test SRCS data_type_transform_test.cc data_type_transform_test.cu DEPS data_type_transform) +elseif(WITH_ROCM) + hip_library(data_type_transform SRCS data_type_transform.cu DEPS tensor) + hip_test(data_type_transform_test SRCS data_type_transform_test.cc data_type_transform_test.cu DEPS data_type_transform) else() cc_library(data_type_transform SRCS data_type_transform.cc DEPS tensor) cc_test(data_type_transform_test SRCS data_type_transform_test.cc DEPS data_type_transform) @@ -156,8 +179,11 @@ cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator cc_library(op_call_stack SRCS op_call_stack.cc DEPS op_proto_maker enforce) cc_test(op_call_stack_test SRCS op_call_stack_test.cc DEPS op_call_stack) - -nv_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry) +if(WITH_GPU) + nv_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry) +elseif(WITH_ROCM) + hip_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry) +endif() if(WITH_PYTHON) py_proto_compile(framework_py_proto SRCS framework.proto data_feed.proto) diff --git a/paddle/fluid/framework/array.h b/paddle/fluid/framework/array.h index 10abb83116..0ec9cb8112 100644 --- a/paddle/fluid/framework/array.h +++ b/paddle/fluid/framework/array.h @@ -54,7 +54,7 @@ class Array { } HOSTDEVICE inline T &at(size_t i) { -#ifndef __CUDA_ARCH__ +#if !defined(__CUDA_ARCH__) && !defined(__HIPCC__) PADDLE_ENFORCE_LT( i, N, platform::errors::OutOfRange("Array index out of bounds.")); #endif @@ -62,7 +62,7 @@ class Array { } HOSTDEVICE inline const T &at(size_t i) const { -#ifndef __CUDA_ARCH__ +#if !defined(__CUDA_ARCH__) && !defined(__HIPCC__) PADDLE_ENFORCE_LT( i, N, platform::errors::OutOfRange("Array index out of bounds.")); #endif @@ -103,7 +103,12 @@ class Array { HOSTDEVICE inline T *GetMutable() { return nullptr; } HOSTDEVICE inline T &operator[](size_t) { -#ifdef __CUDA_ARCH__ +#if defined(__HIPCC__) + // HIP will have compile error, if use "obj()" + // function declared in block scope cannot have 'static' storage class + static T obj{}; + return obj; +#elif defined(__CUDA_ARCH__) static T obj(); return obj; #else @@ -112,7 +117,12 @@ class Array { } HOSTDEVICE inline const T &operator[](size_t) const { -#ifdef __CUDA_ARCH__ +#if defined(__HIPCC__) + // HIP will have compile error, if use "obj()" + // function declared in block scope cannot have 'static' storage class + static const T obj{}; + return obj; +#elif defined(__CUDA_ARCH__) static const T obj(); return obj; #else diff --git 
a/paddle/fluid/framework/conv_search_cache.h b/paddle/fluid/framework/conv_search_cache.h
index 720467d6f1..db8dc22f68 100644
--- a/paddle/fluid/framework/conv_search_cache.h
+++ b/paddle/fluid/framework/conv_search_cache.h
@@ -16,7 +16,12 @@ limitations under the License. */
 
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator_kernel_configs.h"
+
+#ifdef PADDLE_WITH_HIP
+#include "paddle/fluid/platform/miopen_helper.h"
+#else
 #include "paddle/fluid/platform/cudnn_helper.h"
+#endif
 
 namespace paddle {
 namespace framework {
@@ -32,7 +37,20 @@ class ConvSearchCache {
     static ConvSearchCache instance;
     return instance;
   }
-
+#ifdef PADDLE_WITH_HIP
+  AlgorithmsCache<miopenConvFwdAlgorithm_t>* GetForward() {
+    return &forward_cache_;
+  }
+  AlgorithmsCache<miopenConvBwdDataAlgorithm_t>* GetBackwardData() {
+    return &backward_data_cache_;
+  }
+  AlgorithmsCache<miopenConvBwdWeightsAlgorithm_t>* GetBackwardFilter() {
+    return &backward_filter_cache_;
+  }
+  AlgorithmsCache<miopenConvFwdAlgorithm_t>* GetConvFusion() {
+    return &fusion_forward_cache_;
+  }
+#else
   AlgorithmsCache<cudnnConvolutionFwdAlgo_t>* GetForward() {
     return &forward_cache_;
   }
@@ -45,6 +63,7 @@ class ConvSearchCache {
   AlgorithmsCache<cudnnConvolutionFwdAlgo_t>* GetConvFusion() {
     return &fusion_forward_cache_;
   }
+#endif
 
  private:
   ConvSearchCache() {}
@@ -52,10 +71,17 @@ class ConvSearchCache {
   ConvSearchCache(const ConvSearchCache&) {}
   ConvSearchCache& operator=(const ConvSearchCache&) {}
 
+#ifdef PADDLE_WITH_HIP
+  AlgorithmsCache<miopenConvFwdAlgorithm_t> forward_cache_;
+  AlgorithmsCache<miopenConvBwdDataAlgorithm_t> backward_data_cache_;
+  AlgorithmsCache<miopenConvBwdWeightsAlgorithm_t> backward_filter_cache_;
+  AlgorithmsCache<miopenConvFwdAlgorithm_t> fusion_forward_cache_;
+#else
   AlgorithmsCache<cudnnConvolutionFwdAlgo_t> forward_cache_;
   AlgorithmsCache<cudnnConvolutionBwdDataAlgo_t> backward_data_cache_;
   AlgorithmsCache<cudnnConvolutionBwdFilterAlgo_t> backward_filter_cache_;
   AlgorithmsCache<cudnnConvolutionFwdAlgo_t> fusion_forward_cache_;
+#endif
 };
 
 }  // namespace framework
diff --git a/paddle/fluid/framework/copy_same_tensor_test.cc b/paddle/fluid/framework/copy_same_tensor_test.cc
index ad06473b51..0b1fdc3944 100644
--- a/paddle/fluid/framework/copy_same_tensor_test.cc
+++ b/paddle/fluid/framework/copy_same_tensor_test.cc
@@ -31,7 +31,7 @@ namespace framework {
 static std::vector<platform::Place> CreatePlaceList() {
   std::vector<platform::Place> places;
   places.emplace_back(platform::CPUPlace());
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   places.emplace_back(platform::CUDAPlace(0));
 #endif
   return places;
diff --git a/paddle/fluid/framework/data_feed.cc b/paddle/fluid/framework/data_feed.cc
index 2b70cdb9f1..1ab0b40135 100644
--- a/paddle/fluid/framework/data_feed.cc
+++ b/paddle/fluid/framework/data_feed.cc
@@ -151,9 +151,12 @@ void DataFeed::CopyToFeedTensor(void* dst, const void* src, size_t size) {
   } else {
 #ifdef PADDLE_WITH_CUDA
     cudaMemcpy(dst, src, size, cudaMemcpyHostToDevice);
+#elif defined(PADDLE_WITH_HIP)
+    hipMemcpy(dst, src, size, hipMemcpyHostToDevice);
 #else
     PADDLE_THROW(platform::errors::Unimplemented(
-        "Not supported GPU, please compile with option WITH_GPU=ON."));
+        "Not supported GPU/ROCM, please compile with option WITH_GPU=ON or "
+        "WITH_ROCM=ON."));
 #endif
   }
 }
@@ -1157,7 +1160,7 @@ void MultiSlotInMemoryDataFeed::PutToFeedVec(
 #endif
 }
 
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && !defined(_WIN32)
 template <typename T>
 void PrivateInstantDataFeed<T>::PutToFeedVec() {
   for (size_t i = 0; i < use_slots_.size(); ++i) {
diff --git a/paddle/fluid/framework/data_feed.h b/paddle/fluid/framework/data_feed.h
index 1abca95b8b..ec79005dfe 100644
--- a/paddle/fluid/framework/data_feed.h
+++ b/paddle/fluid/framework/data_feed.h
@@ -716,7 +716,7 @@ class PaddleBoxDataFeed : public MultiSlotInMemoryDataFeed {
   int pv_batch_size_;
 };
 
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && !defined(_WIN32)
 template <typename T>
 class PrivateInstantDataFeed : public DataFeed {
  public:
diff --git a/paddle/fluid/framework/data_feed_factory.cc b/paddle/fluid/framework/data_feed_factory.cc
index c967b0f0ca..ec1b8ec773 100644
--- a/paddle/fluid/framework/data_feed_factory.cc
+++ b/paddle/fluid/framework/data_feed_factory.cc
@@ -68,7 +68,7 @@ std::shared_ptr<DataFeed> DataFeedFactory::CreateDataFeed(
 REGISTER_DATAFEED_CLASS(MultiSlotDataFeed);
 REGISTER_DATAFEED_CLASS(MultiSlotInMemoryDataFeed);
 REGISTER_DATAFEED_CLASS(PaddleBoxDataFeed);
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && !defined(_WIN32)
 REGISTER_DATAFEED_CLASS(MultiSlotFileInstantDataFeed);
 #endif
 }  // namespace framework
diff --git a/paddle/fluid/framework/data_type_transform.cc b/paddle/fluid/framework/data_type_transform.cc
index 084c6e6816..5a716eba8d 100644
--- a/paddle/fluid/framework/data_type_transform.cc
+++ b/paddle/fluid/framework/data_type_transform.cc
@@ -47,7 +47,7 @@ struct CastDataType {
       auto* context = static_cast<const platform::CPUDeviceContext*>(ctx_);
       trans(*context, in_begin, in_end, out_begin,
             CastDataTypeFunctor<InType, OutType>());
-#ifdef __NVCC__
+#if defined(__NVCC__) || defined(__HIPCC__)
     } else if (platform::is_gpu_place(in_.place())) {
       platform::Transform<platform::CUDADeviceContext> trans;
       auto* context = static_cast<const platform::CUDADeviceContext*>(ctx_);
diff --git a/paddle/fluid/framework/details/broadcast_op_handle.cc b/paddle/fluid/framework/details/broadcast_op_handle.cc
index 34d800994f..36b840e494 100644
--- a/paddle/fluid/framework/details/broadcast_op_handle.cc
+++ b/paddle/fluid/framework/details/broadcast_op_handle.cc
@@ -81,7 +81,7 @@ void BroadcastOpHandle::BroadcastOneVar(
         });
       }
   } else if (platform::is_gpu_place(in_tensor.place())) {
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
     VarHandle *out_handle = nullptr;
     int root_id =
         BOOST_GET_CONST(platform::CUDAPlace, in_tensor.place()).device;
diff --git a/paddle/fluid/framework/details/broadcast_op_handle.h b/paddle/fluid/framework/details/broadcast_op_handle.h
index e15dd18467..8ca20da974 100644
--- a/paddle/fluid/framework/details/broadcast_op_handle.h
+++ b/paddle/fluid/framework/details/broadcast_op_handle.h
@@ -34,7 +34,7 @@ class Node;
 }  // namespace ir
 }  // namespace framework
 namespace platform {
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 struct NCCLContextMap;
 #endif
 #if defined(PADDLE_WITH_XPU_BKCL)
@@ -43,7 +43,7 @@ struct BKCLContextMap;
 }  // namespace platform
 }  // namespace paddle
 
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 #include "paddle/fluid/platform/nccl_helper.h"
 #elif defined(PADDLE_WITH_XPU_BKCL)
 #include "paddle/fluid/platform/bkcl_helper.h"
@@ -55,7 +55,7 @@ namespace details {
 
 struct BroadcastOpHandle : public OpHandleBase {
  public:
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
   BroadcastOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
                     const std::vector<platform::Place> &places,
                     const platform::NCCLContextMap *nccl_ctxs)
@@ -106,7 +106,7 @@ struct BroadcastOpHandle : public OpHandleBase {
   std::vector<Scope *> local_scopes_;
   std::vector<platform::Place> places_;
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
   const platform::NCCLContextMap *nccl_ctxs_;
 #elif defined(PADDLE_WITH_XPU_BKCL)
   const platform::BKCLContextMap *bkcl_ctxs_;
diff --git a/paddle/fluid/framework/details/broadcast_op_handle_test.cc b/paddle/fluid/framework/details/broadcast_op_handle_test.cc
index cfd6b71aab..d8fb1b05ed 100644
--- a/paddle/fluid/framework/details/broadcast_op_handle_test.cc
+++ b/paddle/fluid/framework/details/broadcast_op_handle_test.cc
@@ -36,7 +36,8 @@ TEST(BroadcastTester, TestCPUBroadcastTestSelectedRows) {
   test_op.TestBroadcastSelectedRows(input_scope_idx);
 }
 
-#if defined(PADDLE_WITH_CUDA) && defined(PADDLE_WITH_NCCL)
+#if (defined(PADDLE_WITH_CUDA) && defined(PADDLE_WITH_NCCL)) || \
+    (defined(PADDLE_WITH_HIP) && defined(PADDLE_WITH_RCCL))
 TEST(BroadcastTester, TestGPUBroadcastTestLodTensor) {
   TestBroadcastOpHandle test_op;
   size_t input_scope_idx = 0;
diff --git a/paddle/fluid/framework/details/broadcast_op_handle_test.h b/paddle/fluid/framework/details/broadcast_op_handle_test.h
index af053de4f6..6ca4baa6d8 100644
--- a/paddle/fluid/framework/details/broadcast_op_handle_test.h
+++ b/paddle/fluid/framework/details/broadcast_op_handle_test.h
@@ -48,7 +48,7 @@ struct TestBroadcastOpHandle {
   std::vector<std::unique_ptr<ir::Node>> nodes_;
   std::vector<p::Place> place_list_;
   DeviceType use_device_;
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
   std::unique_ptr<platform::NCCLContextMap> nccl_ctxs_;
 #endif
 
@@ -60,7 +60,7 @@ struct TestBroadcastOpHandle {
     for (size_t j = 0; j < ctxs_.size(); ++j) {
       ctxs_[j]->Wait();
     }
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
     if (nccl_ctxs_) {
       nccl_ctxs_->WaitAll();
     }
@@ -94,7 +94,7 @@ struct TestBroadcastOpHandle {
           platform::errors::PreconditionNotMet("Not compiled with BKCL."));
 #endif
     } else if (use_device_ == p::kCUDA) {
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
       int count = p::GetCUDADeviceCount();
       if (count <= 1) {
         LOG(WARNING) << "Cannot test multi-gpu Broadcast, because the CUDA "
@@ -122,7 +122,7 @@ struct TestBroadcastOpHandle {
 #if defined(PADDLE_WITH_XPU_BKCL)
       bkcl_ctxs_.reset(nullptr);
 #endif
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
       nccl_ctxs_.reset(nullptr);
 #endif
     }
@@ -143,7 +143,7 @@ struct TestBroadcastOpHandle {
     nodes_.emplace_back(
         ir::CreateNodeForTest("node0", ir::Node::Type::kOperation));
     if (use_device_ == p::kCUDA) {
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
       op_handle_ = new BroadcastOpHandle(nodes_.back().get(), local_scopes_,
                                          place_list_, nccl_ctxs_.get());
 #else
diff --git a/paddle/fluid/framework/device_worker.h b/paddle/fluid/framework/device_worker.h
index 6ecc02bbae..9da23ee29d 100644
--- a/paddle/fluid/framework/device_worker.h
+++ b/paddle/fluid/framework/device_worker.h
@@ -52,7 +52,7 @@ class DeviceContext;
 }  // namespace platform
 }  // namespace paddle
 
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif
 
@@ -73,11 +73,12 @@ class PullDenseWorker {
  public:
   virtual ~PullDenseWorker() {}
   virtual void Initialize(const TrainerDesc& param);
-#ifdef PADDLE_WITH_CUDA
-  void AddStream(const cudaStream_t stream) { copy_streams_.push_back(stream); }
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  void AddStream(const gpuStream_t stream) { copy_streams_.push_back(stream); }
 #endif
 
-#if (defined PADDLE_WITH_CUDA) || (defined PADDLE_WITH_XPU)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
+    defined(PADDLE_WITH_XPU)
   void AddPlace(const paddle::platform::Place place) {
     places_.push_back(place);
   }
@@ -137,8 +138,8 @@ class PullDenseWorker {
   float total_batch_num_ = 0;
   std::unordered_map<const Scope*, int> scope_to_thread_id_;
 
-#ifdef PADDLE_WITH_CUDA
-  std::vector<cudaStream_t> copy_streams_;
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  std::vector<gpuStream_t> copy_streams_;
 #endif
   std::vector<paddle::platform::Place> places_;
   std::vector<Scope*> thread_scopes_;
@@ -167,9 +168,9 @@ class DeviceWorker {
   virtual void CacheProgram(const ProgramDesc& main_program) {}
   virtual void ProduceTasks() {}
   virtual void GetXpuOpIndex() {}
-#ifdef PADDLE_WITH_CUDA
-  virtual void SetStream(const cudaStream_t stream) {}
-  virtual void SetEvent(const cudaEvent_t event) {}
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  virtual void SetStream(const gpuStream_t stream) {}
+  virtual void SetEvent(const gpuEvent_t event) {}
 #endif
   virtual void SetNeedDumpField(bool need_dump_field) {
     need_dump_field_ = need_dump_field;
@@ -437,7 +438,8 @@ class HeterCpuWorker : public HogwildWorker {
 };
 #endif
 
-#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_XPU) && \
+#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_HIP || \
+     defined PADDLE_WITH_XPU) && \
     (defined PADDLE_WITH_PSLIB)
 class HeterBoxWorker : public HogwildWorker {
  public:
@@ -452,8 +454,8 @@ class HeterBoxWorker : public HogwildWorker {
     new (&program_) ProgramDesc(main_program);
   }
   virtual void ProduceTasks() override;
-  virtual void SetStream(const cudaStream_t stream) { copy_stream_ = stream; }
-  virtual void SetEvent(const cudaEvent_t event) { event_ = event; }
+  virtual void SetStream(const gpuStream_t stream) { copy_stream_ = stream; }
+  virtual void SetEvent(const gpuEvent_t event) { event_ = event; }
   virtual void TrainFilesWithProfiler() {}
   void ResetStat();
@@ -515,8 +517,8 @@ class HeterBoxWorker : public HogwildWorker {
   std::unordered_map<uint64_t, std::vector<uint64_t>> feasign_set_;
   paddle::framework::Channel<std::shared_ptr<HeterTask>> pull_queue_;
   paddle::framework::Channel<std::shared_ptr<HeterTask>> push_queue_;
-  cudaEvent_t event_;
-  cudaStream_t copy_stream_;
+  gpuEvent_t event_;
+  gpuStream_t copy_stream_;
   int batch_cnt_{0};
   std::atomic<int> done_cnt_{0};
 
@@ -537,7 +539,8 @@ class HeterBoxWorker : public HogwildWorker {
 };
 #endif
 
-#if (defined PADDLE_WITH_NCCL) && (defined PADDLE_WITH_PSLIB)
+#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \
+    (defined PADDLE_WITH_PSLIB)
 class PSGPUWorker : public HogwildWorker {
  public:
   PSGPUWorker() {}
@@ -551,8 +554,8 @@ class PSGPUWorker : public HogwildWorker {
     new (&program_) ProgramDesc(main_program);
   }
   virtual void ProduceTasks() override;
-  virtual void SetStream(const cudaStream_t stream) { copy_stream_ = stream; }
-  virtual void SetEvent(const cudaEvent_t event) { event_ = event; }
+  virtual void SetStream(const gpuStream_t stream) { copy_stream_ = stream; }
+  virtual void SetEvent(const gpuEvent_t event) { event_ = event; }
   virtual void TrainFilesWithProfiler() {}
   void ResetStat();
@@ -611,8 +614,8 @@ class PSGPUWorker : public HogwildWorker {
   std::unordered_map<uint64_t, std::vector<uint64_t>> feasign_set_;
   paddle::framework::Channel<std::shared_ptr<HeterTask>> pull_queue_;
   paddle::framework::Channel<std::shared_ptr<HeterTask>> push_queue_;
-  cudaEvent_t event_;
-  cudaStream_t copy_stream_;
+  gpuEvent_t event_;
+  gpuStream_t copy_stream_;
   int batch_cnt_{0};
   std::atomic<int> done_cnt_{0};
 
@@ -633,7 +636,7 @@ class PSGPUWorker : public HogwildWorker {
 };
 #endif
 
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 class SectionWorker : public DeviceWorker {
  public:
   SectionWorker() {}
diff --git a/paddle/fluid/framework/device_worker_factory.cc b/paddle/fluid/framework/device_worker_factory.cc
index af1cf7804f..a539a5d5f9 100644
--- a/paddle/fluid/framework/device_worker_factory.cc
+++ b/paddle/fluid/framework/device_worker_factory.cc
@@ -69,15 +69,17 @@ REGISTER_DEVICE_WORKER_CLASS(DownpourWorkerOpt);
 REGISTER_DEVICE_WORKER_CLASS(HeterCpuWorker);
 #endif
 
-#if (defined PADDLE_WITH_NCCL) && (defined PADDLE_WITH_PSLIB)
+#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \
+    (defined PADDLE_WITH_PSLIB)
 REGISTER_DEVICE_WORKER_CLASS(HeterBoxWorker);
 #endif
 
-#if (defined PADDLE_WITH_NCCL) && (defined PADDLE_WITH_PSLIB)
+#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \
+    (defined PADDLE_WITH_PSLIB)
 REGISTER_DEVICE_WORKER_CLASS(PSGPUWorker);
 #endif
 
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 REGISTER_DEVICE_WORKER_CLASS(SectionWorker);
 #endif
 }  // namespace framework
diff --git a/paddle/fluid/framework/dim_test.cu b/paddle/fluid/framework/dim_test.cu
index 7add6d140c..b3c26b10c6 100644
--- a/paddle/fluid/framework/dim_test.cu
+++ b/paddle/fluid/framework/dim_test.cu
@@ -34,7 +34,12 @@ TEST(Dim, Equality) {
   // construct a Dim on the GPU
   thrust::device_vector<paddle::framework::Dim<2>> t(2);
+#ifdef PADDLE_WITH_HIP
+  hipLaunchKernelGGL(test, dim3(1), dim3(1), 0, 0,
+                     thrust::raw_pointer_cast(t.data()));
+#else
   test<<<1, 1>>>(thrust::raw_pointer_cast(t.data()));
+#endif
   a = t[0];
   EXPECT_EQ(a[0], 5);
   EXPECT_EQ(a[1], 6);
@@ -55,7 +60,12 @@ TEST(Dim, Equality) {
   // dynamic access on GPU
   thrust::device_vector<int64_t> r(1);
+#ifdef PADDLE_WITH_HIP
+  hipLaunchKernelGGL(dyn_idx_gpu, dim3(1), dim3(1), 0, 0,
+                     thrust::raw_pointer_cast(r.data()));
+#else
   dyn_idx_gpu<<<1, 1>>>(thrust::raw_pointer_cast(r.data()));
+#endif
   int64_t res = r[0];
   EXPECT_EQ(res, 6);
 }
diff --git a/paddle/fluid/framework/dlpack_tensor.cc b/paddle/fluid/framework/dlpack_tensor.cc
index ac42edec68..a3fbb008fe 100644
--- a/paddle/fluid/framework/dlpack_tensor.cc
+++ b/paddle/fluid/framework/dlpack_tensor.cc
@@ -83,7 +83,7 @@ struct DLContextVisitor : public boost::static_visitor<::DLContext> {
   }
 
   inline ::DLContext operator()(const platform::CUDAPlace &place) const {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     ::DLContext ctx;
     ctx.device_type = kDLGPU;
     ctx.device_id = place.device;
@@ -95,7 +95,7 @@ struct DLContextVisitor : public boost::static_visitor<::DLContext> {
   }
 
   inline ::DLContext operator()(const platform::CUDAPinnedPlace &place) const {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     ::DLContext ctx;
     ctx.device_type = kDLCPUPinned;
     ctx.device_id = 0;
diff --git a/paddle/fluid/framework/dlpack_tensor_test.cc b/paddle/fluid/framework/dlpack_tensor_test.cc
index c0ab9d3aca..d03437034d 100644
--- a/paddle/fluid/framework/dlpack_tensor_test.cc
+++ b/paddle/fluid/framework/dlpack_tensor_test.cc
@@ -103,7 +103,7 @@ void TestToCudfCompatibleDLManagedTensor(const platform::Place &place,
 
 template <typename T>
 void TestMainLoop() {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   std::vector<platform::Place> places{platform::CPUPlace(),
                                       platform::CUDAPlace(0),
                                       platform::CUDAPinnedPlace()};
diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc
index b4f7e5f518..0acc8a55fa 100644
--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -431,7 +431,7 @@ void Executor::RunPartialPreparedContext(ExecutorPrepareContext* ctx,
   std::unique_ptr<GarbageCollector> gc;
   if (!ctx->force_disable_gc_ && max_memory_size >= 0) {
     if (platform::is_gpu_place(place_)) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
       if (IsFastEagerDeletionModeEnabled()) {
         gc.reset(new UnsafeFastGPUGarbageCollector(
             BOOST_GET_CONST(platform::CUDAPlace, place_), max_memory_size));
diff --git a/paddle/fluid/framework/generator.cc b/paddle/fluid/framework/generator.cc
index 478d10ee7a..737dbafb64 100644
--- a/paddle/fluid/framework/generator.cc
+++ b/paddle/fluid/framework/generator.cc
@@ -25,7 +25,7 @@ namespace paddle {
 namespace framework {
 
 const std::shared_ptr<Generator>& GetDefaultCUDAGenerator(int64_t device_id) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 
   static int64_t num_cuda_devices = -1;
   static std::once_flag num_devices_init_flag;
@@ -157,7 +157,7 @@ uint64_t Generator::Random64() {
 
 std::pair<uint64_t, uint64_t> Generator::IncrementOffset(
     uint64_t increament_offset) {
   uint64_t cur_offset = this->state_.thread_offset;
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   std::lock_guard<std::mutex> lock(this->mu_);
   this->state_.thread_offset += increament_offset;
-- 
GitLab
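
Editor's note (not part of the patch): the change applies three recurring CUDA-to-ROCm portability idioms: widening `#ifdef PADDLE_WITH_CUDA` guards to also accept `PADDLE_WITH_HIP`, replacing `cudaStream_t`/`cudaEvent_t` with the runtime-neutral `gpuStream_t`/`gpuEvent_t` aliases, and switching the `kernel<<<...>>>` chevron launch to the `hipLaunchKernelGGL` macro under HIP. The sketch below is a minimal, self-contained illustration of those idioms, assuming exactly one of the two macros is defined at compile time; the `scale` kernel and the helper functions are hypothetical, and in Paddle itself the `gpuStream_t`/`gpuEvent_t` aliases come from the platform headers rather than being defined inline as here.

// Illustrative sketch only (editor's addition, hypothetical helpers).
// Build with e.g. `nvcc -DPADDLE_WITH_CUDA sketch.cu` or
// `hipcc -DPADDLE_WITH_HIP sketch.cu`.
#include <cstddef>

#if defined(PADDLE_WITH_HIP)
#include <hip/hip_runtime.h>
using gpuStream_t = hipStream_t;  // runtime-neutral aliases, as in the patch
using gpuEvent_t = hipEvent_t;
#else  // PADDLE_WITH_CUDA
#include <cuda_runtime.h>
using gpuStream_t = cudaStream_t;
using gpuEvent_t = cudaEvent_t;
#endif

// A trivial kernel; the __global__ syntax is shared by both runtimes.
__global__ void scale(float* data, float factor, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) data[i] *= factor;
}

// One guarded copy path per runtime, mirroring DataFeed::CopyToFeedTensor above.
inline void CopyHostToDevice(void* dst, const void* src, size_t size) {
#if defined(PADDLE_WITH_HIP)
  hipMemcpy(dst, src, size, hipMemcpyHostToDevice);
#else
  cudaMemcpy(dst, src, size, cudaMemcpyHostToDevice);
#endif
}

// Launch syntax differs: HIP uses the hipLaunchKernelGGL macro, CUDA the
// chevrons -- the same split as in the dim_test.cu hunks above.
inline void LaunchScale(float* dev_data, float factor, int n,
                        gpuStream_t stream) {
  const int threads = 256;
  const int blocks = (n + threads - 1) / threads;
#if defined(PADDLE_WITH_HIP)
  hipLaunchKernelGGL(scale, dim3(blocks), dim3(threads), 0, stream, dev_data,
                     factor, n);
#else
  scale<<<blocks, threads, 0, stream>>>(dev_data, factor, n);
#endif
}

Because the aliases and guards are resolved entirely at compile time, the same translation unit builds against either runtime, which is why the patch can keep a single `.cu` source and a single header set for both the `nv_*` and `hip_*` CMake targets.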