Unverified commit 580447d0, authored by Qi Li and committed by GitHub

[ROCM] update fluid framework for rocm (part4), test=develop (#31013)

Parent 7d91974c
@@ -34,7 +34,11 @@ proto_library(trainer_desc_proto SRCS trainer_desc.proto DEPS framework_proto
 cc_library(ddim SRCS ddim.cc DEPS eigen3 boost enforce)
 cc_test(ddim_test SRCS ddim_test.cc DEPS ddim)
-nv_test(dim_test SRCS dim_test.cu DEPS ddim)
+if(WITH_GPU)
+  nv_test(dim_test SRCS dim_test.cu DEPS ddim)
+elseif(WITH_ROCM)
+  hip_test(dim_test SRCS dim_test.cu DEPS ddim)
+endif()
 cc_test(unroll_array_ops_test SRCS unroll_array_ops_test.cc)
 cc_library(data_type SRCS data_type.cc DEPS framework_proto ddim device_context)
 cc_test(data_type_test SRCS data_type_test.cc DEPS data_type place tensor)
@@ -46,6 +50,8 @@ if(WITH_GPU)
   else()
     nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context profiler)
   endif(WIN32)
+elseif(WITH_ROCM)
+  hip_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context profiler)
 else()
   cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS place memory data_type device_context profiler)
 endif()
@@ -53,6 +59,8 @@ endif()
 cc_test(tensor_test SRCS tensor_test.cc DEPS tensor)
 if(WITH_GPU)
   nv_test(tensor_util_test SRCS tensor_util_test.cc tensor_util_test.cu DEPS tensor dlpack_tensor)
+elseif(WITH_ROCM)
+  hip_test(tensor_util_test SRCS tensor_util_test.cc tensor_util_test.cu DEPS tensor dlpack_tensor)
 else()
   cc_test(tensor_util_test SRCS tensor_util_test.cc DEPS tensor dlpack_tensor)
 endif()
@@ -63,13 +71,20 @@ cc_test(eigen_test SRCS eigen_test.cc DEPS tensor)
 if(WITH_GPU)
   nv_test(mixed_vector_test SRCS mixed_vector_test.cc mixed_vector_test.cu DEPS place memory device_context tensor)
+elseif(WITH_ROCM)
+  hip_test(mixed_vector_test SRCS mixed_vector_test.cc mixed_vector_test.cu DEPS place memory device_context tensor)
 else()
   cc_test(mixed_vector_test SRCS mixed_vector_test.cc DEPS place memory device_context tensor)
 endif()
 cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto version)
 cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor memory)
-nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor)
+if(WITH_GPU)
+  nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor)
+elseif(WITH_ROCM)
+  hip_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor)
+endif()
 cc_library(garbage_collector SRCS garbage_collector.cc DEPS device_context memory gflags glog)
@@ -94,8 +109,13 @@ cc_test(scope_test SRCS scope_test.cc DEPS scope)
 cc_test(variable_test SRCS variable_test.cc DEPS tensor var_type_traits)
 cc_library(data_device_transform SRCS data_device_transform.cc DEPS tensor)
-nv_test(data_device_transform_test SRCS data_device_transform_test.cu
+if(WITH_GPU)
+  nv_test(data_device_transform_test SRCS data_device_transform_test.cu
         DEPS operator op_registry device_context math_function scope)
+elseif(WITH_ROCM)
+  hip_test(data_device_transform_test SRCS data_device_transform_test.cu
+        DEPS operator op_registry device_context math_function scope)
+endif()
 if(WITH_GPU)
   if (WIN32)
@@ -108,6 +128,9 @@ if(WITH_GPU)
     nv_library(data_type_transform SRCS data_type_transform.cu DEPS tensor)
   endif(WIN32)
   nv_test(data_type_transform_test SRCS data_type_transform_test.cc data_type_transform_test.cu DEPS data_type_transform)
+elseif(WITH_ROCM)
+  hip_library(data_type_transform SRCS data_type_transform.cu DEPS tensor)
+  hip_test(data_type_transform_test SRCS data_type_transform_test.cc data_type_transform_test.cu DEPS data_type_transform)
 else()
   cc_library(data_type_transform SRCS data_type_transform.cc DEPS tensor)
   cc_test(data_type_transform_test SRCS data_type_transform_test.cc DEPS data_type_transform)
@@ -156,8 +179,11 @@ cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator
 cc_library(op_call_stack SRCS op_call_stack.cc DEPS op_proto_maker enforce)
 cc_test(op_call_stack_test SRCS op_call_stack_test.cc DEPS op_call_stack)
+if(WITH_GPU)
 nv_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry)
+elseif(WITH_ROCM)
+  hip_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry)
+endif()
 if(WITH_PYTHON)
   py_proto_compile(framework_py_proto SRCS framework.proto data_feed.proto)
...
@@ -54,7 +54,7 @@ class Array {
   }
   HOSTDEVICE inline T &at(size_t i) {
-#ifndef __CUDA_ARCH__
+#if !defined(__CUDA_ARCH__) && !defined(__HIPCC__)
     PADDLE_ENFORCE_LT(
         i, N, platform::errors::OutOfRange("Array index out of bounds."));
 #endif
@@ -62,7 +62,7 @@ class Array {
   }
   HOSTDEVICE inline const T &at(size_t i) const {
-#ifndef __CUDA_ARCH__
+#if !defined(__CUDA_ARCH__) && !defined(__HIPCC__)
     PADDLE_ENFORCE_LT(
         i, N, platform::errors::OutOfRange("Array index out of bounds."));
 #endif
@@ -103,7 +103,12 @@ class Array<T, 0> {
   HOSTDEVICE inline T *GetMutable() { return nullptr; }
   HOSTDEVICE inline T &operator[](size_t) {
-#ifdef __CUDA_ARCH__
+#if defined(__HIPCC__)
+    // HIP will have compile error, if use "obj()"
+    // function declared in block scope cannot have 'static' storage class
+    static T obj{};
+    return obj;
+#elif defined(__CUDA_ARCH__)
     static T obj();
     return obj;
 #else
@@ -112,7 +117,12 @@ class Array<T, 0> {
   }
   HOSTDEVICE inline const T &operator[](size_t) const {
-#ifdef __CUDA_ARCH__
+#if defined(__HIPCC__)
+    // HIP will have compile error, if use "obj()"
+    // function declared in block scope cannot have 'static' storage class
+    static const T obj{};
+    return obj;
+#elif defined(__CUDA_ARCH__)
     static const T obj();
     return obj;
 #else
...
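Note on the hunk above: the bounds check in `Array::at()` has to disappear both in CUDA device passes (`__CUDA_ARCH__`) and in any HIP compilation (`__HIPCC__`), because `PADDLE_ENFORCE_LT` is host-side machinery. The `Array<T, 0>` specialization additionally switches to value-initialization (`static T obj{}`), since hipcc rejects `static T obj();` with "function declared in block scope cannot have 'static' storage class" (C++ parses that form as a block-scope function declaration). A minimal, self-contained sketch of the same guard pattern, with illustrative names only:

```cpp
// Sketch only: HOSTDEVICE and ArraySketch are stand-ins, not Paddle's types.
#include <cassert>
#include <cstddef>

#if defined(__CUDACC__) || defined(__HIPCC__)
#define HOSTDEVICE __host__ __device__
#else
#define HOSTDEVICE
#endif

template <typename T, size_t N>
struct ArraySketch {
  HOSTDEVICE T &at(size_t i) {
#if !defined(__CUDA_ARCH__) && !defined(__HIPCC__)
    assert(i < N);  // host-only check; device code compiles without it
#endif
    return data_[i];
  }
  T data_[N];
};
```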
@@ -16,7 +16,12 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator_kernel_configs.h"
+#ifdef PADDLE_WITH_HIP
+#include "paddle/fluid/platform/miopen_helper.h"
+#else
 #include "paddle/fluid/platform/cudnn_helper.h"
+#endif
 namespace paddle {
 namespace framework {
@@ -32,7 +37,20 @@ class ConvSearchCache {
     static ConvSearchCache instance;
     return instance;
   }
+#ifdef PADDLE_WITH_HIP
+  AlgorithmsCache<miopenConvFwdAlgorithm_t>* GetForward() {
+    return &forward_cache_;
+  }
+  AlgorithmsCache<miopenConvBwdDataAlgorithm_t>* GetBackwardData() {
+    return &backward_data_cache_;
+  }
+  AlgorithmsCache<miopenConvBwdWeightsAlgorithm_t>* GetBackwardFilter() {
+    return &backward_filter_cache_;
+  }
+  AlgorithmsCache<miopenConvFwdAlgorithm_t>* GetConvFusion() {
+    return &fusion_forward_cache_;
+  }
+#else
   AlgorithmsCache<cudnnConvolutionFwdAlgo_t>* GetForward() {
     return &forward_cache_;
   }
@@ -45,6 +63,7 @@ class ConvSearchCache {
   AlgorithmsCache<cudnnConvolutionFwdAlgo_t>* GetConvFusion() {
     return &fusion_forward_cache_;
   }
+#endif
  private:
   ConvSearchCache() {}
@@ -52,10 +71,17 @@ class ConvSearchCache {
   ConvSearchCache(const ConvSearchCache&) {}
   ConvSearchCache& operator=(const ConvSearchCache&) {}
+#ifdef PADDLE_WITH_HIP
+  AlgorithmsCache<miopenConvFwdAlgorithm_t> forward_cache_;
+  AlgorithmsCache<miopenConvBwdDataAlgorithm_t> backward_data_cache_;
+  AlgorithmsCache<miopenConvBwdWeightsAlgorithm_t> backward_filter_cache_;
+  AlgorithmsCache<miopenConvFwdAlgorithm_t> fusion_forward_cache_;
+#else
   AlgorithmsCache<cudnnConvolutionFwdAlgo_t> forward_cache_;
   AlgorithmsCache<cudnnConvolutionBwdDataAlgo_t> backward_data_cache_;
   AlgorithmsCache<cudnnConvolutionBwdFilterAlgo_t> backward_filter_cache_;
   AlgorithmsCache<cudnnConvolutionFwdAlgo_t> fusion_forward_cache_;
+#endif
 };
 }  // namespace framework
...
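The ConvSearchCache hunk duplicates the getter block because the cached algorithm enums differ per backend (cudnnConvolution*Algo_t vs. miopenConv*Algorithm_t). An alternative, not what the patch does, would be to alias the algorithm types once and keep a single set of members and getters; the alias names below are made up for illustration, while the enum types are the real ones pulled in by cudnn_helper.h / miopen_helper.h:

```cpp
// Hypothetical alternative sketch, assuming the cuDNN/MIOpen headers are in scope.
#ifdef PADDLE_WITH_HIP
using ConvFwdAlgo = miopenConvFwdAlgorithm_t;
using ConvBwdDataAlgo = miopenConvBwdDataAlgorithm_t;
using ConvBwdFilterAlgo = miopenConvBwdWeightsAlgorithm_t;
#else
using ConvFwdAlgo = cudnnConvolutionFwdAlgo_t;
using ConvBwdDataAlgo = cudnnConvolutionBwdDataAlgo_t;
using ConvBwdFilterAlgo = cudnnConvolutionBwdFilterAlgo_t;
#endif

// A single definition would then cover both backends, e.g.:
//   AlgorithmsCache<ConvFwdAlgo> forward_cache_;
//   AlgorithmsCache<ConvFwdAlgo>* GetForward() { return &forward_cache_; }
```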
@@ -31,7 +31,7 @@ namespace framework {
 static std::vector<platform::Place> CreatePlaceList() {
   std::vector<platform::Place> places;
   places.emplace_back(platform::CPUPlace());
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   places.emplace_back(platform::CUDAPlace(0));
 #endif
   return places;
...
@@ -151,9 +151,12 @@ void DataFeed::CopyToFeedTensor(void* dst, const void* src, size_t size) {
   } else {
 #ifdef PADDLE_WITH_CUDA
     cudaMemcpy(dst, src, size, cudaMemcpyHostToDevice);
+#elif defined(PADDLE_WITH_HIP)
+    hipMemcpy(dst, src, size, hipMemcpyHostToDevice);
 #else
     PADDLE_THROW(platform::errors::Unimplemented(
-        "Not supported GPU, please compile with option WITH_GPU=ON."));
+        "Not supported GPU/ROCM, please compile with option WITH_GPU=ON or "
+        "WITH_ROCM=ON."));
 #endif
   }
 }
@@ -1157,7 +1160,7 @@ void MultiSlotInMemoryDataFeed::PutToFeedVec(
 #endif
 }
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && !defined(_WIN32)
 template <typename T>
 void PrivateInstantDataFeed<T>::PutToFeedVec() {
   for (size_t i = 0; i < use_slots_.size(); ++i) {
...
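The CopyToFeedTensor change above is the usual compile-time dispatch for host-to-device copies. A minimal sketch of the same idea outside Paddle (the function name and error handling are illustrative; cudaMemcpy/hipMemcpy are the real runtime calls):

```cpp
// Minimal sketch, assuming the respective runtime header is available when
// PADDLE_WITH_CUDA or PADDLE_WITH_HIP is defined.
#include <cstddef>
#include <stdexcept>

void CopyHostToDevice(void* dst, const void* src, size_t size) {
#ifdef PADDLE_WITH_CUDA
  cudaMemcpy(dst, src, size, cudaMemcpyHostToDevice);
#elif defined(PADDLE_WITH_HIP)
  hipMemcpy(dst, src, size, hipMemcpyHostToDevice);
#else
  (void)dst; (void)src; (void)size;
  throw std::runtime_error(
      "Not supported GPU/ROCM, please compile with WITH_GPU=ON or WITH_ROCM=ON.");
#endif
}
```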
@@ -716,7 +716,7 @@ class PaddleBoxDataFeed : public MultiSlotInMemoryDataFeed {
   int pv_batch_size_;
 };
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && !defined(_WIN32)
 template <typename T>
 class PrivateInstantDataFeed : public DataFeed {
  public:
...
@@ -68,7 +68,7 @@ std::shared_ptr<DataFeed> DataFeedFactory::CreateDataFeed(
 REGISTER_DATAFEED_CLASS(MultiSlotDataFeed);
 REGISTER_DATAFEED_CLASS(MultiSlotInMemoryDataFeed);
 REGISTER_DATAFEED_CLASS(PaddleBoxDataFeed);
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && !defined(_WIN32)
 REGISTER_DATAFEED_CLASS(MultiSlotFileInstantDataFeed);
 #endif
 }  // namespace framework
...
@@ -47,7 +47,7 @@ struct CastDataType {
       auto* context = static_cast<const platform::CPUDeviceContext*>(ctx_);
       trans(*context, in_begin, in_end, out_begin,
             CastDataTypeFunctor<InType, OutType>());
-#ifdef __NVCC__
+#if defined(__NVCC__) || defined(__HIPCC__)
     } else if (platform::is_gpu_place(in_.place())) {
       platform::Transform<platform::CUDADeviceContext> trans;
       auto* context = static_cast<const platform::CUDADeviceContext*>(ctx_);
...
@@ -81,7 +81,7 @@ void BroadcastOpHandle::BroadcastOneVar(
       });
     }
   } else if (platform::is_gpu_place(in_tensor.place())) {
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
     VarHandle *out_handle = nullptr;
     int root_id =
         BOOST_GET_CONST(platform::CUDAPlace, in_tensor.place()).device;
...
@@ -34,7 +34,7 @@ class Node;
 }  // namespace ir
 }  // namespace framework
 namespace platform {
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 struct NCCLContextMap;
 #endif
 #if defined(PADDLE_WITH_XPU_BKCL)
@@ -43,7 +43,7 @@ struct BKCLContextMap;
 }  // namespace platform
 }  // namespace paddle
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 #include "paddle/fluid/platform/nccl_helper.h"
 #elif defined(PADDLE_WITH_XPU_BKCL)
 #include "paddle/fluid/platform/bkcl_helper.h"
@@ -55,7 +55,7 @@ namespace details {
 struct BroadcastOpHandle : public OpHandleBase {
  public:
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
   BroadcastOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
                     const std::vector<platform::Place> &places,
                     const platform::NCCLContextMap *nccl_ctxs)
@@ -106,7 +106,7 @@ struct BroadcastOpHandle : public OpHandleBase {
   std::vector<Scope *> local_scopes_;
   std::vector<platform::Place> places_;
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
   const platform::NCCLContextMap *nccl_ctxs_;
 #elif defined(PADDLE_WITH_XPU_BKCL)
   const platform::BKCLContextMap *bkcl_ctxs_;
...
@@ -36,7 +36,8 @@ TEST(BroadcastTester, TestCPUBroadcastTestSelectedRows) {
   test_op.TestBroadcastSelectedRows(input_scope_idx);
 }
-#if defined(PADDLE_WITH_CUDA) && defined(PADDLE_WITH_NCCL)
+#if (defined(PADDLE_WITH_CUDA) && defined(PADDLE_WITH_NCCL)) || \
+    (defined(PADDLE_WITH_HIP) && defined(PADDLE_WITH_RCCL))
 TEST(BroadcastTester, TestGPUBroadcastTestLodTensor) {
   TestBroadcastOpHandle test_op;
   size_t input_scope_idx = 0;
...
@@ -48,7 +48,7 @@ struct TestBroadcastOpHandle {
   std::vector<std::unique_ptr<ir::Node>> nodes_;
   std::vector<p::Place> place_list_;
   DeviceType use_device_;
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
   std::unique_ptr<platform::NCCLContextMap> nccl_ctxs_;
 #endif
@@ -60,7 +60,7 @@ struct TestBroadcastOpHandle {
     for (size_t j = 0; j < ctxs_.size(); ++j) {
       ctxs_[j]->Wait();
     }
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
     if (nccl_ctxs_) {
       nccl_ctxs_->WaitAll();
     }
@@ -94,7 +94,7 @@ struct TestBroadcastOpHandle {
           platform::errors::PreconditionNotMet("Not compiled with BKCL."));
 #endif
     } else if (use_device_ == p::kCUDA) {
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
       int count = p::GetCUDADeviceCount();
       if (count <= 1) {
         LOG(WARNING) << "Cannot test multi-gpu Broadcast, because the CUDA "
@@ -122,7 +122,7 @@ struct TestBroadcastOpHandle {
 #if defined(PADDLE_WITH_XPU_BKCL)
       bkcl_ctxs_.reset(nullptr);
 #endif
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
       nccl_ctxs_.reset(nullptr);
 #endif
     }
@@ -143,7 +143,7 @@ struct TestBroadcastOpHandle {
     nodes_.emplace_back(
         ir::CreateNodeForTest("node0", ir::Node::Type::kOperation));
     if (use_device_ == p::kCUDA) {
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
       op_handle_ = new BroadcastOpHandle(nodes_.back().get(), local_scopes_,
                                          place_list_, nccl_ctxs_.get());
 #else
...
@@ -52,7 +52,7 @@ class DeviceContext;
 }  // namespace platform
 }  // namespace paddle
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif
@@ -73,11 +73,12 @@ class PullDenseWorker {
  public:
   virtual ~PullDenseWorker() {}
   virtual void Initialize(const TrainerDesc& param);
-#ifdef PADDLE_WITH_CUDA
-  void AddStream(const cudaStream_t stream) { copy_streams_.push_back(stream); }
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  void AddStream(const gpuStream_t stream) { copy_streams_.push_back(stream); }
 #endif
-#if (defined PADDLE_WITH_CUDA) || (defined PADDLE_WITH_XPU)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
+    defined(PADDLE_WITH_XPU)
   void AddPlace(const paddle::platform::Place place) {
     places_.push_back(place);
   }
@@ -137,8 +138,8 @@ class PullDenseWorker {
   float total_batch_num_ = 0;
   std::unordered_map<const Scope*, int> scope_to_thread_id_;
-#ifdef PADDLE_WITH_CUDA
-  std::vector<cudaStream_t> copy_streams_;
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  std::vector<gpuStream_t> copy_streams_;
 #endif
   std::vector<paddle::platform::Place> places_;
   std::vector<Scope*> thread_scopes_;
@@ -167,9 +168,9 @@ class DeviceWorker {
   virtual void CacheProgram(const ProgramDesc& main_program) {}
   virtual void ProduceTasks() {}
   virtual void GetXpuOpIndex() {}
-#ifdef PADDLE_WITH_CUDA
-  virtual void SetStream(const cudaStream_t stream) {}
-  virtual void SetEvent(const cudaEvent_t event) {}
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  virtual void SetStream(const gpuStream_t stream) {}
+  virtual void SetEvent(const gpuEvent_t event) {}
 #endif
   virtual void SetNeedDumpField(bool need_dump_field) {
     need_dump_field_ = need_dump_field;
@@ -437,7 +438,8 @@ class HeterCpuWorker : public HogwildWorker {
 };
 #endif
-#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_XPU) && \
+#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_HIP || \
+     defined PADDLE_WITH_XPU) && \
     (defined PADDLE_WITH_PSLIB)
 class HeterBoxWorker : public HogwildWorker {
  public:
@@ -452,8 +454,8 @@ class HeterBoxWorker : public HogwildWorker {
     new (&program_) ProgramDesc(main_program);
   }
   virtual void ProduceTasks() override;
-  virtual void SetStream(const cudaStream_t stream) { copy_stream_ = stream; }
-  virtual void SetEvent(const cudaEvent_t event) { event_ = event; }
+  virtual void SetStream(const gpuStream_t stream) { copy_stream_ = stream; }
+  virtual void SetEvent(const gpuEvent_t event) { event_ = event; }
   virtual void TrainFilesWithProfiler() {}
   void ResetStat();
@@ -515,8 +517,8 @@ class HeterBoxWorker : public HogwildWorker {
   std::unordered_map<uint64_t, std::unordered_set<uint64_t>> feasign_set_;
   paddle::framework::Channel<std::shared_ptr<HeterTask>> pull_queue_;
   paddle::framework::Channel<std::shared_ptr<HeterTask>> push_queue_;
-  cudaEvent_t event_;
-  cudaStream_t copy_stream_;
+  gpuEvent_t event_;
+  gpuStream_t copy_stream_;
   int batch_cnt_{0};
   std::atomic<int> done_cnt_{0};
@@ -537,7 +539,8 @@ class HeterBoxWorker : public HogwildWorker {
 };
 #endif
-#if (defined PADDLE_WITH_NCCL) && (defined PADDLE_WITH_PSLIB)
+#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \
+    (defined PADDLE_WITH_PSLIB)
 class PSGPUWorker : public HogwildWorker {
  public:
   PSGPUWorker() {}
@@ -551,8 +554,8 @@ class PSGPUWorker : public HogwildWorker {
     new (&program_) ProgramDesc(main_program);
   }
   virtual void ProduceTasks() override;
-  virtual void SetStream(const cudaStream_t stream) { copy_stream_ = stream; }
-  virtual void SetEvent(const cudaEvent_t event) { event_ = event; }
+  virtual void SetStream(const gpuStream_t stream) { copy_stream_ = stream; }
+  virtual void SetEvent(const gpuEvent_t event) { event_ = event; }
   virtual void TrainFilesWithProfiler() {}
   void ResetStat();
@@ -611,8 +614,8 @@ class PSGPUWorker : public HogwildWorker {
   std::unordered_map<uint64_t, std::unordered_set<uint64_t>> feasign_set_;
   paddle::framework::Channel<std::shared_ptr<HeterTask>> pull_queue_;
   paddle::framework::Channel<std::shared_ptr<HeterTask>> push_queue_;
-  cudaEvent_t event_;
-  cudaStream_t copy_stream_;
+  gpuEvent_t event_;
+  gpuStream_t copy_stream_;
   int batch_cnt_{0};
   std::atomic<int> done_cnt_{0};
@@ -633,7 +636,7 @@ class PSGPUWorker : public HogwildWorker {
 };
 #endif
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 class SectionWorker : public DeviceWorker {
  public:
   SectionWorker() {}
...
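The worker headers above now use gpuStream_t and gpuEvent_t in place of the CUDA-specific types, so the same declarations serve both runtimes. The diff does not show where these aliases are defined, so the following is only a sketch of what such definitions typically look like:

```cpp
// Sketch only: the header that actually provides these aliases in Paddle is
// not shown in this diff, so treat the placement below as an assumption.
#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
using gpuStream_t = hipStream_t;
using gpuEvent_t = hipEvent_t;
#elif defined(PADDLE_WITH_CUDA)
#include <cuda_runtime.h>
using gpuStream_t = cudaStream_t;
using gpuEvent_t = cudaEvent_t;
#endif
```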
@@ -69,15 +69,17 @@ REGISTER_DEVICE_WORKER_CLASS(DownpourWorkerOpt);
 REGISTER_DEVICE_WORKER_CLASS(HeterCpuWorker);
 #endif
-#if (defined PADDLE_WITH_NCCL) && (defined PADDLE_WITH_PSLIB)
+#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \
+    (defined PADDLE_WITH_PSLIB)
 REGISTER_DEVICE_WORKER_CLASS(HeterBoxWorker);
 #endif
-#if (defined PADDLE_WITH_NCCL) && (defined PADDLE_WITH_PSLIB)
+#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \
+    (defined PADDLE_WITH_PSLIB)
 REGISTER_DEVICE_WORKER_CLASS(PSGPUWorker);
 #endif
-#if defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 REGISTER_DEVICE_WORKER_CLASS(SectionWorker);
 #endif
 }  // namespace framework
...
@@ -34,7 +34,12 @@ TEST(Dim, Equality) {
   // construct a Dim on the GPU
   thrust::device_vector<paddle::framework::Dim<2>> t(2);
+#ifdef PADDLE_WITH_HIP
+  hipLaunchKernelGGL(test, dim3(1), dim3(1), 0, 0,
+                     thrust::raw_pointer_cast(t.data()));
+#else
   test<<<1, 1>>>(thrust::raw_pointer_cast(t.data()));
+#endif
   a = t[0];
   EXPECT_EQ(a[0], 5);
   EXPECT_EQ(a[1], 6);
@@ -55,7 +60,12 @@ TEST(Dim, Equality) {
   // dynamic access on GPU
   thrust::device_vector<int64_t> r(1);
+#ifdef PADDLE_WITH_HIP
+  hipLaunchKernelGGL(dyn_idx_gpu, dim3(1), dim3(1), 0, 0,
+                     thrust::raw_pointer_cast(r.data()));
+#else
   dyn_idx_gpu<<<1, 1>>>(thrust::raw_pointer_cast(r.data()));
+#endif
   int64_t res = r[0];
   EXPECT_EQ(res, 6);
 }
...
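For reference, hipLaunchKernelGGL(kernel, gridDim, blockDim, sharedMemBytes, stream, args...) is the HIP launch macro used here in place of CUDA's kernel<<<grid, block>>>(args...) syntax. A minimal sketch of the same switch with an illustrative kernel and wrapper function (neither is part of the patch):

```cpp
// Sketch only: fill_one and LaunchFillOne are illustrative names.
__global__ void fill_one(int* out) { *out = 1; }

void LaunchFillOne(int* device_ptr) {
#ifdef PADDLE_WITH_HIP
  // kernel, grid dims, block dims, dynamic shared memory bytes, stream, args...
  hipLaunchKernelGGL(fill_one, dim3(1), dim3(1), 0, 0, device_ptr);
#else
  fill_one<<<1, 1>>>(device_ptr);
#endif
}
```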
@@ -83,7 +83,7 @@ struct DLContextVisitor : public boost::static_visitor<::DLContext> {
   }
   inline ::DLContext operator()(const platform::CUDAPlace &place) const {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     ::DLContext ctx;
     ctx.device_type = kDLGPU;
     ctx.device_id = place.device;
@@ -95,7 +95,7 @@ struct DLContextVisitor : public boost::static_visitor<::DLContext> {
   }
   inline ::DLContext operator()(const platform::CUDAPinnedPlace &place) const {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     ::DLContext ctx;
     ctx.device_type = kDLCPUPinned;
     ctx.device_id = 0;
...
@@ -103,7 +103,7 @@ void TestToCudfCompatibleDLManagedTensor(const platform::Place &place,
 template <typename T>
 void TestMainLoop() {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   std::vector<platform::Place> places{platform::CPUPlace(),
                                       platform::CUDAPlace(0),
                                       platform::CUDAPinnedPlace()};
...
@@ -431,7 +431,7 @@ void Executor::RunPartialPreparedContext(ExecutorPrepareContext* ctx,
   std::unique_ptr<GarbageCollector> gc;
   if (!ctx->force_disable_gc_ && max_memory_size >= 0) {
     if (platform::is_gpu_place(place_)) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
       if (IsFastEagerDeletionModeEnabled()) {
         gc.reset(new UnsafeFastGPUGarbageCollector(
             BOOST_GET_CONST(platform::CUDAPlace, place_), max_memory_size));
...
@@ -25,7 +25,7 @@ namespace paddle {
 namespace framework {
 const std::shared_ptr<Generator>& GetDefaultCUDAGenerator(int64_t device_id) {
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   static int64_t num_cuda_devices = -1;
   static std::once_flag num_devices_init_flag;
@@ -157,7 +157,7 @@ uint64_t Generator::Random64() {
 std::pair<uint64_t, uint64_t> Generator::IncrementOffset(
     uint64_t increament_offset) {
   uint64_t cur_offset = this->state_.thread_offset;
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   std::lock_guard<std::mutex> lock(this->mu_);
   this->state_.thread_offset += increament_offset;
...