未验证 提交 1d996637 编写于 作者: Qi Li 提交者: GitHub

[ROCM] update fluid imperative for rocm (part1), test=develop (#31017)

* [ROCM] update fluid imperative for rocm (part1), test=develop

* [ROCM] update reducer.cc after merge, test=develop

* update reducer cmake after merge, test=develop
上级 b95eb38b
...@@ -9,11 +9,16 @@ cc_library(basic_engine SRCS basic_engine.cc DEPS layer gradient_accumulator) ...@@ -9,11 +9,16 @@ cc_library(basic_engine SRCS basic_engine.cc DEPS layer gradient_accumulator)
cc_library(engine SRCS basic_engine.cc partial_grad_engine.cc DEPS layer gradient_accumulator) cc_library(engine SRCS basic_engine.cc partial_grad_engine.cc DEPS layer gradient_accumulator)
cc_library(imperative_profiler SRCS profiler.cc) cc_library(imperative_profiler SRCS profiler.cc)
if(NOT WIN32) if(NOT WIN32)
if(WITH_NCCL) if(WITH_NCCL OR WITH_RCCL)
cc_library(imperative_all_reduce SRCS all_reduce.cc DEPS collective_helper device_context selected_rows tensor) cc_library(imperative_all_reduce SRCS all_reduce.cc DEPS collective_helper device_context selected_rows tensor)
cc_library(nccl_context SRCS nccl_context.cc DEPS collective_helper device_context imperative_all_reduce var_type_traits) cc_library(nccl_context SRCS nccl_context.cc DEPS collective_helper device_context imperative_all_reduce var_type_traits)
if(WITH_NCCL)
nv_library(reducer SRCS reducer.cc reducer.cu DEPS layer imperative_all_reduce) nv_library(reducer SRCS reducer.cc reducer.cu DEPS layer imperative_all_reduce)
endif() endif()
if(WITH_RCCL)
hip_library(reducer SRCS reducer.cc reducer.cu DEPS layer imperative_all_reduce)
endif()
endif()
if(WITH_XPU_BKCL) if(WITH_XPU_BKCL)
cc_library(bkcl_context SRCS bkcl_context.cc DEPS collective_helper device_context tensor var_type_traits) cc_library(bkcl_context SRCS bkcl_context.cc DEPS collective_helper device_context tensor var_type_traits)
cc_library(reducer SRCS reducer.cc DEPS layer) cc_library(reducer SRCS reducer.cc DEPS layer)
......
...@@ -12,11 +12,17 @@ ...@@ -12,11 +12,17 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#ifdef PADDLE_WITH_NCCL #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
#include "paddle/fluid/imperative/all_reduce.h" #include "paddle/fluid/imperative/all_reduce.h"
#ifdef PADDLE_WITH_NCCL
#include <nccl.h> #include <nccl.h>
#endif
#ifdef PADDLE_WITH_RCCL
#include <rccl.h>
#endif
#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/selected_rows.h"
...@@ -46,7 +52,7 @@ static const platform::Place &GetVarPlace(const framework::Variable &src) { ...@@ -46,7 +52,7 @@ static const platform::Place &GetVarPlace(const framework::Variable &src) {
} }
static void AllReduce(const framework::Tensor &src, framework::Tensor *dst, static void AllReduce(const framework::Tensor &src, framework::Tensor *dst,
const cudaStream_t stream, const gpuStream_t stream,
const platform::NCCLComm *comm) { const platform::NCCLComm *comm) {
const auto &place = src.place(); const auto &place = src.place();
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
...@@ -67,7 +73,7 @@ static void AllReduce(const framework::Tensor &src, framework::Tensor *dst, ...@@ -67,7 +73,7 @@ static void AllReduce(const framework::Tensor &src, framework::Tensor *dst,
static void AllReduce(const framework::SelectedRows &src, static void AllReduce(const framework::SelectedRows &src,
framework::SelectedRows *dst, framework::SelectedRows *dst,
const ParallelStrategy &strategy, const ParallelStrategy &strategy,
const cudaStream_t stream, const gpuStream_t stream,
const platform::NCCLComm *comm) { const platform::NCCLComm *comm) {
VLOG(3) << "SelectedRows AllReduce start"; VLOG(3) << "SelectedRows AllReduce start";
const auto &src_tensor = src.value(); const auto &src_tensor = src.value();
...@@ -99,7 +105,11 @@ static void AllReduce(const framework::SelectedRows &src, ...@@ -99,7 +105,11 @@ static void AllReduce(const framework::SelectedRows &src,
comm->comm(), stream)); comm->comm(), stream));
if (!use_calc_stream) { if (!use_calc_stream) {
#ifdef PADDLE_WITH_RCCL
PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream));
#else
PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream)); PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream));
#endif
} }
const auto *cpu_rows_num_ptr = rows_num_vector.data(); const auto *cpu_rows_num_ptr = rows_num_vector.data();
...@@ -176,7 +186,7 @@ void AllReduce(const framework::Variable &src, framework::Variable *dst, ...@@ -176,7 +186,7 @@ void AllReduce(const framework::Variable &src, framework::Variable *dst,
platform::DeviceContextPool::Instance().Get(place)); platform::DeviceContextPool::Instance().Get(place));
platform::NCCLComm *comm = platform::NCCLComm *comm =
platform::NCCLCommContext::Instance().Get(ring_id, place); platform::NCCLCommContext::Instance().Get(ring_id, place);
cudaStream_t stream = (use_calc_stream ? dev_ctx->stream() : comm->stream()); gpuStream_t stream = (use_calc_stream ? dev_ctx->stream() : comm->stream());
if (src.IsType<framework::LoDTensor>()) { if (src.IsType<framework::LoDTensor>()) {
if (!dst->IsType<framework::LoDTensor>()) { if (!dst->IsType<framework::LoDTensor>()) {
...@@ -199,8 +209,12 @@ void AllReduce(const framework::Variable &src, framework::Variable *dst, ...@@ -199,8 +209,12 @@ void AllReduce(const framework::Variable &src, framework::Variable *dst,
AllReduce(src.Get<framework::SelectedRows>(), AllReduce(src.Get<framework::SelectedRows>(),
tmp_dst.GetMutable<framework::SelectedRows>(), strategy, stream, tmp_dst.GetMutable<framework::SelectedRows>(), strategy, stream,
comm); comm);
// stream must synchronize to ensure accuracy of the move operation // stream must synchronize to ensure accuracy of the move operation
#ifdef PADDLE_WITH_RCCL
PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream));
#else
PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream)); PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream));
#endif
*dst = std::move(tmp_dst); *dst = std::move(tmp_dst);
} }
#endif #endif
......
...@@ -14,7 +14,7 @@ ...@@ -14,7 +14,7 @@
#pragma once #pragma once
#ifdef PADDLE_WITH_NCCL #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
namespace paddle { namespace paddle {
namespace framework { namespace framework {
......
...@@ -99,7 +99,7 @@ class TensorAddFunctor : public boost::static_visitor<> { ...@@ -99,7 +99,7 @@ class TensorAddFunctor : public boost::static_visitor<> {
} }
#endif #endif
#ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
void operator()(const platform::CUDAPlace& place) { void operator()(const platform::CUDAPlace& place) {
platform::CUDADeviceContext* ctx = platform::CUDADeviceContext* ctx =
dynamic_cast<platform::CUDADeviceContext*>( dynamic_cast<platform::CUDADeviceContext*>(
...@@ -186,7 +186,7 @@ void TensorAdd(const framework::Variable& src, framework::Variable* dst) { ...@@ -186,7 +186,7 @@ void TensorAdd(const framework::Variable& src, framework::Variable* dst) {
if (data_type == framework::proto::VarType::FP16) { if (data_type == framework::proto::VarType::FP16) {
if (platform::is_gpu_place(place)) { if (platform::is_gpu_place(place)) {
#ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
return TensorAddImpl<platform::CUDADeviceContext, platform::float16>( return TensorAddImpl<platform::CUDADeviceContext, platform::float16>(
src_tensor, dst_tensor, place); src_tensor, dst_tensor, place);
#else #else
...@@ -224,7 +224,7 @@ void SelectedRowsAddToTensor(const framework::Variable& src, ...@@ -224,7 +224,7 @@ void SelectedRowsAddToTensor(const framework::Variable& src,
return; \ return; \
} }
#ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (paddle::platform::is_gpu_place(place)) { if (paddle::platform::is_gpu_place(place)) {
PADDLE_SELECTED_ROWS_ADD_TO_TENSOR(platform::CUDADeviceContext, float); PADDLE_SELECTED_ROWS_ADD_TO_TENSOR(platform::CUDADeviceContext, float);
PADDLE_SELECTED_ROWS_ADD_TO_TENSOR(platform::CUDADeviceContext, double); PADDLE_SELECTED_ROWS_ADD_TO_TENSOR(platform::CUDADeviceContext, double);
...@@ -232,7 +232,7 @@ void SelectedRowsAddToTensor(const framework::Variable& src, ...@@ -232,7 +232,7 @@ void SelectedRowsAddToTensor(const framework::Variable& src,
#endif #endif
PADDLE_SELECTED_ROWS_ADD_TO_TENSOR(platform::CPUDeviceContext, float); PADDLE_SELECTED_ROWS_ADD_TO_TENSOR(platform::CPUDeviceContext, float);
PADDLE_SELECTED_ROWS_ADD_TO_TENSOR(platform::CPUDeviceContext, double); PADDLE_SELECTED_ROWS_ADD_TO_TENSOR(platform::CPUDeviceContext, double);
#ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
} }
#endif #endif
...@@ -267,7 +267,7 @@ static void SelectedRowsAddTensor( ...@@ -267,7 +267,7 @@ static void SelectedRowsAddTensor(
return; \ return; \
} }
#ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (platform::is_gpu_place(place)) { if (platform::is_gpu_place(place)) {
PADDLE_SELECTED_ROWS_ADD_TENSOR(platform::CUDADeviceContext, float); PADDLE_SELECTED_ROWS_ADD_TENSOR(platform::CUDADeviceContext, float);
PADDLE_SELECTED_ROWS_ADD_TENSOR(platform::CUDADeviceContext, double); PADDLE_SELECTED_ROWS_ADD_TENSOR(platform::CUDADeviceContext, double);
...@@ -275,7 +275,7 @@ static void SelectedRowsAddTensor( ...@@ -275,7 +275,7 @@ static void SelectedRowsAddTensor(
#endif #endif
PADDLE_SELECTED_ROWS_ADD_TENSOR(platform::CPUDeviceContext, float); PADDLE_SELECTED_ROWS_ADD_TENSOR(platform::CPUDeviceContext, float);
PADDLE_SELECTED_ROWS_ADD_TENSOR(platform::CPUDeviceContext, double); PADDLE_SELECTED_ROWS_ADD_TENSOR(platform::CPUDeviceContext, double);
#ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
} }
#endif #endif
...@@ -314,7 +314,7 @@ std::shared_ptr<VariableWrapper> SelectedRowsMerge( ...@@ -314,7 +314,7 @@ std::shared_ptr<VariableWrapper> SelectedRowsMerge(
return dst_var; \ return dst_var; \
} }
#ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (paddle::platform::is_gpu_place(place)) { if (paddle::platform::is_gpu_place(place)) {
PADDLE_SELECTED_ROWS_ADD(platform::CUDADeviceContext, float); PADDLE_SELECTED_ROWS_ADD(platform::CUDADeviceContext, float);
PADDLE_SELECTED_ROWS_ADD(platform::CUDADeviceContext, double); PADDLE_SELECTED_ROWS_ADD(platform::CUDADeviceContext, double);
...@@ -322,7 +322,7 @@ std::shared_ptr<VariableWrapper> SelectedRowsMerge( ...@@ -322,7 +322,7 @@ std::shared_ptr<VariableWrapper> SelectedRowsMerge(
#endif #endif
PADDLE_SELECTED_ROWS_ADD(platform::CPUDeviceContext, float); PADDLE_SELECTED_ROWS_ADD(platform::CPUDeviceContext, float);
PADDLE_SELECTED_ROWS_ADD(platform::CPUDeviceContext, double); PADDLE_SELECTED_ROWS_ADD(platform::CPUDeviceContext, double);
#ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
} }
#endif #endif
...@@ -518,7 +518,7 @@ void SortedGradientAccumulator::SumGrad(std::shared_ptr<VariableWrapper> var, ...@@ -518,7 +518,7 @@ void SortedGradientAccumulator::SumGrad(std::shared_ptr<VariableWrapper> var,
} }
} }
#ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (paddle::platform::is_gpu_place(place)) { if (paddle::platform::is_gpu_place(place)) {
// sum selected rows firstly // sum selected rows firstly
for (auto& var_info : tmp_grad_vars_) { for (auto& var_info : tmp_grad_vars_) {
...@@ -579,7 +579,7 @@ void SortedGradientAccumulator::SumGrad(std::shared_ptr<VariableWrapper> var, ...@@ -579,7 +579,7 @@ void SortedGradientAccumulator::SumGrad(std::shared_ptr<VariableWrapper> var,
// Increase count // Increase count
IncreaseCurCnt(); IncreaseCurCnt();
} }
#ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
} }
#endif #endif
tmp_grad_vars_.clear(); tmp_grad_vars_.clear();
......
...@@ -14,7 +14,7 @@ ...@@ -14,7 +14,7 @@
#include "paddle/fluid/imperative/nccl_context.h" #include "paddle/fluid/imperative/nccl_context.h"
#if defined(PADDLE_WITH_NCCL) #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
#include "paddle/fluid/imperative/all_reduce.h" #include "paddle/fluid/imperative/all_reduce.h"
#include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/gen_comm_id_helper.h" #include "paddle/fluid/platform/gen_comm_id_helper.h"
...@@ -31,7 +31,7 @@ class Variable; ...@@ -31,7 +31,7 @@ class Variable;
namespace paddle { namespace paddle {
namespace imperative { namespace imperative {
#if defined(PADDLE_WITH_NCCL) #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
void NCCLParallelContext::BcastNCCLId( void NCCLParallelContext::BcastNCCLId(
std::vector<ncclUniqueId> &nccl_ids, // NOLINT std::vector<ncclUniqueId> &nccl_ids, // NOLINT
...@@ -113,9 +113,14 @@ void NCCLParallelContext::WaitCompute(int ring_id) { ...@@ -113,9 +113,14 @@ void NCCLParallelContext::WaitCompute(int ring_id) {
platform::NCCLCommContext::Instance().Get(ring_id, place_)->stream(); platform::NCCLCommContext::Instance().Get(ring_id, place_)->stream();
auto event = compute_events_[ring_id].get(); auto event = compute_events_[ring_id].get();
// compute_stream-->event-->comm_stream // compute_stream-->event-->comm_stream
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS(hipEventRecord(event, compute_stream));
PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamWaitEvent(comm_stream, event, 0));
#else
PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(event, compute_stream)); PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(event, compute_stream));
PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamWaitEvent(comm_stream, event, 0)); PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamWaitEvent(comm_stream, event, 0));
#endif
} }
void NCCLParallelContext::WaitComm(int ring_id) { void NCCLParallelContext::WaitComm(int ring_id) {
...@@ -134,9 +139,14 @@ void NCCLParallelContext::WaitComm(int ring_id) { ...@@ -134,9 +139,14 @@ void NCCLParallelContext::WaitComm(int ring_id) {
platform::NCCLCommContext::Instance().Get(ring_id, place_)->stream(); platform::NCCLCommContext::Instance().Get(ring_id, place_)->stream();
auto event = comm_events_[ring_id].get(); auto event = comm_events_[ring_id].get();
// comm_stream-->event-->compute_stream // comm_stream-->event-->compute_stream
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS(hipEventRecord(event, comm_stream));
PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamWaitEvent(compute_stream, event, 0));
#else
PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(event, comm_stream)); PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(event, comm_stream));
PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamWaitEvent(compute_stream, event, 0)); PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamWaitEvent(compute_stream, event, 0));
#endif
} }
#endif #endif
......
...@@ -17,11 +17,18 @@ ...@@ -17,11 +17,18 @@
#include <string> #include <string>
#include <vector> #include <vector>
#if defined(PADDLE_WITH_NCCL) #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
#include "paddle/fluid/platform/cuda_resource_pool.h" #include "paddle/fluid/platform/cuda_resource_pool.h"
#endif
#ifdef PADDLE_WITH_NCCL
#include "paddle/fluid/platform/dynload/nccl.h" #include "paddle/fluid/platform/dynload/nccl.h"
#endif #endif
#ifdef PADDLE_WITH_RCCL
#include "paddle/fluid/platform/dynload/rccl.h"
#endif
#include "paddle/fluid/imperative/parallel_context.h" #include "paddle/fluid/imperative/parallel_context.h"
namespace paddle { namespace paddle {
...@@ -33,7 +40,7 @@ class Variable; ...@@ -33,7 +40,7 @@ class Variable;
namespace paddle { namespace paddle {
namespace imperative { namespace imperative {
#if defined(PADDLE_WITH_NCCL) #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
class NCCLParallelContext : public ParallelContext { class NCCLParallelContext : public ParallelContext {
public: public:
explicit NCCLParallelContext(const ParallelStrategy& strategy, explicit NCCLParallelContext(const ParallelStrategy& strategy,
......
...@@ -27,7 +27,8 @@ ...@@ -27,7 +27,8 @@
namespace paddle { namespace paddle {
namespace imperative { namespace imperative {
#if (defined PADDLE_WITH_NCCL) || (defined PADDLE_WITH_XPU_BKCL) #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
defined(PADDLE_WITH_XPU_BKCL)
// div the nranks // div the nranks
void Group::DivNRanks(const platform::DeviceContext &context, int64_t nranks) { void Group::DivNRanks(const platform::DeviceContext &context, int64_t nranks) {
framework::Tensor *tensor = framework::Tensor *tensor =
...@@ -37,7 +38,7 @@ void Group::DivNRanks(const platform::DeviceContext &context, int64_t nranks) { ...@@ -37,7 +38,7 @@ void Group::DivNRanks(const platform::DeviceContext &context, int64_t nranks) {
: dense_contents_.GetMutable<framework::LoDTensor>(); : dense_contents_.GetMutable<framework::LoDTensor>();
if (platform::is_gpu_place(tensor->place())) { if (platform::is_gpu_place(tensor->place())) {
#if defined(PADDLE_WITH_NCCL) #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
DivNRanks(tensor, nranks, context); DivNRanks(tensor, nranks, context);
#endif #endif
} else if (platform::is_cpu_place(tensor->place())) { } else if (platform::is_cpu_place(tensor->place())) {
...@@ -206,7 +207,7 @@ void SplitTensorsWithType<platform::XPUDeviceContext>( ...@@ -206,7 +207,7 @@ void SplitTensorsWithType<platform::XPUDeviceContext>(
void Group::ConcatTensors(const platform::DeviceContext &context) { void Group::ConcatTensors(const platform::DeviceContext &context) {
auto place = context.GetPlace(); auto place = context.GetPlace();
if (platform::is_gpu_place(place)) { if (platform::is_gpu_place(place)) {
#ifdef PADDLE_WITH_NCCL #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
ConcatTensorsWithType( ConcatTensorsWithType(
static_cast<const platform::CUDADeviceContext &>(context), static_cast<const platform::CUDADeviceContext &>(context),
dense_tensors_, &dense_contents_, dtype_); dense_tensors_, &dense_contents_, dtype_);
...@@ -238,7 +239,7 @@ void Group::ConcatTensors(const platform::DeviceContext &context) { ...@@ -238,7 +239,7 @@ void Group::ConcatTensors(const platform::DeviceContext &context) {
void Group::SplitTensors(const platform::DeviceContext &context) { void Group::SplitTensors(const platform::DeviceContext &context) {
auto place = context.GetPlace(); auto place = context.GetPlace();
if (platform::is_gpu_place(place)) { if (platform::is_gpu_place(place)) {
#ifdef PADDLE_WITH_NCCL #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
SplitTensorsWithType( SplitTensorsWithType(
static_cast<const platform::CUDADeviceContext &>(context), static_cast<const platform::CUDADeviceContext &>(context),
&dense_contents_, &dense_tensors_, dtype_); &dense_contents_, &dense_tensors_, dtype_);
......
...@@ -17,7 +17,7 @@ ...@@ -17,7 +17,7 @@
namespace paddle { namespace paddle {
namespace imperative { namespace imperative {
#if defined(PADDLE_WITH_NCCL) #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
void Group::DivNRanks(framework::Tensor *tensor, int64_t nranks, void Group::DivNRanks(framework::Tensor *tensor, int64_t nranks,
const platform::DeviceContext &context) { const platform::DeviceContext &context) {
framework::VisitDataTypeSmall( framework::VisitDataTypeSmall(
......
...@@ -47,7 +47,8 @@ class VariableWrapper; ...@@ -47,7 +47,8 @@ class VariableWrapper;
namespace paddle { namespace paddle {
namespace imperative { namespace imperative {
#if (defined PADDLE_WITH_NCCL) || (defined PADDLE_WITH_XPU_BKCL) #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
defined(PADDLE_WITH_XPU_BKCL)
template <typename T> template <typename T>
struct DivNRanksFunctor { struct DivNRanksFunctor {
......
if(WIN32) if(WIN32)
cc_test(nccl_context_test SRCS nccl_context_test.cc DEPS device_context) cc_test(nccl_context_test SRCS nccl_context_test.cc DEPS device_context)
else() else()
if (WITH_NCCL) if (WITH_NCCL OR WITH_RCCL)
cc_test(nccl_context_test SRCS nccl_context_test.cc DEPS nccl_context) cc_test(nccl_context_test SRCS nccl_context_test.cc DEPS nccl_context)
endif() endif()
if (WITH_XPU_BKCL) if (WITH_XPU_BKCL)
...@@ -16,6 +16,6 @@ cc_test(test_prepare_op SRCS test_prepare_op.cc DEPS prepared_operator op_info s ...@@ -16,6 +16,6 @@ cc_test(test_prepare_op SRCS test_prepare_op.cc DEPS prepared_operator op_info s
cc_test(test_tracer SRCS test_tracer.cc DEPS tracer layer proto_desc operator op_registry variable_helper mul_op reduce_sum_op elementwise_add_op memcpy) cc_test(test_tracer SRCS test_tracer.cc DEPS tracer layer proto_desc operator op_registry variable_helper mul_op reduce_sum_op elementwise_add_op memcpy)
cc_test(test_hooks SRCS test_hooks.cc DEPS tracer basic_engine layer proto_desc operator op_registry variable_helper mul_op elementwise_add_op memcpy) cc_test(test_hooks SRCS test_hooks.cc DEPS tracer basic_engine layer proto_desc operator op_registry variable_helper mul_op elementwise_add_op memcpy)
if (WITH_NCCL OR WITH_XPU_BKCL) if (WITH_NCCL OR WITH_RCCL OR WITH_XPU_BKCL)
cc_test(test_group SRCS test_group.cc DEPS reducer concat_and_split memcpy) cc_test(test_group SRCS test_group.cc DEPS reducer concat_and_split memcpy)
endif() endif()
...@@ -33,7 +33,7 @@ imperative::ParallelStrategy GetStrategy(int local_rank) { ...@@ -33,7 +33,7 @@ imperative::ParallelStrategy GetStrategy(int local_rank) {
return strategy; return strategy;
} }
#if defined(PADDLE_WITH_NCCL) #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
void BcastNCCLId(int local_rank, std::vector<ncclUniqueId>* nccl_ids) { void BcastNCCLId(int local_rank, std::vector<ncclUniqueId>* nccl_ids) {
auto strategy = GetStrategy(local_rank); auto strategy = GetStrategy(local_rank);
platform::CUDAPlace gpu(local_rank); platform::CUDAPlace gpu(local_rank);
......
...@@ -53,7 +53,7 @@ int TensorddTest(Place place, T t1, T t2) { ...@@ -53,7 +53,7 @@ int TensorddTest(Place place, T t1, T t2) {
sizeof(T) * src_data.size()); sizeof(T) * src_data.size());
paddle::memory::Copy(place, dst_mutable, src_place, dst_data.data(), paddle::memory::Copy(place, dst_mutable, src_place, dst_data.data(),
sizeof(T) * dst_data.size()); sizeof(T) * dst_data.size());
#if defined(PADDLE_WITH_CUDA) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
} else { } else {
paddle::memory::Copy(place, src_mutable, src_place, src_data.data(), paddle::memory::Copy(place, src_mutable, src_place, src_data.data(),
sizeof(T) * src_data.size(), 0); sizeof(T) * src_data.size(), 0);
...@@ -74,7 +74,7 @@ int TensorddTest(Place place, T t1, T t2) { ...@@ -74,7 +74,7 @@ int TensorddTest(Place place, T t1, T t2) {
} }
TEST(test_add_functor, add_functor) { TEST(test_add_functor, add_functor) {
#if defined(PADDLE_WITH_CUDA) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
platform::CUDAPlace gpu_place(0); platform::CUDAPlace gpu_place(0);
#endif #endif
platform::CPUPlace cpu_place; platform::CPUPlace cpu_place;
...@@ -88,7 +88,7 @@ TEST(test_add_functor, add_functor) { ...@@ -88,7 +88,7 @@ TEST(test_add_functor, add_functor) {
cpu_res = TensorddTest(cpu_place, static_cast<platform::float16>(1.0), cpu_res = TensorddTest(cpu_place, static_cast<platform::float16>(1.0),
static_cast<platform::float16>(2.0)); static_cast<platform::float16>(2.0));
EXPECT_EQ(cpu_res, 0); EXPECT_EQ(cpu_res, 0);
#if defined(PADDLE_WITH_CUDA) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
int gpu_res = 1; int gpu_res = 1;
gpu_res = TensorddTest(gpu_place, 1.0, 0.0); gpu_res = TensorddTest(gpu_place, 1.0, 0.0);
EXPECT_EQ(gpu_res, 0); EXPECT_EQ(gpu_res, 0);
...@@ -107,7 +107,7 @@ TEST(test_add_functor, execption) { ...@@ -107,7 +107,7 @@ TEST(test_add_functor, execption) {
platform::CPUPlace cpu_place; platform::CPUPlace cpu_place;
ASSERT_ANY_THROW(TensorddTest(cpu_place, 1, 0)); ASSERT_ANY_THROW(TensorddTest(cpu_place, 1, 0));
#if defined(PADDLE_WITH_CUDA) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
ASSERT_ANY_THROW(TensorddTest(cuda_pinned_place, 1.0, 0.0)); ASSERT_ANY_THROW(TensorddTest(cuda_pinned_place, 1.0, 0.0));
ASSERT_ANY_THROW(TensorddTest(cuda_pinned_place, ASSERT_ANY_THROW(TensorddTest(cuda_pinned_place,
static_cast<platform::float16>(1.0), static_cast<platform::float16>(1.0),
...@@ -358,7 +358,7 @@ TEST(test_gradient_accumulator, test_unchange_input) { ...@@ -358,7 +358,7 @@ TEST(test_gradient_accumulator, test_unchange_input) {
for (auto sort_gradient : {false, true}) { for (auto sort_gradient : {false, true}) {
TestGradientAccumulatorTestUnchangeInput(platform::CPUPlace(), TestGradientAccumulatorTestUnchangeInput(platform::CPUPlace(),
sort_gradient); sort_gradient);
#ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
TestGradientAccumulatorTestUnchangeInput(platform::CUDAPlace(0), TestGradientAccumulatorTestUnchangeInput(platform::CUDAPlace(0),
sort_gradient); sort_gradient);
#endif #endif
......
...@@ -73,7 +73,7 @@ void GroupConcatSplit(Place place, size_t size) { ...@@ -73,7 +73,7 @@ void GroupConcatSplit(Place place, size_t size) {
} }
if (std::is_same<Place, platform::CUDAPlace>::value) { if (std::is_same<Place, platform::CUDAPlace>::value) {
#if defined(PADDLE_WITH_NCCL) #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
paddle::memory::Copy(place, data, cpu_place, value.data(), paddle::memory::Copy(place, data, cpu_place, value.data(),
sizeof(T) * value.size(), 0); sizeof(T) * value.size(), 0);
#endif #endif
...@@ -133,7 +133,7 @@ void GroupConcatSplit(Place place, size_t size) { ...@@ -133,7 +133,7 @@ void GroupConcatSplit(Place place, size_t size) {
} }
} }
#if defined(PADDLE_WITH_NCCL) #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
TEST(TestGroup, TestConcatSplit) { TEST(TestGroup, TestConcatSplit) {
platform::CUDAPlace cuda_place(0); platform::CUDAPlace cuda_place(0);
platform::CPUPlace cpu_place; platform::CPUPlace cpu_place;
......
...@@ -106,7 +106,7 @@ TEST(test_prepare_op, test_get_tensor_from_var) { ...@@ -106,7 +106,7 @@ TEST(test_prepare_op, test_get_tensor_from_var) {
ASSERT_TRUE(ts != nullptr); ASSERT_TRUE(ts != nullptr);
} }
#if defined(PADDLE_WITH_CUDA) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
TEST(test_prepare_op, test_prepare_data) { TEST(test_prepare_op, test_prepare_data) {
std::shared_ptr<imperative::VarBase> vin( std::shared_ptr<imperative::VarBase> vin(
new imperative::VarBase(false, "vin")); new imperative::VarBase(false, "vin"));
......
...@@ -195,7 +195,7 @@ TEST(test_tracer, test_track_backward_input) { ...@@ -195,7 +195,7 @@ TEST(test_tracer, test_track_backward_input) {
ASSERT_EQ(y_in->GradVarBase()->GradOpNum(), 0UL); ASSERT_EQ(y_in->GradVarBase()->GradOpNum(), 0UL);
ASSERT_EQ(vout->GradVarBase()->GradOpNum(), 1UL); ASSERT_EQ(vout->GradVarBase()->GradOpNum(), 1UL);
} }
#if defined(PADDLE_WITH_CUDA) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
TEST(test_tracer, test_trace_op_with_multi_device_inputs) { TEST(test_tracer, test_trace_op_with_multi_device_inputs) {
// Doing an mul // Doing an mul
imperative::Tracer tracer; imperative::Tracer tracer;
...@@ -521,7 +521,7 @@ static void TestVarOpDestructionMain(const platform::Place& place, ...@@ -521,7 +521,7 @@ static void TestVarOpDestructionMain(const platform::Place& place,
TEST(test_tracer, test_var_op_destruction) { TEST(test_tracer, test_var_op_destruction) {
TestVarOpDestructionMain(platform::CPUPlace()); TestVarOpDestructionMain(platform::CPUPlace());
#ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
TestVarOpDestructionMain(platform::CUDAPlace(0)); TestVarOpDestructionMain(platform::CUDAPlace(0));
#endif #endif
} }
......
...@@ -201,7 +201,7 @@ void Tracer::TraceOp(const std::string& type, const NameVarBaseMap& ins, ...@@ -201,7 +201,7 @@ void Tracer::TraceOp(const std::string& type, const NameVarBaseMap& ins,
void Tracer::SetExpectedPlace(platform::Place place) { void Tracer::SetExpectedPlace(platform::Place place) {
// NOTE(wangxi): set device id before launch device kernel // NOTE(wangxi): set device id before launch device kernel
if (platform::is_gpu_place(place)) { if (platform::is_gpu_place(place)) {
#ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
platform::SetDeviceId(BOOST_GET_CONST(platform::CUDAPlace, place).device); platform::SetDeviceId(BOOST_GET_CONST(platform::CUDAPlace, place).device);
#else #else
PADDLE_THROW(platform::errors::PreconditionNotMet( PADDLE_THROW(platform::errors::PreconditionNotMet(
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册