Unverified commit c8fac5ee, authored by Qi Li, committed by GitHub

[ROCM] update fluid framework for rocm (part5), test=develop (#31014)

Parent 580447d0
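Every hunk below applies the same ROCm-porting pattern: `#ifdef PADDLE_WITH_CUDA` guards widen to `#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)`, NCCL guards gain a `PADDLE_WITH_RCCL` arm, and CUDA-specific handle types give way to the backend-neutral `gpuStream_t`/`gpuEvent_t` aliases used throughout the diff. A minimal sketch of what such aliases look like (a hypothetical standalone header; Paddle's actual definitions live in its platform layer, not in this diff):

```cpp
// Sketch only: backend-neutral handle aliases of the kind the hunks below
// rely on. The alias names match the diff; the header itself is hypothetical.
#if defined(PADDLE_WITH_HIP)
#include <hip/hip_runtime.h>
using gpuStream_t = hipStream_t;
using gpuEvent_t = hipEvent_t;
using gpuError_t = hipError_t;
#elif defined(PADDLE_WITH_CUDA)
#include <cuda_runtime.h>
using gpuStream_t = cudaStream_t;
using gpuEvent_t = cudaEvent_t;
using gpuError_t = cudaError_t;
#endif
```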
@@ -13,7 +13,7 @@
// limitations under the License.
#include <functional>
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/fluid/platform/cuda_device_guard.h"
#endif
#include "gflags/gflags.h"
@@ -53,7 +53,7 @@ void XPUGarbageCollector::ClearCallback(const std::function<void()> &callback) {
}
#endif
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
UnsafeFastGPUGarbageCollector::UnsafeFastGPUGarbageCollector(
const platform::CUDAPlace &place, size_t max_memory_size)
: GarbageCollector(place, max_memory_size) {}
@@ -82,18 +82,27 @@ StreamGarbageCollector::StreamGarbageCollector(const platform::CUDAPlace &place,
size_t max_memory_size)
: GarbageCollector(place, max_memory_size) {
platform::CUDADeviceGuard guard(place.device);
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamCreate(&stream_));
#else
PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&stream_));
#endif
callback_manager_.reset(new platform::StreamCallbackManager(stream_));
}
StreamGarbageCollector::~StreamGarbageCollector() {
auto place = BOOST_GET_CONST(platform::CUDAPlace, this->dev_ctx_->GetPlace());
platform::CUDADeviceGuard guard(place.device);
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream_));
PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamDestroy(stream_));
#else
PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream_));
PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamDestroy(stream_));
#endif
}
cudaStream_t StreamGarbageCollector::stream() const { return stream_; }
gpuStream_t StreamGarbageCollector::stream() const { return stream_; }
void StreamGarbageCollector::Wait() const { callback_manager_->Wait(); }
......
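The garbage-collector hunks above follow one template: branch on `PADDLE_WITH_HIP` only for the runtime call, and keep the shared `PADDLE_ENFORCE_CUDA_SUCCESS` wrapper, which evidently accepts `hipError_t` results as well, given the hip calls it wraps above. A self-contained sketch of the same stream lifecycle, with plain `assert` standing in for Paddle's enforce macro:

```cpp
#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
#else
#include <cuda_runtime.h>
#endif
#include <cassert>

// Create, synchronize, and destroy a stream with the same #ifdef shape the
// StreamGarbageCollector constructor and destructor use above.
void StreamLifecycleDemo() {
#ifdef PADDLE_WITH_HIP
  hipStream_t stream;
  assert(hipStreamCreate(&stream) == hipSuccess);
  assert(hipStreamSynchronize(stream) == hipSuccess);
  assert(hipStreamDestroy(stream) == hipSuccess);
#else
  cudaStream_t stream;
  assert(cudaStreamCreate(&stream) == cudaSuccess);
  assert(cudaStreamSynchronize(stream) == cudaSuccess);
  assert(cudaStreamDestroy(stream) == cudaSuccess);
#endif
}
```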
@@ -80,7 +80,7 @@ class XPUGarbageCollector : public GarbageCollector {
};
#endif
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
class UnsafeFastGPUGarbageCollector : public GarbageCollector {
public:
UnsafeFastGPUGarbageCollector(const platform::CUDAPlace &place,
@@ -110,13 +110,13 @@ class StreamGarbageCollector : public GarbageCollector {
void Wait() const override;
cudaStream_t stream() const;
gpuStream_t stream() const;
protected:
void ClearCallback(const std::function<void()> &callback) override;
private:
cudaStream_t stream_;
gpuStream_t stream_;
std::unique_ptr<platform::StreamCallbackManager> callback_manager_;
};
......
@@ -152,7 +152,7 @@ class HeterObjectPool {
std::lock_guard<std::mutex> lock(mutex_);
if (pool_.empty()) {
num_ += 1;
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
VLOG(0) << "pool construct size: " << num_;
#endif
return std::make_shared<T>();
......
@@ -21,9 +21,10 @@ limitations under the License. */
#include "paddle/fluid/framework/device_worker_factory.h"
#include "paddle/fluid/framework/fleet/fleet_wrapper.h"
#include "paddle/fluid/framework/trainer.h"
#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_XPU) && \
#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_HIP || \
defined PADDLE_WITH_XPU) && \
(defined PADDLE_WITH_PSLIB)
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/fluid/platform/cuda_device_guard.h"
#endif
namespace paddle {
@@ -48,16 +49,25 @@ void HeterBoxTrainer::Initialize(const TrainerDesc& trainer_desc,
dataset->GetReaders();
for (int i = 0; i < place_num; ++i) {
int num = trainer_desc.worker_places(i);
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
platform::CUDAPlace place = platform::CUDAPlace(num);
platform::CUDADeviceGuard guard(place.device);
cudaStream_t stream;
gpuStream_t stream;
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamCreate(&stream));
#else
PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&stream));
#endif
copy_streams_.push_back(stream);
places_.push_back(place);
cudaEvent_t event;
gpuEvent_t event;
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS(
hipEventCreateWithFlags(&event, hipEventDisableTiming));
#else
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
#endif
events_.push_back(event);
#endif
#ifdef PADDLE_WITH_XPU
@@ -140,8 +150,13 @@ void HeterBoxTrainer::InitTrainerEnv(const ProgramDesc& main_program,
_ForEachDataType_(HeterMemcpyFunc);
}
}
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS(hipEventRecord(event, stream));
hipEventSynchronize(event);
#else
PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(event, stream));
cudaEventSynchronize(event);
#endif
}
place_ = place;
}
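InitTrainerEnv above fences its host-to-device parameter copies by recording an event on the copy stream and blocking on it. A self-contained sketch of that record-then-wait idiom (the helper name is illustrative, not a Paddle API; the local `gpuStream_t` alias matches the sketch near the top):

```cpp
#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
using gpuStream_t = hipStream_t;
#else
#include <cuda_runtime.h>
using gpuStream_t = cudaStream_t;
#endif

// Record a timing-disabled event on `stream`, then block the host until all
// work queued before it has finished; the same fence as in the hunk above.
void FenceStream(gpuStream_t stream) {
#ifdef PADDLE_WITH_HIP
  hipEvent_t e;
  hipEventCreateWithFlags(&e, hipEventDisableTiming);
  hipEventRecord(e, stream);
  hipEventSynchronize(e);
  hipEventDestroy(e);
#else
  cudaEvent_t e;
  cudaEventCreateWithFlags(&e, cudaEventDisableTiming);
  cudaEventRecord(e, stream);
  cudaEventSynchronize(e);
  cudaEventDestroy(e);
#endif
}
```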
@@ -150,7 +165,7 @@ template <typename T>
void HeterBoxTrainer::HeterMemCpy(LoDTensor* thread_tensor,
LoDTensor* root_tensor,
const paddle::platform::Place& thread_place,
cudaStream_t stream) {
gpuStream_t stream) {
T* thread_ptr =
thread_tensor->mutable_data<T>(root_tensor->dims(), thread_place);
T* root_ptr = root_tensor->data<T>();
@@ -171,7 +186,7 @@ void HeterBoxTrainer::InitOtherEnv(const ProgramDesc& main_program) {
for (size_t i = 0; i < places_.size(); ++i) {
pull_dense_worker_->AddThreadScope(workers_[i]->GetThreadScope());
pull_dense_worker_->AddPlace(places_[i]);
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
pull_dense_worker_->AddStream(copy_streams_[i]);
#endif
}
......
@@ -18,7 +18,7 @@ limitations under the License. */
#include <string>
#include <utility>
#include <vector>
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#endif
......
@@ -12,8 +12,6 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include <cuda.h>
#include <cuda_runtime.h>
#include <stdio.h>
#include "gtest/gtest.h"
@@ -34,8 +32,14 @@ TEST(LoD, data) {
auto& v = lod[0];
paddle::platform::CUDAPlace gpu(0);
#ifdef PADDLE_WITH_HIP
hipLaunchKernelGGL(test, dim3(1), dim3(1), 0, 0, v.CUDAMutableData(gpu),
v.size());
hipDeviceSynchronize();
#else
test<<<1, 1>>>(v.CUDAMutableData(gpu), v.size());
cudaDeviceSynchronize();
#endif
for (size_t i = 0; i < v.size(); ++i) {
EXPECT_EQ(v[i], i * 2);
}
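The LoD tests replace the triple-chevron launch with `hipLaunchKernelGGL`, whose argument order is kernel, grid, block, dynamic shared memory bytes, stream, then the kernel's own parameters. A minimal sketch of the mapping (hypothetical kernel and wrapper names):

```cpp
#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
#else
#include <cuda_runtime.h>
#endif

// Doubles n integers in place; stands in for the `test` kernel above.
__global__ void scale_by_two(int* data, size_t n) {
  for (size_t i = 0; i < n; ++i) data[i] *= 2;
}

void LaunchScaleByTwo(int* device_ptr, size_t n) {
#ifdef PADDLE_WITH_HIP
  // grid/block become explicit dim3 arguments; 0, 0 are shared mem and stream.
  hipLaunchKernelGGL(scale_by_two, dim3(1), dim3(1), 0, 0, device_ptr, n);
  hipDeviceSynchronize();
#else
  scale_by_two<<<1, 1>>>(device_ptr, n);
  cudaDeviceSynchronize();
#endif
}
```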
@@ -59,8 +63,14 @@ TEST(LoDTensor, LoDInGPU) {
auto lod = lod_tensor.lod();
#ifdef PADDLE_WITH_HIP
hipLaunchKernelGGL(test, dim3(1), dim3(8), 0, 0,
lod[0].CUDAMutableData(place), lod[0].size());
hipDeviceSynchronize();
#else
test<<<1, 8>>>(lod[0].CUDAMutableData(place), lod[0].size());
cudaDeviceSynchronize();
#endif
for (size_t i = 0; i < src_lod[0].size(); ++i) {
EXPECT_EQ(lod[0].data()[i], src_lod[0].data()[i] * 2);
......
@@ -31,7 +31,7 @@ limitations under the License. */
namespace paddle {
namespace framework {
#if defined(PADDLE_WITH_CUDA)
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
// Vector<T> implements the std::vector interface, and can get Data or
// MutableData from any place. The data will be synced implicitly inside.
template <typename T>
......
@@ -12,7 +12,13 @@
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_CUDA
#include <cuda_runtime.h>
#endif
#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
#endif
#include <memory>
#include "glog/logging.h"
@@ -22,6 +28,7 @@
template <typename T>
using vec = paddle::framework::Vector<T>;
using gpuStream_t = paddle::gpuStream_t;
static __global__ void multiply_10(int* ptr) {
for (int i = 0; i < 10; ++i) {
@@ -29,7 +36,7 @@ static __global__ void multiply_10(int* ptr) {
}
}
cudaStream_t GetCUDAStream(paddle::platform::CUDAPlace place) {
gpuStream_t GetCUDAStream(paddle::platform::CUDAPlace place) {
return reinterpret_cast<const paddle::platform::CUDADeviceContext*>(
paddle::platform::DeviceContextPool::Instance().Get(place))
->stream();
@@ -43,7 +50,12 @@ TEST(mixed_vector, GPU_VECTOR) {
ASSERT_EQ(tmp.size(), 10UL);
paddle::platform::CUDAPlace gpu(0);
#ifdef PADDLE_WITH_HIP
hipLaunchKernelGGL(multiply_10, dim3(1), dim3(1), 0, GetCUDAStream(gpu),
tmp.MutableData(gpu));
#else
multiply_10<<<1, 1, 0, GetCUDAStream(gpu)>>>(tmp.MutableData(gpu));
#endif
for (int i = 0; i < 10; ++i) {
ASSERT_EQ(tmp[i], i * 10);
@@ -64,11 +76,23 @@ TEST(mixed_vector, MultiGPU) {
ASSERT_EQ(tmp.size(), 10UL);
paddle::platform::CUDAPlace gpu0(0);
paddle::platform::SetDeviceId(0);
#ifdef PADDLE_WITH_HIP
hipLaunchKernelGGL(multiply_10, dim3(1), dim3(1), 0, GetCUDAStream(gpu0),
tmp.MutableData(gpu0));
#else
multiply_10<<<1, 1, 0, GetCUDAStream(gpu0)>>>(tmp.MutableData(gpu0));
#endif
paddle::platform::CUDAPlace gpu1(1);
auto* gpu1_ptr = tmp.MutableData(gpu1);
paddle::platform::SetDeviceId(1);
#ifdef PADDLE_WITH_HIP
hipLaunchKernelGGL(multiply_10, dim3(1), dim3(1), 0, GetCUDAStream(gpu1),
gpu1_ptr);
#else
multiply_10<<<1, 1, 0, GetCUDAStream(gpu1)>>>(gpu1_ptr);
#endif
for (int i = 0; i < 10; ++i) {
ASSERT_EQ(tmp[i], i * 100);
}
......
@@ -369,7 +369,7 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I,
// TODO(fengjiayi): The following macros
// seems ugly, do we have better method?
#ifndef PADDLE_WITH_CUDA
#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP)
#define USE_OP_KERNEL(op_type) USE_OP_DEVICE_KERNEL(op_type, CPU)
#else
#define USE_OP_KERNEL(op_type) \
......
@@ -193,7 +193,7 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
try {
VLOG(4) << place << " " << DebugStringEx(&scope);
if (platform::is_gpu_place(place)) {
#ifndef PADDLE_WITH_CUDA
#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP)
PADDLE_THROW(platform::errors::Unavailable(
"Cannot run operator on place %s, please recompile paddle or "
"reinstall Paddle with CUDA support.",
@@ -1166,6 +1166,10 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
#if defined(PADDLE_WITH_CUDA)
PADDLE_ENFORCE_CUDA_SUCCESS(cudaGetLastError());
VLOG(4) << "Operator(" << Type() << "): context wait and get last error";
#endif
#if defined(PADDLE_WITH_HIP)
PADDLE_ENFORCE_CUDA_SUCCESS(hipGetLastError());
VLOG(4) << "Operator(" << Type() << "): context wait and get last error";
#endif
}
......
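OperatorWithKernel::RunImpl now drains the last asynchronous error on either backend after a kernel runs. A hedged sketch of the same check folded into one helper (the name is illustrative, not a Paddle API):

```cpp
#if defined(PADDLE_WITH_HIP)
#include <hip/hip_runtime.h>
#elif defined(PADDLE_WITH_CUDA)
#include <cuda_runtime.h>
#endif

// Return a printable description of the most recent asynchronous GPU error,
// clearing it in the process, as the paired #if blocks above do.
const char* LastGpuErrorString() {
#if defined(PADDLE_WITH_HIP)
  return hipGetErrorString(hipGetLastError());
#elif defined(PADDLE_WITH_CUDA)
  return cudaGetErrorString(cudaGetLastError());
#else
  return "no GPU backend compiled in";
#endif
}
```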
@@ -384,7 +384,7 @@ class ExecutionContext {
return device_context_;
}
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
const inline platform::CUDADeviceContext& cuda_device_context() const {
PADDLE_ENFORCE_EQ(platform::is_gpu_place(device_context_.GetPlace()), true,
platform::errors::PreconditionNotMet(
......
@@ -37,7 +37,7 @@ limitations under the License. */
#include "paddle/fluid/platform/event.h"
#include "paddle/fluid/platform/profiler.h"
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/fluid/platform/cuda_device_guard.h"
#endif
@@ -60,7 +60,7 @@ static std::once_flag gProfileOnce;
static bool gProfileStarted = false;
#endif
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
std::once_flag p2p_init_flag;
#endif
@@ -132,7 +132,7 @@ class ParallelExecutorPrivate {
}
}
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
void InitNCCLCtxs(framework::Scope *scope, const BuildStrategy &bst) {
VLOG(1) << "nccl comm num:" << bst.nccl_comm_num_ << ", nranks:" << nranks_
<< ", num_trainers:" << bst.num_trainers_
@@ -371,7 +371,7 @@ class ParallelExecutorPrivate {
std::unordered_map<std::string, bool> is_persistable_;
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
platform::NCCLCommunicator *nccl_ctxs_{nullptr};
#elif defined(PADDLE_WITH_XPU_BKCL)
platform::BKCLCommunicator *bkcl_ctxs_{nullptr};
@@ -483,7 +483,7 @@ ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) {
}
std::unique_ptr<GarbageCollector> gc;
if (platform::is_gpu_place(place)) {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (IsFastEagerDeletionModeEnabled()) {
gc.reset(new UnsafeFastGPUGarbageCollector(
BOOST_GET_CONST(platform::CUDAPlace, place), max_memory_size));
@@ -572,7 +572,7 @@ bool ParallelExecutor::NeedCreateLocalExeScope() {
}
void InitP2P(const std::vector<platform::Place> &places) {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
std::call_once(p2p_init_flag, [&]() {
int count = places.size();
if (count <= 1) return;
@@ -590,14 +590,24 @@ void InitP2P(const std::vector<platform::Place> &places) {
for (int j = 0; j < count; ++j) {
if (devices[i] == devices[j]) continue;
int can_acess = -1;
#ifdef PADDLE_WITH_HIP
hipError_t ret =
hipDeviceCanAccessPeer(&can_acess, devices[i], devices[j]);
if (ret != hipSuccess || can_acess != 1) {
#else
cudaError_t ret =
cudaDeviceCanAccessPeer(&can_acess, devices[i], devices[j]);
if (ret != cudaSuccess || can_acess != 1) {
#endif
LOG(WARNING) << "Cannot enable P2P access from " << devices[i]
<< " to " << devices[j];
} else {
platform::CUDADeviceGuard guard(devices[i]);
#ifdef PADDLE_WITH_HIP
hipDeviceEnablePeerAccess(devices[j], 0);
#else
cudaDeviceEnablePeerAccess(devices[j], 0);
#endif
}
}
}
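InitP2P probes every ordered device pair and enables peer access where the topology allows it. A standalone sketch of the per-pair sequence (CUDA spelling shown; `hipDeviceCanAccessPeer`/`hipDeviceEnablePeerAccess` mirror it one-for-one, as the hunk above shows):

```cpp
#include <cuda_runtime.h>

// Try to let `from` read `to`'s memory directly; returns false when the
// hardware cannot, matching the warning path in InitP2P above.
bool TryEnablePeerAccess(int from, int to) {
  int can_access = 0;
  if (cudaDeviceCanAccessPeer(&can_access, from, to) != cudaSuccess ||
      can_access != 1) {
    return false;
  }
  cudaSetDevice(from);                // the enable call applies to the current device
  cudaDeviceEnablePeerAccess(to, 0);  // flags are reserved and must be 0
  return true;
}
```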
@@ -630,7 +640,7 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
BuildStrategy::ReduceStrategy::kAllReduce;
member_->use_all_reduce_ = true;
}
#if defined(PADDLE_WITH_CUDA) && defined(_WIN32)
#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && defined(_WIN32)
if (member_->IsUseCUDA(member_->use_device_)) {
PADDLE_ENFORCE_EQ(
places.size(), 1,
@@ -638,7 +648,8 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
}
#endif
#if defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_NCCL)
#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && \
(!defined(PADDLE_WITH_NCCL) && !defined(PADDLE_WITH_RCCL))
if (member_->IsUseCUDA(member_->use_device_)) {
PADDLE_ENFORCE_EQ(
places.size(), 1,
@@ -710,7 +721,7 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
}
if (member_->IsUseCUDA(member_->use_device_) && member_->nranks_ > 1) {
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
member_->InitOrGetNCCLCommunicator(scope, &member_->build_strategy_);
// Initialize device context's nccl comm, will be used by normal
@@ -774,7 +785,7 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
// Step 2. Convert main_program to SSA form and dependency graph. Also, insert
// ncclOp
std::vector<ir::Graph *> async_graphs(places.size());
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
if (member_->build_strategy_.async_mode_) {
VLOG(3) << "use local async mode";
graph = member_->build_strategy_.Apply(
@@ -885,7 +896,7 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
final_graphs = async_graphs;
} else if (member_->build_strategy_.enable_parallel_graph_) {
VLOG(3) << "use ParallelSSAGraphExecutor";
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
// TODO(Yancey1989): Remove passing in the main_program when
// allreduce_seq_pass doesn't need it as the attr.
bool is_inference = details::IsDataParallelInferenceGraph(*graph);
@@ -996,7 +1007,7 @@ void ParallelExecutor::BCastParamsToDevices(
}
auto &dims = main_tensor.dims();
if (paddle::platform::is_gpu_place(main_tensor.place())) {
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
std::vector<void *> buffers;
buffers.reserve(member_->places_.size());
size_t numel = main_tensor.numel();
......
@@ -32,7 +32,7 @@ limitations under the License. */
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/device_context.h"
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
#include "paddle/fluid/platform/nccl_helper.h"
#endif
......
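The NCCL guards widen to RCCL rather than being rewritten because RCCL keeps NCCL's `nccl*` API on ROCm; only the header and build macro differ. A small sketch of that assumption (header paths vary by install; the helper is illustrative, not Paddle code):

```cpp
#if defined(PADDLE_WITH_RCCL)
#include <rccl.h>  // ships NCCL-compatible symbols on ROCm
#elif defined(PADDLE_WITH_NCCL)
#include <nccl.h>
#endif

// Identical call under either library.
ncclUniqueId MakeCommId() {
  ncclUniqueId id;
  ncclGetUniqueId(&id);
  return id;
}
```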
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
#include "paddle/fluid/framework/data_feed_factory.h"
#include "paddle/fluid/framework/device_worker_factory.h"
#include "paddle/fluid/framework/trainer.h"
......
@@ -24,7 +24,8 @@ limitations under the License. */
#include "paddle/fluid/framework/fleet/heter_ps/feature_value.h"
#include "paddle/fluid/framework/fleet/ps_gpu_wrapper.h"
#include "paddle/fluid/framework/trainer.h"
#if (defined PADDLE_WITH_NCCL) && (defined PADDLE_WITH_PSLIB)
#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \
(defined PADDLE_WITH_PSLIB)
#include "paddle/fluid/platform/cuda_device_guard.h"
namespace paddle {
......
@@ -19,7 +19,8 @@ limitations under the License. */
#include "paddle/fluid/platform/cpu_helper.h"
#include "paddle/fluid/string/string_helper.h"
#if (defined PADDLE_WITH_NCCL) && (defined PADDLE_WITH_PSLIB)
#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \
(defined PADDLE_WITH_PSLIB)
#include "paddle/fluid/platform/cuda_device_guard.h"
#if defined _WIN32 || defined __APPLE__
......
@@ -59,17 +59,19 @@ void PullDenseWorker::Initialize(const TrainerDesc& param) {
current_version_[tid] = 0;
}
fleet_ptr_ = FleetWrapper::GetInstance();
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
copy_streams_.clear();
#endif
#if (defined PADDLE_WITH_CUDA) || (defined PADDLE_WITH_XPU)
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
defined(PADDLE_WITH_XPU)
places_.clear();
thread_scopes_.clear();
#endif
}
void PullDenseWorker::CreatePinVar() {
#if (defined PADDLE_WITH_CUDA) || (defined PADDLE_WITH_XPU)
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
defined(PADDLE_WITH_XPU)
// for (auto& v : dense_value_names_) {
// for (auto& name : v.second) {
for (int i = 0; i < dwp_param_.program_config(0).pull_dense_table_id_size();
@@ -84,7 +86,7 @@ void PullDenseWorker::CreatePinVar() {
auto* ptr = root_scope_->Var(name + "pin");
InitializeVariable(ptr, proto::VarType::LOD_TENSOR);
LoDTensor* pin_tensor = ptr->GetMutable<LoDTensor>();
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
pin_tensor->mutable_data<float>(tensor->dims(),
platform::CUDAPinnedPlace());
#endif
@@ -113,7 +115,8 @@ void PullDenseWorker::Wait(std::vector<::std::future<int32_t>>* status_vec) {
exit(-1);
}
status_vec->resize(0);
#if (defined PADDLE_WITH_CUDA) || (defined PADDLE_WITH_XPU)
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
defined(PADDLE_WITH_XPU)
for (size_t i = 0; i < places_.size(); ++i) {
// for (auto& v : dense_value_names_) {
@@ -131,7 +134,7 @@ void PullDenseWorker::Wait(std::vector<::std::future<int32_t>>* status_vec) {
Variable* var = thread_scopes_[i]->FindVar(name);
LoDTensor* tensor = var->GetMutable<LoDTensor>();
float* w = tensor->data<float>();
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, places_[i]), w,
platform::CUDAPinnedPlace(), pin_w,
sizeof(float) * tensor->numel(), copy_streams_[i]);
@@ -161,7 +164,8 @@ void PullDenseWorker::PullDense(bool force_update) {
uint64_t tid = static_cast<uint64_t>(
dwp_param_.program_config(0).pull_dense_table_id(i));
if (force_update || CheckUpdateParam(tid)) {
#if (defined PADDLE_WITH_CUDA) || (defined PADDLE_WITH_XPU)
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
defined(PADDLE_WITH_XPU)
VLOG(3) << "pull dense " << force_update << " " << tid;
fleet_ptr_->PullDenseVarsAsync(*root_scope_, tid, dense_value_names_[tid],
&pull_dense_status_, false);
......
@@ -297,7 +297,7 @@ bool SaveTensorToDisk(const std::string& file_name,
tensor->numel() * framework::SizeOfType(tensor->type());
auto* data_ptr = tensor->data<void>();
if (platform::is_gpu_place(tensor->place())) {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
framework::Tensor temp;
TensorCopySync(*tensor, platform::CPUPlace(), &temp);
data_ptr = temp.data<void>();
......
@@ -9,7 +9,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#if defined(PADDLE_WITH_NCCL)
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
#include <float.h>
#include "paddle/fluid/framework/device_worker.h"
#include "paddle/fluid/framework/executor_gc_helper.h"
@@ -38,7 +38,7 @@ void SectionWorker::TrainFiles() {
std::unique_ptr<GarbageCollector> gc;
auto unused_vars_ = GetUnusedVars(program_->Block(0), ops_, skip_vars_);
if (max_memory_size >= 0) {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (platform::is_gpu_place(place_)) {
if (IsFastEagerDeletionModeEnabled()) {
gc.reset(new UnsafeFastGPUGarbageCollector(
@@ -70,7 +70,11 @@ void SectionWorker::TrainFiles() {
}
}
}
#ifdef PADDLE_WITH_RCCL
hipDeviceSynchronize();
#else
cudaDeviceSynchronize();
#endif
}
// backward pass
@@ -89,7 +93,11 @@ void SectionWorker::TrainFiles() {
}
}
}
#ifdef PADDLE_WITH_RCCL
hipDeviceSynchronize();
#else
cudaDeviceSynchronize();
#endif
}
// update pass
......
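SectionWorker synchronizes the whole device between its forward, backward, and update passes; note the guard there is `PADDLE_WITH_RCCL`, since the file is only compiled under the NCCL/RCCL guard at its top. A tiny portable wrapper capturing that shape (illustrative name, not a Paddle API):

```cpp
#ifdef PADDLE_WITH_RCCL
#include <hip/hip_runtime.h>
#else
#include <cuda_runtime.h>
#endif

// Block the host until all queued work on the current device has finished,
// mirroring the per-pass barrier in SectionWorker::TrainFiles above.
inline void GpuDeviceSynchronize() {
#ifdef PADDLE_WITH_RCCL
  hipDeviceSynchronize();
#else
  cudaDeviceSynchronize();
#endif
}
```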