未验证 提交 69875dc4 编写于 作者: Q Qi Li 提交者: GitHub

[ROCM] update fluid memory for rocm35 (part1), test=develop (#30758)

上级 5b1ab51c
...@@ -19,6 +19,12 @@ if (WITH_GPU) ...@@ -19,6 +19,12 @@ if (WITH_GPU)
DEPS device_context malloc) DEPS device_context malloc)
endif() endif()
if (WITH_ROCM)
hip_test(malloc_test
SRCS malloc_test.cu
DEPS device_context malloc)
endif()
#if (WITH_GPU) #if (WITH_GPU)
# nv_test(pinned_memory_test SRCS pinned_memory_test.cu DEPS place memory) # nv_test(pinned_memory_test SRCS pinned_memory_test.cu DEPS place memory)
#endif() #endif()
...@@ -31,7 +31,7 @@ ...@@ -31,7 +31,7 @@
#include "paddle/fluid/platform/cpu_info.h" #include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/place.h"
#ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/fluid/memory/allocation/cuda_allocator.h" #include "paddle/fluid/memory/allocation/cuda_allocator.h"
#include "paddle/fluid/memory/allocation/pinned_allocator.h" #include "paddle/fluid/memory/allocation/pinned_allocator.h"
#include "paddle/fluid/memory/allocation/thread_local_allocator.h" #include "paddle/fluid/memory/allocation/thread_local_allocator.h"
...@@ -70,7 +70,7 @@ class AllocatorFacadePrivate { ...@@ -70,7 +70,7 @@ class AllocatorFacadePrivate {
InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id)); InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id));
} }
#endif #endif
#ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount(); for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount();
++dev_id) { ++dev_id) {
InitNaiveBestFitCUDAAllocator(platform::CUDAPlace(dev_id)); InitNaiveBestFitCUDAAllocator(platform::CUDAPlace(dev_id));
...@@ -87,7 +87,7 @@ class AllocatorFacadePrivate { ...@@ -87,7 +87,7 @@ class AllocatorFacadePrivate {
InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id)); InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id));
} }
#endif #endif
#ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount(); for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount();
++dev_id) { ++dev_id) {
InitAutoGrowthCUDAAllocator(platform::CUDAPlace(dev_id)); InitAutoGrowthCUDAAllocator(platform::CUDAPlace(dev_id));
...@@ -104,7 +104,7 @@ class AllocatorFacadePrivate { ...@@ -104,7 +104,7 @@ class AllocatorFacadePrivate {
InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id)); InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id));
} }
#endif #endif
#ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount(); for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount();
++dev_id) { ++dev_id) {
InitThreadLocalCUDAAllocator(platform::CUDAPlace(dev_id)); InitThreadLocalCUDAAllocator(platform::CUDAPlace(dev_id));
...@@ -152,7 +152,7 @@ class AllocatorFacadePrivate { ...@@ -152,7 +152,7 @@ class AllocatorFacadePrivate {
system_allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p); system_allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p);
} }
#endif #endif
#ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
system_allocators_[platform::CUDAPinnedPlace()] = system_allocators_[platform::CUDAPinnedPlace()] =
std::make_shared<CPUPinnedAllocator>(); std::make_shared<CPUPinnedAllocator>();
int device_count = platform::GetCUDADeviceCount(); int device_count = platform::GetCUDADeviceCount();
...@@ -168,7 +168,7 @@ class AllocatorFacadePrivate { ...@@ -168,7 +168,7 @@ class AllocatorFacadePrivate {
std::make_shared<NaiveBestFitAllocator>(platform::CPUPlace()); std::make_shared<NaiveBestFitAllocator>(platform::CPUPlace());
} }
#ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
void InitNaiveBestFitCUDAPinnedAllocator() { void InitNaiveBestFitCUDAPinnedAllocator() {
allocators_[platform::CUDAPinnedPlace()] = allocators_[platform::CUDAPinnedPlace()] =
std::make_shared<NaiveBestFitAllocator>(platform::CUDAPinnedPlace()); std::make_shared<NaiveBestFitAllocator>(platform::CUDAPinnedPlace());
...@@ -215,7 +215,7 @@ class AllocatorFacadePrivate { ...@@ -215,7 +215,7 @@ class AllocatorFacadePrivate {
void InitZeroSizeAllocators() { void InitZeroSizeAllocators() {
std::vector<platform::Place> places; std::vector<platform::Place> places;
places.emplace_back(platform::CPUPlace()); places.emplace_back(platform::CPUPlace());
#ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
int device_count = platform::GetCUDADeviceCount(); int device_count = platform::GetCUDADeviceCount();
for (int dev_id = 0; dev_id < device_count; ++dev_id) { for (int dev_id = 0; dev_id < device_count; ++dev_id) {
places.emplace_back(platform::CUDAPlace(dev_id)); places.emplace_back(platform::CUDAPlace(dev_id));
......
...@@ -16,7 +16,7 @@ ...@@ -16,7 +16,7 @@
#include "paddle/fluid/memory/allocation/allocator_facade.h" #include "paddle/fluid/memory/allocation/allocator_facade.h"
#ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
DECLARE_double(fraction_of_gpu_memory_to_use); DECLARE_double(fraction_of_gpu_memory_to_use);
DECLARE_double(fraction_of_cuda_pinned_memory_to_use); DECLARE_double(fraction_of_cuda_pinned_memory_to_use);
DECLARE_uint64(initial_gpu_memory_in_mb); DECLARE_uint64(initial_gpu_memory_in_mb);
...@@ -45,7 +45,7 @@ void AllocateTestCases() { ...@@ -45,7 +45,7 @@ void AllocateTestCases() {
ASSERT_EQ(cpu_allocation->size(), size); ASSERT_EQ(cpu_allocation->size(), size);
} }
#ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
{ {
place = platform::CUDAPlace(0); place = platform::CUDAPlace(0);
size = 1024; size = 1024;
...@@ -81,7 +81,7 @@ void AllocateTestCases() { ...@@ -81,7 +81,7 @@ void AllocateTestCases() {
} }
TEST(Allocator, SpecifyGpuMemory) { TEST(Allocator, SpecifyGpuMemory) {
#ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
// Set to 0.0 to test FLAGS_initial_gpu_memory_in_mb and // Set to 0.0 to test FLAGS_initial_gpu_memory_in_mb and
// FLAGS_reallocate_gpu_memory_in_mb // FLAGS_reallocate_gpu_memory_in_mb
FLAGS_fraction_of_gpu_memory_to_use = 0.0; FLAGS_fraction_of_gpu_memory_to_use = 0.0;
......
...@@ -16,7 +16,7 @@ ...@@ -16,7 +16,7 @@
#include "paddle/fluid/memory/allocation/allocator_facade.h" #include "paddle/fluid/memory/allocation/allocator_facade.h"
#ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
DECLARE_double(fraction_of_gpu_memory_to_use); DECLARE_double(fraction_of_gpu_memory_to_use);
DECLARE_double(fraction_of_cuda_pinned_memory_to_use); DECLARE_double(fraction_of_cuda_pinned_memory_to_use);
DECLARE_uint64(initial_gpu_memory_in_mb); DECLARE_uint64(initial_gpu_memory_in_mb);
...@@ -45,7 +45,7 @@ void AllocateTestCases() { ...@@ -45,7 +45,7 @@ void AllocateTestCases() {
ASSERT_EQ(cpu_allocation->size(), size); ASSERT_EQ(cpu_allocation->size(), size);
} }
#ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
{ {
place = platform::CUDAPlace(0); place = platform::CUDAPlace(0);
size = 1024; size = 1024;
...@@ -81,7 +81,7 @@ void AllocateTestCases() { ...@@ -81,7 +81,7 @@ void AllocateTestCases() {
} }
TEST(Allocator, Allocator) { TEST(Allocator, Allocator) {
#ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
FLAGS_fraction_of_gpu_memory_to_use = 0.01; FLAGS_fraction_of_gpu_memory_to_use = 0.01;
FLAGS_gpu_allocator_retry_time = 500; FLAGS_gpu_allocator_retry_time = 500;
FLAGS_fraction_of_cuda_pinned_memory_to_use = 0.5; FLAGS_fraction_of_cuda_pinned_memory_to_use = 0.5;
......
...@@ -22,7 +22,7 @@ ...@@ -22,7 +22,7 @@
#include "paddle/fluid/memory/allocation/allocator_facade.h" #include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/fluid/platform/gpu_info.h" #include "paddle/fluid/platform/gpu_info.h"
#ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
DECLARE_double(fraction_of_gpu_memory_to_use); DECLARE_double(fraction_of_gpu_memory_to_use);
DECLARE_double(fraction_of_cuda_pinned_memory_to_use); DECLARE_double(fraction_of_cuda_pinned_memory_to_use);
DECLARE_int64(gpu_allocator_retry_time); DECLARE_int64(gpu_allocator_retry_time);
...@@ -40,7 +40,7 @@ static inline size_t AlignTo(size_t size, size_t alignment) { ...@@ -40,7 +40,7 @@ static inline size_t AlignTo(size_t size, size_t alignment) {
} }
TEST(allocator, allocator) { TEST(allocator, allocator) {
#ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
FLAGS_fraction_of_gpu_memory_to_use = 0.01; FLAGS_fraction_of_gpu_memory_to_use = 0.01;
FLAGS_gpu_allocator_retry_time = 500; FLAGS_gpu_allocator_retry_time = 500;
FLAGS_fraction_of_cuda_pinned_memory_to_use = 0.5; FLAGS_fraction_of_cuda_pinned_memory_to_use = 0.5;
...@@ -62,7 +62,7 @@ TEST(allocator, allocator) { ...@@ -62,7 +62,7 @@ TEST(allocator, allocator) {
ASSERT_EQ(cpu_allocation->size(), AlignedSize(size, 1024)); ASSERT_EQ(cpu_allocation->size(), AlignedSize(size, 1024));
} }
#ifdef PADDLE_WITH_CUDA #if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_HIP)
{ {
place = platform::CUDAPlace(0); place = platform::CUDAPlace(0);
size = 1024; size = 1024;
...@@ -101,7 +101,7 @@ TEST(allocator, allocator) { ...@@ -101,7 +101,7 @@ TEST(allocator, allocator) {
TEST(multithread_allocate, test_segfault) { TEST(multithread_allocate, test_segfault) {
FLAGS_allocator_strategy = "auto_growth"; FLAGS_allocator_strategy = "auto_growth";
#ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
std::mutex mtx; std::mutex mtx;
std::condition_variable cv; std::condition_variable cv;
bool flag = false; bool flag = false;
......
...@@ -13,8 +13,16 @@ ...@@ -13,8 +13,16 @@
// limitations under the License. // limitations under the License.
#include "paddle/fluid/memory/allocation/cuda_allocator.h" #include "paddle/fluid/memory/allocation/cuda_allocator.h"
#ifdef PADDLE_WITH_CUDA
#include <cuda.h> #include <cuda.h>
#include <cuda_runtime.h> #include <cuda_runtime.h>
#endif
#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
#endif
#include <string> #include <string>
#include "paddle/fluid/platform/cuda_device_guard.h" #include "paddle/fluid/platform/cuda_device_guard.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
...@@ -39,7 +47,7 @@ Allocation* CUDAAllocator::AllocateImpl(size_t size) { ...@@ -39,7 +47,7 @@ Allocation* CUDAAllocator::AllocateImpl(size_t size) {
void* ptr; void* ptr;
auto result = platform::RecordedCudaMalloc(&ptr, size, place_.device); auto result = platform::RecordedCudaMalloc(&ptr, size, place_.device);
if (LIKELY(result == cudaSuccess)) { if (LIKELY(result == gpuSuccess)) {
return new Allocation(ptr, size, platform::Place(place_)); return new Allocation(ptr, size, platform::Place(place_));
} }
......
...@@ -14,8 +14,6 @@ ...@@ -14,8 +14,6 @@
#pragma once #pragma once
#include <cuda_runtime.h>
#include <map> #include <map>
#include <memory> #include <memory>
#include <utility> #include <utility>
...@@ -79,17 +77,26 @@ class CUDADeviceContextAllocation : public Allocation { ...@@ -79,17 +77,26 @@ class CUDADeviceContextAllocation : public Allocation {
class CUDADeviceContextAllocator : public Allocator { class CUDADeviceContextAllocator : public Allocator {
public: public:
explicit CUDADeviceContextAllocator(platform::CUDAPlace place, explicit CUDADeviceContextAllocator(platform::CUDAPlace place,
cudaStream_t default_stream) gpuStream_t default_stream)
: place_(place), default_stream_(default_stream) { : place_(place), default_stream_(default_stream) {
platform::CUDADeviceGuard guard(place_.device); platform::CUDADeviceGuard guard(place_.device);
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS(
hipEventCreateWithFlags(&event_, hipEventDisableTiming));
#else
PADDLE_ENFORCE_CUDA_SUCCESS( PADDLE_ENFORCE_CUDA_SUCCESS(
cudaEventCreate(&event_, cudaEventDisableTiming)); cudaEventCreate(&event_, cudaEventDisableTiming));
#endif
} }
~CUDADeviceContextAllocator() { ~CUDADeviceContextAllocator() {
if (event_) { if (event_) {
platform::CUDADeviceGuard guard(place_.device); platform::CUDADeviceGuard guard(place_.device);
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS(hipEventDestroy(event_));
#else
PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(event_)); PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(event_));
#endif
} }
} }
...@@ -102,10 +109,15 @@ class CUDADeviceContextAllocator : public Allocator { ...@@ -102,10 +109,15 @@ class CUDADeviceContextAllocator : public Allocator {
platform::CUDADeviceGuard guard(place_.device); platform::CUDADeviceGuard guard(place_.device);
auto allocation = auto allocation =
new CUDADeviceContextAllocation(memory::Alloc(place_, size)); new CUDADeviceContextAllocation(memory::Alloc(place_, size));
// Wait for the event on stream // Wait for the event on stream
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS(hipEventRecord(event_, default_stream_));
PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamWaitEvent(default_stream_, event_, 0));
#else
PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(event_, default_stream_)); PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(event_, default_stream_));
PADDLE_ENFORCE_CUDA_SUCCESS( PADDLE_ENFORCE_CUDA_SUCCESS(
cudaStreamWaitEvent(default_stream_, event_, 0)); cudaStreamWaitEvent(default_stream_, event_, 0));
#endif
return allocation; return allocation;
} }
...@@ -113,8 +125,8 @@ class CUDADeviceContextAllocator : public Allocator { ...@@ -113,8 +125,8 @@ class CUDADeviceContextAllocator : public Allocator {
private: private:
platform::CUDAPlace place_; platform::CUDAPlace place_;
cudaEvent_t event_{nullptr}; gpuEvent_t event_{nullptr};
cudaStream_t default_stream_{nullptr}; gpuStream_t default_stream_{nullptr};
}; };
/** /**
......
...@@ -26,7 +26,7 @@ ...@@ -26,7 +26,7 @@
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/string/printf.h" #include "paddle/fluid/string/printf.h"
#include "paddle/fluid/string/split.h" #include "paddle/fluid/string/split.h"
#ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/fluid/platform/cuda_device_guard.h" #include "paddle/fluid/platform/cuda_device_guard.h"
#endif #endif
#ifdef PADDLE_WITH_XPU #ifdef PADDLE_WITH_XPU
...@@ -216,7 +216,7 @@ size_t Used<platform::XPUPlace>(const platform::XPUPlace &place) { ...@@ -216,7 +216,7 @@ size_t Used<platform::XPUPlace>(const platform::XPUPlace &place) {
#endif #endif
} }
#ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
class GPUBuddyAllocatorList { class GPUBuddyAllocatorList {
private: private:
GPUBuddyAllocatorList() : devices_(platform::GetSelectedDevices()) { GPUBuddyAllocatorList() : devices_(platform::GetSelectedDevices()) {
...@@ -283,7 +283,7 @@ BuddyAllocator *GetGPUBuddyAllocator(int gpu_id) { ...@@ -283,7 +283,7 @@ BuddyAllocator *GetGPUBuddyAllocator(int gpu_id) {
template <> template <>
size_t Used<platform::CUDAPlace>(const platform::CUDAPlace &place) { size_t Used<platform::CUDAPlace>(const platform::CUDAPlace &place) {
#ifdef PADDLE_WITH_CUDA #if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_HIP)
return GetGPUBuddyAllocator(place.device)->Used(); return GetGPUBuddyAllocator(place.device)->Used();
#else #else
PADDLE_THROW(platform::errors::PermissionDenied( PADDLE_THROW(platform::errors::PermissionDenied(
...@@ -294,7 +294,7 @@ size_t Used<platform::CUDAPlace>(const platform::CUDAPlace &place) { ...@@ -294,7 +294,7 @@ size_t Used<platform::CUDAPlace>(const platform::CUDAPlace &place) {
template <> template <>
void *Alloc<platform::CUDAPlace>(const platform::CUDAPlace &place, void *Alloc<platform::CUDAPlace>(const platform::CUDAPlace &place,
size_t size) { size_t size) {
#ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
auto *buddy_allocator = GetGPUBuddyAllocator(place.device); auto *buddy_allocator = GetGPUBuddyAllocator(place.device);
auto *ptr = buddy_allocator->Alloc(size); auto *ptr = buddy_allocator->Alloc(size);
if (ptr == nullptr) { if (ptr == nullptr) {
...@@ -311,7 +311,11 @@ void *Alloc<platform::CUDAPlace>(const platform::CUDAPlace &place, ...@@ -311,7 +311,11 @@ void *Alloc<platform::CUDAPlace>(const platform::CUDAPlace &place,
string::HumanReadableSize(Used<platform::CUDAPlace>(place)))); string::HumanReadableSize(Used<platform::CUDAPlace>(place))));
} else { } else {
if (FLAGS_init_allocated_mem) { if (FLAGS_init_allocated_mem) {
#ifdef PADDLE_WITH_HIP
hipMemset(ptr, 0xEF, size);
#else
cudaMemset(ptr, 0xEF, size); cudaMemset(ptr, 0xEF, size);
#endif
} }
} }
return ptr; return ptr;
...@@ -324,7 +328,7 @@ void *Alloc<platform::CUDAPlace>(const platform::CUDAPlace &place, ...@@ -324,7 +328,7 @@ void *Alloc<platform::CUDAPlace>(const platform::CUDAPlace &place,
template <> template <>
void Free<platform::CUDAPlace>(const platform::CUDAPlace &place, void *p, void Free<platform::CUDAPlace>(const platform::CUDAPlace &place, void *p,
size_t size) { size_t size) {
#ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
GetGPUBuddyAllocator(place.device)->Free(p); GetGPUBuddyAllocator(place.device)->Free(p);
#else #else
PADDLE_THROW(platform::errors::PermissionDenied( PADDLE_THROW(platform::errors::PermissionDenied(
...@@ -334,7 +338,7 @@ void Free<platform::CUDAPlace>(const platform::CUDAPlace &place, void *p, ...@@ -334,7 +338,7 @@ void Free<platform::CUDAPlace>(const platform::CUDAPlace &place, void *p,
template <> template <>
uint64_t Release<platform::CUDAPlace>(const platform::CUDAPlace &place) { uint64_t Release<platform::CUDAPlace>(const platform::CUDAPlace &place) {
#ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
return GetGPUBuddyAllocator(place.device)->Release(); return GetGPUBuddyAllocator(place.device)->Release();
#else #else
PADDLE_THROW(platform::errors::PermissionDenied( PADDLE_THROW(platform::errors::PermissionDenied(
...@@ -342,7 +346,7 @@ uint64_t Release<platform::CUDAPlace>(const platform::CUDAPlace &place) { ...@@ -342,7 +346,7 @@ uint64_t Release<platform::CUDAPlace>(const platform::CUDAPlace &place) {
#endif #endif
} }
#ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
BuddyAllocator *GetCUDAPinnedBuddyAllocator() { BuddyAllocator *GetCUDAPinnedBuddyAllocator() {
static std::once_flag init_flag; static std::once_flag init_flag;
static BuddyAllocator *ba = nullptr; static BuddyAllocator *ba = nullptr;
...@@ -360,7 +364,7 @@ BuddyAllocator *GetCUDAPinnedBuddyAllocator() { ...@@ -360,7 +364,7 @@ BuddyAllocator *GetCUDAPinnedBuddyAllocator() {
template <> template <>
size_t Used<platform::CUDAPinnedPlace>(const platform::CUDAPinnedPlace &place) { size_t Used<platform::CUDAPinnedPlace>(const platform::CUDAPinnedPlace &place) {
#ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
return GetCUDAPinnedBuddyAllocator()->Used(); return GetCUDAPinnedBuddyAllocator()->Used();
#else #else
PADDLE_THROW(platform::errors::PermissionDenied( PADDLE_THROW(platform::errors::PermissionDenied(
...@@ -371,7 +375,7 @@ size_t Used<platform::CUDAPinnedPlace>(const platform::CUDAPinnedPlace &place) { ...@@ -371,7 +375,7 @@ size_t Used<platform::CUDAPinnedPlace>(const platform::CUDAPinnedPlace &place) {
template <> template <>
void *Alloc<platform::CUDAPinnedPlace>(const platform::CUDAPinnedPlace &place, void *Alloc<platform::CUDAPinnedPlace>(const platform::CUDAPinnedPlace &place,
size_t size) { size_t size) {
#ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
auto *buddy_allocator = GetCUDAPinnedBuddyAllocator(); auto *buddy_allocator = GetCUDAPinnedBuddyAllocator();
void *ptr = buddy_allocator->Alloc(size); void *ptr = buddy_allocator->Alloc(size);
...@@ -392,7 +396,7 @@ void *Alloc<platform::CUDAPinnedPlace>(const platform::CUDAPinnedPlace &place, ...@@ -392,7 +396,7 @@ void *Alloc<platform::CUDAPinnedPlace>(const platform::CUDAPinnedPlace &place,
template <> template <>
void Free<platform::CUDAPinnedPlace>(const platform::CUDAPinnedPlace &place, void Free<platform::CUDAPinnedPlace>(const platform::CUDAPinnedPlace &place,
void *p, size_t size) { void *p, size_t size) {
#ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
GetCUDAPinnedBuddyAllocator()->Free(p); GetCUDAPinnedBuddyAllocator()->Free(p);
#else #else
PADDLE_THROW(platform::errors::PermissionDenied( PADDLE_THROW(platform::errors::PermissionDenied(
...@@ -403,7 +407,7 @@ void Free<platform::CUDAPinnedPlace>(const platform::CUDAPinnedPlace &place, ...@@ -403,7 +407,7 @@ void Free<platform::CUDAPinnedPlace>(const platform::CUDAPinnedPlace &place,
template <> template <>
uint64_t Release<platform::CUDAPinnedPlace>( uint64_t Release<platform::CUDAPinnedPlace>(
const platform::CUDAPinnedPlace &place) { const platform::CUDAPinnedPlace &place) {
#ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
return GetCUDAPinnedBuddyAllocator()->Release(); return GetCUDAPinnedBuddyAllocator()->Release();
#else #else
PADDLE_THROW(platform::errors::PermissionDenied( PADDLE_THROW(platform::errors::PermissionDenied(
...@@ -449,7 +453,7 @@ size_t Usage::operator()(const platform::CPUPlace &cpu) const { ...@@ -449,7 +453,7 @@ size_t Usage::operator()(const platform::CPUPlace &cpu) const {
} }
size_t Usage::operator()(const platform::CUDAPlace &gpu) const { size_t Usage::operator()(const platform::CUDAPlace &gpu) const {
#ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
return Used(gpu); return Used(gpu);
#else #else
PADDLE_THROW(platform::errors::PermissionDenied( PADDLE_THROW(platform::errors::PermissionDenied(
...@@ -458,7 +462,7 @@ size_t Usage::operator()(const platform::CUDAPlace &gpu) const { ...@@ -458,7 +462,7 @@ size_t Usage::operator()(const platform::CUDAPlace &gpu) const {
} }
size_t Usage::operator()(const platform::CUDAPinnedPlace &cuda_pinned) const { size_t Usage::operator()(const platform::CUDAPinnedPlace &cuda_pinned) const {
#ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
return Used(cuda_pinned); return Used(cuda_pinned);
#else #else
PADDLE_THROW(platform::errors::PermissionDenied( PADDLE_THROW(platform::errors::PermissionDenied(
......
...@@ -41,7 +41,7 @@ TEST(NaiveBestFitAllocatorTest, CpuAlloc) { ...@@ -41,7 +41,7 @@ TEST(NaiveBestFitAllocatorTest, CpuAlloc) {
alloc.Release(platform::CPUPlace()); alloc.Release(platform::CPUPlace());
} }
#ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
TEST(NaiveBestFitAllocatorTest, GpuAlloc) { TEST(NaiveBestFitAllocatorTest, GpuAlloc) {
NaiveBestFitAllocator alloc{platform::CUDAPlace(0)}; NaiveBestFitAllocator alloc{platform::CUDAPlace(0)};
{ {
......
...@@ -19,12 +19,20 @@ namespace memory { ...@@ -19,12 +19,20 @@ namespace memory {
namespace allocation { namespace allocation {
bool CPUPinnedAllocator::IsAllocThreadSafe() const { return true; } bool CPUPinnedAllocator::IsAllocThreadSafe() const { return true; }
void CPUPinnedAllocator::FreeImpl(Allocation *allocation) { void CPUPinnedAllocator::FreeImpl(Allocation *allocation) {
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS(hipHostFree(allocation->ptr()));
#else
PADDLE_ENFORCE_CUDA_SUCCESS(cudaFreeHost(allocation->ptr())); PADDLE_ENFORCE_CUDA_SUCCESS(cudaFreeHost(allocation->ptr()));
#endif
delete allocation; delete allocation;
} }
Allocation *CPUPinnedAllocator::AllocateImpl(size_t size) { Allocation *CPUPinnedAllocator::AllocateImpl(size_t size) {
void *ptr; void *ptr;
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS(hipHostMalloc(&ptr, size, hipHostMallocPortable));
#else
PADDLE_ENFORCE_CUDA_SUCCESS(cudaHostAlloc(&ptr, size, cudaHostAllocPortable)); PADDLE_ENFORCE_CUDA_SUCCESS(cudaHostAlloc(&ptr, size, cudaHostAllocPortable));
#endif
return new Allocation(ptr, size, platform::CUDAPinnedPlace()); return new Allocation(ptr, size, platform::CUDAPinnedPlace());
} }
} // namespace allocation } // namespace allocation
......
...@@ -26,7 +26,7 @@ ...@@ -26,7 +26,7 @@
#include "paddle/fluid/memory/allocation/best_fit_allocator.h" #include "paddle/fluid/memory/allocation/best_fit_allocator.h"
#include "paddle/fluid/memory/allocation/cpu_allocator.h" #include "paddle/fluid/memory/allocation/cpu_allocator.h"
#include "paddle/fluid/memory/allocation/locked_allocator.h" #include "paddle/fluid/memory/allocation/locked_allocator.h"
#ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/fluid/memory/allocation/cuda_allocator.h" #include "paddle/fluid/memory/allocation/cuda_allocator.h"
#endif #endif
...@@ -127,7 +127,7 @@ TEST(RetryAllocator, RetryAllocatorLastAllocFailure) { ...@@ -127,7 +127,7 @@ TEST(RetryAllocator, RetryAllocatorLastAllocFailure) {
} }
} }
#ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
{ {
platform::CUDAPlace p(0); platform::CUDAPlace p(0);
RetryAllocator allocator(std::make_shared<CUDAAllocator>(p), retry_ms); RetryAllocator allocator(std::make_shared<CUDAAllocator>(p), retry_ms);
......
...@@ -2,11 +2,13 @@ include(ExternalProject) ...@@ -2,11 +2,13 @@ include(ExternalProject)
cc_library(memory_block SRCS memory_block.cc memory_block_desc.cc meta_cache.cc DEPS place) cc_library(memory_block SRCS memory_block.cc memory_block_desc.cc meta_cache.cc DEPS place)
if(${WITH_GPU}) if(WITH_GPU)
nv_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info gpu_info place) nv_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info gpu_info place)
else(${WITH_GPU}) elseif(WITH_ROCM)
hip_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info gpu_info place)
else()
cc_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info place) cc_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info place)
endif(${WITH_GPU}) endif()
cc_test(system_allocator_test SRCS system_allocator_test.cc DEPS system_allocator) cc_test(system_allocator_test SRCS system_allocator_test.cc DEPS system_allocator)
......
...@@ -18,7 +18,7 @@ limitations under the License. */ ...@@ -18,7 +18,7 @@ limitations under the License. */
#include "gflags/gflags.h" #include "gflags/gflags.h"
#include "glog/logging.h" #include "glog/logging.h"
#ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
DECLARE_uint64(reallocate_gpu_memory_in_mb); DECLARE_uint64(reallocate_gpu_memory_in_mb);
#endif #endif
...@@ -220,7 +220,7 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool( ...@@ -220,7 +220,7 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool(
size_t allocate_bytes = max_chunk_size_; size_t allocate_bytes = max_chunk_size_;
size_t index = 0; size_t index = 0;
#ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (system_allocator_->UseGpu()) { if (system_allocator_->UseGpu()) {
if ((total_used_ + total_free_) == 0) { if ((total_used_ + total_free_) == 0) {
// Compute the allocation size for gpu for the first allocation. // Compute the allocation size for gpu for the first allocation.
......
...@@ -23,7 +23,7 @@ limitations under the License. */ ...@@ -23,7 +23,7 @@ limitations under the License. */
#include "gtest/gtest.h" #include "gtest/gtest.h"
#include "paddle/fluid/platform/gpu_info.h" #include "paddle/fluid/platform/gpu_info.h"
#ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include <fstream> #include <fstream>
#include <string> #include <string>
...@@ -76,7 +76,7 @@ int* TestBuddyAllocator(BuddyAllocator* allocator, size_t size_bytes, ...@@ -76,7 +76,7 @@ int* TestBuddyAllocator(BuddyAllocator* allocator, size_t size_bytes,
return nullptr; return nullptr;
} }
#ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
TEST(BuddyAllocator, GpuFraction) { TEST(BuddyAllocator, GpuFraction) {
// In a 16 GB machine, the pool size will be about 160 MB // In a 16 GB machine, the pool size will be about 160 MB
FLAGS_fraction_of_gpu_memory_to_use = 0.01; FLAGS_fraction_of_gpu_memory_to_use = 0.01;
...@@ -195,8 +195,13 @@ TEST(BuddyAllocator, AllocFromAvailable) { ...@@ -195,8 +195,13 @@ TEST(BuddyAllocator, AllocFromAvailable) {
// Take half of available GPU // Take half of available GPU
void* p; void* p;
#ifdef PADDLE_WITH_HIP
hipError_t result = hipMalloc(&p, available >> 1);
EXPECT_TRUE(result == hipSuccess);
#else
cudaError_t result = cudaMalloc(&p, available >> 1); cudaError_t result = cudaMalloc(&p, available >> 1);
EXPECT_TRUE(result == cudaSuccess); EXPECT_TRUE(result == cudaSuccess);
#endif
// BuddyAllocator should be able to alloc the remaining GPU // BuddyAllocator should be able to alloc the remaining GPU
BuddyAllocator buddy_allocator( BuddyAllocator buddy_allocator(
...@@ -209,7 +214,11 @@ TEST(BuddyAllocator, AllocFromAvailable) { ...@@ -209,7 +214,11 @@ TEST(BuddyAllocator, AllocFromAvailable) {
TestBuddyAllocator(&buddy_allocator, static_cast<size_t>(1 << 30)); TestBuddyAllocator(&buddy_allocator, static_cast<size_t>(1 << 30));
if (p) { if (p) {
#ifdef PADDLE_WITH_HIP
EXPECT_TRUE(hipFree(p) == hipSuccess);
#else
EXPECT_TRUE(cudaFree(p) == cudaSuccess); EXPECT_TRUE(cudaFree(p) == cudaSuccess);
#endif
} }
} }
...@@ -219,7 +228,12 @@ TEST(BuddyAllocator, AllocFromAvailableWhenFractionIsOne) { ...@@ -219,7 +228,12 @@ TEST(BuddyAllocator, AllocFromAvailableWhenFractionIsOne) {
FLAGS_reallocate_gpu_memory_in_mb = 0; FLAGS_reallocate_gpu_memory_in_mb = 0;
void* p = nullptr; void* p = nullptr;
#ifdef PADDLE_WITH_HIP
EXPECT_TRUE(hipMalloc(&p, static_cast<size_t>(1) << 30) == hipSuccess);
#else
EXPECT_TRUE(cudaMalloc(&p, static_cast<size_t>(1) << 30) == cudaSuccess); EXPECT_TRUE(cudaMalloc(&p, static_cast<size_t>(1) << 30) == cudaSuccess);
#endif
// BuddyAllocator should be able to alloc the remaining GPU // BuddyAllocator should be able to alloc the remaining GPU
BuddyAllocator buddy_allocator( BuddyAllocator buddy_allocator(
...@@ -230,7 +244,11 @@ TEST(BuddyAllocator, AllocFromAvailableWhenFractionIsOne) { ...@@ -230,7 +244,11 @@ TEST(BuddyAllocator, AllocFromAvailableWhenFractionIsOne) {
TestBuddyAllocator(&buddy_allocator, static_cast<size_t>(1) << 30); TestBuddyAllocator(&buddy_allocator, static_cast<size_t>(1) << 30);
if (p) { if (p) {
#ifdef PADDLE_WITH_HIP
EXPECT_TRUE(hipFree(p) == hipSuccess);
#else
EXPECT_TRUE(cudaFree(p) == cudaSuccess); EXPECT_TRUE(cudaFree(p) == cudaSuccess);
#endif
} }
} }
......
...@@ -35,7 +35,7 @@ limitations under the License. */ ...@@ -35,7 +35,7 @@ limitations under the License. */
#include "paddle/fluid/platform/cpu_info.h" #include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/gpu_info.h" #include "paddle/fluid/platform/gpu_info.h"
#ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/fluid/platform/cuda_device_guard.h" #include "paddle/fluid/platform/cuda_device_guard.h"
#endif #endif
...@@ -111,7 +111,7 @@ void CPUAllocator::Free(void* p, size_t size, size_t index) { ...@@ -111,7 +111,7 @@ void CPUAllocator::Free(void* p, size_t size, size_t index) {
bool CPUAllocator::UseGpu() const { return false; } bool CPUAllocator::UseGpu() const { return false; }
#ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
void* GPUAllocator::Alloc(size_t* index, size_t size) { void* GPUAllocator::Alloc(size_t* index, size_t size) {
// CUDA documentation doesn't explain if cudaMalloc returns nullptr // CUDA documentation doesn't explain if cudaMalloc returns nullptr
...@@ -121,7 +121,7 @@ void* GPUAllocator::Alloc(size_t* index, size_t size) { ...@@ -121,7 +121,7 @@ void* GPUAllocator::Alloc(size_t* index, size_t size) {
void* p; void* p;
auto result = platform::RecordedCudaMalloc(&p, size, gpu_id_); auto result = platform::RecordedCudaMalloc(&p, size, gpu_id_);
if (result == cudaSuccess) { if (result == gpuSuccess) {
*index = 0; *index = 0;
gpu_alloc_size_ += size; gpu_alloc_size_ += size;
return p; return p;
...@@ -193,10 +193,14 @@ void* CUDAPinnedAllocator::Alloc(size_t* index, size_t size) { ...@@ -193,10 +193,14 @@ void* CUDAPinnedAllocator::Alloc(size_t* index, size_t size) {
} }
void* p; void* p;
// PINNED memory is visible to all CUDA contexts. // PINNED memory is visible to all CUDA contexts.
#ifdef PADDLE_WITH_HIP
hipError_t result = hipHostMalloc(&p, size);
#else
cudaError_t result = cudaHostAlloc(&p, size, cudaHostAllocPortable); cudaError_t result = cudaHostAlloc(&p, size, cudaHostAllocPortable);
#endif
if (result == cudaSuccess) { if (result == gpuSuccess) {
*index = 1; // PINNED memory *index = 1; // PINNED memory
cuda_pinnd_alloc_size_ += size; cuda_pinnd_alloc_size_ += size;
return p; return p;
...@@ -209,7 +213,7 @@ void* CUDAPinnedAllocator::Alloc(size_t* index, size_t size) { ...@@ -209,7 +213,7 @@ void* CUDAPinnedAllocator::Alloc(size_t* index, size_t size) {
} }
void CUDAPinnedAllocator::Free(void* p, size_t size, size_t index) { void CUDAPinnedAllocator::Free(void* p, size_t size, size_t index) {
cudaError_t err; gpuError_t err;
PADDLE_ENFORCE_EQ(index, 1, platform::errors::InvalidArgument( PADDLE_ENFORCE_EQ(index, 1, platform::errors::InvalidArgument(
"The index should be 1, but got %d", index)); "The index should be 1, but got %d", index));
...@@ -219,6 +223,15 @@ void CUDAPinnedAllocator::Free(void* p, size_t size, size_t index) { ...@@ -219,6 +223,15 @@ void CUDAPinnedAllocator::Free(void* p, size_t size, size_t index) {
"allocated cuda pinned memory (%d)", "allocated cuda pinned memory (%d)",
size, cuda_pinnd_alloc_size_)); size, cuda_pinnd_alloc_size_));
cuda_pinnd_alloc_size_ -= size; cuda_pinnd_alloc_size_ -= size;
#ifdef PADDLE_WITH_HIP
err = hipHostFree(p);
if (err != hipErrorDeinitialized) {
PADDLE_ENFORCE_EQ(
err, hipSuccess,
platform::errors::Fatal(
"hipFreeHost failed in GPUPinnedAllocator, error code is %d", err));
}
#else
err = cudaFreeHost(p); err = cudaFreeHost(p);
// Purposefully allow cudaErrorCudartUnloading, because // Purposefully allow cudaErrorCudartUnloading, because
...@@ -233,6 +246,7 @@ void CUDAPinnedAllocator::Free(void* p, size_t size, size_t index) { ...@@ -233,6 +246,7 @@ void CUDAPinnedAllocator::Free(void* p, size_t size, size_t index) {
"cudaFreeHost failed in GPUPinnedAllocator, error code is %d", "cudaFreeHost failed in GPUPinnedAllocator, error code is %d",
err)); err));
} }
#endif
} }
bool CUDAPinnedAllocator::UseGpu() const { return false; } bool CUDAPinnedAllocator::UseGpu() const { return false; }
......
...@@ -41,7 +41,7 @@ class CPUAllocator : public SystemAllocator { ...@@ -41,7 +41,7 @@ class CPUAllocator : public SystemAllocator {
virtual bool UseGpu() const; virtual bool UseGpu() const;
}; };
#ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
class GPUAllocator : public SystemAllocator { class GPUAllocator : public SystemAllocator {
public: public:
explicit GPUAllocator(int gpu_id) : gpu_id_(gpu_id) {} explicit GPUAllocator(int gpu_id) : gpu_id_(gpu_id) {}
......
...@@ -56,7 +56,7 @@ TEST(CPUAllocator, LockMem) { ...@@ -56,7 +56,7 @@ TEST(CPUAllocator, LockMem) {
TestAllocator(&a, 0); TestAllocator(&a, 0);
} }
#ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
TEST(GPUAllocator, Alloc) { TEST(GPUAllocator, Alloc) {
paddle::memory::detail::GPUAllocator a(0); paddle::memory::detail::GPUAllocator a(0);
TestAllocator(&a, 2048); TestAllocator(&a, 2048);
...@@ -77,7 +77,11 @@ TEST(GPUAllocator, AllocFailure) { ...@@ -77,7 +77,11 @@ TEST(GPUAllocator, AllocFailure) {
allocator.Alloc(&index, alloc_size); allocator.Alloc(&index, alloc_size);
ASSERT_TRUE(false); ASSERT_TRUE(false);
} catch (paddle::memory::allocation::BadAlloc&) { } catch (paddle::memory::allocation::BadAlloc&) {
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS(hipGetLastError());
#else
PADDLE_ENFORCE_CUDA_SUCCESS(cudaGetLastError()); PADDLE_ENFORCE_CUDA_SUCCESS(cudaGetLastError());
#endif
} }
} }
#endif #endif
...@@ -222,7 +222,7 @@ inline void SyncCUDAStream() { ...@@ -222,7 +222,7 @@ inline void SyncCUDAStream() {
template <> template <>
void Copy<platform::CPUPlace, platform::CUDAPlace>( void Copy<platform::CPUPlace, platform::CUDAPlace>(
platform::CPUPlace dst_place, void* dst, platform::CUDAPlace src_place, platform::CPUPlace dst_place, void* dst, platform::CUDAPlace src_place,
const void* src, size_t num, cudaStream_t stream) { const void* src, size_t num, gpuStream_t stream) {
if (UNLIKELY(num == 0)) return; if (UNLIKELY(num == 0)) return;
platform::SetDeviceId(src_place.device); platform::SetDeviceId(src_place.device);
...@@ -244,7 +244,7 @@ void Copy<platform::CPUPlace, platform::CUDAPlace>( ...@@ -244,7 +244,7 @@ void Copy<platform::CPUPlace, platform::CUDAPlace>(
template <> template <>
void Copy<platform::CUDAPlace, platform::CPUPlace>( void Copy<platform::CUDAPlace, platform::CPUPlace>(
platform::CUDAPlace dst_place, void* dst, platform::CPUPlace src_place, platform::CUDAPlace dst_place, void* dst, platform::CPUPlace src_place,
const void* src, size_t num, cudaStream_t stream) { const void* src, size_t num, gpuStream_t stream) {
if (UNLIKELY(num == 0)) return; if (UNLIKELY(num == 0)) return;
platform::SetDeviceId(dst_place.device); platform::SetDeviceId(dst_place.device);
...@@ -266,7 +266,7 @@ void Copy<platform::CUDAPlace, platform::CPUPlace>( ...@@ -266,7 +266,7 @@ void Copy<platform::CUDAPlace, platform::CPUPlace>(
template <> template <>
void Copy<platform::CUDAPlace, platform::CUDAPlace>( void Copy<platform::CUDAPlace, platform::CUDAPlace>(
platform::CUDAPlace dst_place, void* dst, platform::CUDAPlace src_place, platform::CUDAPlace dst_place, void* dst, platform::CUDAPlace src_place,
const void* src, size_t num, cudaStream_t stream) { const void* src, size_t num, gpuStream_t stream) {
if (UNLIKELY(num == 0)) return; if (UNLIKELY(num == 0)) return;
VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
...@@ -327,7 +327,7 @@ template <> ...@@ -327,7 +327,7 @@ template <>
void Copy<platform::CUDAPinnedPlace, platform::CUDAPlace>( void Copy<platform::CUDAPinnedPlace, platform::CUDAPlace>(
platform::CUDAPinnedPlace dst_place, void* dst, platform::CUDAPinnedPlace dst_place, void* dst,
platform::CUDAPlace src_place, const void* src, size_t num, platform::CUDAPlace src_place, const void* src, size_t num,
cudaStream_t stream) { gpuStream_t stream) {
if (UNLIKELY(num == 0)) return; if (UNLIKELY(num == 0)) return;
platform::SetDeviceId(src_place.device); platform::SetDeviceId(src_place.device);
VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
...@@ -345,7 +345,7 @@ template <> ...@@ -345,7 +345,7 @@ template <>
void Copy<platform::CUDAPlace, platform::CUDAPinnedPlace>( void Copy<platform::CUDAPlace, platform::CUDAPinnedPlace>(
platform::CUDAPlace dst_place, void* dst, platform::CUDAPlace dst_place, void* dst,
platform::CUDAPinnedPlace src_place, const void* src, size_t num, platform::CUDAPinnedPlace src_place, const void* src, size_t num,
cudaStream_t stream) { gpuStream_t stream) {
if (UNLIKELY(num == 0)) return; if (UNLIKELY(num == 0)) return;
platform::SetDeviceId(dst_place.device); platform::SetDeviceId(dst_place.device);
......
...@@ -33,7 +33,7 @@ namespace memory { ...@@ -33,7 +33,7 @@ namespace memory {
template <typename DstPlace, typename SrcPlace> template <typename DstPlace, typename SrcPlace>
void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num); void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num);
#ifdef PADDLE_WITH_CUDA #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
/** /**
* \brief Copy memory from one place to another place. * \brief Copy memory from one place to another place.
...@@ -51,7 +51,7 @@ void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num); ...@@ -51,7 +51,7 @@ void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num);
*/ */
template <typename DstPlace, typename SrcPlace> template <typename DstPlace, typename SrcPlace>
void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num, void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num,
cudaStream_t stream); gpuStream_t stream);
#endif #endif
} // namespace memory } // namespace memory
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册