Unverified · Commit 69875dc4 authored by Qi Li, committed by GitHub

[ROCM] update fluid memory for rocm35 (part1), test=develop (#30758)

Parent 5b1ab51c
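This commit is part 1 of porting paddle/fluid/memory to ROCm 3.5. One pattern repeats throughout the diff below: each CUDA-only guard (the #ifdef PADDLE_WITH_CUDA line of a pair) is replaced by the #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) line shown immediately after it; CUDA runtime types and constants (cudaError_t, cudaStream_t, cudaEvent_t, cudaSuccess) give way to vendor-neutral gpu* aliases; and calls with no one-to-one HIP equivalent are forked behind #ifdef PADDLE_WITH_HIP. The gpu* aliases are defined elsewhere in Paddle and are not part of this diff; the following is only an illustrative sketch of what they resolve to, assuming nothing beyond the two build macros:

#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
using gpuError_t = hipError_t;
using gpuStream_t = hipStream_t;
using gpuEvent_t = hipEvent_t;
#define gpuSuccess hipSuccess
#else
#include <cuda_runtime.h>
using gpuError_t = cudaError_t;
using gpuStream_t = cudaStream_t;
using gpuEvent_t = cudaEvent_t;
#define gpuSuccess cudaSuccess
#endif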
@@ -19,6 +19,12 @@ if (WITH_GPU)
DEPS device_context malloc)
endif()
if (WITH_ROCM)
hip_test(malloc_test
SRCS malloc_test.cu
DEPS device_context malloc)
endif()
#if (WITH_GPU)
# nv_test(pinned_memory_test SRCS pinned_memory_test.cu DEPS place memory)
#endif()
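The new WITH_ROCM branch mirrors the WITH_GPU one above it: hip_test builds the same malloc_test.cu source with the HIP toolchain instead of NVCC, against the same device_context and malloc dependencies.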
@@ -31,7 +31,7 @@
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/place.h"
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/fluid/memory/allocation/cuda_allocator.h"
#include "paddle/fluid/memory/allocation/pinned_allocator.h"
#include "paddle/fluid/memory/allocation/thread_local_allocator.h"
@@ -70,7 +70,7 @@ class AllocatorFacadePrivate {
InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id));
}
#endif
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount();
++dev_id) {
InitNaiveBestFitCUDAAllocator(platform::CUDAPlace(dev_id));
@@ -87,7 +87,7 @@ class AllocatorFacadePrivate {
InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id));
}
#endif
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount();
++dev_id) {
InitAutoGrowthCUDAAllocator(platform::CUDAPlace(dev_id));
@@ -104,7 +104,7 @@ class AllocatorFacadePrivate {
InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id));
}
#endif
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount();
++dev_id) {
InitThreadLocalCUDAAllocator(platform::CUDAPlace(dev_id));
@@ -152,7 +152,7 @@ class AllocatorFacadePrivate {
system_allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p);
}
#endif
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
system_allocators_[platform::CUDAPinnedPlace()] =
std::make_shared<CPUPinnedAllocator>();
int device_count = platform::GetCUDADeviceCount();
@@ -168,7 +168,7 @@ class AllocatorFacadePrivate {
std::make_shared<NaiveBestFitAllocator>(platform::CPUPlace());
}
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
void InitNaiveBestFitCUDAPinnedAllocator() {
allocators_[platform::CUDAPinnedPlace()] =
std::make_shared<NaiveBestFitAllocator>(platform::CUDAPinnedPlace());
@@ -215,7 +215,7 @@ class AllocatorFacadePrivate {
void InitZeroSizeAllocators() {
std::vector<platform::Place> places;
places.emplace_back(platform::CPUPlace());
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
int device_count = platform::GetCUDADeviceCount();
for (int dev_id = 0; dev_id < device_count; ++dev_id) {
places.emplace_back(platform::CUDAPlace(dev_id));
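Note that ROCm builds intentionally keep the CUDA-named abstractions (CUDAPlace, GetCUDADeviceCount, InitNaiveBestFitCUDAAllocator, and so on): HIP mirrors the CUDA runtime API closely enough that only the guards, types, and a handful of non-portable calls have to change.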
@@ -16,7 +16,7 @@
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
DECLARE_double(fraction_of_gpu_memory_to_use);
DECLARE_double(fraction_of_cuda_pinned_memory_to_use);
DECLARE_uint64(initial_gpu_memory_in_mb);
@@ -45,7 +45,7 @@ void AllocateTestCases() {
ASSERT_EQ(cpu_allocation->size(), size);
}
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
{
place = platform::CUDAPlace(0);
size = 1024;
@@ -81,7 +81,7 @@ void AllocateTestCases() {
}
TEST(Allocator, SpecifyGpuMemory) {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
// Set to 0.0 to test FLAGS_initial_gpu_memory_in_mb and
// FLAGS_reallocate_gpu_memory_in_mb
FLAGS_fraction_of_gpu_memory_to_use = 0.0;
@@ -16,7 +16,7 @@
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
DECLARE_double(fraction_of_gpu_memory_to_use);
DECLARE_double(fraction_of_cuda_pinned_memory_to_use);
DECLARE_uint64(initial_gpu_memory_in_mb);
@@ -45,7 +45,7 @@ void AllocateTestCases() {
ASSERT_EQ(cpu_allocation->size(), size);
}
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
{
place = platform::CUDAPlace(0);
size = 1024;
@@ -81,7 +81,7 @@ void AllocateTestCases() {
}
TEST(Allocator, Allocator) {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
FLAGS_fraction_of_gpu_memory_to_use = 0.01;
FLAGS_gpu_allocator_retry_time = 500;
FLAGS_fraction_of_cuda_pinned_memory_to_use = 0.5;
@@ -22,7 +22,7 @@
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/fluid/platform/gpu_info.h"
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
DECLARE_double(fraction_of_gpu_memory_to_use);
DECLARE_double(fraction_of_cuda_pinned_memory_to_use);
DECLARE_int64(gpu_allocator_retry_time);
@@ -40,7 +40,7 @@ static inline size_t AlignTo(size_t size, size_t alignment) {
}
TEST(allocator, allocator) {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
FLAGS_fraction_of_gpu_memory_to_use = 0.01;
FLAGS_gpu_allocator_retry_time = 500;
FLAGS_fraction_of_cuda_pinned_memory_to_use = 0.5;
@@ -62,7 +62,7 @@ TEST(allocator, allocator) {
ASSERT_EQ(cpu_allocation->size(), AlignedSize(size, 1024));
}
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
{
place = platform::CUDAPlace(0);
size = 1024;
@@ -101,7 +101,7 @@ TEST(allocator, allocator) {
TEST(multithread_allocate, test_segfault) {
FLAGS_allocator_strategy = "auto_growth";
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
std::mutex mtx;
std::condition_variable cv;
bool flag = false;
@@ -13,8 +13,16 @@
// limitations under the License.
#include "paddle/fluid/memory/allocation/cuda_allocator.h"
#ifdef PADDLE_WITH_CUDA
#include <cuda.h>
#include <cuda_runtime.h>
#endif
#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
#endif
#include <string>
#include "paddle/fluid/platform/cuda_device_guard.h"
#include "paddle/fluid/platform/enforce.h"
@@ -39,7 +47,7 @@ Allocation* CUDAAllocator::AllocateImpl(size_t size) {
void* ptr;
auto result = platform::RecordedCudaMalloc(&ptr, size, place_.device);
if (LIKELY(result == cudaSuccess)) {
if (LIKELY(result == gpuSuccess)) {
return new Allocation(ptr, size, platform::Place(place_));
}
@@ -14,8 +14,6 @@
#pragma once
#include <cuda_runtime.h>
#include <map>
#include <memory>
#include <utility>
@@ -79,17 +77,26 @@ class CUDADeviceContextAllocation : public Allocation {
class CUDADeviceContextAllocator : public Allocator {
public:
explicit CUDADeviceContextAllocator(platform::CUDAPlace place,
cudaStream_t default_stream)
gpuStream_t default_stream)
: place_(place), default_stream_(default_stream) {
platform::CUDADeviceGuard guard(place_.device);
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS(
hipEventCreateWithFlags(&event_, hipEventDisableTiming));
#else
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaEventCreate(&event_, cudaEventDisableTiming));
#endif
}
~CUDADeviceContextAllocator() {
if (event_) {
platform::CUDADeviceGuard guard(place_.device);
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS(hipEventDestroy(event_));
#else
PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(event_));
#endif
}
}
@@ -102,10 +109,15 @@ class CUDADeviceContextAllocator : public Allocator {
platform::CUDADeviceGuard guard(place_.device);
auto allocation =
new CUDADeviceContextAllocation(memory::Alloc(place_, size));
// Wait for the event on stream
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS(hipEventRecord(event_, default_stream_));
PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamWaitEvent(default_stream_, event_, 0));
#else
PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(event_, default_stream_));
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaStreamWaitEvent(default_stream_, event_, 0));
#endif
return allocation;
}
@@ -113,8 +125,8 @@
private:
platform::CUDAPlace place_;
cudaEvent_t event_{nullptr};
cudaStream_t default_stream_{nullptr};
gpuEvent_t event_{nullptr};
gpuStream_t default_stream_{nullptr};
};
/**
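The allocator's members switch to the neutral gpuEvent_t/gpuStream_t aliases, but event creation still needs a fork: the CUDA C++ runtime accepts a flags argument through an overload of cudaEventCreate, while HIP only exposes hipEventCreateWithFlags. A portable sketch of creating the timing-disabled event used above (assuming the gpu* aliases sketched near the top of this page):

gpuEvent_t event = nullptr;
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS(
    hipEventCreateWithFlags(&event, hipEventDisableTiming));
#else
// cudaEventCreateWithFlags is the explicit, non-overloaded spelling
PADDLE_ENFORCE_CUDA_SUCCESS(
    cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
#endif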
@@ -26,7 +26,7 @@
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/string/printf.h"
#include "paddle/fluid/string/split.h"
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/fluid/platform/cuda_device_guard.h"
#endif
#ifdef PADDLE_WITH_XPU
@@ -216,7 +216,7 @@ size_t Used<platform::XPUPlace>(const platform::XPUPlace &place) {
#endif
}
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
class GPUBuddyAllocatorList {
private:
GPUBuddyAllocatorList() : devices_(platform::GetSelectedDevices()) {
@@ -283,7 +283,7 @@ BuddyAllocator *GetGPUBuddyAllocator(int gpu_id) {
template <>
size_t Used<platform::CUDAPlace>(const platform::CUDAPlace &place) {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
return GetGPUBuddyAllocator(place.device)->Used();
#else
PADDLE_THROW(platform::errors::PermissionDenied(
@@ -294,7 +294,7 @@ size_t Used<platform::CUDAPlace>(const platform::CUDAPlace &place) {
template <>
void *Alloc<platform::CUDAPlace>(const platform::CUDAPlace &place,
size_t size) {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
auto *buddy_allocator = GetGPUBuddyAllocator(place.device);
auto *ptr = buddy_allocator->Alloc(size);
if (ptr == nullptr) {
@@ -311,7 +311,11 @@ void *Alloc<platform::CUDAPlace>(const platform::CUDAPlace &place,
string::HumanReadableSize(Used<platform::CUDAPlace>(place))));
} else {
if (FLAGS_init_allocated_mem) {
#ifdef PADDLE_WITH_HIP
hipMemset(ptr, 0xEF, size);
#else
cudaMemset(ptr, 0xEF, size);
#endif
}
}
return ptr;
@@ -324,7 +328,7 @@ void *Alloc<platform::CUDAPlace>(const platform::CUDAPlace &place,
template <>
void Free<platform::CUDAPlace>(const platform::CUDAPlace &place, void *p,
size_t size) {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
GetGPUBuddyAllocator(place.device)->Free(p);
#else
PADDLE_THROW(platform::errors::PermissionDenied(
@@ -334,7 +338,7 @@ void Free<platform::CUDAPlace>(const platform::CUDAPlace &place, void *p,
template <>
uint64_t Release<platform::CUDAPlace>(const platform::CUDAPlace &place) {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
return GetGPUBuddyAllocator(place.device)->Release();
#else
PADDLE_THROW(platform::errors::PermissionDenied(
@@ -342,7 +346,7 @@ uint64_t Release<platform::CUDAPlace>(const platform::CUDAPlace &place) {
#endif
}
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
BuddyAllocator *GetCUDAPinnedBuddyAllocator() {
static std::once_flag init_flag;
static BuddyAllocator *ba = nullptr;
@@ -360,7 +364,7 @@ BuddyAllocator *GetCUDAPinnedBuddyAllocator() {
template <>
size_t Used<platform::CUDAPinnedPlace>(const platform::CUDAPinnedPlace &place) {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
return GetCUDAPinnedBuddyAllocator()->Used();
#else
PADDLE_THROW(platform::errors::PermissionDenied(
@@ -371,7 +375,7 @@ size_t Used<platform::CUDAPinnedPlace>(const platform::CUDAPinnedPlace &place) {
template <>
void *Alloc<platform::CUDAPinnedPlace>(const platform::CUDAPinnedPlace &place,
size_t size) {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
auto *buddy_allocator = GetCUDAPinnedBuddyAllocator();
void *ptr = buddy_allocator->Alloc(size);
@@ -392,7 +396,7 @@ void *Alloc<platform::CUDAPinnedPlace>(const platform::CUDAPinnedPlace &place,
template <>
void Free<platform::CUDAPinnedPlace>(const platform::CUDAPinnedPlace &place,
void *p, size_t size) {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
GetCUDAPinnedBuddyAllocator()->Free(p);
#else
PADDLE_THROW(platform::errors::PermissionDenied(
......@@ -403,7 +407,7 @@ void Free<platform::CUDAPinnedPlace>(const platform::CUDAPinnedPlace &place,
template <>
uint64_t Release<platform::CUDAPinnedPlace>(
const platform::CUDAPinnedPlace &place) {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
return GetCUDAPinnedBuddyAllocator()->Release();
#else
PADDLE_THROW(platform::errors::PermissionDenied(
@@ -449,7 +453,7 @@ size_t Usage::operator()(const platform::CPUPlace &cpu) const {
}
size_t Usage::operator()(const platform::CUDAPlace &gpu) const {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
return Used(gpu);
#else
PADDLE_THROW(platform::errors::PermissionDenied(
@@ -458,7 +462,7 @@ size_t Usage::operator()(const platform::CUDAPlace &gpu) const {
}
size_t Usage::operator()(const platform::CUDAPinnedPlace &cuda_pinned) const {
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
return Used(cuda_pinned);
#else
PADDLE_THROW(platform::errors::PermissionDenied(
@@ -41,7 +41,7 @@ TEST(NaiveBestFitAllocatorTest, CpuAlloc) {
alloc.Release(platform::CPUPlace());
}
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
TEST(NaiveBestFitAllocatorTest, GpuAlloc) {
NaiveBestFitAllocator alloc{platform::CUDAPlace(0)};
{
@@ -19,12 +19,20 @@ namespace memory {
namespace allocation {
bool CPUPinnedAllocator::IsAllocThreadSafe() const { return true; }
void CPUPinnedAllocator::FreeImpl(Allocation *allocation) {
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS(hipHostFree(allocation->ptr()));
#else
PADDLE_ENFORCE_CUDA_SUCCESS(cudaFreeHost(allocation->ptr()));
#endif
delete allocation;
}
Allocation *CPUPinnedAllocator::AllocateImpl(size_t size) {
void *ptr;
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS(hipHostMalloc(&ptr, size, hipHostMallocPortable));
#else
PADDLE_ENFORCE_CUDA_SUCCESS(cudaHostAlloc(&ptr, size, cudaHostAllocPortable));
#endif
return new Allocation(ptr, size, platform::CUDAPinnedPlace());
}
} // namespace allocation
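The pinned-host mapping is the one spot where HIP reshuffles CUDA's naming: cudaHostAlloc(&p, size, cudaHostAllocPortable) becomes hipHostMalloc(&p, size, hipHostMallocPortable), and cudaFreeHost(p) becomes hipHostFree(p). HIP's older hipMallocHost/hipFreeHost spellings are deprecated, which is why the free call here is hipHostFree.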
@@ -26,7 +26,7 @@
#include "paddle/fluid/memory/allocation/best_fit_allocator.h"
#include "paddle/fluid/memory/allocation/cpu_allocator.h"
#include "paddle/fluid/memory/allocation/locked_allocator.h"
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/fluid/memory/allocation/cuda_allocator.h"
#endif
@@ -127,7 +127,7 @@ TEST(RetryAllocator, RetryAllocatorLastAllocFailure) {
}
}
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
{
platform::CUDAPlace p(0);
RetryAllocator allocator(std::make_shared<CUDAAllocator>(p), retry_ms);
@@ -2,11 +2,13 @@ include(ExternalProject)
cc_library(memory_block SRCS memory_block.cc memory_block_desc.cc meta_cache.cc DEPS place)
if(${WITH_GPU})
if(WITH_GPU)
nv_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info gpu_info place)
else(${WITH_GPU})
elseif(WITH_ROCM)
hip_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info gpu_info place)
else()
cc_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info place)
endif(${WITH_GPU})
endif()
cc_test(system_allocator_test SRCS system_allocator_test.cc DEPS system_allocator)
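Besides the new elseif(WITH_ROCM) branch that builds system_allocator with hip_library, the conditions change from if(${WITH_GPU}) to if(WITH_GPU): expanding the variable inside if() makes CMake evaluate it twice and fail outright when the variable is empty, so the unquoted form is the safer idiom.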
@@ -18,7 +18,7 @@ limitations under the License. */
#include "gflags/gflags.h"
#include "glog/logging.h"
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
DECLARE_uint64(reallocate_gpu_memory_in_mb);
#endif
@@ -220,7 +220,7 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool(
size_t allocate_bytes = max_chunk_size_;
size_t index = 0;
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (system_allocator_->UseGpu()) {
if ((total_used_ + total_free_) == 0) {
// Compute the allocation size for gpu for the first allocation.
@@ -23,7 +23,7 @@ limitations under the License. */
#include "gtest/gtest.h"
#include "paddle/fluid/platform/gpu_info.h"
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include <fstream>
#include <string>
@@ -76,7 +76,7 @@ int* TestBuddyAllocator(BuddyAllocator* allocator, size_t size_bytes,
return nullptr;
}
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
TEST(BuddyAllocator, GpuFraction) {
// In a 16 GB machine, the pool size will be about 160 MB
FLAGS_fraction_of_gpu_memory_to_use = 0.01;
@@ -195,8 +195,13 @@ TEST(BuddyAllocator, AllocFromAvailable) {
// Take half of available GPU
void* p;
#ifdef PADDLE_WITH_HIP
hipError_t result = hipMalloc(&p, available >> 1);
EXPECT_TRUE(result == hipSuccess);
#else
cudaError_t result = cudaMalloc(&p, available >> 1);
EXPECT_TRUE(result == cudaSuccess);
#endif
// BuddyAllocator should be able to alloc the remaining GPU
BuddyAllocator buddy_allocator(
@@ -209,7 +214,11 @@ TEST(BuddyAllocator, AllocFromAvailable) {
TestBuddyAllocator(&buddy_allocator, static_cast<size_t>(1 << 30));
if (p) {
#ifdef PADDLE_WITH_HIP
EXPECT_TRUE(hipFree(p) == hipSuccess);
#else
EXPECT_TRUE(cudaFree(p) == cudaSuccess);
#endif
}
}
@@ -219,7 +228,12 @@ TEST(BuddyAllocator, AllocFromAvailableWhenFractionIsOne) {
FLAGS_reallocate_gpu_memory_in_mb = 0;
void* p = nullptr;
#ifdef PADDLE_WITH_HIP
EXPECT_TRUE(hipMalloc(&p, static_cast<size_t>(1) << 30) == hipSuccess);
#else
EXPECT_TRUE(cudaMalloc(&p, static_cast<size_t>(1) << 30) == cudaSuccess);
#endif
// BuddyAllocator should be able to alloc the remaining GPU
BuddyAllocator buddy_allocator(
@@ -230,7 +244,11 @@ TEST(BuddyAllocator, AllocFromAvailableWhenFractionIsOne) {
TestBuddyAllocator(&buddy_allocator, static_cast<size_t>(1) << 30);
if (p) {
#ifdef PADDLE_WITH_HIP
EXPECT_TRUE(hipFree(p) == hipSuccess);
#else
EXPECT_TRUE(cudaFree(p) == cudaSuccess);
#endif
}
}
@@ -35,7 +35,7 @@ limitations under the License. */
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/gpu_info.h"
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/fluid/platform/cuda_device_guard.h"
#endif
@@ -111,7 +111,7 @@ void CPUAllocator::Free(void* p, size_t size, size_t index) {
bool CPUAllocator::UseGpu() const { return false; }
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
void* GPUAllocator::Alloc(size_t* index, size_t size) {
// CUDA documentation doesn't explain if cudaMalloc returns nullptr
@@ -121,7 +121,7 @@ void* GPUAllocator::Alloc(size_t* index, size_t size) {
void* p;
auto result = platform::RecordedCudaMalloc(&p, size, gpu_id_);
if (result == cudaSuccess) {
if (result == gpuSuccess) {
*index = 0;
gpu_alloc_size_ += size;
return p;
@@ -193,10 +193,14 @@ void* CUDAPinnedAllocator::Alloc(size_t* index, size_t size) {
}
void* p;
// PINNED memory is visible to all CUDA contexts.
#ifdef PADDLE_WITH_HIP
hipError_t result = hipHostMalloc(&p, size);
#else
cudaError_t result = cudaHostAlloc(&p, size, cudaHostAllocPortable);
#endif
if (result == cudaSuccess) {
if (result == gpuSuccess) {
*index = 1; // PINNED memory
cuda_pinnd_alloc_size_ += size;
return p;
@@ -209,7 +213,7 @@ void* CUDAPinnedAllocator::Alloc(size_t* index, size_t size) {
}
void CUDAPinnedAllocator::Free(void* p, size_t size, size_t index) {
cudaError_t err;
gpuError_t err;
PADDLE_ENFORCE_EQ(index, 1, platform::errors::InvalidArgument(
"The index should be 1, but got %d", index));
@@ -219,6 +223,15 @@ void CUDAPinnedAllocator::Free(void* p, size_t size, size_t index) {
"allocated cuda pinned memory (%d)",
size, cuda_pinnd_alloc_size_));
cuda_pinnd_alloc_size_ -= size;
#ifdef PADDLE_WITH_HIP
err = hipHostFree(p);
if (err != hipErrorDeinitialized) {
PADDLE_ENFORCE_EQ(
err, hipSuccess,
platform::errors::Fatal(
"hipFreeHost failed in GPUPinnedAllocator, error code is %d", err));
}
#else
err = cudaFreeHost(p);
// Purposefully allow cudaErrorCudartUnloading, because
@@ -233,6 +246,7 @@ void CUDAPinnedAllocator::Free(void* p, size_t size, size_t index) {
"cudaFreeHost failed in GPUPinnedAllocator, error code is %d",
err));
}
#endif
}
bool CUDAPinnedAllocator::UseGpu() const { return false; }
@@ -41,7 +41,7 @@ class CPUAllocator : public SystemAllocator {
virtual bool UseGpu() const;
};
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
class GPUAllocator : public SystemAllocator {
public:
explicit GPUAllocator(int gpu_id) : gpu_id_(gpu_id) {}
@@ -56,7 +56,7 @@ TEST(CPUAllocator, LockMem) {
TestAllocator(&a, 0);
}
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
TEST(GPUAllocator, Alloc) {
paddle::memory::detail::GPUAllocator a(0);
TestAllocator(&a, 2048);
@@ -77,7 +77,11 @@ TEST(GPUAllocator, AllocFailure) {
allocator.Alloc(&index, alloc_size);
ASSERT_TRUE(false);
} catch (paddle::memory::allocation::BadAlloc&) {
#ifdef PADDLE_WITH_HIP
PADDLE_ENFORCE_CUDA_SUCCESS(hipGetLastError());
#else
PADDLE_ENFORCE_CUDA_SUCCESS(cudaGetLastError());
#endif
}
}
#endif
......@@ -222,7 +222,7 @@ inline void SyncCUDAStream() {
template <>
void Copy<platform::CPUPlace, platform::CUDAPlace>(
platform::CPUPlace dst_place, void* dst, platform::CUDAPlace src_place,
const void* src, size_t num, cudaStream_t stream) {
const void* src, size_t num, gpuStream_t stream) {
if (UNLIKELY(num == 0)) return;
platform::SetDeviceId(src_place.device);
@@ -244,7 +244,7 @@ void Copy<platform::CPUPlace, platform::CUDAPlace>(
template <>
void Copy<platform::CUDAPlace, platform::CPUPlace>(
platform::CUDAPlace dst_place, void* dst, platform::CPUPlace src_place,
const void* src, size_t num, cudaStream_t stream) {
const void* src, size_t num, gpuStream_t stream) {
if (UNLIKELY(num == 0)) return;
platform::SetDeviceId(dst_place.device);
@@ -266,7 +266,7 @@ void Copy<platform::CUDAPlace, platform::CPUPlace>(
template <>
void Copy<platform::CUDAPlace, platform::CUDAPlace>(
platform::CUDAPlace dst_place, void* dst, platform::CUDAPlace src_place,
const void* src, size_t num, cudaStream_t stream) {
const void* src, size_t num, gpuStream_t stream) {
if (UNLIKELY(num == 0)) return;
VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
@@ -327,7 +327,7 @@ template <>
void Copy<platform::CUDAPinnedPlace, platform::CUDAPlace>(
platform::CUDAPinnedPlace dst_place, void* dst,
platform::CUDAPlace src_place, const void* src, size_t num,
cudaStream_t stream) {
gpuStream_t stream) {
if (UNLIKELY(num == 0)) return;
platform::SetDeviceId(src_place.device);
VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
@@ -345,7 +345,7 @@ template <>
void Copy<platform::CUDAPlace, platform::CUDAPinnedPlace>(
platform::CUDAPlace dst_place, void* dst,
platform::CUDAPinnedPlace src_place, const void* src, size_t num,
cudaStream_t stream) {
gpuStream_t stream) {
if (UNLIKELY(num == 0)) return;
platform::SetDeviceId(dst_place.device);
......
@@ -33,7 +33,7 @@ namespace memory {
template <typename DstPlace, typename SrcPlace>
void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num);
#ifdef PADDLE_WITH_CUDA
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
/**
* \brief Copy memory from one place to another place.
@@ -51,7 +51,7 @@ void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num);
*/
template <typename DstPlace, typename SrcPlace>
void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num,
cudaStream_t stream);
gpuStream_t stream);
#endif
} // namespace memory
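A hedged usage sketch (not part of the diff) of the stream-aware Copy declared above; dst, src, num_bytes, and stream are assumed to come from the caller, and gpuStream_t resolves to hipStream_t on a ROCm build and cudaStream_t on a CUDA build:

// asynchronous host-to-device copy on the caller's stream
paddle::platform::CUDAPlace gpu_place(0);
paddle::platform::CPUPlace cpu_place;
paddle::memory::Copy(gpu_place, dst, cpu_place, src, num_bytes, stream);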