// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifdef PADDLE_WITH_CUDA
#include <cuda_runtime.h>
#endif

#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
#endif

#include "gtest/gtest.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/place.h"

namespace paddle {
namespace memory {

// Writes 1 to every step-th element of data (n is the total element count).
__global__ void write_kernel(int* data, uint64_t n, uint64_t step) {
  int thread_num = gridDim.x * blockDim.x;
  int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
  for (uint64_t i = thread_id; i * step < n; i += thread_num) {
    *(data + i * step) = 1;
  }
}

// Atomically accumulates every step-th element of data into *sum.
__global__ void sum_kernel(int* data, uint64_t n, uint64_t step, int* sum) {
  int thread_num = gridDim.x * blockDim.x;
  int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
  for (uint64_t i = thread_id; i * step < n; i += thread_num) {
    atomicAdd(sum, *(data + i * step));
  }
}

TEST(ManagedMemoryTest, H2DTest) {
  if (!platform::IsGPUManagedMemorySupported(0)) {
    return;
  }

  uint64_t n_data = 1024;
  uint64_t step = 1;
  allocation::AllocationPtr allocation =
      Alloc(platform::CUDAPlace(0), n_data * sizeof(int));
  int* data = static_cast<int*>(allocation->ptr());

  memset(data, 0, n_data * sizeof(int));          // located on host memory
  write_kernel<<<1, 1024>>>(data, n_data, step);  // trans to device memory

#ifdef PADDLE_WITH_CUDA
  PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize());
#else
  PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize());
#endif

  int sum = 0;
  for (uint64_t i = 0; i < n_data; ++i) {
    sum += *(data + i);
  }

  EXPECT_EQ(sum, n_data / step);
  allocation = nullptr;
}

TEST(ManagedMemoryTest, D2HTest) {
  if (!platform::IsGPUManagedMemorySupported(0)) {
    return;
  }

  uint64_t n_data = 1024;
  uint64_t step = 1;
  AllocationPtr allocation =
      Alloc(platform::CUDAPlace(0), n_data * sizeof(int));
  int* data = static_cast<int*>(allocation->ptr());

  write_kernel<<<1, 1024>>>(data, n_data, step);  // located on device memory

#ifdef PADDLE_WITH_CUDA
  PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize());
#else
  PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize());
#endif

  memset(data, 0, n_data * sizeof(int));  // trans to host memory

  int sum = 0;
  for (uint64_t i = 0; i < n_data; ++i) {
    sum += *(data + i);
  }
  EXPECT_EQ(sum, 0);
}
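
// Note: the reading below is interpretive; the test itself does not state it.
// The next test allocates roughly twice the currently free device memory,
// which can only succeed when managed memory is allowed to oversubscribe the
// GPU by paging data between host and device. The stride of 1024 ints
// (4 KiB, a typical page size) presumably touches about one element per page,
// so the kernels sweep the whole oversubscribed allocation cheaply.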
TEST(ManagedMemoryTest, OversubscribeGPUMemoryTest) {
  if (!platform::IsGPUManagedMemoryOversubscriptionSupported(0)) {
    return;
  }

  uint64_t available_mem = platform::GpuAvailableMemToAlloc();
  uint64_t n_data = available_mem * 2 / sizeof(int) +
                    1;  // requires more than 2 * available_mem bytes
  uint64_t step = 1024;
  AllocationPtr data_allocation =
      Alloc(platform::CUDAPlace(0), n_data * sizeof(int));
  AllocationPtr sum_allocation = Alloc(platform::CUDAPlace(0), sizeof(int));
  int* data = static_cast<int*>(data_allocation->ptr());
  int* sum = static_cast<int*>(sum_allocation->ptr());
  (*sum) = 0;

  write_kernel<<<5120, 1024>>>(data, n_data, step);
  sum_kernel<<<5120, 1024>>>(data, n_data, step, sum);

#ifdef PADDLE_WITH_CUDA
  PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize());
#else
  PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize());
#endif

  EXPECT_EQ(*sum, (n_data + step - 1) / step);
}

TEST(ManagedMemoryTest, OOMExceptionTest) {
  EXPECT_THROW(Alloc(platform::CUDAPlace(0), size_t(1) << 60),
               memory::allocation::BadAlloc);
}

}  // namespace memory
}  // namespace paddle