// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #ifdef PADDLE_WITH_CUDA #include #include #endif #ifdef PADDLE_WITH_HIP #include #endif #include // NOLINT #include #include "gtest/gtest.h" #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/platform/gpu_info.h" namespace paddle { namespace memory { __global__ void add_kernel(int *x, int n) { int tid = threadIdx.x + blockIdx.x * blockDim.x; for (int i = tid; i < n; i += blockDim.x * gridDim.x) { atomicAdd(x + i, tid); } } class StreamSafeCUDAAllocTest : public ::testing::Test { protected: void SetUp() override { place_ = platform::CUDAPlace(); stream_num_ = 64; grid_num_ = 1; block_num_ = 64; data_num_ = 64; default_stream = nullptr; streams_.reserve(stream_num_); streams_.emplace_back(default_stream); for (size_t i = 1; i < stream_num_; ++i) { gpuStream_t stream; #ifdef PADDLE_WITH_CUDA PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&stream)); #else PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamCreate(&stream)); #endif streams_.emplace_back(stream); } for (size_t i = 0; i < stream_num_; ++i) { size_t allocation_size = data_num_ * sizeof(int); std::shared_ptr allocation = AllocShared(place_, allocation_size, streams_[i]); #ifdef PADDLE_WITH_CUDA PADDLE_ENFORCE_CUDA_SUCCESS( cudaMemset(allocation->ptr(), 0, allocation->size())); #else PADDLE_ENFORCE_CUDA_SUCCESS( hipMemset(allocation->ptr(), 0, allocation->size())); #endif allocations_.emplace_back(allocation); } } void SingleStreamRun(size_t idx) { for (size_t i = 0; i < stream_num_; ++i) { int *x = reinterpret_cast(allocations_[i]->ptr()); add_kernel<<>>(x, data_num_); if (i != idx) { RecordStream(allocations_[i].get(), streams_[idx]); } } } void MultiStreamRun() { for (int i = 0; i < stream_num_; ++i) { SingleStreamRun(i); } allocations_.clear(); // fast_gc } void MultiThreadMUltiStreamRun() { std::vector threads; for (size_t i = 0; i < stream_num_; ++i) { threads.push_back( std::thread(&StreamSafeCUDAAllocTest::SingleStreamRun, this, i)); } for (size_t i = 0; i < stream_num_; ++i) { threads[i].join(); } allocations_.clear(); // fast_gc } void CheckResult() { auto host_x = std::unique_ptr(new int[data_num_]); size_t thread_num = grid_num_ * block_num_; for (int i = 0; i < stream_num_; ++i) { // tricky code, the allocations are still accessible even though // allocations_.clear() has been called #ifdef PADDLE_WITH_CUDA PADDLE_ENFORCE_CUDA_SUCCESS( cudaMemcpy(host_x.get(), allocations_[i]->ptr(), data_num_ * sizeof(int), cudaMemcpyDeviceToHost)); #else PADDLE_ENFORCE_CUDA_SUCCESS( hipMemcpy(host_x.get(), allocations_[i]->ptr(), data_num_ * sizeof(int), hipMemcpyDeviceToHost)); #endif for (int j = 0; j < data_num_; ++j) { EXPECT_TRUE(host_x[j] == (j % thread_num) * stream_num_); } } } void TearDown() override { #ifdef PADDLE_WITH_CUDA PADDLE_ENFORCE_CUDA_SUCCESS(cudaDeviceSynchronize()); #else PADDLE_ENFORCE_CUDA_SUCCESS(hipDeviceSynchronize()); #endif for (gpuStream_t stream : streams_) { Release(place_, stream); } for (size_t i = 1; i < stream_num_; ++i) { #ifdef PADDLE_WITH_CUDA PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamDestroy(streams_[i])); #else PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamDestroy(streams_[i])); #endif } uint64_t cuda_malloc_size = platform::RecordedCudaMallocSize(place_.GetDeviceId()); ASSERT_EQ(cuda_malloc_size, 0) << "Found " << cuda_malloc_size << " bytes memory that not released yet," << " there may be a memory leak problem"; } size_t stream_num_; size_t grid_num_; size_t block_num_; size_t data_num_; platform::CUDAPlace place_; gpuStream_t default_stream; std::vector streams_; std::vector> allocations_; }; TEST_F(StreamSafeCUDAAllocTest, CUDAMutilStreamTest) { MultiStreamRun(); CheckResult(); } TEST_F(StreamSafeCUDAAllocTest, CUDAMutilThreadMutilStreamTest) { MultiThreadMUltiStreamRun(); CheckResult(); } TEST(StreamSafeCUDAAllocInterfaceTest, AllocInterfaceTest) { platform::CUDAPlace place = platform::CUDAPlace(); size_t alloc_size = 256; std::shared_ptr allocation_implicit_stream = AllocShared(place, alloc_size); EXPECT_GE(allocation_implicit_stream->size(), alloc_size); void *address = allocation_implicit_stream->ptr(); allocation_implicit_stream.reset(); gpuStream_t default_stream = nullptr; allocation::AllocationPtr allocation_unique = Alloc(place, alloc_size, default_stream); EXPECT_GE(allocation_unique->size(), alloc_size); EXPECT_EQ(allocation_unique->ptr(), address); } TEST(StreamSafeCUDAAllocRetryTest, RetryTest) { platform::CUDAPlace place = platform::CUDAPlace(); gpuStream_t stream1, stream2; #ifdef PADDLE_WITH_CUDA PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&stream1)); PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&stream2)); #else PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamCreate(&stream1)); PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamCreate(&stream2)); #endif size_t available_size = platform::GpuAvailableMemToAlloc(); // alloc_size < available_size < 2 * alloc_size size_t alloc_size = available_size / 4 * 3; std::shared_ptr allocation1 = AllocShared(place, alloc_size, stream1); std::shared_ptr allocation2; std::thread th([&allocation2, &place, &stream2, alloc_size]() { std::this_thread::sleep_for(std::chrono::seconds(1)); allocation2 = AllocShared(place, alloc_size, stream2); }); allocation1.reset(); // free but not release th.join(); EXPECT_GE(allocation2->size(), alloc_size); allocation2.reset(); #ifdef PADDLE_WITH_CUDA PADDLE_ENFORCE_CUDA_SUCCESS(cudaDeviceSynchronize()); #else PADDLE_ENFORCE_CUDA_SUCCESS(hipDeviceSynchronize()); #endif Release(place, stream1); Release(place, stream2); } } // namespace memory } // namespace paddle