stream_safe_cuda_allocator.cc
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h"
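
// This allocator defers the actual release of GPU memory until every stream
// that used an allocation has finished with it. Each allocation keeps one
// CUDA/HIP event per recorded stream, and memory is handed back to the
// underlying allocator only after all of those events have completed.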

namespace paddle {
namespace memory {
namespace allocation {

StreamSafeCUDAAllocation::StreamSafeCUDAAllocation(
    AllocationPtr underlying_allocation, gpuStream_t owning_stream)
    : Allocation(underlying_allocation->ptr(),
                 underlying_allocation->base_ptr(),
                 underlying_allocation->size(), underlying_allocation->place()),
      underlying_allocation_(std::move(underlying_allocation)),
      owning_stream_(std::move(owning_stream)) {}

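// Registers `stream` as a user of this allocation by recording an event on
// it. Recording the owning stream is skipped: reuse of the memory on the
// stream it was allocated for is already ordered by that stream, so only
// cross-stream use needs an event.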
void StreamSafeCUDAAllocation::RecordStream(const gpuStream_t& stream) {
  VLOG(8) << "Try to record stream " << stream << " for address " << ptr();
  if (stream == owning_stream_) {
    VLOG(9) << "Stream " << stream << " is the owning stream, skip";
    return;
  }

  std::lock_guard<SpinLock> lock_guard(outstanding_event_map_lock_);
  gpuEvent_t record_event;
  auto it = outstanding_event_map_.find(stream);
  if (it == outstanding_event_map_.end()) {
    gpuEvent_t new_event;
#ifdef PADDLE_WITH_CUDA
    PADDLE_ENFORCE_GPU_SUCCESS(
        cudaEventCreateWithFlags(&new_event, cudaEventDisableTiming));
#else
    PADDLE_ENFORCE_GPU_SUCCESS(
        hipEventCreateWithFlags(&new_event, hipEventDisableTiming));
#endif
    outstanding_event_map_[stream] = new_event;
    record_event = new_event;
    VLOG(9) << "Create a new event " << new_event;
  } else {
    record_event = it->second;
    VLOG(9) << "Reuse event " << record_event;
  }

#ifdef PADDLE_WITH_CUDA
  PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(record_event, stream));
#else
  PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(record_event, stream));
#endif
  VLOG(8) << "Record event " << record_event << " to stream " << stream;
}

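// Returns true once every recorded event has completed, destroying events as
// they are confirmed. On the first still-pending event it erases the
// already-confirmed entries and returns false, so the next call resumes from
// the pending event.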
bool StreamSafeCUDAAllocation::CanBeFreed() {
  // NOTE(Ruibiao): This function will not execute concurrently,
  // so outstanding_event_map_lock_ is not required here
  for (auto it = outstanding_event_map_.begin();
       it != outstanding_event_map_.end(); ++it) {
    gpuEvent_t& event = it->second;
#ifdef PADDLE_WITH_CUDA
    gpuError_t err = cudaEventQuery(event);
    if (err == cudaErrorNotReady) {
      VLOG(9) << "Event " << event << " for " << ptr() << " is not completed";
      // Erase the completed events before "it"
      outstanding_event_map_.erase(outstanding_event_map_.begin(), it);
      return false;
    }
    PADDLE_ENFORCE_GPU_SUCCESS(err);
    PADDLE_ENFORCE_GPU_SUCCESS(cudaEventDestroy(event));
#else
    gpuError_t err = hipEventQuery(event);
    if (err == hipErrorNotReady) {
      VLOG(9) << "Event " << event << " for " << ptr() << " is not completed";
      // Erase the completed events before "it"
      outstanding_event_map_.erase(outstanding_event_map_.begin(), it);
      return false;
    }
    PADDLE_ENFORCE_GPU_SUCCESS(err);
    PADDLE_ENFORCE_GPU_SUCCESS(hipEventDestroy(event));
#endif
    VLOG(8) << "Destroy event " << event;
  }
  return true;
}

const gpuStream_t& StreamSafeCUDAAllocation::GetOwningStream() const {
  return owning_stream_;
}

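// Each allocator registers itself in a process-wide map keyed by place, so
// that ReleaseImpl can sweep the deferred frees of every allocator sharing
// the device when memory runs short.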
StreamSafeCUDAAllocator::StreamSafeCUDAAllocator(
    std::shared_ptr<Allocator> underlying_allocator, platform::CUDAPlace place,
    gpuStream_t default_stream)
    : underlying_allocator_(std::move(underlying_allocator)),
      place_(std::move(place)),
      default_stream_(std::move(default_stream)) {
  std::lock_guard<SpinLock> lock_guard(allocator_map_lock_);
  allocator_map_[place].emplace_back(this);
}

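// Unregisters this allocator from the process-wide map.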
StreamSafeCUDAAllocator::~StreamSafeCUDAAllocator() {
  std::lock_guard<SpinLock> lock_guard(allocator_map_lock_);
  std::vector<StreamSafeCUDAAllocator*>& allocators = allocator_map_[place_];
  allocators.erase(std::remove(allocators.begin(), allocators.end(), this),
                   allocators.end());
}

bool StreamSafeCUDAAllocator::IsAllocThreadSafe() const { return true; }

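// Allocation strategy: first reap allocations whose free was deferred, then
// try the underlying allocator. On BadAlloc, release idle memory from all
// allocators on this place and retry once; a second failure, or any other
// exception, propagates to the caller.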
Allocation* StreamSafeCUDAAllocator::AllocateImpl(size_t size) {
  ProcessUnfreedAllocations();
  VLOG(8) << "Try to allocate " << size << " bytes";
  AllocationPtr underlying_allocation;
  try {
    underlying_allocation = underlying_allocator_->Allocate(size);
  } catch (BadAlloc&) {
    VLOG(4) << "Allocation failed when allocating " << size << " bytes";
    ReleaseImpl(place_);
    try {
      underlying_allocation = underlying_allocator_->Allocate(size);
    } catch (...) {
      VLOG(3) << "Allocation still failed after releasing memory from "
                 "all streams";
      throw;
    }
  } catch (...) {
    throw;
  }
  StreamSafeCUDAAllocation* allocation = new StreamSafeCUDAAllocation(
      std::move(underlying_allocation), default_stream_);
  VLOG(8) << "Allocate " << allocation->size() << " bytes at address "
          << allocation->ptr();
  return allocation;
}

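// Frees the allocation immediately when no recorded event is still pending;
// otherwise parks it in unfreed_allocations_, to be reaped by a later
// allocation or an explicit Release.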
void StreamSafeCUDAAllocator::FreeImpl(Allocation* allocation) {
  StreamSafeCUDAAllocation* stream_safe_cuda_allocation =
      dynamic_cast<StreamSafeCUDAAllocation*>(allocation);
  PADDLE_ENFORCE_NOT_NULL(stream_safe_cuda_allocation,
                          platform::errors::InvalidArgument(
                              "Failed to dynamic cast %p from Allocation* to "
                              "StreamSafeCUDAAllocation*",
                              allocation));
  VLOG(8) << "Try free allocation " << stream_safe_cuda_allocation->ptr();
  std::lock_guard<SpinLock> lock_guard(unfreed_allocation_lock_);
  if (stream_safe_cuda_allocation->CanBeFreed()) {
    VLOG(9) << "Directly delete allocation";
    delete stream_safe_cuda_allocation;
  } else {
    VLOG(9) << "Put into the unfreed_allocations_ list";
    unfreed_allocations_.emplace_back(stream_safe_cuda_allocation);
  }
}

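// Releases idle memory from every allocator registered for `place`, not just
// this one, and returns the total number of bytes freed.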
uint64_t StreamSafeCUDAAllocator::ReleaseImpl(const platform::Place& place) {
  std::lock_guard<SpinLock> lock_guard(allocator_map_lock_);
  std::vector<StreamSafeCUDAAllocator*>& allocators =
      allocator_map_[BOOST_GET_CONST(platform::CUDAPlace, place)];
  uint64_t released_size = 0;
  for (StreamSafeCUDAAllocator* allocator : allocators) {
    released_size += allocator->ProcessUnfreedAllocationsWithRelease();
  }
  VLOG(8) << "Release " << released_size << " bytes memory from all streams";
  return released_size;
}

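// Deletes every deferred allocation whose outstanding events have all
// completed; the rest stay in the list for a later sweep.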
void StreamSafeCUDAAllocator::ProcessUnfreedAllocations() {
  std::lock_guard<SpinLock> lock_guard(unfreed_allocation_lock_);
  for (auto it = unfreed_allocations_.begin();
       it != unfreed_allocations_.end();) {
    if ((*it)->CanBeFreed()) {
      delete *it;
      it = unfreed_allocations_.erase(it);
    } else {
      ++it;
    }
  }
}

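// Reaps the deferred allocations, then asks the underlying allocator to
// release its idle memory, returning the number of bytes freed.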
uint64_t StreamSafeCUDAAllocator::ProcessUnfreedAllocationsWithRelease() {
  ProcessUnfreedAllocations();
  return underlying_allocator_->Release(place_);
}

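// Definitions of the process-wide allocator registry shared by all
// StreamSafeCUDAAllocator instances.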
std::map<platform::CUDAPlace, std::vector<StreamSafeCUDAAllocator*>>
    StreamSafeCUDAAllocator::allocator_map_;
SpinLock StreamSafeCUDAAllocator::allocator_map_lock_;

}  // namespace allocation
}  // namespace memory
}  // namespace paddle
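
// Usage sketch (illustrative only; the caller-side code below is an
// assumption, not part of this file): when a kernel launched on stream s2
// reads memory allocated for stream s1, record s2 on the allocation before
// dropping it, so the free is deferred until the kernel finishes.
//
//   AllocationPtr allocation = allocator->Allocate(size);
//   // the allocator's default stream, s1, owns the memory
//   kernel<<<grid, block, 0, s2>>>(static_cast<float*>(allocation->ptr()));
//   dynamic_cast<StreamSafeCUDAAllocation*>(allocation.get())
//       ->RecordStream(s2);
//   allocation.reset();  // free deferred until the event on s2 completes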