diff --git a/paddle/memory/detail/system_allocator.cc b/paddle/memory/detail/system_allocator.cc index 332ff062d47846505f91153d67cbaf2a6cdd7292..2b0fbfa87e22ad5697b1062d1d7b80f5eb251399 100644 --- a/paddle/memory/detail/system_allocator.cc +++ b/paddle/memory/detail/system_allocator.cc @@ -39,22 +39,22 @@ void* CPUAllocator::Alloc(size_t& index, size_t size) { // pointer shall not be dereferenced -- so we make it nullptr. if (size <= 0) return nullptr; - if (FLAGS_use_pinned_memory) { - void* p = malloc(size); - if (p != nullptr) { - mlock(p, size); - } - } + index = 0; // unlock memory void* p = malloc(size); - if (p != nullptr && FLAGS_use_pinned_memory) { - mlock(p, size); + + if (p != nullptr) { + if (FLAGS_use_pinned_memory) { + index = 1; + mlock(p, size); // lock memory + } } + return p; } void CPUAllocator::Free(void* p, size_t size, size_t index) { - if (p != nullptr && FLAGS_use_pinned_memory) { + if (p != nullptr && index == 1) { munlock(p, size); } free(p); @@ -73,26 +73,34 @@ void* GPUAllocator::Alloc(size_t& index, size_t size) { // Reserve memory for page tables, etc. size_t reserving = capacity - paddle::platform::GpuMaxAllocSize(); - size_t remaining = available > reserving ? available - reserving : 0; + size_t usable = available > reserving ? available - reserving : 0; // If remaining size no less than expected size, using general // cudaMalloc to allocate GPU memory. void* p = 0; - if (size <= remaining) { + if (size <= usable) { cudaError_t result = cudaMalloc(&p, size); if (result == cudaSuccess) { index = 0; - total_alloc_size_ += size; + gpu_alloc_size_ += size; return p; } } // If remaining size less than expected size or cudaMalloc failed, // cudaMallocHost will be considered as a fallback allocator. + // + // NOTE: here, we use GpuMaxAllocSize() as the maximum memory size + // of host fallback allocation. Allocates too much would reduce + // the amount of memory available to the underlying system for paging. + usable = paddle::platform::GpuMaxAllocSize() - fallback_alloc_size_; + + if (size > usable) return nullptr; + cudaError_t result = cudaMallocHost(&p, size); if (result == cudaSuccess) { index = 1; - total_alloc_size_ += size; + fallback_alloc_size_ += size; return p; } @@ -100,16 +108,26 @@ void* GPUAllocator::Alloc(size_t& index, size_t size) { } void GPUAllocator::Free(void* p, size_t size, size_t index) { + cudaError_t err; + + if (index == 0) { + PADDLE_ASSERT(gpu_alloc_size_ >= size); + gpu_alloc_size_ -= size; + err = cudaFree(p); + } else { + PADDLE_ASSERT(fallback_alloc_size_ >= size); + fallback_alloc_size_ -= size; + err = cudaFreeHost(p); + } + // Purposefully allow cudaErrorCudartUnloading, because // that is returned if you ever call cudaFree after the // driver has already shutdown. This happens only if the // process is terminating, in which case we don't care if // cudaFree succeeds. - PADDLE_ASSERT(total_alloc_size_ >= size); - total_alloc_size_ -= size; - cudaError_t err = index == 1 ? cudaFreeHost(p) : cudaFree(p); if (err != cudaErrorCudartUnloading) { - platform::throw_on_error(err, "cudaFree{Host} failed"); + platform::throw_on_error(err, + "cudaFree{Host} failed in GPUAllocator::Free."); } } diff --git a/paddle/memory/detail/system_allocator.h b/paddle/memory/detail/system_allocator.h index e15302ce4f0ae106c2beb0d07dfc911b8ad00187..7093c42967162c6737e1fe68176cf3993a18ba09 100644 --- a/paddle/memory/detail/system_allocator.h +++ b/paddle/memory/detail/system_allocator.h @@ -47,7 +47,8 @@ class GPUAllocator : public SystemAllocator { virtual void Free(void* p, size_t size, size_t index); private: - size_t total_alloc_size_ = 0; + size_t gpu_alloc_size_ = 0; + size_t fallback_alloc_size_ = 0; }; #endif // PADDLE_ONLY_CPU diff --git a/python/paddle/trainer_config_helpers/networks.py b/python/paddle/trainer_config_helpers/networks.py index 67154a8d7d366bd983b4426da87e0b33307fced4..1bf59ed4840ae69afc5bce49c86a08b60e9603ee 100755 --- a/python/paddle/trainer_config_helpers/networks.py +++ b/python/paddle/trainer_config_helpers/networks.py @@ -1381,7 +1381,7 @@ def inputs(layers, *args): if len(args) != 0: layers.extend(args) - Inputs(* [l.name for l in layers]) + Inputs(*[l.name for l in layers]) def outputs(layers, *args): @@ -1424,7 +1424,7 @@ def outputs(layers, *args): assert len(layers) > 0 if HasInputsSet(): # input already set - Outputs(* [l.name for l in layers]) + Outputs(*[l.name for l in layers]) return # just return outputs. if len(layers) != 1: