Unverified commit 37f76407, authored by Zeng Jinle, committed by GitHub

fix cuda dev_ctx allocator cmake deps, test=develop (#19953)

Parent: ebff68fa
paddle/fluid/memory/CMakeLists.txt
@@ -17,8 +17,6 @@ cc_library(memory
memcpy)
if (WITH_GPU)
add_dependencies(malloc cuda_device_context_allocator_pool)
target_link_libraries(malloc cuda_device_context_allocator_pool)
nv_test(malloc_test
SRCS malloc_test.cu
DEPS device_context malloc)
...

paddle/fluid/memory/allocation/CMakeLists.txt
@@ -14,12 +14,6 @@ endif()
if (WITH_GPU)
nv_library(cuda_allocator SRCS cuda_allocator.cc DEPS allocator cuda_device_guard)
nv_library(cuda_device_context_allocation SRCS cuda_device_context_allocation.cc
DEPS allocator enforce place ${MKLDNN_CTX_DEPS})
nv_library(cuda_device_context_allocator SRCS cuda_device_context_allocator.cc
DEPS allocator enforce place cuda_device_context_allocation ${MKLDNN_CTX_DEPS})
nv_library(cuda_device_context_allocator_pool SRCS cuda_device_context_allocator_pool.cc
DEPS allocator enforce place cuda_device_context_allocation cuda_device_context_allocator ${MKLDNN_CTX_DEPS})
endif()
cc_library(retry_allocator SRCS retry_allocator.cc DEPS allocator)
...

paddle/fluid/memory/allocation/cuda_device_context_allocation.cc (deleted)
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/memory/allocation/cuda_device_context_allocation.h"
#include <utility>
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace memory {
namespace allocation {
CUDADeviceContextAllocation::CUDADeviceContextAllocation(
AllocationPtr allocation)
: Allocation(allocation->ptr(), allocation->size(), allocation->place()),
underlying_allocation_(std::move(allocation)) {}
CUDADeviceContextAllocation::~CUDADeviceContextAllocation() {
PADDLE_ENFORCE_NOT_NULL(
dev_ctx_, "Didn't set device context for CUDADeviceContextAllocation");
auto *p_allocation = underlying_allocation_.release();
VLOG(4) << "Adding callback to delete CUDADeviceContextAllocation at "
<< p_allocation;
dev_ctx_->AddStreamCallback([p_allocation] {
VLOG(4) << "Delete CUDADeviceContextAllocation at " << p_allocation;
AllocationDeleter()(p_allocation);
});
}
void CUDADeviceContextAllocation::SetCUDADeviceContext(
const platform::CUDADeviceContext *dev_ctx) {
dev_ctx_ = dev_ctx;
}
} // namespace allocation
} // namespace memory
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/memory/allocation/allocator.h"
#include "paddle/fluid/platform/device_context.h"
namespace paddle {
namespace memory {
namespace allocation {
/**
 * CUDADeviceContextAllocation is a wrapper around an underlying allocation.
 * It registers a CUDA stream callback for the underlying allocation so that
 * the allocation can be used on a CUDA stream and is deleted in the callback
 * only after the stream's previously queued work has finished.
*/
class CUDADeviceContextAllocation : public Allocation {
public:
explicit CUDADeviceContextAllocation(AllocationPtr allocation);
~CUDADeviceContextAllocation();
void SetCUDADeviceContext(const platform::CUDADeviceContext *dev_ctx);
private:
AllocationPtr underlying_allocation_;
const platform::CUDADeviceContext *dev_ctx_{nullptr};
};
} // namespace allocation
} // namespace memory
} // namespace paddle
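
For context, the deferred-free idiom this wrapper implements can be shown standalone. The sketch below is illustrative, not Paddle code: it assumes CUDA 10+ for cudaLaunchHostFunc, and the helper name FreeWhenStreamDone is made up. CUDA forbids API calls inside host callbacks, which is why Paddle's AddStreamCallback defers to a thread pool; the sketch defers to a detached thread for the same reason.

#include <cuda_runtime.h>

#include <chrono>
#include <cstdio>
#include <thread>

// Host callback: runs once all work queued on the stream ahead of it has
// finished. CUDA API calls are not allowed inside the callback itself, so
// the actual cudaFree is handed to another thread (Paddle's stream callback
// manager works the same way, via a thread pool).
void CUDART_CB FreeWhenStreamDone(void *ptr) {
  std::thread([ptr] {
    cudaFree(ptr);
    std::printf("freed %p after the stream drained\n", ptr);
  }).detach();
}

int main() {
  cudaStream_t stream;
  cudaStreamCreate(&stream);

  void *buf = nullptr;
  cudaMalloc(&buf, 1 << 20);

  // ... kernels using buf would be launched on `stream` here ...

  // Defer the free until everything queued so far has executed, mirroring
  // ~CUDADeviceContextAllocation() above.
  cudaLaunchHostFunc(stream, FreeWhenStreamDone, buf);

  cudaStreamSynchronize(stream);
  // Give the detached freeing thread a moment before the process exits.
  std::this_thread::sleep_for(std::chrono::milliseconds(100));
  cudaStreamDestroy(stream);
  return 0;
}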

paddle/fluid/memory/allocation/cuda_device_context_allocator.cc (deleted)

// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/memory/allocation/cuda_device_context_allocator.h"
#include "paddle/fluid/memory/allocation/cuda_device_context_allocation.h"
#include "paddle/fluid/platform/cuda_device_guard.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace memory {
namespace allocation {
CUDADeviceContextAllocator::CUDADeviceContextAllocator(
const platform::CUDAPlace place, cudaStream_t default_stream)
: place_(place), default_stream_(default_stream) {
platform::CUDADeviceGuard guard(place_.device);
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaEventCreate(&event_, cudaEventDisableTiming),
"Create event failed in CUDADeviceContextAllocator");
}
CUDADeviceContextAllocator::~CUDADeviceContextAllocator() {
if (event_) {
platform::CUDADeviceGuard guard(place_.device);
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaEventDestroy(event_),
"Destory event failed in CUDADeviceContextAllocator destroctor");
}
}
Allocation *CUDADeviceContextAllocator::AllocateImpl(size_t size) {
PADDLE_ENFORCE_NOT_NULL(
default_stream_,
"Didn't set default stream for CUDADeviceContextAllocator");
platform::CUDADeviceGuard guard(place_.device);
auto allocation =
new CUDADeviceContextAllocation(memory::Alloc(place_, size));
// Wait for the event on stream
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaEventRecord(event_, default_stream_),
"Failed to record event in CUDADeviceContextAllocator");
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaStreamWaitEvent(default_stream_, event_, 0),
"Failed to wait event in CUDADeviceContextAllocator");
return allocation;
}
void CUDADeviceContextAllocator::FreeImpl(Allocation *allocation) {
delete allocation;
}
} // namespace allocation
} // namespace memory
} // namespace paddle

paddle/fluid/memory/allocation/cuda_device_context_allocator.h
@@ -15,15 +15,58 @@
#pragma once
#include <cuda_runtime.h>
#include <map>
#include <memory>
#include <utility>
#include <vector>
#include "paddle/fluid/memory/allocation/allocator.h"
#include "paddle/fluid/platform/cuda_device_guard.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/place.h"
namespace paddle {
namespace platform {
class CUDADeviceContext;
} // namespace platform
namespace memory {
namespace allocation {
/**
 * CUDADeviceContextAllocation is a wrapper around an underlying allocation.
 * It registers a CUDA stream callback for the underlying allocation so that
 * the allocation can be used on a CUDA stream and is deleted in the callback
 * only after the stream's previously queued work has finished.
*/
class CUDADeviceContextAllocation : public Allocation {
public:
explicit CUDADeviceContextAllocation(AllocationPtr allocation)
: Allocation(allocation->ptr(), allocation->size(), allocation->place()),
underlying_allocation_(std::move(allocation)) {}
~CUDADeviceContextAllocation() {
PADDLE_ENFORCE_NOT_NULL(
dev_ctx_, "Didn't set device context for CUDADeviceContextAllocation");
auto *p_allocation = underlying_allocation_.release();
VLOG(4) << "Adding callback to delete CUDADeviceContextAllocation at "
<< p_allocation;
dev_ctx_->AddStreamCallback([p_allocation] {
VLOG(4) << "Delete CUDADeviceContextAllocation at " << p_allocation;
AllocationDeleter()(p_allocation);
});
}
void SetCUDADeviceContext(const platform::CUDADeviceContext *dev_ctx) {
dev_ctx_ = dev_ctx;
}
private:
AllocationPtr underlying_allocation_;
const platform::CUDADeviceContext *dev_ctx_{nullptr};
};
/**
* CUDADeviceContextAllocator will allocate a CUDADeviceContextAllocation
* after waiting for a self-created event on the default stream. It does so to
@@ -33,12 +76,42 @@ namespace allocation {
class CUDADeviceContextAllocator : public Allocator {
public:
explicit CUDADeviceContextAllocator(platform::CUDAPlace place,
cudaStream_t default_stream);
~CUDADeviceContextAllocator();
cudaStream_t default_stream)
: place_(place), default_stream_(default_stream) {
platform::CUDADeviceGuard guard(place_.device);
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaEventCreate(&event_, cudaEventDisableTiming),
"Create event failed in CUDADeviceContextAllocator");
}
~CUDADeviceContextAllocator() {
if (event_) {
platform::CUDADeviceGuard guard(place_.device);
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaEventDestroy(event_),
"Destory event failed in CUDADeviceContextAllocator destroctor");
}
}
protected:
Allocation *AllocateImpl(size_t size) override;
void FreeImpl(Allocation *allocation) override;
Allocation *AllocateImpl(size_t size) override {
PADDLE_ENFORCE_NOT_NULL(
default_stream_,
"Didn't set default stream for CUDADeviceContextAllocator");
platform::CUDADeviceGuard guard(place_.device);
auto allocation =
new CUDADeviceContextAllocation(memory::Alloc(place_, size));
// Wait for the event on stream
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaEventRecord(event_, default_stream_),
"Failed to record event in CUDADeviceContextAllocator");
PADDLE_ENFORCE_CUDA_SUCCESS(
cudaStreamWaitEvent(default_stream_, event_, 0),
"Failed to wait event in CUDADeviceContextAllocator");
return allocation;
}
void FreeImpl(Allocation *allocation) override { delete allocation; }
private:
platform::CUDAPlace place_;
@@ -46,6 +119,49 @@ class CUDADeviceContextAllocator : public Allocator {
cudaStream_t default_stream_{nullptr};
};
/**
 * CUDADeviceContextAllocatorPool is a singleton that stores a mapping from
 * CUDAPlace(s) to std::shared_ptr<CUDADeviceContextAllocator>. When a
 * CUDADeviceContext's compute stream is not the default stream, it can use
 * this class to allocate GPU memory, which is then released by a stream
 * callback after the queued work finishes.
*/
class CUDADeviceContextAllocatorPool {
public:
static CUDADeviceContextAllocatorPool &Instance() {
static CUDADeviceContextAllocatorPool pool;
return pool;
}
AllocationPtr Alloc(const platform::CUDADeviceContext &dev_ctx, size_t size) {
auto iter =
allocators_.find(boost::get<platform::CUDAPlace>(dev_ctx.GetPlace()));
PADDLE_ENFORCE_EQ(iter != allocators_.end(), true,
"CUDADeviceContextAllocatorPool initialization error");
auto &allocator = iter->second;
AllocationPtr allocation = allocator->Allocate(size);
static_cast<CUDADeviceContextAllocation *>(allocation.get())
->SetCUDADeviceContext(&dev_ctx);
return allocation;
}
private:
CUDADeviceContextAllocatorPool() {
std::vector<int> devices = platform::GetSelectedDevices();
for (int i : devices) {
auto place = platform::CUDAPlace(i);
auto compute_stream =
platform::DeviceContextPool::Instance().GetByPlace(place)->stream();
auto allocator = std::shared_ptr<CUDADeviceContextAllocator>(
new CUDADeviceContextAllocator(place, compute_stream));
allocators_.insert(make_pair(place, allocator));
}
}
std::map<platform::CUDAPlace, std::shared_ptr<CUDADeviceContextAllocator>>
allocators_;
};
} // namespace allocation
} // namespace memory
} // namespace paddle
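
The cudaEventRecord / cudaStreamWaitEvent pair in AllocateImpl above is the standard CUDA idiom for ordering work across streams. Below is a self-contained sketch of the idiom in its general two-stream form (stream names are illustrative; note that the allocator above records and waits on the same default stream):

#include <cuda_runtime.h>

#include <cstdio>

int main() {
  cudaStream_t stream_a, stream_b;
  cudaStreamCreate(&stream_a);
  cudaStreamCreate(&stream_b);

  // Timing is disabled, as in CUDADeviceContextAllocator, because the event
  // is used purely for ordering.
  cudaEvent_t ev;
  cudaEventCreateWithFlags(&ev, cudaEventDisableTiming);

  // ... enqueue kernels on stream_a here ...

  cudaEventRecord(ev, stream_a);         // snapshot of stream_a's queue
  cudaStreamWaitEvent(stream_b, ev, 0);  // stream_b waits for that snapshot

  // Work enqueued on stream_b from this point runs only after the work
  // recorded on stream_a has completed.

  cudaStreamSynchronize(stream_b);
  cudaEventDestroy(ev);
  cudaStreamDestroy(stream_a);
  cudaStreamDestroy(stream_b);
  std::printf("ordered\n");
  return 0;
}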

paddle/fluid/memory/allocation/cuda_device_context_allocator_pool.cc (deleted)

// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/memory/allocation/cuda_device_context_allocator_pool.h"
#include <utility>
#include <vector>
#include "paddle/fluid/memory/allocation/cuda_device_context_allocation.h"
#include "paddle/fluid/memory/allocation/cuda_device_context_allocator.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace memory {
namespace allocation {
CUDADeviceContextAllocatorPool &CUDADeviceContextAllocatorPool::Instance() {
static CUDADeviceContextAllocatorPool pool;
return pool;
}
AllocationPtr CUDADeviceContextAllocatorPool::Alloc(
const platform::CUDADeviceContext &dev_ctx, size_t size) {
auto iter =
allocators_.find(boost::get<platform::CUDAPlace>(dev_ctx.GetPlace()));
PADDLE_ENFORCE_EQ(iter != allocators_.end(), true,
"CUDADeviceContextAllocatorPool initialization error");
auto &allocator = iter->second;
AllocationPtr allocation = allocator->Allocate(size);
static_cast<CUDADeviceContextAllocation *>(allocation.get())
->SetCUDADeviceContext(&dev_ctx);
return allocation;
}
CUDADeviceContextAllocatorPool::CUDADeviceContextAllocatorPool() {
std::vector<int> devices = platform::GetSelectedDevices();
for (int i : devices) {
auto place = platform::CUDAPlace(i);
auto compute_stream =
platform::DeviceContextPool::Instance().GetByPlace(place)->stream();
auto allocator = std::shared_ptr<CUDADeviceContextAllocator>(
new CUDADeviceContextAllocator(place, compute_stream));
allocators_.insert(make_pair(place, allocator));
}
}
} // namespace allocation
} // namespace memory
} // namespace paddle

paddle/fluid/memory/allocation/cuda_device_context_allocator_pool.h (deleted)

// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <map>
#include <memory>
#include "paddle/fluid/memory/allocation/allocator.h"
#include "paddle/fluid/memory/allocation/cuda_device_context_allocator.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/place.h"
namespace paddle {
namespace memory {
namespace allocation {
/**
 * CUDADeviceContextAllocatorPool is a singleton that stores a mapping from
 * CUDAPlace(s) to std::shared_ptr<CUDADeviceContextAllocator>. When a
 * CUDADeviceContext's compute stream is not the default stream, it can use
 * this class to allocate GPU memory, which is then released by a stream
 * callback after the queued work finishes.
*/
class CUDADeviceContextAllocatorPool {
public:
static CUDADeviceContextAllocatorPool &Instance();
AllocationPtr Alloc(const platform::CUDADeviceContext &dev_ctx, size_t size);
private:
CUDADeviceContextAllocatorPool();
std::map<platform::CUDAPlace, std::shared_ptr<CUDADeviceContextAllocator>>
allocators_;
};
} // namespace allocation
} // namespace memory
} // namespace paddle
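
Stripped of CUDA details, the pool in this header is a Meyers singleton that eagerly builds one allocator per device. A minimal sketch of that structure, with plain device ids standing in for CUDAPlace and a placeholder StreamAllocator (all names here are illustrative, not Paddle's):

#include <map>
#include <memory>

// Placeholder for a per-device allocator; in Paddle this is
// CUDADeviceContextAllocator bound to the device's compute stream.
struct StreamAllocator {
  explicit StreamAllocator(int device) : device(device) {}
  int device;
};

class AllocatorPool {
 public:
  // Meyers singleton: function-local static initialization is thread-safe
  // since C++11.
  static AllocatorPool &Instance() {
    static AllocatorPool pool;
    return pool;
  }

  // Throws std::out_of_range for unknown devices, playing the role of the
  // PADDLE_ENFORCE_EQ lookup check in the real pool.
  StreamAllocator &Get(int device) { return *allocators_.at(device); }

 private:
  AllocatorPool() {
    // Eagerly build one allocator per visible device, as the Paddle pool
    // does for every selected CUDAPlace in its constructor.
    for (int dev : {0}) {  // stand-in for platform::GetSelectedDevices()
      allocators_.emplace(dev, std::make_unique<StreamAllocator>(dev));
    }
  }

  std::map<int, std::unique_ptr<StreamAllocator>> allocators_;
};

int main() { return AllocatorPool::Instance().Get(0).device; }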

paddle/fluid/memory/malloc.cc
@@ -17,10 +17,6 @@ limitations under the License. */
#include <vector>
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/fluid/memory/allocation/allocator_strategy.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/memory/allocation/cuda_device_context_allocator_pool.h"
#endif
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/place.h"
namespace paddle {
@@ -35,26 +31,5 @@ AllocationPtr Alloc(const platform::Place &place, size_t size) {
return allocation::AllocatorFacade::Instance().Alloc(place, size);
}
AllocationPtr Alloc(const platform::DeviceContext &dev_ctx, size_t size) {
auto place = dev_ctx.GetPlace();
#ifdef PADDLE_WITH_CUDA
if (size == 0 || !platform::is_gpu_place(place)) {
return Alloc(place, size);
}
auto *default_dev_ctx = static_cast<platform::CUDADeviceContext *>(
platform::DeviceContextPool::Instance().Get(place));
auto &desired_dev_ctx =
static_cast<const platform::CUDADeviceContext &>(dev_ctx);
if (default_dev_ctx->stream() == desired_dev_ctx.stream()) {
return Alloc(place, size);
} else {
return allocation::CUDADeviceContextAllocatorPool::Instance().Alloc(
desired_dev_ctx, size);
}
#else
return Alloc(place, size);
#endif
}
} // namespace memory
} // namespace paddle
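
The Alloc overload removed here (and re-added in device_context.cc below) encodes a simple routing rule: zero-sized requests, non-GPU requests, and requests whose device context runs on the default compute stream take the ordinary allocator; only foreign-stream contexts pay for the callback-based pool. A sketch of that rule in isolation, with Paddle's types reduced to plain values (all names are illustrative):

#include <cstddef>
#include <cstdio>
#include <cstdlib>

using StreamId = unsigned long long;

// Stand-ins for the two real paths: AllocatorFacade::Alloc and
// CUDADeviceContextAllocatorPool::Alloc.
void *AllocDefault(std::size_t size) { return std::malloc(size); }
void *AllocViaPool(std::size_t size) { return std::malloc(size); }

void *Alloc(bool is_gpu_place, StreamId default_stream,
            StreamId desired_stream, std::size_t size) {
  // Fast path: nothing to order against the default stream.
  if (size == 0 || !is_gpu_place || default_stream == desired_stream) {
    return AllocDefault(size);
  }
  // Foreign stream: route through the deferred-free pool.
  return AllocViaPool(size);
}

int main() {
  void *p = Alloc(/*is_gpu_place=*/true, /*default_stream=*/1,
                  /*desired_stream=*/2, 64);  // takes the pool path
  std::printf("%p\n", p);
  std::free(p);
  return 0;
}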

paddle/fluid/platform/CMakeLists.txt
@@ -70,7 +70,7 @@ ENDIF()
# memcpy depends on device_context; add the deps individually here to
# avoid cyclic dependencies
cc_library(device_context SRCS device_context.cc init.cc DEPS simple_threadpool malloc ${STREAM_CALLBACK_DEPS}
cc_library(device_context SRCS device_context.cc init.cc DEPS simple_threadpool malloc xxhash ${STREAM_CALLBACK_DEPS}
place eigen3 stringpiece cpu_helper cpu_info framework_proto ${GPU_CTX_DEPS} ${MKLDNN_CTX_DEPS}
${dgc_deps})
...

paddle/fluid/platform/device_context.cc
@@ -18,11 +18,39 @@ limitations under the License. */
#include "paddle/fluid/memory/memory.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/framework/rw_lock.h"
#include "paddle/fluid/memory/allocation/cuda_device_context_allocator.h"
#include "paddle/fluid/platform/cuda_device_guard.h"
#endif
#include "glog/logging.h"
namespace paddle {
namespace memory {
AllocationPtr Alloc(const platform::DeviceContext& dev_ctx, size_t size) {
auto place = dev_ctx.GetPlace();
#ifdef PADDLE_WITH_CUDA
if (size == 0 || !platform::is_gpu_place(place)) {
return Alloc(place, size);
}
auto* default_dev_ctx = static_cast<platform::CUDADeviceContext*>(
platform::DeviceContextPool::Instance().Get(place));
auto& desired_dev_ctx =
static_cast<const platform::CUDADeviceContext&>(dev_ctx);
if (default_dev_ctx->stream() == desired_dev_ctx.stream()) {
return Alloc(place, size);
} else {
return allocation::CUDADeviceContextAllocatorPool::Instance().Alloc(
desired_dev_ctx, size);
}
#else
return Alloc(place, size);
#endif
}
} // namespace memory
} // namespace paddle
namespace paddle {
namespace platform {
@@ -174,6 +202,15 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface {
mutable std::unordered_map<void*, memory::AllocationPtr> allocations_;
};
void CudnnWorkspaceHandle::ReallocWorkspace(size_t required_workspace_bytes) {
if (required_workspace_bytes <= WorkspaceSize()) {
return;
}
// reset the old allocation first, before re-allocating, so that both buffers are never held at once
allocation_.reset();
allocation_ = memory::Alloc(device_context_, required_workspace_bytes);
}
CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) {
CUDADeviceGuard guard(place_.device);
compute_capability_ = GetCUDAComputeCapability(place_.device);
...
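
The relocated ReallocWorkspace above is a grow-only buffer with one deliberate detail: the old allocation is released before the larger one is requested, so peak memory is max(old, new) rather than old + new. The same policy in a host-memory sketch (class and member names are illustrative):

#include <cstddef>
#include <memory>

class GrowOnlyWorkspace {
 public:
  void Realloc(std::size_t required_bytes) {
    if (required_bytes <= size_) return;  // never shrink
    // Release the old buffer before allocating the new one so both are
    // never held at the same time, exactly as ReallocWorkspace resets
    // allocation_ before calling memory::Alloc.
    buf_.reset();
    buf_ = std::make_unique<char[]>(required_bytes);
    size_ = required_bytes;
  }

  void *data() { return buf_.get(); }
  std::size_t size() const { return size_; }

 private:
  std::unique_ptr<char[]> buf_;
  std::size_t size_ = 0;
};

int main() {
  GrowOnlyWorkspace ws;
  ws.Realloc(1 << 10);
  ws.Realloc(1 << 20);  // grows
  ws.Realloc(1 << 10);  // no-op: already large enough
  return ws.size() == (1 << 20) ? 0 : 1;
}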

paddle/fluid/platform/device_context.h
@@ -220,14 +220,7 @@ class CudnnWorkspaceHandle {
ResetWorkspace();
}
inline void ReallocWorkspace(size_t required_workspace_bytes) {
if (required_workspace_bytes <= WorkspaceSize()) {
return;
}
// reset the old allocation first, before re-allocating, so that both buffers are never held at once
allocation_.reset();
allocation_ = memory::Alloc(device_context_, required_workspace_bytes);
}
void ReallocWorkspace(size_t required_workspace_bytes);
inline void ResetWorkspace() { allocation_ = nullptr; }
...