Unverified commit 6b5e33b4, authored by Leo Chen, committed by GitHub

add device-agnostic stream class (#38391)

* add device-agnostic stream class

* add stream.h

* fix ut

* fix cpu compile
Parent 78375990
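
In short, this change replaces the GPU-only `gpuStream_t` overloads of `Tensor::mutable_data` and `memory::AllocShared` with device-agnostic signatures that take a `platform::Place` and the new `platform::Stream` wrapper. A rough sketch of how a call site migrates, assuming a CUDA build (the variable names below are illustrative, not from the patch):

    // Before: GPU-specific overload, only compiled under PADDLE_WITH_CUDA/HIP.
    // std::shared_ptr<memory::Allocation> buf =
    //     memory::AllocShared(cuda_place, size, cuda_stream);  // gpuStream_t

    // After: device-agnostic overload; the native handle travels as a StreamId.
    std::shared_ptr<memory::Allocation> buf = memory::AllocShared(
        place, size,
        platform::Stream(reinterpret_cast<platform::StreamId>(cuda_stream)));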
@@ -84,14 +84,9 @@ void* Tensor::mutable_data(const platform::Place& place,
   return mutable_data(place, type_, requested_size);
 }
 
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-void* Tensor::mutable_data(const platform::CUDAPlace& place,
+void* Tensor::mutable_data(const platform::Place& place,
                            proto::VarType::Type type,
-                           const gpuStream_t& stream) {
-  if (!FLAGS_use_stream_safe_cuda_allocator) {
-    return mutable_data(place, type);
-  }
+                           const platform::Stream& stream) {
   type_ = type;
   PADDLE_ENFORCE_GE(
       numel(), 0,
@@ -111,7 +106,6 @@ void* Tensor::mutable_data(const platform::CUDAPlace& place,
   return reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
                                  offset_);
 }
-#endif
 
 Tensor& Tensor::ShareDataWith(const Tensor& src) {
   src.check_memory_size();
......
@@ -28,6 +28,7 @@ limitations under the License. */
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/place.h"
+#include "paddle/fluid/platform/stream/stream.h"
 
 namespace paddle {
 namespace memory {
@@ -150,10 +151,8 @@ class Tensor {
   void* mutable_data(const platform::Place& place, size_t requested_size = 0);
 
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-  void* mutable_data(const platform::CUDAPlace& place,
-                     proto::VarType::Type type, const gpuStream_t& stream);
-#endif
+  void* mutable_data(const platform::Place& place, proto::VarType::Type type,
+                     const platform::Stream& stream);
 
   /**
    * @brief Return a pointer to mutable memory block.
......
@@ -879,9 +879,9 @@ uint64_t AllocatorFacade::Release(const platform::Place& place) {
       ->Release(place);
 }
 
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 std::shared_ptr<Allocation> AllocatorFacade::AllocShared(
-    const platform::CUDAPlace& place, size_t size, const gpuStream_t& stream) {
+    const platform::Place& place, size_t size, const platform::Stream& stream) {
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   PADDLE_ENFORCE_EQ(
       FLAGS_use_stream_safe_cuda_allocator, true,
       platform::errors::Unimplemented(
@@ -896,12 +896,16 @@ std::shared_ptr<Allocation> AllocatorFacade::AllocShared(
         "Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator"));
   }
 #endif
-  return std::shared_ptr<Allocation>(Alloc(place, size, stream));
+  gpuStream_t s = reinterpret_cast<gpuStream_t>(stream.id());
+  return std::shared_ptr<Allocation>(Alloc(place, size, s));
+#else
+  PADDLE_THROW(platform::errors::PreconditionNotMet("Not compiled with GPU."));
+#endif
 }
 
-AllocationPtr AllocatorFacade::Alloc(const platform::CUDAPlace& place,
-                                     size_t size, const gpuStream_t& stream) {
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, size_t size,
+                                     const gpuStream_t& stream) {
   PADDLE_ENFORCE_EQ(
       FLAGS_use_stream_safe_cuda_allocator, true,
       platform::errors::Unimplemented(
@@ -917,11 +921,12 @@ AllocationPtr AllocatorFacade::Alloc(const platform::CUDAPlace& place,
   }
 #endif
 
+  platform::CUDAPlace p = BOOST_GET_CONST(platform::CUDAPlace, place);
   if (LIKELY(size > 0 && FLAGS_use_system_allocator == false)) {
-    return m_->GetAllocator(place, stream, /* create_if_not_found = */ true)
+    return m_->GetAllocator(p, stream, /* create_if_not_found = */ true)
         ->Allocate(size);
   } else {
-    return m_->GetAllocator(place, size)->Allocate(size);
+    return m_->GetAllocator(p, size)->Allocate(size);
   }
 }
......
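
Note how `AllocatorFacade::AllocShared` above recovers the native handle by reinterpreting `stream.id()`, while callers (see the updated tests below) wrap their handle the other way around. A minimal sketch of that round trip, assuming a CUDA build where the handle fits in 64 bits (variable names are illustrative):

    cudaStream_t native_stream;
    PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamCreate(&native_stream));

    // Caller side: wrap the native stream into the device-agnostic type.
    platform::Stream wrapped(reinterpret_cast<platform::StreamId>(native_stream));

    // Allocator side: unwrap it again, as AllocShared does above.
    auto recovered = reinterpret_cast<cudaStream_t>(wrapped.id());
    // recovered and native_stream refer to the same stream handle.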
@@ -22,6 +22,7 @@
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
 #endif
 #include "paddle/fluid/platform/place.h"
+#include "paddle/fluid/platform/stream/stream.h"
 
 namespace paddle {
 namespace memory {
@@ -57,21 +58,23 @@ class AllocatorFacade {
   // Release unused memory pool.
   uint64_t Release(const platform::Place& place);
 
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-  std::shared_ptr<Allocation> AllocShared(const platform::CUDAPlace& place,
+  std::shared_ptr<Allocation> AllocShared(const platform::Place& place,
                                           size_t size,
-                                          const gpuStream_t& stream);
-  AllocationPtr Alloc(const platform::CUDAPlace& place, size_t size,
+                                          const platform::Stream& stream);
+
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  // TODO(zhiqiu): change gpuStream_t to platform::Stream if needed.
+  AllocationPtr Alloc(const platform::Place& place, size_t size,
                       const gpuStream_t& stream);
   uint64_t Release(const platform::CUDAPlace& place, const gpuStream_t& stream);
   void RecordStream(std::shared_ptr<Allocation> allocation,
                     const gpuStream_t& stream);
   const gpuStream_t& GetStream(
       const std::shared_ptr<Allocation>& allocation) const;
+#endif
+
 #ifdef PADDLE_WITH_CUDA
   void PrepareMemoryPoolForCUDAGraph(CUDAGraphID id);
   void RemoveMemoryPoolOfCUDAGraph(CUDAGraphID id);
 #endif
-#endif
 
   // TODO(yy): Allocate a Copy-On-Write allocation?
......
@@ -16,6 +16,7 @@ limitations under the License. */
 #include "paddle/fluid/memory/allocation/allocator_facade.h"
 #include "paddle/fluid/platform/place.h"
+#include "paddle/fluid/platform/stream/stream.h"
 
 namespace paddle {
 namespace memory {
@@ -33,14 +34,14 @@ uint64_t Release(const platform::Place& place) {
   return allocation::AllocatorFacade::Instance().Release(place);
 }
 
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-std::shared_ptr<Allocation> AllocShared(const platform::CUDAPlace& place,
+std::shared_ptr<Allocation> AllocShared(const platform::Place& place,
                                         size_t size,
-                                        const gpuStream_t& stream) {
+                                        const platform::Stream& stream) {
   return allocation::AllocatorFacade::Instance().AllocShared(place, size,
                                                              stream);
 }
 
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 AllocationPtr Alloc(const platform::CUDAPlace& place, size_t size,
                     const gpuStream_t& stream) {
   return allocation::AllocatorFacade::Instance().Alloc(place, size, stream);
......
@@ -18,6 +18,7 @@ limitations under the License. */
 #include "paddle/fluid/memory/allocation/allocator.h"
 #include "paddle/fluid/platform/place.h"
+#include "paddle/fluid/platform/stream/stream.h"
 
 namespace paddle {
@@ -40,11 +41,11 @@ extern AllocationPtr Alloc(const platform::DeviceContext& dev_ctx, size_t size);
 extern uint64_t Release(const platform::Place& place);
 
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-extern std::shared_ptr<Allocation> AllocShared(const platform::CUDAPlace& place,
+extern std::shared_ptr<Allocation> AllocShared(const platform::Place& place,
                                                size_t size,
-                                               const gpuStream_t& stream);
+                                               const platform::Stream& stream);
 
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 extern AllocationPtr Alloc(const platform::CUDAPlace& place, size_t size,
                            const gpuStream_t& stream);
......
@@ -30,6 +30,7 @@
 #include "paddle/fluid/platform/cuda_graph_with_memory_pool.h"
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
 #include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/stream/stream.h"
 
 namespace paddle {
 namespace memory {
@@ -69,8 +70,9 @@ class StreamSafeCUDAAllocTest : public ::testing::Test {
       PADDLE_ENFORCE_GPU_SUCCESS(hipStreamCreate(&stream));
 #endif
-      std::shared_ptr<Allocation> allocation =
-          AllocShared(place_, workspace_size_, stream);
+      std::shared_ptr<Allocation> allocation = AllocShared(
+          place_, workspace_size_,
+          platform::Stream(reinterpret_cast<platform::StreamId>(stream)));
 #ifdef PADDLE_WITH_CUDA
       PADDLE_ENFORCE_GPU_SUCCESS(
           cudaMemset(allocation->ptr(), 0, allocation->size()));
@@ -283,8 +285,9 @@ TEST(StreamSafeCUDAAllocInterfaceTest, GetStreamInterfaceTest) {
   PADDLE_ENFORCE_GPU_SUCCESS(hipStreamCreate(&new_stream));
 #endif
-  std::shared_ptr<Allocation> allocation_new_stream =
-      AllocShared(place, alloc_size, new_stream);
+  std::shared_ptr<Allocation> allocation_new_stream = AllocShared(
+      place, alloc_size,
+      platform::Stream(reinterpret_cast<platform::StreamId>(new_stream)));
   EXPECT_EQ(GetStream(allocation_new_stream), new_stream);
 
 #ifdef PADDLE_WITH_CUDA
@@ -311,7 +314,9 @@ TEST(StreamSafeCUDAAllocInterfaceTest, CUDAGraphExceptionTest) {
   EXPECT_THROW(Release(place), paddle::platform::EnforceNotMet);
   EXPECT_THROW(allocation::AllocatorFacade::Instance().GetAllocator(place),
                paddle::platform::EnforceNotMet);
-  EXPECT_THROW(AllocShared(place, alloc_size, nullptr),
+  EXPECT_THROW(AllocShared(place, alloc_size,
+                           platform::Stream(
+                               reinterpret_cast<platform::StreamId>(nullptr))),
                paddle::platform::EnforceNotMet);
   EXPECT_THROW(Alloc(place, alloc_size, nullptr),
                paddle::platform::EnforceNotMet);
@@ -342,13 +347,16 @@ TEST(StreamSafeCUDAAllocRetryTest, RetryTest) {
   // so the second alloc will fail and retry
   size_t alloc_size = available_size / 4 * 3;
-  std::shared_ptr<Allocation> allocation1 =
-      AllocShared(place, alloc_size, stream1);
+  std::shared_ptr<Allocation> allocation1 = AllocShared(
+      place, alloc_size,
+      platform::Stream(reinterpret_cast<platform::StreamId>(stream1)));
   std::shared_ptr<Allocation> allocation2;
 
   std::thread th([&allocation2, &place, &stream2, alloc_size]() {
     std::this_thread::sleep_for(std::chrono::seconds(1));
-    allocation2 = AllocShared(place, alloc_size, stream2);
+    allocation2 = AllocShared(
+        place, alloc_size,
+        platform::Stream(reinterpret_cast<platform::StreamId>(stream2)));
   });
   allocation1.reset();  // free but not release
   th.join();
......
@@ -15,6 +15,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/var_type.h"
 #include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/stream/stream.h"
 
 namespace paddle {
 namespace platform {
@@ -42,11 +43,15 @@ class MemcpyH2DFunctor {
   void operator()(const framework::LoDTensor &lod_tensor) const {
     auto &out_tensor = *out_->GetMutable<framework::LoDTensor>();
 
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-    out_tensor.mutable_data(
-        BOOST_GET_CONST(platform::CUDAPlace, dev_ctx_.GetPlace()),
-        lod_tensor.type(),
-        static_cast<const platform::CUDADeviceContext *>(&dev_ctx_)->stream());
+    auto stream =
+        static_cast<const platform::CUDADeviceContext *>(&dev_ctx_)->stream();
+#else
+    auto stream = nullptr;
 #endif
+    out_tensor.mutable_data(
+        dev_ctx_.GetPlace(), lod_tensor.type(),
+        platform::Stream(reinterpret_cast<platform::StreamId>(stream)));
 
     if (dst_place_type_ == 0 || dst_place_type_ == 1) {
       framework::TensorCopy(lod_tensor, dev_ctx_.GetPlace(), dev_ctx_,
                             &out_tensor);
......
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once

#include <cstdint>
#include <memory>

namespace paddle {
namespace platform {

using StreamId = uint64_t;

class Stream final {
 public:
  explicit Stream(StreamId id) : id_(id) {}
  StreamId id() const { return id_; }

 private:
  StreamId id_;
};

}  // namespace platform
}  // namespace paddle
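
The wrapper is a plain value holder around an opaque 64-bit id, so it compiles on CPU-only builds as well (the memcpy op above simply wraps a null handle there). A self-contained sanity-check sketch; the id value is an arbitrary example:

    #include <cassert>
    #include "paddle/fluid/platform/stream/stream.h"

    int main() {
      paddle::platform::StreamId raw_id = 0x1234;  // arbitrary example id
      paddle::platform::Stream s(raw_id);
      assert(s.id() == raw_id);  // Stream just stores and returns the id
      return 0;
    }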