From 6b5e33b46bcf5151785c9bd3bfa6a1eb23f51841 Mon Sep 17 00:00:00 2001
From: Leo Chen
Date: Mon, 27 Dec 2021 16:09:57 +0800
Subject: [PATCH] add device-agnostic stream class (#38391)

* add device-agnostic stream class

* add stream.h

* fix ut

* fix cpu compile
---
 paddle/fluid/framework/tensor.cc              | 10 ++----
 paddle/fluid/framework/tensor.h               |  7 ++--
 .../memory/allocation/allocator_facade.cc     | 21 +++++++-----
 .../memory/allocation/allocator_facade.h      | 13 ++++---
 paddle/fluid/memory/malloc.cc                 |  7 ++--
 paddle/fluid/memory/malloc.h                  |  7 ++--
 .../memory/stream_safe_cuda_alloc_test.cu     | 24 ++++++++-----
 paddle/fluid/operators/memcpy_h2d_op.h        | 13 ++++---
 paddle/fluid/platform/stream/stream.h         | 34 +++++++++++++++++++
 9 files changed, 93 insertions(+), 43 deletions(-)
 create mode 100644 paddle/fluid/platform/stream/stream.h

diff --git a/paddle/fluid/framework/tensor.cc b/paddle/fluid/framework/tensor.cc
index cbbc020989..e59733b4fb 100644
--- a/paddle/fluid/framework/tensor.cc
+++ b/paddle/fluid/framework/tensor.cc
@@ -84,14 +84,9 @@ void* Tensor::mutable_data(const platform::Place& place,
   return mutable_data(place, type_, requested_size);
 }
 
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-void* Tensor::mutable_data(const platform::CUDAPlace& place,
+void* Tensor::mutable_data(const platform::Place& place,
                            proto::VarType::Type type,
-                           const gpuStream_t& stream) {
-  if (!FLAGS_use_stream_safe_cuda_allocator) {
-    return mutable_data(place, type);
-  }
-
+                           const platform::Stream& stream) {
   type_ = type;
   PADDLE_ENFORCE_GE(
       numel(), 0,
@@ -111,7 +106,6 @@ void* Tensor::mutable_data(const platform::CUDAPlace& place,
   return reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
                                  offset_);
 }
-#endif
 
 Tensor& Tensor::ShareDataWith(const Tensor& src) {
   src.check_memory_size();
diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h
index 4b1ae041fc..a146f57174 100644
--- a/paddle/fluid/framework/tensor.h
+++ b/paddle/fluid/framework/tensor.h
@@ -28,6 +28,7 @@ limitations under the License. */
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/place.h"
+#include "paddle/fluid/platform/stream/stream.h"
 
 namespace paddle {
 namespace memory {
@@ -150,10 +151,8 @@ class Tensor {
   void* mutable_data(const platform::Place& place,
                      size_t requested_size = 0);
 
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-  void* mutable_data(const platform::CUDAPlace& place,
-                     proto::VarType::Type type, const gpuStream_t& stream);
-#endif
+  void* mutable_data(const platform::Place& place, proto::VarType::Type type,
+                     const platform::Stream& stream);
 
   /**
    * @brief  Return a pointer to mutable memory block.
diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc
index a53c6a8dbe..3a53954436 100644
--- a/paddle/fluid/memory/allocation/allocator_facade.cc
+++ b/paddle/fluid/memory/allocation/allocator_facade.cc
@@ -879,9 +879,9 @@ uint64_t AllocatorFacade::Release(const platform::Place& place) {
       ->Release(place);
 }
 
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 std::shared_ptr<Allocation> AllocatorFacade::AllocShared(
-    const platform::CUDAPlace& place, size_t size, const gpuStream_t& stream) {
+    const platform::Place& place, size_t size, const platform::Stream& stream) {
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   PADDLE_ENFORCE_EQ(
       FLAGS_use_stream_safe_cuda_allocator, true,
       platform::errors::Unimplemented(
@@ -896,12 +896,16 @@ std::shared_ptr<Allocation> AllocatorFacade::AllocShared(
             "Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator"));
   }
 #endif
-
-  return std::shared_ptr<Allocation>(Alloc(place, size, stream));
+  gpuStream_t s = reinterpret_cast<gpuStream_t>(stream.id());
+  return std::shared_ptr<Allocation>(Alloc(place, size, s));
+#else
+  PADDLE_THROW(platform::errors::PreconditionNotMet("Not compiled with GPU."));
+#endif
 }
 
-AllocationPtr AllocatorFacade::Alloc(const platform::CUDAPlace& place,
-                                     size_t size, const gpuStream_t& stream) {
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, size_t size,
+                                     const gpuStream_t& stream) {
   PADDLE_ENFORCE_EQ(
       FLAGS_use_stream_safe_cuda_allocator, true,
       platform::errors::Unimplemented(
@@ -917,11 +921,12 @@ AllocationPtr AllocatorFacade::Alloc(const platform::CUDAPlace& place,
   }
 #endif
 
+  platform::CUDAPlace p = BOOST_GET_CONST(platform::CUDAPlace, place);
   if (LIKELY(size > 0 && FLAGS_use_system_allocator == false)) {
-    return m_->GetAllocator(place, stream, /* create_if_not_found = */ true)
+    return m_->GetAllocator(p, stream, /* create_if_not_found = */ true)
         ->Allocate(size);
   } else {
-    return m_->GetAllocator(place, size)->Allocate(size);
+    return m_->GetAllocator(p, size)->Allocate(size);
   }
 }
 
diff --git a/paddle/fluid/memory/allocation/allocator_facade.h b/paddle/fluid/memory/allocation/allocator_facade.h
index 4c4f805a0c..b10ea9948d 100644
--- a/paddle/fluid/memory/allocation/allocator_facade.h
+++ b/paddle/fluid/memory/allocation/allocator_facade.h
@@ -22,6 +22,7 @@
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
 #endif
 #include "paddle/fluid/platform/place.h"
+#include "paddle/fluid/platform/stream/stream.h"
 
 namespace paddle {
 namespace memory {
@@ -57,21 +58,23 @@ class AllocatorFacade {
   // Release unused memory pool.
   uint64_t Release(const platform::Place& place);
 
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-  std::shared_ptr<Allocation> AllocShared(const platform::CUDAPlace& place,
+  std::shared_ptr<Allocation> AllocShared(const platform::Place& place,
                                           size_t size,
-                                          const gpuStream_t& stream);
-  AllocationPtr Alloc(const platform::CUDAPlace& place, size_t size,
+                                          const platform::Stream& stream);
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  // TODO(zhiqiu): change gpuStream_t to platform::Stream if needed.
+  AllocationPtr Alloc(const platform::Place& place, size_t size,
                       const gpuStream_t& stream);
   uint64_t Release(const platform::CUDAPlace& place, const gpuStream_t& stream);
   void RecordStream(std::shared_ptr<Allocation> allocation,
                     const gpuStream_t& stream);
   const gpuStream_t& GetStream(
       const std::shared_ptr<Allocation>& allocation) const;
+#endif
+
 #ifdef PADDLE_WITH_CUDA
   void PrepareMemoryPoolForCUDAGraph(CUDAGraphID id);
   void RemoveMemoryPoolOfCUDAGraph(CUDAGraphID id);
-#endif
 #endif
 
   // TODO(yy): Allocate a Copy-On-Write allocation?
diff --git a/paddle/fluid/memory/malloc.cc b/paddle/fluid/memory/malloc.cc
index 5ec96c39bb..6a8bb59260 100644
--- a/paddle/fluid/memory/malloc.cc
+++ b/paddle/fluid/memory/malloc.cc
@@ -16,6 +16,7 @@ limitations under the License. */
 
 #include "paddle/fluid/memory/allocation/allocator_facade.h"
 #include "paddle/fluid/platform/place.h"
+#include "paddle/fluid/platform/stream/stream.h"
 
 namespace paddle {
 namespace memory {
@@ -33,14 +34,14 @@ uint64_t Release(const platform::Place& place) {
   return allocation::AllocatorFacade::Instance().Release(place);
 }
 
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-std::shared_ptr<Allocation> AllocShared(const platform::CUDAPlace& place,
+std::shared_ptr<Allocation> AllocShared(const platform::Place& place,
                                         size_t size,
-                                        const gpuStream_t& stream) {
+                                        const platform::Stream& stream) {
   return allocation::AllocatorFacade::Instance().AllocShared(place, size,
                                                              stream);
 }
 
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 AllocationPtr Alloc(const platform::CUDAPlace& place, size_t size,
                     const gpuStream_t& stream) {
   return allocation::AllocatorFacade::Instance().Alloc(place, size, stream);
diff --git a/paddle/fluid/memory/malloc.h b/paddle/fluid/memory/malloc.h
index 7ca15c5dfc..b2ad3a9810 100644
--- a/paddle/fluid/memory/malloc.h
+++ b/paddle/fluid/memory/malloc.h
@@ -18,6 +18,7 @@ limitations under the License. */
 
 #include "paddle/fluid/memory/allocation/allocator.h"
 #include "paddle/fluid/platform/place.h"
+#include "paddle/fluid/platform/stream/stream.h"
 
 namespace paddle {
 
@@ -40,11 +41,11 @@ extern AllocationPtr Alloc(const platform::DeviceContext& dev_ctx,
                            size_t size);
 
 extern uint64_t Release(const platform::Place& place);
 
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-extern std::shared_ptr<Allocation> AllocShared(const platform::CUDAPlace& place,
+extern std::shared_ptr<Allocation> AllocShared(const platform::Place& place,
                                                size_t size,
-                                               const gpuStream_t& stream);
+                                               const platform::Stream& stream);
 
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 extern AllocationPtr Alloc(const platform::CUDAPlace& place, size_t size,
                            const gpuStream_t& stream);
diff --git a/paddle/fluid/memory/stream_safe_cuda_alloc_test.cu b/paddle/fluid/memory/stream_safe_cuda_alloc_test.cu
index 52c3825053..083b8a14d2 100644
--- a/paddle/fluid/memory/stream_safe_cuda_alloc_test.cu
+++ b/paddle/fluid/memory/stream_safe_cuda_alloc_test.cu
@@ -30,6 +30,7 @@
 #include "paddle/fluid/platform/cuda_graph_with_memory_pool.h"
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
 #include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/stream/stream.h"
 
 namespace paddle {
 namespace memory {
@@ -69,8 +70,9 @@ class StreamSafeCUDAAllocTest : public ::testing::Test {
       PADDLE_ENFORCE_GPU_SUCCESS(hipStreamCreate(&stream));
 #endif
 
-      std::shared_ptr<Allocation> allocation =
-          AllocShared(place_, workspace_size_, stream);
+      std::shared_ptr<Allocation> allocation = AllocShared(
+          place_, workspace_size_,
+          platform::Stream(reinterpret_cast<platform::StreamId>(stream)));
 #ifdef PADDLE_WITH_CUDA
       PADDLE_ENFORCE_GPU_SUCCESS(
           cudaMemset(allocation->ptr(), 0, allocation->size()));
@@ -283,8 +285,9 @@ TEST(StreamSafeCUDAAllocInterfaceTest, GetStreamInterfaceTest) {
   PADDLE_ENFORCE_GPU_SUCCESS(hipStreamCreate(&new_stream));
 #endif
 
-  std::shared_ptr<Allocation> allocation_new_stream =
-      AllocShared(place, alloc_size, new_stream);
+  std::shared_ptr<Allocation> allocation_new_stream = AllocShared(
+      place, alloc_size,
+      platform::Stream(reinterpret_cast<platform::StreamId>(new_stream)));
   EXPECT_EQ(GetStream(allocation_new_stream), new_stream);
 
 #ifdef PADDLE_WITH_CUDA
@@ -311,7 +314,9 @@ TEST(StreamSafeCUDAAllocInterfaceTest, CUDAGraphExceptionTest) {
   EXPECT_THROW(Release(place), paddle::platform::EnforceNotMet);
   EXPECT_THROW(allocation::AllocatorFacade::Instance().GetAllocator(place),
               paddle::platform::EnforceNotMet);
-  EXPECT_THROW(AllocShared(place, alloc_size, nullptr),
+  EXPECT_THROW(AllocShared(place, alloc_size,
+                           platform::Stream(
+                               reinterpret_cast<platform::StreamId>(nullptr))),
               paddle::platform::EnforceNotMet);
   EXPECT_THROW(Alloc(place, alloc_size, nullptr),
               paddle::platform::EnforceNotMet);
@@ -342,13 +347,16 @@ TEST(StreamSafeCUDAAllocRetryTest, RetryTest) {
   // so the second alloc will fail and retry
   size_t alloc_size = available_size / 4 * 3;
 
-  std::shared_ptr<Allocation> allocation1 =
-      AllocShared(place, alloc_size, stream1);
+  std::shared_ptr<Allocation> allocation1 = AllocShared(
+      place, alloc_size,
+      platform::Stream(reinterpret_cast<platform::StreamId>(stream1)));
   std::shared_ptr<Allocation> allocation2;
 
   std::thread th([&allocation2, &place, &stream2, alloc_size]() {
    std::this_thread::sleep_for(std::chrono::seconds(1));
-    allocation2 = AllocShared(place, alloc_size, stream2);
+    allocation2 = AllocShared(
+        place, alloc_size,
+        platform::Stream(reinterpret_cast<platform::StreamId>(stream2)));
  });
   allocation1.reset();  // free but not release
   th.join();
diff --git a/paddle/fluid/operators/memcpy_h2d_op.h b/paddle/fluid/operators/memcpy_h2d_op.h
index 43ac5984bc..a19dc3367a 100644
--- a/paddle/fluid/operators/memcpy_h2d_op.h
+++ b/paddle/fluid/operators/memcpy_h2d_op.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/var_type.h"
 #include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/stream/stream.h"
 
 namespace paddle {
 namespace platform {
@@ -42,11 +43,15 @@ class MemcpyH2DFunctor {
   void operator()(const framework::LoDTensor &lod_tensor) const {
     auto &out_tensor = *out_->GetMutable<framework::LoDTensor>();
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-    out_tensor.mutable_data(
-        BOOST_GET_CONST(platform::CUDAPlace, dev_ctx_.GetPlace()),
-        lod_tensor.type(),
-        static_cast<const platform::CUDADeviceContext *>(&dev_ctx_)->stream());
+    auto stream =
+        static_cast<const platform::CUDADeviceContext *>(&dev_ctx_)->stream();
+#else
+    auto stream = nullptr;
 #endif
+    out_tensor.mutable_data(
+        dev_ctx_.GetPlace(), lod_tensor.type(),
+        platform::Stream(reinterpret_cast<platform::StreamId>(stream)));
+
     if (dst_place_type_ == 0 || dst_place_type_ == 1) {
       framework::TensorCopy(lod_tensor, dev_ctx_.GetPlace(), dev_ctx_,
                             &out_tensor);
diff --git a/paddle/fluid/platform/stream/stream.h b/paddle/fluid/platform/stream/stream.h
new file mode 100644
index 0000000000..79ca51220b
--- /dev/null
+++ b/paddle/fluid/platform/stream/stream.h
@@ -0,0 +1,34 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+
+namespace paddle {
+namespace platform {
+
+using StreamId = uint64_t;
+class Stream final {
+ public:
+  explicit Stream(StreamId id) : id_(id) {}
+  StreamId id() const { return id_; }
+
+ private:
+  StreamId id_;
+};
+
+}  // namespace platform
+}  // namespace paddle
-- 
GitLab
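Usage note (illustrative, not part of the patch): a minimal sketch of how a caller on a
CUDA build might wrap a raw gpuStream_t in the new platform::Stream and pass it through
the device-agnostic AllocShared() introduced above. The function name, stream creation,
place, and size below are assumptions for the example only, and the call requires
FLAGS_use_stream_safe_cuda_allocator to be enabled, as enforced in allocator_facade.cc.

  #include <cuda_runtime.h>

  #include "paddle/fluid/memory/malloc.h"
  #include "paddle/fluid/platform/place.h"
  #include "paddle/fluid/platform/stream/stream.h"

  void AllocOnExplicitStream() {
    // Create a native CUDA stream (hipStreamCreate on a HIP build).
    cudaStream_t raw_stream;
    cudaStreamCreate(&raw_stream);

    // Wrap the native handle in the device-agnostic Stream; StreamId is a
    // uint64_t, so the handle is reinterpreted rather than owned.
    paddle::platform::Stream stream(
        reinterpret_cast<paddle::platform::StreamId>(raw_stream));

    // Allocate through the unified interface. With GPU support compiled in,
    // AllocatorFacade unwraps the StreamId back into a gpuStream_t and routes
    // the request to the stream-safe CUDA allocator for the given place.
    auto allocation = paddle::memory::AllocShared(
        paddle::platform::CUDAPlace(0), /*size=*/1024, stream);

    // ... enqueue kernels on raw_stream that read/write allocation->ptr() ...

    cudaStreamDestroy(raw_stream);
  }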