diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 412bc9cbe88b860a698d17e239d7b94d8956b781..867970717b38653252dae014a2ec7c0af2daeff8 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -48,10 +48,10 @@ if(WITH_GPU) nv_library(tensor SRCS tensor.cc .tensor_util.cu DEPS place memory data_type device_context) add_dependencies(tensor tensor_util) else() - nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context) + nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context ) endif(WIN32) else() - cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS place memory data_type device_context) + cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS place memory data_type device_context ) endif() cc_test(tensor_test SRCS tensor_test.cc DEPS tensor) diff --git a/paddle/fluid/framework/tensor.cc b/paddle/fluid/framework/tensor.cc index 5b09cad06c3f87ce29a8c986d30217099bd10d74..ef096c2b810187c50fbcde7d93d9e5a2ecd8b0f3 100644 --- a/paddle/fluid/framework/tensor.cc +++ b/paddle/fluid/framework/tensor.cc @@ -28,8 +28,7 @@ void Tensor::check_memory_size() const { "or maybe the required data-type mismatches the data already stored."); } -Tensor::Tensor(std::type_index type) - : type_(framework::ToDataType(type)), offset_(0) {} +Tensor::Tensor(const proto::VarType::Type& dtype) : type_(dtype), offset_(0) {} size_t Tensor::memory_size() const { return holder_ == nullptr ? 0UL : holder_->size() - offset_; diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h index 2e110133a33ede5c58779f9f7c52abd8e74c2fa0..40606d9b06baf4dbebf87f3c02580e49ae6e2a70 100644 --- a/paddle/fluid/framework/tensor.h +++ b/paddle/fluid/framework/tensor.h @@ -69,7 +69,7 @@ class Tensor { public: Tensor() : type_(proto::VarType::FP32), offset_(0) {} - explicit Tensor(std::type_index type); + explicit Tensor(const proto::VarType::Type&); /*! Return a pointer to mutable memory block. */ template diff --git a/paddle/fluid/framework/tensor_util.h b/paddle/fluid/framework/tensor_util.h index cab6d9b67e4e64335be0a386bfffb7ebe4373b3e..871c7bd2a77d1cc5057177619b5cd7b2083ff308 100644 --- a/paddle/fluid/framework/tensor_util.h +++ b/paddle/fluid/framework/tensor_util.h @@ -19,6 +19,7 @@ limitations under the License. */ #include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/temporary_allocator.h" namespace paddle { namespace framework { @@ -151,5 +152,26 @@ void TensorToVector(const Tensor& src, std::vector* dst) { src_ptr, size); } +template +paddle::framework::Tensor GetTensor( + memory::allocation::AllocationPtr temp_allocation_ptr, + const framework::DDim& dim) { + auto& deleter = temp_allocation_ptr.get_deleter(); + auto* allocation_ptr = temp_allocation_ptr.release(); + auto shared_allocation = + std::shared_ptr(allocation_ptr, deleter); + + PADDLE_ENFORCE( + dynamic_cast(allocation_ptr) != nullptr, + "The AllocationPtr must be TemporaryAllocation."); + PADDLE_ENFORCE_EQ(allocation_ptr->size(), + framework::product(dim) * sizeof(T)); + + paddle::framework::Tensor temp_tensor( + framework::ToDataType(std::type_index(typeid(T)))); + temp_tensor.Resize(dim); + temp_tensor.ResetHolder(std::move(shared_allocation)); + return temp_tensor; +} } // namespace framework } // namespace paddle diff --git a/paddle/fluid/operators/conv_op.h b/paddle/fluid/operators/conv_op.h index 4a7b31c7d491f0e4b73e2b574456d1567b7cc5dc..2519f5e7acdb7828743c6e114adfe5e530058406 100644 --- a/paddle/fluid/operators/conv_op.h +++ b/paddle/fluid/operators/conv_op.h @@ -18,11 +18,11 @@ limitations under the License. */ #include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/depthwise_conv.h" #include "paddle/fluid/operators/math/im2col.h" #include "paddle/fluid/operators/math/vol2col.h" -#include "paddle/fluid/platform/create_tensor_with_allocationptr.h" namespace paddle { namespace operators { @@ -161,10 +161,7 @@ class GemmConvKernel : public framework::OpKernel { auto tmp_allocation_ptr = platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx).Allocate( framework::product(col_shape) * sizeof(T)); - Tensor tep_tensor = - platform::GetTensor(std::move(tmp_allocation_ptr), col_shape); - - col.ShareDataWith(tep_tensor); + col = framework::GetTensor(std::move(tmp_allocation_ptr), col_shape); col_matrix.ShareDataWith(col); col_matrix.Resize(col_matrix_shape); } @@ -299,10 +296,7 @@ class GemmConvGradKernel : public framework::OpKernel { auto tmp_allocation_ptr = platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx).Allocate( framework::product(col_shape) * sizeof(T)); - Tensor tep_tensor = - platform::GetTensor(std::move(tmp_allocation_ptr), col_shape); - - col.ShareDataWith(tep_tensor); + col = framework::GetTensor(std::move(tmp_allocation_ptr), col_shape); col_matrix.ShareDataWith(col); col_matrix.Resize(col_matrix_shape); } diff --git a/paddle/fluid/operators/math/concat_and_split.cu b/paddle/fluid/operators/math/concat_and_split.cu index b10a19b658e383b8c7b4fbbe8f90da1fe0d4fd14..e925e7bb5917c9433c3c79b9a21a41b4d48a5ba0 100644 --- a/paddle/fluid/operators/math/concat_and_split.cu +++ b/paddle/fluid/operators/math/concat_and_split.cu @@ -131,8 +131,9 @@ class ConcatFunctor { int in_col = input[0].numel() / in_row; int out_row = in_row, out_col = 0; - std::vector inputs_data(in_num); + std::vector inputs_data; std::vector inputs_col(in_num + 1); + inputs_data.reserve(in_num); inputs_col[0] = 0; bool sameShape = true; @@ -143,7 +144,7 @@ class ConcatFunctor { } out_col += t_cols; inputs_col[i + 1] = out_col; - inputs_data[i] = const_cast(input[i].data()); + inputs_data.emplace_back(input[i].data()); } // computation diff --git a/paddle/fluid/platform/create_tensor_with_allocationptr.h b/paddle/fluid/platform/create_tensor_with_allocationptr.h deleted file mode 100644 index 00fcc5f86209b2a827ac070773f4b0049b0457d8..0000000000000000000000000000000000000000 --- a/paddle/fluid/platform/create_tensor_with_allocationptr.h +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/platform/temporary_allocator.h" -namespace paddle { -namespace platform { - -template -paddle::framework::Tensor GetTensor( - memory::allocation::AllocationPtr temp_allocation_ptr, - const framework::DDim &dim) { - auto &deleter = temp_allocation_ptr.get_deleter(); - auto *allocation_ptr = temp_allocation_ptr.release(); - auto shared_allocation = - std::shared_ptr(allocation_ptr, deleter); - - PADDLE_ENFORCE(dynamic_cast(allocation_ptr) != nullptr, - "The AllocationPtr must be TemporaryAllocation."); - PADDLE_ENFORCE_EQ(allocation_ptr->size(), - framework::product(dim) * sizeof(T)); - - paddle::framework::Tensor temp_tensor(std::type_index(typeid(T))); - temp_tensor.Resize(dim); - temp_tensor.ResetHolder(std::move(shared_allocation)); - return temp_tensor; -} - -} // namespace platform -} // namespace paddle diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index 81c443d758fcf22545af4bf8e452be8f0ecc0a89..022afb686b29c2c493cfd05600ee372470cbc710 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -256,10 +256,11 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place) LOG_FIRST_N(WARNING, 1) << "Please NOTE: device: " << place_.device << ", CUDA Capability: " << compute_capability_ - << ", Driver Version: " << driver_version_ / 1000 + << ", Driver API Version: " << driver_version_ / 1000 << "." << (driver_version_ % 100) / 10 - << ", Runtime Version: " << runtime_version_ / 1000 - << "." << (runtime_version_ % 100) / 10; + << ", Runtime API Version: " + << runtime_version_ / 1000 << "." + << (runtime_version_ % 100) / 10; size_t cudnn_dso_ver = dynload::cudnnGetVersion(); LOG_FIRST_N(WARNING, 1) << "device: " << place_.device << ", cuDNN Version: " << cudnn_dso_ver / 1000 << "." diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index af9744dcb847f8af97e87cc18d2aee376f3f3d6c..7e875801893f3b73f8efaf33af690f8c855beee4 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -41,7 +41,28 @@ limitations under the License. */ namespace paddle { namespace platform { -/*! \brief device temporary allocator singleton */ +/*! \brief device temporary allocator singleton. + * + * Some operator needs temporary memory during computation, for example, + * conv_gemm, which needs use col to store the result of im2col. If we + * create a stack memory which is used by CUDA Kernel, before the + * Computation(...) returns, we should add ctx->Wait(), because the + * execution of CUDA is async, if there doesn't have ctx->Wait(), + * the temporary memory will be released before the CUDA Kernel uses + * it. + * + * DeviceTemporaryAllocator is a singleton, which contains a + * `TemporaryAllocator` for each . And the TemporaryAllocator + * contains a temp_allocation_queue which is used to store the temporary + * allocations. The allocation, which is allocated by TemporaryAllocator, + * is a unique_ptr, and when it is not held by any variable, it will be + * pushed into the temp_allocation_queue. There are two opportunities to free + * the allocations of temp_allocation_queue: + * - when the Stream calls cudaStreamSynchronize; + * - when the allocation size of opportunities exceeds a certain threshold + * (defined by FLAGS_limit_of_temporary_allocation). + * + * */ class DeviceTemporaryAllocator { public: static DeviceTemporaryAllocator& Instance() { diff --git a/paddle/fluid/platform/temporary_allocator.h b/paddle/fluid/platform/temporary_allocator.h index 4e32d2d6959e69c94e869491ef8d11708870f7df..812c4a333189d8c432be398ca0ebbce11f957561 100644 --- a/paddle/fluid/platform/temporary_allocator.h +++ b/paddle/fluid/platform/temporary_allocator.h @@ -29,6 +29,19 @@ class TemporaryAllocation : public memory::allocation::Allocation { memory::allocation::AllocationPtr underlying_allocation_; }; +/*! \brief the TemporaryAllocator is used to alloc the temporary allocation + * which used by CUDA's async operation. + * + * The TemporaryAllocator contains a temp_allocation_queue which + * is used to store the temporary allocations. The allocation, which is + * allocated by TemporaryAllocator, is a unique_ptr, and when it is not held + * by any variable, it will be pushed into the temp_allocation_queue. + * + * There is one opportunity to free the allocations of temp_allocation_queue: + * - when the allocation size of opportunities exceeds a certain threshold + * (defined by FLAGS_limit_of_temporary_allocation). + * + * */ class TemporaryAllocator : public memory::allocation::Allocator { public: explicit TemporaryAllocator(platform::Place place); diff --git a/paddle/fluid/platform/temporary_allocator_test.cc b/paddle/fluid/platform/temporary_allocator_test.cc index 3b940b0e8243c0ae1e0eeb3a2c13f3d16c228925..e4e5be5b89f4cbecd6b5e9deec9cc5bffa6a4917 100644 --- a/paddle/fluid/platform/temporary_allocator_test.cc +++ b/paddle/fluid/platform/temporary_allocator_test.cc @@ -14,8 +14,7 @@ #include "paddle/fluid/platform/temporary_allocator.h" #include -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/platform/create_tensor_with_allocationptr.h" +#include "paddle/fluid/framework/tensor_util.h" DECLARE_double(limit_of_temporary_allocation); namespace paddle { @@ -47,6 +46,7 @@ TEST(temporary_allocator, temporary_allocator) { TEST(temporary_allocator, add_callback) { #ifdef PADDLE_WITH_CUDA + const double limit = FLAGS_limit_of_temporary_allocation; FLAGS_limit_of_temporary_allocation = 10; platform::CUDAPlace gpu_place(0); TemporaryAllocator gpu_alloc(gpu_place); @@ -63,7 +63,7 @@ TEST(temporary_allocator, add_callback) { }); { gpu_alloc.Allocate(100); } PADDLE_ENFORCE(deleted); - FLAGS_limit_of_temporary_allocation = -1; + FLAGS_limit_of_temporary_allocation = limit; #endif } @@ -75,8 +75,8 @@ TEST(temporary_allocator, create_tensor_with_allocationptr) { auto allocation = cpu_alloc.Allocate(memory_size); void* address = allocation->ptr(); int numel = memory_size / sizeof(float); - framework::Tensor tensor = - GetTensor(std::move(allocation), framework::make_ddim({numel})); + framework::Tensor tensor = framework::GetTensor( + std::move(allocation), framework::make_ddim({numel})); PADDLE_ENFORCE_EQ(address, tensor.data()); PADDLE_ENFORCE_EQ(tensor.numel(), numel); } @@ -90,8 +90,8 @@ TEST(temporary_allocator, create_tensor_with_allocationptr) { auto allocation = gpu_alloc.Allocate(memory_size); void* address = allocation->ptr(); int numel = memory_size / sizeof(float); - framework::Tensor tensor = - GetTensor(std::move(allocation), framework::make_ddim({numel})); + framework::Tensor tensor = framework::GetTensor( + std::move(allocation), framework::make_ddim({numel})); PADDLE_ENFORCE_EQ(address, tensor.data()); PADDLE_ENFORCE_EQ(tensor.numel(), numel); } @@ -116,7 +116,7 @@ TEST(temporary_allocator, create_tensor_with_allocationptr2) { { auto allocation = cpu_alloc.Allocate(memory_size); address = allocation->ptr(); - framework::Tensor tensor = GetTensor( + framework::Tensor tensor = framework::GetTensor( std::move(allocation), framework::make_ddim({numel})); PADDLE_ENFORCE_EQ(address, tensor.data()); PADDLE_ENFORCE_EQ(tensor.numel(), numel); @@ -138,7 +138,7 @@ TEST(temporary_allocator, create_tensor_with_allocationptr2) { { auto allocation = gpu_alloc.Allocate(memory_size); address = allocation->ptr(); - framework::Tensor tensor = GetTensor( + framework::Tensor tensor = framework::GetTensor( std::move(allocation), framework::make_ddim({numel})); PADDLE_ENFORCE_EQ(address, tensor.data()); PADDLE_ENFORCE_EQ(tensor.numel(), numel);