Unverified commit b9fb03cf authored by chengduo, committed by GitHub

Move GetTensor to tensor_util (#15011)

* refine tensor
test=develop

* refine tensor
test=develop

* fix device_context log
test=develop
Parent bc16bcda
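In short, this commit moves the helper that wraps a temporary AllocationPtr into a Tensor from platform::GetTensor (in create_tensor_with_allocationptr.h, which is deleted below) to framework::GetTensor (in tensor_util.h). A minimal sketch of the new call pattern, modelled on the conv_op.h hunks below; dev_ctx, col_shape and T are assumptions standing in for the surrounding kernel's device context, im2col buffer shape, and element type:

// Sketch only, not the committed code: dev_ctx, col_shape and T are assumed
// to come from the enclosing kernel, as in GemmConvKernel below.
auto tmp_allocation_ptr =
    platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx).Allocate(
        framework::product(col_shape) * sizeof(T));
// Before this commit: a platform::GetTensor<T>(...) result was handed to
// `col` via ShareDataWith; now the framework-level helper is assigned
// directly.
framework::Tensor col =
    framework::GetTensor<T>(std::move(tmp_allocation_ptr), col_shape);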
......@@ -48,10 +48,10 @@ if(WITH_GPU)
nv_library(tensor SRCS tensor.cc .tensor_util.cu DEPS place memory data_type device_context)
add_dependencies(tensor tensor_util)
else()
nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context)
nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context )
endif(WIN32)
else()
cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS place memory data_type device_context)
cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS place memory data_type device_context )
endif()
cc_test(tensor_test SRCS tensor_test.cc DEPS tensor)
......
......@@ -28,8 +28,7 @@ void Tensor::check_memory_size() const {
"or maybe the required data-type mismatches the data already stored.");
}
Tensor::Tensor(std::type_index type)
: type_(framework::ToDataType(type)), offset_(0) {}
Tensor::Tensor(const proto::VarType::Type& dtype) : type_(dtype), offset_(0) {}
size_t Tensor::memory_size() const {
return holder_ == nullptr ? 0UL : holder_->size() - offset_;
......
......@@ -69,7 +69,7 @@ class Tensor {
public:
Tensor() : type_(proto::VarType::FP32), offset_(0) {}
explicit Tensor(std::type_index type);
explicit Tensor(const proto::VarType::Type&);
/*! Return a pointer to mutable memory block. */
template <typename T>
......
......@@ -19,6 +19,7 @@ limitations under the License. */
#include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/temporary_allocator.h"
namespace paddle {
namespace framework {
......@@ -151,5 +152,26 @@ void TensorToVector(const Tensor& src, std::vector<T>* dst) {
src_ptr, size);
}
template <typename T>
paddle::framework::Tensor GetTensor(
memory::allocation::AllocationPtr temp_allocation_ptr,
const framework::DDim& dim) {
auto& deleter = temp_allocation_ptr.get_deleter();
auto* allocation_ptr = temp_allocation_ptr.release();
auto shared_allocation =
std::shared_ptr<memory::allocation::Allocation>(allocation_ptr, deleter);
PADDLE_ENFORCE(
dynamic_cast<platform::TemporaryAllocation*>(allocation_ptr) != nullptr,
"The AllocationPtr must be TemporaryAllocation.");
PADDLE_ENFORCE_EQ(allocation_ptr->size(),
framework::product(dim) * sizeof(T));
paddle::framework::Tensor temp_tensor(
framework::ToDataType(std::type_index(typeid(T))));
temp_tensor.Resize(dim);
temp_tensor.ResetHolder(std::move(shared_allocation));
return temp_tensor;
}
} // namespace framework
} // namespace paddle
......@@ -18,11 +18,11 @@ limitations under the License. */
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/depthwise_conv.h"
#include "paddle/fluid/operators/math/im2col.h"
#include "paddle/fluid/operators/math/vol2col.h"
#include "paddle/fluid/platform/create_tensor_with_allocationptr.h"
namespace paddle {
namespace operators {
......@@ -161,10 +161,7 @@ class GemmConvKernel : public framework::OpKernel<T> {
auto tmp_allocation_ptr =
platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx).Allocate(
framework::product(col_shape) * sizeof(T));
Tensor tep_tensor =
platform::GetTensor<T>(std::move(tmp_allocation_ptr), col_shape);
col.ShareDataWith(tep_tensor);
col = framework::GetTensor<T>(std::move(tmp_allocation_ptr), col_shape);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
}
......@@ -299,10 +296,7 @@ class GemmConvGradKernel : public framework::OpKernel<T> {
auto tmp_allocation_ptr =
platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx).Allocate(
framework::product(col_shape) * sizeof(T));
Tensor tep_tensor =
platform::GetTensor<T>(std::move(tmp_allocation_ptr), col_shape);
col.ShareDataWith(tep_tensor);
col = framework::GetTensor<T>(std::move(tmp_allocation_ptr), col_shape);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
}
......
......@@ -131,8 +131,9 @@ class ConcatFunctor<platform::CUDADeviceContext, T> {
int in_col = input[0].numel() / in_row;
int out_row = in_row, out_col = 0;
std::vector<T*> inputs_data(in_num);
std::vector<const T*> inputs_data;
std::vector<int> inputs_col(in_num + 1);
inputs_data.reserve(in_num);
inputs_col[0] = 0;
bool sameShape = true;
......@@ -143,7 +144,7 @@ class ConcatFunctor<platform::CUDADeviceContext, T> {
}
out_col += t_cols;
inputs_col[i + 1] = out_col;
inputs_data[i] = const_cast<T*>(input[i].data<T>());
inputs_data.emplace_back(input[i].data<T>());
}
// computation
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/temporary_allocator.h"
namespace paddle {
namespace platform {
template <typename T>
paddle::framework::Tensor GetTensor(
memory::allocation::AllocationPtr temp_allocation_ptr,
const framework::DDim &dim) {
auto &deleter = temp_allocation_ptr.get_deleter();
auto *allocation_ptr = temp_allocation_ptr.release();
auto shared_allocation =
std::shared_ptr<memory::allocation::Allocation>(allocation_ptr, deleter);
PADDLE_ENFORCE(dynamic_cast<TemporaryAllocation *>(allocation_ptr) != nullptr,
"The AllocationPtr must be TemporaryAllocation.");
PADDLE_ENFORCE_EQ(allocation_ptr->size(),
framework::product(dim) * sizeof(T));
paddle::framework::Tensor temp_tensor(std::type_index(typeid(T)));
temp_tensor.Resize(dim);
temp_tensor.ResetHolder(std::move(shared_allocation));
return temp_tensor;
}
} // namespace platform
} // namespace paddle
......@@ -256,10 +256,11 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place)
LOG_FIRST_N(WARNING, 1) << "Please NOTE: device: " << place_.device
<< ", CUDA Capability: " << compute_capability_
<< ", Driver Version: " << driver_version_ / 1000
<< ", Driver API Version: " << driver_version_ / 1000
<< "." << (driver_version_ % 100) / 10
<< ", Runtime Version: " << runtime_version_ / 1000
<< "." << (runtime_version_ % 100) / 10;
<< ", Runtime API Version: "
<< runtime_version_ / 1000 << "."
<< (runtime_version_ % 100) / 10;
size_t cudnn_dso_ver = dynload::cudnnGetVersion();
LOG_FIRST_N(WARNING, 1) << "device: " << place_.device
<< ", cuDNN Version: " << cudnn_dso_ver / 1000 << "."
......
......@@ -41,7 +41,28 @@ limitations under the License. */
namespace paddle {
namespace platform {
/*! \brief device temporary allocator singleton */
/*! \brief device temporary allocator singleton.
 *
 * Some operators need temporary memory during computation, for example,
 * conv_gemm, which uses `col` to store the result of im2col. If that
 * scratch memory were created on the stack and used by a CUDA kernel,
 * we would have to call ctx->Wait() before the computation returns,
 * because CUDA execution is asynchronous; without ctx->Wait(), the
 * temporary memory would be released before the CUDA kernel uses it.
 *
 * DeviceTemporaryAllocator is a singleton that holds one
 * `TemporaryAllocator` for each <Place, Stream>. Each TemporaryAllocator
 * keeps a temp_allocation_queue that stores the temporary allocations.
 * An allocation returned by TemporaryAllocator is a unique_ptr; once it
 * is no longer held by any variable, it is pushed into the
 * temp_allocation_queue. There are two opportunities to free the
 * allocations in temp_allocation_queue:
 * - when the Stream calls cudaStreamSynchronize;
 * - when the total size of the allocations in temp_allocation_queue
 *   exceeds a certain threshold (defined by
 *   FLAGS_limit_of_temporary_allocation).
 */
class DeviceTemporaryAllocator {
public:
static DeviceTemporaryAllocator& Instance() {
......
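A hedged sketch of the threshold behaviour described in the comment above, adapted from the add_callback test near the end of this diff; the SetCallback hook is taken from that test (its full body is not visible in the hunks shown) and is assumed to synchronize the stream before the queue is freed:

// Sketch only: the callback body and the saved/restored flag value mirror
// the add_callback test; stream setup and error checking are elided.
const double limit = FLAGS_limit_of_temporary_allocation;
FLAGS_limit_of_temporary_allocation = 10;  // aggressive threshold
platform::CUDAPlace gpu_place(0);
platform::TemporaryAllocator gpu_alloc(gpu_place);
gpu_alloc.SetCallback([]() { /* e.g. synchronize the CUDA stream */ });
// The unique_ptr returned by Allocate is dropped at the end of the
// statement; its size exceeds the limit, so the queue is released and the
// callback fires.
{ gpu_alloc.Allocate(100); }
FLAGS_limit_of_temporary_allocation = limit;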
......@@ -29,6 +29,19 @@ class TemporaryAllocation : public memory::allocation::Allocation {
memory::allocation::AllocationPtr underlying_allocation_;
};
/*! \brief TemporaryAllocator allocates the temporary memory used by
 * CUDA's asynchronous operations.
 *
 * The TemporaryAllocator keeps a temp_allocation_queue that stores the
 * temporary allocations. An allocation returned by TemporaryAllocator is
 * a unique_ptr; once it is no longer held by any variable, it is pushed
 * into the temp_allocation_queue.
 *
 * There is one opportunity to free the allocations in
 * temp_allocation_queue:
 * - when the total size of the allocations in temp_allocation_queue
 *   exceeds a certain threshold (defined by
 *   FLAGS_limit_of_temporary_allocation).
 */
class TemporaryAllocator : public memory::allocation::Allocator {
public:
explicit TemporaryAllocator(platform::Place place);
......
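For the CPU path, a trimmed version of the create_tensor_with_allocationptr test further below shows a queue-backed allocation being adopted by a tensor without a copy; memory_size here is an arbitrary illustrative value:

// Sketch distilled from the test below; memory_size is an assumption.
platform::CPUPlace cpu_place;
platform::TemporaryAllocator cpu_alloc(cpu_place);
size_t memory_size = 4 * 1024;
int numel = memory_size / sizeof(float);
auto allocation = cpu_alloc.Allocate(memory_size);
void* address = allocation->ptr();
framework::Tensor tensor = framework::GetTensor<float>(
    std::move(allocation), framework::make_ddim({numel}));
// The tensor points at the same memory as the temporary allocation.
PADDLE_ENFORCE_EQ(address, tensor.data<float>());
PADDLE_ENFORCE_EQ(tensor.numel(), numel);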
......@@ -14,8 +14,7 @@
#include "paddle/fluid/platform/temporary_allocator.h"
#include <gtest/gtest.h>
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/create_tensor_with_allocationptr.h"
#include "paddle/fluid/framework/tensor_util.h"
DECLARE_double(limit_of_temporary_allocation);
namespace paddle {
......@@ -47,6 +46,7 @@ TEST(temporary_allocator, temporary_allocator) {
TEST(temporary_allocator, add_callback) {
#ifdef PADDLE_WITH_CUDA
const double limit = FLAGS_limit_of_temporary_allocation;
FLAGS_limit_of_temporary_allocation = 10;
platform::CUDAPlace gpu_place(0);
TemporaryAllocator gpu_alloc(gpu_place);
......@@ -63,7 +63,7 @@ TEST(temporary_allocator, add_callback) {
});
{ gpu_alloc.Allocate(100); }
PADDLE_ENFORCE(deleted);
FLAGS_limit_of_temporary_allocation = -1;
FLAGS_limit_of_temporary_allocation = limit;
#endif
}
......@@ -75,8 +75,8 @@ TEST(temporary_allocator, create_tensor_with_allocationptr) {
auto allocation = cpu_alloc.Allocate(memory_size);
void* address = allocation->ptr();
int numel = memory_size / sizeof(float);
framework::Tensor tensor =
GetTensor<float>(std::move(allocation), framework::make_ddim({numel}));
framework::Tensor tensor = framework::GetTensor<float>(
std::move(allocation), framework::make_ddim({numel}));
PADDLE_ENFORCE_EQ(address, tensor.data<float>());
PADDLE_ENFORCE_EQ(tensor.numel(), numel);
}
......@@ -90,8 +90,8 @@ TEST(temporary_allocator, create_tensor_with_allocationptr) {
auto allocation = gpu_alloc.Allocate(memory_size);
void* address = allocation->ptr();
int numel = memory_size / sizeof(float);
framework::Tensor tensor =
GetTensor<float>(std::move(allocation), framework::make_ddim({numel}));
framework::Tensor tensor = framework::GetTensor<float>(
std::move(allocation), framework::make_ddim({numel}));
PADDLE_ENFORCE_EQ(address, tensor.data<float>());
PADDLE_ENFORCE_EQ(tensor.numel(), numel);
}
......@@ -116,7 +116,7 @@ TEST(temporary_allocator, create_tensor_with_allocationptr2) {
{
auto allocation = cpu_alloc.Allocate(memory_size);
address = allocation->ptr();
framework::Tensor tensor = GetTensor<float>(
framework::Tensor tensor = framework::GetTensor<float>(
std::move(allocation), framework::make_ddim({numel}));
PADDLE_ENFORCE_EQ(address, tensor.data<float>());
PADDLE_ENFORCE_EQ(tensor.numel(), numel);
......@@ -138,7 +138,7 @@ TEST(temporary_allocator, create_tensor_with_allocationptr2) {
{
auto allocation = gpu_alloc.Allocate(memory_size);
address = allocation->ptr();
framework::Tensor tensor = GetTensor<float>(
framework::Tensor tensor = framework::GetTensor<float>(
std::move(allocation), framework::make_ddim({numel}));
PADDLE_ENFORCE_EQ(address, tensor.data<float>());
PADDLE_ENFORCE_EQ(tensor.numel(), numel);
......