/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #pragma once #include #include #include #include #include #include #include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/framework/ddim.h" #include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/mixed_vector.h" #include "paddle/fluid/memory/memory.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" namespace paddle { namespace memory { namespace allocation { class Allocation; } // namespace allocation } // namespace memory } // namespace paddle namespace paddle { namespace framework { using LoD = std::vector>; /* NOTE(liym27): [ What is TensorInplaceVersion used for? ] TensorInplaceVersion is a version counter and every Tensor has a version counter. It's used to check whether an inplace operation will result in an incorrect gradient calculation. Version is incremented when the data of the Variable is modified in place. - Question: In what scenarios will version counters be shared? - Answer: When two Variables/VarBases share the same C++ Tensor(its Allocation may change), both of them share the same version counter. For examples: 1. `z = paddle.assign(input=x, output=y)`, `z` shares the same version counter of `y` because z and y is the same VarBase; 2. `y = x.detach()`, `y` shares the same version counter of `x`. - Question: In what scenarios will version counters NOT be shared? - Answer: Replacing a `Variable`'s data by calling `Tensor::ShareDataWith(...)` or `Tensor::ShareBufferWith(...)`. Because they share the same Allocation but not framework::Tensor. - Question: Why put the inplace_version_counter_ in framework::Tensor instead of Allocation or Variable? - Answer: 1. Tensor can call ResetHolder() to reset the corresponding Allocation so that the inplace_version_counter_ changes if it's in Allocation, which will lead to confusing information about inplace version. 2. If inplace_version_counter_ is in Variable, different VariableWrappers should be able to share the same Variable. However, a VariableWrapper hold a Variable object but not a pointer. */ class TensorInplaceVersion { public: explicit TensorInplaceVersion(uint32_t inplace_version = 0) : inplace_version_(inplace_version) {} bool IsUnique() const { return inplace_version_ == 0; } void Bump() { ++inplace_version_; } uint32_t CurrentVersion() const { return inplace_version_; } void SetInplaceVersionToZero() { inplace_version_ = 0; } private: uint32_t inplace_version_; }; class Tensor { #ifdef PADDLE_WITH_MKLDNN public: inline dnnl::memory::format_tag format() const { return format_; } inline void set_format(const dnnl::memory::format_tag format) { format_ = format; } protected: /** * @brief the detail format of memory block which have layout as kMKLDNN * * @note MKLDNN lib support various memory format like nchw, nhwc, nChw8C, * nChw16c, etc. For a MKLDNN memory block, layout will be set as * DataLayout::kMKLDNN meanwhile detail memory format will be kept in * this field. */ dnnl::memory::format_tag format_ = dnnl::memory::format_tag::undef; #endif public: template friend struct EigenTensor; template friend struct EigenMatrix; template friend struct EigenVector; public: Tensor() : type_(proto::VarType::FP32), offset_(0), inplace_version_counter_(std::make_shared(0)) {} explicit Tensor(const proto::VarType::Type&); /*! Return a pointer to mutable memory block. */ template T* data(); /*! Return a pointer to constant memory block. */ template const T* data() const; inline bool IsInitialized() const; /** * @brief Return a pointer to mutable memory block. * @note If not exist, then allocation. */ template T* mutable_data(const platform::Place& place, size_t requested_size = 0); void* mutable_data(const platform::Place& place, proto::VarType::Type type, size_t requested_size = 0); void* mutable_data(const platform::Place& place, size_t requested_size = 0); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) void* mutable_data(const platform::CUDAPlace& place, proto::VarType::Type type, const gpuStream_t& stream); #endif /** * @brief Return a pointer to mutable memory block. * * @param[in] dims The dimensions of the memory block. * @param[in] place The place of the memory block. * @param[in] requested_size The size of the block in bytes. * * @note If not exist, then allocation. */ template T* mutable_data(const DDim& dims, const platform::Place& place, size_t requested_size = 0); /*! Return the dimensions of the memory block. */ const DDim& dims() const; /*! Return the numel of the memory block. */ int64_t numel() const; /*! Resize the dimensions of the memory block. */ Tensor& Resize(const DDim& dims); /*! The internal of two tensors share the same memory block. */ Tensor& ShareDataWith(const Tensor& src); /*! The internal of two tensors share the same inplace version counter. */ Tensor& ShareInplaceVersionCounterWith(const Tensor& src); /** * @brief Return a sub-tensor of the given tensor. * * @param[in] begin_idx The index of the start row(inclusive) to slice. * The index number begins from 0. * @param[in] end_idx The index of the end row(exclusive) to slice. * The index number begins from 0. */ Tensor Slice(int64_t begin_idx, int64_t end_idx) const; /** * @brief Return a tensor list of the given tensor. * * @param[in] split_size The size of tensor to be split along axis. * @param[in] axis The axis along which to split. */ std::vector Split(int64_t split_size, int64_t axis) const; /** * @brief Return a tensor list of the given tensor. * * @param[in] chunks The number of tensor to be split along axis. * @param[in] axis The axis along which to split. */ std::vector Chunk(int64_t chunks, int64_t axis) const; const platform::Place& place() const { PADDLE_ENFORCE_NOT_NULL( holder_, platform::errors::PreconditionNotMet( "Tensor not initialized yet when Tensor::place() is called.")); return holder_->place(); } proto::VarType::Type type() const { PADDLE_ENFORCE_NOT_NULL( holder_, platform::errors::PreconditionNotMet( "Tensor not initialized yet when Tensor::type() is called.")); return type_; } /** * [Add method get the saved type of tensor] * * After the introduction of complex number calculations, Ops that support * complex number calculations generally support type promotion, such as * x(float32) + y(complex64) = out(complex64), then the type of the grad * tensor should be dout(complex64), dx(float32), dy (complex64), but the * type of dx to be recognized to be float32 by the grad Op relay on the type * of forward tensor x. But many of our ops have registered InplaceInferer, * covering the tensor memory of x with out, so as to save storage. * * In this case, the dim and type information recorded by x still exist, * but because x becomes an uninitialized tensor, The type of x record cannot * be obtained with x.type(), but the type is still valid here, so we * add saved_type(), This method SHOULD NOT be called by general scenarios. */ proto::VarType::Type saved_type() const { return type_; } // memory size returns the holding memory size in byte. size_t memory_size() const; void check_memory_size() const; DataLayout layout() const { return layout_; } void set_layout(const DataLayout layout) { layout_ = layout; } void clear() { holder_ = nullptr; offset_ = 0; } void ShareBufferWith(const Tensor& tensor) { holder_ = tensor.holder_; offset_ = tensor.offset_; // NOTE(chenfeiyu): when sharing buffer, by definition only holder // to the memory allocation and offset should be shared. Shape, // data type, layout, and other metadata associated with a Tensor // should not be copied. } void ShareDataTypeWith(const Tensor& tensor) { type_ = tensor.type_; } bool IsSharedBufferWith(const Tensor& src) const { return holder_ && holder_ == src.Holder(); } const std::shared_ptr& Holder() const { return holder_; } size_t offset() const { return offset_; } std::shared_ptr MoveMemoryHolder() { return std::move(holder_); } void ResetHolder(std::shared_ptr holder); void ResetHolderWithType(std::shared_ptr holder, const proto::VarType::Type& type); void set_type(const proto::VarType::Type& type); TensorInplaceVersion& InplaceVersionCounter() { return *inplace_version_counter_; } private: /*! holds the memory block if allocated. */ std::shared_ptr holder_; proto::VarType::Type type_; /** * @brief points to elements dimensions. * * @note dims_ do not indicate the memory block size. */ DDim dims_; /** * @brief the layout of memory block, default is NHWC. * * @note the memory allocation order, describe how weight/data is stored * For example, in 4-D Tensor(rank=4), there are three commonly * used layout. They are * NCHW, NHWC, CHWN. * N,C,H,W for respectively the batch size, the number of * feature maps, the height. */ // Fix me: here just change the default layout to kNCHW // it doesn't fix the real issue, i.e. feeder should set up tensor layout // according to actual input data DataLayout layout_ = DataLayout::kNCHW; /** * @brief A PlaceHolder may be shared by more than one tensor. * * @note Some of them may be slices of the others. So the offset_ * is introduced here to indicate the byte offset between * PlaceHolder::ptr_ and where the tensor data really begins. */ size_t offset_; std::shared_ptr inplace_version_counter_; /* ---------------------------------------------------------- */ /* --------------- Reserved for LoDTensor ------------------- */ /* ---------------------------------------------------------- */ public: explicit Tensor(const LoD& lod) : lod_(lod) {} void set_lod(const LoD& lod) { lod_ = lod; } const LoD& lod() const { return lod_; } LoD* mutable_lod() { return &lod_; } std::pair lod_element(size_t level, size_t elem) const { PADDLE_ENFORCE_LT( level, NumLevels(), platform::errors::InvalidArgument( "The input level of LoD is invalid, it should be less than LoD " "size. The input level is %zu, the LoD size is %zu.", level, NumLevels())); PADDLE_ENFORCE_LT(elem, NumElements(level), platform::errors::InvalidArgument( "The input element of LoD is invalid, it should be " "less than the number of elements in its level." "The input element is %zu, the number of elements in " "its level is %zu.", elem, NumElements(level))); return std::make_pair((lod_)[level][elem], (lod_)[level][elem + 1]); } size_t NumLevels() const { return lod_.size(); } size_t NumElements(size_t level = 0) const { PADDLE_ENFORCE_LT( level, NumLevels(), platform::errors::InvalidArgument( "The input level of LoD is invalid, it should be less than LoD " "size. The input level is %zu, the LoD size is %zu.", level, NumLevels())); // the last offset is the end of last element return (lod_)[level].size() - 1; } // Split LoDTensor and copy to each place specified in places. std::vector SplitLoDTensor( const std::vector places) const; void MergeLoDTensor(const std::vector& lod_tensors, platform::Place place); private: LoD lod_; }; // Get the absolute offset of a lod[start_level][start_idx:end_idx] and // relative length of details for every levels(i.e., [start_level: ]). // // For example, // lod = [[0, 3, 4, 8], [0, 9, 10, 11, 13, 17, 19, 22, 24]] // start_level = 0 // start_idx = 1 // end_idx = 3 // // Returns: // LoD = [[1, 4], [2, 4, 2, 3, 2]] // pair = {11, 24} std::pair> GetSubLoDAndAbsoluteOffset( const LoD& lod, size_t start_idx, size_t end_idx, size_t start_level); } // namespace framework } // namespace paddle #include "paddle/fluid/framework/tensor_impl.h"