diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index d8051e1fbb116198781ec2895431f9f8207ad85d..7a3b450f7154853d22182e993cc996dbea0033fa 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -91,15 +91,16 @@ endif()
 
 cc_test(copy_same_tensor_test SRCS copy_same_tensor_test.cc DEPS tensor)
 cc_test(eigen_test SRCS eigen_test.cc DEPS tensor)
 
+cc_library(mixed_vector SRCS mixed_vector.cc DEPS device_context)
 if(WITH_GPU)
-  nv_test(mixed_vector_test SRCS mixed_vector_test.cc mixed_vector_test.cu DEPS place memory device_context tensor)
+  nv_test(mixed_vector_test SRCS mixed_vector_test.cc mixed_vector_test.cu DEPS mixed_vector place memory device_context tensor)
 elseif(WITH_ROCM)
-  hip_test(mixed_vector_test SRCS mixed_vector_test.cc mixed_vector_test.cu DEPS place memory device_context tensor)
+  hip_test(mixed_vector_test SRCS mixed_vector_test.cc mixed_vector_test.cu DEPS mixed_vector place memory device_context tensor)
 else()
-  cc_test(mixed_vector_test SRCS mixed_vector_test.cc DEPS place memory device_context tensor)
+  cc_test(mixed_vector_test SRCS mixed_vector_test.cc DEPS mixed_vector place memory device_context tensor)
 endif()
 
-cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto version)
+cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim mixed_vector place tensor framework_proto version)
 
 cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor memory)
diff --git a/paddle/fluid/framework/mixed_vector.cc b/paddle/fluid/framework/mixed_vector.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b15a66c51c4b6365cb4285894efb1e37a03b7b64
--- /dev/null
+++ b/paddle/fluid/framework/mixed_vector.cc
@@ -0,0 +1,87 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/mixed_vector.h"
+
+#include <algorithm>
+#include <initializer_list>
+#include <memory>
+#include <mutex>  // NOLINT
+#include <utility>
+#include <vector>
+
+#include "glog/logging.h"
+#include "paddle/fluid/framework/details/cow_ptr.h"
+#include "paddle/fluid/memory/malloc.h"
+#include "paddle/fluid/memory/memcpy.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/utils/none.h"
+#include "paddle/utils/optional.h"
+
+namespace paddle {
+namespace framework {
+
+template <typename T>
+void CopyToCPUHelper(std::vector<T> *cpu_, paddle::memory::AllocationPtr *gpu_,
+                     size_t *gpu_memory_size_) {
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  // COPY GPU Data To CPU
+  auto *dev_ctx = static_cast<platform::CUDADeviceContext *>(
+      platform::DeviceContextPool::Instance().Get((*gpu_)->place()));
+  auto stream = dev_ctx->stream();
+  void *src = (*gpu_)->ptr();
+  void *dst = cpu_->data();
+  paddle::memory::Copy(platform::CPUPlace(), dst,
+                       OptionalCUDAPlace(*gpu_).get(), src, *gpu_memory_size_,
+                       stream);
+  dev_ctx->Wait();
+#endif
+}
+
+template <typename T>
+void CopyCPUDataToCUDAHelper(std::vector<T> *cpu_,
+                             paddle::memory::AllocationPtr *gpu_,
+                             size_t *gpu_memory_size_,
+                             const platform::Place &place) {
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  void *src = cpu_->data();
+  *gpu_memory_size_ = cpu_->size() * sizeof(T);  // sizeof(T)
+  (*gpu_) = memory::Alloc(place, *gpu_memory_size_);
+  void *dst = (*gpu_)->ptr();
+  auto *dev_ctx = static_cast<platform::CUDADeviceContext *>(
+      platform::DeviceContextPool::Instance().Get(place));
+  auto stream = dev_ctx->stream();
+  paddle::memory::Copy(OptionalCUDAPlace(*gpu_).get(), dst,
+                       platform::CPUPlace(), src, *gpu_memory_size_, stream);
+#endif
+}
+
+#define INSTANTIATE_VECTOR_FOR_TYPE(__TYPE__)                                  \
+  template <>                                                                  \
+  void Vector<__TYPE__>::VectorData::CopyToCPU() const {                       \
+    CopyToCPUHelper<__TYPE__>(&cpu_, &gpu_, &gpu_memory_size_);                \
+  }                                                                            \
+                                                                               \
+  template <>                                                                  \
+  void Vector<__TYPE__>::VectorData::CopyCPUDataToCUDA(                        \
+      const platform::Place &place) const {                                    \
+    CopyCPUDataToCUDAHelper<__TYPE__>(&cpu_, &gpu_, &gpu_memory_size_, place); \
+  }
+
+INSTANTIATE_VECTOR_FOR_TYPE(size_t)
+INSTANTIATE_VECTOR_FOR_TYPE(int)
+INSTANTIATE_VECTOR_FOR_TYPE(int64_t)
+
+};  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/mixed_vector.h b/paddle/fluid/framework/mixed_vector.h
index cf71cdfc6d651019fce8989ecf30ed12952d6ce7..d1aee6cb2f662b27e9b2c78f28da4ba873b1d7e6 100644
--- a/paddle/fluid/framework/mixed_vector.h
+++ b/paddle/fluid/framework/mixed_vector.h
@@ -23,17 +23,21 @@ limitations under the License. */
 
 #include "glog/logging.h"
 #include "paddle/fluid/framework/details/cow_ptr.h"
-#include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/framework/tensor_util.h"
-#include "paddle/fluid/memory/malloc.h"
-#include "paddle/fluid/memory/memcpy.h"
+#include "paddle/fluid/memory/allocation/allocator.h"
 #include "paddle/utils/none.h"
 #include "paddle/utils/optional.h"
 
 namespace paddle {
 namespace framework {
 
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+inline paddle::optional<platform::CUDAPlace> OptionalCUDAPlace(
+    const paddle::memory::allocation::AllocationPtr &gpu_) {
+  return gpu_ == nullptr
+             ? paddle::none
+             : paddle::optional<platform::CUDAPlace>(
+                   BOOST_GET_CONST(platform::CUDAPlace, gpu_->place()));
+}
+
 // Vector implements the std::vector interface, and can get Data or
 // MutableData from any place. The data will be synced implicitly inside.
 template <typename T>
@@ -198,10 +202,7 @@ class Vector {
     std::mutex &Mutex() const { return mtx_; }
 
     paddle::optional<platform::CUDAPlace> CUDAPlace() const {
-      return gpu_ == nullptr
-                 ? paddle::none
-                 : paddle::optional<platform::CUDAPlace>(
-                       BOOST_GET_CONST(platform::CUDAPlace, gpu_->place()));
+      return OptionalCUDAPlace(gpu_);
     }
 
    private:
@@ -212,17 +213,7 @@ class Vector {
       kDirty = 0x10
     };
 
-    void CopyToCPU() const {
-      // COPY GPU Data To CPU
-      auto *dev_ctx = static_cast<platform::CUDADeviceContext *>(
-          platform::DeviceContextPool::Instance().Get(gpu_->place()));
-      auto stream = dev_ctx->stream();
-      void *src = gpu_->ptr();
-      void *dst = cpu_.data();
-      paddle::memory::Copy(platform::CPUPlace(), dst, CUDAPlace().get(), src,
-                           gpu_memory_size_, stream);
-      dev_ctx->Wait();
-    }
+    void CopyToCPU() const;
 
     void MutableCPU() {
       if (IsInCUDA() && IsDirty()) {
@@ -260,17 +251,7 @@ class Vector {
       }
     }
 
-    void CopyCPUDataToCUDA(const platform::Place &place) const {
-      void *src = cpu_.data();
-      gpu_memory_size_ = cpu_.size() * sizeof(T);
-      gpu_ = memory::Alloc(place, gpu_memory_size_);
-      void *dst = gpu_->ptr();
-      auto *dev_ctx = static_cast<platform::CUDADeviceContext *>(
-          platform::DeviceContextPool::Instance().Get(place));
-      auto stream = dev_ctx->stream();
-      paddle::memory::Copy(CUDAPlace().get(), dst, platform::CPUPlace(), src,
-                           gpu_memory_size_, stream);
-    }
+    void CopyCPUDataToCUDA(const platform::Place &place) const;
 
     void ImmutableCPU() const {
       if (IsDirty() && !IsInCPU()) {  // If data has been changed in CUDA, or
@@ -291,7 +272,7 @@ class Vector {
     bool IsInCPU() const { return flag_ & kDataInCPU; }
 
     mutable std::vector<T> cpu_;
-    mutable paddle::memory::AllocationPtr gpu_;
+    mutable paddle::memory::allocation::AllocationPtr gpu_;
     mutable size_t gpu_memory_size_{0};
     mutable int flag_;
 
@@ -465,81 +446,5 @@ class Vector {
   mutable details::COWPtr<VectorData> m_;
 };
 
-#else  // PADDLE_WITH_CUDA
-
-template <typename T>
-class CPUVector : public std::vector<T, std::allocator<T>> {
- public:
-  CPUVector() : std::vector<T>() {}
-  CPUVector(size_t count, const T &value = T())  // NOLINT
-      : std::vector<T>(count, value) {}
-  CPUVector(std::initializer_list<T> init) : std::vector<T>(init) {}
-  CPUVector(const std::vector<T> &other) : std::vector<T>(other) {}  // NOLINT
-  CPUVector(const CPUVector &other) : std::vector<T>(other) {}
-  CPUVector(CPUVector &&other) : std::vector<T>(std::move(other)) {}
-  CPUVector(std::vector<T> &&other)  // NOLINT
-      : std::vector<T>(std::move(other)) {}
-  CPUVector &operator=(const CPUVector &other) {
-    this->assign(other.begin(), other.end());
-    return *this;
-  }
-  CPUVector &operator=(const std::vector<T> &other) {
-    this->assign(other.begin(), other.end());
-    return *this;
-  }
-
-  friend std::ostream &operator<<(std::ostream &os, const CPUVector &other) {
-    std::stringstream ss;
-    for (auto v : other) {
-      os << v << " ";
-    }
-    return os;
-  }
-
-  T &operator[](size_t id) { return this->at(id); }
-
-  const T &operator[](size_t id) const { return this->at(id); }
-
-  template <typename D>
-  void Extend(const D &begin, const D &end) {
-    this->reserve(this->size() + size_t(end - begin));
-    this->insert(this->end(), begin, end);
-  }
-
-  const T *CUDAData(platform::Place place) const {
-    PADDLE_THROW(platform::errors::Unavailable(
-        "Vector::CUDAData() method is not supported in CPU-only version."));
-  }
-
-  T *CUDAMutableData(platform::Place place) {
-    PADDLE_THROW(platform::errors::Unavailable(
-        "Vector::CUDAMutableData() method is not supported in CPU-only "
-        "version."));
-  }
-
-  const T *Data(platform::Place place) const {
-    PADDLE_ENFORCE_EQ(
-        platform::is_cpu_place(place), true,
-        platform::errors::Unavailable(
-            "Vector::Data() method is not supported when not in CPUPlace."));
-    return this->data();
-  }
-
-  T *MutableData(platform::Place place) {
-    PADDLE_ENFORCE_EQ(
-        platform::is_cpu_place(place), true,
-        platform::errors::Unavailable("Vector::MutableData() method is not "
-                                      "supported when not in CPUPlace."));
-    return this->data();
-  }
-
-  const void *Handle() const { return static_cast<const void *>(this); }
-};
-
-template <typename T>
-using Vector = CPUVector<T>;
-
-#endif  // PADDLE_WITH_CUDA
-
 };  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/mixed_vector_test.cu b/paddle/fluid/framework/mixed_vector_test.cu
index 10e7ed0fb60219c69425442ec4941ac212c7efc5..011e2729d4adffd49c65f536f2ebb33d9a949e56 100644
--- a/paddle/fluid/framework/mixed_vector_test.cu
+++ b/paddle/fluid/framework/mixed_vector_test.cu
@@ -25,6 +25,7 @@
 #include "gtest/gtest.h"
 #include "paddle/fluid/framework/mixed_vector.h"
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
+#include "paddle/fluid/platform/device_context.h"
 
 template <typename T>
 using vec = paddle::framework::Vector<T>;
diff --git a/paddle/fluid/operators/filter_by_instag_op.h b/paddle/fluid/operators/filter_by_instag_op.h
index fd0f42df11875e7d343f5593e406dc0a254ab406..fa0cab04168d1e3ea48fc3cf7397e976a39eac2a 100644
--- a/paddle/fluid/operators/filter_by_instag_op.h
+++ b/paddle/fluid/operators/filter_by_instag_op.h
@@ -31,13 +31,9 @@ namespace operators {
 using Tensor = framework::Tensor;
 using SelectedRows = framework::SelectedRows;
 using LoDTensor = framework::LoDTensor;
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+
 template <typename T>
 using Vector = framework::Vector<T>;
-#else
-template <typename T>
-using Vector = framework::CPUVector<T>;
-#endif
 
 template <typename T>
 class FilterByInstagKernel : public framework::OpKernel<T> {
diff --git a/paddle/fluid/operators/shuffle_batch_op.h b/paddle/fluid/operators/shuffle_batch_op.h
index f05af3f249ce040fa223e45ffe191be540e8776b..bd24bbeb9f0475ae7c466c8e8eb0a79264aeb8b2 100644
--- a/paddle/fluid/operators/shuffle_batch_op.h
+++ b/paddle/fluid/operators/shuffle_batch_op.h
@@ -33,13 +33,9 @@ namespace paddle {
 namespace operators {
 using Tensor = framework::Tensor;
 using LoDTensor = framework::LoDTensor;
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+
 template <typename T>
 using Vector = framework::Vector<T>;
-#else
-template <typename T>
-using Vector = framework::CPUVector<T>;
-#endif
 
 template <typename T>
 class ShuffleBatchKernel : public framework::OpKernel<T> {
diff --git a/paddle/pten/core/CMakeLists.txt b/paddle/pten/core/CMakeLists.txt
index 63cae6cc70867d247cbeaba71262811ddc073f89..28c522d7ea640dc3a935c57c6ab1ab355aae4241 100644
--- a/paddle/pten/core/CMakeLists.txt
+++ b/paddle/pten/core/CMakeLists.txt
@@ -8,7 +8,7 @@ endif()
 cc_library(kernel_factory SRCS kernel_factory.cc DEPS enforce convert_utils)
 cc_library(kernel_context SRCS kernel_context.cc DEPS enforce pten_context)
-
 cc_library(tensor_base SRCS tensor_base.cc allocator.cc storage.cc DEPS enforce)
-cc_library(tensor_meta SRCS tensor_meta.cc DEPS enforce)
+cc_library(tensor_meta SRCS tensor_meta.cc DEPS enforce mixed_vector)
+
 
 cc_library(dense_tensor SRCS dense_tensor.cc DEPS tensor_meta tensor_base)
diff --git a/paddle/pten/core/dense_tensor.h b/paddle/pten/core/dense_tensor.h
index fbb39e6f17fed79189e8f5cf10b5e32c75b8918d..ccbcf02ffe70a53c762d2acebc2ca59a866949fe 100644
--- a/paddle/pten/core/dense_tensor.h
+++ b/paddle/pten/core/dense_tensor.h
@@ -83,9 +83,7 @@ class DenseTensor : public TensorBase,
 
   /// \brief Returns the lod of the tensor.
   /// \return The lod of the tensor.
-  const std::vector<std::vector<size_t>>& lod() const noexcept {
-    return meta_.lod;
-  }
+  const LoD& lod() const noexcept { return meta_.lod; }
 
   /// \brief Returns the data type of the tensor.
   /// \return The data type of the tensor.
diff --git a/paddle/pten/core/tensor_meta.cc b/paddle/pten/core/tensor_meta.cc
index d8a3b5c9b2c61c4dd2d3c07dc12000c9764c003f..3343527e8cd418e0073d702eec5da0b904ce900d 100644
--- a/paddle/pten/core/tensor_meta.cc
+++ b/paddle/pten/core/tensor_meta.cc
@@ -27,7 +27,7 @@ DenseTensorMeta::DenseTensorMeta(DataType dtype,
 DenseTensorMeta::DenseTensorMeta(DataType dtype,
                                  const DDim& dims,
                                  DataLayout layout,
-                                 const std::vector<std::vector<size_t>>& lod)
+                                 const LoD& lod)
     : dims(dims), dtype(dtype), layout(layout), lod(lod) {}
 
 bool DenseTensorMeta::valid() const noexcept {
diff --git a/paddle/pten/core/tensor_meta.h b/paddle/pten/core/tensor_meta.h
index 5341c6bff54498e22a1a2a11428121c9ed7ffc2d..083ef2c5d39a55db380a6c0bfe98e7d5f01a1a2f 100644
--- a/paddle/pten/core/tensor_meta.h
+++ b/paddle/pten/core/tensor_meta.h
@@ -22,15 +22,16 @@ limitations under the License. */
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/framework/ddim.h"
+
 // Note: mixed_vector include many header now, LoD will be
 // used on CUDA device? Can we use small_vector here?
-// #include "paddle/fluid/framework/mixed_vector.h"
+// @zhanlve: Rollback to original LoD for now
+#include "paddle/fluid/framework/mixed_vector.h"
 
 namespace pten {
 
 using DDim = paddle::framework::DDim;
-using LoD = std::vector<std::vector<size_t>>;
-
+using LoD = std::vector<paddle::framework::Vector<size_t>>;
 /// \brief The meta data of dense tensor. Take the structure type
 /// and use all default operations.
 ///
@@ -44,7 +45,7 @@ struct DenseTensorMeta {
   DenseTensorMeta(DataType dtype,
                   const DDim& dims,
                   DataLayout layout,
-                  const std::vector<std::vector<size_t>>& lod);
+                  const LoD& lod);
 
   /// \brief Test whether the metadata is valid. Does not throw exceptions.
   /// \return Whether the metadata is valid.
diff --git a/paddle/pten/tests/api/test_tensor_utils.cc b/paddle/pten/tests/api/test_tensor_utils.cc
index b59cee5dc7e8449fb40203b0fe4f0d2bbc20d01e..50116caca58a765d173f5069b978f7f15cfedfde 100644
--- a/paddle/pten/tests/api/test_tensor_utils.cc
+++ b/paddle/pten/tests/api/test_tensor_utils.cc
@@ -15,6 +15,7 @@ limitations under the License. */
 #include "gtest/gtest.h"
 
 #include "paddle/pten/api/lib/utils/tensor_utils.h"
+#include "paddle/pten/core/tensor_meta.h"
 
 namespace paddle {
 namespace tests {
@@ -30,7 +31,7 @@ TEST(tensor_utils, dense_tensor_to_lod_tensor) {
   const DDim dims({2, 1});
   const DataType dtype{DataType::FLOAT32};
   const DataLayout layout{DataLayout::NCHW};
-  const std::vector<std::vector<size_t>> lod{{0, 2}};
+  const pten::LoD lod{{0, 2}};
   DenseTensorMeta meta(dtype, dims, layout, lod);
 
   auto alloc =
@@ -46,7 +47,7 @@ TEST(tensor_utils, dense_tensor_to_lod_tensor) {
   CHECK(dense_tensor.lod().size() == lod_tensor.lod().size());
   CHECK(dense_tensor.lod()[0] ==
-        static_cast<std::vector<size_t>>((lod_tensor.lod()[0])));
+        static_cast<paddle::framework::Vector<size_t>>((lod_tensor.lod()[0])));
 
   CHECK(dense_tensor.dtype() == pten::TransToPtenDataType(lod_tensor.type()));
   CHECK(dense_tensor.layout() ==
         pten::TransToPtenDataLayout(lod_tensor.layout()));
diff --git a/paddle/pten/tests/core/test_dense_tensor.cc b/paddle/pten/tests/core/test_dense_tensor.cc
index 4a17046b2f30c1ff50df30eddf4fac32925b0e96..07ad582725d50c32f940e59b38823c9b9feb00fc 100644
--- a/paddle/pten/tests/core/test_dense_tensor.cc
+++ b/paddle/pten/tests/core/test_dense_tensor.cc
@@ -25,7 +25,7 @@ TEST(dense_tensor, meta) {
   const DataType dtype{DataType::INT8};
   const DataLayout layout{DataLayout::NHWC};
   // TODO(Shixiaowei02): need to check the lod is valid.
-  const std::vector<std::vector<size_t>> lod{};
+  const LoD lod{};
 
   DenseTensorMeta meta_0;
   CHECK(!meta_0.valid());
@@ -72,7 +72,7 @@ TEST(dense_tensor, ctor) {
   const DDim dims({1, 2});
   const DataType dtype{DataType::INT8};
   const DataLayout layout{DataLayout::NHWC};
-  const std::vector<std::vector<size_t>> lod{};
+  const LoD lod{};
   DenseTensorMeta meta(dtype, dims, layout, lod);
 
   auto alloc = std::make_shared();
@@ -106,7 +106,7 @@ TEST(dense_tensor, resize) {
   const DDim dims({1, 2});
   const DataType dtype{DataType::INT8};
   const DataLayout layout{DataLayout::NHWC};
-  const std::vector<std::vector<size_t>> lod{};
+  const LoD lod{};
   DenseTensorMeta meta(dtype, dims, layout, lod);
 
   auto alloc = std::make_shared();
@@ -126,7 +126,7 @@ TEST(dense_tensor, shallow_copy) {
   const DDim dims({1, 2});
   const DataType dtype{DataType::INT8};
   const DataLayout layout{DataLayout::NHWC};
-  const std::vector<std::vector<size_t>> lod{};
+  const LoD lod{};
   DenseTensorMeta meta(dtype, dims, layout, lod);
 
   auto alloc = std::make_shared();
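Not part of the patch: a minimal usage sketch of the refactored types, pieced together from the updated tests above. The Example() function and the literal values are invented for illustration; the headers, the pten::LoD alias, and the DenseTensorMeta constructor signature are the ones introduced in this diff, while the pten::DataType/DataLayout/DDim spellings are assumed to resolve through tensor_meta.h.

    // Illustrative sketch only; assumes the post-patch headers shown above.
    #include "paddle/fluid/framework/mixed_vector.h"
    #include "paddle/pten/core/tensor_meta.h"

    void Example() {
      // framework::Vector<T> is now defined for every build, so the CPU-only
      // CPUVector fallback removed above is no longer needed.
      paddle::framework::Vector<size_t> offsets{0, 2, 5};
      size_t last = offsets[offsets.size() - 1];  // plain CPU-side access

      // pten::LoD now holds framework::Vector<size_t> levels, matching the
      // updated DenseTensorMeta(dtype, dims, layout, lod) constructor.
      const pten::DDim dims({5, 1});
      const pten::LoD lod{{0, 2, 5}};
      pten::DenseTensorMeta meta(pten::DataType::FLOAT32, dims,
                                 pten::DataLayout::NCHW, lod);
      (void)last;
      (void)meta;
    }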