From de8a8feeb48f981e5d75603eadc491b7eb139064 Mon Sep 17 00:00:00 2001
From: liaogang
Date: Tue, 25 Jul 2017 17:34:59 +0800
Subject: [PATCH] ENH: Refine Tensor and CopyFrom

---
 paddle/framework/CMakeLists.txt      |   2 +-
 paddle/framework/detail/tensor-inl.h | 158 ++++++++++++++++++++++
 paddle/framework/tensor.cc           |   2 +-
 paddle/framework/tensor.h            | 190 +++++++++++++--------------
 paddle/framework/tensor_test.cc      |  49 ++++---
 paddle/platform/device_context.h     |   2 +-
 6 files changed, 277 insertions(+), 126 deletions(-)
 create mode 100644 paddle/framework/detail/tensor-inl.h

diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt
index 760d84e51e..dec4ab80d8 100644
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@@ -3,7 +3,7 @@ cc_library(ddim SRCS ddim.cc DEPS eigen3)
 cc_test(ddim_test SRCS ddim_test.cc DEPS ddim)
 nv_test(dim_test SRCS dim_test.cu DEPS ddim)
 
-cc_library(tensor SRCS tensor.cc DEPS ddim place paddle_memory)
+cc_library(tensor SRCS tensor.cc DEPS ddim place paddle_memory device_context)
 cc_test(tensor_test SRCS tensor_test.cc DEPS tensor)
 cc_test(eigen_test SRCS eigen_test.cc DEPS tensor)

diff --git a/paddle/framework/detail/tensor-inl.h b/paddle/framework/detail/tensor-inl.h
new file mode 100644
index 0000000000..4fc328d231
--- /dev/null
+++ b/paddle/framework/detail/tensor-inl.h
@@ -0,0 +1,158 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/memory/memcpy.h"
+
+namespace paddle {
+namespace framework {
+
+template <typename T>
+inline void Tensor::check_memory_size() const {
+  PADDLE_ENFORCE(holder_ != nullptr,
+                 "Tensor holds no memory. Call Tensor::mutable_data first.");
+  PADDLE_ENFORCE(holder_->size() >= product(dims_) * sizeof(T) + offset_,
+                 "Tensor's dims_ is out of bound. Call Tensor::mutable_data "
+                 "first to re-allocate memory.");
+}
+
+template <typename T>
+inline const T* Tensor::data() const {
+  check_memory_size<T>();
+  return reinterpret_cast<const T*>(
+      reinterpret_cast<uintptr_t>(holder_->ptr()) + offset_);
+}
+
+template <typename T>
+inline T* Tensor::data() {
+  check_memory_size<T>();
+  return reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
+                              offset_);
+}
+
+template <typename T>
+inline T* Tensor::mutable_data(DDim dims, platform::Place place) {
+  Resize(dims);
+  return mutable_data<T>(place);
+}
+
+template <typename T>
+inline T* Tensor::mutable_data(platform::Place place) {
+  PADDLE_ENFORCE(product(dims_) > 0,
+                 "Tensor's numel must be larger than zero to call "
+                 "Tensor::mutable_data. Call Tensor::set_dim first.");
+  /* some versions of boost::variant don't have operator!= */
+  size_t size = product(dims_) * sizeof(T);
+  if (holder_ == nullptr || !(holder_->place() == place) ||
+      holder_->size() < size + offset_) {
+    if (platform::is_cpu_place(place)) {
+      holder_.reset(new PlaceholderImpl<T, platform::CPUPlace>(
+          boost::get<platform::CPUPlace>(place), size));
+    }
+#ifndef PADDLE_ONLY_CPU
+    else if (platform::is_gpu_place(place)) {
+      holder_.reset(new PlaceholderImpl<T, platform::GPUPlace>(
+          boost::get<platform::GPUPlace>(place), size));
+    }
+#endif
+    offset_ = 0;
+  }
+  return reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
+                              offset_);
+}
+
+template <typename T>
+inline void Tensor::ShareDataWith(const Tensor& src) {
+  src.check_memory_size<T>();
+  *this = src;
+}
+
+template <typename T>
+inline void Tensor::CopyFrom(const Tensor& src,
+                             const platform::CPUDeviceContext& ctx) {
+  src.check_memory_size<T>();
+  Resize(src.dims());
+
+  auto src_place = src.holder_->place();
+  auto src_ptr = static_cast<const void*>(src.data<T>());
+
+  auto dst_place = ctx.GetPlace();
+  auto dst_ptr = static_cast<void*>(mutable_data<T>(dst_place));
+
+  auto size = product(src.dims_) * sizeof(T);
+
+  if (platform::is_cpu_place(src_place)) {
+    memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr,
+                 boost::get<platform::CPUPlace>(src_place), src_ptr, size);
+  }
+#ifndef PADDLE_ONLY_CPU
+  else if (platform::is_gpu_place(src_place)) {
+    memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr,
+                 boost::get<platform::GPUPlace>(src_place), src_ptr, size, 0);
+  }
+#endif
+}
+
+#ifndef PADDLE_ONLY_CPU
+template <typename T>
+inline void Tensor::CopyFrom(const Tensor& src,
+                             const platform::CUDADeviceContext& ctx) {
+  src.check_memory_size<T>();
+  Resize(src.dims());
+
+  auto src_place = src.holder_->place();
+  auto src_ptr = static_cast<const void*>(src.data<T>());
+
+  auto dst_place = ctx.GetPlace();
+  auto dst_ptr = static_cast<void*>(mutable_data<T>(dst_place));
+
+  auto size = product(src.dims_) * sizeof(T);
+
+  if (platform::is_cpu_place(src_place)) {
+    memory::Copy(boost::get<platform::GPUPlace>(dst_place), dst_ptr,
+                 boost::get<platform::CPUPlace>(src_place), src_ptr, size,
+                 ctx.stream());
+  } else if (platform::is_gpu_place(src_place)) {
+    memory::Copy(boost::get<platform::GPUPlace>(dst_place), dst_ptr,
+                 boost::get<platform::GPUPlace>(src_place), src_ptr, size,
+                 ctx.stream());
+  }
+}
+#endif
+
+template <typename T>
+inline Tensor Tensor::Slice(const int& begin_idx, const int& end_idx) const {
+  check_memory_size<T>();
+  PADDLE_ENFORCE(begin_idx >= 0, "Slice begin index is less than zero.");
+  PADDLE_ENFORCE(end_idx <= dims_[0], "Slice end index is out of bound.");
+  PADDLE_ENFORCE(begin_idx < end_idx,
+                 "Begin index must be less than end index.");
+  PADDLE_ENFORCE(dims_[0] != 1, "Can not slice a tensor with dims_[0] = 1.");
+  int base = product(dims_) / dims_[0];
+  Tensor dst;
+  dst.holder_ = holder_;
+  DDim dst_dims = dims_;
+  dst_dims[0] = end_idx - begin_idx;
+  dst.Resize(dst_dims);
+  dst.offset_ = offset_ + begin_idx * base * sizeof(T);
+  return dst;
+}
+
+inline void Tensor::Resize(const DDim& dims) { dims_ = dims; }
+
+inline const DDim& Tensor::dims() const { return dims_; }
+
+}  // namespace framework
+}  // namespace paddle
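A usage sketch (not part of the patch): the new file above replaces the old
Place-based CopyFrom with device-context overloads. Assuming this revision's
headers, a CPU-side caller would look roughly like this; the function name
CopyExample is illustrative only.

    #include "paddle/framework/tensor.h"

    void CopyExample() {
      using paddle::framework::Tensor;
      using paddle::framework::make_ddim;

      // mutable_data resizes, then allocates only if the holder is missing,
      // lives on another place, or is too small for dims * sizeof(T).
      Tensor src;
      float* p = src.mutable_data<float>(make_ddim({2, 3}),
                                         paddle::platform::CPUPlace());
      for (int i = 0; i < 6; ++i) p[i] = static_cast<float>(i);

      // CopyFrom now takes a device context instead of a bare Place and
      // resizes the destination to src.dims() internally.
      Tensor dst;
      paddle::platform::CPUDeviceContext cpu_ctx;
      dst.CopyFrom<float>(src, cpu_ctx);
    }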
diff --git a/paddle/framework/tensor.cc b/paddle/framework/tensor.cc
index 964f15ab66..ea7b2a1f7b 100644
--- a/paddle/framework/tensor.cc
+++ b/paddle/framework/tensor.cc
@@ -12,7 +12,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <paddle/framework/tensor.h>
+#include "paddle/framework/tensor.h"
 
 namespace paddle {
 namespace framework {}

diff --git a/paddle/framework/tensor.h b/paddle/framework/tensor.h
index a36f375d2e..d3f56b31cd 100644
--- a/paddle/framework/tensor.h
+++ b/paddle/framework/tensor.h
@@ -20,6 +20,7 @@ limitations under the License. */
 #include <typeindex>
 #include "paddle/framework/ddim.h"
 #include "paddle/memory/memory.h"
+#include "paddle/platform/device_context.h"
 #include "paddle/platform/enforce.h"
 #include "paddle/platform/place.h"
 #include "unsupported/Eigen/CXX11/Tensor"
@@ -31,9 +32,11 @@ template <bool less, size_t i, typename... args>
 struct CastToPyBufferImpl;
 }  // namespace details
 }  // namespace pybind
+
 namespace framework {
 
 class Tensor {
+ public:
   template <bool less, size_t i, typename... args>
   friend struct paddle::pybind::details::CastToPyBufferImpl;
 
@@ -46,106 +49,84 @@ class Tensor {
  public:
   Tensor() : offset_(0) {}
 
+  /*! Return a pointer to mutable memory block. */
   template <typename T>
-  const T* data() const {
-    EnforceSufficientMemory<T>();
-    return reinterpret_cast<const T*>(
-        reinterpret_cast<uintptr_t>(holder_->ptr()) + offset_);
-  }
+  inline T* data();
 
+  /*! Return a pointer to constant memory block. */
   template <typename T>
-  T* data() {
-    EnforceSufficientMemory<T>();
-    return reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
-                                offset_);
-  }
-
-  template <typename T, typename std::enable_if<std::is_pod<T>::value>::type* = nullptr>
-  T* mutable_data(DDim dims, platform::Place place) {
-    Resize(dims);
-    return mutable_data<T>(place);
-  }
-
-  template <typename T, typename std::enable_if<std::is_pod<T>::value>::type* = nullptr>
-  T* mutable_data(platform::Place place) {
-    PADDLE_ENFORCE(product(dims_) > 0,
-                   "Tensor's numel must be larger than zero to call "
-                   "Tensor::mutable_data. Call Tensor::set_dim first.");
-    if (holder_ == nullptr ||
-        !(holder_->place() ==
-          place) /* some versions of boost::variant don't have operator!= */
-        || holder_->size() < product(dims_) * sizeof(T) + offset_) {
-      if (platform::is_cpu_place(place)) {
-        holder_.reset(new PlaceholderImpl<T, platform::CPUPlace>(
-            boost::get<platform::CPUPlace>(place), product(dims_) * sizeof(T)));
-      } else if (platform::is_gpu_place(place)) {
-#ifdef PADDLE_ONLY_CPU
-        PADDLE_THROW("'GPUPlace' is not supported in CPU only device.");
-#else
-        holder_.reset(new PlaceholderImpl<T, platform::GPUPlace>(
-            boost::get<platform::GPUPlace>(place), product(dims_) * sizeof(T)));
-#endif
-      } else {
-        PADDLE_THROW("Unknown 'place'.");
-      }
-      offset_ = 0;
-    }
-    return reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
-                                offset_);
-  }
+  inline const T* data() const;
+
+  /**
+   * @brief   Return a pointer to mutable memory block.
+   * @note    If the block does not exist yet, it is allocated.
+   */
+  template <typename T>
+  inline T* mutable_data(platform::Place place);
+
+  /**
+   * @brief     Return a pointer to mutable memory block.
+   *
+   * @param[in] dims    The dimensions of the memory block.
+   * @param[in] place   The place of the memory block.
+   *
+   * @note      If the block does not exist yet, it is allocated.
+   */
+  template <typename T>
+  inline T* mutable_data(DDim dims, platform::Place place);
 
+  /*! Return the dimensions of the memory block. */
+  inline const DDim& dims() const;
+
+  /*! Resize the dimensions of the memory block. */
+  inline void Resize(const DDim& dims);
+
+  /*! The internals of the two tensors share the same memory block. */
+  template <typename T>
+  inline void ShareDataWith(const Tensor& src);
+
+  /**
+   * @brief   Copy the content of an external tensor to a new place.
+   *
+   * @param[in] src   The external tensor.
+   * @param[in] ctx   The device context that contains the place where
+   *                  the copy is stored.
+   *
+   * @note    CopyFrom supports CPU <-> CPU, CPU <-> GPU, GPU <-> GPU.
+   */
   template <typename T>
-  void ShareDataWith(const Tensor& src) {
-    src.EnforceSufficientMemory<T>();
-    *this = src;
-  }
+  inline void CopyFrom(const Tensor& src,
+                       const platform::CPUDeviceContext& ctx);
 
+#ifndef PADDLE_ONLY_CPU
   template <typename T>
-  void CopyFrom(const Tensor& src, platform::Place dst_place) {
-    PADDLE_ENFORCE(platform::is_cpu_place(src.holder_->place()) &&
-                       platform::is_cpu_place(dst_place),
-                   "Tensor::CopyFrom only support CPU now.");
-    src.EnforceSufficientMemory<T>();
-    size_t size = product(src.dims_) * sizeof(T);
-    Resize(src.dims());
-    const void* src_ptr = static_cast<const void*>(src.data<T>());
-    void* dst_ptr = static_cast<void*>(mutable_data<T>(dst_place));
-    memcpy(dst_ptr, src_ptr, size);
-  }
+  inline void CopyFrom(const Tensor& src,
+                       const platform::CUDADeviceContext& ctx);
+#endif
 
+  /**
+   * @brief   Return a slice of the tensor.
+   *
+   * @param[in] begin_idx   The begin index of the slice.
+   * @param[in] end_idx     The end index of the slice.
+   */
   template <typename T>
-  Tensor Slice(const int& begin_idx, const int& end_idx) const {
-    EnforceSufficientMemory<T>();
-    PADDLE_ENFORCE(begin_idx >= 0, "Slice begin index is less than zero.");
-    PADDLE_ENFORCE(end_idx <= dims_[0], "Slice end index is out of bound.");
-    PADDLE_ENFORCE(begin_idx < end_idx,
-                   "Begin index must be less than end index.");
-    PADDLE_ENFORCE(dims_[0] != 1, "Can not slice a tensor with dims_[0] = 1.");
-    int base = product(dims_) / dims_[0];
-    Tensor dst;
-    dst.holder_ = holder_;
-    DDim dst_dims = dims_;
-    dst_dims[0] = end_idx - begin_idx;
-    dst.Resize(dst_dims);
-    dst.offset_ = offset_ + begin_idx * base * sizeof(T);
-    return dst;
-  }
-
-  void Resize(const DDim& dims) { dims_ = dims; }
-
-  const DDim& dims() const { return dims_; }
+  inline Tensor Slice(const int& begin_idx, const int& end_idx) const;
 
  private:
-  // Placeholder hides type T, so it doesn't appear as a template
-  // parameter of Variable.
+  template <typename T>
+  inline void check_memory_size() const;
+
+ private:
+  /**
+   * @note    Placeholder hides type T, so it doesn't appear as a template
+   *          parameter of Variable.
+   */
   struct Placeholder {
     virtual ~Placeholder() {}
     virtual void* ptr() const = 0;
-    virtual platform::Place place() const = 0;
     virtual size_t size() const = 0;
     virtual std::type_index type() const = 0;
+    virtual platform::Place place() const = 0;
   };
 
   template <typename T, typename PlaceType>
@@ -156,33 +137,38 @@ class Tensor {
           place_(place),
           size_(size) {}
 
-    virtual void* ptr() const { return static_cast<void*>(ptr_.get()); }
     virtual size_t size() const { return size_; }
-    virtual paddle::platform::Place place() const { return place_; }
+    virtual platform::Place place() const { return place_; }
+    virtual void* ptr() const { return static_cast<void*>(ptr_.get()); }
     virtual std::type_index type() const { return std::type_index(typeid(T)); }
 
+    /*! the pointer of memory block. */
    std::unique_ptr<T, memory::PODDeleter<T, PlaceType>> ptr_;
-    platform::Place place_;  // record the place of ptr_.
-    size_t size_;            // size of the memory block.
+
+    /*! the place of memory block. */
+    platform::Place place_;
+
+    /*! the size of memory block. */
+    size_t size_;
   };
 
-  template <typename T>
-  inline void EnforceSufficientMemory() const {
-    PADDLE_ENFORCE(holder_ != nullptr,
-                   "Tenosr holds no memory. Call Tensor::mutable_data first.");
-    PADDLE_ENFORCE(holder_->size() >= product(dims_) * sizeof(T) + offset_,
-                   "Tensor's dims_ is out of bound. Call Tensor::mutable_data "
-                   "first to re-allocate memory.");
-  }
-
-  std::shared_ptr<Placeholder> holder_;  // holds the memory block if allocated.
+  /*! holds the memory block if allocated. */
+  std::shared_ptr<Placeholder> holder_;
+
+  /*! the dimensions of the memory block. */
   DDim dims_;
-  // A PlaceHolder may be shared by more than one tensor. Some of them may be
-  // slices of the others. So the offset_ is introduced here to indicate the
-  // byte offset between PlaceHolder::ptr_ and where tensor's data really
-  // begins.
+
+  /**
+   * @brief   A PlaceHolder may be shared by more than one tensor.
+   *
+   * @note    Some of them may be slices of the others. So the offset_
+   *          is introduced here to indicate the byte offset between
+   *          PlaceHolder::ptr_ and where the tensor data really begins.
+   */
   size_t offset_;
 };
 
 }  // namespace framework
 }  // namespace paddle
+
+#include "paddle/framework/detail/tensor-inl.h"
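The Placeholder/PlaceholderImpl pair kept in tensor.h above is a type-erasure
idiom: the holder interface hides the element type T so that Tensor itself is
not a class template. A minimal standalone sketch of the same idiom
(illustrative names, not code from this patch):

    #include <cstddef>
    #include <memory>
    #include <typeindex>
    #include <typeinfo>

    // Non-template owner: the element type is erased behind an interface.
    struct Holder {
      virtual ~Holder() = default;
      virtual void* ptr() const = 0;
      virtual size_t size() const = 0;
      virtual std::type_index type() const = 0;
    };

    template <typename T>
    struct HolderImpl : Holder {
      explicit HolderImpl(size_t n) : data_(new T[n]), size_(n * sizeof(T)) {}
      void* ptr() const override { return data_.get(); }
      size_t size() const override { return size_; }
      std::type_index type() const override { return typeid(T); }

     private:
      std::unique_ptr<T[]> data_;  // Paddle instead uses memory::Alloc + PODDeleter
      size_t size_;
    };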
diff --git a/paddle/framework/tensor_test.cc b/paddle/framework/tensor_test.cc
index 089844dc01..7987d335ac 100644
--- a/paddle/framework/tensor_test.cc
+++ b/paddle/framework/tensor_test.cc
@@ -72,7 +72,8 @@ TEST(Tensor, MutableData) {
     p2 = src_tensor.mutable_data<float>(make_ddim({2, 2}), CPUPlace());
     EXPECT_EQ(p1, p2);
   }
-#ifdef __CUDACC__
+
+#ifndef PADDLE_ONLY_CPU
   {
     Tensor src_tensor;
    float* p1 = nullptr;
@@ -123,7 +124,7 @@ TEST(Tensor, ShareDataWith) {
     ASSERT_EQ(src_tensor.data<int>(), dst_tensor.data<int>());
   }
 
-#ifdef __CUDACC__
+#ifndef PADDLE_ONLY_CPU
   {
     Tensor src_tensor;
     Tensor dst_tensor;
@@ -160,7 +161,7 @@ TEST(Tensor, Slice) {
     EXPECT_EQ(src_data_address + 3 * 4 * 1 * sizeof(int), slice_data_address);
   }
 
-#ifdef __CUDACC__
+#ifndef PADDLE_ONLY_CPU
   {
     Tensor src_tensor;
     src_tensor.mutable_data<double>(make_ddim({6, 9}), GPUPlace());
@@ -188,25 +189,31 @@ TEST(Tensor, Slice) {
 TEST(Tensor, CopyFrom) {
   using namespace paddle::framework;
   using namespace paddle::platform;
+  {
+    Tensor src_tensor;
+    Tensor dst_tensor;
 
-  Tensor src_tensor;
-  int* src_ptr = src_tensor.mutable_data<int>(make_ddim({3, 3}), CPUPlace());
-  int arr[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9};
-  memcpy(src_ptr, arr, 9 * sizeof(int));
-  Tensor dst_tensor;
-  dst_tensor.CopyFrom<int>(src_tensor, CPUPlace());
-  const int* dst_ptr = dst_tensor.data<int>();
-  ASSERT_NE(src_ptr, dst_ptr);
-  for (size_t i = 0; i < 9; ++i) {
-    EXPECT_EQ(src_ptr[i], dst_ptr[i]);
-  }
+    int* src_ptr = src_tensor.mutable_data<int>(make_ddim({3, 3}), CPUPlace());
 
-  Tensor slice_tensor = src_tensor.Slice<int>(1, 2);
-  dst_tensor.CopyFrom<int>(slice_tensor, CPUPlace());
-  const int* slice_ptr = slice_tensor.data<int>();
-  dst_ptr = dst_tensor.data<int>();
-  ASSERT_NE(dst_ptr, slice_ptr);
-  for (size_t i = 0; i < 3; ++i) {
-    EXPECT_EQ(dst_ptr[i], slice_ptr[i]);
+    int arr[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9};
+    memcpy(src_ptr, arr, 9 * sizeof(int));
+
+    auto cpu_ctx = paddle::platform::CPUDeviceContext();
+    dst_tensor.CopyFrom<int>(src_tensor, cpu_ctx);
+
+    const int* dst_ptr = dst_tensor.data<int>();
+    ASSERT_NE(src_ptr, dst_ptr);
+    for (size_t i = 0; i < 9; ++i) {
+      EXPECT_EQ(src_ptr[i], dst_ptr[i]);
+    }
+
+    Tensor slice_tensor = src_tensor.Slice<int>(1, 2);
+    dst_tensor.CopyFrom<int>(slice_tensor, cpu_ctx);
+    const int* slice_ptr = slice_tensor.data<int>();
+    dst_ptr = dst_tensor.data<int>();
+    ASSERT_NE(dst_ptr, slice_ptr);
+    for (size_t i = 0; i < 3; ++i) {
+      EXPECT_EQ(dst_ptr[i], slice_ptr[i]);
+    }
   }
 }
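The rewritten CopyFrom test only exercises the CPU overload. A GPU round trip
through the new overloads would look roughly like the sketch below. This is
illustrative only and not part of the patch: the CUDADeviceContext constructor
signature is assumed, and an explicit device synchronization would be needed
before comparing results, since the CUDA overload copies asynchronously.

    #ifndef PADDLE_ONLY_CPU
    {
      Tensor cpu_tensor;
      int* cpu_ptr = cpu_tensor.mutable_data<int>(make_ddim({3, 3}), CPUPlace());
      for (int i = 0; i < 9; ++i) cpu_ptr[i] = i;

      // CPU -> GPU: the CUDA overload enqueues the copy on ctx.stream().
      Tensor gpu_tensor;
      CUDADeviceContext gpu_ctx(0);  // assumed constructor taking a device id
      gpu_tensor.CopyFrom<int>(cpu_tensor, gpu_ctx);

      // GPU -> CPU: the CPU overload copies back on the default stream.
      Tensor check_tensor;
      auto cpu_ctx = CPUDeviceContext();
      check_tensor.CopyFrom<int>(gpu_tensor, cpu_ctx);
      // Synchronize the device, then EXPECT_EQ over check_tensor.data<int>().
    }
    #endif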
diff --git a/paddle/platform/device_context.h b/paddle/platform/device_context.h
index fe6f13e399..5a366dccdc 100644
--- a/paddle/platform/device_context.h
+++ b/paddle/platform/device_context.h
@@ -87,7 +87,7 @@ class CUDADeviceContext : public DeviceContext {
                    "cudaStreamSynchronize failed");
   }
 
-  cudaStream_t stream() { return stream_; }
+  cudaStream_t stream() const { return stream_; }
 
   Eigen::GpuDevice* eigen_device() const { return eigen_device_.get(); }
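This one-line change is what allows the new CopyFrom overloads to compile:
CopyFrom receives the context by const reference, and a const object can only
call const-qualified members. A minimal sketch of the constraint (function
name illustrative, guarded like the other GPU-only code in this patch):

    #ifndef PADDLE_ONLY_CPU
    #include "paddle/platform/device_context.h"

    // A caller holding a const reference can only invoke const members,
    // so stream() must be const-qualified for this to compile.
    void EnqueueOn(const paddle::platform::CUDADeviceContext& ctx) {
      cudaStream_t s = ctx.stream();  // ill-formed before this patch
      (void)s;  // e.g. pass s to an asynchronous memory::Copy
    }
    #endif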
--
GitLab