Merge pull request #3056 from gangliao/cpu_mem

ENH: Refine Tensor and Add CopyFrom

Merge pull request #3056 from gangliao/cpu_mem
ENH: Refine Tensor and Add CopyFrom
1cfc8b6e · Yi Wang · GitHub · 55115ac6 · 1c68f119 · 1cfc8b6e
7 changed files
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@@ -3,7 +3,7 @@ cc_library(ddim SRCS ddim.cc DEPS eigen3)
 cc_test(ddim_test SRCS ddim_test.cc DEPS ddim)
 nv_test(dim_test SRCS dim_test.cu DEPS ddim)
-cc_library(tensor SRCS tensor.cc DEPS ddim place paddle_memory)
+cc_library(tensor SRCS tensor.cc DEPS ddim place paddle_memory device_context)
 cc_test(tensor_test SRCS tensor_test.cc DEPS tensor)
 cc_test(eigen_test SRCS eigen_test.cc DEPS tensor)

--- a/paddle/framework/detail/tensor-inl.h
+++ b/paddle/framework/detail/tensor-inl.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include "paddle/memory/memcpy.h"
+namespace paddle {
+namespace framework {
+/**
+ * @brief   Verify that this tensor holds enough memory for its dimensions.
+ *
+ * @note    Enforces that a memory holder exists and that its size covers
+ *          product(dims_) elements of type T starting at byte offset_.
+ */
+template <typename T>
+inline void Tensor::check_memory_size() const {
+  PADDLE_ENFORCE(holder_ != nullptr,
+                 "Tensor holds no memory. Call Tensor::mutable_data first.");
+  PADDLE_ENFORCE(holder_->size() >= product(dims_) * sizeof(T) + offset_,
+                 "Tensor's dims_ is out of bound. Call Tensor::mutable_data "
+                 "first to re-allocate memory.");
+}
+/*! Read-only access: address of the held block advanced by offset_ bytes. */
+template <typename T>
+inline const T* Tensor::data() const {
+  check_memory_size<T>();
+  const uintptr_t block_addr = reinterpret_cast<uintptr_t>(holder_->ptr());
+  return reinterpret_cast<const T*>(block_addr + offset_);
+}
+/*! Mutable access: address of the held block advanced by offset_ bytes. */
+template <typename T>
+inline T* Tensor::data() {
+  check_memory_size<T>();
+  const uintptr_t block_addr = reinterpret_cast<uintptr_t>(holder_->ptr());
+  return reinterpret_cast<T*>(block_addr + offset_);
+}
+/*! Set the dimensions to dims, then hand out the mutable block on place. */
+template <typename T>
+inline T* Tensor::mutable_data(DDim dims, platform::Place place) {
+  static_assert(std::is_pod<T>::value, "T must be POD");
+  this->Resize(dims);
+  T* const block = this->mutable_data<T>(place);
+  return block;
+}
+/**
+ * @brief   Return a pointer to mutable memory block on the given place.
+ *
+ * @param[in] place   The place the memory block must live on.
+ *
+ * @note    A new block is allocated (and offset_ reset to 0) when no block
+ *          is held, when the held block lives on a different place, or when
+ *          the held block is too small for product(dims_) elements of T.
+ *          An unsupported place raises an error instead of leaving holder_
+ *          unset or stale.
+ */
+template <typename T>
+inline T* Tensor::mutable_data(platform::Place place) {
+  static_assert(std::is_pod<T>::value, "T must be POD");
+  PADDLE_ENFORCE(product(dims_) > 0,
+                 "Tensor's numel must be larger than zero to call "
+                 "Tensor::mutable_data. Call Tensor::set_dim first.");
+  size_t size = product(dims_) * sizeof(T);
+  /* some versions of boost::variant don't have operator!= */
+  if (holder_ == nullptr || !(holder_->place() == place) ||
+      holder_->size() < size + offset_) {
+    if (platform::is_cpu_place(place)) {
+      holder_.reset(new PlaceholderImpl<T, platform::CPUPlace>(
+          boost::get<platform::CPUPlace>(place), size));
+    } else if (platform::is_gpu_place(place)) {
+#ifdef PADDLE_ONLY_CPU
+      /* fail loudly rather than dereferencing a null or stale holder_ */
+      PADDLE_THROW("'GPUPlace' is not supported in CPU only device.");
+#else
+      holder_.reset(new PlaceholderImpl<T, platform::GPUPlace>(
+          boost::get<platform::GPUPlace>(place), size));
+#endif
+    } else {
+      PADDLE_THROW("Unknown 'place'.");
+    }
+    offset_ = 0;
+  }
+  return reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
+                              offset_);
+}
+/*! After checking src holds enough memory, assign src to this tensor. */
+template <typename T>
+inline void Tensor::ShareDataWith(const Tensor& src) {
+  src.check_memory_size<T>();
+  this->operator=(src);
+}
+/**
+ * @brief   Copy src into this tensor on the place reported by a CPU context.
+ *
+ * @param[in] src   The tensor to copy from.
+ * @param[in] ctx   The CPU device context whose place receives the data.
+ *
+ * @note    This tensor takes src's dimensions; product(src.dims_) * sizeof(T)
+ *          bytes are copied with memory::Copy.
+ */
+template <typename T>
+inline void Tensor::CopyFrom(const Tensor& src,
+                             const platform::CPUDeviceContext& ctx) {
+  src.check_memory_size<T>();
+  Resize(src.dims());
+
+  auto from_place = src.holder_->place();
+  const void* from = src.data<T>();
+
+  auto to_place = ctx.GetPlace();
+  void* to = mutable_data<T>(to_place);
+
+  auto num_bytes = product(src.dims_) * sizeof(T);
+
+  if (platform::is_cpu_place(from_place)) {
+    memory::Copy(boost::get<platform::CPUPlace>(to_place), to,
+                 boost::get<platform::CPUPlace>(from_place), from, num_bytes);
+  }
+#ifndef PADDLE_ONLY_CPU
+  else if (platform::is_gpu_place(from_place)) {
+    /* trailing 0 is presumably the CUDA stream -- confirm in memcpy.h */
+    memory::Copy(boost::get<platform::CPUPlace>(to_place), to,
+                 boost::get<platform::GPUPlace>(from_place), from, num_bytes,
+                 0);
+  }
+#endif
+}
+#ifndef PADDLE_ONLY_CPU
+/**
+ * @brief   Copy src into this tensor on the place reported by a CUDA context.
+ *
+ * @param[in] src   The tensor to copy from.
+ * @param[in] ctx   The CUDA device context; supplies the destination place
+ *                  and the stream passed to memory::Copy.
+ *
+ * @note    This tensor takes src's dimensions; product(src.dims_) * sizeof(T)
+ *          bytes are copied.
+ */
+template <typename T>
+inline void Tensor::CopyFrom(const Tensor& src,
+                             const platform::CUDADeviceContext& ctx) {
+  src.check_memory_size<T>();
+  Resize(src.dims());
+
+  auto from_place = src.holder_->place();
+  const void* from = src.data<T>();
+
+  auto to_place = ctx.GetPlace();
+  void* to = mutable_data<T>(to_place);
+
+  auto num_bytes = product(src.dims_) * sizeof(T);
+
+  if (platform::is_cpu_place(from_place)) {
+    memory::Copy(boost::get<platform::GPUPlace>(to_place), to,
+                 boost::get<platform::CPUPlace>(from_place), from, num_bytes,
+                 ctx.stream());
+  } else if (platform::is_gpu_place(from_place)) {
+    memory::Copy(boost::get<platform::GPUPlace>(to_place), to,
+                 boost::get<platform::GPUPlace>(from_place), from, num_bytes,
+                 ctx.stream());
+  }
+}
+#endif
+/**
+ * @brief   Return a tensor viewing rows [begin_idx, end_idx) of this tensor.
+ *
+ * @param[in] begin_idx   The begin index of the slice along dimension 0.
+ * @param[in] end_idx     The end index of the slice along dimension 0.
+ *
+ * @note    The result shares this tensor's memory holder; only its first
+ *          dimension and its byte offset differ.
+ */
+template <typename T>
+inline Tensor Tensor::Slice(const int& begin_idx, const int& end_idx) const {
+  check_memory_size<T>();
+  PADDLE_ENFORCE(begin_idx >= 0, "Slice begin index is less than zero.");
+  PADDLE_ENFORCE(end_idx <= dims_[0], "Slice end index is out of bound.");
+  PADDLE_ENFORCE(begin_idx < end_idx,
+                 "Begin index must be less than end index.");
+  PADDLE_ENFORCE(dims_[0] != 1, "Can not slice a tensor with dims_[0] = 1.");
+  /* number of elements in one step along dimension 0 */
+  const size_t base = static_cast<size_t>(product(dims_) / dims_[0]);
+  Tensor dst;
+  dst.holder_ = holder_;
+  DDim dst_dims = dims_;
+  dst_dims[0] = end_idx - begin_idx;
+  dst.Resize(dst_dims);
+  /* widen before multiplying so the byte offset cannot overflow int */
+  dst.offset_ = offset_ + static_cast<size_t>(begin_idx) * base * sizeof(T);
+  return dst;
+}
+/*! Replace the stored dimensions with dims; no memory is touched here. */
+inline void Tensor::Resize(const DDim& dims) {
+  this->dims_ = dims;
+}
+/*! Give read-only access to the stored dimensions. */
+inline const DDim& Tensor::dims() const {
+  return this->dims_;
+}
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/framework/tensor.cc
+++ b/paddle/framework/tensor.cc
@@ -12,7 +12,7 @@
   See the License for the specific language governing permissions and
   limitations under the License. */
-#include <paddle/framework/tensor.h>
+#include "paddle/framework/tensor.h"
 namespace paddle {
 namespace framework {}

--- a/paddle/framework/tensor.h
+++ b/paddle/framework/tensor.h
@@ -20,6 +20,7 @@ limitations under the License. */
 #include <typeindex>
 #include "paddle/framework/ddim.h"
 #include "paddle/memory/memory.h"
+#include "paddle/platform/device_context.h"
 #include "paddle/platform/enforce.h"
 #include "paddle/platform/place.h"
 #include "unsupported/Eigen/CXX11/Tensor"
@@ -31,9 +32,11 @@ template <bool less, size_t i, typename... args>
 struct CastToPyBufferImpl;
 }  // namespace details
 }  // namespace pybind
 namespace framework {
 class Tensor {
+ public:
  template <bool less, size_t i, typename... args>
  friend struct paddle::pybind::details::CastToPyBufferImpl;
@@ -46,106 +49,84 @@ class Tensor {
 public:
  Tensor() : offset_(0) {}
+  /*! Return a pointer to mutable memory block. */
  template <typename T>
-  const T* data() const {
+  inline T* data();
-    EnforceSufficientMemory<T>();
-    return reinterpret_cast<const T*>(
-        reinterpret_cast<uintptr_t>(holder_->ptr()) + offset_);
-  }
+  /*! Return a pointer to constant memory block. */
  template <typename T>
-  T* data() {
+  inline const T* data() const;
-    EnforceSufficientMemory<T>();
-    return reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
+  /**
-                                offset_);
+   * @brief   Return a pointer to mutable memory block.
-  }
+   * @note    If not exist, then allocation.
+   */
-  template <typename T,  // must be POD types
+  template <typename T>
-            typename std::enable_if<std::is_pod<T>::value>::type* = nullptr>
+  inline T* mutable_data(platform::Place place);
-  T* mutable_data(DDim dims, platform::Place place) {
-    Resize(dims);
+  /**
-    return mutable_data<T>(place);
+   * @brief     Return a pointer to mutable memory block.
-  }
+   *
+   * @param[in] dims    The dimensions of the memory block.
-  template <typename T,  // must be POD types
+   * @param[in] place   The place of the memory block.
-            typename std::enable_if<std::is_pod<T>::value>::type* = nullptr>
+   *
-  T* mutable_data(platform::Place place) {
+   * @note      If not exist, then allocation.
-    PADDLE_ENFORCE(product(dims_) > 0,
+   */
-                   "Tensor's numel must be larger than zero to call "
+  template <typename T>
-                   "Tensor::mutable_data. Call Tensor::set_dim first.");
+  inline T* mutable_data(DDim dims, platform::Place place);
-    if (holder_ == nullptr ||
-        !(holder_->place() ==
-          place) /* some versions of boost::variant don't have operator!= */
-        || holder_->size() < product(dims_) * sizeof(T) + offset_) {
-      if (platform::is_cpu_place(place)) {
-        holder_.reset(new PlaceholderImpl<T, platform::CPUPlace>(
-            boost::get<platform::CPUPlace>(place), product(dims_) * sizeof(T)));
-      } else if (platform::is_gpu_place(place)) {
-#ifdef PADDLE_ONLY_CPU
-        PADDLE_THROW("'GPUPlace' is not supported in CPU only device.");
-#else
-        holder_.reset(new PlaceholderImpl<T, platform::GPUPlace>(
-            boost::get<platform::GPUPlace>(place), product(dims_) * sizeof(T)));
-#endif
-      } else {
-        PADDLE_THROW("Unknown 'place'.");
-      }
-      offset_ = 0;
-    }
-    return reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
-                                offset_);
-  }
+  /*! Return the dimensions of the memory block. */
+  inline const DDim& dims() const;
+  /*! Resize the dimensions of the memory block. */
+  inline void Resize(const DDim& dims);
+  /*! The internal of two tensors share the same memory block. */
+  template <typename T>
+  inline void ShareDataWith(const Tensor& src);
+  /**
+   * @brief   Copy the content of external tensor to a new place.
+   *
+   * @param[in] src   The external tensor.
+   * @param[in] ctx   The device context contains place where to store.
+   *
+   * @note    CopyFrom supports CPU <-> GPU, GPU <-> GPU.
+   */
  template <typename T>
-  void ShareDataWith(const Tensor& src) {
+  inline void CopyFrom(const Tensor& src,
-    src.EnforceSufficientMemory<T>();
+                       const platform::CPUDeviceContext& ctx);
-    *this = src;
-  }
+#ifndef PADDLE_ONLY_CPU
  template <typename T>
-  void CopyFrom(const Tensor& src, platform::Place dst_place) {
+  inline void CopyFrom(const Tensor& src,
-    PADDLE_ENFORCE(platform::is_cpu_place(src.holder_->place()) &&
+                       const platform::CUDADeviceContext& ctx);
-                       platform::is_cpu_place(dst_place),
+#endif
-                   "Tensor::CopyFrom only support CPU now.");
-    src.EnforceSufficientMemory<T>();
-    size_t size = product(src.dims_) * sizeof(T);
-    Resize(src.dims());
-    const void* src_ptr = static_cast<const void*>(src.data<T>());
-    void* dst_ptr = static_cast<void*>(mutable_data<T>(dst_place));
-    memcpy(dst_ptr, src_ptr, size);
-  }
+  /**
+   * @brief   Return the slice of the tensor.
+   *
+   * @param[in] begin_idx   The begin index of the slice.
+   * @param[in] end_idx     The end index of the slice.
+   */
  template <typename T>
-  Tensor Slice(const int& begin_idx, const int& end_idx) const {
+  inline Tensor Slice(const int& begin_idx, const int& end_idx) const;
-    EnforceSufficientMemory<T>();
-    PADDLE_ENFORCE(begin_idx >= 0, "Slice begin index is less than zero.");
-    PADDLE_ENFORCE(end_idx <= dims_[0], "Slice end index is out of bound.");
-    PADDLE_ENFORCE(begin_idx < end_idx,
-                   "Begin index must be less than end index.");
-    PADDLE_ENFORCE(dims_[0] != 1, "Can not slice a tensor with dims_[0] = 1.");
-    int base = product(dims_) / dims_[0];
-    Tensor dst;
-    dst.holder_ = holder_;
-    DDim dst_dims = dims_;
-    dst_dims[0] = end_idx - begin_idx;
-    dst.Resize(dst_dims);
-    dst.offset_ = offset_ + begin_idx * base * sizeof(T);
-    return dst;
-  }
-  void Resize(const DDim& dims) { dims_ = dims; }
-  const DDim& dims() const { return dims_; }
 private:
-  // Placeholder hides type T, so it doesn't appear as a template
+  template <typename T>
-  // parameter of Variable.
+  inline void check_memory_size() const;
+ private:
+  /**
+   * @note    Placeholder hides type T, so it doesn't appear as a template
+   *          parameter of Variable.
+   */
  struct Placeholder {
    virtual ~Placeholder() {}
    virtual void* ptr() const = 0;
-    virtual platform::Place place() const = 0;
    virtual size_t size() const = 0;
    virtual std::type_index type() const = 0;
+    virtual platform::Place place() const = 0;
  };
  template <typename T, typename PlaceType>
@@ -156,33 +137,38 @@ class Tensor {
          place_(place),
          size_(size) {}
-    virtual void* ptr() const { return static_cast<void*>(ptr_.get()); }
    virtual size_t size() const { return size_; }
-    virtual paddle::platform::Place place() const { return place_; }
+    virtual platform::Place place() const { return place_; }
+    virtual void* ptr() const { return static_cast<void*>(ptr_.get()); }
    virtual std::type_index type() const { return std::type_index(typeid(T)); }
+    /*! the pointer of memory block. */
    std::unique_ptr<T, memory::PODDeleter<T, PlaceType>> ptr_;
-    platform::Place place_;  // record the place of ptr_.
-    size_t size_;            // size of the memory block.
+    /*! the place of memory block. */
+    platform::Place place_;
+    /*! the size of memory block. */
+    size_t size_;
  };
-  template <typename T>
+  /*! holds the memory block if allocated. */
-  inline void EnforceSufficientMemory() const {
+  std::shared_ptr<Placeholder> holder_;
-    PADDLE_ENFORCE(holder_ != nullptr,
-                   "Tenosr holds no memory. Call Tensor::mutable_data first.");
+  /*! points to dimensions of memory block. */
-    PADDLE_ENFORCE(holder_->size() >= product(dims_) * sizeof(T) + offset_,
-                   "Tensor's dims_ is out of bound. Call Tensor::mutable_data "
-                   "first to re-allocate memory.");
-  }
-  std::shared_ptr<Placeholder> holder_;  // holds the memory block if allocated.
  DDim dims_;
-  // A PlaceHolder may be shared by more than one tensor. Some of them may be
-  // slices of the others. So the offset_ is introduced here to indicate the
+  /**
-  // byte offset between PlaceHolder::ptr_ and where tensor's data really
+   * @brief   A PlaceHolder may be shared by more than one tensor.
-  // begins.
+   *
+   * @note    Some of them may be slices of the others. So the offset_
+   *          is introduced here to indicate the byte offset between
+   *          PlaceHolder::ptr_ and where the tensor data really begins.
+   */
  size_t offset_;
 };
 }  // namespace framework
 }  // namespace paddle
+#include "paddle/framework/detail/tensor-inl.h"
--- a/paddle/framework/tensor_test.cc
+++ b/paddle/framework/tensor_test.cc
@@ -72,7 +72,8 @@ TEST(Tensor, MutableData) {
    p2 = src_tensor.mutable_data<float>(make_ddim({2, 2}), CPUPlace());
    EXPECT_EQ(p1, p2);
  }
-#ifdef __CUDACC__
+#ifndef PADDLE_ONLY_CPU
  {
    Tensor src_tensor;
    float* p1 = nullptr;
@@ -123,7 +124,7 @@ TEST(Tensor, ShareDataWith) {
    ASSERT_EQ(src_tensor.data<int>(), dst_tensor.data<int>());
  }
-#ifdef __CUDACC__
+#ifndef PADDLE_ONLY_CPU
  {
    Tensor src_tensor;
    Tensor dst_tensor;
@@ -160,7 +161,7 @@ TEST(Tensor, Slice) {
    EXPECT_EQ(src_data_address + 3 * 4 * 1 * sizeof(int), slice_data_address);
  }
-#ifdef __CUDACC__
+#ifndef PADDLE_ONLY_CPU
  {
    Tensor src_tensor;
    src_tensor.mutable_data<double>(make_ddim({6, 9}), GPUPlace());
@@ -188,25 +189,74 @@ TEST(Tensor, Slice) {
 TEST(Tensor, CopyFrom) {
  using namespace paddle::framework;
  using namespace paddle::platform;
+  {
+    Tensor src_tensor;
+    Tensor dst_tensor;
+    int* src_ptr = src_tensor.mutable_data<int>(make_ddim({3, 3}), CPUPlace());
+    int arr[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9};
+    memcpy(src_ptr, arr, 9 * sizeof(int));
-  Tensor src_tensor;
+    auto* cpu_ctx = new paddle::platform::CPUDeviceContext();
-  int* src_ptr = src_tensor.mutable_data<int>(make_ddim({3, 3}), CPUPlace());
+    dst_tensor.CopyFrom<int>(src_tensor, *cpu_ctx);
-  int arr[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9};
-  memcpy(src_ptr, arr, 9 * sizeof(int));
+    const int* dst_ptr = dst_tensor.data<int>();
-  Tensor dst_tensor;
+    ASSERT_NE(src_ptr, dst_ptr);
-  dst_tensor.CopyFrom<int>(src_tensor, CPUPlace());
+    for (size_t i = 0; i < 9; ++i) {
-  const int* dst_ptr = dst_tensor.data<int>();
+      EXPECT_EQ(src_ptr[i], dst_ptr[i]);
-  ASSERT_NE(src_ptr, dst_ptr);
+    }
-  for (size_t i = 0; i < 9; ++i) {
-    EXPECT_EQ(src_ptr[i], dst_ptr[i]);
+    Tensor slice_tensor = src_tensor.Slice<int>(1, 2);
+    dst_tensor.CopyFrom<int>(slice_tensor, *cpu_ctx);
+    const int* slice_ptr = slice_tensor.data<int>();
+    dst_ptr = dst_tensor.data<int>();
+    ASSERT_NE(dst_ptr, slice_ptr);
+    for (size_t i = 0; i < 3; ++i) {
+      EXPECT_EQ(dst_ptr[i], slice_ptr[i]);
+    }
  }
+#ifndef PADDLE_ONLY_CPU
+  {
+    Tensor src_tensor;
+    Tensor gpu_tensor;
+    Tensor dst_tensor;
+    int* src_ptr = src_tensor.mutable_data<int>(make_ddim({3, 3}), CPUPlace());
+    int arr[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9};
+    memcpy(src_ptr, arr, 9 * sizeof(int));
+    // CPU Tensor to GPU Tensor
+    auto gpu_ctx = new paddle::platform::CUDADeviceContext(0);
+    gpu_tensor.CopyFrom<int>(src_tensor, *gpu_ctx);
+    // GPU Tensor to CPU Tensor
+    auto cpu_ctx = new paddle::platform::CPUDeviceContext();
+    dst_tensor.CopyFrom<int>(gpu_tensor, *cpu_ctx);
+    // Compare Tensors
+    const int* dst_ptr = dst_tensor.data<int>();
+    ASSERT_NE(src_ptr, dst_ptr);
+    for (size_t i = 0; i < 9; ++i) {
+      EXPECT_EQ(src_ptr[i], dst_ptr[i]);
+    }
+    Tensor slice_tensor = src_tensor.Slice<int>(1, 2);
+    // CPU Slice Tensor to GPU Tensor
+    gpu_tensor.CopyFrom<int>(slice_tensor, *gpu_ctx);
-  Tensor slice_tensor = src_tensor.Slice<int>(1, 2);
+    // GPU Tensor to CPU Tensor
-  dst_tensor.CopyFrom<int>(slice_tensor, CPUPlace());
+    dst_tensor.CopyFrom<int>(gpu_tensor, *cpu_ctx);
-  const int* slice_ptr = slice_tensor.data<int>();
-  dst_ptr = dst_tensor.data<int>();
+    // Compare Slice Tensors
-  ASSERT_NE(dst_ptr, slice_ptr);
+    const int* slice_ptr = slice_tensor.data<int>();
-  for (size_t i = 0; i < 3; ++i) {
+    dst_ptr = dst_tensor.data<int>();
-    EXPECT_EQ(dst_ptr[i], slice_ptr[i]);
+    ASSERT_NE(dst_ptr, slice_ptr);
+    for (size_t i = 0; i < 3; ++i) {
+      EXPECT_EQ(dst_ptr[i], slice_ptr[i]);
+    }
  }
+#endif
 }
--- a/paddle/memory/memory.h
+++ b/paddle/memory/memory.h
@@ -29,10 +29,10 @@ void Free(Place, void*);
 template <typename Place>
 size_t Used(Place);
-template <typename T, /* must be POD types */
+template <typename T, typename Place>
-          typename Place /* platform::GPUPlace or platform::CPUPlace */,
-          typename std::enable_if<std::is_pod<T>::value>::type* = nullptr>
 class PODDeleter {
+  static_assert(std::is_pod<T>::value, "T must be POD");
 public:
  PODDeleter(Place place) : place_(place) {}
  void operator()(T* ptr) { Free(place_, static_cast<void*>(ptr)); }

--- a/paddle/platform/device_context.h
+++ b/paddle/platform/device_context.h
@@ -87,7 +87,7 @@ class CUDADeviceContext : public DeviceContext {
                   "cudaStreamSynchronize failed");
  }
-  cudaStream_t stream() { return stream_; }
+  cudaStream_t stream() const { return stream_; }
  Eigen::GpuDevice* eigen_device() const { return eigen_device_.get(); }