From de8a8feeb48f981e5d75603eadc491b7eb139064 Mon Sep 17 00:00:00 2001
From: liaogang
Date: Tue, 25 Jul 2017 17:34:59 +0800
Subject: [PATCH] ENH: Refine Tensor and CopyFrom

---
 paddle/framework/CMakeLists.txt      |   2 +-
 paddle/framework/detail/tensor-inl.h | 158 ++++++++++++++++++++++
 paddle/framework/tensor.cc           |   2 +-
 paddle/framework/tensor.h            | 190 +++++++++++++--------------
 paddle/framework/tensor_test.cc      |  49 ++++---
 paddle/platform/device_context.h     |   2 +-
 6 files changed, 277 insertions(+), 126 deletions(-)
 create mode 100644 paddle/framework/detail/tensor-inl.h

diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt
index 760d84e51e..dec4ab80d8 100644
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@@ -3,7 +3,7 @@ cc_library(ddim SRCS ddim.cc DEPS eigen3)
 cc_test(ddim_test SRCS ddim_test.cc DEPS ddim)
 nv_test(dim_test SRCS dim_test.cu DEPS ddim)
 
-cc_library(tensor SRCS tensor.cc DEPS ddim place paddle_memory)
+cc_library(tensor SRCS tensor.cc DEPS ddim place paddle_memory device_context)
 cc_test(tensor_test SRCS tensor_test.cc DEPS tensor)
 cc_test(eigen_test SRCS eigen_test.cc DEPS tensor)

diff --git a/paddle/framework/detail/tensor-inl.h b/paddle/framework/detail/tensor-inl.h
new file mode 100644
index 0000000000..4fc328d231
--- /dev/null
+++ b/paddle/framework/detail/tensor-inl.h
@@ -0,0 +1,158 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/memory/memcpy.h"
+
+namespace paddle {
+namespace framework {
+
+template <typename T>
+inline void Tensor::check_memory_size() const {
+  PADDLE_ENFORCE(holder_ != nullptr,
+                 "Tensor holds no memory. Call Tensor::mutable_data first.");
+  PADDLE_ENFORCE(holder_->size() >= product(dims_) * sizeof(T) + offset_,
+                 "Tensor's dims_ is out of bound. Call Tensor::mutable_data "
+                 "first to re-allocate memory.");
+}
+
+template <typename T>
+inline const T* Tensor::data() const {
+  check_memory_size<T>();
+  return reinterpret_cast<const T*>(
+      reinterpret_cast<uintptr_t>(holder_->ptr()) + offset_);
+}
+
+template <typename T>
+inline T* Tensor::data() {
+  check_memory_size<T>();
+  return reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
+                              offset_);
+}
+
+template <typename T>
+inline T* Tensor::mutable_data(DDim dims, platform::Place place) {
+  Resize(dims);
+  return mutable_data<T>(place);
+}
+
+template <typename T>
+inline T* Tensor::mutable_data(platform::Place place) {
+  PADDLE_ENFORCE(product(dims_) > 0,
+                 "Tensor's numel must be larger than zero to call "
+                 "Tensor::mutable_data. Call Tensor::set_dim first.");
+  /* some versions of boost::variant don't have operator!= */
+  size_t size = product(dims_) * sizeof(T);
+  if (holder_ == nullptr || !(holder_->place() == place) ||
+      holder_->size() < size + offset_) {
+    if (platform::is_cpu_place(place)) {
+      holder_.reset(new PlaceholderImpl<T, platform::CPUPlace>(
+          boost::get<platform::CPUPlace>(place), size));
+    }
+#ifndef PADDLE_ONLY_CPU
+    else if (platform::is_gpu_place(place)) {
+      holder_.reset(new PlaceholderImpl<T, platform::GPUPlace>(
+          boost::get<platform::GPUPlace>(place), size));
+    }
+#endif
+    offset_ = 0;
+  }
+  return reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
+                              offset_);
+}
+
+template <typename T>
+inline void Tensor::ShareDataWith(const Tensor& src) {
+  src.check_memory_size<T>();
+  *this = src;
+}
+
+template <typename T>
+inline void Tensor::CopyFrom(const Tensor& src,
+                             const platform::CPUDeviceContext& ctx) {
+  src.check_memory_size<T>();
+  Resize(src.dims());
+
+  auto src_place = src.holder_->place();
+  auto src_ptr = static_cast<const void*>(src.data<T>());
+
+  auto dst_place = ctx.GetPlace();
+  auto dst_ptr = static_cast<void*>(mutable_data<T>(dst_place));
+
+  auto size = product(src.dims_) * sizeof(T);
+
+  if (platform::is_cpu_place(src_place)) {
+    memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr,
+                 boost::get<platform::CPUPlace>(src_place), src_ptr, size);
+  }
+#ifndef PADDLE_ONLY_CPU
+  else if (platform::is_gpu_place(src_place)) {
+    memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr,
+                 boost::get<platform::GPUPlace>(src_place), src_ptr, size, 0);
+  }
+#endif
+}
+
+#ifndef PADDLE_ONLY_CPU
+template <typename T>
+inline void Tensor::CopyFrom(const Tensor& src,
+                             const platform::CUDADeviceContext& ctx) {
+  src.check_memory_size<T>();
+  Resize(src.dims());
+
+  auto src_place = src.holder_->place();
+  auto src_ptr = static_cast<const void*>(src.data<T>());
+
+  auto dst_place = ctx.GetPlace();
+  auto dst_ptr = static_cast<void*>(mutable_data<T>(dst_place));
+
+  auto size = product(src.dims_) * sizeof(T);
+
+  if (platform::is_cpu_place(src_place)) {
+    memory::Copy(boost::get<platform::GPUPlace>(dst_place), dst_ptr,
+                 boost::get<platform::CPUPlace>(src_place), src_ptr, size,
+                 ctx.stream());
+  } else if (platform::is_gpu_place(src_place)) {
+    memory::Copy(boost::get<platform::GPUPlace>(dst_place), dst_ptr,
+                 boost::get<platform::GPUPlace>(src_place), src_ptr, size,
+                 ctx.stream());
+  }
+}
+#endif
+
+template <typename T>
+inline Tensor Tensor::Slice(const int& begin_idx, const int& end_idx) const {
+  check_memory_size<T>();
+  PADDLE_ENFORCE(begin_idx >= 0, "Slice begin index is less than zero.");
+  PADDLE_ENFORCE(end_idx <= dims_[0], "Slice end index is out of bound.");
+  PADDLE_ENFORCE(begin_idx < end_idx,
+                 "Begin index must be less than end index.");
+  PADDLE_ENFORCE(dims_[0] != 1, "Can not slice a tensor with dims_[0] = 1.");
+  int base = product(dims_) / dims_[0];
+  Tensor dst;
+  dst.holder_ = holder_;
+  DDim dst_dims = dims_;
+  dst_dims[0] = end_idx - begin_idx;
+  dst.Resize(dst_dims);
+  dst.offset_ = offset_ + begin_idx * base * sizeof(T);
+  return dst;
+}
+
+inline void Tensor::Resize(const DDim& dims) { dims_ = dims; }
+
+inline const DDim& Tensor::dims() const { return dims_; }
+
+}  // namespace framework
+}  // namespace paddle
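A usage sketch (not part of the patch): the new file above replaces the old
Place-based CopyFrom with device-context overloads. Assuming this revision's
headers, a CPU-side caller would look roughly like this; the function name
CopyExample is illustrative only.

    #include "paddle/framework/tensor.h"

    void CopyExample() {
      using paddle::framework::Tensor;
      using paddle::framework::make_ddim;

      // mutable_data resizes, then allocates only if the holder is missing,
      // lives on another place, or is too small for dims * sizeof(T).
      Tensor src;
      float* p = src.mutable_data<float>(make_ddim({2, 3}),
                                         paddle::platform::CPUPlace());
      for (int i = 0; i < 6; ++i) p[i] = static_cast<float>(i);

      // CopyFrom now takes a device context instead of a bare Place and
      // resizes the destination to src.dims() internally.
      Tensor dst;
      paddle::platform::CPUDeviceContext cpu_ctx;
      dst.CopyFrom<float>(src, cpu_ctx);
    }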
diff --git a/paddle/framework/tensor.cc b/paddle/framework/tensor.cc
index 964f15ab66..ea7b2a1f7b 100644
--- a/paddle/framework/tensor.cc
+++ b/paddle/framework/tensor.cc
@@ -12,7 +12,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <paddle/framework/tensor.h>
+#include "paddle/framework/tensor.h"
 
 namespace paddle {
 namespace framework {}

diff --git a/paddle/framework/tensor.h b/paddle/framework/tensor.h
index a36f375d2e..d3f56b31cd 100644
--- a/paddle/framework/tensor.h
+++ b/paddle/framework/tensor.h
@@ -20,6 +20,7 @@ limitations under the License. */
 #include <typeindex>
 #include "paddle/framework/ddim.h"
 #include "paddle/memory/memory.h"
+#include "paddle/platform/device_context.h"
 #include "paddle/platform/enforce.h"
 #include "paddle/platform/place.h"
 #include "unsupported/Eigen/CXX11/Tensor"
@@ -31,9 +32,11 @@ template <bool less, size_t i, typename... args>
 struct CastToPyBufferImpl;
 }  // namespace details
 }  // namespace pybind
+
 namespace framework {
 
 class Tensor {
+ public:
   template <bool less, size_t i, typename... args>
   friend struct paddle::pybind::details::CastToPyBufferImpl;
 
@@ -46,106 +49,84 @@ class Tensor {
  public:
   Tensor() : offset_(0) {}
 
+  /*! Return a pointer to mutable memory block. */
   template <typename T>
-  const T* data() const {
-    EnforceSufficientMemory<T>();
-    return reinterpret_cast<const T*>(
-        reinterpret_cast<uintptr_t>(holder_->ptr()) + offset_);
-  }
+  inline T* data();
 
+  /*! Return a pointer to constant memory block. */
   template <typename T>
-  T* data() {
-    EnforceSufficientMemory<T>();
-    return reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
-                                offset_);
-  }
-
-  template <typename T, typename std::enable_if<std::is_pod<T>::value>::type* = nullptr>
-  T* mutable_data(DDim dims, platform::Place place) {
-    Resize(dims);
-    return mutable_data<T>(place);
-  }
-
-  template <typename T, typename std::enable_if<std::is_pod<T>::value>::type* = nullptr>
-  T* mutable_data(platform::Place place) {
-    PADDLE_ENFORCE(product(dims_) > 0,
-                   "Tensor's numel must be larger than zero to call "
-                   "Tensor::mutable_data. Call Tensor::set_dim first.");
-    if (holder_ == nullptr ||
-        !(holder_->place() ==
-          place) /* some versions of boost::variant don't have operator!= */
-        || holder_->size() < product(dims_) * sizeof(T) + offset_) {
-      if (platform::is_cpu_place(place)) {
-        holder_.reset(new PlaceholderImpl<T, platform::CPUPlace>(
-            boost::get<platform::CPUPlace>(place), product(dims_) * sizeof(T)));
-      } else if (platform::is_gpu_place(place)) {
-#ifdef PADDLE_ONLY_CPU
-        PADDLE_THROW("'GPUPlace' is not supported in CPU only device.");
-#else
-        holder_.reset(new PlaceholderImpl<T, platform::GPUPlace>(
-            boost::get<platform::GPUPlace>(place), product(dims_) * sizeof(T)));
-#endif
-      } else {
-        PADDLE_THROW("Unknown 'place'.");
-      }
-      offset_ = 0;
-    }
-    return reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
-                                offset_);
-  }
+  inline const T* data() const;
+
+  /**
+   * @brief   Return a pointer to mutable memory block.
+   * @note    If the block does not exist yet, it is allocated.
+   */
+  template <typename T>
+  inline T* mutable_data(platform::Place place);
+
+  /**
+   * @brief     Return a pointer to mutable memory block.
+   *
+   * @param[in] dims    The dimensions of the memory block.
+   * @param[in] place   The place of the memory block.
+   *
+   * @note      If the block does not exist yet, it is allocated.
+   */
+  template <typename T>
+  inline T* mutable_data(DDim dims, platform::Place place);
 
+  /*! Return the dimensions of the memory block. */
+  inline const DDim& dims() const;
+
+  /*! Resize the dimensions of the memory block. */
+  inline void Resize(const DDim& dims);
+
+  /*! The internals of the two tensors share the same memory block. */
+  template <typename T>
+  inline void ShareDataWith(const Tensor& src);
+
+  /**
+   * @brief   Copy the content of an external tensor to a new place.
+   *
+   * @param[in] src   The external tensor.
+   * @param[in] ctx   The device context that contains the place where
+   *                  the copy is stored.
+   *
+   * @note    CopyFrom supports CPU <-> CPU, CPU <-> GPU, GPU <-> GPU.
+   */
   template <typename T>
-  void ShareDataWith(const Tensor& src) {
-    src.EnforceSufficientMemory<T>();
-    *this = src;
-  }
+  inline void CopyFrom(const Tensor& src,
+                       const platform::CPUDeviceContext& ctx);
 
+#ifndef PADDLE_ONLY_CPU
   template <typename T>
-  void CopyFrom(const Tensor& src, platform::Place dst_place) {
-    PADDLE_ENFORCE(platform::is_cpu_place(src.holder_->place()) &&
-                       platform::is_cpu_place(dst_place),
-                   "Tensor::CopyFrom only support CPU now.");
-    src.EnforceSufficientMemory<T>();
-    size_t size = product(src.dims_) * sizeof(T);
-    Resize(src.dims());
-    const void* src_ptr = static_cast<const void*>(src.data<T>());
-    void* dst_ptr = static_cast<void*>(mutable_data<T>(dst_place));
-    memcpy(dst_ptr, src_ptr, size);
-  }
+  inline void CopyFrom(const Tensor& src,
+                       const platform::CUDADeviceContext& ctx);
+#endif
 
+  /**
+   * @brief   Return a slice of the tensor.
+   *
+   * @param[in] begin_idx   The begin index of the slice.
+   * @param[in] end_idx     The end index of the slice.
+   */
   template <typename T>
-  Tensor Slice(const int& begin_idx, const int& end_idx) const {
-    EnforceSufficientMemory<T>();
-    PADDLE_ENFORCE(begin_idx >= 0, "Slice begin index is less than zero.");
-    PADDLE_ENFORCE(end_idx <= dims_[0], "Slice end index is out of bound.");
-    PADDLE_ENFORCE(begin_idx < end_idx,
-                   "Begin index must be less than end index.");
-    PADDLE_ENFORCE(dims_[0] != 1, "Can not slice a tensor with dims_[0] = 1.");
-    int base = product(dims_) / dims_[0];
-    Tensor dst;
-    dst.holder_ = holder_;
-    DDim dst_dims = dims_;
-    dst_dims[0] = end_idx - begin_idx;
-    dst.Resize(dst_dims);
-    dst.offset_ = offset_ + begin_idx * base * sizeof(T);
-    return dst;
-  }
-
-  void Resize(const DDim& dims) { dims_ = dims; }
-
-  const DDim& dims() const { return dims_; }
+  inline Tensor Slice(const int& begin_idx, const int& end_idx) const;
 
  private:
-  // Placeholder hides type T, so it doesn't appear as a template
-  // parameter of Variable.
+  template <typename T>
+  inline void check_memory_size() const;
+
+ private:
+  /**
+   * @note    Placeholder hides type T, so it doesn't appear as a template
+   *          parameter of Variable.
+   */
   struct Placeholder {
     virtual ~Placeholder() {}
     virtual void* ptr() const = 0;
-    virtual platform::Place place() const = 0;
     virtual size_t size() const = 0;
     virtual std::type_index type() const = 0;
+    virtual platform::Place place() const = 0;
   };
 
   template <typename T, typename PlaceType>
@@ -156,33 +137,38 @@ class Tensor {
           place_(place),
           size_(size) {}
 
-    virtual void* ptr() const { return static_cast<void*>(ptr_.get()); }
     virtual size_t size() const { return size_; }
-    virtual paddle::platform::Place place() const { return place_; }
+    virtual platform::Place place() const { return place_; }
+    virtual void* ptr() const { return static_cast<void*>(ptr_.get()); }
     virtual std::type_index type() const { return std::type_index(typeid(T)); }
 
+    /*! the pointer of memory block. */
    std::unique_ptr<T, memory::PODDeleter<T, PlaceType>> ptr_;
-    platform::Place place_;  // record the place of ptr_.
-    size_t size_;            // size of the memory block.
+
+    /*! the place of memory block. */
+    platform::Place place_;
+
+    /*! the size of memory block. */
+    size_t size_;
   };
 
-  template <typename T>
-  inline void EnforceSufficientMemory() const {
-    PADDLE_ENFORCE(holder_ != nullptr,
-                   "Tenosr holds no memory. Call Tensor::mutable_data first.");
-    PADDLE_ENFORCE(holder_->size() >= product(dims_) * sizeof(T) + offset_,
-                   "Tensor's dims_ is out of bound. Call Tensor::mutable_data "
-                   "first to re-allocate memory.");
-  }
-
-  std::shared_ptr<Placeholder> holder_;  // holds the memory block if allocated.
+  /*! holds the memory block if allocated. */
+  std::shared_ptr<Placeholder> holder_;
+
+  /*! the dimensions of the memory block. */
   DDim dims_;
-  // A PlaceHolder may be shared by more than one tensor. Some of them may be
-  // slices of the others. So the offset_ is introduced here to indicate the
-  // byte offset between PlaceHolder::ptr_ and where tensor's data really
-  // begins.
+
+  /**
+   * @brief   A PlaceHolder may be shared by more than one tensor.
+   *
+   * @note    Some of them may be slices of the others. So the offset_
+   *          is introduced here to indicate the byte offset between
+   *          PlaceHolder::ptr_ and where the tensor data really begins.
+   */
   size_t offset_;
 };
 
 }  // namespace framework
 }  // namespace paddle
+
+#include "paddle/framework/detail/tensor-inl.h"
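The Placeholder/PlaceholderImpl pair kept in tensor.h above is a type-erasure
idiom: the holder interface hides the element type T so that Tensor itself is
not a class template. A minimal standalone sketch of the same idiom
(illustrative names, not code from this patch):

    #include <cstddef>
    #include <memory>
    #include <typeindex>
    #include <typeinfo>

    // Non-template owner: the element type is erased behind an interface.
    struct Holder {
      virtual ~Holder() = default;
      virtual void* ptr() const = 0;
      virtual size_t size() const = 0;
      virtual std::type_index type() const = 0;
    };

    template <typename T>
    struct HolderImpl : Holder {
      explicit HolderImpl(size_t n) : data_(new T[n]), size_(n * sizeof(T)) {}
      void* ptr() const override { return data_.get(); }
      size_t size() const override { return size_; }
      std::type_index type() const override { return typeid(T); }

     private:
      std::unique_ptr<T[]> data_;  // Paddle instead uses memory::Alloc + PODDeleter
      size_t size_;
    };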
diff --git a/paddle/framework/tensor_test.cc b/paddle/framework/tensor_test.cc
index 089844dc01..7987d335ac 100644
--- a/paddle/framework/tensor_test.cc
+++ b/paddle/framework/tensor_test.cc
@@ -72,7 +72,8 @@ TEST(Tensor, MutableData) {
     p2 = src_tensor.mutable_data<float>(make_ddim({2, 2}), CPUPlace());
     EXPECT_EQ(p1, p2);
   }
-#ifdef __CUDACC__
+
+#ifndef PADDLE_ONLY_CPU
   {
     Tensor src_tensor;
    float* p1 = nullptr;
@@ -123,7 +124,7 @@ TEST(Tensor, ShareDataWith) {
     ASSERT_EQ(src_tensor.data<int>(), dst_tensor.data<int>());
   }
 
-#ifdef __CUDACC__
+#ifndef PADDLE_ONLY_CPU
   {
     Tensor src_tensor;
     Tensor dst_tensor;
@@ -160,7 +161,7 @@ TEST(Tensor, Slice) {
     EXPECT_EQ(src_data_address + 3 * 4 * 1 * sizeof(int), slice_data_address);
   }
 
-#ifdef __CUDACC__
+#ifndef PADDLE_ONLY_CPU
   {
     Tensor src_tensor;
     src_tensor.mutable_data<double>(make_ddim({6, 9}), GPUPlace());
@@ -188,25 +189,31 @@ TEST(Tensor, Slice) {
 TEST(Tensor, CopyFrom) {
   using namespace paddle::framework;
   using namespace paddle::platform;
+  {
+    Tensor src_tensor;
+    Tensor dst_tensor;
 
-  Tensor src_tensor;
-  int* src_ptr = src_tensor.mutable_data<int>(make_ddim({3, 3}), CPUPlace());
-  int arr[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9};
-  memcpy(src_ptr, arr, 9 * sizeof(int));
-  Tensor dst_tensor;
-  dst_tensor.CopyFrom<int>(src_tensor, CPUPlace());
-  const int* dst_ptr = dst_tensor.data<int>();
-  ASSERT_NE(src_ptr, dst_ptr);
-  for (size_t i = 0; i < 9; ++i) {
-    EXPECT_EQ(src_ptr[i], dst_ptr[i]);
-  }
+    int* src_ptr = src_tensor.mutable_data<int>(make_ddim({3, 3}), CPUPlace());
 
-  Tensor slice_tensor = src_tensor.Slice<int>(1, 2);
-  dst_tensor.CopyFrom<int>(slice_tensor, CPUPlace());
-  const int* slice_ptr = slice_tensor.data<int>();
-  dst_ptr = dst_tensor.data<int>();
-  ASSERT_NE(dst_ptr, slice_ptr);
-  for (size_t i = 0; i < 3; ++i) {
-    EXPECT_EQ(dst_ptr[i], slice_ptr[i]);
+    int arr[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9};
+    memcpy(src_ptr, arr, 9 * sizeof(int));
+
+    auto cpu_ctx = paddle::platform::CPUDeviceContext();
+    dst_tensor.CopyFrom<int>(src_tensor, cpu_ctx);
+
+    const int* dst_ptr = dst_tensor.data<int>();
+    ASSERT_NE(src_ptr, dst_ptr);
+    for (size_t i = 0; i < 9; ++i) {
+      EXPECT_EQ(src_ptr[i], dst_ptr[i]);
+    }
+
+    Tensor slice_tensor = src_tensor.Slice<int>(1, 2);
+    dst_tensor.CopyFrom<int>(slice_tensor, cpu_ctx);
+    const int* slice_ptr = slice_tensor.data<int>();
+    dst_ptr = dst_tensor.data<int>();
+    ASSERT_NE(dst_ptr, slice_ptr);
+    for (size_t i = 0; i < 3; ++i) {
+      EXPECT_EQ(dst_ptr[i], slice_ptr[i]);
+    }
   }
 }
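The rewritten CopyFrom test only exercises the CPU overload. A GPU round trip
through the new overloads would look roughly like the sketch below. This is
illustrative only and not part of the patch: the CUDADeviceContext constructor
signature is assumed, and an explicit device synchronization would be needed
before comparing results, since the CUDA overload copies asynchronously.

    #ifndef PADDLE_ONLY_CPU
    {
      Tensor cpu_tensor;
      int* cpu_ptr = cpu_tensor.mutable_data<int>(make_ddim({3, 3}), CPUPlace());
      for (int i = 0; i < 9; ++i) cpu_ptr[i] = i;

      // CPU -> GPU: the CUDA overload enqueues the copy on ctx.stream().
      Tensor gpu_tensor;
      CUDADeviceContext gpu_ctx(0);  // assumed constructor taking a device id
      gpu_tensor.CopyFrom<int>(cpu_tensor, gpu_ctx);

      // GPU -> CPU: the CPU overload copies back on the default stream.
      Tensor check_tensor;
      auto cpu_ctx = CPUDeviceContext();
      check_tensor.CopyFrom<int>(gpu_tensor, cpu_ctx);
      // Synchronize the device, then EXPECT_EQ over check_tensor.data<int>().
    }
    #endif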
diff --git a/paddle/platform/device_context.h b/paddle/platform/device_context.h
index fe6f13e399..5a366dccdc 100644
--- a/paddle/platform/device_context.h
+++ b/paddle/platform/device_context.h
@@ -87,7 +87,7 @@ class CUDADeviceContext : public DeviceContext {
                    "cudaStreamSynchronize failed");
   }
 
-  cudaStream_t stream() { return stream_; }
+  cudaStream_t stream() const { return stream_; }
 
   Eigen::GpuDevice* eigen_device() const { return eigen_device_.get(); }
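This one-line change is what allows the new CopyFrom overloads to compile:
CopyFrom receives the context by const reference, and a const object can only
call const-qualified members. A minimal sketch of the constraint (function
name illustrative, guarded like the other GPU-only code in this patch):

    #ifndef PADDLE_ONLY_CPU
    #include "paddle/platform/device_context.h"

    // A caller holding a const reference can only invoke const members,
    // so stream() must be const-qualified for this to compile.
    void EnqueueOn(const paddle::platform::CUDADeviceContext& ctx) {
      cudaStream_t s = ctx.stream();  // ill-formed before this patch
      (void)s;  // e.g. pass s to an asynchronous memory::Copy
    }
    #endif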
--
GitLab