Merge pull request #13525 from reyoung/fix_mixed_vector

Fix mixed vector

Merge pull request #13525 from reyoung/fix_mixed_vector
Fix mixed vector
0be1582d · Yu Yang · GitHub · 4e81e228 · e1913bc5 · 0be1582d
11 changed files
--- a/paddle/fluid/framework/details/cow_ptr.h
+++ b/paddle/fluid/framework/details/cow_ptr.h
@@ -20,79 +20,37 @@ namespace paddle {
 namespace framework {
 namespace details {
-// Change it to thread safe flags if needed.
+template <class T>
-class ThreadUnsafeOwnershipFlags {
+class COWPtr {
 public:
-  explicit ThreadUnsafeOwnershipFlags(bool flag) : flag_(flag) {}
+  typedef std::shared_ptr<T> RefPtr;
-  ThreadUnsafeOwnershipFlags(const ThreadUnsafeOwnershipFlags& other) = delete;
-  ThreadUnsafeOwnershipFlags& operator=(
-      const ThreadUnsafeOwnershipFlags& other) = delete;
-  ThreadUnsafeOwnershipFlags(ThreadUnsafeOwnershipFlags&& other) = default;
-  void SetOwnership(bool flag) { flag_ = flag; }
-  // Invoke the callback if it is not owned.
-  template <typename Callback>
-  void AcquireOwnershipOnce(Callback acquire) {
-    if (!flag_) {
-      acquire();
-      flag_ = true;
-    }
-  }
 private:
-  bool flag_;
+  RefPtr m_sp;
-};
-// Copy-On-Write pointer.
-// It will hold a T* pointer, and only copy once when `MutableData` is invoked.
-//
-// The template parameter OwnershipFlags should have:
-//   * a constructor takes a bool. True if own.
-//   * SetOwnership(bool flag).
-//   * AcquireOwnershipOnce(Callback). It will invoke the callback if it is not
-//     owned.
-//
-// https://en.wikipedia.org/wiki/Copy-on-write
-template <typename T, typename OwnershipFlags = ThreadUnsafeOwnershipFlags>
-class COWPtr {
 public:
-  // Ctor from raw pointer.
+  COWPtr() : m_sp(nullptr) {}
-  explicit COWPtr(T* ptr) : payload_(ptr), ownership_{true} {}
+  explicit COWPtr(T* t) : m_sp(t) {}
-  // Move methods. Steal ownership from origin
+  const T& Data() const { return *m_sp; }
-  COWPtr(COWPtr&& other)
-      : payload_(other.payload_), ownership_{std::move(other.ownership_)} {}
-  COWPtr& operator=(COWPtr&& origin) = default;
-  // Copy methods. Not own payload
-  COWPtr(const COWPtr& other) : payload_(other.payload_), ownership_{false} {}
-  COWPtr& operator=(const COWPtr& other) {
-    payload_ = other.payload_;
-    ownership_.SetOwnership(false);
-    return *this;
-  }
-  // Access read only data.
-  const T& Data() const { return *payload_; }
-  // Access mutable data. If the data is not owned, the data will be copied
-  // before.
  T* MutableData() {
-    ownership_.AcquireOwnershipOnce(
+    DetachIfNotUnique();
-        [this] { payload_.reset(new T(*payload_)); });
+    return m_sp.get();
-    return payload_.get();
  }
- private:
+  void DetachIfNotUnique() {
-  // Actual data pointer.
+    T* tmp = m_sp.get();
-  std::shared_ptr<T> payload_;
+    if (!(tmp == nullptr || m_sp.unique())) {
+      Detach();
+    }
+  }
-  // Ownership flag.
+  void Detach() {
-  OwnershipFlags ownership_;
+    T* tmp = m_sp.get();
+    m_sp = RefPtr(new T(*tmp));
+  }
 };
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/details/cow_ptr_test.cc
+++ b/paddle/fluid/framework/details/cow_ptr_test.cc
@@ -30,6 +30,14 @@ TEST(COWPtr, all) {
  ASSERT_EQ(ptr2.Data(), 10);
 }
+TEST(COWPtr, change_old) {
+  COWPtr<int> ptr(new int{0});
+  COWPtr<int> ptr2 = ptr;
+  *ptr.MutableData() = 10;
+  ASSERT_EQ(ptr2.Data(), 0);
+  ASSERT_EQ(ptr.Data(), 10);
+}
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/mixed_vector.h
+++ b/paddle/fluid/framework/mixed_vector.h
@@ -17,10 +17,13 @@
 #include <algorithm>
 #include <initializer_list>
 #include <memory>
+#include <mutex>  // NOLINT
+#include <utility>
 #include <vector>
+#include "paddle/fluid/framework/details/cow_ptr.h"
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/framework/tensor_util.h"
+#include "paddle/fluid/memory/memcpy.h"
 #include "glog/logging.h"
@@ -28,173 +31,167 @@ namespace paddle {
 namespace framework {
 #if defined(PADDLE_WITH_CUDA)
-// Vector<T> implements the std::vector interface, and can get Data or
+namespace details {
-// MutableData from any place. The data will be synced implicitly inside.
+struct CUDABuffer {
-template <typename T>
+  void *data_{nullptr};
-class Vector {
+  size_t size_{0};
- public:
+  platform::CUDAPlace place_;
-  using value_type = T;
-  // Default ctor. Create empty Vector
+  CUDABuffer() {}
-  Vector() { InitEmpty(); }
+  CUDABuffer(platform::Place place, size_t size)
+      : size_(size), place_(boost::get<platform::CUDAPlace>(place)) {
-  // Fill vector with value. The vector size is `count`.
+    data_ = memory::Alloc(place_, size);
-  explicit Vector(size_t count, const T &value = T()) {
-    InitEmpty();
-    if (count != 0) {
-      resize(count);
-      T *ptr = begin();
-      for (size_t i = 0; i < count; ++i) {
-        ptr[i] = value;
-      }
-    }
  }
-  // Ctor with init_list
+  ~CUDABuffer() { ClearMemory(); }
-  Vector(std::initializer_list<T> init) {
-    if (init.size() == 0) {
+  CUDABuffer(const CUDABuffer &o) = delete;
-      InitEmpty();
+  CUDABuffer &operator=(const CUDABuffer &o) = delete;
-    } else {
-      InitByIter(init.size(), init.begin(), init.end());
+  void Resize(platform::Place place, size_t size) {
+    ClearMemory();
+    place_ = boost::get<platform::CUDAPlace>(place);
+    data_ = memory::Alloc(place_, size);
+    PADDLE_ENFORCE_NOT_NULL(data_);
+    size_ = size;
  }
+  void Swap(CUDABuffer &o) {
+    std::swap(data_, o.data_);
+    std::swap(place_, o.place_);
+    std::swap(size_, o.size_);
  }
-  // implicit cast from std::vector.
+ private:
-  template <typename U>
+  void ClearMemory() const {
-  Vector(const std::vector<U> &dat) {  // NOLINT
+    if (data_ != nullptr) {
-    if (dat.size() == 0) {
+      memory::Free(place_, data_);
-      InitEmpty();
-    } else {
-      InitByIter(dat.size(), dat.begin(), dat.end());
    }
  }
+};
+}  // namespace details
-  // Copy ctor
+// Vector<T> implements the std::vector interface, and can get Data or
-  Vector(const Vector<T> &other) { this->operator=(other); }
+// MutableData from any place. The data will be synced implicitly inside.
+template <typename T>
+class Vector {
+ public:
+  using value_type = T;
+  using iterator = typename std::vector<T>::iterator;
+  using const_iterator = typename std::vector<T>::const_iterator;
-  // Copy operator
+ private:
-  Vector<T> &operator=(const Vector<T> &other) {
+  // The actual class to implement vector logic
-    if (other.size() != 0) {
+  class VectorData {
-      this->InitByIter(other.size(), other.begin(), other.end());
+   public:
-    } else {
+    VectorData() : flag_(kDataInCPU) {}
-      InitEmpty();
+    VectorData(size_t count, const T &value)
-    }
+        : cpu_(count, value), flag_(kDataInCPU) {}
-    return *this;
+    VectorData(std::initializer_list<T> init) : cpu_(init), flag_(kDataInCPU) {}
-  }
+    template <typename U>
+    explicit VectorData(const std::vector<U> &dat)
+        : cpu_(dat), flag_(kDataInCPU) {}
+    ~VectorData() {}
-  // Move ctor
+    VectorData(const VectorData &o) {
-  Vector(Vector<T> &&other) {
+      o.ImmutableCPU();
-    this->size_ = other.size_;
+      cpu_ = o.cpu_;
-    this->flag_ = other.flag_;
+      flag_ = kDataInCPU;
-    if (other.cuda_vec_.memory_size()) {
-      this->cuda_vec_.ShareDataWith(other.cuda_vec_);
-    }
-    if (other.cpu_vec_.memory_size()) {
-      this->cpu_vec_.ShareDataWith(other.cpu_vec_);
    }
+    VectorData &operator=(const VectorData &o) {
+      o.ImmutableCPU();
+      cpu_ = o.cpu_;
+      flag_ = kDataInCPU;
+      details::CUDABuffer null;
+      gpu_.Swap(null);
+      return *this;
    }
-  // CPU data access method. Mutable.
    T &operator[](size_t i) {
      MutableCPU();
-    return const_cast<T *>(cpu_vec_.data<T>())[i];
+      return cpu_[i];
    }
-  // CPU data access method. Immutable.
    const T &operator[](size_t i) const {
      ImmutableCPU();
-    return cpu_vec_.data<T>()[i];
+      return cpu_[i];
    }
-  // std::vector iterator methods. Based on CPU data access method
+    size_t size() const { return cpu_.size(); }
-  size_t size() const { return size_; }
-  T *begin() { return capacity() == 0 ? &EmptyDummy() : &this->operator[](0); }
+    iterator begin() {
+      MutableCPU();
+      return cpu_.begin();
+    }
-  T *end() {
+    iterator end() {
-    return capacity() == 0 ? &EmptyDummy() : &this->operator[](size());
+      MutableCPU();
+      return cpu_.end();
    }
-  T &front() { return *begin(); }
+    T &front() {
+      MutableCPU();
+      return cpu_.front();
+    }
    T &back() {
-    auto it = end();
+      MutableCPU();
-    --it;
+      return cpu_.back();
-    return *it;
    }
-  const T *begin() const {
+    const_iterator begin() const {
-    return capacity() == 0 ? &EmptyDummy() : &this->operator[](0);
+      ImmutableCPU();
+      return cpu_.begin();
    }
-  const T *end() const {
+    const_iterator end() const {
-    return capacity() == 0 ? &EmptyDummy() : &this->operator[](size());
+      ImmutableCPU();
+      return cpu_.end();
    }
-  const T *cbegin() const { return begin(); }
-  const T *cend() const { return end(); }
    const T &back() const {
-    auto it = end();
+      ImmutableCPU();
-    --it;
+      return cpu_.back();
-    return *it;
    }
-  T *data() { return begin(); }
+    T *data() { return &(*this)[0]; }
-  const T *data() const { return begin(); }
+    const T *data() const { return &(*this)[0]; }
-  const T &front() const { return *begin(); }
+    const T &front() const {
-  // end of std::vector iterator methods
+      ImmutableCPU();
+      return cpu_.front();
+    }
    // assign this from iterator.
    // NOTE: the iterator must support `end-begin`
    template <typename Iter>
    void assign(Iter begin, Iter end) {
-    InitByIter(end - begin, begin, end);
+      MutableCPU();
+      cpu_.assign(begin, end);
    }
    // push_back. If the previous capacity is not enough, the memory will
    // double.
    void push_back(T elem) {
-    if (size_ + 1 > capacity()) {
+      MutableCPU();
-      reserve((size_ + 1) << 1);
+      cpu_.push_back(elem);
-    }
-    *end() = elem;
-    ++size_;
    }
    // extend a vector by iterator.
    // NOTE: the iterator must support end-begin
    template <typename It>
    void Extend(It begin, It end) {
-    size_t pre_size = size_;
+      MutableCPU();
-    resize(pre_size + (end - begin));
+      auto out_it = std::back_inserter<std::vector<T>>(this->cpu_);
-    T *ptr = this->begin() + pre_size;
+      std::copy(begin, end, out_it);
-    for (; begin < end; ++begin, ++ptr) {
-      *ptr = *begin;
-    }
    }
    // resize the vector
    void resize(size_t size) {
-    if (size + 1 <= capacity()) {
-      size_ = size;
-    } else {
      MutableCPU();
-      Tensor cpu_tensor;
+      cpu_.resize(size);
-      platform::Place cpu = platform::CPUPlace();
-      T *ptr = cpu_tensor.mutable_data<T>(
-          framework::make_ddim({static_cast<int64_t>(size)}), cpu);
-      const T *old_ptr =
-          cpu_vec_.memory_size() == 0 ? nullptr : cpu_vec_.data<T>();
-      if (old_ptr != nullptr) {
-        std::copy(old_ptr, old_ptr + size_, ptr);
-      }
-      size_ = size;
-      cpu_vec_.ShareDataWith(cpu_tensor);
-    }
    }
    // get cuda ptr. immutable
@@ -202,7 +199,7 @@ class Vector {
      PADDLE_ENFORCE(platform::is_gpu_place(place),
                     "CUDA Data must on CUDA place");
      ImmutableCUDA(place);
-    return cuda_vec_.data<T>();
+      return reinterpret_cast<T *>(gpu_.data_);
    }
    // get cuda ptr. mutable
@@ -214,77 +211,39 @@ class Vector {
    // clear
    void clear() {
-    size_ = 0;
+      cpu_.clear();
      flag_ = kDirty | kDataInCPU;
    }
-  size_t capacity() const {
+    size_t capacity() const { return cpu_.capacity(); }
-    return cpu_vec_.memory_size() / SizeOfType(typeid(T));
-  }
    // reserve data
-  void reserve(size_t size) {
+    void reserve(size_t size) const { cpu_.reserve(size); }
-    size_t pre_size = size_;
-    resize(size);
-    resize(pre_size);
-  }
-  // the unify method to access CPU or CUDA data. immutable.
-  const T *Data(platform::Place place) const {
-    if (platform::is_gpu_place(place)) {
-      return CUDAData(place);
-    } else {
-      return data();
-    }
-  }
-  // the unify method to access CPU or CUDA data. mutable.
-  T *MutableData(platform::Place place) {
-    if (platform::is_gpu_place(place)) {
-      return CUDAMutableData(place);
-    } else {
-      return data();
-    }
-  }
    // implicit cast operator. Vector can be cast to std::vector implicitly.
    operator std::vector<T>() const {
-    std::vector<T> result;
+      ImmutableCPU();
-    result.resize(size());
+      return cpu_;
-    std::copy(begin(), end(), result.begin());
-    return result;
    }
-  bool operator==(const Vector<T> &other) const {
+    bool operator==(const VectorData &other) const {
-    if (size() != other.size()) return false;
+      ImmutableCPU();
-    auto it1 = cbegin();
+      other.ImmutableCPU();
-    auto it2 = other.cbegin();
+      return cpu_ == other.cpu_;
-    for (; it1 < cend(); ++it1, ++it2) {
-      if (*it1 != *it2) {
-        return false;
-      }
-    }
-    return true;
    }
- private:
+    std::mutex &Mutex() const { return mtx_; }
-  void InitEmpty() {
-    size_ = 0;
-    flag_ = kDataInCPU;
-  }
-  template <typename Iter>
+    std::unique_ptr<platform::CUDAPlace> CUDAPlace() const {
-  void InitByIter(size_t size, Iter begin, Iter end) {
+      if (gpu_.data_ == nullptr) {
-    platform::Place cpu = platform::CPUPlace();
+        return nullptr;
-    T *ptr = this->cpu_vec_.template mutable_data<T>(
+      } else {
-        framework::make_ddim({static_cast<int64_t>(size)}), cpu);
+        return std::unique_ptr<platform::CUDAPlace>(
-    for (size_t i = 0; i < size; ++i) {
+            new platform::CUDAPlace(gpu_.place_));
-      *ptr++ = *begin++;
+      }
-    }
-    flag_ = kDataInCPU | kDirty;
-    size_ = size;
    }
+   private:
    enum DataFlag {
      kDataInCPU = 0x01,
      kDataInCUDA = 0x02,
@@ -294,8 +253,15 @@ class Vector {
    void CopyToCPU() const {
      // COPY GPU Data To CPU
-    TensorCopy(cuda_vec_, platform::CPUPlace(), &cpu_vec_);
+      auto *dev_ctx = static_cast<platform::CUDADeviceContext *>(
-    WaitPlace(cuda_vec_.place());
+          platform::DeviceContextPool::Instance().Get(
+              platform::Place(gpu_.place_)));
+      auto stream = dev_ctx->stream();
+      void *src = gpu_.data_;
+      void *dst = cpu_.data();
+      memory::Copy(platform::CPUPlace(), dst, gpu_.place_, src, gpu_.size_,
+                   stream);
+      dev_ctx->Wait();
    }
    void MutableCPU() {
@@ -308,16 +274,12 @@ class Vector {
    void ImmutableCUDA(platform::Place place) const {
      if (IsDirty()) {
        if (IsInCPU()) {
-        TensorCopy(cpu_vec_, boost::get<platform::CUDAPlace>(place),
+          CopyCPUDataToCUDA(place);
-                   &cuda_vec_);
-        WaitPlace(place);
          UnsetFlag(kDirty);
          SetFlag(kDataInCUDA);
-      } else if (IsInCUDA() && !(place == cuda_vec_.place())) {
+        } else if (IsInCUDA() &&
-        framework::Tensor tmp;
+                   !(boost::get<platform::CUDAPlace>(place) == gpu_.place_)) {
-        TensorCopy(cuda_vec_, boost::get<platform::CUDAPlace>(place), &tmp);
+          PADDLE_THROW("This situation should not happen");
-        WaitPlace(cuda_vec_.place());
-        cuda_vec_.ShareDataWith(tmp);
          // Still dirty
        } else {
          // Dirty && DataInCUDA && Device is same
@@ -326,17 +288,10 @@ class Vector {
      } else {
        if (!IsInCUDA()) {
          // Even data is not dirty. However, data is not in CUDA. Copy data.
-        TensorCopy(cpu_vec_, boost::get<platform::CUDAPlace>(place),
+          CopyCPUDataToCUDA(place);
-                   &cuda_vec_);
-        WaitPlace(place);
          SetFlag(kDataInCUDA);
-      } else if (!(place == cuda_vec_.place())) {
+        } else if (!(boost::get<platform::CUDAPlace>(place) == gpu_.place_)) {
-        framework::Tensor tmp;
+          PADDLE_THROW("This situation should not happen.");
-        WaitPlace(cuda_vec_.place());
-        TensorCopy(cuda_vec_, boost::get<platform::CUDAPlace>(place), &tmp);
-        WaitPlace(cuda_vec_.place());
-        WaitPlace(place);
-        cuda_vec_.ShareDataWith(tmp);
        } else {
          // Not Dirty && DataInCUDA && Device is same
          // Do nothing.
@@ -344,9 +299,20 @@ class Vector {
      }
    }
+    void CopyCPUDataToCUDA(const platform::Place &place) const {
+      void *src = cpu_.data();
+      gpu_.Resize(place, cpu_.size() * sizeof(T));
+      void *dst = gpu_.data_;
+      auto *dev_ctx = static_cast<platform::CUDADeviceContext *>(
+          platform::DeviceContextPool::Instance().Get(place));
+      auto stream = dev_ctx->stream();
+      memory::Copy(gpu_.place_, dst, platform::CPUPlace(), src, gpu_.size_,
+                   stream);
+    }
    void ImmutableCPU() const {
-    if (IsDirty() &&
+      if (IsDirty() && !IsInCPU()) {  // If data has been changed in CUDA, or
-        !IsInCPU()) {  // If data has been changed in CUDA, or CPU has no data.
+                                      // CPU has no data.
        CopyToCPU();
        UnsetFlag(kDirty);
      }
@@ -362,23 +328,178 @@ class Vector {
    bool IsInCPU() const { return flag_ & kDataInCPU; }
-  static void WaitPlace(const platform::Place place) {
+    mutable std::vector<T> cpu_;
+    mutable details::CUDABuffer gpu_;
+    mutable int flag_;
+    mutable std::mutex mtx_;
+  };
+ public:
+  // Default ctor. Create empty Vector
+  Vector() : m_(new VectorData()) {}
+  // Fill vector with value. The vector size is `count`.
+  explicit Vector(size_t count, const T &value = T())
+      : m_(new VectorData(count, value)) {}
+  // Ctor with init_list
+  Vector(std::initializer_list<T> init) : m_(new VectorData(init)) {}
+  // implicit cast from std::vector.
+  template <typename U>
+  Vector(const std::vector<U> &dat) : m_(new VectorData(dat)) {  // NOLINT
+  }
+  // Copy ctor
+  Vector(const Vector<T> &other) { m_ = other.m_; }
+  // Copy operator
+  Vector<T> &operator=(const Vector<T> &other) {
+    m_ = other.m_;
+    return *this;
+  }
+  // Move ctor
+  Vector(Vector<T> &&other) { m_ = std::move(other.m_); }
+  // CPU data access method. Mutable.
+  T &operator[](size_t i) { return (*m_.MutableData())[i]; }
+  // CPU data access method. Immutable.
+  const T &operator[](size_t i) const { return m_.Data()[i]; }
+  // std::vector iterator methods. Based on CPU data access method
+  size_t size() const { return m_.Data().size(); }
+  iterator begin() { return m_.MutableData()->begin(); }
+  iterator end() { return m_.MutableData()->end(); }
+  T &front() { return m_.MutableData()->front(); }
+  T &back() { return m_.MutableData()->back(); }
+  const_iterator begin() const { return m_.Data().begin(); }
+  const_iterator end() const { return m_.Data().end(); }
+  const_iterator cbegin() const { return begin(); }
+  const_iterator cend() const { return end(); }
+  const T &back() const { return m_.Data().back(); }
+  T *data() { return m_.MutableData()->data(); }
+  const T *data() const { return m_.Data().data(); }
+  const T &front() const { return m_.Data().front(); }
+  // end of std::vector iterator methods
+  // assign this from iterator.
+  // NOTE: the iterator must support `end-begin`
+  template <typename Iter>
+  void assign(Iter begin, Iter end) {
+    m_.MutableData()->assign(begin, end);
+  }
+  // push_back. If the previous capacity is not enough, the memory will
+  // double.
+  void push_back(T elem) { m_.MutableData()->push_back(elem); }
+  // extend a vector by iterator.
+  // NOTE: the iterator must support end-begin
+  template <typename It>
+  void Extend(It begin, It end) {
+    m_.MutableData()->Extend(begin, end);
+  }
+  // resize the vector
+  void resize(size_t size) {
+    if (m_.Data().size() != size) {
+      m_.MutableData()->resize(size);
+    }
+  }
+  // get cuda ptr. immutable
+  const T *CUDAData(platform::Place place) const {
+    {
+      auto &mtx = m_.Data().Mutex();
+      std::lock_guard<std::mutex> guard(mtx);
+      auto cuda_place = m_.Data().CUDAPlace();
+      if (cuda_place == nullptr ||
+          *cuda_place == boost::get<platform::CUDAPlace>(place)) {
+        return m_.Data().CUDAData(place);
+      }
+    }
+    // If m_ holds CUDA data in a different place, detach manually.
+    m_.Detach();
+    return CUDAData(place);
+  }
+  // get cuda ptr. mutable
+  T *CUDAMutableData(platform::Place place) {
+    {
+      auto &mtx = m_.Data().Mutex();
+      std::lock_guard<std::mutex> guard(mtx);
+      auto cuda_place = m_.Data().CUDAPlace();
+      if (cuda_place == nullptr ||
+          *cuda_place == boost::get<platform::CUDAPlace>(place)) {
+        return m_.MutableData()->CUDAMutableData(place);
+      }
+    }
+    // If m_ holds CUDA data in a different place, detach manually.
+    m_.Detach();
+    return CUDAMutableData(place);
+  }
+  // clear
+  void clear() { m_.MutableData()->clear(); }
+  size_t capacity() const { return m_.Data().capacity(); }
+  // reserve data
+  void reserve(size_t size) { m_.Data().reserve(size); }
+  // the unify method to access CPU or CUDA data. immutable.
+  const T *Data(platform::Place place) const {
+    if (platform::is_gpu_place(place)) {
+      return CUDAData(place);
+    } else {
+      return data();
+    }
+  }
+  // the unify method to access CPU or CUDA data. mutable.
+  T *MutableData(platform::Place place) {
    if (platform::is_gpu_place(place)) {
-      platform::DeviceContextPool::Instance()
+      return CUDAMutableData(place);
-          .Get(boost::get<platform::CUDAPlace>(place))
+    } else {
-          ->Wait();
+      return data();
    }
  }
-  static T &EmptyDummy() {
+  // implicit cast operator. Vector can be cast to std::vector implicitly.
-    static T dummy = T();
+  operator std::vector<T>() const { return m_.Data(); }
-    return dummy;
+  bool operator==(const Vector<T> &other) const {
+    if (size() != other.size()) return false;
+    auto it1 = cbegin();
+    auto it2 = other.cbegin();
+    for (; it1 < cend(); ++it1, ++it2) {
+      if (*it1 != *it2) {
+        return false;
+      }
+    }
+    return true;
  }
-  mutable int flag_;
+  const void *Handle() const { return &m_.Data(); }
-  mutable Tensor cpu_vec_;
-  mutable Tensor cuda_vec_;
+ private:
-  size_t size_;
+  // Vector is a COW (copy-on-write) object.
+  mutable details::COWPtr<VectorData> m_;
 };
 #else  // PADDLE_WITH_CUDA

--- a/paddle/fluid/operators/detection_map_op.h
+++ b/paddle/fluid/operators/detection_map_op.h
@@ -76,8 +76,8 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
    auto ap_type = GetAPType(ctx.Attr<std::string>("ap_type"));
    int class_num = ctx.Attr<int>("class_num");
-    auto label_lod = in_label->lod();
+    auto& label_lod = in_label->lod();
-    auto detect_lod = in_detect->lod();
+    auto& detect_lod = in_detect->lod();
    PADDLE_ENFORCE_EQ(label_lod.size(), 1UL,
                      "Only support one level sequence now.");
    PADDLE_ENFORCE_EQ(label_lod[0].size(), detect_lod[0].size(),
@@ -166,11 +166,11 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
    auto labels = framework::EigenTensor<T, 2>::From(input_label);
    auto detect = framework::EigenTensor<T, 2>::From(input_detect);
-    auto label_lod = input_label.lod();
+    auto& label_lod = input_label.lod();
-    auto detect_lod = input_detect.lod();
+    auto& detect_lod = input_detect.lod();
    int batch_size = label_lod[0].size() - 1;
-    auto label_index = label_lod[0];
+    auto& label_index = label_lod[0];
    for (int n = 0; n < batch_size; ++n) {
      std::map<int, std::vector<Box>> boxes;
@@ -274,7 +274,6 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
    output_true_pos->set_lod(true_pos_lod);
    output_false_pos->set_lod(false_pos_lod);
-    return;
  }
  void GetInputPos(const framework::Tensor& input_pos_count,
@@ -292,7 +291,7 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
    auto SetData = [](const framework::LoDTensor& pos_tensor,
                      std::map<int, std::vector<std::pair<T, int>>>& pos) {
      const T* pos_data = pos_tensor.data<T>();
-      auto pos_data_lod = pos_tensor.lod()[0];
+      auto& pos_data_lod = pos_tensor.lod()[0];
      for (size_t i = 0; i < pos_data_lod.size() - 1; ++i) {
        for (size_t j = pos_data_lod[i]; j < pos_data_lod[i + 1]; ++j) {
          T score = pos_data[j * 2];
@@ -317,20 +316,23 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
      std::map<int, std::vector<std::pair<T, int>>>* false_pos) const {
    int batch_size = gt_boxes.size();
    for (int n = 0; n < batch_size; ++n) {
-      auto image_gt_boxes = gt_boxes[n];
+      auto& image_gt_boxes = gt_boxes[n];
-      for (auto it = image_gt_boxes.begin(); it != image_gt_boxes.end(); ++it) {
+      for (auto& image_gt_box : image_gt_boxes) {
        size_t count = 0;
-        auto labeled_bboxes = it->second;
+        auto& labeled_bboxes = image_gt_box.second;
        if (evaluate_difficult) {
          count = labeled_bboxes.size();
        } else {
-          for (size_t i = 0; i < labeled_bboxes.size(); ++i)
+          for (auto& box : labeled_bboxes) {
-            if (!(labeled_bboxes[i].is_difficult)) ++count;
+            if (!box.is_difficult) {
+              ++count;
+            }
+          }
        }
        if (count == 0) {
          continue;
        }
-        int label = it->first;
+        int label = image_gt_box.first;
        if (label_pos_count->find(label) == label_pos_count->end()) {
          (*label_pos_count)[label] = count;
        } else {

--- a/paddle/fluid/operators/extract_rows_op.cc
+++ b/paddle/fluid/operators/extract_rows_op.cc
@@ -50,7 +50,7 @@ class ExtractRowsOp : public framework::OperatorBase {
    auto &in = scope.FindVar(Input("X"))->Get<framework::SelectedRows>();
    auto out = scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensor>();
-    auto in_rows = in.rows();
+    auto &in_rows = in.rows();
    auto out_dim = framework::make_ddim(
        std::vector<int64_t>{static_cast<int64_t>(in_rows.size()), 1});
    auto dst_ptr = out->mutable_data<int64_t>(out_dim, in.place());

--- a/paddle/fluid/operators/lookup_table_op.cu
+++ b/paddle/fluid/operators/lookup_table_op.cu
@@ -127,10 +127,8 @@ class LookupTableGradCUDAKernel : public framework::OpKernel<T> {
      auto gpu_place = boost::get<platform::CUDAPlace>(context.GetPlace());
      // TODO(yuyang18): Strange code here.
-      memory::Copy(platform::CPUPlace(),
+      memory::Copy(gpu_place, new_rows.CUDAMutableData(context.GetPlace()),
-                   new_rows.CUDAMutableData(context.GetPlace()), gpu_place,
+                   gpu_place, ids_data, ids_num * sizeof(int64_t), stream);
-                   ids_data, ids_num * sizeof(int64_t), stream);
      d_table->set_rows(new_rows);
      auto *d_table_value = d_table->mutable_value();

--- a/paddle/fluid/operators/math/selected_rows_functor.cu
+++ b/paddle/fluid/operators/math/selected_rows_functor.cu
@@ -60,11 +60,9 @@ struct SelectedRowsAdd<platform::CUDADeviceContext, T> {
    auto out_place = context.GetPlace();
    PADDLE_ENFORCE(platform::is_gpu_place(out_place));
-    memory::Copy(
+    memory::Copy(boost::get<platform::CUDAPlace>(out_place), out_data,
-        boost::get<platform::CUDAPlace>(out_place), out_data,
                 boost::get<platform::CUDAPlace>(in1_place), in1_data,
-        in1_value.numel() * sizeof(T),
+                 in1_value.numel() * sizeof(T), context.stream());
-        reinterpret_cast<const platform::CUDADeviceContext&>(context).stream());
    auto* in2_data = in2_value.data<T>();
    memory::Copy(boost::get<platform::CUDAPlace>(out_place),
@@ -148,7 +146,7 @@ struct SelectedRowsAddTo<platform::CUDADeviceContext, T> {
    auto in1_height = input1.height();
    PADDLE_ENFORCE_EQ(in1_height, input2->height());
-    framework::Vector<int64_t> in1_rows(input1.rows());
+    auto& in1_rows = input1.rows();
    auto& in2_rows = *(input2->mutable_rows());
    auto& in1_value = input1.value();

--- a/paddle/fluid/operators/sgd_op.cu
+++ b/paddle/fluid/operators/sgd_op.cu
@@ -89,7 +89,7 @@ class SGDOpCUDAKernel : public framework::OpKernel<T> {
      PADDLE_ENFORCE_EQ(in_height, out_dims[0]);
      auto& in_value = grad->value();
-      framework::Vector<int64_t> in_rows(grad->rows());
+      auto& in_rows = grad->rows();
      int64_t in_row_numel = in_value.numel() / in_rows.size();
      PADDLE_ENFORCE_EQ(in_row_numel, param_out->numel() / in_height);

--- a/paddle/fluid/operators/sum_op.h
+++ b/paddle/fluid/operators/sum_op.h
@@ -124,7 +124,6 @@ class SumKernel : public framework::OpKernel<T> {
      out_value->Resize(framework::make_ddim(in_dim));
      out_value->mutable_data<T>(context.GetPlace());
      // if all the input sparse vars are empty, no need to
      // merge these vars.
      if (first_dim == 0UL) {

--- a/python/paddle/fluid/tests/unittests/op_test.py
+++ b/python/paddle/fluid/tests/unittests/op_test.py
@@ -345,7 +345,7 @@ class OpTest(unittest.TestCase):
                        actual_t, expect_t, atol=atol, equal_nan=equal_nan),
                    "Output (" + out_name + ") has diff at " + str(place) +
                    "\nExpect " + str(expect_t) + "\n" + "But Got" +
-                    str(actual_t))
+                    str(actual_t) + " in class " + self.__class__.__name__)
                if isinstance(expect, tuple):
                    self.assertListEqual(actual.recursive_sequence_lengths(),
                                         expect[1], "Output (" + out_name +

--- a/python/paddle/fluid/tests/unittests/test_detection_map_op.py
+++ b/python/paddle/fluid/tests/unittests/test_detection_map_op.py
@@ -20,6 +20,7 @@ import six
 import sys
 import collections
 import math
+import paddle.fluid as fluid
 from op_test import OpTest
@@ -32,7 +33,7 @@ class TestDetectionMAPOp(OpTest):
        self.detect = np.array(self.detect).astype('float32')
        self.mAP = np.array(self.mAP).astype('float32')
-        if (len(self.class_pos_count) > 0):
+        if len(self.class_pos_count) > 0:
            self.class_pos_count = np.array(self.class_pos_count).astype(
                'int32')
            self.true_pos = np.array(self.true_pos).astype('float32')
@@ -273,7 +274,7 @@ class TestDetectionMAPOp11Point(TestDetectionMAPOp):
 class TestDetectionMAPOpMultiBatch(TestDetectionMAPOp):
    def init_test_case(self):
        super(TestDetectionMAPOpMultiBatch, self).init_test_case()
-        self.class_pos_count = [0, 2, 1]
+        self.class_pos_count = [0, 2, 1, 0]
        self.true_pos_lod = [[0, 3, 2]]
        self.true_pos = [[0.7, 1.], [0.3, 0.], [0.2, 1.], [0.8, 0.], [0.1, 1.]]
        self.false_pos_lod = [[0, 3, 2]]