Commit e1913bc5 authored by Yu Yang

Fix MixedVector

Parent 6d2c6f96
......@@ -28,31 +28,27 @@ class COWPtr {
private:
RefPtr m_sp;
void detach() {
T* tmp = m_sp.get();
if (!(tmp == nullptr || m_sp.unique())) {
m_sp = RefPtr(new T(*tmp));
}
}
public:
COWPtr() : m_sp(nullptr) {}
explicit COWPtr(T* t) : m_sp(t) {}
explicit COWPtr(const RefPtr& refptr) : m_sp(refptr) {}
const T& Data() const { return operator*(); }
const T& Data() const { return *m_sp; }
T* MutableData() { return operator->(); }
T* MutableData() {
DetachIfNotUnique();
return m_sp.get();
}
const T& operator*() const { return *m_sp; }
T& operator*() {
detach();
return *m_sp;
void DetachIfNotUnique() {
T* tmp = m_sp.get();
if (!(tmp == nullptr || m_sp.unique())) {
Detach();
}
}
const T* operator->() const { return m_sp.operator->(); }
T* operator->() {
detach();
return m_sp.operator->();
void Detach() {
T* tmp = m_sp.get();
m_sp = RefPtr(new T(*tmp));
}
};
} // namespace details
......
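
For orientation, here is a minimal usage sketch of the copy-on-write pointer as it looks after this hunk. The driver below is illustrative only; it assumes RefPtr behaves like std::shared_ptr<T> (as the get()/unique() calls suggest) and the usual paddle::framework::details enclosing namespace.

#include <vector>
#include "paddle/fluid/framework/details/cow_ptr.h"

namespace details = paddle::framework::details;  // assumed enclosing namespace

int Demo() {
  details::COWPtr<std::vector<int>> a(new std::vector<int>{1, 2, 3});
  details::COWPtr<std::vector<int>> b = a;  // a and b now share one buffer
  b.MutableData()->push_back(4);            // shared -> DetachIfNotUnique() deep-copies first
  // a is untouched: a.Data().size() == 3, b.Data().size() == 4.
  b.Detach();                               // explicit detach is also available (used by Vector below)
  return static_cast<int>(a.Data().size() + b.Data().size());  // 3 + 4
}
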
......@@ -17,6 +17,7 @@
#include <algorithm>
#include <initializer_list>
#include <memory>
#include <mutex> // NOLINT
#include <utility>
#include <vector>
#include "paddle/fluid/framework/details/cow_ptr.h"
......@@ -51,6 +52,7 @@ struct CUDABuffer {
ClearMemory();
place_ = boost::get<platform::CUDAPlace>(place);
data_ = memory::Alloc(place_, size);
PADDLE_ENFORCE_NOT_NULL(data_);
size_ = size;
}
......@@ -62,7 +64,7 @@ struct CUDABuffer {
private:
void ClearMemory() const {
if (data_) {
if (data_ != nullptr) {
memory::Free(place_, data_);
}
}
......@@ -89,6 +91,7 @@ class Vector {
template <typename U>
explicit VectorData(const std::vector<U> &dat)
: cpu_(dat), flag_(kDataInCPU) {}
~VectorData() {}
VectorData(const VectorData &o) {
o.ImmutableCPU();
......@@ -215,7 +218,7 @@ class Vector {
size_t capacity() const { return cpu_.capacity(); }
// reserve data
void reserve(size_t size) { cpu_.reserve(size); }
void reserve(size_t size) const { cpu_.reserve(size); }
// implicit cast operator. Vector can be cast to std::vector implicitly.
operator std::vector<T>() const {
......@@ -229,6 +232,17 @@ class Vector {
return cpu_ == other.cpu_;
}
std::mutex &Mutex() const { return mtx_; }
std::unique_ptr<platform::CUDAPlace> CUDAPlace() const {
if (gpu_.data_ == nullptr) {
return nullptr;
} else {
return std::unique_ptr<platform::CUDAPlace>(
new platform::CUDAPlace(gpu_.place_));
}
}
private:
enum DataFlag {
kDataInCPU = 0x01,
......@@ -239,10 +253,15 @@ class Vector {
void CopyToCPU() const {
// COPY GPU Data To CPU
auto *dev_ctx = static_cast<platform::CUDADeviceContext *>(
platform::DeviceContextPool::Instance().Get(
platform::Place(gpu_.place_)));
auto stream = dev_ctx->stream();
void *src = gpu_.data_;
void *dst = cpu_.data();
memory::Copy(platform::CPUPlace(), dst, gpu_.place_, src, gpu_.size_,
nullptr);
stream);
dev_ctx->Wait();
}
void MutableCPU() {
......@@ -260,7 +279,7 @@ class Vector {
SetFlag(kDataInCUDA);
} else if (IsInCUDA() &&
!(boost::get<platform::CUDAPlace>(place) == gpu_.place_)) {
CopyCUDADataToAnotherPlace(place);
PADDLE_THROW("This situation should not happen");
// Still dirty
} else {
// Dirty && DataInCUDA && Device is same
......@@ -272,28 +291,21 @@ class Vector {
CopyCPUDataToCUDA(place);
SetFlag(kDataInCUDA);
} else if (!(boost::get<platform::CUDAPlace>(place) == gpu_.place_)) {
CopyCUDADataToAnotherPlace(place);
PADDLE_THROW("This situation should not happen.");
} else {
// Not Dirty && DataInCUDA && Device is same
// Do nothing.
}
}
}
void CopyCUDADataToAnotherPlace(const platform::Place &place) const {
details::CUDABuffer tmp(place, gpu_.size_);
const void *src = gpu_.data_;
void *dst = tmp.data_;
memory::Copy(tmp.place_, dst, gpu_.place_, src, gpu_.size_, nullptr);
gpu_.Swap(tmp);
}
void CopyCPUDataToCUDA(const platform::Place &place) const {
void *src = cpu_.data();
gpu_.Resize(place, cpu_.size() * sizeof(T));
void *dst = gpu_.data_;
auto stream = static_cast<platform::CUDADeviceContext *>(
platform::DeviceContextPool::Instance().Get(place))
->stream();
auto *dev_ctx = static_cast<platform::CUDADeviceContext *>(
platform::DeviceContextPool::Instance().Get(place));
auto stream = dev_ctx->stream();
memory::Copy(gpu_.place_, dst, platform::CPUPlace(), src, gpu_.size_,
stream);
}
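
Both copy paths now follow the same pattern: fetch the CUDADeviceContext for the place from the DeviceContextPool, issue memory::Copy on that context's stream, and Wait() where the host buffer is read immediately afterwards. A condensed sketch of the device-to-host direction (the include paths and the CopyDeviceToHost wrapper are assumptions; the calls mirror the hunk above):

#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/platform/device_context.h"

void CopyDeviceToHost(const paddle::platform::CUDAPlace &gpu_place, void *dst,
                      const void *src, size_t nbytes) {
  namespace platform = paddle::platform;
  auto *dev_ctx = static_cast<platform::CUDADeviceContext *>(
      platform::DeviceContextPool::Instance().Get(platform::Place(gpu_place)));
  // Asynchronous copy on the context's stream instead of the default stream ...
  paddle::memory::Copy(platform::CPUPlace(), dst, gpu_place, src, nbytes,
                       dev_ctx->stream());
  // ... then wait, so the caller can safely read dst on the CPU.
  dev_ctx->Wait();
}
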
......@@ -319,6 +331,8 @@ class Vector {
mutable std::vector<T> cpu_;
mutable details::CUDABuffer gpu_;
mutable int flag_;
mutable std::mutex mtx_;
};
public:
......@@ -350,81 +364,103 @@ class Vector {
Vector(Vector<T> &&other) { m_ = std::move(other.m_); }
// CPU data access method. Mutable.
T &operator[](size_t i) { return (*m_)[i]; }
T &operator[](size_t i) { return (*m_.MutableData())[i]; }
// CPU data access method. Immutable.
const T &operator[](size_t i) const { return (*m_)[i]; }
const T &operator[](size_t i) const { return m_.Data()[i]; }
// std::vector iterator methods. Based on CPU data access method
size_t size() const { return m_->size(); }
size_t size() const { return m_.Data().size(); }
iterator begin() { return m_->begin(); }
iterator begin() { return m_.MutableData()->begin(); }
iterator end() { return m_->end(); }
iterator end() { return m_.MutableData()->end(); }
T &front() { return m_->front(); }
T &front() { return m_.MutableData()->front(); }
T &back() { return m_->back(); }
T &back() { return m_.MutableData()->back(); }
const_iterator begin() const { return m_->begin(); }
const_iterator begin() const { return m_.Data().begin(); }
const_iterator end() const { return m_->end(); }
const_iterator end() const { return m_.Data().end(); }
const_iterator cbegin() const { return begin(); }
const_iterator cend() const { return end(); }
const T &back() const { return m_->back(); }
const T &back() const { return m_.Data().back(); }
T *data() { return m_->data(); }
T *data() { return m_.MutableData()->data(); }
const T *data() const { return m_->data(); }
const T *data() const { return m_.Data().data(); }
const T &front() const { return m_->front(); }
const T &front() const { return m_.Data().front(); }
// end of std::vector iterator methods
// assign this from iterator.
// NOTE: the iterator must support `end-begin`
template <typename Iter>
void assign(Iter begin, Iter end) {
m_->assign(begin, end);
m_.MutableData()->assign(begin, end);
}
// push_back. If the previous capacity is not enough, the memory will
// double.
void push_back(T elem) { m_->push_back(elem); }
void push_back(T elem) { m_.MutableData()->push_back(elem); }
// extend a vector by iterator.
// NOTE: the iterator must support end-begin
template <typename It>
void Extend(It begin, It end) {
m_->Extend(begin, end);
m_.MutableData()->Extend(begin, end);
}
// resize the vector
void resize(size_t size) {
if (m_.Data().size() != size) {
m_->resize(size);
m_.MutableData()->resize(size);
}
}
// get cuda ptr. immutable
const T *CUDAData(platform::Place place) const {
return m_.Data().CUDAData(place);
{
auto &mtx = m_.Data().Mutex();
std::lock_guard<std::mutex> guard(mtx);
auto cuda_place = m_.Data().CUDAPlace();
if (cuda_place == nullptr ||
*cuda_place == boost::get<platform::CUDAPlace>(place)) {
return m_.Data().CUDAData(place);
}
}
// If m_ contains CUDAData in a different place. Detach manually.
m_.Detach();
return CUDAData(place);
}
// get cuda ptr. mutable
T *CUDAMutableData(platform::Place place) {
return m_->CUDAMutableData(place);
{
auto &mtx = m_.Data().Mutex();
std::lock_guard<std::mutex> guard(mtx);
auto cuda_place = m_.Data().CUDAPlace();
if (cuda_place == nullptr ||
*cuda_place == boost::get<platform::CUDAPlace>(place)) {
return m_.MutableData()->CUDAMutableData(place);
}
}
// If m_ contains CUDAData in a different place. Detach manually.
m_.Detach();
return CUDAMutableData(place);
}
// clear
void clear() { m_->clear(); }
void clear() { m_.MutableData()->clear(); }
size_t capacity() const { return m_->capacity(); }
size_t capacity() const { return m_.Data().capacity(); }
// reserve data
void reserve(size_t size) { m_->reserve(size); }
void reserve(size_t size) { m_.Data().reserve(size); }
// the unify method to access CPU or CUDA data. immutable.
const T *Data(platform::Place place) const {
......@@ -445,7 +481,7 @@ class Vector {
}
// implicit cast operator. Vector can be cast to std::vector implicitly.
operator std::vector<T>() const { return *m_; }
operator std::vector<T>() const { return m_.Data(); }
bool operator==(const Vector<T> &other) const {
if (size() != other.size()) return false;
......@@ -463,7 +499,7 @@ class Vector {
private:
// Vector is an COW object.
details::COWPtr<VectorData> m_;
mutable details::COWPtr<VectorData> m_;
};
#else // PADDLE_WITH_CUDA
......
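
To make the new locking and detach behavior concrete, a caller-side sketch of the CUDA build of Vector (illustrative only; it assumes device 0, the paddle/fluid/framework/mixed_vector.h header path, and a Vector constructor from std::vector matching the VectorData constructor shown above):

#include <vector>
#include "paddle/fluid/framework/mixed_vector.h"

void MixedVectorExample() {
  std::vector<int64_t> init = {1, 2, 3};
  paddle::framework::Vector<int64_t> rows(init);
  paddle::platform::CUDAPlace gpu0(0);

  // Immutable access: under VectorData's mutex, the CPU data is copied to gpu0
  // once and cached there.
  const int64_t *d_rows = rows.CUDAData(gpu0);

  // Mutable access goes through COWPtr::MutableData(), which deep-copies the
  // VectorData only if it is shared, then marks the CPU side dirty.
  rows.push_back(4);

  // Asking for a different device no longer copies device-to-device: CUDAData
  // sees a mismatching CUDAPlace(), calls m_.Detach(), and retries on that place.
  (void)d_rows;
}
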
......@@ -127,10 +127,8 @@ class LookupTableGradCUDAKernel : public framework::OpKernel<T> {
auto gpu_place = boost::get<platform::CUDAPlace>(context.GetPlace());
// TODO(yuyang18): Strange code here.
memory::Copy(platform::CPUPlace(),
new_rows.CUDAMutableData(context.GetPlace()), gpu_place,
ids_data, ids_num * sizeof(int64_t), stream);
memory::Copy(gpu_place, new_rows.CUDAMutableData(context.GetPlace()),
gpu_place, ids_data, ids_num * sizeof(int64_t), stream);
d_table->set_rows(new_rows);
auto *d_table_value = d_table->mutable_value();
......
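
The corrected call above is a plain device-to-device copy: new_rows.CUDAMutableData(context.GetPlace()) returns a device pointer, so the destination place has to be gpu_place rather than CPUPlace. An annotated restatement of the new call (same arguments as in the hunk):

memory::Copy(gpu_place, new_rows.CUDAMutableData(context.GetPlace()),  // dst: device pointer on gpu_place
             gpu_place, ids_data,                                      // src: ids already live on the GPU
             ids_num * sizeof(int64_t), stream);                       // async copy on the kernel's stream
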
......@@ -89,7 +89,7 @@ class SGDOpCUDAKernel : public framework::OpKernel<T> {
PADDLE_ENFORCE_EQ(in_height, out_dims[0]);
auto& in_value = grad->value();
framework::Vector<int64_t> in_rows(grad->rows());
auto& in_rows = grad->rows();
int64_t in_row_numel = in_value.numel() / in_rows.size();
PADDLE_ENFORCE_EQ(in_row_numel, param_out->numel() / in_height);
......