diff --git a/paddle/fluid/framework/details/cow_ptr.h b/paddle/fluid/framework/details/cow_ptr.h
index 4fb015b0ffe27e6fa91d4eaf373fca4feca66361..090517ff3c1822c2e62e61fad05d49e1c8db8573 100644
--- a/paddle/fluid/framework/details/cow_ptr.h
+++ b/paddle/fluid/framework/details/cow_ptr.h
@@ -28,31 +28,27 @@ class COWPtr {
  private:
   RefPtr m_sp;
 
-  void detach() {
-    T* tmp = m_sp.get();
-    if (!(tmp == nullptr || m_sp.unique())) {
-      m_sp = RefPtr(new T(*tmp));
-    }
-  }
-
  public:
   COWPtr() : m_sp(nullptr) {}
   explicit COWPtr(T* t) : m_sp(t) {}
-  explicit COWPtr(const RefPtr& refptr) : m_sp(refptr) {}
 
-  const T& Data() const { return operator*(); }
+  const T& Data() const { return *m_sp; }
 
-  T* MutableData() { return operator->(); }
+  T* MutableData() {
+    DetachIfNotUnique();
+    return m_sp.get();
+  }
 
-  const T& operator*() const { return *m_sp; }
-  T& operator*() {
-    detach();
-    return *m_sp;
+  void DetachIfNotUnique() {
+    T* tmp = m_sp.get();
+    if (!(tmp == nullptr || m_sp.unique())) {
+      Detach();
+    }
   }
 
-  const T* operator->() const { return m_sp.operator->(); }
-  T* operator->() {
-    detach();
-    return m_sp.operator->();
+
+  void Detach() {
+    T* tmp = m_sp.get();
+    m_sp = RefPtr(new T(*tmp));
   }
 };
 }  // namespace details
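
The rewritten COWPtr only copies when a write hits a shared buffer: Data() never detaches, MutableData() detaches only if the buffer is shared, and Detach() forces a private copy unconditionally (which Vector below uses for cross-device moves). A minimal sketch of the resulting semantics, assuming only the header above; the assertions illustrate intent and are not part of the patch:

#include <cassert>
#include <vector>

#include "paddle/fluid/framework/details/cow_ptr.h"

using paddle::framework::details::COWPtr;

int main() {
  COWPtr<std::vector<int>> a(new std::vector<int>{1, 2, 3});
  COWPtr<std::vector<int>> b = a;  // copies the handle, shares the buffer

  // Reads never detach: both handles still see the same object.
  assert(&a.Data() == &b.Data());

  // MutableData() runs DetachIfNotUnique() first, so writing through `b`
  // clones the vector and leaves `a` untouched.
  b.MutableData()->push_back(4);
  assert(a.Data().size() == 3);
  assert(b.Data().size() == 4);
  return 0;
}
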
diff --git a/paddle/fluid/framework/mixed_vector.h b/paddle/fluid/framework/mixed_vector.h
index ba2c41eb8968e30bc8a801f4d8d239da8c522de9..77386f4f069489b6ff7b927a281bdc286ff816e0 100644
--- a/paddle/fluid/framework/mixed_vector.h
+++ b/paddle/fluid/framework/mixed_vector.h
@@ -17,6 +17,7 @@
 #include <algorithm>
 #include <initializer_list>
 #include <memory>
+#include <mutex>  // NOLINT
 #include <utility>
 #include <vector>
 #include "paddle/fluid/framework/details/cow_ptr.h"
@@ -51,6 +52,7 @@ struct CUDABuffer {
     ClearMemory();
     place_ = boost::get<platform::CUDAPlace>(place);
     data_ = memory::Alloc(place_, size);
+    PADDLE_ENFORCE_NOT_NULL(data_);
     size_ = size;
   }
 
@@ -62,7 +64,7 @@ struct CUDABuffer {
 
  private:
   void ClearMemory() const {
-    if (data_) {
+    if (data_ != nullptr) {
       memory::Free(place_, data_);
     }
   }
@@ -89,6 +91,7 @@ class Vector {
     template <typename U>
     explicit VectorData(const std::vector<U> &dat)
         : cpu_(dat), flag_(kDataInCPU) {}
+    ~VectorData() {}
 
     VectorData(const VectorData &o) {
       o.ImmutableCPU();
@@ -215,7 +218,7 @@ class Vector {
     size_t capacity() const { return cpu_.capacity(); }
 
     // reserve data
-    void reserve(size_t size) { cpu_.reserve(size); }
+    void reserve(size_t size) const { cpu_.reserve(size); }
 
     // implicit cast operator. Vector can be cast to std::vector implicitly.
     operator std::vector<T>() const {
@@ -229,6 +232,17 @@ class Vector {
       return cpu_ == other.cpu_;
     }
 
+    std::mutex &Mutex() const { return mtx_; }
+
+    std::unique_ptr<platform::CUDAPlace> CUDAPlace() const {
+      if (gpu_.data_ == nullptr) {
+        return nullptr;
+      } else {
+        return std::unique_ptr<platform::CUDAPlace>(
+            new platform::CUDAPlace(gpu_.place_));
+      }
+    }
+
    private:
     enum DataFlag {
       kDataInCPU = 0x01,
@@ -239,10 +253,15 @@ class Vector {
 
     void CopyToCPU() const {
       // COPY GPU Data To CPU
+      auto *dev_ctx = static_cast<platform::CUDADeviceContext *>(
+          platform::DeviceContextPool::Instance().Get(
+              platform::Place(gpu_.place_)));
+      auto stream = dev_ctx->stream();
       void *src = gpu_.data_;
       void *dst = cpu_.data();
       memory::Copy(platform::CPUPlace(), dst, gpu_.place_, src, gpu_.size_,
-                   nullptr);
+                   stream);
+      dev_ctx->Wait();
     }
 
     void MutableCPU() {
@@ -260,7 +279,7 @@ class Vector {
         SetFlag(kDataInCUDA);
       } else if (IsInCUDA() &&
                  !(boost::get<platform::CUDAPlace>(place) == gpu_.place_)) {
-        CopyCUDADataToAnotherPlace(place);
+        PADDLE_THROW("This situation should not happen.");
         // Still dirty
       } else {
         // Dirty && DataInCUDA && Device is same
@@ -272,28 +291,21 @@ class Vector {
         CopyCPUDataToCUDA(place);
         SetFlag(kDataInCUDA);
       } else if (!(boost::get<platform::CUDAPlace>(place) == gpu_.place_)) {
-        CopyCUDADataToAnotherPlace(place);
+        PADDLE_THROW("This situation should not happen.");
       } else {
         // Not Dirty && DataInCUDA && Device is same
         // Do nothing.
       }
     }
 
-    void CopyCUDADataToAnotherPlace(const platform::Place &place) const {
-      details::CUDABuffer tmp(place, gpu_.size_);
-      const void *src = gpu_.data_;
-      void *dst = tmp.data_;
-      memory::Copy(tmp.place_, dst, gpu_.place_, src, gpu_.size_, nullptr);
-      gpu_.Swap(tmp);
-    }
-
     void CopyCPUDataToCUDA(const platform::Place &place) const {
       void *src = cpu_.data();
       gpu_.Resize(place, cpu_.size() * sizeof(T));
       void *dst = gpu_.data_;
-      auto stream = static_cast<platform::CUDADeviceContext *>(
-                        platform::DeviceContextPool::Instance().Get(place))
-                        ->stream();
+      auto *dev_ctx = static_cast<platform::CUDADeviceContext *>(
+          platform::DeviceContextPool::Instance().Get(place));
+      auto stream = dev_ctx->stream();
       memory::Copy(gpu_.place_, dst, platform::CPUPlace(), src, gpu_.size_,
                    stream);
     }
@@ -319,6 +331,8 @@ class Vector {
     mutable std::vector<T> cpu_;
     mutable details::CUDABuffer gpu_;
     mutable int flag_;
+
+    mutable std::mutex mtx_;
   };
 
  public:
@@ -350,81 +364,103 @@ class Vector {
   Vector(Vector &&other) { m_ = std::move(other.m_); }
 
   // CPU data access method. Mutable.
-  T &operator[](size_t i) { return (*m_)[i]; }
+  T &operator[](size_t i) { return (*m_.MutableData())[i]; }
 
   // CPU data access method. Immutable.
-  const T &operator[](size_t i) const { return (*m_)[i]; }
+  const T &operator[](size_t i) const { return m_.Data()[i]; }
 
   // std::vector iterator methods. Based on CPU data access method
-  size_t size() const { return m_->size(); }
+  size_t size() const { return m_.Data().size(); }
 
-  iterator begin() { return m_->begin(); }
+  iterator begin() { return m_.MutableData()->begin(); }
 
-  iterator end() { return m_->end(); }
+  iterator end() { return m_.MutableData()->end(); }
 
-  T &front() { return m_->front(); }
+  T &front() { return m_.MutableData()->front(); }
 
-  T &back() { return m_->back(); }
+  T &back() { return m_.MutableData()->back(); }
 
-  const_iterator begin() const { return m_->begin(); }
+  const_iterator begin() const { return m_.Data().begin(); }
 
-  const_iterator end() const { return m_->end(); }
+  const_iterator end() const { return m_.Data().end(); }
 
   const_iterator cbegin() const { return begin(); }
 
   const_iterator cend() const { return end(); }
 
-  const T &back() const { return m_->back(); }
+  const T &back() const { return m_.Data().back(); }
 
-  T *data() { return m_->data(); }
+  T *data() { return m_.MutableData()->data(); }
 
-  const T *data() const { return m_->data(); }
+  const T *data() const { return m_.Data().data(); }
 
-  const T &front() const { return m_->front(); }
+  const T &front() const { return m_.Data().front(); }
   // end of std::vector iterator methods
 
   // assign this from iterator.
   // NOTE: the iterator must support `end-begin`
   template <typename Iter>
   void assign(Iter begin, Iter end) {
-    m_->assign(begin, end);
+    m_.MutableData()->assign(begin, end);
   }
 
   // push_back. If the previous capacity is not enough, the memory will
   // double.
-  void push_back(T elem) { m_->push_back(elem); }
+  void push_back(T elem) { m_.MutableData()->push_back(elem); }
 
   // extend a vector by iterator.
   // NOTE: the iterator must support end-begin
   template <typename It>
   void Extend(It begin, It end) {
-    m_->Extend(begin, end);
+    m_.MutableData()->Extend(begin, end);
  }
 
   // resize the vector
   void resize(size_t size) {
     if (m_.Data().size() != size) {
-      m_->resize(size);
+      m_.MutableData()->resize(size);
     }
   }
 
   // get cuda ptr. immutable
   const T *CUDAData(platform::Place place) const {
-    return m_.Data().CUDAData(place);
+    {
+      auto &mtx = m_.Data().Mutex();
+      std::lock_guard<std::mutex> guard(mtx);
+      auto cuda_place = m_.Data().CUDAPlace();
+      if (cuda_place == nullptr ||
+          *cuda_place == boost::get<platform::CUDAPlace>(place)) {
+        return m_.Data().CUDAData(place);
+      }
+    }
+    // If m_ contains CUDAData in a different place, detach manually.
+    m_.Detach();
+    return CUDAData(place);
   }
 
   // get cuda ptr. mutable
   T *CUDAMutableData(platform::Place place) {
-    return m_->CUDAMutableData(place);
+    {
+      auto &mtx = m_.Data().Mutex();
+      std::lock_guard<std::mutex> guard(mtx);
+      auto cuda_place = m_.Data().CUDAPlace();
+      if (cuda_place == nullptr ||
+          *cuda_place == boost::get<platform::CUDAPlace>(place)) {
+        return m_.MutableData()->CUDAMutableData(place);
+      }
+    }
+    // If m_ contains CUDAData in a different place, detach manually.
+    m_.Detach();
+    return CUDAMutableData(place);
   }
 
   // clear
-  void clear() { m_->clear(); }
+  void clear() { m_.MutableData()->clear(); }
 
-  size_t capacity() const { return m_->capacity(); }
+  size_t capacity() const { return m_.Data().capacity(); }
 
   // reserve data
-  void reserve(size_t size) { m_->reserve(size); }
+  void reserve(size_t size) { m_.Data().reserve(size); }
 
   // the unify method to access CPU or CUDA data. immutable.
   const T *Data(platform::Place place) const {
@@ -445,7 +481,7 @@ class Vector {
   }
 
   // implicit cast operator. Vector can be cast to std::vector implicitly.
-  operator std::vector<T>() const { return *m_; }
+  operator std::vector<T>() const { return m_.Data(); }
 
   bool operator==(const Vector &other) const {
     if (size() != other.size()) return false;
@@ -463,7 +499,7 @@ class Vector {
 
  private:
   // Vector is an COW object.
-  details::COWPtr<VectorData> m_;
+  mutable details::COWPtr<VectorData> m_;
 };
 
 #else  // PADDLE_WITH_CUDA
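
Two things change above: every device copy now runs on the device context's stream and is synchronized (the old nullptr stream silently used the default stream), and a Vector whose buffer lives on another GPU is no longer migrated in place. Instead, CUDAData()/CUDAMutableData() take the VectorData mutex, compare the cached place, and on a mismatch force a COW detach and retry, which is why CopyCUDADataToAnotherPlace() became a PADDLE_THROW. A standalone sketch of that lock / check-place / detach-and-retry flow, assuming only the COWPtr above; FakeData and integer places are stand-ins for VectorData and platform::CUDAPlace, not Paddle APIs:

#include <cassert>
#include <mutex>  // NOLINT

#include "paddle/fluid/framework/details/cow_ptr.h"

using paddle::framework::details::COWPtr;

struct FakeData {
  FakeData() = default;
  FakeData(const FakeData &) {}  // a detached copy starts with no device buffer
  mutable std::mutex mtx;
  int place = -1;  // -1: no device buffer yet
  int DeviceData(int p) {
    place = p;  // allocate on / copy to device p (elided)
    return place;
  }
};

int DeviceData(COWPtr<FakeData> &cow, int place) {
  {
    std::lock_guard<std::mutex> guard(cow.Data().mtx);
    // Fast path: no device buffer yet, or it already lives on `place`.
    if (cow.Data().place == -1 || cow.Data().place == place) {
      return cow.MutableData()->DeviceData(place);
    }
  }
  // Slow path: the shared buffer lives on another device. Detach so this
  // handle gets a private copy (rebuilt from CPU data in the real code),
  // then retry. Other owners keep the old device buffer.
  cow.Detach();
  return DeviceData(cow, place);
}

int main() {
  COWPtr<FakeData> v(new FakeData);
  assert(DeviceData(v, 0) == 0);  // first access binds device 0
  assert(DeviceData(v, 1) == 1);  // other device: detach, then rebind
  return 0;
}
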
diff --git a/paddle/fluid/operators/lookup_table_op.cu b/paddle/fluid/operators/lookup_table_op.cu
index 74823dab09cac358f647c074ac2f2ee2fed17e55..abd5dce8f7e7146a1671a387328c177e5e6e0a85 100644
--- a/paddle/fluid/operators/lookup_table_op.cu
+++ b/paddle/fluid/operators/lookup_table_op.cu
@@ -127,10 +127,8 @@ class LookupTableGradCUDAKernel : public framework::OpKernel<T> {
     auto gpu_place = boost::get<platform::CUDAPlace>(context.GetPlace());
 
     // TODO(yuyang18): Strange code here.
-    memory::Copy(platform::CPUPlace(),
-                 new_rows.CUDAMutableData(context.GetPlace()), gpu_place,
-                 ids_data, ids_num * sizeof(int64_t), stream);
-
+    memory::Copy(gpu_place, new_rows.CUDAMutableData(context.GetPlace()),
+                 gpu_place, ids_data, ids_num * sizeof(int64_t), stream);
     d_table->set_rows(new_rows);
 
     auto *d_table_value = d_table->mutable_value();
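
memory::Copy takes the destination place and pointer first, then the source place and pointer (see the CopyToCPU and CopyCPUDataToCUDA calls above). new_rows.CUDAMutableData() returns a device pointer, and ids_data also lives on the GPU here, so labeling the destination platform::CPUPlace() mis-declared a device-to-device copy as device-to-host, which is what the old TODO flagged. The same bug class in miniature with the raw CUDA runtime API (a sketch with hypothetical buffers, not part of the patch):

#include <cstdint>

#include <cuda_runtime.h>

int main() {
  const size_t bytes = 256 * sizeof(int64_t);
  int64_t *dev_ids = nullptr;
  int64_t *dev_rows = nullptr;
  cudaMalloc(&dev_ids, bytes);
  cudaMalloc(&dev_rows, bytes);

  // Wrong (the old call's shape): the destination is device memory, but
  // the transfer is declared as if dev_rows lived on the host.
  // cudaMemcpy(dev_rows, dev_ids, bytes, cudaMemcpyDeviceToHost);

  // Right (the fix): both pointers are device memory, so declare a
  // device-to-device transfer.
  cudaMemcpy(dev_rows, dev_ids, bytes, cudaMemcpyDeviceToDevice);

  cudaFree(dev_ids);
  cudaFree(dev_rows);
  return 0;
}
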
diff --git a/paddle/fluid/operators/sgd_op.cu b/paddle/fluid/operators/sgd_op.cu
index 4722be7a666d3e8f3c25c9499f88ddda835f60e3..9190c7720815e8e8b8efbfb8e62bfff701e4874f 100644
--- a/paddle/fluid/operators/sgd_op.cu
+++ b/paddle/fluid/operators/sgd_op.cu
@@ -89,7 +89,7 @@ class SGDOpCUDAKernel : public framework::OpKernel<T> {
       PADDLE_ENFORCE_EQ(in_height, out_dims[0]);
 
       auto& in_value = grad->value();
-      framework::Vector<int64_t> in_rows(grad->rows());
+      auto& in_rows = grad->rows();
 
       int64_t in_row_numel = in_value.numel() / in_rows.size();
       PADDLE_ENFORCE_EQ(in_row_numel, param_out->numel() / in_height);
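
Binding in_rows by reference avoids constructing a whole new framework::Vector (and any host/device traffic that copy implies) just to read the row indices. The same pitfall in miniature, using only the standard library; SelectedRowsLike is an illustrative stand-in, not a Paddle type:

#include <cassert>
#include <cstdint>
#include <vector>

// Illustrative stand-in for a type that exposes rows() by const reference.
struct SelectedRowsLike {
  std::vector<int64_t> rows_;
  const std::vector<int64_t>& rows() const { return rows_; }
};

int main() {
  SelectedRowsLike grad{{0, 2, 5}};

  std::vector<int64_t> copied(grad.rows());  // deep copy, like the old code
  const auto& aliased = grad.rows();         // reference, like the new code

  assert(copied.data() != grad.rows().data());   // separate storage
  assert(aliased.data() == grad.rows().data());  // no copy made
  return 0;
}
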