未验证 提交 0be1582d 编写于 作者: Y Yu Yang 提交者: GitHub

Merge pull request #13525 from reyoung/fix_mixed_vector

Fix mixed vector
...@@ -20,79 +20,37 @@ namespace paddle { ...@@ -20,79 +20,37 @@ namespace paddle {
namespace framework { namespace framework {
namespace details { namespace details {
// Change it to thread safe flags if needed. template <class T>
class ThreadUnsafeOwnershipFlags { class COWPtr {
public: public:
explicit ThreadUnsafeOwnershipFlags(bool flag) : flag_(flag) {} typedef std::shared_ptr<T> RefPtr;
ThreadUnsafeOwnershipFlags(const ThreadUnsafeOwnershipFlags& other) = delete;
ThreadUnsafeOwnershipFlags& operator=(
const ThreadUnsafeOwnershipFlags& other) = delete;
ThreadUnsafeOwnershipFlags(ThreadUnsafeOwnershipFlags&& other) = default;
void SetOwnership(bool flag) { flag_ = flag; }
// Invoke the callback if it is not owned.
template <typename Callback>
void AcquireOwnershipOnce(Callback acquire) {
if (!flag_) {
acquire();
flag_ = true;
}
}
private: private:
bool flag_; RefPtr m_sp;
};
// Copy-On-Write pointer.
// It will hold a T* pointer, and only copy once when `MutableData` is invoked.
//
// The template parameter OwnershipFlags should have:
// * a constructor takes a bool. True if own.
// * SetOwnership(bool flag).
// * AcquireOwnershipOnce(Callback). It will invoke the callback if it is not
// owned.
//
// https://en.wikipedia.org/wiki/Copy-on-write
template <typename T, typename OwnershipFlags = ThreadUnsafeOwnershipFlags>
class COWPtr {
public: public:
// Ctor from raw pointer. COWPtr() : m_sp(nullptr) {}
explicit COWPtr(T* ptr) : payload_(ptr), ownership_{true} {} explicit COWPtr(T* t) : m_sp(t) {}
// Move methods. Steal ownership from origin const T& Data() const { return *m_sp; }
COWPtr(COWPtr&& other)
: payload_(other.payload_), ownership_{std::move(other.ownership_)} {}
COWPtr& operator=(COWPtr&& origin) = default;
// Copy methods. Not own payload
COWPtr(const COWPtr& other) : payload_(other.payload_), ownership_{false} {}
COWPtr& operator=(const COWPtr& other) {
payload_ = other.payload_;
ownership_.SetOwnership(false);
return *this;
}
// Access read only data.
const T& Data() const { return *payload_; }
// Access mutable data. If the data is not owned, the data will be copied
// before.
T* MutableData() { T* MutableData() {
ownership_.AcquireOwnershipOnce( DetachIfNotUnique();
[this] { payload_.reset(new T(*payload_)); }); return m_sp.get();
return payload_.get();
} }
private: void DetachIfNotUnique() {
// Actual data pointer. T* tmp = m_sp.get();
std::shared_ptr<T> payload_; if (!(tmp == nullptr || m_sp.unique())) {
Detach();
}
}
// Ownership flag. void Detach() {
OwnershipFlags ownership_; T* tmp = m_sp.get();
m_sp = RefPtr(new T(*tmp));
}
}; };
} // namespace details } // namespace details
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -30,6 +30,14 @@ TEST(COWPtr, all) { ...@@ -30,6 +30,14 @@ TEST(COWPtr, all) {
ASSERT_EQ(ptr2.Data(), 10); ASSERT_EQ(ptr2.Data(), 10);
} }
// Writing through the handle that was copied FROM must not be visible
// through the handle that was copied TO.
TEST(COWPtr, change_old) {
  COWPtr<int> original(new int{0});
  COWPtr<int> snapshot = original;
  // Mutate via the source handle, after the copy has been taken.
  *original.MutableData() = 10;
  ASSERT_EQ(snapshot.Data(), 0);
  ASSERT_EQ(original.Data(), 10);
}
} // namespace details } // namespace details
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -17,10 +17,13 @@ ...@@ -17,10 +17,13 @@
#include <algorithm> #include <algorithm>
#include <initializer_list> #include <initializer_list>
#include <memory> #include <memory>
#include <mutex> // NOLINT
#include <utility>
#include <vector> #include <vector>
#include "paddle/fluid/framework/details/cow_ptr.h"
#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/memory/memcpy.h"
#include "glog/logging.h" #include "glog/logging.h"
...@@ -28,173 +31,167 @@ namespace paddle { ...@@ -28,173 +31,167 @@ namespace paddle {
namespace framework { namespace framework {
#if defined(PADDLE_WITH_CUDA) #if defined(PADDLE_WITH_CUDA)
// Vector<T> implements the std::vector interface, and can get Data or namespace details {
// MutableData from any place. The data will be synced implicitly inside. struct CUDABuffer {
template <typename T> void *data_{nullptr};
class Vector { size_t size_{0};
public: platform::CUDAPlace place_;
using value_type = T;
// Default ctor. Create empty Vector CUDABuffer() {}
Vector() { InitEmpty(); } CUDABuffer(platform::Place place, size_t size)
: size_(size), place_(boost::get<platform::CUDAPlace>(place)) {
// Fill vector with value. The vector size is `count`. data_ = memory::Alloc(place_, size);
explicit Vector(size_t count, const T &value = T()) {
InitEmpty();
if (count != 0) {
resize(count);
T *ptr = begin();
for (size_t i = 0; i < count; ++i) {
ptr[i] = value;
}
}
} }
// Ctor with init_list ~CUDABuffer() { ClearMemory(); }
Vector(std::initializer_list<T> init) {
if (init.size() == 0) { CUDABuffer(const CUDABuffer &o) = delete;
InitEmpty(); CUDABuffer &operator=(const CUDABuffer &o) = delete;
} else {
InitByIter(init.size(), init.begin(), init.end()); void Resize(platform::Place place, size_t size) {
ClearMemory();
place_ = boost::get<platform::CUDAPlace>(place);
data_ = memory::Alloc(place_, size);
PADDLE_ENFORCE_NOT_NULL(data_);
size_ = size;
} }
void Swap(CUDABuffer &o) {
std::swap(data_, o.data_);
std::swap(place_, o.place_);
std::swap(size_, o.size_);
} }
// implicit cast from std::vector. private:
template <typename U> void ClearMemory() const {
Vector(const std::vector<U> &dat) { // NOLINT if (data_ != nullptr) {
if (dat.size() == 0) { memory::Free(place_, data_);
InitEmpty();
} else {
InitByIter(dat.size(), dat.begin(), dat.end());
} }
} }
};
} // namespace details
// Copy ctor // Vector<T> implements the std::vector interface, and can get Data or
Vector(const Vector<T> &other) { this->operator=(other); } // MutableData from any place. The data will be synced implicitly inside.
template <typename T>
class Vector {
public:
using value_type = T;
using iterator = typename std::vector<T>::iterator;
using const_iterator = typename std::vector<T>::const_iterator;
// Copy operator private:
Vector<T> &operator=(const Vector<T> &other) { // The actual class to implement vector logic
if (other.size() != 0) { class VectorData {
this->InitByIter(other.size(), other.begin(), other.end()); public:
} else { VectorData() : flag_(kDataInCPU) {}
InitEmpty(); VectorData(size_t count, const T &value)
} : cpu_(count, value), flag_(kDataInCPU) {}
return *this; VectorData(std::initializer_list<T> init) : cpu_(init), flag_(kDataInCPU) {}
} template <typename U>
explicit VectorData(const std::vector<U> &dat)
: cpu_(dat), flag_(kDataInCPU) {}
~VectorData() {}
// Move ctor VectorData(const VectorData &o) {
Vector(Vector<T> &&other) { o.ImmutableCPU();
this->size_ = other.size_; cpu_ = o.cpu_;
this->flag_ = other.flag_; flag_ = kDataInCPU;
if (other.cuda_vec_.memory_size()) {
this->cuda_vec_.ShareDataWith(other.cuda_vec_);
}
if (other.cpu_vec_.memory_size()) {
this->cpu_vec_.ShareDataWith(other.cpu_vec_);
} }
VectorData &operator=(const VectorData &o) {
o.ImmutableCPU();
cpu_ = o.cpu_;
flag_ = kDataInCPU;
details::CUDABuffer null;
gpu_.Swap(null);
return *this;
} }
// CPU data access method. Mutable.
T &operator[](size_t i) { T &operator[](size_t i) {
MutableCPU(); MutableCPU();
return const_cast<T *>(cpu_vec_.data<T>())[i]; return cpu_[i];
} }
// CPU data access method. Immutable.
const T &operator[](size_t i) const { const T &operator[](size_t i) const {
ImmutableCPU(); ImmutableCPU();
return cpu_vec_.data<T>()[i]; return cpu_[i];
} }
// std::vector iterator methods. Based on CPU data access method size_t size() const { return cpu_.size(); }
size_t size() const { return size_; }
T *begin() { return capacity() == 0 ? &EmptyDummy() : &this->operator[](0); } iterator begin() {
MutableCPU();
return cpu_.begin();
}
T *end() { iterator end() {
return capacity() == 0 ? &EmptyDummy() : &this->operator[](size()); MutableCPU();
return cpu_.end();
} }
T &front() { return *begin(); } T &front() {
MutableCPU();
return cpu_.front();
}
T &back() { T &back() {
auto it = end(); MutableCPU();
--it; return cpu_.back();
return *it;
} }
const T *begin() const { const_iterator begin() const {
return capacity() == 0 ? &EmptyDummy() : &this->operator[](0); ImmutableCPU();
return cpu_.begin();
} }
const T *end() const { const_iterator end() const {
return capacity() == 0 ? &EmptyDummy() : &this->operator[](size()); ImmutableCPU();
return cpu_.end();
} }
const T *cbegin() const { return begin(); }
const T *cend() const { return end(); }
const T &back() const { const T &back() const {
auto it = end(); ImmutableCPU();
--it; return cpu_.back();
return *it;
} }
T *data() { return begin(); } T *data() { return &(*this)[0]; }
const T *data() const { return begin(); } const T *data() const { return &(*this)[0]; }
const T &front() const { return *begin(); } const T &front() const {
// end of std::vector iterator methods ImmutableCPU();
return cpu_.front();
}
// assign this from iterator. // assign this from iterator.
// NOTE: the iterator must support `end-begin` // NOTE: the iterator must support `end-begin`
template <typename Iter> template <typename Iter>
void assign(Iter begin, Iter end) { void assign(Iter begin, Iter end) {
InitByIter(end - begin, begin, end); MutableCPU();
cpu_.assign(begin, end);
} }
// push_back. If the previous capacity is not enough, the memory will // push_back. If the previous capacity is not enough, the memory will
// double. // double.
void push_back(T elem) { void push_back(T elem) {
if (size_ + 1 > capacity()) { MutableCPU();
reserve((size_ + 1) << 1); cpu_.push_back(elem);
}
*end() = elem;
++size_;
} }
// extend a vector by iterator. // extend a vector by iterator.
// NOTE: the iterator must support end-begin // NOTE: the iterator must support end-begin
template <typename It> template <typename It>
void Extend(It begin, It end) { void Extend(It begin, It end) {
size_t pre_size = size_; MutableCPU();
resize(pre_size + (end - begin)); auto out_it = std::back_inserter<std::vector<T>>(this->cpu_);
T *ptr = this->begin() + pre_size; std::copy(begin, end, out_it);
for (; begin < end; ++begin, ++ptr) {
*ptr = *begin;
}
} }
// resize the vector // resize the vector
void resize(size_t size) { void resize(size_t size) {
if (size + 1 <= capacity()) {
size_ = size;
} else {
MutableCPU(); MutableCPU();
Tensor cpu_tensor; cpu_.resize(size);
platform::Place cpu = platform::CPUPlace();
T *ptr = cpu_tensor.mutable_data<T>(
framework::make_ddim({static_cast<int64_t>(size)}), cpu);
const T *old_ptr =
cpu_vec_.memory_size() == 0 ? nullptr : cpu_vec_.data<T>();
if (old_ptr != nullptr) {
std::copy(old_ptr, old_ptr + size_, ptr);
}
size_ = size;
cpu_vec_.ShareDataWith(cpu_tensor);
}
} }
// get cuda ptr. immutable // get cuda ptr. immutable
...@@ -202,7 +199,7 @@ class Vector { ...@@ -202,7 +199,7 @@ class Vector {
PADDLE_ENFORCE(platform::is_gpu_place(place), PADDLE_ENFORCE(platform::is_gpu_place(place),
"CUDA Data must on CUDA place"); "CUDA Data must on CUDA place");
ImmutableCUDA(place); ImmutableCUDA(place);
return cuda_vec_.data<T>(); return reinterpret_cast<T *>(gpu_.data_);
} }
// get cuda ptr. mutable // get cuda ptr. mutable
...@@ -214,77 +211,39 @@ class Vector { ...@@ -214,77 +211,39 @@ class Vector {
// clear // clear
void clear() { void clear() {
size_ = 0; cpu_.clear();
flag_ = kDirty | kDataInCPU; flag_ = kDirty | kDataInCPU;
} }
size_t capacity() const { size_t capacity() const { return cpu_.capacity(); }
return cpu_vec_.memory_size() / SizeOfType(typeid(T));
}
// reserve data // reserve data
void reserve(size_t size) { void reserve(size_t size) const { cpu_.reserve(size); }
size_t pre_size = size_;
resize(size);
resize(pre_size);
}
// the unify method to access CPU or CUDA data. immutable.
const T *Data(platform::Place place) const {
if (platform::is_gpu_place(place)) {
return CUDAData(place);
} else {
return data();
}
}
// the unify method to access CPU or CUDA data. mutable.
T *MutableData(platform::Place place) {
if (platform::is_gpu_place(place)) {
return CUDAMutableData(place);
} else {
return data();
}
}
// implicit cast operator. Vector can be cast to std::vector implicitly. // implicit cast operator. Vector can be cast to std::vector implicitly.
operator std::vector<T>() const { operator std::vector<T>() const {
std::vector<T> result; ImmutableCPU();
result.resize(size()); return cpu_;
std::copy(begin(), end(), result.begin());
return result;
} }
bool operator==(const Vector<T> &other) const { bool operator==(const VectorData &other) const {
if (size() != other.size()) return false; ImmutableCPU();
auto it1 = cbegin(); other.ImmutableCPU();
auto it2 = other.cbegin(); return cpu_ == other.cpu_;
for (; it1 < cend(); ++it1, ++it2) {
if (*it1 != *it2) {
return false;
}
}
return true;
} }
private: std::mutex &Mutex() const { return mtx_; }
void InitEmpty() {
size_ = 0;
flag_ = kDataInCPU;
}
template <typename Iter> std::unique_ptr<platform::CUDAPlace> CUDAPlace() const {
void InitByIter(size_t size, Iter begin, Iter end) { if (gpu_.data_ == nullptr) {
platform::Place cpu = platform::CPUPlace(); return nullptr;
T *ptr = this->cpu_vec_.template mutable_data<T>( } else {
framework::make_ddim({static_cast<int64_t>(size)}), cpu); return std::unique_ptr<platform::CUDAPlace>(
for (size_t i = 0; i < size; ++i) { new platform::CUDAPlace(gpu_.place_));
*ptr++ = *begin++; }
}
flag_ = kDataInCPU | kDirty;
size_ = size;
} }
private:
enum DataFlag { enum DataFlag {
kDataInCPU = 0x01, kDataInCPU = 0x01,
kDataInCUDA = 0x02, kDataInCUDA = 0x02,
...@@ -294,8 +253,15 @@ class Vector { ...@@ -294,8 +253,15 @@ class Vector {
void CopyToCPU() const { void CopyToCPU() const {
// COPY GPU Data To CPU // COPY GPU Data To CPU
TensorCopy(cuda_vec_, platform::CPUPlace(), &cpu_vec_); auto *dev_ctx = static_cast<platform::CUDADeviceContext *>(
WaitPlace(cuda_vec_.place()); platform::DeviceContextPool::Instance().Get(
platform::Place(gpu_.place_)));
auto stream = dev_ctx->stream();
void *src = gpu_.data_;
void *dst = cpu_.data();
memory::Copy(platform::CPUPlace(), dst, gpu_.place_, src, gpu_.size_,
stream);
dev_ctx->Wait();
} }
void MutableCPU() { void MutableCPU() {
...@@ -308,16 +274,12 @@ class Vector { ...@@ -308,16 +274,12 @@ class Vector {
void ImmutableCUDA(platform::Place place) const { void ImmutableCUDA(platform::Place place) const {
if (IsDirty()) { if (IsDirty()) {
if (IsInCPU()) { if (IsInCPU()) {
TensorCopy(cpu_vec_, boost::get<platform::CUDAPlace>(place), CopyCPUDataToCUDA(place);
&cuda_vec_);
WaitPlace(place);
UnsetFlag(kDirty); UnsetFlag(kDirty);
SetFlag(kDataInCUDA); SetFlag(kDataInCUDA);
} else if (IsInCUDA() && !(place == cuda_vec_.place())) { } else if (IsInCUDA() &&
framework::Tensor tmp; !(boost::get<platform::CUDAPlace>(place) == gpu_.place_)) {
TensorCopy(cuda_vec_, boost::get<platform::CUDAPlace>(place), &tmp); PADDLE_THROW("This situation should not happen");
WaitPlace(cuda_vec_.place());
cuda_vec_.ShareDataWith(tmp);
// Still dirty // Still dirty
} else { } else {
// Dirty && DataInCUDA && Device is same // Dirty && DataInCUDA && Device is same
...@@ -326,17 +288,10 @@ class Vector { ...@@ -326,17 +288,10 @@ class Vector {
} else { } else {
if (!IsInCUDA()) { if (!IsInCUDA()) {
// Even data is not dirty. However, data is not in CUDA. Copy data. // Even data is not dirty. However, data is not in CUDA. Copy data.
TensorCopy(cpu_vec_, boost::get<platform::CUDAPlace>(place), CopyCPUDataToCUDA(place);
&cuda_vec_);
WaitPlace(place);
SetFlag(kDataInCUDA); SetFlag(kDataInCUDA);
} else if (!(place == cuda_vec_.place())) { } else if (!(boost::get<platform::CUDAPlace>(place) == gpu_.place_)) {
framework::Tensor tmp; PADDLE_THROW("This situation should not happen.");
WaitPlace(cuda_vec_.place());
TensorCopy(cuda_vec_, boost::get<platform::CUDAPlace>(place), &tmp);
WaitPlace(cuda_vec_.place());
WaitPlace(place);
cuda_vec_.ShareDataWith(tmp);
} else { } else {
// Not Dirty && DataInCUDA && Device is same // Not Dirty && DataInCUDA && Device is same
// Do nothing. // Do nothing.
...@@ -344,9 +299,20 @@ class Vector { ...@@ -344,9 +299,20 @@ class Vector {
} }
} }
// Uploads the CPU-side elements into the CUDA buffer for `place`.
// Steps, in order:
//   1. take the address of the CPU storage,
//   2. (re)size the CUDA buffer to cpu_.size() * sizeof(T) bytes on `place`,
//   3. fetch the CUDA device context registered for `place` and its stream,
//   4. enqueue a CPU -> CUDA copy of gpu_.size_ bytes on that stream.
// This method does not touch flag_; callers update the flags themselves.
// NOTE(review): no Wait() is issued on the device context after the copy is
// enqueued — presumably consumers run on the same stream; confirm that no
// caller reads the buffer from a different stream.
void CopyCPUDataToCUDA(const platform::Place &place) const {
void *src = cpu_.data();
// Byte size is derived from the current CPU element count.
gpu_.Resize(place, cpu_.size() * sizeof(T));
void *dst = gpu_.data_;
auto *dev_ctx = static_cast<platform::CUDADeviceContext *>(
platform::DeviceContextPool::Instance().Get(place));
auto stream = dev_ctx->stream();
memory::Copy(gpu_.place_, dst, platform::CPUPlace(), src, gpu_.size_,
stream);
}
void ImmutableCPU() const { void ImmutableCPU() const {
if (IsDirty() && if (IsDirty() && !IsInCPU()) { // If data has been changed in CUDA, or
!IsInCPU()) { // If data has been changed in CUDA, or CPU has no data. // CPU has no data.
CopyToCPU(); CopyToCPU();
UnsetFlag(kDirty); UnsetFlag(kDirty);
} }
...@@ -362,23 +328,178 @@ class Vector { ...@@ -362,23 +328,178 @@ class Vector {
bool IsInCPU() const { return flag_ & kDataInCPU; } bool IsInCPU() const { return flag_ & kDataInCPU; }
static void WaitPlace(const platform::Place place) { mutable std::vector<T> cpu_;
mutable details::CUDABuffer gpu_;
mutable int flag_;
mutable std::mutex mtx_;
};
public:
// Default ctor. Creates an empty Vector whose handle owns a freshly
// allocated, default-constructed VectorData.
Vector() : m_(new VectorData()) {}
// Fill ctor. `count` and `value` are forwarded to a new VectorData;
// `value` defaults to T().
explicit Vector(size_t count, const T &value = T())
: m_(new VectorData(count, value)) {}
// Ctor with init_list. The list is forwarded to a new VectorData.
Vector(std::initializer_list<T> init) : m_(new VectorData(init)) {}
// Implicit cast from std::vector<U>. Deliberately not `explicit` (hence the
// NOLINT); the vector is forwarded to a new VectorData.
template <typename U>
Vector(const std::vector<U> &dat) : m_(new VectorData(dat)) { // NOLINT
}
// Copy ctor
Vector(const Vector<T> &other) { m_ = other.m_; }
// Copy assignment. Rebinds this Vector's handle to `other`'s handle; the
// handle previously held by this Vector is released. Returns *this.
Vector<T> &operator=(const Vector<T> &other) {
m_ = other.m_;
return *this;
}
// Move ctor. Takes over `other`'s handle via std::move instead of copying it.
Vector(Vector<T> &&other) : m_(std::move(other.m_)) {}
// CPU data access method. Mutable.
// Goes through m_.MutableData(), i.e. the write path of the handle.
T &operator[](size_t i) { return (*m_.MutableData())[i]; }
// CPU data access method. Immutable.
// Goes through m_.Data(), i.e. the read-only path of the handle.
const T &operator[](size_t i) const { return m_.Data()[i]; }
// std::vector iterator methods. Based on CPU data access method
// Convention for everything below: non-const overloads use
// m_.MutableData(), const overloads use m_.Data().
size_t size() const { return m_.Data().size(); }
iterator begin() { return m_.MutableData()->begin(); }
iterator end() { return m_.MutableData()->end(); }
T &front() { return m_.MutableData()->front(); }
T &back() { return m_.MutableData()->back(); }
const_iterator begin() const { return m_.Data().begin(); }
const_iterator end() const { return m_.Data().end(); }
// cbegin()/cend() simply forward to the const begin()/end() overloads.
const_iterator cbegin() const { return begin(); }
const_iterator cend() const { return end(); }
const T &back() const { return m_.Data().back(); }
T *data() { return m_.MutableData()->data(); }
const T *data() const { return m_.Data().data(); }
const T &front() const { return m_.Data().front(); }
// end of std::vector iterator methods
// Replace the contents with the range [begin, end).
// NOTE: the iterator must support `end-begin`
template <typename Iter>
void assign(Iter begin, Iter end) {
  // Obtain the writable payload first, then hand the range over to it.
  auto *payload = m_.MutableData();
  payload->assign(begin, end);
}
// push_back. Appends `elem` by delegating to the writable payload's
// push_back().
// NOTE(review): the former wording "the memory will double" described a
// hand-written growth policy; presumably growth is now decided by the
// payload's underlying container — confirm before relying on it.
void push_back(T elem) { m_.MutableData()->push_back(elem); }
// Append the range [begin, end) after the current last element.
// NOTE: the iterator must support end-begin
template <typename It>
void Extend(It begin, It end) {
  // Obtain the writable payload first, then let it do the appending.
  auto *payload = m_.MutableData();
  payload->Extend(begin, end);
}
// Resize to `size` elements. If the size already matches, return before the
// write path (MutableData) is ever touched.
void resize(size_t size) {
  const bool already_sized = (m_.Data().size() == size);
  if (already_sized) return;
  m_.MutableData()->resize(size);
}
// get cuda ptr. immutable
// Returns a read-only CUDA pointer to the elements for `place`.
//
// Fast path (runs while holding the payload's mutex): when the payload
// reports no CUDA place yet (CUDAPlace() yields nullptr) or reports exactly
// the requested CUDA place, the payload itself serves the request.
//
// Slow path: the payload reports a different CUDA place. The mutex is
// released at the end of the inner scope, this Vector's handle is detached
// onto its own copy of the payload, and the call is retried.
// NOTE(review): the retry presumably hits the fast path because a freshly
// copied payload has no CUDA buffer — verify against VectorData's copy ctor.
// NOTE(review): boost::get<platform::CUDAPlace>(place) is applied without a
// prior is_gpu_place check here — presumably callers only pass CUDA places.
const T *CUDAData(platform::Place place) const {
{
auto &mtx = m_.Data().Mutex();
std::lock_guard<std::mutex> guard(mtx);
auto cuda_place = m_.Data().CUDAPlace();
if (cuda_place == nullptr ||
*cuda_place == boost::get<platform::CUDAPlace>(place)) {
return m_.Data().CUDAData(place);
}
}
// If m_ contains CUDAData in a different place. Detach manually.
m_.Detach();
return CUDAData(place);
}
// get cuda ptr. mutable
// Returns a writable CUDA pointer to the elements for `place`.
//
// Fast path (runs while holding the mutex of the payload seen on entry):
// when that payload reports no CUDA place yet (CUDAPlace() yields nullptr)
// or reports exactly the requested CUDA place, the request is served through
// the handle's write path, m_.MutableData().
// NOTE(review): the mutex locked here belongs to the payload observed via
// m_.Data(); if MutableData() switches the handle to a new payload, the
// pointer is produced by an object whose own mutex is not the one held —
// confirm this is intended.
//
// Slow path: the payload reports a different CUDA place. The mutex is
// released at the end of the inner scope, the handle is detached onto its
// own copy of the payload, and the call is retried.
T *CUDAMutableData(platform::Place place) {
{
auto &mtx = m_.Data().Mutex();
std::lock_guard<std::mutex> guard(mtx);
auto cuda_place = m_.Data().CUDAPlace();
if (cuda_place == nullptr ||
*cuda_place == boost::get<platform::CUDAPlace>(place)) {
return m_.MutableData()->CUDAMutableData(place);
}
}
// If m_ contains CUDAData in a different place. Detach manually.
m_.Detach();
return CUDAMutableData(place);
}
// clear
// Empties the vector through the handle's write path (m_.MutableData()).
void clear() { m_.MutableData()->clear(); }
// Reports the payload's capacity through the read-only path (m_.Data()).
size_t capacity() const { return m_.Data().capacity(); }
// reserve data
// Requests capacity for at least `size` elements.
// The request goes through the handle's write path (m_.MutableData()), like
// every other mutating member. Reserving through the read-only path would let
// the underlying storage be reallocated while other Vector copies still refer
// to it, invalidating pointers and iterators they obtained earlier.
void reserve(size_t size) { m_.MutableData()->reserve(size); }
// Unified read-only accessor: a GPU place is served by CUDAData(place), any
// other place by the CPU pointer from data().
const T *Data(platform::Place place) const {
  return platform::is_gpu_place(place) ? CUDAData(place) : data();
}
// the unify method to access CPU or CUDA data. mutable.
T *MutableData(platform::Place place) {
if (platform::is_gpu_place(place)) { if (platform::is_gpu_place(place)) {
platform::DeviceContextPool::Instance() return CUDAMutableData(place);
.Get(boost::get<platform::CUDAPlace>(place)) } else {
->Wait(); return data();
} }
} }
static T &EmptyDummy() { // implicit cast operator. Vector can be cast to std::vector implicitly.
static T dummy = T(); operator std::vector<T>() const { return m_.Data(); }
return dummy;
// Two Vectors are equal when they have the same size and no pair of
// corresponding elements compares unequal via operator!=.
bool operator==(const Vector<T> &other) const {
  if (size() != other.size()) return false;
  // The predicate keeps the element comparison expressed through `!=`.
  return std::equal(
      cbegin(), cend(), other.cbegin(),
      [](const T &lhs, const T &rhs) { return !(lhs != rhs); });
}
mutable int flag_; const void *Handle() const { return &m_.Data(); }
mutable Tensor cpu_vec_;
mutable Tensor cuda_vec_; private:
size_t size_; // Vector is an COW object.
mutable details::COWPtr<VectorData> m_;
}; };
#else // PADDLE_WITH_CUDA #else // PADDLE_WITH_CUDA
......
...@@ -76,8 +76,8 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> { ...@@ -76,8 +76,8 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
auto ap_type = GetAPType(ctx.Attr<std::string>("ap_type")); auto ap_type = GetAPType(ctx.Attr<std::string>("ap_type"));
int class_num = ctx.Attr<int>("class_num"); int class_num = ctx.Attr<int>("class_num");
auto label_lod = in_label->lod(); auto& label_lod = in_label->lod();
auto detect_lod = in_detect->lod(); auto& detect_lod = in_detect->lod();
PADDLE_ENFORCE_EQ(label_lod.size(), 1UL, PADDLE_ENFORCE_EQ(label_lod.size(), 1UL,
"Only support one level sequence now."); "Only support one level sequence now.");
PADDLE_ENFORCE_EQ(label_lod[0].size(), detect_lod[0].size(), PADDLE_ENFORCE_EQ(label_lod[0].size(), detect_lod[0].size(),
...@@ -166,11 +166,11 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> { ...@@ -166,11 +166,11 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
auto labels = framework::EigenTensor<T, 2>::From(input_label); auto labels = framework::EigenTensor<T, 2>::From(input_label);
auto detect = framework::EigenTensor<T, 2>::From(input_detect); auto detect = framework::EigenTensor<T, 2>::From(input_detect);
auto label_lod = input_label.lod(); auto& label_lod = input_label.lod();
auto detect_lod = input_detect.lod(); auto& detect_lod = input_detect.lod();
int batch_size = label_lod[0].size() - 1; int batch_size = label_lod[0].size() - 1;
auto label_index = label_lod[0]; auto& label_index = label_lod[0];
for (int n = 0; n < batch_size; ++n) { for (int n = 0; n < batch_size; ++n) {
std::map<int, std::vector<Box>> boxes; std::map<int, std::vector<Box>> boxes;
...@@ -274,7 +274,6 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> { ...@@ -274,7 +274,6 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
output_true_pos->set_lod(true_pos_lod); output_true_pos->set_lod(true_pos_lod);
output_false_pos->set_lod(false_pos_lod); output_false_pos->set_lod(false_pos_lod);
return;
} }
void GetInputPos(const framework::Tensor& input_pos_count, void GetInputPos(const framework::Tensor& input_pos_count,
...@@ -292,7 +291,7 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> { ...@@ -292,7 +291,7 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
auto SetData = [](const framework::LoDTensor& pos_tensor, auto SetData = [](const framework::LoDTensor& pos_tensor,
std::map<int, std::vector<std::pair<T, int>>>& pos) { std::map<int, std::vector<std::pair<T, int>>>& pos) {
const T* pos_data = pos_tensor.data<T>(); const T* pos_data = pos_tensor.data<T>();
auto pos_data_lod = pos_tensor.lod()[0]; auto& pos_data_lod = pos_tensor.lod()[0];
for (size_t i = 0; i < pos_data_lod.size() - 1; ++i) { for (size_t i = 0; i < pos_data_lod.size() - 1; ++i) {
for (size_t j = pos_data_lod[i]; j < pos_data_lod[i + 1]; ++j) { for (size_t j = pos_data_lod[i]; j < pos_data_lod[i + 1]; ++j) {
T score = pos_data[j * 2]; T score = pos_data[j * 2];
...@@ -317,20 +316,23 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> { ...@@ -317,20 +316,23 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
std::map<int, std::vector<std::pair<T, int>>>* false_pos) const { std::map<int, std::vector<std::pair<T, int>>>* false_pos) const {
int batch_size = gt_boxes.size(); int batch_size = gt_boxes.size();
for (int n = 0; n < batch_size; ++n) { for (int n = 0; n < batch_size; ++n) {
auto image_gt_boxes = gt_boxes[n]; auto& image_gt_boxes = gt_boxes[n];
for (auto it = image_gt_boxes.begin(); it != image_gt_boxes.end(); ++it) { for (auto& image_gt_box : image_gt_boxes) {
size_t count = 0; size_t count = 0;
auto labeled_bboxes = it->second; auto& labeled_bboxes = image_gt_box.second;
if (evaluate_difficult) { if (evaluate_difficult) {
count = labeled_bboxes.size(); count = labeled_bboxes.size();
} else { } else {
for (size_t i = 0; i < labeled_bboxes.size(); ++i) for (auto& box : labeled_bboxes) {
if (!(labeled_bboxes[i].is_difficult)) ++count; if (!box.is_difficult) {
++count;
}
}
} }
if (count == 0) { if (count == 0) {
continue; continue;
} }
int label = it->first; int label = image_gt_box.first;
if (label_pos_count->find(label) == label_pos_count->end()) { if (label_pos_count->find(label) == label_pos_count->end()) {
(*label_pos_count)[label] = count; (*label_pos_count)[label] = count;
} else { } else {
......
...@@ -50,7 +50,7 @@ class ExtractRowsOp : public framework::OperatorBase { ...@@ -50,7 +50,7 @@ class ExtractRowsOp : public framework::OperatorBase {
auto &in = scope.FindVar(Input("X"))->Get<framework::SelectedRows>(); auto &in = scope.FindVar(Input("X"))->Get<framework::SelectedRows>();
auto out = scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensor>(); auto out = scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensor>();
auto in_rows = in.rows(); auto &in_rows = in.rows();
auto out_dim = framework::make_ddim( auto out_dim = framework::make_ddim(
std::vector<int64_t>{static_cast<int64_t>(in_rows.size()), 1}); std::vector<int64_t>{static_cast<int64_t>(in_rows.size()), 1});
auto dst_ptr = out->mutable_data<int64_t>(out_dim, in.place()); auto dst_ptr = out->mutable_data<int64_t>(out_dim, in.place());
......
...@@ -127,10 +127,8 @@ class LookupTableGradCUDAKernel : public framework::OpKernel<T> { ...@@ -127,10 +127,8 @@ class LookupTableGradCUDAKernel : public framework::OpKernel<T> {
auto gpu_place = boost::get<platform::CUDAPlace>(context.GetPlace()); auto gpu_place = boost::get<platform::CUDAPlace>(context.GetPlace());
// TODO(yuyang18): Strange code here. // TODO(yuyang18): Strange code here.
memory::Copy(platform::CPUPlace(), memory::Copy(gpu_place, new_rows.CUDAMutableData(context.GetPlace()),
new_rows.CUDAMutableData(context.GetPlace()), gpu_place, gpu_place, ids_data, ids_num * sizeof(int64_t), stream);
ids_data, ids_num * sizeof(int64_t), stream);
d_table->set_rows(new_rows); d_table->set_rows(new_rows);
auto *d_table_value = d_table->mutable_value(); auto *d_table_value = d_table->mutable_value();
......
...@@ -60,11 +60,9 @@ struct SelectedRowsAdd<platform::CUDADeviceContext, T> { ...@@ -60,11 +60,9 @@ struct SelectedRowsAdd<platform::CUDADeviceContext, T> {
auto out_place = context.GetPlace(); auto out_place = context.GetPlace();
PADDLE_ENFORCE(platform::is_gpu_place(out_place)); PADDLE_ENFORCE(platform::is_gpu_place(out_place));
memory::Copy( memory::Copy(boost::get<platform::CUDAPlace>(out_place), out_data,
boost::get<platform::CUDAPlace>(out_place), out_data,
boost::get<platform::CUDAPlace>(in1_place), in1_data, boost::get<platform::CUDAPlace>(in1_place), in1_data,
in1_value.numel() * sizeof(T), in1_value.numel() * sizeof(T), context.stream());
reinterpret_cast<const platform::CUDADeviceContext&>(context).stream());
auto* in2_data = in2_value.data<T>(); auto* in2_data = in2_value.data<T>();
memory::Copy(boost::get<platform::CUDAPlace>(out_place), memory::Copy(boost::get<platform::CUDAPlace>(out_place),
...@@ -148,7 +146,7 @@ struct SelectedRowsAddTo<platform::CUDADeviceContext, T> { ...@@ -148,7 +146,7 @@ struct SelectedRowsAddTo<platform::CUDADeviceContext, T> {
auto in1_height = input1.height(); auto in1_height = input1.height();
PADDLE_ENFORCE_EQ(in1_height, input2->height()); PADDLE_ENFORCE_EQ(in1_height, input2->height());
framework::Vector<int64_t> in1_rows(input1.rows()); auto& in1_rows = input1.rows();
auto& in2_rows = *(input2->mutable_rows()); auto& in2_rows = *(input2->mutable_rows());
auto& in1_value = input1.value(); auto& in1_value = input1.value();
......
...@@ -89,7 +89,7 @@ class SGDOpCUDAKernel : public framework::OpKernel<T> { ...@@ -89,7 +89,7 @@ class SGDOpCUDAKernel : public framework::OpKernel<T> {
PADDLE_ENFORCE_EQ(in_height, out_dims[0]); PADDLE_ENFORCE_EQ(in_height, out_dims[0]);
auto& in_value = grad->value(); auto& in_value = grad->value();
framework::Vector<int64_t> in_rows(grad->rows()); auto& in_rows = grad->rows();
int64_t in_row_numel = in_value.numel() / in_rows.size(); int64_t in_row_numel = in_value.numel() / in_rows.size();
PADDLE_ENFORCE_EQ(in_row_numel, param_out->numel() / in_height); PADDLE_ENFORCE_EQ(in_row_numel, param_out->numel() / in_height);
......
...@@ -124,7 +124,6 @@ class SumKernel : public framework::OpKernel<T> { ...@@ -124,7 +124,6 @@ class SumKernel : public framework::OpKernel<T> {
out_value->Resize(framework::make_ddim(in_dim)); out_value->Resize(framework::make_ddim(in_dim));
out_value->mutable_data<T>(context.GetPlace()); out_value->mutable_data<T>(context.GetPlace());
// if all the input sparse vars are empty, no need to // if all the input sparse vars are empty, no need to
// merge these vars. // merge these vars.
if (first_dim == 0UL) { if (first_dim == 0UL) {
......
...@@ -345,7 +345,7 @@ class OpTest(unittest.TestCase): ...@@ -345,7 +345,7 @@ class OpTest(unittest.TestCase):
actual_t, expect_t, atol=atol, equal_nan=equal_nan), actual_t, expect_t, atol=atol, equal_nan=equal_nan),
"Output (" + out_name + ") has diff at " + str(place) + "Output (" + out_name + ") has diff at " + str(place) +
"\nExpect " + str(expect_t) + "\n" + "But Got" + "\nExpect " + str(expect_t) + "\n" + "But Got" +
str(actual_t)) str(actual_t) + " in class " + self.__class__.__name__)
if isinstance(expect, tuple): if isinstance(expect, tuple):
self.assertListEqual(actual.recursive_sequence_lengths(), self.assertListEqual(actual.recursive_sequence_lengths(),
expect[1], "Output (" + out_name + expect[1], "Output (" + out_name +
......
...@@ -20,6 +20,7 @@ import six ...@@ -20,6 +20,7 @@ import six
import sys import sys
import collections import collections
import math import math
import paddle.fluid as fluid
from op_test import OpTest from op_test import OpTest
...@@ -32,7 +33,7 @@ class TestDetectionMAPOp(OpTest): ...@@ -32,7 +33,7 @@ class TestDetectionMAPOp(OpTest):
self.detect = np.array(self.detect).astype('float32') self.detect = np.array(self.detect).astype('float32')
self.mAP = np.array(self.mAP).astype('float32') self.mAP = np.array(self.mAP).astype('float32')
if (len(self.class_pos_count) > 0): if len(self.class_pos_count) > 0:
self.class_pos_count = np.array(self.class_pos_count).astype( self.class_pos_count = np.array(self.class_pos_count).astype(
'int32') 'int32')
self.true_pos = np.array(self.true_pos).astype('float32') self.true_pos = np.array(self.true_pos).astype('float32')
...@@ -273,7 +274,7 @@ class TestDetectionMAPOp11Point(TestDetectionMAPOp): ...@@ -273,7 +274,7 @@ class TestDetectionMAPOp11Point(TestDetectionMAPOp):
class TestDetectionMAPOpMultiBatch(TestDetectionMAPOp): class TestDetectionMAPOpMultiBatch(TestDetectionMAPOp):
def init_test_case(self): def init_test_case(self):
super(TestDetectionMAPOpMultiBatch, self).init_test_case() super(TestDetectionMAPOpMultiBatch, self).init_test_case()
self.class_pos_count = [0, 2, 1] self.class_pos_count = [0, 2, 1, 0]
self.true_pos_lod = [[0, 3, 2]] self.true_pos_lod = [[0, 3, 2]]
self.true_pos = [[0.7, 1.], [0.3, 0.], [0.2, 1.], [0.8, 0.], [0.1, 1.]] self.true_pos = [[0.7, 1.], [0.3, 0.], [0.2, 1.], [0.8, 0.], [0.1, 1.]]
self.false_pos_lod = [[0, 3, 2]] self.false_pos_lod = [[0, 3, 2]]
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册