Unverified commit 02cf54d3, authored by Yan Chunwei, committed by GitHub

bugfix lod cpu performance (#12297)

Parent b41f8b9d
paddle/fluid/framework/mixed_vector.h
@@ -26,6 +26,7 @@
 namespace paddle {
 namespace framework {
 
+#if defined(PADDLE_WITH_CUDA)
 // Vector<T> implements the std::vector interface, and can get Data or
 // MutableData from any place. The data will be synced implicitly inside.
 template <typename T>
@@ -37,11 +38,11 @@ class Vector {
   Vector() { InitEmpty(); }
 
   // Fill vector with value. The vector size is `count`.
-  explicit Vector(size_t count, const T& value = T()) {
+  explicit Vector(size_t count, const T &value = T()) {
     InitEmpty();
     if (count != 0) {
       resize(count);
-      T* ptr = begin();
+      T *ptr = begin();
       for (size_t i = 0; i < count; ++i) {
         ptr[i] = value;
       }
@@ -59,7 +60,7 @@ class Vector {
 
   // implicit cast from std::vector.
   template <typename U>
-  Vector(const std::vector<U>& dat) {  // NOLINT
+  Vector(const std::vector<U> &dat) {  // NOLINT
     if (dat.size() == 0) {
       InitEmpty();
     } else {
@@ -68,10 +69,10 @@ class Vector {
   }
 
   // Copy ctor
-  Vector(const Vector<T>& other) { this->operator=(other); }
+  Vector(const Vector<T> &other) { this->operator=(other); }
 
   // Copy operator
-  Vector<T>& operator=(const Vector<T>& other) {
+  Vector<T> &operator=(const Vector<T> &other) {
     if (other.size() != 0) {
       this->InitByIter(other.size(), other.begin(), other.end());
     } else {
@@ -81,7 +82,7 @@ class Vector {
   }
 
   // Move ctor
-  Vector(Vector<T>&& other) {
+  Vector(Vector<T> &&other) {
     this->size_ = other.size_;
     this->flag_ = other.flag_;
     if (other.cuda_vec_.memory_size()) {
@@ -93,13 +94,13 @@ class Vector {
   }
 
   // CPU data access method. Mutable.
-  T& operator[](size_t i) {
+  T &operator[](size_t i) {
     MutableCPU();
-    return const_cast<T*>(cpu_vec_.data<T>())[i];
+    return const_cast<T *>(cpu_vec_.data<T>())[i];
   }
 
   // CPU data access method. Immutable.
-  const T& operator[](size_t i) const {
+  const T &operator[](size_t i) const {
     ImmutableCPU();
     return cpu_vec_.data<T>()[i];
   }
@@ -107,43 +108,43 @@ class Vector {
 
   // std::vector iterator methods. Based on CPU data access method
   size_t size() const { return size_; }
 
-  T* begin() { return capacity() == 0 ? &EmptyDummy() : &this->operator[](0); }
+  T *begin() { return capacity() == 0 ? &EmptyDummy() : &this->operator[](0); }
 
-  T* end() {
+  T *end() {
     return capacity() == 0 ? &EmptyDummy() : &this->operator[](size());
   }
 
-  T& front() { return *begin(); }
+  T &front() { return *begin(); }
 
-  T& back() {
+  T &back() {
     auto it = end();
     --it;
     return *it;
   }
 
-  const T* begin() const {
+  const T *begin() const {
     return capacity() == 0 ? &EmptyDummy() : &this->operator[](0);
   }
 
-  const T* end() const {
+  const T *end() const {
     return capacity() == 0 ? &EmptyDummy() : &this->operator[](size());
   }
 
-  const T* cbegin() const { return begin(); }
+  const T *cbegin() const { return begin(); }
 
-  const T* cend() const { return end(); }
+  const T *cend() const { return end(); }
 
-  const T& back() const {
+  const T &back() const {
     auto it = end();
     --it;
     return *it;
   }
 
-  T* data() { return begin(); }
+  T *data() { return begin(); }
 
-  const T* data() const { return begin(); }
+  const T *data() const { return begin(); }
 
-  const T& front() const { return *begin(); }
+  const T &front() const { return *begin(); }
   // end of std::vector iterator methods
 
   // assign this from iterator.
@@ -169,7 +170,7 @@ class Vector {
   void Extend(It begin, It end) {
     size_t pre_size = size_;
     resize(pre_size + (end - begin));
-    T* ptr = this->begin() + pre_size;
+    T *ptr = this->begin() + pre_size;
     for (; begin < end; ++begin, ++ptr) {
       *ptr = *begin;
     }
@@ -183,9 +184,9 @@ class Vector {
     MutableCPU();
     Tensor cpu_tensor;
     platform::Place cpu = platform::CPUPlace();
-    T* ptr = cpu_tensor.mutable_data<T>(
+    T *ptr = cpu_tensor.mutable_data<T>(
         framework::make_ddim({static_cast<int64_t>(size)}), cpu);
-    const T* old_ptr =
+    const T *old_ptr =
         cpu_vec_.memory_size() == 0 ? nullptr : cpu_vec_.data<T>();
     if (old_ptr != nullptr) {
       std::copy(old_ptr, old_ptr + size_, ptr);
@@ -196,7 +197,7 @@ class Vector {
   }
 
   // get cuda ptr. immutable
-  const T* CUDAData(platform::Place place) const {
+  const T *CUDAData(platform::Place place) const {
     PADDLE_ENFORCE(platform::is_gpu_place(place),
                    "CUDA Data must on CUDA place");
     ImmutableCUDA(place);
@@ -204,10 +205,10 @@ class Vector {
   }
 
   // get cuda ptr. mutable
-  T* CUDAMutableData(platform::Place place) {
-    const T* ptr = CUDAData(place);
+  T *CUDAMutableData(platform::Place place) {
+    const T *ptr = CUDAData(place);
     flag_ = kDirty | kDataInCUDA;
-    return const_cast<T*>(ptr);
+    return const_cast<T *>(ptr);
   }
 
   // clear
@@ -228,7 +229,7 @@ class Vector {
   }
 
   // the unify method to access CPU or CUDA data. immutable.
-  const T* Data(platform::Place place) const {
+  const T *Data(platform::Place place) const {
     if (platform::is_gpu_place(place)) {
       return CUDAData(place);
     } else {
@@ -237,7 +238,7 @@ class Vector {
   }
 
   // the unify method to access CPU or CUDA data. mutable.
-  T* MutableData(platform::Place place) {
+  T *MutableData(platform::Place place) {
     if (platform::is_gpu_place(place)) {
       return CUDAMutableData(place);
     } else {
@@ -253,7 +254,7 @@ class Vector {
     return result;
   }
 
-  bool operator==(const Vector<T>& other) const {
+  bool operator==(const Vector<T> &other) const {
     if (size() != other.size()) return false;
     auto it1 = cbegin();
     auto it2 = other.cbegin();
@@ -274,7 +275,7 @@ class Vector {
   template <typename Iter>
   void InitByIter(size_t size, Iter begin, Iter end) {
     platform::Place cpu = platform::CPUPlace();
-    T* ptr = this->cpu_vec_.template mutable_data<T>(
+    T *ptr = this->cpu_vec_.template mutable_data<T>(
         framework::make_ddim({static_cast<int64_t>(size)}), cpu);
     for (size_t i = 0; i < size; ++i) {
       *ptr++ = *begin++;
@@ -368,7 +369,7 @@ class Vector {
     }
   }
 
-  static T& EmptyDummy() {
+  static T &EmptyDummy() {
     static T dummy = T();
     return dummy;
   }
@@ -379,5 +380,53 @@ class Vector {
   size_t size_;
 };
 
-}  // namespace framework
+#else  // PADDLE_WITH_CUDA
+
+// CPUVector is a std::vector with Vector<T>'s extra interface, used when
+// Paddle is built without CUDA and no device synchronization is needed.
+template <typename T>
+class CPUVector : public std::vector<T, std::allocator<T>> {
+ public:
+  CPUVector() : std::vector<T>() {}
+  CPUVector(size_t count, const T &value = T())
+      : std::vector<T>(count, value) {}
+  CPUVector(std::initializer_list<T> init) : std::vector<T>(init) {}
+  CPUVector(const std::vector<T> &other) : std::vector<T>(other) {}
+  explicit CPUVector(const CPUVector<T> &other) : std::vector<T>(other) {}
+  CPUVector(CPUVector<T> &&other) : std::vector<T>(std::move(other)) {}
+  CPUVector(std::vector<T> &&other) : std::vector<T>(std::move(other)) {}
+
+  CPUVector &operator=(const CPUVector &other) {
+    this->assign(other.begin(), other.end());
+    return *this;
+  }
+
+  CPUVector &operator=(const std::vector<T> &other) {
+    this->assign(other.begin(), other.end());
+    return *this;
+  }
+
+  friend std::ostream &operator<<(std::ostream &os,
+                                  const CPUVector<T> &other) {
+    for (auto v : other) {
+      os << v << " ";
+    }
+    return os;
+  }
+
+  // Forward explicitly to the base class; a bare `this->resize(size)` here
+  // would recurse on itself.
+  void resize(size_t size) { std::vector<T>::resize(size); }
+
+  T &operator[](size_t id) { return this->at(id); }
+
+  const T &operator[](size_t id) const { return this->at(id); }
+
+  template <typename D>
+  void Extend(const D &begin, const D &end) {
+    this->reserve(this->size() + size_t(end - begin));
+    this->insert(this->end(), begin, end);
+  }
+};
+
+template <typename T>
+using Vector = CPUVector<T>;
+
+#endif  // PADDLE_WITH_CUDA
+
+}  // namespace framework
 }  // namespace paddle
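The mixed_vector.h change above is the core of the fix: the implicitly synced Vector<T> is now compiled only under PADDLE_WITH_CUDA, and CPU-only builds get a thin std::vector wrapper instead, so LoD bookkeeping on CPU no longer pays for device-sync machinery. Below is a minimal standalone sketch of that fallback pattern; the class and alias names mirror the diff, but the demo main() and its data are illustrative only.

#include <iostream>
#include <vector>

// Stand-in for the #else branch above: plain std::vector plus the extra
// Extend() interface that Vector<T> callers rely on.
template <typename T>
class CPUVector : public std::vector<T> {
 public:
  using std::vector<T>::vector;  // inherit all std::vector constructors

  // Append the half-open range [begin, end), as in the diff's Extend().
  template <typename It>
  void Extend(It begin, It end) {
    this->reserve(this->size() + static_cast<size_t>(end - begin));
    this->insert(this->end(), begin, end);
  }
};

template <typename T>
using Vector = CPUVector<T>;  // the same alias the #else branch introduces

int main() {
  Vector<size_t> lod{0, 2, 5};  // offsets, as in a LoD level
  const size_t more[] = {7, 9};
  lod.Extend(more, more + 2);
  for (size_t v : lod) std::cout << v << " ";  // prints: 0 2 5 7 9
  std::cout << "\n";
}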
paddle/fluid/operators/adam_op.h
@@ -293,11 +293,18 @@ class AdamOpKernel : public framework::OpKernel<T> {
       auto& grad_tensor = grad_merge.value();
       const T* grad_data = grad_tensor.template data<T>();
       int64_t* rows = nullptr;
+// When compiled without CUDA, the CUDAMutableData() interface should not be
+// provided.
+#if defined(PADDLE_WITH_CUDA)
       if (platform::is_gpu_place(ctx.GetPlace())) {
         rows = grad_merge.mutable_rows()->CUDAMutableData(ctx.GetPlace());
       } else {
+#endif
         rows = grad_merge.mutable_rows()->data();
+#if defined(PADDLE_WITH_CUDA)
       }
+#endif
       auto row_numel = grad_tensor.numel() / grad_merge.rows().size();
 
       SparseAdamFunctor<T> functor(
...
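The adam_op.h hunk guards the GPU branch so that a CPU-only build never names CUDAMutableData(). Note the brace trick: the if/else tokens sit inside the #if blocks, so after preprocessing a CPU build sees only the bare ->data() assignment. A self-contained sketch of the same guard pattern follows; the function and variable names here are illustrative, not Paddle APIs.

#include <cstdio>
#include <vector>

#if defined(PADDLE_WITH_CUDA)
// Stand-in for a GPU-side accessor; only compiled when CUDA is enabled.
long* GetDeviceRows(std::vector<long>& v) { return v.data(); }
#endif

long* MutableRows(std::vector<long>& v, bool on_gpu) {
  long* rows = nullptr;
#if defined(PADDLE_WITH_CUDA)
  if (on_gpu) {
    rows = GetDeviceRows(v);
  } else {
#endif
    // The only statement a CPU-only build keeps after preprocessing.
    rows = v.data();
#if defined(PADDLE_WITH_CUDA)
  }
#endif
  return rows;
}

int main() {
  std::vector<long> rows = {0, 1, 3};
  std::printf("%ld\n", MutableRows(rows, false)[2]);  // prints 3
}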
paddle/fluid/operators/detection/target_assign_op.h
@@ -106,7 +106,11 @@ class TargetAssignKernel : public framework::OpKernel<T> {
     int64_t k = x->dims()[2];
 
     auto x_lod = x->lod().back();
+#if defined(PADDLE_WITH_CUDA)
     size_t* x_lod_data = x_lod.MutableData(ctx.GetPlace());
+#else
+    size_t* x_lod_data = x_lod.data();
+#endif
 
     TargetAssignFunctor<T, WT> functor(x_data, match_idx_data, x_lod_data,
                                        mismatch_value, n, m, p, k, out_data,
@@ -121,7 +125,11 @@ class TargetAssignKernel : public framework::OpKernel<T> {
       PADDLE_ENFORCE_EQ(neg_indices->lod().size(), 1UL);
       const int* neg_idx_data = neg_indices->data<int>();
       auto neg_lod = neg_indices->lod().back();
+#if defined(PADDLE_WITH_CUDA)
       size_t* neg_lod_data = neg_lod.MutableData(ctx.GetPlace());
+#else
+      size_t* neg_lod_data = neg_lod.data();
+#endif
       NegTargetAssignFunctor<DeviceContext, T, WT> neg_trg_functor;
       neg_trg_functor(device_ctx, neg_idx_data, neg_lod_data, n, m, k,
                       mismatch_value, out_data, out_wt_data);
...
paddle/fluid/operators/math/sequence2batch.h
@@ -78,7 +78,7 @@ class LoDTensor2BatchFunctor {
     auto lods = lod_tensor.lod();
     PADDLE_ENFORCE_EQ(lods.size(), 1UL, "Only support one level sequence now.");
 
-    auto lod = lods[0];
+    const auto& lod = lods[0];
 
     std::vector<SeqInfo> seq_info;
     for (size_t seq_id = 0; seq_id < lod.size() - 1; ++seq_id) {
...
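This last hunk is the performance fix the commit title refers to: a LoD level is a (possibly long) vector of offsets, so `auto lod = lods[0];` copied every element on each call, while `const auto& lod = lods[0];` binds without copying. A minimal sketch of the difference, with LoD modeled as std::vector<std::vector<size_t>> for illustration (in Paddle the level type is framework::Vector<size_t>, which makes the copy even costlier):

#include <cstdio>
#include <vector>

using LoD = std::vector<std::vector<size_t>>;  // simplified stand-in

size_t NumSequences(const LoD& lods) {
  const auto& lod = lods[0];  // binds by reference: no O(n) copy per call
  return lod.size() - 1;      // offsets [0, l1, ..., ln] delimit n sequences
}

int main() {
  LoD lods = {{0, 2, 5, 9}};  // three sequences of lengths 2, 3, 4
  std::printf("%zu sequences\n", NumSequences(lods));
}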