Unverified commit 02cf54d3, authored by Yan Chunwei, committed by GitHub

bugfix lod cpu performance (#12297)

Parent b41f8b9d
paddle/fluid/framework/mixed_vector.h
@@ -26,6 +26,7 @@
 namespace paddle {
 namespace framework {
 
+#if defined(PADDLE_WITH_CUDA)
 // Vector<T> implements the std::vector interface, and can get Data or
 // MutableData from any place. The data will be synced implicitly inside.
 template <typename T>
@@ -37,11 +38,11 @@ class Vector {
   Vector() { InitEmpty(); }
 
   // Fill vector with value. The vector size is `count`.
-  explicit Vector(size_t count, const T& value = T()) {
+  explicit Vector(size_t count, const T &value = T()) {
     InitEmpty();
     if (count != 0) {
       resize(count);
-      T* ptr = begin();
+      T *ptr = begin();
       for (size_t i = 0; i < count; ++i) {
         ptr[i] = value;
       }
@@ -59,7 +60,7 @@ class Vector {
 
   // implicit cast from std::vector.
   template <typename U>
-  Vector(const std::vector<U>& dat) {  // NOLINT
+  Vector(const std::vector<U> &dat) {  // NOLINT
     if (dat.size() == 0) {
       InitEmpty();
     } else {
@@ -68,10 +69,10 @@ class Vector {
   }
 
   // Copy ctor
-  Vector(const Vector<T>& other) { this->operator=(other); }
+  Vector(const Vector<T> &other) { this->operator=(other); }
 
   // Copy operator
-  Vector<T>& operator=(const Vector<T>& other) {
+  Vector<T> &operator=(const Vector<T> &other) {
     if (other.size() != 0) {
       this->InitByIter(other.size(), other.begin(), other.end());
     } else {
@@ -81,7 +82,7 @@ class Vector {
   }
 
   // Move ctor
-  Vector(Vector<T>&& other) {
+  Vector(Vector<T> &&other) {
     this->size_ = other.size_;
     this->flag_ = other.flag_;
     if (other.cuda_vec_.memory_size()) {
@@ -93,13 +94,13 @@ class Vector {
   }
 
   // CPU data access method. Mutable.
-  T& operator[](size_t i) {
+  T &operator[](size_t i) {
     MutableCPU();
-    return const_cast<T*>(cpu_vec_.data<T>())[i];
+    return const_cast<T *>(cpu_vec_.data<T>())[i];
   }
 
   // CPU data access method. Immutable.
-  const T& operator[](size_t i) const {
+  const T &operator[](size_t i) const {
     ImmutableCPU();
     return cpu_vec_.data<T>()[i];
   }
@@ -107,43 +108,43 @@ class Vector {
 
   // std::vector iterator methods. Based on CPU data access method
   size_t size() const { return size_; }
 
-  T* begin() { return capacity() == 0 ? &EmptyDummy() : &this->operator[](0); }
+  T *begin() { return capacity() == 0 ? &EmptyDummy() : &this->operator[](0); }
 
-  T* end() {
+  T *end() {
     return capacity() == 0 ? &EmptyDummy() : &this->operator[](size());
   }
 
-  T& front() { return *begin(); }
+  T &front() { return *begin(); }
 
-  T& back() {
+  T &back() {
     auto it = end();
     --it;
     return *it;
   }
 
-  const T* begin() const {
+  const T *begin() const {
     return capacity() == 0 ? &EmptyDummy() : &this->operator[](0);
   }
 
-  const T* end() const {
+  const T *end() const {
     return capacity() == 0 ? &EmptyDummy() : &this->operator[](size());
   }
 
-  const T* cbegin() const { return begin(); }
+  const T *cbegin() const { return begin(); }
 
-  const T* cend() const { return end(); }
+  const T *cend() const { return end(); }
 
-  const T& back() const {
+  const T &back() const {
     auto it = end();
     --it;
     return *it;
   }
 
-  T* data() { return begin(); }
+  T *data() { return begin(); }
 
-  const T* data() const { return begin(); }
+  const T *data() const { return begin(); }
 
-  const T& front() const { return *begin(); }
+  const T &front() const { return *begin(); }
   // end of std::vector iterator methods
 
   // assign this from iterator.
@@ -169,7 +170,7 @@ class Vector {
   void Extend(It begin, It end) {
     size_t pre_size = size_;
     resize(pre_size + (end - begin));
-    T* ptr = this->begin() + pre_size;
+    T *ptr = this->begin() + pre_size;
     for (; begin < end; ++begin, ++ptr) {
       *ptr = *begin;
     }
@@ -183,9 +184,9 @@ class Vector {
     MutableCPU();
     Tensor cpu_tensor;
     platform::Place cpu = platform::CPUPlace();
-    T* ptr = cpu_tensor.mutable_data<T>(
+    T *ptr = cpu_tensor.mutable_data<T>(
         framework::make_ddim({static_cast<int64_t>(size)}), cpu);
-    const T* old_ptr =
+    const T *old_ptr =
         cpu_vec_.memory_size() == 0 ? nullptr : cpu_vec_.data<T>();
     if (old_ptr != nullptr) {
       std::copy(old_ptr, old_ptr + size_, ptr);
@@ -196,7 +197,7 @@ class Vector {
   }
 
   // get cuda ptr. immutable
-  const T* CUDAData(platform::Place place) const {
+  const T *CUDAData(platform::Place place) const {
     PADDLE_ENFORCE(platform::is_gpu_place(place),
                    "CUDA Data must on CUDA place");
     ImmutableCUDA(place);
@@ -204,10 +205,10 @@ class Vector {
   }
 
   // get cuda ptr. mutable
-  T* CUDAMutableData(platform::Place place) {
-    const T* ptr = CUDAData(place);
+  T *CUDAMutableData(platform::Place place) {
+    const T *ptr = CUDAData(place);
     flag_ = kDirty | kDataInCUDA;
-    return const_cast<T*>(ptr);
+    return const_cast<T *>(ptr);
   }
 
   // clear
@@ -228,7 +229,7 @@ class Vector {
   }
 
   // the unify method to access CPU or CUDA data. immutable.
-  const T* Data(platform::Place place) const {
+  const T *Data(platform::Place place) const {
     if (platform::is_gpu_place(place)) {
       return CUDAData(place);
     } else {
@@ -237,7 +238,7 @@ class Vector {
   }
 
   // the unify method to access CPU or CUDA data. mutable.
-  T* MutableData(platform::Place place) {
+  T *MutableData(platform::Place place) {
     if (platform::is_gpu_place(place)) {
       return CUDAMutableData(place);
     } else {
@@ -253,7 +254,7 @@ class Vector {
     return result;
   }
 
-  bool operator==(const Vector<T>& other) const {
+  bool operator==(const Vector<T> &other) const {
     if (size() != other.size()) return false;
     auto it1 = cbegin();
     auto it2 = other.cbegin();
@@ -274,7 +275,7 @@ class Vector {
   template <typename Iter>
   void InitByIter(size_t size, Iter begin, Iter end) {
     platform::Place cpu = platform::CPUPlace();
-    T* ptr = this->cpu_vec_.template mutable_data<T>(
+    T *ptr = this->cpu_vec_.template mutable_data<T>(
         framework::make_ddim({static_cast<int64_t>(size)}), cpu);
     for (size_t i = 0; i < size; ++i) {
       *ptr++ = *begin++;
@@ -368,7 +369,7 @@ class Vector {
     }
   }
 
-  static T& EmptyDummy() {
+  static T &EmptyDummy() {
     static T dummy = T();
     return dummy;
   }
@@ -379,5 +380,53 @@ class Vector {
   size_t size_;
 };
 
-}  // namespace framework
+#else  // PADDLE_WITH_CUDA
+
+// CPUVector is a std::vector with Vector<T>'s extra interface, used when
+// Paddle is built without CUDA and no device synchronization is needed.
+template <typename T>
+class CPUVector : public std::vector<T, std::allocator<T>> {
+ public:
+  CPUVector() : std::vector<T>() {}
+  CPUVector(size_t count, const T &value = T())
+      : std::vector<T>(count, value) {}
+  CPUVector(std::initializer_list<T> init) : std::vector<T>(init) {}
+  CPUVector(const std::vector<T> &other) : std::vector<T>(other) {}
+  explicit CPUVector(const CPUVector<T> &other) : std::vector<T>(other) {}
+  CPUVector(CPUVector<T> &&other) : std::vector<T>(std::move(other)) {}
+  CPUVector(std::vector<T> &&other) : std::vector<T>(std::move(other)) {}
+
+  CPUVector &operator=(const CPUVector &other) {
+    this->assign(other.begin(), other.end());
+    return *this;
+  }
+
+  CPUVector &operator=(const std::vector<T> &other) {
+    this->assign(other.begin(), other.end());
+    return *this;
+  }
+
+  friend std::ostream &operator<<(std::ostream &os,
+                                  const CPUVector<T> &other) {
+    for (auto v : other) {
+      os << v << " ";
+    }
+    return os;
+  }
+
+  // Forward explicitly to the base class; a bare `this->resize(size)` here
+  // would recurse on itself.
+  void resize(size_t size) { std::vector<T>::resize(size); }
+
+  T &operator[](size_t id) { return this->at(id); }
+
+  const T &operator[](size_t id) const { return this->at(id); }
+
+  template <typename D>
+  void Extend(const D &begin, const D &end) {
+    this->reserve(this->size() + size_t(end - begin));
+    this->insert(this->end(), begin, end);
+  }
+};
+
+template <typename T>
+using Vector = CPUVector<T>;
+
+#endif  // PADDLE_WITH_CUDA
+
+}  // namespace framework
 }  // namespace paddle
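The mixed_vector.h change above is the core of the fix: the implicitly synced Vector<T> is now compiled only under PADDLE_WITH_CUDA, and CPU-only builds get a thin std::vector wrapper instead, so LoD bookkeeping on CPU no longer pays for device-sync machinery. Below is a minimal standalone sketch of that fallback pattern; the class and alias names mirror the diff, but the demo main() and its data are illustrative only.

#include <iostream>
#include <vector>

// Stand-in for the #else branch above: plain std::vector plus the extra
// Extend() interface that Vector<T> callers rely on.
template <typename T>
class CPUVector : public std::vector<T> {
 public:
  using std::vector<T>::vector;  // inherit all std::vector constructors

  // Append the half-open range [begin, end), as in the diff's Extend().
  template <typename It>
  void Extend(It begin, It end) {
    this->reserve(this->size() + static_cast<size_t>(end - begin));
    this->insert(this->end(), begin, end);
  }
};

template <typename T>
using Vector = CPUVector<T>;  // the same alias the #else branch introduces

int main() {
  Vector<size_t> lod{0, 2, 5};  // offsets, as in a LoD level
  const size_t more[] = {7, 9};
  lod.Extend(more, more + 2);
  for (size_t v : lod) std::cout << v << " ";  // prints: 0 2 5 7 9
  std::cout << "\n";
}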
paddle/fluid/operators/adam_op.h
@@ -293,11 +293,18 @@ class AdamOpKernel : public framework::OpKernel<T> {
       auto& grad_tensor = grad_merge.value();
       const T* grad_data = grad_tensor.template data<T>();
       int64_t* rows = nullptr;
+// When compiled without CUDA, the CUDAMutableData() interface should not be
+// provided.
+#if defined(PADDLE_WITH_CUDA)
       if (platform::is_gpu_place(ctx.GetPlace())) {
         rows = grad_merge.mutable_rows()->CUDAMutableData(ctx.GetPlace());
       } else {
+#endif
         rows = grad_merge.mutable_rows()->data();
+#if defined(PADDLE_WITH_CUDA)
       }
+#endif
       auto row_numel = grad_tensor.numel() / grad_merge.rows().size();
 
       SparseAdamFunctor<T> functor(
...
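The adam_op.h hunk guards the GPU branch so that a CPU-only build never names CUDAMutableData(). Note the brace trick: the if/else tokens sit inside the #if blocks, so after preprocessing a CPU build sees only the bare ->data() assignment. A self-contained sketch of the same guard pattern follows; the function and variable names here are illustrative, not Paddle APIs.

#include <cstdio>
#include <vector>

#if defined(PADDLE_WITH_CUDA)
// Stand-in for a GPU-side accessor; only compiled when CUDA is enabled.
long* GetDeviceRows(std::vector<long>& v) { return v.data(); }
#endif

long* MutableRows(std::vector<long>& v, bool on_gpu) {
  long* rows = nullptr;
#if defined(PADDLE_WITH_CUDA)
  if (on_gpu) {
    rows = GetDeviceRows(v);
  } else {
#endif
    // The only statement a CPU-only build keeps after preprocessing.
    rows = v.data();
#if defined(PADDLE_WITH_CUDA)
  }
#endif
  return rows;
}

int main() {
  std::vector<long> rows = {0, 1, 3};
  std::printf("%ld\n", MutableRows(rows, false)[2]);  // prints 3
}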
paddle/fluid/operators/detection/target_assign_op.h
@@ -106,7 +106,11 @@ class TargetAssignKernel : public framework::OpKernel<T> {
     int64_t k = x->dims()[2];
 
     auto x_lod = x->lod().back();
+#if defined(PADDLE_WITH_CUDA)
     size_t* x_lod_data = x_lod.MutableData(ctx.GetPlace());
+#else
+    size_t* x_lod_data = x_lod.data();
+#endif
 
     TargetAssignFunctor<T, WT> functor(x_data, match_idx_data, x_lod_data,
                                        mismatch_value, n, m, p, k, out_data,
@@ -121,7 +125,11 @@ class TargetAssignKernel : public framework::OpKernel<T> {
       PADDLE_ENFORCE_EQ(neg_indices->lod().size(), 1UL);
       const int* neg_idx_data = neg_indices->data<int>();
       auto neg_lod = neg_indices->lod().back();
+#if defined(PADDLE_WITH_CUDA)
       size_t* neg_lod_data = neg_lod.MutableData(ctx.GetPlace());
+#else
+      size_t* neg_lod_data = neg_lod.data();
+#endif
       NegTargetAssignFunctor<DeviceContext, T, WT> neg_trg_functor;
       neg_trg_functor(device_ctx, neg_idx_data, neg_lod_data, n, m, k,
                       mismatch_value, out_data, out_wt_data);
...
paddle/fluid/operators/math/sequence2batch.h
@@ -78,7 +78,7 @@ class LoDTensor2BatchFunctor {
     auto lods = lod_tensor.lod();
     PADDLE_ENFORCE_EQ(lods.size(), 1UL, "Only support one level sequence now.");
 
-    auto lod = lods[0];
+    const auto& lod = lods[0];
 
     std::vector<SeqInfo> seq_info;
     for (size_t seq_id = 0; seq_id < lod.size() - 1; ++seq_id) {
...
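This last hunk is the performance fix the commit title refers to: a LoD level is a (possibly long) vector of offsets, so `auto lod = lods[0];` copied every element on each call, while `const auto& lod = lods[0];` binds without copying. A minimal sketch of the difference, with LoD modeled as std::vector<std::vector<size_t>> for illustration (in Paddle the level type is framework::Vector<size_t>, which makes the copy even costlier):

#include <cstdio>
#include <vector>

using LoD = std::vector<std::vector<size_t>>;  // simplified stand-in

size_t NumSequences(const LoD& lods) {
  const auto& lod = lods[0];  // binds by reference: no O(n) copy per call
  return lod.size() - 1;      // offsets [0, l1, ..., ln] delimit n sequences
}

int main() {
  LoD lods = {{0, 2, 5, 9}};  // three sequences of lengths 2, 3, 4
  std::printf("%zu sequences\n", NumSequences(lods));
}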