Unverified commit 4aa931dd, authored by Zeng Jinle, committed by GitHub

Code clean of Allocator (#17602)

* Revert "Revert "Fix allocator bug""

This reverts commit 174d0d0b.

* Revert "fix travis ci"

This reverts commit 5656fa9f.

test=develop

* add inlined_vector.h, test=develop

* add inlined_vector_test,test=develop

* clean code of allocator,test=develop

* delete zero_size_allocator.h,test=develop

* fix failed unittest,test=develop
Parent 430e2565
@@ -28,12 +28,14 @@ namespace framework {
 TEST(LoD, PrintLoDTensor) {
   LoDTensor tensor1;
+  tensor1.Resize({2});
   tensor1.mutable_data<float>(platform::CPUPlace());
   tensor1.data<float>()[0] = 0.2;
   tensor1.data<float>()[1] = 0.5;
   LOG(INFO) << tensor1;
 
   LoDTensor tensor2;
+  tensor2.Resize({2});
   tensor2.mutable_data<int64_t>(platform::CPUPlace());
   tensor2.data<int64_t>()[0] = 1;
   tensor2.data<int64_t>()[1] = 2;
......
@@ -384,7 +384,7 @@ TEST_F(MkldnnQuantizerTest, histogram_empty) {
   // zero tensor
   framework::LoDTensor var_tensor;
   var_tensor.Resize({0});
-  ASSERT_TRUE(var_tensor.mutable_data<double>(platform::CPUPlace()));
+  var_tensor.mutable_data<double>(platform::CPUPlace());
   ASSERT_THROW(Histogram(var_tensor, -1, 1, 1), platform::EnforceNotMet);
 }
......
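The dropped ASSERT_TRUE reflects the new zero-size behaviour introduced below: a tensor resized to {0} now gets a null data pointer, so asserting that the pointer is non-null would fail. A hypothetical sketch of the expectation, not part of this commit:

// Hypothetical check, not part of this commit: with zero-size requests now
// returning an Allocation whose ptr() is nullptr, asserting that the data
// pointer of a zero-element tensor is non-null can no longer hold.
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/platform/place.h"

void ZeroElementTensorSketch() {
  paddle::framework::LoDTensor var_tensor;
  var_tensor.Resize({0});
  double* data = var_tensor.mutable_data<double>(paddle::platform::CPUPlace());
  // `data` is expected to be nullptr after this change, so the old
  // ASSERT_TRUE(...) around this call would fail.
  (void)data;
}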
@@ -4,7 +4,6 @@ cc_library(best_fit_allocator SRCS best_fit_allocator.cc DEPS allocator)
 cc_library(locked_allocator SRCS locked_allocator.cc DEPS allocator)
 cc_library(buffered_allocator SRCS buffered_allocator.cc DEPS allocator)
 cc_library(legacy_allocator SRCS legacy_allocator.cc DEPS allocator buddy_allocator profiler)
-cc_library(zero_size_allocator SRCS zero_size_allocator.cc DEPS allocator)
 cc_test(buffered_allocator_test SRCS buffered_allocator_test.cc DEPS best_fit_allocator locked_allocator buffered_allocator cpu_allocator)
 
 if (WITH_GPU)
@@ -38,7 +37,7 @@ else ()
   set(AllocatorFacadeDeps)
 endif()
 
-list(APPEND AllocatorFacadeDeps cpu_allocator locked_allocator best_fit_allocator aligned_allocator auto_increment_allocator conditional_allocator retry_allocator buffered_allocator legacy_allocator zero_size_allocator)
+list(APPEND AllocatorFacadeDeps cpu_allocator locked_allocator best_fit_allocator aligned_allocator auto_increment_allocator conditional_allocator retry_allocator buffered_allocator legacy_allocator)
 cc_library(aligned_allocator SRCS aligned_allocator.cc DEPS allocator)
 cc_library(auto_increment_allocator SRCS auto_increment_allocator.cc DEPS allocator)
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/memory/allocation/allocator.h"
namespace paddle {
namespace memory {
namespace allocation {
class AllocationWithUnderlying : public Allocation {
public:
explicit AllocationWithUnderlying(AllocationPtr allocation)
: Allocation(allocation->ptr(), allocation->size(), allocation->place()),
allocation_(std::move(allocation)) {}
AllocationPtr allocation_;
};
} // namespace allocation
} // namespace memory
} // namespace paddle
@@ -52,7 +52,7 @@ class Allocator;
  * decorate a RetryAllocator to any allocator to perform allocation retry when
  * first allocation request fails.
  *
- * Explanations of Allocator design is as follows:
+ * Explanations of Allocator design are as follows:
  *
  * Suppose we have an allocator which is decorated by several allocators:
  *
@@ -127,8 +127,15 @@ class Allocation {
   size_t size_;
   platform::Place place_;
 
-  // NOTE(zjl): Since decorated_allocators_ is usually a small vector
-  // We reserve a small buffer to it to prevent frequent heap allocation
+  /**
+   * NOTE(zjl): Since decorated_allocators_ is usually a small vector,
+   * we reserve a small buffer for it to prevent frequent heap allocation.
+   *
+   * Alternatively, we could use a std::vector<Allocator *> here and reserve
+   * kReserveAllocatorNum in the constructor of Allocation. But using
+   * std::vector<Allocator *> would make the OCR recognition model fail in CE:
+   * the training duration is 8% slower than the KPI.
+   */
   static constexpr size_t kReserveAllocatorNum = 8;
   using DecoratedAllocatorStack =
       framework::InlinedVector<Allocator*, kReserveAllocatorNum>;
......
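For context, framework::InlinedVector<Allocator*, kReserveAllocatorNum> is the small-buffer container this PR introduces in inlined_vector.h: the first few decorated allocators live in an in-object array, so pushing onto the decorator stack normally costs no heap allocation. A rough sketch of the idea, not the actual Paddle implementation:

#include <cstddef>
#include <vector>

// Rough sketch of the small-buffer idea behind framework::InlinedVector;
// the real interface and semantics in inlined_vector.h may differ.
template <typename T, size_t N>
class InlinedVectorSketch {
 public:
  void push_back(const T& v) {
    if (size_ < N) {
      inline_buf_[size_] = v;   // small sizes stay in the in-object buffer
    } else {
      overflow_.push_back(v);   // only spill to the heap beyond N elements
    }
    ++size_;
  }
  T& operator[](size_t i) { return i < N ? inline_buf_[i] : overflow_[i - N]; }
  size_t size() const { return size_; }

 private:
  T inline_buf_[N]{};
  std::vector<T> overflow_;
  size_t size_{0};
};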
@@ -29,7 +29,6 @@
 #include "paddle/fluid/memory/allocation/legacy_allocator.h"
 #include "paddle/fluid/memory/allocation/locked_allocator.h"
 #include "paddle/fluid/memory/allocation/retry_allocator.h"
-#include "paddle/fluid/memory/allocation/zero_size_allocator.h"
 #include "paddle/fluid/platform/cpu_info.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/place.h"
@@ -192,10 +191,6 @@ class CUDAPinnedChunkedAllocator : public ChunkedAllocator {
 class AllocatorFacadePrivate {
  public:
-  std::map<platform::Place, std::shared_ptr<Allocator>> allocators_;
-
-  ~AllocatorFacadePrivate() = default;
-
   AllocatorFacadePrivate() {
     auto strategy = GetAllocatorStrategy();
     switch (strategy) {
@@ -207,7 +202,6 @@ class AllocatorFacadePrivate {
       InitCPUAllocator();
       InitCUDAAllocator();
       InitCUDAPinnedAllocator();
-      WrapZeroSizeAllocator();
       break;
     }
     default: {
@@ -215,6 +209,18 @@
           static_cast<int>(strategy));
       }
     }
+    InitZeroSizeAllocators();
+  }
+
+  inline const std::shared_ptr<Allocator>& GetAllocator(
+      const platform::Place& place, size_t size) {
+    const auto& allocators = (size > 0 ? allocators_ : zero_size_allocators_);
+    auto iter = allocators.find(place);
+    if (iter == allocators.end()) {
+      throw BadAlloc(
+          string::Sprintf("No such allocator for the place, %s", place));
+    }
+    return iter->second;
   }
 
  private:
@@ -252,12 +258,40 @@ class AllocatorFacadePrivate {
 #endif
   }
 
-  void WrapZeroSizeAllocator() {
-    for (auto& pair : allocators_) {
-      pair.second =
-          std::make_shared<ZeroSizeAllocator>(pair.second, pair.first);
+  class ZeroSizeAllocator : public Allocator {
+   public:
+    explicit ZeroSizeAllocator(platform::Place place) : place_(place) {}
+
+   protected:
+    Allocation* AllocateImpl(size_t size, Allocator::Attr attr) override {
+      return new Allocation(nullptr, 0, place_);
+    }
+
+    void FreeImpl(Allocation* allocation) override { delete allocation; }
+
+   private:
+    platform::Place place_;
+  };
+
+  void InitZeroSizeAllocators() {
+    std::vector<platform::Place> places;
+    places.emplace_back(platform::CPUPlace());
+#ifdef PADDLE_WITH_CUDA
+    int device_count = platform::GetCUDADeviceCount();
+    for (int dev_id = 0; dev_id < device_count; ++dev_id) {
+      places.emplace_back(platform::CUDAPlace(dev_id));
+    }
+    places.emplace_back(platform::CUDAPinnedPlace());
+#endif
+
+    for (auto& p : places) {
+      zero_size_allocators_[p] = std::make_shared<ZeroSizeAllocator>(p);
     }
   }
+
+ private:
+  std::map<platform::Place, std::shared_ptr<Allocator>> allocators_;
+  std::map<platform::Place, std::shared_ptr<Allocator>> zero_size_allocators_;
 };
 
 // Pimpl. Make interface clean.
@@ -276,12 +310,7 @@ std::shared_ptr<Allocation> AllocatorFacade::AllocShared(
 
 AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, size_t size,
                                      Allocator::Attr attr) {
-  auto it = m_->allocators_.find(place);
-  if (it == m_->allocators_.end()) {
-    throw BadAlloc(
-        string::Sprintf("No such allocator for the place, %s", place));
-  }
-  return m_->allocators_.at(place)->Allocate(size, attr);
+  return m_->GetAllocator(place, size)->Allocate(size, attr);
 }
 
 }  // namespace allocation
......
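With GetAllocator dispatching on the requested size, zero-byte requests are served by a per-place ZeroSizeAllocator instead of wrapping every regular allocator. A hedged usage sketch; the Instance() singleton and the defaulted Allocator::Attr argument are assumed from the existing facade interface:

#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/fluid/platform/place.h"

void ZeroSizeDispatchSketch() {
  using paddle::memory::allocation::AllocatorFacade;
  auto& facade = AllocatorFacade::Instance();  // assumed singleton accessor
  auto zero = facade.Alloc(paddle::platform::CPUPlace(), /*size=*/0);
  // zero->ptr() == nullptr, zero->size() == 0: routed to zero_size_allocators_.
  auto real = facade.Alloc(paddle::platform::CPUPlace(), /*size=*/1024);
  // real is served by allocators_[CPUPlace()], exactly as before.
}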
@@ -16,7 +16,6 @@
 #include <algorithm>
 #include <limits>
 #include <utility>
-#include "paddle/fluid/memory/allocation/allocation_with_underlying.h"
 
 namespace paddle {
 namespace memory {
......
@@ -15,7 +15,6 @@
 #include "paddle/fluid/memory/allocation/locked_allocator.h"
 #include <mutex>  // NOLINT
 #include <utility>
-#include "paddle/fluid/memory/allocation/allocation_with_underlying.h"
 #include "paddle/fluid/platform/lock_guard_ptr.h"
 
 namespace paddle {
......
@@ -13,7 +13,6 @@
 // limitations under the License.
 
 #include "paddle/fluid/memory/allocation/retry_allocator.h"
-#include "paddle/fluid/memory/allocation/allocation_with_underlying.h"
 namespace paddle {
 namespace memory {
 namespace allocation {
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/memory/allocation/zero_size_allocator.h"
namespace paddle {
namespace memory {
namespace allocation {
bool ZeroSizeAllocator::IsAllocThreadSafe() const {
return underlying_allocator_->IsAllocThreadSafe();
}
Allocation *ZeroSizeAllocator::AllocateImpl(size_t size, Allocator::Attr attr) {
if (size == 0) {
return new Allocation(nullptr, 0, place_);
} else {
return underlying_allocator_->Allocate(size, attr).release();
}
}
void ZeroSizeAllocator::FreeImpl(Allocation *allocation) {
if (allocation->size() == 0) {
delete allocation;
} else {
underlying_allocator_->Free(allocation);
}
}
} // namespace allocation
} // namespace memory
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <utility>
#include "paddle/fluid/memory/allocation/allocator.h"
namespace paddle {
namespace memory {
namespace allocation {
// The allocator that handles zero-size requests. It always returns an
// allocation even when the requested size is zero; however, the
// allocation's ptr() is nullptr.
class ZeroSizeAllocator : public Allocator {
public:
ZeroSizeAllocator(std::shared_ptr<Allocator> underlying_allocator,
const platform::Place& p)
: underlying_allocator_(std::move(underlying_allocator)), place_(p) {}
bool IsAllocThreadSafe() const override;
protected:
Allocation* AllocateImpl(size_t size, Allocator::Attr attr) override;
void FreeImpl(Allocation* allocation) override;
private:
std::shared_ptr<Allocator> underlying_allocator_;
const platform::Place& place_;
};
} // namespace allocation
} // namespace memory
} // namespace paddle
@@ -15,6 +15,7 @@ limitations under the License. */
 #include "paddle/fluid/memory/memcpy.h"
 
 #include <cstring>  // for memcpy
+#include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/profiler.h"
 
 namespace paddle {
@@ -24,6 +25,7 @@ template <>
 void Copy<platform::CPUPlace, platform::CPUPlace>(platform::CPUPlace, void* dst,
                                                   platform::CPUPlace,
                                                   const void* src, size_t num) {
+  if (UNLIKELY(num == 0)) return;
   std::memcpy(dst, src, num);
 }
@@ -40,6 +42,7 @@ template <>
 void Copy<platform::CPUPlace, platform::CUDAPlace>(
     platform::CPUPlace dst_place, void* dst, platform::CUDAPlace src_place,
     const void* src, size_t num, cudaStream_t stream) {
+  if (UNLIKELY(num == 0)) return;
   platform::SetDeviceId(src_place.device);
   if (stream) {
@@ -59,6 +62,8 @@ template <>
 void Copy<platform::CUDAPlace, platform::CPUPlace>(
     platform::CUDAPlace dst_place, void* dst, platform::CPUPlace src_place,
     const void* src, size_t num, cudaStream_t stream) {
+  if (UNLIKELY(num == 0)) return;
+
   platform::SetDeviceId(dst_place.device);
   if (stream) {
     platform::RecordEvent record_event("GpuMemcpyAsync:CPU->GPU");
@@ -77,6 +82,8 @@ template <>
 void Copy<platform::CUDAPlace, platform::CUDAPlace>(
     platform::CUDAPlace dst_place, void* dst, platform::CUDAPlace src_place,
     const void* src, size_t num, cudaStream_t stream) {
+  if (UNLIKELY(num == 0)) return;
+
   if (dst_place == src_place) {
     platform::SetDeviceId(src_place.device);
     if (stream) {
@@ -103,6 +110,7 @@ template <>
 void Copy<platform::CPUPlace, platform::CUDAPinnedPlace>(
     platform::CPUPlace dst_place, void* dst,
     platform::CUDAPinnedPlace src_place, const void* src, size_t num) {
+  if (UNLIKELY(num == 0)) return;
   std::memcpy(dst, src, num);
 }
@@ -110,6 +118,7 @@ template <>
 void Copy<platform::CUDAPinnedPlace, platform::CPUPlace>(
     platform::CUDAPinnedPlace dst_place, void* dst,
     platform::CPUPlace src_place, const void* src, size_t num) {
+  if (UNLIKELY(num == 0)) return;
   std::memcpy(dst, src, num);
 }
@@ -117,6 +126,7 @@ template <>
 void Copy<platform::CUDAPinnedPlace, platform::CUDAPinnedPlace>(
     platform::CUDAPinnedPlace dst_place, void* dst,
     platform::CUDAPinnedPlace src_place, const void* src, size_t num) {
+  if (UNLIKELY(num == 0)) return;
   std::memcpy(dst, src, num);
 }
@@ -125,6 +135,7 @@ void Copy<platform::CUDAPinnedPlace, platform::CUDAPlace>(
     platform::CUDAPinnedPlace dst_place, void* dst,
     platform::CUDAPlace src_place, const void* src, size_t num,
     cudaStream_t stream) {
+  if (UNLIKELY(num == 0)) return;
   platform::SetDeviceId(src_place.device);
   if (stream) {
     platform::RecordEvent record_event("GpuMemcpyAsync:GPU->CUDAPinned");
@@ -140,6 +151,8 @@ void Copy<platform::CUDAPlace, platform::CUDAPinnedPlace>(
     platform::CUDAPlace dst_place, void* dst,
     platform::CUDAPinnedPlace src_place, const void* src, size_t num,
     cudaStream_t stream) {
+  if (UNLIKELY(num == 0)) return;
+
  platform::SetDeviceId(dst_place.device);
  if (stream) {
    platform::RecordEvent record_event("GpuMemcpyAsync:CUDAPinned->GPU");
......
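The num == 0 early returns complement the allocator change: a zero-size tensor now carries a null data pointer, and std::memcpy (and the CUDA copy routines) may not be handed a null source or destination even when the byte count is zero. A standalone illustration of the guard; UNLIKELY is the branch-prediction hint presumably provided by the newly included enforce.h, so a plain `if` is used here:

#include <cstring>

// Standalone sketch of the guard added to every Copy<> specialization;
// written without Paddle headers so the snippet compiles on its own.
void SafeCopySketch(void* dst, const void* src, size_t num) {
  if (num == 0) return;  // dst/src may be nullptr for zero-size tensors
  std::memcpy(dst, src, num);
}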
@@ -454,6 +454,7 @@ class BatchNormGradKernel<platform::CPUDeviceContext, T>
       const auto *running_mean = ctx.Input<Tensor>("Mean");
       const auto *running_variance = ctx.Input<Tensor>("Variance");
       mean_data = running_mean->data<T>();
+      inv_var_tensor.Resize({C});
       T *running_inv_var_data = inv_var_tensor.mutable_data<T>(ctx.GetPlace());
       EigenVectorArrayMap<T> inv_var_tmp(running_inv_var_data, C);
       ConstEigenVectorArrayMap<T> var_arr(running_variance->data<T>(), C);
......
@@ -264,8 +264,8 @@ struct FindRangeAbsMaxFunctor<platform::CUDADeviceContext, T> {
     T* out_scale_data = out_scale->mutable_data<T>(gpu_place);
 
     framework::Tensor need_find_max, out_size;
-    int* find_max = need_find_max.mutable_data<int>(gpu_place);
-    int* out_size_data = out_size.mutable_data<int>(gpu_place);
+    int* find_max = need_find_max.mutable_data<int>({1}, gpu_place);
+    int* out_size_data = out_size.mutable_data<int>({1}, gpu_place);
     FindRangeAbsMaxAndFillArray<T><<<1, 1, 0, ctx.stream()>>>(
         cur_scale.data<T>(), last_scale.data<T>(), iter.data<int64_t>(),
......
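The fake_quantize change follows the same theme as the Resize({C}) added in batch_norm above: a freshly constructed Tensor has zero elements, so the shape-less mutable_data call would now request zero bytes and return nullptr. Passing the dims explicitly sizes the tensor to one element before allocation. A hedged sketch of the two call forms, assuming the usual framework::Tensor overloads:

#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/place.h"

void MutableDataSketch() {
  paddle::framework::Tensor t;
  paddle::platform::CUDAPlace gpu_place(0);
  // Old form: relies on a prior Resize(); on a fresh tensor numel() == 0,
  // so with this PR the returned pointer would be nullptr.
  // int* p0 = t.mutable_data<int>(gpu_place);
  // New form: pass the dims explicitly so exactly one int is allocated.
  int* p1 = t.mutable_data<int>({1}, gpu_place);
  (void)p1;
}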