diff --git a/paddle/fluid/framework/lod_tensor_test.cc b/paddle/fluid/framework/lod_tensor_test.cc
index 15928c18d38b8a513b00f993b57faab43978bf53..d1554113bc366f38d1cfd7603e2848f618794d9f 100644
--- a/paddle/fluid/framework/lod_tensor_test.cc
+++ b/paddle/fluid/framework/lod_tensor_test.cc
@@ -28,12 +28,14 @@ namespace framework {
 
 TEST(LoD, PrintLoDTensor) {
   LoDTensor tensor1;
+  tensor1.Resize({2});
   tensor1.mutable_data<float>(platform::CPUPlace());
   tensor1.data<float>()[0] = 0.2;
   tensor1.data<float>()[1] = 0.5;
 
   LOG(INFO) << tensor1;
 
   LoDTensor tensor2;
+  tensor2.Resize({2});
   tensor2.mutable_data<int64_t>(platform::CPUPlace());
   tensor2.data<int64_t>()[0] = 1;
   tensor2.data<int64_t>()[1] = 2;
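After this patch, a zero-size allocation hands back a nullptr buffer, so the tests above must Resize() before mutable_data(). A minimal sketch of the failure mode this avoids (illustrative, not part of the patch):

LoDTensor tensor;
// numel() is still 0 here, so mutable_data() would now return the
// zero-size allocation's nullptr instead of a usable buffer.
tensor.Resize({2});  // shape [2] -> numel() == 2
float* buf = tensor.mutable_data<float>(platform::CPUPlace());
buf[0] = 0.2f;  // safe: a real two-element allocation backs the tensor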
diff --git a/paddle/fluid/inference/api/analysis_predictor_tester.cc b/paddle/fluid/inference/api/analysis_predictor_tester.cc
index 6bc892638c28ca0b5bab82936bf9700289bed6b2..6f57f8b20287d8427321aa17bf8c6d094d776802 100644
--- a/paddle/fluid/inference/api/analysis_predictor_tester.cc
+++ b/paddle/fluid/inference/api/analysis_predictor_tester.cc
@@ -384,7 +384,7 @@ TEST_F(MkldnnQuantizerTest, histogram_empty) {
   // zero tensor
   framework::LoDTensor var_tensor;
   var_tensor.Resize({0});
-  ASSERT_TRUE(var_tensor.mutable_data<float>(platform::CPUPlace()));
+  var_tensor.mutable_data<float>(platform::CPUPlace());
   ASSERT_THROW(Histogram(var_tensor, -1, 1, 1), platform::EnforceNotMet);
 }
 
diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt
index 4c4ae72effaaf2829267c23cf7bd26753a9fab85..c309febd49905104c259d71f5c56bf58b7294090 100644
--- a/paddle/fluid/memory/allocation/CMakeLists.txt
+++ b/paddle/fluid/memory/allocation/CMakeLists.txt
@@ -4,7 +4,6 @@ cc_library(best_fit_allocator SRCS best_fit_allocator.cc DEPS allocator)
 cc_library(locked_allocator SRCS locked_allocator.cc DEPS allocator)
 cc_library(buffered_allocator SRCS buffered_allocator.cc DEPS allocator)
 cc_library(legacy_allocator SRCS legacy_allocator.cc DEPS allocator buddy_allocator profiler)
-cc_library(zero_size_allocator SRCS zero_size_allocator.cc DEPS allocator)
 cc_test(buffered_allocator_test SRCS buffered_allocator_test.cc DEPS best_fit_allocator locked_allocator buffered_allocator cpu_allocator)
 
 if (WITH_GPU)
@@ -38,7 +37,7 @@ else ()
   set(AllocatorFacadeDeps)
 endif()
 
-list(APPEND AllocatorFacadeDeps cpu_allocator locked_allocator best_fit_allocator aligned_allocator auto_increment_allocator conditional_allocator retry_allocator buffered_allocator legacy_allocator zero_size_allocator)
+list(APPEND AllocatorFacadeDeps cpu_allocator locked_allocator best_fit_allocator aligned_allocator auto_increment_allocator conditional_allocator retry_allocator buffered_allocator legacy_allocator)
 
 cc_library(aligned_allocator SRCS aligned_allocator.cc DEPS allocator)
 cc_library(auto_increment_allocator SRCS auto_increment_allocator.cc DEPS allocator)
diff --git a/paddle/fluid/memory/allocation/allocation_with_underlying.h b/paddle/fluid/memory/allocation/allocation_with_underlying.h
deleted file mode 100644
index 69f78667d7d33c59245a9890b9a2ce469f629450..0000000000000000000000000000000000000000
--- a/paddle/fluid/memory/allocation/allocation_with_underlying.h
+++ /dev/null
@@ -1,33 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include "paddle/fluid/memory/allocation/allocator.h"
-
-namespace paddle {
-namespace memory {
-namespace allocation {
-
-class AllocationWithUnderlying : public Allocation {
- public:
-  explicit AllocationWithUnderlying(AllocationPtr allocation)
-      : Allocation(allocation->ptr(), allocation->size(), allocation->place()),
-        allocation_(std::move(allocation)) {}
-
-  AllocationPtr allocation_;
-};
-
-}  // namespace allocation
-}  // namespace memory
-}  // namespace paddle
diff --git a/paddle/fluid/memory/allocation/allocator.h b/paddle/fluid/memory/allocation/allocator.h
index 5acdd9d0fcba2d74a4b72e94c81b0b570b17acb1..1fcd9361805aa8a494684fdafc19013338092791 100644
--- a/paddle/fluid/memory/allocation/allocator.h
+++ b/paddle/fluid/memory/allocation/allocator.h
@@ -52,7 +52,7 @@ class Allocator;
  * decorate a RetryAllocator to any allocator to perform allocation retry when
  * first allocation request fails.
  *
- * Explanations of Allocator design is as follows:
+ * Explanations of Allocator design are as follows:
  *
  * Suppose we have an allocator which is decorated by several allocators:
  *
@@ -127,8 +127,15 @@ class Allocation {
   size_t size_;
   platform::Place place_;
 
-  // NOTE(zjl): Since decorated_allocators_ is usually a small vector
-  // We reserve a small buffer to it to prevent frequent heap allocation
+  /**
+   * NOTE(zjl): Since decorated_allocators_ is usually a small vector,
+   * we reserve a small buffer for it to prevent frequent heap allocation.
+   *
+   * Instead, we could use a std::vector here and reserve
+   * kReserveAllocatorNum in the constructor of Allocation.
+   * But using std::vector makes the OCR recognition model fail in CE:
+   * the training duration is 8% slower than the KPI.
+   */
   static constexpr size_t kReserveAllocatorNum = 8;
   using DecoratedAllocatorStack =
       framework::InlinedVector<const Allocator*, kReserveAllocatorNum>;
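The InlinedVector mentioned in the new comment stores its first kReserveAllocatorNum elements in an in-object buffer and spills to the heap only beyond that. A rough sketch of the idea, assuming a simplified interface (the real container is framework::InlinedVector; the class and member names below are hypothetical):

#include <cstddef>
#include <vector>

template <typename T, size_t N>
class InlinedVectorSketch {
 public:
  void push_back(const T& item) {
    if (size_ < N) {
      inline_buf_[size_] = item;  // first N pushes touch no heap memory
    } else {
      heap_buf_.push_back(item);  // overflow spills to an ordinary vector
    }
    ++size_;
  }
  T& operator[](size_t i) { return i < N ? inline_buf_[i] : heap_buf_[i - N]; }
  size_t size() const { return size_; }

 private:
  T inline_buf_[N]{};           // in-object storage for the common case
  std::vector<T> heap_buf_;     // used only past N elements
  size_t size_{0};
};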
diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc
index 09328aded58cb0cccd9de0aba399f5c49313042f..1ff719c9e726ac5bc3f39cd552fe5c53b22147a0 100644
--- a/paddle/fluid/memory/allocation/allocator_facade.cc
+++ b/paddle/fluid/memory/allocation/allocator_facade.cc
@@ -29,7 +29,6 @@
 #include "paddle/fluid/memory/allocation/legacy_allocator.h"
 #include "paddle/fluid/memory/allocation/locked_allocator.h"
 #include "paddle/fluid/memory/allocation/retry_allocator.h"
-#include "paddle/fluid/memory/allocation/zero_size_allocator.h"
 #include "paddle/fluid/platform/cpu_info.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/place.h"
@@ -192,10 +191,6 @@ class CUDAPinnedChunkedAllocator : public ChunkedAllocator {
 
 class AllocatorFacadePrivate {
  public:
-  std::map<platform::Place, std::shared_ptr<Allocator>> allocators_;
-
-  ~AllocatorFacadePrivate() = default;
-
   AllocatorFacadePrivate() {
     auto strategy = GetAllocatorStrategy();
     switch (strategy) {
@@ -207,7 +202,6 @@ class AllocatorFacadePrivate {
         InitCPUAllocator();
         InitCUDAAllocator();
         InitCUDAPinnedAllocator();
-        WrapZeroSizeAllocator();
         break;
       }
       default: {
@@ -215,6 +209,18 @@ class AllocatorFacadePrivate {
                      static_cast<int>(strategy));
       }
     }
+    InitZeroSizeAllocators();
+  }
+
+  inline const std::shared_ptr<Allocator>& GetAllocator(
+      const platform::Place& place, size_t size) {
+    const auto& allocators = (size > 0 ? allocators_ : zero_size_allocators_);
+    auto iter = allocators.find(place);
+    if (iter == allocators.end()) {
+      throw BadAlloc(
+          string::Sprintf("No such allocator for the place, %s", place));
+    }
+    return iter->second;
   }
 
  private:
@@ -252,12 +258,40 @@ class AllocatorFacadePrivate {
 #endif
   }
 
-  void WrapZeroSizeAllocator() {
-    for (auto& pair : allocators_) {
-      pair.second =
-          std::make_shared<ZeroSizeAllocator>(pair.second, pair.first);
+  class ZeroSizeAllocator : public Allocator {
+   public:
+    explicit ZeroSizeAllocator(platform::Place place) : place_(place) {}
+
+   protected:
+    Allocation* AllocateImpl(size_t size, Allocator::Attr attr) override {
+      return new Allocation(nullptr, 0, place_);
+    }
+
+    void FreeImpl(Allocation* allocation) override { delete allocation; }
+
+   private:
+    platform::Place place_;
+  };
+
+  void InitZeroSizeAllocators() {
+    std::vector<platform::Place> places;
+    places.emplace_back(platform::CPUPlace());
+#ifdef PADDLE_WITH_CUDA
+    int device_count = platform::GetCUDADeviceCount();
+    for (int dev_id = 0; dev_id < device_count; ++dev_id) {
+      places.emplace_back(platform::CUDAPlace(dev_id));
+    }
+    places.emplace_back(platform::CUDAPinnedPlace());
+#endif
+
+    for (auto& p : places) {
+      zero_size_allocators_[p] = std::make_shared<ZeroSizeAllocator>(p);
     }
   }
+
+ private:
+  std::map<platform::Place, std::shared_ptr<Allocator>> allocators_;
+  std::map<platform::Place, std::shared_ptr<Allocator>> zero_size_allocators_;
 };
 
 // Pimpl. Make interface clean.
@@ -276,12 +310,7 @@ std::shared_ptr<Allocation> AllocatorFacade::AllocShared(
 
 AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, size_t size,
                                      Allocator::Attr attr) {
-  auto it = m_->allocators_.find(place);
-  if (it == m_->allocators_.end()) {
-    throw BadAlloc(
-        string::Sprintf("No such allocator for the place, %s", place));
-  }
-  return m_->allocators_.at(place)->Allocate(size, attr);
+  return m_->GetAllocator(place, size)->Allocate(size, attr);
 }
 
 }  // namespace allocation
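With GetAllocator() dispatching on size, a zero-byte request is answered by the per-place ZeroSizeAllocator and never reaches the real allocator chain. A hedged usage sketch (assuming the default Allocator::Attr; the comments describe the intended behavior and are not taken from the patch):

// Zero-byte requests yield a real Allocation object with a null pointer.
auto zero = AllocatorFacade::Instance().Alloc(platform::CPUPlace(), 0);
// zero->ptr() == nullptr && zero->size() == 0

// Non-zero requests go through allocators_ exactly as before.
auto buf = AllocatorFacade::Instance().Alloc(platform::CPUPlace(), 1024);
// buf->ptr() != nullptr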
diff --git a/paddle/fluid/memory/allocation/buffered_allocator.cc b/paddle/fluid/memory/allocation/buffered_allocator.cc
index e04c0aa34b1cd6200806cc2a012161e3478eca0b..2f3e6205c3c1713756bce254c947f9cd500e3d46 100644
--- a/paddle/fluid/memory/allocation/buffered_allocator.cc
+++ b/paddle/fluid/memory/allocation/buffered_allocator.cc
@@ -16,7 +16,6 @@
 #include <algorithm>
 #include <limits>
 #include <utility>
-#include "paddle/fluid/memory/allocation/allocation_with_underlying.h"
 
 namespace paddle {
 namespace memory {
diff --git a/paddle/fluid/memory/allocation/locked_allocator.cc b/paddle/fluid/memory/allocation/locked_allocator.cc
index c43099cc88f839ad92d36774d49aafd7192f916f..e9ec39c893255fe297f38e68eedaa68f3e6496b0 100644
--- a/paddle/fluid/memory/allocation/locked_allocator.cc
+++ b/paddle/fluid/memory/allocation/locked_allocator.cc
@@ -15,7 +15,6 @@
 #include "paddle/fluid/memory/allocation/locked_allocator.h"
 #include <mutex>  // NOLINT
 #include <utility>
-#include "paddle/fluid/memory/allocation/allocation_with_underlying.h"
 #include "paddle/fluid/platform/lock_guard_ptr.h"
 
 namespace paddle {
diff --git a/paddle/fluid/memory/allocation/retry_allocator.cc b/paddle/fluid/memory/allocation/retry_allocator.cc
index 7e888988f9602e362d73f64c1b45552e84e3349c..167dd923dbbe9d04861c015c013c1211046be76c 100644
--- a/paddle/fluid/memory/allocation/retry_allocator.cc
+++ b/paddle/fluid/memory/allocation/retry_allocator.cc
@@ -13,7 +13,6 @@
 // limitations under the License.
 
 #include "paddle/fluid/memory/allocation/retry_allocator.h"
-#include "paddle/fluid/memory/allocation/allocation_with_underlying.h"
 
 namespace paddle {
 namespace memory {
 namespace allocation {
diff --git a/paddle/fluid/memory/allocation/zero_size_allocator.cc b/paddle/fluid/memory/allocation/zero_size_allocator.cc
deleted file mode 100644
index 39743bcb10c700c9a8446b9040c8a8707d57ec7d..0000000000000000000000000000000000000000
--- a/paddle/fluid/memory/allocation/zero_size_allocator.cc
+++ /dev/null
@@ -1,43 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/memory/allocation/zero_size_allocator.h"
-
-namespace paddle {
-namespace memory {
-namespace allocation {
-
-bool ZeroSizeAllocator::IsAllocThreadSafe() const {
-  return underlying_allocator_->IsAllocThreadSafe();
-}
-
-Allocation *ZeroSizeAllocator::AllocateImpl(size_t size, Allocator::Attr attr) {
-  if (size == 0) {
-    return new Allocation(nullptr, 0, place_);
-  } else {
-    return underlying_allocator_->Allocate(size, attr).release();
-  }
-}
-
-void ZeroSizeAllocator::FreeImpl(Allocation *allocation) {
-  if (allocation->size() == 0) {
-    delete allocation;
-  } else {
-    underlying_allocator_->Free(allocation);
-  }
-}
-
-}  // namespace allocation
-}  // namespace memory
-}  // namespace paddle
- -#include "paddle/fluid/memory/allocation/zero_size_allocator.h" - -namespace paddle { -namespace memory { -namespace allocation { - -bool ZeroSizeAllocator::IsAllocThreadSafe() const { - return underlying_allocator_->IsAllocThreadSafe(); -} - -Allocation *ZeroSizeAllocator::AllocateImpl(size_t size, Allocator::Attr attr) { - if (size == 0) { - return new Allocation(nullptr, 0, place_); - } else { - return underlying_allocator_->Allocate(size, attr).release(); - } -} - -void ZeroSizeAllocator::FreeImpl(Allocation *allocation) { - if (allocation->size() == 0) { - delete allocation; - } else { - underlying_allocator_->Free(allocation); - } -} - -} // namespace allocation -} // namespace memory -} // namespace paddle diff --git a/paddle/fluid/memory/allocation/zero_size_allocator.h b/paddle/fluid/memory/allocation/zero_size_allocator.h deleted file mode 100644 index 08a7a06dbf290b55994a407fe478f792b0c0964a..0000000000000000000000000000000000000000 --- a/paddle/fluid/memory/allocation/zero_size_allocator.h +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include -#include -#include "paddle/fluid/memory/allocation/allocator.h" - -namespace paddle { -namespace memory { -namespace allocation { - -// The allocator handles the request's size is zero. Allocator will always -// return an allocation even the request size is zero. However, the -// allocation.ptr() is nullptr -class ZeroSizeAllocator : public Allocator { - public: - ZeroSizeAllocator(std::shared_ptr underlying_allocator, - const platform::Place& p) - : underlying_allocator_(std::move(underlying_allocator)), place_(p) {} - - bool IsAllocThreadSafe() const override; - - protected: - Allocation* AllocateImpl(size_t size, Allocator::Attr attr) override; - void FreeImpl(Allocation* allocation) override; - - private: - std::shared_ptr underlying_allocator_; - const platform::Place& place_; -}; - -} // namespace allocation -} // namespace memory -} // namespace paddle diff --git a/paddle/fluid/memory/memcpy.cc b/paddle/fluid/memory/memcpy.cc index 1408163e4b5278ddcd65eb4f2900109d772a589a..c08d86eb213310b4e8dbac541c254867bb44b903 100644 --- a/paddle/fluid/memory/memcpy.cc +++ b/paddle/fluid/memory/memcpy.cc @@ -15,6 +15,7 @@ limitations under the License. 
diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc
index d583909a666624d86031bb207154c93cf12d5cc2..f6295337d1f1042f021f7b0de15f476225beb3a2 100644
--- a/paddle/fluid/operators/batch_norm_op.cc
+++ b/paddle/fluid/operators/batch_norm_op.cc
@@ -454,6 +454,7 @@ class BatchNormGradKernel<platform::CPUDeviceContext, T>
       const auto *running_mean = ctx.Input<Tensor>("Mean");
       const auto *running_variance = ctx.Input<Tensor>("Variance");
       mean_data = running_mean->data<T>();
+      inv_var_tensor.Resize({C});
       T *running_inv_var_data = inv_var_tensor.mutable_data<T>(ctx.GetPlace());
       EigenVectorArrayMap<T> inv_var_tmp(running_inv_var_data, C);
       ConstEigenVectorArrayMap<T> var_arr(running_variance->data<T>(), C);
diff --git a/paddle/fluid/operators/fake_quantize_op.cu b/paddle/fluid/operators/fake_quantize_op.cu
index 3d24e8986df97d4ad5ad1dfc92a2e0a7eed38521..e9a7201bc0826414ec4adbd3bf2804db013a4571 100644
--- a/paddle/fluid/operators/fake_quantize_op.cu
+++ b/paddle/fluid/operators/fake_quantize_op.cu
@@ -264,8 +264,8 @@ struct FindRangeAbsMaxFunctor<platform::CUDADeviceContext, T> {
     T* out_scale_data = out_scale->mutable_data<T>(gpu_place);
 
     framework::Tensor need_find_max, out_size;
-    int* find_max = need_find_max.mutable_data<int>(gpu_place);
-    int* out_size_data = out_size.mutable_data<int>(gpu_place);
+    int* find_max = need_find_max.mutable_data<int>({1}, gpu_place);
+    int* out_size_data = out_size.mutable_data<int>({1}, gpu_place);
 
     FindRangeAbsMaxAndFillArray<T><<<1, 1, 0, ctx.stream()>>>(
         cur_scale.data<T>(), last_scale.data<T>(), iter.data<int>(),
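The fake_quantize change illustrates the pattern for freshly constructed tensors: mutable_data(place) alone no longer yields a usable buffer when the shape is unset, since numel() == 0 now maps to the nullptr zero-size allocation. A minimal sketch of the dims-taking overload used in the hunk above (the tensor name is illustrative; gpu_place is as in the surrounding code):

framework::Tensor flag;
// mutable_data<int>(gpu_place) on an unsized tensor would return nullptr;
// passing {1} first resizes to a single element, then allocates it.
int* flag_data = flag.mutable_data<int>({1}, gpu_place);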