diff --git a/benchmark/fluid/fluid_benchmark.py b/benchmark/fluid/fluid_benchmark.py
index ddd9fe809853a830ca676cc98f1819f683866def..b534de4a9c07f05cc1ae7b2cabfd6785c482f174 100644
--- a/benchmark/fluid/fluid_benchmark.py
+++ b/benchmark/fluid/fluid_benchmark.py
@@ -168,7 +168,7 @@ def train_parallel(train_args, test_args, args, train_prog, test_prog,
     startup_exe = fluid.Executor(place)
     startup_exe.run(startup_prog)
     strategy = fluid.ExecutionStrategy()
-    strategy.num_threads = args.cpus
+    strategy.num_threads = 0  # args.cpus
     strategy.allow_op_delay = False
     build_strategy = fluid.BuildStrategy()
     if args.reduce_strategy == "reduce":
@@ -187,6 +187,8 @@ def train_parallel(train_args, test_args, args, train_prog, test_prog,
         num_trainers = 1
         trainer_id = 0

+    print('Use parallel_executor')
+    strategy.type = 2
     exe = fluid.ParallelExecutor(
         True,
         avg_loss.name,
diff --git a/benchmark/fluid/models/resnet.py b/benchmark/fluid/models/resnet.py
index f692e7722a1c9a54a4509ce7c78cc68e1f28da74..947c497ce2b9becd1282b62187f743008985a17f 100644
--- a/benchmark/fluid/models/resnet.py
+++ b/benchmark/fluid/models/resnet.py
@@ -172,7 +172,7 @@ def get_model(args, is_train, main_prog, startup_prog):
     reader, dshape, class_dim = _model_reader_dshape_classdim(args, is_train)

     pyreader = None
-    trainer_count = int(os.getenv("PADDLE_TRAINERS"))
+    trainer_count = int(os.getenv("PADDLE_TRAINERS", 1))
     with fluid.program_guard(main_prog, startup_prog):
         with fluid.unique_name.guard():
             if args.use_reader_op:
diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt
index 664b3460252aeb711cdab2a5940cd9987a26ed6a..5620b30f5a6466a23263c716663dfb7a84869636 100644
--- a/paddle/fluid/memory/allocation/CMakeLists.txt
+++ b/paddle/fluid/memory/allocation/CMakeLists.txt
@@ -48,8 +48,11 @@ cc_library(allocator_facade SRCS allocator_facade.cc DEPS
         auto_increment_allocator
         zero_size_allocator
         conditional_allocator
+        retry_allocator
         cuda_device_guard)

 nv_test(allocation_and_eigen_test SRCS allocation_and_eigen_test.cu DEPS allocator_facade)

 cc_test(retry_allocator_test SRCS retry_allocator_test.cc DEPS retry_allocator naive_managed_allocator best_fit_allocator locked_allocator cpu_allocator)
+
+cc_test(allocator_facade_test SRCS allocator_facade_test.cc DEPS allocator_facade)
diff --git a/paddle/fluid/memory/allocation/aligned_allocator.cc b/paddle/fluid/memory/allocation/aligned_allocator.cc
index 98b4b035861fb3bfe1531ecbf780aef395789606..ffaeadcbdc6a8c8e390934355ee82a00e80c098b 100644
--- a/paddle/fluid/memory/allocation/aligned_allocator.cc
+++ b/paddle/fluid/memory/allocation/aligned_allocator.cc
@@ -26,6 +26,11 @@ std::shared_ptr<Allocation> ThinAlignedAllocator::AllocateShared(
     size_t size, Allocator::Attr attr) {
   return std::shared_ptr<Allocation>(Allocate(size, attr).release());
 }
+
+bool ThinAlignedAllocator::IsAllocThreadSafe() const {
+  return underlying_allocator_->IsAllocThreadSafe();
+}
+
 }  // namespace allocation
 }  // namespace memory
 }  // namespace paddle
diff --git a/paddle/fluid/memory/allocation/aligned_allocator.h b/paddle/fluid/memory/allocation/aligned_allocator.h
index 13c69c153a2155750fcf4edee151ba277d7a29b4..529943dc3da89187a0a360968f16292e62739f9a 100644
--- a/paddle/fluid/memory/allocation/aligned_allocator.h
+++ b/paddle/fluid/memory/allocation/aligned_allocator.h
@@ -77,6 +77,8 @@ class ThinAlignedAllocator : public ManagedAllocator {

   std::shared_ptr<Allocation> AllocateShared(size_t size, Attr attr) override;

+  bool IsAllocThreadSafe() const;
+
  protected:
   std::shared_ptr<ManagedAllocator> underlying_allocator_;
 };
diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc
index 4f07c1610dc3ef45e5bff6df32a71d4af9c55243..02ea5d7e783d1bbfa0f9a2005dfc9e473ef8f4c6 100644
--- a/paddle/fluid/memory/allocation/allocator_facade.cc
+++ b/paddle/fluid/memory/allocation/allocator_facade.cc
@@ -13,7 +13,9 @@
 // limitations under the License.

 #include "paddle/fluid/memory/allocation/allocator.h"
+#include <gflags/gflags.h>
 #include <map>
+#include <unordered_map>
 #include <vector>
 #include "paddle/fluid/memory/allocation/aligned_allocator.h"
 #include "paddle/fluid/memory/allocation/allocator_facade.h"
@@ -24,6 +26,7 @@
 #include "paddle/fluid/memory/allocation/locked_allocator.h"
 #include "paddle/fluid/memory/allocation/naive_managed_allocator.h"
 #include "paddle/fluid/memory/allocation/pinned_allocator.h"
+#include "paddle/fluid/memory/allocation/retry_allocator.h"
 #include "paddle/fluid/memory/allocation/zero_size_allocator.h"
 #include "paddle/fluid/platform/cuda_device_guard.h"
 #include "paddle/fluid/platform/gpu_info.h"
@@ -32,6 +35,11 @@
 #include "paddle/fluid/memory/allocation/cuda_allocator.h"
 #endif

+DEFINE_int32(
+    gpu_allocator_retry_time, 0,
+    "The retry time (milliseconds) when allocator fails "
+    "to allocate memory. No retry if this value is not greater than 0");
+
 namespace paddle {
 namespace memory {
 namespace allocation {
@@ -60,6 +68,7 @@ class CPUManagedAllocator : public ManagedAllocator {
       return normal_allocator_->AllocateShared(size, attr);
     }
   }
+
   bool IsAllocThreadSafe() const override { return true; }

  private:
@@ -86,8 +95,12 @@ class CUDAManagedAllocator : public ManagedAllocator {
     size_t capacity = available / max_chunk_size_;

     if (capacity == 1) {
+      VLOG(10) << "Create BestFitAllocator with chunk_size "
+               << max_chunk_size_;
       default_allocator_ = BestFitAllocatorCreator();
     } else {
+      VLOG(10) << "Create AutoIncrementAllocator with chunk_size "
+               << max_chunk_size_ << " and capacity " << capacity;
       default_allocator_ = std::make_shared<AutoIncrementAllocator>(
           [this] { return std::move(BestFitAllocatorCreator()); }, capacity);
     }
@@ -116,6 +129,7 @@ class CUDAManagedAllocator : public ManagedAllocator {
   std::unique_ptr<Allocation> Allocate(size_t size, Attr attr) override {
     return default_allocator_->Allocate(size, attr);
   }
+
   std::shared_ptr<Allocation> AllocateShared(size_t size, Attr attr) override {
     return default_allocator_->AllocateShared(size, attr);
   }
@@ -123,10 +137,20 @@ class CUDAManagedAllocator : public ManagedAllocator {
   std::shared_ptr<ManagedAllocator> BestFitAllocatorCreator() {
     chunks_.emplace_back(raw_allocator_->Allocate(max_chunk_size_));
     auto* allocation = chunks_.back().get();
-    return std::make_shared<AlignedAllocator<64u>>(
-        NaiveManagedAllocator::Create(std::unique_ptr<Allocator>(
-            new LockedAllocator(std::unique_ptr<Allocator>(
-                new BestFitAllocator(allocation))))));
+    std::unique_ptr<Allocator> unmanaged_allocator(new LockedAllocator(
+        std::unique_ptr<Allocator>(new BestFitAllocator(allocation))));
+
+    if (FLAGS_gpu_allocator_retry_time <= 0) {
+      VLOG(10) << "Create NaiveManagedAllocator without retry";
+      return std::make_shared<AlignedAllocator<64u>>(
+          NaiveManagedAllocator::Create(std::move(unmanaged_allocator)));
+    } else {
+      VLOG(10) << "Create RetryAllocator with retry_time "
+               << FLAGS_gpu_allocator_retry_time << "ms";
+      return std::make_shared<AlignedAllocator<64u>>(RetryAllocator::Create(
+          std::move(unmanaged_allocator),
+          static_cast<size_t>(FLAGS_gpu_allocator_retry_time)));
+    }
   }

   bool IsAllocThreadSafe() const override { return true; }
@@ -141,7 +165,8 @@

 class AllocatorFacadePrivate {
  public:
-  std::map<platform::Place, std::shared_ptr<ManagedAllocator>> allocators_;
+  std::unordered_map<platform::Place, std::shared_ptr<ManagedAllocator>>
+      allocators_;

   ~AllocatorFacadePrivate() = default;
@@ -184,13 +209,13 @@ AllocatorFacade& AllocatorFacade::Instance() {

 std::shared_ptr<Allocation> AllocatorFacade::AllocShared(
     const platform::Place& place, size_t size, Allocator::Attr attr) {
-  return m_->allocators_[place]->AllocateShared(size, attr);
+  return m_->allocators_.at(place)->AllocateShared(size, attr);
 }

 std::unique_ptr<Allocation> AllocatorFacade::Alloc(const platform::Place& place,
                                                    size_t size,
                                                    Allocator::Attr attr) {
-  return m_->allocators_[place]->Allocate(size, attr);
+  return m_->allocators_.at(place)->Allocate(size, attr);
 }

 }  // namespace allocation
diff --git a/paddle/fluid/memory/allocation/allocator_facade_test.cc b/paddle/fluid/memory/allocation/allocator_facade_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5185bf944460f6b36677bdd77c33048b79b6e6f2
--- /dev/null
+++ b/paddle/fluid/memory/allocation/allocator_facade_test.cc
@@ -0,0 +1,54 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/memory/allocation/allocator_facade.h"
+#include <gflags/gflags.h>
+#include <gtest/gtest.h>
+
+DECLARE_double(fraction_of_gpu_memory_to_use);
+DECLARE_int32(gpu_allocator_retry_time);
+
+namespace paddle {
+namespace memory {
+namespace allocation {
+
+TEST(allocator, allocator) {
+  FLAGS_fraction_of_gpu_memory_to_use = 0.01;
+  FLAGS_gpu_allocator_retry_time = 500;
+
+  auto &instance = AllocatorFacade::Instance();
+
+  {
+    auto cpu_allocation = instance.Alloc(platform::CPUPlace(), 1024);
+    ASSERT_NE(cpu_allocation, nullptr);
+  }
+
+  {
+    auto gpu_allocation = instance.Alloc(platform::CUDAPlace(0), 1024);
+    ASSERT_NE(gpu_allocation, nullptr);
+  }
+
+  {
+    // Allocate 2GB gpu memory
+    auto gpu_allocation = instance.Alloc(platform::CUDAPlace(0),
+                                         2 * static_cast<size_t>(1 << 30));
+    ASSERT_NE(gpu_allocation, nullptr);
+  }
+
+  {}
+}
+
+}  // namespace allocation
+}  // namespace memory
+}  // namespace paddle
diff --git a/paddle/fluid/platform/place.h b/paddle/fluid/platform/place.h
index e3ee504f3d042d6a99036e34507c4c8bee306750..745a79014a7aa9fe16e4e8d637d944e971422f97 100644
--- a/paddle/fluid/platform/place.h
+++ b/paddle/fluid/platform/place.h
@@ -13,6 +13,7 @@
 See the License for the specific language governing permissions and
 limitations under the License. */

 #pragma once
+#include <functional>
 #include <iostream>
 #include <vector>
@@ -130,5 +131,65 @@ typename Visitor::result_type VisitPlace(const Place &place,
   return boost::apply_visitor(PlaceVisitorWrapper(visitor), place);
 }

+struct PlaceHashVisitor : public boost::static_visitor<size_t> {
+  template <typename Place>
+  inline size_t operator()(const Place &place) const {
+    return place.hash();
+  }
+};
+
 }  // namespace platform
 }  // namespace paddle
+
+namespace std {
+
+template <>
+struct hash<::paddle::platform::CPUPlace> {
+  using argument_type = ::paddle::platform::CPUPlace;
+  using result_type = size_t;
+
+  constexpr inline result_type operator()(const argument_type &place) const {
+    return static_cast<size_t>(-1);
+  }
+};
+
+template <>
+struct hash<::paddle::platform::CUDAPlace> {
+  using argument_type = ::paddle::platform::CUDAPlace;
+  using result_type = size_t;
+
+  inline result_type operator()(const argument_type &place) const {
+    return static_cast<size_t>(place.device);
+  }
+};
+
+template <>
+struct hash<::paddle::platform::CUDAPinnedPlace> {
+  using argument_type = ::paddle::platform::CUDAPinnedPlace;
+  using result_type = size_t;
+
+  constexpr inline result_type operator()(const argument_type &place) const {
+    return static_cast<size_t>(-2);
+  }
+};
+
+namespace {  // NOLINT
+struct PlaceHashVisitor : public boost::static_visitor<size_t> {
+  template <typename Place>
+  inline size_t operator()(const Place &place) const {
+    return std::hash<Place>()(place);
+  }
+};
+}  // namespace
+
+template <>
+struct hash<::paddle::platform::Place> {
+  using argument_type = ::paddle::platform::Place;
+  using result_type = size_t;
+
+  inline result_type operator()(const argument_type &place) const {
+    return boost::apply_visitor(PlaceHashVisitor(), place);
+  }
+};
+
+}  // namespace std
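
Reviewer note (not part of the patch): the std::hash specializations above are what allow AllocatorFacadePrivate to switch its allocator table from std::map to std::unordered_map keyed by platform::Place. Hashing a Place visits the concrete place type via boost::apply_visitor and reuses its per-type hash, while boost::variant already supplies the operator== needed for key comparison. A minimal sketch of the idea, assuming it is built inside the Paddle tree; main() and the names map here are purely illustrative:

    #include <string>
    #include <unordered_map>
    #include "paddle/fluid/platform/place.h"

    int main() {
      namespace plat = paddle::platform;
      // std::hash<plat::Place> dispatches through boost::apply_visitor to the
      // per-type hashes: CPUPlace -> size_t(-1), CUDAPlace -> its device id,
      // CUDAPinnedPlace -> size_t(-2).
      std::unordered_map<plat::Place, std::string> names;
      names[plat::CPUPlace()] = "cpu";
      names[plat::CUDAPlace(0)] = "cuda:0";
      // boost::variant provides operator==, so lookups behave as expected.
      return names.count(plat::CUDAPlace(0)) == 1 ? 0 : 1;
    }

Similarly, the new gpu_allocator_retry_time flag can be exercised the way allocator_facade_test.cc does: set FLAGS_gpu_allocator_retry_time (or pass --gpu_allocator_retry_time on the command line) before the first allocation, and BestFitAllocatorCreator wraps each chunk allocator in a RetryAllocator instead of a plain NaiveManagedAllocator.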