diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7fda5d460d54e1db5cbaba45fd73ceb6ef5a7180..c31f51a3f7371bd7b1b0ca3234091a35868806ce 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -294,7 +294,7 @@ if(WITH_PSLIB)
     list(APPEND EXTERNAL_LIBS pslib_brpc)
     list(APPEND EXTERNAL_LIBS libmct)
 endif(WITH_PSLIB)
-
+    
 if(WITH_AMD_GPU)
     find_package(HIP)
     include(hip)
diff --git a/cmake/external/robin_map.cmake b/cmake/external/robin_map.cmake
deleted file mode 100644
index ddaf59536cb2c493cf77f689e0e33a03e2e59ec4..0000000000000000000000000000000000000000
--- a/cmake/external/robin_map.cmake
+++ /dev/null
@@ -1,31 +0,0 @@
-include(ExternalProject)
-
-set(ROBIN_MAP_SOURCE_DIR ${THIRD_PARTY_PATH}/robin_map)
-set(ROBIN_MAP_INCLUDE_DIR ${ROBIN_MAP_SOURCE_DIR}/src/extern_robin_map/include)
-
-include_directories(${ROBIN_MAP_INCLUDE_DIR})
-
-ExternalProject_Add(
-  extern_robin_map
-  ${EXTERNAL_PROJECT_LOG_ARGS}
-  GIT_REPOSITORY "https://github.com/Tessil/robin-map.git"
-  GIT_TAG        "v0.5.0"
-  PREFIX         ${ROBIN_MAP_SOURCE_DIR}
-  UPDATE_COMMAND ""
-  CONFIGURE_COMMAND ""
-  BUILD_COMMAND     ""
-  INSTALL_COMMAND   ""
-  TEST_COMMAND      ""
-)
-
-if(${CMAKE_VERSION} VERSION_LESS "3.3.0")
-  set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/robin_map_dummy.c)
-  file(WRITE ${dummyfile} "const char *dummy = \"${dummyfile}\";")
-  add_library(robin_map STATIC ${dummyfile})
-else()
-  add_library(robin_map INTERFACE)
-endif()
-
-add_dependencies(robin_map extern_robin_map)
-
-LIST(APPEND externl_project_dependencies robin_map)
diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index 10a637af445c45baaa013a2a1217c6a23f794638..412bc9cbe88b860a698d17e239d7b94d8956b781 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -83,7 +83,7 @@ cc_test(variable_test SRCS variable_test.cc)
 cc_library(threadpool SRCS threadpool.cc DEPS enforce)
 cc_test(threadpool_test SRCS threadpool_test.cc DEPS threadpool)
 
-cc_library(scope SRCS scope.cc DEPS glog threadpool xxhash)
+cc_library(scope SRCS scope.cc DEPS glog threadpool)
 cc_test(scope_test SRCS scope_test.cc DEPS scope)
 
 cc_library(data_device_transform SRCS data_device_transform.cc DEPS tensor)
diff --git a/paddle/fluid/framework/details/execution_strategy.h b/paddle/fluid/framework/details/execution_strategy.h
index 37b07e5736312b3050debe745f2d3c108469c5d6..15c496130c2b6c7643ff96661be09e5ac4870344 100644
--- a/paddle/fluid/framework/details/execution_strategy.h
+++ b/paddle/fluid/framework/details/execution_strategy.h
@@ -25,7 +25,7 @@ struct ExecutionStrategy {
   size_t num_threads_{0};
   bool use_cuda_{true};
   bool allow_op_delay_{false};
-  size_t num_iteration_per_drop_scope_{1};
+  size_t num_iteration_per_drop_scope_{100};
   ExecutorType type_{kDefault};
   bool dry_run_{false};
 };
diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
index ea783c609081cd4a3b2deb83737939e250492361..57f6fc66c57e2a53d9cf30d7761626a50bc379ea 100644
--- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
@@ -64,24 +64,21 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run(
   }
 
   platform::RecordEvent e("ScopeBufferedSSAGraphExecutorAfterRun", nullptr);
-  ++drop_scope_counter_;
+  drop_scope_counter_ += 1;
 
-  if (!fetch_tensors.empty()) {
+  if (!fetch_tensors.empty() ||
+      drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_) {
+    drop_scope_counter_ = 0;
     // Wait All computational streams
     for (auto p : places_) {
       platform::DeviceContextPool::Instance().Get(p)->Wait();
     }
-  }
-
-  if (drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_) {
-    drop_scope_counter_ = 0;
     for (auto &scope : local_scopes_) {
       auto &local_scope =
           *scope->Var(details::kLocalExecScopeName)->GetMutable<Scope *>();
       scope->DeleteScope(local_scope);
     }
   }
-
   if (eptr) {
     std::rethrow_exception(eptr);
   } else {
diff --git a/paddle/fluid/framework/ir/graph.cc b/paddle/fluid/framework/ir/graph.cc
index 8e67f8f6104cdaa8227c34351c6355ab8fc81b09..8670dcfed7e40473e06cd12cecc1157dd4f54aa0 100644
--- a/paddle/fluid/framework/ir/graph.cc
+++ b/paddle/fluid/framework/ir/graph.cc
@@ -20,10 +20,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/framework/var_desc.h"
 
-DEFINE_bool(enforce_when_check_program, true,
-            "Checking whether the program is correct or not. We will log "
-            "errors rather than throwing exceptions if this flag turned off");
-
 namespace paddle {
 namespace framework {
 namespace ir {
@@ -48,56 +44,27 @@ void CheckProgram(const ProgramDesc &program) {
         break;
       case _INT(OpRole::kBackward):
       case _INT(OpRole::kBackward) | _INT(OpRole::kLoss):
-        if (!FLAGS_enforce_when_check_program) {
-          PADDLE_ENFORCE(
-              visit.find(_INT(OpRole::kOptimize)) == visit.end(),
-              "Cannot add backward operator %s after optimize operator.",
-              op->Type());
-        } else {
-          if (visit.find(_INT(OpRole::kOptimize)) != visit.end()) {
-            LOG(ERROR)
-                << "Cannot add backward operator %s after optimize operator."
-                << op->Type();
-          }
-        }
+        PADDLE_ENFORCE(
+            visit.find(_INT(OpRole::kOptimize)) == visit.end(),
+            "Cannot add backward operator %s after optimize operator.",
+            op->Type());
         break;
       case _INT(OpRole::kForward) | _INT(OpRole::kLoss):
-        if (!FLAGS_enforce_when_check_program) {
-          PADDLE_ENFORCE(visit.find(_INT(OpRole::kBackward) |
-                                    _INT(OpRole::kLoss)) == visit.end(),
-                         "Cannot add backward|loss operator before "
-                         "forward|loss operator %s.",
-                         op->Type());
-          PADDLE_ENFORCE(
-              visit.find(_INT(OpRole::kOptimize)) == visit.end(),
-              "Cannot add forward|loss operator %s after optimize operator.",
-              op->Type());
-        } else {
-          if (visit.find(_INT(OpRole::kBackward) | _INT(OpRole::kLoss)) !=
-              visit.end()) {
-            LOG(ERROR) << "Cannot add backward|loss operator before "
-                       << "forward|loss operator %s." << op->Type();
-          }
-
-          if (visit.find(_INT(OpRole::kOptimize)) != visit.end()) {
-            LOG(ERROR) << "Cannot add forward|loss operator %s after optimize "
-                          "operator."
-                       << op->Type();
-          }
-        }
+        PADDLE_ENFORCE(visit.find(_INT(OpRole::kBackward) |
+                                  _INT(OpRole::kLoss)) == visit.end(),
+                       "Cannot add backward|loss operator before "
+                       "forward|loss operator %s.",
+                       op->Type());
+        PADDLE_ENFORCE(
+            visit.find(_INT(OpRole::kOptimize)) == visit.end(),
+            "Cannot add forward|loss operator %s after optimize operator.",
+            op->Type());
         break;
       case _INT(OpRole::kOptimize):
       case _INT(OpRole::kOptimize) | _INT(OpRole::kLRSched):
-        if (!FLAGS_enforce_when_check_program) {
-          PADDLE_ENFORCE(visit.find(_INT(OpRole::kBackward)) != visit.end(),
-                         "Optimize operators %s must follow backward operator.",
-                         op->Type());
-        } else {
-          if (visit.find(_INT(OpRole::kBackward)) == visit.end()) {
-            LOG(ERROR) << "Optimize operators %s must follow backward operator."
-                       << op->Type();
-          }
-        }
+        PADDLE_ENFORCE(visit.find(_INT(OpRole::kBackward)) != visit.end(),
+                       "Optimize operators %s must follow backward operator.",
+                       op->Type());
         break;
       case _INT(OpRole::kLRSched):
       case _INT(OpRole::kDist):
diff --git a/paddle/fluid/framework/rw_lock.h b/paddle/fluid/framework/rw_lock.h
index 75e6bef9bf33d438b7b0ee7f6df8d3fa6732fcfa..dbf00f3a79f7d1dcf97b346fccfdb68f119d4aa3 100644
--- a/paddle/fluid/framework/rw_lock.h
+++ b/paddle/fluid/framework/rw_lock.h
@@ -16,9 +16,7 @@ limitations under the License. */
 
 #if !defined(_WIN32)
 #include <pthread.h>
-#else
-#include <mutex>  // NOLINT
-#endif            // !_WIN32
+#endif  // !_WIN32
 
 #include "paddle/fluid/platform/enforce.h"
 
@@ -31,17 +29,17 @@ struct RWLock {
 
   ~RWLock() { pthread_rwlock_destroy(&lock_); }
 
-  inline void RDLock() {
+  void RDLock() {
     PADDLE_ENFORCE_EQ(pthread_rwlock_rdlock(&lock_), 0,
                       "acquire read lock failed");
   }
 
-  inline void WRLock() {
+  void WRLock() {
     PADDLE_ENFORCE_EQ(pthread_rwlock_wrlock(&lock_), 0,
                       "acquire write lock failed");
   }
 
-  inline void UNLock() {
+  void UNLock() {
     PADDLE_ENFORCE_EQ(pthread_rwlock_unlock(&lock_), 0, "unlock failed");
   }
 
@@ -53,44 +51,81 @@ struct RWLock {
 // https://stackoverflow.com/questions/7125250/making-pthread-rwlock-wrlock-recursive
 // In windows, rw_lock seems like a hack. Use empty object and do nothing.
 struct RWLock {
-  // FIXME(minqiyang): use mutex here to do fake lock
-  inline void RDLock() { mutex_.lock(); }
-
-  inline void WRLock() { mutex_.lock(); }
-
-  inline void UNLock() { mutex_.unlock(); }
-
- private:
-  std::mutex mutex_;
+  void RDLock() {}
+  void WRLock() {}
+  void UNLock() {}
 };
 #endif
 
-class AutoWRLock {
+class RWLockGuard {
  public:
-  explicit AutoWRLock(RWLock* rw_lock) : lock_(rw_lock) { Lock(); }
-
-  inline void Lock() { lock_->WRLock(); }
-
-  inline void UnLock() { lock_->UNLock(); }
-
-  ~AutoWRLock() { UnLock(); }
-
- private:
-  RWLock* lock_;
-};
+  enum Status { kUnLock, kWRLock, kRDLock };
+
+  RWLockGuard(RWLock* rw_lock, Status init_status)
+      : lock_(rw_lock), status_(Status::kUnLock) {
+    switch (init_status) {
+      case Status::kRDLock: {
+        RDLock();
+        break;
+      }
+      case Status::kWRLock: {
+        WRLock();
+        break;
+      }
+      case Status::kUnLock: {
+        break;
+      }
+    }
+  }
 
-class AutoRDLock {
- public:
-  explicit AutoRDLock(RWLock* rw_lock) : lock_(rw_lock) { Lock(); }
+  void WRLock() {
+    switch (status_) {
+      case Status::kUnLock: {
+        lock_->WRLock();
+        status_ = Status::kWRLock;
+        break;
+      }
+      case Status::kWRLock: {
+        break;
+      }
+      case Status::kRDLock: {
+        PADDLE_THROW(
+            "Please unlock read lock first before invoking write lock.");
+        break;
+      }
+    }
+  }
 
-  inline void Lock() { lock_->RDLock(); }
+  void RDLock() {
+    switch (status_) {
+      case Status::kUnLock: {
+        lock_->RDLock();
+        status_ = Status::kRDLock;
+        break;
+      }
+      case Status::kRDLock: {
+        break;
+      }
+      case Status::kWRLock: {
+        PADDLE_THROW(
+            "Please unlock write lock first before invoking read lock.");
+        break;
+      }
+    }
+  }
 
-  inline void UnLock() { lock_->UNLock(); }
+  void UnLock() {
+    if (status_ != Status::kUnLock) {
+      lock_->UNLock();
+      status_ = Status::kUnLock;
+    }
+  }
 
-  ~AutoRDLock() { UnLock(); }
+  ~RWLockGuard() { UnLock(); }
 
  private:
   RWLock* lock_;
+  Status status_;
 };
 
 }  // namespace framework
diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc
index 4f79d9826099e8cb19b6a21a7723c75baa1745f9..6fa5e99f9f3a7e871f1a742a30803853988ea6eb 100644
--- a/paddle/fluid/framework/scope.cc
+++ b/paddle/fluid/framework/scope.cc
@@ -47,15 +47,9 @@ DEFINE_bool(fast_eager_deletion_mode, false,
 // the mutex will cause serious performance issue.
 // So the mutex is disabled when `ON_INFER`.
 #ifdef PADDLE_ON_INFERENCE
-#define SCOPE_KIDS_READER_LOCK
-#define SCOPE_KIDS_WRITER_LOCK
-#define SCOPE_VARS_READER_LOCK
-#define SCOPE_VARS_WRITER_LOCK
+#define SCOPE_LOCK_GUARD
 #else
-#define SCOPE_KIDS_READER_LOCK AutoRDLock auto_lock(&kids_lock_);
-#define SCOPE_KIDS_WRITER_LOCK AutoWRLock auto_lock(&kids_lock_);
-#define SCOPE_VARS_READER_LOCK AutoRDLock auto_lock(&vars_lock_);
-#define SCOPE_VARS_WRITER_LOCK AutoWRLock auto_lock(&vars_lock_);
+#define SCOPE_LOCK_GUARD std::lock_guard<std::mutex> lock(mutex_);
 #endif
 
 namespace paddle {
@@ -73,69 +67,64 @@ bool IsFastEagerDeletionModeEnabled() { return FLAGS_fast_eager_deletion_mode; }
 Scope::~Scope() { DropKids(); }
 
 Scope& Scope::NewScope() const {
-  Scope* child = new Scope(this);
-  {
-    SCOPE_KIDS_WRITER_LOCK
-    kids_.push_back(child);
-  }
-  return *child;
+  SCOPE_LOCK_GUARD
+  kids_.push_back(new Scope(this));
+  return *kids_.back();
 }
 
 Variable* Scope::Var(const std::string& name) {
-  SCOPE_VARS_WRITER_LOCK
+  SCOPE_LOCK_GUARD
   return VarInternal(name);
 }
 
 Variable* Scope::Var(std::string* name) {
+  SCOPE_LOCK_GUARD
   auto new_name = string::Sprintf("%p.%d", this, vars_.size());
   if (name != nullptr) {
     *name = new_name;
   }
-  SCOPE_VARS_WRITER_LOCK
   return VarInternal(new_name);
 }
 
 Variable* Scope::FindVar(const std::string& name) const {
-  SCOPE_VARS_READER_LOCK
+  SCOPE_LOCK_GUARD
   return FindVarInternal(name);
 }
 
 Variable* Scope::FindLocalVar(const std::string& name) const {
-  SCOPE_VARS_READER_LOCK
+  SCOPE_LOCK_GUARD
   return FindVarLocally(name);
 }
 
 const Scope* Scope::FindScope(const Variable* var) const {
-  SCOPE_VARS_READER_LOCK
+  SCOPE_LOCK_GUARD
   return FindScopeInternal(var);
 }
 
 void Scope::DropKids() {
-  SCOPE_KIDS_WRITER_LOCK
+  SCOPE_LOCK_GUARD
   for (Scope* s : kids_) delete s;
   kids_.clear();
 }
 
 bool Scope::HasKid(const Scope* scope) const {
-  SCOPE_KIDS_READER_LOCK
+  SCOPE_LOCK_GUARD
   auto it = std::find(this->kids_.begin(), this->kids_.end(), scope);
   return it != this->kids_.end();
 }
 
 std::vector<std::string> Scope::LocalVarNames() const {
+  SCOPE_LOCK_GUARD
   std::vector<std::string> known_vars;
-  {
-    SCOPE_VARS_READER_LOCK
-    known_vars.reserve(this->vars_.size());
-    for (auto& p : vars_) {
-      known_vars.emplace_back(p.first);
-    }
+  known_vars.reserve(this->vars_.size());
+  for (auto& p : vars_) {
+    known_vars.emplace_back(p.first);
   }
   return known_vars;
 }
 
 void Scope::DeleteScope(Scope* scope) const {
-  SCOPE_KIDS_WRITER_LOCK
+  SCOPE_LOCK_GUARD
   auto it = std::find(this->kids_.begin(), this->kids_.end(), scope);
   PADDLE_ENFORCE(it != this->kids_.end(), "%p Cannot find %p as kid scope",
                  this, scope);
@@ -149,8 +138,8 @@ void Scope::DeleteScope(Scope* scope) const {
 }
 
 void Scope::EraseVars(const std::vector<std::string>& var_names) {
+  SCOPE_LOCK_GUARD
   std::set<std::string> var_set(var_names.begin(), var_names.end());
-  SCOPE_VARS_WRITER_LOCK
   for (auto it = vars_.begin(); it != vars_.end();) {
     if (var_set.find(it->first) != var_set.end()) {
       it = vars_.erase(it);
@@ -162,12 +151,12 @@ void Scope::EraseVars(const std::vector<std::string>& var_names) {
 
 void Scope::Rename(const std::string& origin_name,
                    const std::string& new_name) const {
-  SCOPE_VARS_WRITER_LOCK
+  SCOPE_LOCK_GUARD
   RenameInternal(origin_name, new_name);
 }
 
 std::string Scope::Rename(const std::string& origin_name) const {
-  SCOPE_VARS_WRITER_LOCK
+  SCOPE_LOCK_GUARD
   auto new_name = string::Sprintf("%p.%d", this, vars_.size());
   RenameInternal(origin_name, new_name);
   return new_name;
diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h
index 77ef18414d09cf8e42a7f2a6570c8cd1ef496287..aded1f771cedbf2442ad36d7fab3e6e6caffdc24 100644
--- a/paddle/fluid/framework/scope.h
+++ b/paddle/fluid/framework/scope.h
@@ -14,19 +14,12 @@ limitations under the License. */
 
 #pragma once
 
-extern "C" {
-#include <xxhash.h>
-}
-
-#include <functional>
 #include <list>
-#include <memory>
+#include <mutex>  // NOLINT
 #include <string>
 #include <unordered_map>
-#include <utility>
 #include <vector>
 
-#include "paddle/fluid/framework/rw_lock.h"
 #include "paddle/fluid/framework/variable.h"
 #include "paddle/fluid/platform/macros.h"
 
@@ -38,14 +31,6 @@ bool IsFastEagerDeletionModeEnabled();
 
 class Scope;
 
-namespace inner {
-struct KeyHasher {
-  std::size_t operator()(const std::string& key) const {
-    return XXH32(key.c_str(), key.size(), 1);
-  }
-};
-}  // namespace inner
-
 /**
  * @brief Scope that manage all variables.
  *
@@ -110,14 +95,7 @@ class Scope {
   std::string Rename(const std::string& origin_name) const;
 
  protected:
-  mutable std::unordered_map<std::string, std::unique_ptr<Variable>,
-                             inner::KeyHasher>
-      vars_;
-  // mutable tsl::robin_map<
-  // std::string, std::unique_ptr<Variable>, std::hash<std::string>,
-  // std::equal_to<std::string>,
-  // std::allocator<std::pair<std::string, std::unique_ptr<Variable>>>, true>
-  // vars_;
+  mutable std::unordered_map<std::string, std::unique_ptr<Variable>> vars_;
 
  private:
   // Call Scope::NewScope for a sub-scope.
@@ -146,8 +124,7 @@ class Scope {
   DISABLE_COPY_AND_ASSIGN(Scope);
 
  private:
-  mutable RWLock kids_lock_;
-  mutable RWLock vars_lock_;
+  mutable std::mutex mutex_;
 };
 
 // Generate some debug string about the inherience structure of scope, quite
diff --git a/paddle/fluid/framework/spin_lock.h b/paddle/fluid/framework/spin_lock.h
deleted file mode 100644
index 11a763d655abb0f422cb2647a820c80e79f33f1b..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/spin_lock.h
+++ /dev/null
@@ -1,71 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#if !defined(_WIN32)
-#include <pthread.h>
-#else
-#include <mutex>  // NOLINT
-#endif            // !_WIN32
-
-#include "paddle/fluid/platform/enforce.h"
-
-namespace paddle {
-namespace framework {
-
-#if !defined(_WIN32)
-struct SpinLock {
-  SpinLock() { pthread_spin_init(&lock_, PTHREAD_PROCESS_PRIVATE); }
-
-  ~SpinLock() { pthread_spin_destroy(&lock_); }
-
-  void Lock() {
-    PADDLE_ENFORCE_EQ(pthread_spin_lock(&lock_), 0, "acquire spin lock failed");
-  }
-
-  void Unlock() {
-    PADDLE_ENFORCE_EQ(pthread_spin_unlock(&lock_), 0,
-                      "release spin lock failed");
-  }
-
- private:
-  pthread_spinlock_t lock_;
-};
-#else
-// FIXME(minqiyang): use mutex here to do fake spin lock
-struct SpinLock {
-  void Lock() { mutex_.lock(); }
-
-  void Unlock() { mutex_.lock(); }
-
- private:
-  std::mutex mutex_;
-};
-#endif
-
-class AutoSpinLock {
- public:
-  explicit SpinLockGuard(SpinLock* spin_lock) : lock_(spin_lock) {
-    lock_->Lock();
-  }
-
-  ~SpinLockGuard() { lock_->Unlock(); }
-
- private:
-  SpinLock* lock_;
-};
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/concat_and_split.cu b/paddle/fluid/operators/math/concat_and_split.cu
index 930d851696efdf8dc92f98638f2d09cac7eb304e..760a065c1081d1e55901774b258ba524471b856b 100644
--- a/paddle/fluid/operators/math/concat_and_split.cu
+++ b/paddle/fluid/operators/math/concat_and_split.cu
@@ -180,7 +180,7 @@ class ConcatFunctor<platform::CUDADeviceContext, T> {
     }
     // Wait() must be called because `inputs_data` may be destructed before
     // kernel ends
-    /* context.Wait(); */
+    context.Wait();
   }
 };
 
@@ -258,7 +258,7 @@ class SplitFunctor<platform::CUDADeviceContext, T> {
     }
     // Wait() must be called because `outputs_data` may be destructed before
     // kernel ends
-    /* context.Wait(); */
+    context.Wait();
   }
 };
 
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index c108c8275669feeb4c313dc57107adfc632ce61d..88a2a5276ab52e25b4f790e3a2f1386ed0715b4e 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -822,7 +822,7 @@ All parameter, weight, gradient are variables in Paddle.
           R"DOC(The type is INT, num_iteration_per_drop_scope indicates how
                 many iterations to clean up the temp variables which
                 is generated during execution. It may make the execution faster,
-                because the temp variable's shape maybe the same between two iterations. Default 1.
+                because the temp variable's shape maybe the same between two iterations. Default 100.
 
                 NOTES:
                     1. If you fetch data when calling the 'run', the ParallelExecutor
diff --git a/python/paddle/fluid/profiler.py b/python/paddle/fluid/profiler.py
index 78f7a6ac085f676d96c26c16fe9f48ae7f1d4d58..e05885f5f5bfc169828c1c6e723dffff098c3c2e 100644
--- a/python/paddle/fluid/profiler.py
+++ b/python/paddle/fluid/profiler.py
@@ -92,7 +92,6 @@ def cuda_profiler(output_file, output_mode=None, config=None):
     config_file = 'nvprof_config_file'
     with open(config_file, 'wb') as fp:
         fp.writelines([six.b("%s\n" % item) for item in config])
-    #Comment this for nvprof
     core.nvprof_init(output_file, output_mode, config_file)
     # Enables profiler collection by the active CUDA profiling tool.
     core.nvprof_start()