1. Add SpinLock

2. Seperate the lock of kids and vars in Scope test=develop

1. Add SpinLock
2. Seperate the lock of kids and vars in Scope test=develop
ad6ae0b0 · minqiyang · a61eb543 · ad6ae0b0 · ad6ae0b0 · ad6ae0b0
12 changed file
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -215,6 +215,7 @@ include(external/xxhash)    # download xxhash
 include(external/dlpack)
 include(external/snappy)    # download snappy
 include(external/snappystream) # download snappystream
+include(external/robin_map) # download tsl::robin_map
 if (NOT WIN32)
 # there is no official support of warpctc, nccl, cupti in windows

--- a/cmake/external/robin_map.cmake
+++ b/cmake/external/robin_map.cmake
+include(ExternalProject)
+set(ROBIN_MAP_SOURCE_DIR ${THIRD_PARTY_PATH}/robin_map)
+set(ROBIN_MAP_INCLUDE_DIR ${ROBIN_MAP_SOURCE_DIR}/src/extern_robin_map/include)
+include_directories(${ROBIN_MAP_INCLUDE_DIR})
+ExternalProject_Add(
+  extern_robin_map
+  ${EXTERNAL_PROJECT_LOG_ARGS}
+  GIT_REPOSITORY "https://github.com/Tessil/robin-map.git"
+  GIT_TAG        "v0.5.0"
+  PREFIX         ${ROBIN_MAP_SOURCE_DIR}
+  UPDATE_COMMAND ""
+  CONFIGURE_COMMAND ""
+  BUILD_COMMAND     ""
+  INSTALL_COMMAND   ""
+  TEST_COMMAND      ""
+)
+if(${CMAKE_VERSION} VERSION_LESS "3.3.0")
+  set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/robin_map_dummy.c)
+  file(WRITE ${dummyfile} "const char *dummy = \"${dummyfile}\";")
+  add_library(robin_map STATIC ${dummyfile})
+else()
+  add_library(robin_map INTERFACE)
+endif()
+add_dependencies(robin_map extern_robin_map)
+LIST(APPEND externl_project_dependencies robin_map)
--- a/paddle/fluid/framework/details/execution_strategy.h
+++ b/paddle/fluid/framework/details/execution_strategy.h
@@ -25,7 +25,7 @@ struct ExecutionStrategy {
  size_t num_threads_{0};
  bool use_cuda_{true};
  bool allow_op_delay_{false};
-  size_t num_iteration_per_drop_scope_{100};
+  size_t num_iteration_per_drop_scope_{1};
  ExecutorType type_{kDefault};
  bool dry_run_{false};
 };

--- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
@@ -76,9 +76,7 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run(
                           : nullptr;
 #endif
-  if (!fetch_tensors.empty() ||
+  if (!fetch_tensors.empty()) {
-      drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_) {
-    drop_scope_counter_ = 0;
    // Wait All computational streams
    for (auto p : places_) {
      platform::DeviceContextPool::Instance().Get(p)->Wait();
@@ -91,12 +89,17 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run(
      }
 #endif
    }
+  }
+  if (drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_) {
+    drop_scope_counter_ = 0;
    for (auto &scope : local_scopes_) {
      auto &local_scope =
          *scope->Var(details::kLocalExecScopeName)->GetMutable<Scope *>();
      scope->DeleteScope(local_scope);
    }
  }
  if (eptr) {
    std::rethrow_exception(eptr);
  } else {

--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -163,11 +163,7 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
 }
 bool OperatorBase::HasInputs(const std::string& name) const {
-  if (inputs_.find(name) != inputs_.end()) {
+  return inputs_.find(name) != inputs_.end();
-    return true;
-  } else {
-    return false;
-  }
 }
 std::string OperatorBase::Input(const std::string& name) const {

--- a/paddle/fluid/framework/rw_lock.h
+++ b/paddle/fluid/framework/rw_lock.h
@@ -31,17 +31,17 @@ struct RWLock {
  ~RWLock() { pthread_rwlock_destroy(&lock_); }
-  void RDLock() {
+  inline void RDLock() {
    PADDLE_ENFORCE_EQ(pthread_rwlock_rdlock(&lock_), 0,
                      "acquire read lock failed");
  }
-  void WRLock() {
+  inline void WRLock() {
    PADDLE_ENFORCE_EQ(pthread_rwlock_wrlock(&lock_), 0,
                      "acquire write lock failed");
  }
-  void UNLock() {
+  inline void UNLock() {
    PADDLE_ENFORCE_EQ(pthread_rwlock_unlock(&lock_), 0, "unlock failed");
  }
@@ -54,86 +54,43 @@ struct RWLock {
 // In windows, rw_lock seems like a hack. Use empty object and do nothing.
 struct RWLock {
  // FIXME(minqiyang): use mutex here to do fake lock
-  void RDLock() { mutex_.lock(); }
+  inline void RDLock() { mutex_.lock(); }
-  void WRLock() { mutex_.lock(); }
+  inline void WRLock() { mutex_.lock(); }
-  void UNLock() { mutex_.unlock(); }
+  inline void UNLock() { mutex_.unlock(); }
 private:
  std::mutex mutex_;
 };
 #endif
-class RWLockGuard {
+class AutoWRLock {
 public:
-  enum Status { kUnLock, kWRLock, kRDLock };
+  explicit AutoWRLock(RWLock* rw_lock) : lock_(rw_lock) { Lock(); }
-  RWLockGuard(RWLock* rw_lock, Status init_status)
-      : lock_(rw_lock), status_(Status::kUnLock) {
-    switch (init_status) {
-      case Status::kRDLock: {
-        RDLock();
-        break;
-      }
-      case Status::kWRLock: {
-        WRLock();
-        break;
-      }
-      case Status::kUnLock: {
-        break;
-      }
-    }
-  }
-  void WRLock() {
+  inline void Lock() { lock_->WRLock(); }
-    switch (status_) {
-      case Status::kUnLock: {
-        lock_->WRLock();
-        status_ = Status::kWRLock;
-        break;
-      }
-      case Status::kWRLock: {
-        break;
-      }
-      case Status::kRDLock: {
-        PADDLE_THROW(
-            "Please unlock read lock first before invoking write lock.");
-        break;
-      }
-    }
-  }
-  void RDLock() {
+  inline void UnLock() { lock_->UNLock(); }
-    switch (status_) {
-      case Status::kUnLock: {
-        lock_->RDLock();
-        status_ = Status::kRDLock;
-        break;
-      }
-      case Status::kRDLock: {
-        break;
-      }
-      case Status::kWRLock: {
-        PADDLE_THROW(
-            "Please unlock write lock first before invoking read lock.");
-        break;
-      }
-    }
-  }
-  void UnLock() {
+  ~AutoWRLock() { UnLock(); }
-    if (status_ != Status::kUnLock) {
-      lock_->UNLock();
+ private:
-      status_ = Status::kUnLock;
+  RWLock* lock_;
-    }
+};
-  }
+class AutoRDLock {
+ public:
+  explicit AutoRDLock(RWLock* rw_lock) : lock_(rw_lock) { Lock(); }
+  inline void Lock() { lock_->RDLock(); }
+  inline void UnLock() { lock_->UNLock(); }
-  ~RWLockGuard() { UnLock(); }
+  ~AutoRDLock() { UnLock(); }
 private:
  RWLock* lock_;
-  Status status_;
 };
 }  // namespace framework

--- a/paddle/fluid/framework/scope.cc
+++ b/paddle/fluid/framework/scope.cc
@@ -19,7 +19,6 @@ limitations under the License. */
 #include <set>
 #include <unordered_set>
 #include "glog/logging.h"
-#include "paddle/fluid/framework/threadpool.h"
 #include "paddle/fluid/string/printf.h"
 DEFINE_bool(benchmark, false,
@@ -43,13 +42,15 @@ DEFINE_double(
 // the mutex will cause serious performance issue.
 // So the mutex is disabled when `ON_INFER`.
 #ifdef PADDLE_ON_INFERENCE
-#define SCOPE_READER_LOCK
+#define SCOPE_KIDS_READER_LOCK
-#define SCOPE_WRITER_LOCK
+#define SCOPE_KIDS_WRITER_LOCK
+#define SCOPE_VARS_READER_LOCK
+#define SCOPE_VARS_WRITER_LOCK
 #else
-// TODO(minqiyang): use rwlock in all platforms, now rwlock is a fake one
+#define SCOPE_KIDS_READER_LOCK AutoRDLock(&kids_lock_);
-// in _WIN32 platform
+#define SCOPE_KIDS_WRITER_LOCK AutoWRLock(&kids_lock_);
-#define SCOPE_READER_LOCK RWLockGuard(&rw_lock_, RWLockGuard::Status::kRDLock);
+#define SCOPE_VARS_READER_LOCK AutoRDLock(&vars_lock_);
-#define SCOPE_WRITER_LOCK RWLockGuard(&rw_lock_, RWLockGuard::Status::kWRLock);
+#define SCOPE_VARS_WRITER_LOCK AutoWRLock(&vars_lock_);
 #endif
 namespace paddle {
@@ -65,64 +66,69 @@ int64_t GetEagerDeletionThreshold() {
 Scope::~Scope() { DropKids(); }
 Scope& Scope::NewScope() const {
-  SCOPE_WRITER_LOCK
+  Scope* child = new Scope(this);
-  kids_.push_back(new Scope(this));
+  {
-  return *kids_.back();
+    SCOPE_KIDS_WRITER_LOCK
+    kids_.push_back(child);
+  }
+  return *child;
 }
 Variable* Scope::Var(const std::string& name) {
-  SCOPE_WRITER_LOCK
+  SCOPE_VARS_WRITER_LOCK
  return VarInternal(name);
 }
 Variable* Scope::Var(std::string* name) {
-  SCOPE_WRITER_LOCK
  auto new_name = string::Sprintf("%p.%d", this, vars_.size());
  if (name != nullptr) {
    *name = new_name;
  }
+  SCOPE_VARS_WRITER_LOCK
  return VarInternal(new_name);
 }
 Variable* Scope::FindVar(const std::string& name) const {
-  SCOPE_READER_LOCK
+  SCOPE_VARS_READER_LOCK
  return FindVarInternal(name);
 }
 Variable* Scope::FindLocalVar(const std::string& name) const {
-  SCOPE_READER_LOCK
+  SCOPE_VARS_READER_LOCK
  return FindVarLocally(name);
 }
 const Scope* Scope::FindScope(const Variable* var) const {
-  SCOPE_READER_LOCK
+  SCOPE_VARS_READER_LOCK
  return FindScopeInternal(var);
 }
 void Scope::DropKids() {
-  SCOPE_WRITER_LOCK
+  SCOPE_KIDS_WRITER_LOCK
  for (Scope* s : kids_) delete s;
  kids_.clear();
 }
 bool Scope::HasKid(const Scope* scope) const {
-  SCOPE_READER_LOCK
+  SCOPE_KIDS_READER_LOCK
  auto it = std::find(this->kids_.begin(), this->kids_.end(), scope);
  return it != this->kids_.end();
 }
 std::vector<std::string> Scope::LocalVarNames() const {
-  SCOPE_READER_LOCK
  std::vector<std::string> known_vars;
-  known_vars.reserve(this->vars_.size());
+  {
-  for (auto& p : vars_) {
+    SCOPE_VARS_READER_LOCK
-    known_vars.emplace_back(p.first);
+    known_vars.reserve(this->vars_.size());
+    for (auto& p : vars_) {
+      known_vars.emplace_back(p.first);
+    }
  }
  return known_vars;
 }
 void Scope::DeleteScope(Scope* scope) const {
-  SCOPE_WRITER_LOCK
+  SCOPE_KIDS_WRITER_LOCK
  auto it = std::find(this->kids_.begin(), this->kids_.end(), scope);
  PADDLE_ENFORCE(it != this->kids_.end(), "%p Cannot find %p as kid scope",
                 this, scope);
@@ -136,8 +142,8 @@ void Scope::DeleteScope(Scope* scope) const {
 }
 void Scope::EraseVars(const std::vector<std::string>& var_names) {
-  SCOPE_WRITER_LOCK
  std::set<std::string> var_set(var_names.begin(), var_names.end());
+  SCOPE_VARS_WRITER_LOCK
  for (auto it = vars_.begin(); it != vars_.end();) {
    if (var_set.find(it->first) != var_set.end()) {
      it = vars_.erase(it);
@@ -149,12 +155,12 @@ void Scope::EraseVars(const std::vector<std::string>& var_names) {
 void Scope::Rename(const std::string& origin_name,
                   const std::string& new_name) const {
-  SCOPE_WRITER_LOCK
+  SCOPE_VARS_WRITER_LOCK
  RenameInternal(origin_name, new_name);
 }
 std::string Scope::Rename(const std::string& origin_name) const {
-  SCOPE_WRITER_LOCK
+  SCOPE_VARS_WRITER_LOCK
  auto new_name = string::Sprintf("%p.%d", this, vars_.size());
  RenameInternal(origin_name, new_name);
  return new_name;
@@ -188,7 +194,7 @@ void Scope::RenameInternal(const std::string& origin_name,
  auto new_it = vars_.find(new_name);
  PADDLE_ENFORCE(new_it == vars_.end(),
                 "The variable with name %s is already in the scope", new_name);
-  vars_[new_name].reset(origin_it->second.release());
+  vars_[new_name].reset(origin_it.value().release());
  vars_.erase(origin_it);
 }

--- a/paddle/fluid/framework/scope.h
+++ b/paddle/fluid/framework/scope.h
@@ -14,11 +14,15 @@ limitations under the License. */
 #pragma once
+#include <functional>
 #include <list>
+#include <memory>
 #include <string>
-#include <unordered_map>
+#include <utility>
 #include <vector>
+#include <tsl/robin_map.h>  // NOLINT
 #include "paddle/fluid/framework/rw_lock.h"
 #include "paddle/fluid/framework/variable.h"
 #include "paddle/fluid/platform/macros.h"
@@ -94,7 +98,11 @@ class Scope {
  std::string Rename(const std::string& origin_name) const;
 protected:
-  mutable std::unordered_map<std::string, std::unique_ptr<Variable>> vars_;
+  mutable tsl::robin_map<
+      std::string, std::unique_ptr<Variable>, std::hash<std::string>,
+      std::equal_to<std::string>,
+      std::allocator<std::pair<std::string, std::unique_ptr<Variable>>>, true>
+      vars_;
 private:
  // Call Scope::NewScope for a sub-scope.
@@ -123,7 +131,8 @@ class Scope {
  DISABLE_COPY_AND_ASSIGN(Scope);
 private:
-  mutable RWLock rw_lock_;
+  mutable RWLock kids_lock_;
+  mutable RWLock vars_lock_;
 };
 // Generate some debug string about the inherience structure of scope, quite

--- a/paddle/fluid/framework/spin_lock.h
+++ b/paddle/fluid/framework/spin_lock.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#if !defined(_WIN32)
+#include <pthread.h>
+#else
+#include <mutex>  // NOLINT
+#endif            // !_WIN32
+#include "paddle/fluid/platform/enforce.h"
+namespace paddle {
+namespace framework {
+#if !defined(_WIN32)
+struct SpinLock {
+  SpinLock() { pthread_spin_init(&lock_, PTHREAD_PROCESS_PRIVATE); }
+  ~SpinLock() { pthread_spin_destroy(&lock_); }
+  void Lock() {
+    PADDLE_ENFORCE_EQ(pthread_spin_lock(&lock_), 0, "acquire spin lock failed");
+  }
+  void Unlock() {
+    PADDLE_ENFORCE_EQ(pthread_spin_unlock(&lock_), 0,
+                      "release spin lock failed");
+  }
+ private:
+  pthread_spinlock_t lock_;
+};
+#else
+// FIXME(minqiyang): use mutex here to do fake spin lock
+struct SpinLock {
+  void Lock() { mutex_.lock(); }
+  void Unlock() { mutex_.lock(); }
+ private:
+  std::mutex mutex_;
+};
+#endif
+class AutoSpinLock {
+ public:
+  explicit SpinLockGuard(SpinLock* spin_lock) : lock_(spin_lock) {
+    lock_->Lock();
+  }
+  ~SpinLockGuard() { lock_->Unlock(); }
+ private:
+  SpinLock* lock_;
+};
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/operators/optimizers/adam_op.h
+++ b/paddle/fluid/operators/optimizers/adam_op.h
@@ -292,23 +292,6 @@ class AdamOpKernel : public framework::OpKernel<T> {
            static_cast<const DeviceContext&>(ctx.device_context()),
            param.numel());
        for_range(functor);
-        auto& dev =
-            *ctx.template device_context<DeviceContext>().eigen_device();
-        const LoDTensor* beta1_pow_ptr = ctx.Input<LoDTensor>("Beta1Pow");
-        auto eigen_in_beta1_pow =
-            framework::EigenVector<T>::Flatten(*beta1_pow_ptr);
-        auto eigen_out_beta1_pow = framework::EigenVector<T>::Flatten(
-            *(const_cast<LoDTensor*>(beta1_pow_ptr)));
-        eigen_out_beta1_pow.device(dev) = beta1 * eigen_in_beta1_pow;
-        const LoDTensor* beta2_pow_ptr = ctx.Input<LoDTensor>("Beta2Pow");
-        auto eigen_in_beta2_pow =
-            framework::EigenVector<T>::Flatten(*beta2_pow_ptr);
-        auto eigen_out_beta2_pow = framework::EigenVector<T>::Flatten(
-            *(const_cast<LoDTensor*>(beta2_pow_ptr)));
-        eigen_out_beta2_pow.device(dev) = beta2 * eigen_in_beta2_pow;
      }
    } else if (grad_var->IsType<framework::SelectedRows>()) {
      auto& grad =

--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -765,7 +765,7 @@ All parameter, weight, gradient are variables in Paddle.
          R"DOC(The type is INT, num_iteration_per_drop_scope indicates how
                many iterations to clean up the temp variables which
                is generated during execution. It may make the execution faster,
-                because the temp variable's shape maybe the same between two iterations. Default 100.
+                because the temp variable's shape maybe the same between two iterations. Default 1.
                NOTES:
                    1. If you fetch data when calling the 'run', the ParallelExecutor

--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -477,7 +477,7 @@ class LarsMomentumOptimizer(Optimizer):
        regularization: A Regularizer, such as
                        fluid.regularizer.L2DecayRegularizer.
        name: A optional name prefix.
    Examples:
        .. code-block:: python
@@ -739,27 +739,26 @@ class AdamOptimizer(Optimizer):
        """
        assert isinstance(block, framework.Block)
        main_block = block.program.global_block()
-        #  for param, grad in param_and_grads:
+        for param, grad in param_and_grads:
+            if grad is None:
-    #  if grad is None:
+                continue
-    #  continue
+            with param.block.program._optimized_guard(
-    #  with param.block.program._optimized_guard(
+                [param, grad]), name_scope("optimizer"):
-    #  [param, grad]), name_scope("optimizer"):
+                beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str,
-    #  beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str,
+                                                      param)
-    #  param)
+                beta2_pow_acc = self._get_accumulator(self._beta2_pow_acc_str,
-    #  beta2_pow_acc = self._get_accumulator(self._beta2_pow_acc_str,
+                                                      param)
-    #  param)
+                main_block.append_op(
-    #  main_block.append_op(
+                    type="scale",
-    #  type="scale",
+                    inputs={"X": beta1_pow_acc},
-    #  inputs={"X": beta1_pow_acc},
+                    outputs={"Out": beta1_pow_acc},
-    #  outputs={"Out": beta1_pow_acc},
+                    attrs={"scale": self._beta1})
-    #  attrs={"scale": self._beta1})
+                main_block.append_op(
-    #  main_block.append_op(
+                    type="scale",
-    #  type="scale",
+                    inputs={"X": beta2_pow_acc},
-    #  inputs={"X": beta2_pow_acc},
+                    outputs={"Out": beta2_pow_acc},
-    #  outputs={"Out": beta2_pow_acc},
+                    attrs={"scale": self._beta2})
-    #  attrs={"scale": self._beta2})
 class AdamaxOptimizer(Optimizer):