提交 ad6ae0b0 编写于 作者: M minqiyang

1. Add SpinLock

2. Seperate the lock of kids and vars in Scope

test=develop
上级 a61eb543
......@@ -215,6 +215,7 @@ include(external/xxhash) # download xxhash
include(external/dlpack)
include(external/snappy) # download snappy
include(external/snappystream) # download snappystream
include(external/robin_map) # download tsl::robin_map
if (NOT WIN32)
# there is no official support of warpctc, nccl, cupti in windows
......
include(ExternalProject)
set(ROBIN_MAP_SOURCE_DIR ${THIRD_PARTY_PATH}/robin_map)
set(ROBIN_MAP_INCLUDE_DIR ${ROBIN_MAP_SOURCE_DIR}/src/extern_robin_map/include)
include_directories(${ROBIN_MAP_INCLUDE_DIR})
ExternalProject_Add(
extern_robin_map
${EXTERNAL_PROJECT_LOG_ARGS}
GIT_REPOSITORY "https://github.com/Tessil/robin-map.git"
GIT_TAG "v0.5.0"
PREFIX ${ROBIN_MAP_SOURCE_DIR}
UPDATE_COMMAND ""
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
INSTALL_COMMAND ""
TEST_COMMAND ""
)
if(${CMAKE_VERSION} VERSION_LESS "3.3.0")
set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/robin_map_dummy.c)
file(WRITE ${dummyfile} "const char *dummy = \"${dummyfile}\";")
add_library(robin_map STATIC ${dummyfile})
else()
add_library(robin_map INTERFACE)
endif()
add_dependencies(robin_map extern_robin_map)
LIST(APPEND externl_project_dependencies robin_map)
......@@ -25,7 +25,7 @@ struct ExecutionStrategy {
size_t num_threads_{0};
bool use_cuda_{true};
bool allow_op_delay_{false};
size_t num_iteration_per_drop_scope_{100};
size_t num_iteration_per_drop_scope_{1};
ExecutorType type_{kDefault};
bool dry_run_{false};
};
......
......@@ -76,9 +76,7 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run(
: nullptr;
#endif
if (!fetch_tensors.empty() ||
drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_) {
drop_scope_counter_ = 0;
if (!fetch_tensors.empty()) {
// Wait All computational streams
for (auto p : places_) {
platform::DeviceContextPool::Instance().Get(p)->Wait();
......@@ -91,12 +89,17 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run(
}
#endif
}
}
if (drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_) {
drop_scope_counter_ = 0;
for (auto &scope : local_scopes_) {
auto &local_scope =
*scope->Var(details::kLocalExecScopeName)->GetMutable<Scope *>();
scope->DeleteScope(local_scope);
}
}
if (eptr) {
std::rethrow_exception(eptr);
} else {
......
......@@ -163,11 +163,7 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
}
bool OperatorBase::HasInputs(const std::string& name) const {
if (inputs_.find(name) != inputs_.end()) {
return true;
} else {
return false;
}
return inputs_.find(name) != inputs_.end();
}
std::string OperatorBase::Input(const std::string& name) const {
......
......@@ -31,17 +31,17 @@ struct RWLock {
~RWLock() { pthread_rwlock_destroy(&lock_); }
void RDLock() {
inline void RDLock() {
PADDLE_ENFORCE_EQ(pthread_rwlock_rdlock(&lock_), 0,
"acquire read lock failed");
}
void WRLock() {
inline void WRLock() {
PADDLE_ENFORCE_EQ(pthread_rwlock_wrlock(&lock_), 0,
"acquire write lock failed");
}
void UNLock() {
inline void UNLock() {
PADDLE_ENFORCE_EQ(pthread_rwlock_unlock(&lock_), 0, "unlock failed");
}
......@@ -54,86 +54,43 @@ struct RWLock {
// In windows, rw_lock seems like a hack. Use empty object and do nothing.
struct RWLock {
// FIXME(minqiyang): use mutex here to do fake lock
void RDLock() { mutex_.lock(); }
inline void RDLock() { mutex_.lock(); }
void WRLock() { mutex_.lock(); }
inline void WRLock() { mutex_.lock(); }
void UNLock() { mutex_.unlock(); }
inline void UNLock() { mutex_.unlock(); }
private:
std::mutex mutex_;
};
#endif
class RWLockGuard {
class AutoWRLock {
public:
enum Status { kUnLock, kWRLock, kRDLock };
RWLockGuard(RWLock* rw_lock, Status init_status)
: lock_(rw_lock), status_(Status::kUnLock) {
switch (init_status) {
case Status::kRDLock: {
RDLock();
break;
}
case Status::kWRLock: {
WRLock();
break;
}
case Status::kUnLock: {
break;
}
}
}
explicit AutoWRLock(RWLock* rw_lock) : lock_(rw_lock) { Lock(); }
void WRLock() {
switch (status_) {
case Status::kUnLock: {
lock_->WRLock();
status_ = Status::kWRLock;
break;
}
case Status::kWRLock: {
break;
}
case Status::kRDLock: {
PADDLE_THROW(
"Please unlock read lock first before invoking write lock.");
break;
}
}
}
inline void Lock() { lock_->WRLock(); }
void RDLock() {
switch (status_) {
case Status::kUnLock: {
lock_->RDLock();
status_ = Status::kRDLock;
break;
}
case Status::kRDLock: {
break;
}
case Status::kWRLock: {
PADDLE_THROW(
"Please unlock write lock first before invoking read lock.");
break;
}
}
}
inline void UnLock() { lock_->UNLock(); }
void UnLock() {
if (status_ != Status::kUnLock) {
lock_->UNLock();
status_ = Status::kUnLock;
}
}
~AutoWRLock() { UnLock(); }
private:
RWLock* lock_;
};
class AutoRDLock {
public:
explicit AutoRDLock(RWLock* rw_lock) : lock_(rw_lock) { Lock(); }
inline void Lock() { lock_->RDLock(); }
inline void UnLock() { lock_->UNLock(); }
~RWLockGuard() { UnLock(); }
~AutoRDLock() { UnLock(); }
private:
RWLock* lock_;
Status status_;
};
} // namespace framework
......
......@@ -19,7 +19,6 @@ limitations under the License. */
#include <set>
#include <unordered_set>
#include "glog/logging.h"
#include "paddle/fluid/framework/threadpool.h"
#include "paddle/fluid/string/printf.h"
DEFINE_bool(benchmark, false,
......@@ -43,13 +42,15 @@ DEFINE_double(
// the mutex will cause serious performance issue.
// So the mutex is disabled when `ON_INFER`.
#ifdef PADDLE_ON_INFERENCE
#define SCOPE_READER_LOCK
#define SCOPE_WRITER_LOCK
#define SCOPE_KIDS_READER_LOCK
#define SCOPE_KIDS_WRITER_LOCK
#define SCOPE_VARS_READER_LOCK
#define SCOPE_VARS_WRITER_LOCK
#else
// TODO(minqiyang): use rwlock in all platforms, now rwlock is a fake one
// in _WIN32 platform
#define SCOPE_READER_LOCK RWLockGuard(&rw_lock_, RWLockGuard::Status::kRDLock);
#define SCOPE_WRITER_LOCK RWLockGuard(&rw_lock_, RWLockGuard::Status::kWRLock);
#define SCOPE_KIDS_READER_LOCK AutoRDLock(&kids_lock_);
#define SCOPE_KIDS_WRITER_LOCK AutoWRLock(&kids_lock_);
#define SCOPE_VARS_READER_LOCK AutoRDLock(&vars_lock_);
#define SCOPE_VARS_WRITER_LOCK AutoWRLock(&vars_lock_);
#endif
namespace paddle {
......@@ -65,64 +66,69 @@ int64_t GetEagerDeletionThreshold() {
Scope::~Scope() { DropKids(); }
Scope& Scope::NewScope() const {
SCOPE_WRITER_LOCK
kids_.push_back(new Scope(this));
return *kids_.back();
Scope* child = new Scope(this);
{
SCOPE_KIDS_WRITER_LOCK
kids_.push_back(child);
}
return *child;
}
Variable* Scope::Var(const std::string& name) {
SCOPE_WRITER_LOCK
SCOPE_VARS_WRITER_LOCK
return VarInternal(name);
}
Variable* Scope::Var(std::string* name) {
SCOPE_WRITER_LOCK
auto new_name = string::Sprintf("%p.%d", this, vars_.size());
if (name != nullptr) {
*name = new_name;
}
SCOPE_VARS_WRITER_LOCK
return VarInternal(new_name);
}
Variable* Scope::FindVar(const std::string& name) const {
SCOPE_READER_LOCK
SCOPE_VARS_READER_LOCK
return FindVarInternal(name);
}
Variable* Scope::FindLocalVar(const std::string& name) const {
SCOPE_READER_LOCK
SCOPE_VARS_READER_LOCK
return FindVarLocally(name);
}
const Scope* Scope::FindScope(const Variable* var) const {
SCOPE_READER_LOCK
SCOPE_VARS_READER_LOCK
return FindScopeInternal(var);
}
void Scope::DropKids() {
SCOPE_WRITER_LOCK
SCOPE_KIDS_WRITER_LOCK
for (Scope* s : kids_) delete s;
kids_.clear();
}
bool Scope::HasKid(const Scope* scope) const {
SCOPE_READER_LOCK
SCOPE_KIDS_READER_LOCK
auto it = std::find(this->kids_.begin(), this->kids_.end(), scope);
return it != this->kids_.end();
}
std::vector<std::string> Scope::LocalVarNames() const {
SCOPE_READER_LOCK
std::vector<std::string> known_vars;
{
SCOPE_VARS_READER_LOCK
known_vars.reserve(this->vars_.size());
for (auto& p : vars_) {
known_vars.emplace_back(p.first);
}
}
return known_vars;
}
void Scope::DeleteScope(Scope* scope) const {
SCOPE_WRITER_LOCK
SCOPE_KIDS_WRITER_LOCK
auto it = std::find(this->kids_.begin(), this->kids_.end(), scope);
PADDLE_ENFORCE(it != this->kids_.end(), "%p Cannot find %p as kid scope",
this, scope);
......@@ -136,8 +142,8 @@ void Scope::DeleteScope(Scope* scope) const {
}
void Scope::EraseVars(const std::vector<std::string>& var_names) {
SCOPE_WRITER_LOCK
std::set<std::string> var_set(var_names.begin(), var_names.end());
SCOPE_VARS_WRITER_LOCK
for (auto it = vars_.begin(); it != vars_.end();) {
if (var_set.find(it->first) != var_set.end()) {
it = vars_.erase(it);
......@@ -149,12 +155,12 @@ void Scope::EraseVars(const std::vector<std::string>& var_names) {
void Scope::Rename(const std::string& origin_name,
const std::string& new_name) const {
SCOPE_WRITER_LOCK
SCOPE_VARS_WRITER_LOCK
RenameInternal(origin_name, new_name);
}
std::string Scope::Rename(const std::string& origin_name) const {
SCOPE_WRITER_LOCK
SCOPE_VARS_WRITER_LOCK
auto new_name = string::Sprintf("%p.%d", this, vars_.size());
RenameInternal(origin_name, new_name);
return new_name;
......@@ -188,7 +194,7 @@ void Scope::RenameInternal(const std::string& origin_name,
auto new_it = vars_.find(new_name);
PADDLE_ENFORCE(new_it == vars_.end(),
"The variable with name %s is already in the scope", new_name);
vars_[new_name].reset(origin_it->second.release());
vars_[new_name].reset(origin_it.value().release());
vars_.erase(origin_it);
}
......
......@@ -14,11 +14,15 @@ limitations under the License. */
#pragma once
#include <functional>
#include <list>
#include <memory>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include <tsl/robin_map.h> // NOLINT
#include "paddle/fluid/framework/rw_lock.h"
#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/platform/macros.h"
......@@ -94,7 +98,11 @@ class Scope {
std::string Rename(const std::string& origin_name) const;
protected:
mutable std::unordered_map<std::string, std::unique_ptr<Variable>> vars_;
mutable tsl::robin_map<
std::string, std::unique_ptr<Variable>, std::hash<std::string>,
std::equal_to<std::string>,
std::allocator<std::pair<std::string, std::unique_ptr<Variable>>>, true>
vars_;
private:
// Call Scope::NewScope for a sub-scope.
......@@ -123,7 +131,8 @@ class Scope {
DISABLE_COPY_AND_ASSIGN(Scope);
private:
mutable RWLock rw_lock_;
mutable RWLock kids_lock_;
mutable RWLock vars_lock_;
};
// Generate some debug string about the inherience structure of scope, quite
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#if !defined(_WIN32)
#include <pthread.h>
#else
#include <mutex> // NOLINT
#endif // !_WIN32
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace framework {
#if !defined(_WIN32)
struct SpinLock {
SpinLock() { pthread_spin_init(&lock_, PTHREAD_PROCESS_PRIVATE); }
~SpinLock() { pthread_spin_destroy(&lock_); }
void Lock() {
PADDLE_ENFORCE_EQ(pthread_spin_lock(&lock_), 0, "acquire spin lock failed");
}
void Unlock() {
PADDLE_ENFORCE_EQ(pthread_spin_unlock(&lock_), 0,
"release spin lock failed");
}
private:
pthread_spinlock_t lock_;
};
#else
// FIXME(minqiyang): use mutex here to do fake spin lock
struct SpinLock {
void Lock() { mutex_.lock(); }
void Unlock() { mutex_.lock(); }
private:
std::mutex mutex_;
};
#endif
class AutoSpinLock {
public:
explicit SpinLockGuard(SpinLock* spin_lock) : lock_(spin_lock) {
lock_->Lock();
}
~SpinLockGuard() { lock_->Unlock(); }
private:
SpinLock* lock_;
};
} // namespace framework
} // namespace paddle
......@@ -292,23 +292,6 @@ class AdamOpKernel : public framework::OpKernel<T> {
static_cast<const DeviceContext&>(ctx.device_context()),
param.numel());
for_range(functor);
auto& dev =
*ctx.template device_context<DeviceContext>().eigen_device();
const LoDTensor* beta1_pow_ptr = ctx.Input<LoDTensor>("Beta1Pow");
auto eigen_in_beta1_pow =
framework::EigenVector<T>::Flatten(*beta1_pow_ptr);
auto eigen_out_beta1_pow = framework::EigenVector<T>::Flatten(
*(const_cast<LoDTensor*>(beta1_pow_ptr)));
eigen_out_beta1_pow.device(dev) = beta1 * eigen_in_beta1_pow;
const LoDTensor* beta2_pow_ptr = ctx.Input<LoDTensor>("Beta2Pow");
auto eigen_in_beta2_pow =
framework::EigenVector<T>::Flatten(*beta2_pow_ptr);
auto eigen_out_beta2_pow = framework::EigenVector<T>::Flatten(
*(const_cast<LoDTensor*>(beta2_pow_ptr)));
eigen_out_beta2_pow.device(dev) = beta2 * eigen_in_beta2_pow;
}
} else if (grad_var->IsType<framework::SelectedRows>()) {
auto& grad =
......
......@@ -765,7 +765,7 @@ All parameter, weight, gradient are variables in Paddle.
R"DOC(The type is INT, num_iteration_per_drop_scope indicates how
many iterations to clean up the temp variables which
is generated during execution. It may make the execution faster,
because the temp variable's shape maybe the same between two iterations. Default 100.
because the temp variable's shape maybe the same between two iterations. Default 1.
NOTES:
1. If you fetch data when calling the 'run', the ParallelExecutor
......
......@@ -739,27 +739,26 @@ class AdamOptimizer(Optimizer):
"""
assert isinstance(block, framework.Block)
main_block = block.program.global_block()
# for param, grad in param_and_grads:
# if grad is None:
# continue
# with param.block.program._optimized_guard(
# [param, grad]), name_scope("optimizer"):
# beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str,
# param)
# beta2_pow_acc = self._get_accumulator(self._beta2_pow_acc_str,
# param)
# main_block.append_op(
# type="scale",
# inputs={"X": beta1_pow_acc},
# outputs={"Out": beta1_pow_acc},
# attrs={"scale": self._beta1})
# main_block.append_op(
# type="scale",
# inputs={"X": beta2_pow_acc},
# outputs={"Out": beta2_pow_acc},
# attrs={"scale": self._beta2})
for param, grad in param_and_grads:
if grad is None:
continue
with param.block.program._optimized_guard(
[param, grad]), name_scope("optimizer"):
beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str,
param)
beta2_pow_acc = self._get_accumulator(self._beta2_pow_acc_str,
param)
main_block.append_op(
type="scale",
inputs={"X": beta1_pow_acc},
outputs={"Out": beta1_pow_acc},
attrs={"scale": self._beta1})
main_block.append_op(
type="scale",
inputs={"X": beta2_pow_acc},
outputs={"Out": beta2_pow_acc},
attrs={"scale": self._beta2})
class AdamaxOptimizer(Optimizer):
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册