提交 4a4ccac1 编写于 作者: Y Yancey1989

update by comment test=develop

上级 4f304eaa
...@@ -107,22 +107,20 @@ void AllReduceOpHandle::RunImpl() { ...@@ -107,22 +107,20 @@ void AllReduceOpHandle::RunImpl() {
PADDLE_ENFORCE(platform::dynload::ncclAllReduce( PADDLE_ENFORCE(platform::dynload::ncclAllReduce(
buffer, buffer, numel, static_cast<ncclDataType_t>(dtype), buffer, buffer, numel, static_cast<ncclDataType_t>(dtype),
ncclSum, comm, stream)); ncclSum, comm, stream));
if (!nccl_ctxs_->need_group_call_) cudaStreamSynchronize(stream); // TODO(Yancey1989): synchronize here can get better performance
// if we don't use the NCCL group call, but it needs more profiling.
if (local_scopes_.size() == 1UL) cudaStreamSynchronize(stream);
}); });
} }
this->RunAndRecordEvent([&] { this->RunAndRecordEvent([&] {
// TODO(Yancey1989): need allreduce operator to avoid this flag if (all_reduce_calls.size() == 1UL) {
if (nccl_ctxs_->need_group_call_) { all_reduce_calls[0]();
} else {
platform::NCCLGroupGuard guard; platform::NCCLGroupGuard guard;
for (auto &call : all_reduce_calls) { for (auto &call : all_reduce_calls) {
call(); call();
} }
} else {
// only used in executor_type == ParallelGraph, one thread one GPU
// TODO(Yancey1989): use allreduce operator to avoid this trick.
PADDLE_ENFORCE(all_reduce_calls.size() == 1UL);
all_reduce_calls[0]();
} }
}); });
......
...@@ -386,8 +386,8 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl( ...@@ -386,8 +386,8 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
CreateComputationalOps(&result, node, places_.size()); CreateComputationalOps(&result, node, places_.size());
} }
// insert synchronous ops at the backpropagation; and // insert collective ops at the backpropagation; and
// insert synchronous ops if the graph contains multiple places. // insert collective ops if the graph contains multiple places.
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
if (!is_forwarding && if (!is_forwarding &&
......
...@@ -52,6 +52,7 @@ void OpHandleBase::Run(bool use_cuda) { ...@@ -52,6 +52,7 @@ void OpHandleBase::Run(bool use_cuda) {
#else #else
PADDLE_ENFORCE(!use_cuda); PADDLE_ENFORCE(!use_cuda);
#endif #endif
RunImpl(); RunImpl();
} }
......
...@@ -216,6 +216,7 @@ void ThreadedSSAGraphExecutor::RunOp( ...@@ -216,6 +216,7 @@ void ThreadedSSAGraphExecutor::RunOp(
if (LIKELY(!strategy_.dry_run_)) { if (LIKELY(!strategy_.dry_run_)) {
op->Run(strategy_.use_cuda_); op->Run(strategy_.use_cuda_);
} }
VLOG(10) << op << " " << op->Name() << " Done ";
running_ops_--; running_ops_--;
ready_var_q->Extend(op->Outputs()); ready_var_q->Extend(op->Outputs());
VLOG(10) << op << " " << op->Name() << "Signal posted"; VLOG(10) << op << " " << op->Name() << "Signal posted";
......
...@@ -231,7 +231,6 @@ ParallelExecutor::ParallelExecutor( ...@@ -231,7 +231,6 @@ ParallelExecutor::ParallelExecutor(
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
auto *nccl_id_var = scope->FindVar(NCCL_ID_VARNAME); auto *nccl_id_var = scope->FindVar(NCCL_ID_VARNAME);
ncclUniqueId *nccl_id = nullptr; ncclUniqueId *nccl_id = nullptr;
bool need_group_call = true;
if (exec_strategy.type_ == ExecutionStrategy::kParallelGraph) { if (exec_strategy.type_ == ExecutionStrategy::kParallelGraph) {
// parallel graph mode should initialize nccl by ncclCommInitRank since // parallel graph mode should initialize nccl by ncclCommInitRank since
// it call nccl operator per device per thread. // it call nccl operator per device per thread.
...@@ -243,17 +242,16 @@ ParallelExecutor::ParallelExecutor( ...@@ -243,17 +242,16 @@ ParallelExecutor::ParallelExecutor(
} else { } else {
nccl_id = nccl_id_var->GetMutable<ncclUniqueId>(); nccl_id = nccl_id_var->GetMutable<ncclUniqueId>();
} }
need_group_call = false;
} else if (nccl_id_var != nullptr) { // the other executor type. } else if (nccl_id_var != nullptr) { // the other executor type.
// the distributed training with nccl mode would initialize the nccl id in // the distributed training with nccl mode would initialize the nccl id in
// startup_program. // startup_program.
nccl_id = nccl_id_var->GetMutable<ncclUniqueId>(); nccl_id = nccl_id_var->GetMutable<ncclUniqueId>();
} else { } else {
// initialize NCCL by ncclCommInitAll, do not need nccl_id. // initialize NCCL by ncclCommInitAll, do not need to initialize the nccl_id.
} }
member_->nccl_ctxs_.reset(new platform::NCCLContextMap( member_->nccl_ctxs_.reset(new platform::NCCLContextMap(
member_->places_, nccl_id, num_trainers, trainer_id, need_group_call)); member_->places_, nccl_id, num_trainers, trainer_id));
#else #else
PADDLE_THROW("Not compiled with CUDA"); PADDLE_THROW("Not compiled with CUDA");
#endif #endif
...@@ -288,6 +286,14 @@ ParallelExecutor::ParallelExecutor( ...@@ -288,6 +286,14 @@ ParallelExecutor::ParallelExecutor(
graphs.push_back(std::move(graph)); graphs.push_back(std::move(graph));
#endif #endif
auto max_memory_size = GetEagerDeletionThreshold();
// TODO(Yancey1989): fix gc failed on ParallelGraph executor.
if (max_memory_size >= 0 &&
exec_strategy.type_ != ExecutionStrategy::kParallelGraph) {
graphs[0] = member_->PrepareGCAndRefCnts(
std::move(graphs[0]), static_cast<size_t>(max_memory_size));
}
// Step 3. Create vars in each scope. Passes may also create new vars. // Step 3. Create vars in each scope. Passes may also create new vars.
// skip control vars and empty vars // skip control vars and empty vars
std::vector<details::VariableInfo> var_infos; std::vector<details::VariableInfo> var_infos;
......
...@@ -27,6 +27,7 @@ limitations under the License. */ ...@@ -27,6 +27,7 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace framework { namespace framework {
struct ExceptionHandler { struct ExceptionHandler {
mutable std::future<std::unique_ptr<platform::EnforceNotMet>> future_; mutable std::future<std::unique_ptr<platform::EnforceNotMet>> future_;
explicit ExceptionHandler( explicit ExceptionHandler(
......
...@@ -46,6 +46,7 @@ class CreateDoubleBufferReaderOp : public framework::OperatorBase { ...@@ -46,6 +46,7 @@ class CreateDoubleBufferReaderOp : public framework::OperatorBase {
sin >> num; sin >> num;
place = platform::CUDAPlace(static_cast<int>(num)); place = platform::CUDAPlace(static_cast<int>(num));
} }
out->Reset(framework::MakeDecoratedReader<BufferedReader>(underlying_reader, out->Reset(framework::MakeDecoratedReader<BufferedReader>(underlying_reader,
place, 2)); place, 2));
} }
......
...@@ -82,15 +82,12 @@ struct NCCLContext { ...@@ -82,15 +82,12 @@ struct NCCLContext {
struct NCCLContextMap { struct NCCLContextMap {
std::unordered_map<int, NCCLContext> contexts_; std::unordered_map<int, NCCLContext> contexts_;
std::vector<int> order_; std::vector<int> order_;
bool need_group_call_;
explicit NCCLContextMap(const std::vector<platform::Place> &places, explicit NCCLContextMap(const std::vector<platform::Place> &places,
ncclUniqueId *nccl_id = nullptr, ncclUniqueId *nccl_id = nullptr,
size_t num_trainers = 1, size_t trainer_id = 0, size_t num_trainers = 1, size_t trainer_id = 0) {
bool need_group_call = true) {
PADDLE_ENFORCE(!places.empty()); PADDLE_ENFORCE(!places.empty());
order_.reserve(places.size()); order_.reserve(places.size());
need_group_call_ = need_group_call;
for (auto &p : places) { for (auto &p : places) {
int dev_id = boost::get<CUDAPlace>(p).device; int dev_id = boost::get<CUDAPlace>(p).device;
order_.emplace_back(dev_id); order_.emplace_back(dev_id);
......
...@@ -123,7 +123,7 @@ class TestMNIST(TestParallelExecutorBase): ...@@ -123,7 +123,7 @@ class TestMNIST(TestParallelExecutorBase):
self.check_simple_fc_convergence(False) self.check_simple_fc_convergence(False)
def test_simple_fc_with_new_strategy(self): def test_simple_fc_with_new_strategy(self):
# use_cuda, use_reducea # use_cuda, use_reduce
self._compare_reduce_and_allreduce(simple_fc_net, True) self._compare_reduce_and_allreduce(simple_fc_net, True)
self._compare_reduce_and_allreduce(simple_fc_net, False) self._compare_reduce_and_allreduce(simple_fc_net, False)
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册