Commit 4a4ccac1 authored by Yancey1989

update by comment test=develop

Parent 4f304eaa
......@@ -107,22 +107,20 @@ void AllReduceOpHandle::RunImpl() {
PADDLE_ENFORCE(platform::dynload::ncclAllReduce(
buffer, buffer, numel, static_cast<ncclDataType_t>(dtype),
ncclSum, comm, stream));
if (!nccl_ctxs_->need_group_call_) cudaStreamSynchronize(stream);
// TODO(Yancey1989): synchronizing here can get better performance
// if not using an NCCL group call, but needs more profiling.
if (local_scopes_.size() == 1UL) cudaStreamSynchronize(stream);
});
}
this->RunAndRecordEvent([&] {
// TODO(Yancey1989): need an allreduce operator to avoid this flag
if (nccl_ctxs_->need_group_call_) {
if (all_reduce_calls.size() == 1UL) {
all_reduce_calls[0]();
} else {
platform::NCCLGroupGuard guard;
for (auto &call : all_reduce_calls) {
call();
}
} else {
// only used when executor_type == ParallelGraph, one thread per GPU
// TODO(Yancey1989): use an allreduce operator to avoid this trick.
PADDLE_ENFORCE(all_reduce_calls.size() == 1UL);
all_reduce_calls[0]();
}
});
......
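The hunk above drops the need_group_call_ flag: when there is only a single all-reduce call (one place driven per thread, as in the ParallelGraph executor) it is launched and synchronized directly, otherwise the calls are batched inside platform::NCCLGroupGuard, which wraps ncclGroupStart/ncclGroupEnd. A minimal sketch of the underlying pattern in plain NCCL/CUDA (not Paddle code; the function name is illustrative and error codes are ignored for brevity):

#include <cuda_runtime.h>
#include <nccl.h>
#include <vector>

void AllReduceAcrossDevices(const std::vector<ncclComm_t>& comms,
                            const std::vector<cudaStream_t>& streams,
                            const std::vector<float*>& buffers, size_t numel) {
  if (comms.size() == 1) {
    // One device per thread (ParallelGraph-style): launch and sync directly.
    ncclAllReduce(buffers[0], buffers[0], numel, ncclFloat, ncclSum, comms[0],
                  streams[0]);
    cudaStreamSynchronize(streams[0]);
  } else {
    // One thread driving all devices: batch the calls so NCCL can schedule
    // them together instead of deadlocking on each other.
    ncclGroupStart();
    for (size_t i = 0; i < comms.size(); ++i) {
      ncclAllReduce(buffers[i], buffers[i], numel, ncclFloat, ncclSum,
                    comms[i], streams[i]);
    }
    ncclGroupEnd();
    for (auto s : streams) cudaStreamSynchronize(s);
  }
}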
......@@ -386,8 +386,8 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
CreateComputationalOps(&result, node, places_.size());
}
// insert synchronous ops during backpropagation; and
// insert synchronous ops if the graph contains multiple places.
// insert collective ops during backpropagation; and
// insert collective ops if the graph contains multiple places.
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
if (!is_forwarding &&
......
......@@ -52,6 +52,7 @@ void OpHandleBase::Run(bool use_cuda) {
#else
PADDLE_ENFORCE(!use_cuda);
#endif
RunImpl();
}
......
......@@ -216,6 +216,7 @@ void ThreadedSSAGraphExecutor::RunOp(
if (LIKELY(!strategy_.dry_run_)) {
op->Run(strategy_.use_cuda_);
}
VLOG(10) << op << " " << op->Name() << " Done ";
running_ops_--;
ready_var_q->Extend(op->Outputs());
VLOG(10) << op << " " << op->Name() << " Signal posted";
......
......@@ -231,7 +231,6 @@ ParallelExecutor::ParallelExecutor(
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
auto *nccl_id_var = scope->FindVar(NCCL_ID_VARNAME);
ncclUniqueId *nccl_id = nullptr;
bool need_group_call = true;
if (exec_strategy.type_ == ExecutionStrategy::kParallelGraph) {
// parallel graph mode should initialize nccl by ncclCommInitRank since
// it calls the nccl operator per device per thread.
......@@ -243,17 +242,16 @@ ParallelExecutor::ParallelExecutor(
} else {
nccl_id = nccl_id_var->GetMutable<ncclUniqueId>();
}
need_group_call = false;
} else if (nccl_id_var != nullptr) { // the other executor type.
// distributed training with nccl mode initializes the nccl id in the
// startup_program.
nccl_id = nccl_id_var->GetMutable<ncclUniqueId>();
} else {
// initialize NCCL by ncclCommInitAll, do not need nccl_id.
// initialize NCCL by ncclCommInitAll, no need to initialize the nccl_id.
}
member_->nccl_ctxs_.reset(new platform::NCCLContextMap(
member_->places_, nccl_id, num_trainers, trainer_id, need_group_call));
member_->places_, nccl_id, num_trainers, trainer_id));
#else
PADDLE_THROW("Not compiled with CUDA");
#endif
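The branch above selects how the NCCL communicators are created: ParallelGraph mode obtains an ncclUniqueId and initializes one communicator per rank with ncclCommInitRank, distributed training reuses the id created by the startup program, and plain single-process multi-GPU training falls back to ncclCommInitAll, which needs no id at all. A hedged sketch of those two raw NCCL initialization paths (single process assumed; the helper name InitComms is made up for illustration):

#include <cuda_runtime.h>
#include <nccl.h>
#include <vector>

std::vector<ncclComm_t> InitComms(const std::vector<int>& dev_ids,
                                  const ncclUniqueId* nccl_id) {
  std::vector<ncclComm_t> comms(dev_ids.size());
  if (nccl_id == nullptr) {
    // Single-trainer case: ncclCommInitAll creates one communicator per
    // listed device and never needs a shared unique id.
    ncclCommInitAll(comms.data(), static_cast<int>(dev_ids.size()),
                    dev_ids.data());
  } else {
    // Multi-rank case: every rank joins the clique identified by the shared
    // ncclUniqueId; the calls must be grouped when one process initializes
    // several ranks.
    int nranks = static_cast<int>(dev_ids.size());  // assumption: one trainer
    ncclGroupStart();
    for (int i = 0; i < nranks; ++i) {
      cudaSetDevice(dev_ids[i]);
      ncclCommInitRank(&comms[i], nranks, *nccl_id, /*rank=*/i);
    }
    ncclGroupEnd();
  }
  return comms;
}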
......@@ -288,6 +286,14 @@ ParallelExecutor::ParallelExecutor(
graphs.push_back(std::move(graph));
#endif
auto max_memory_size = GetEagerDeletionThreshold();
// TODO(Yancey1989): fix gc failure on the ParallelGraph executor.
if (max_memory_size >= 0 &&
exec_strategy.type_ != ExecutionStrategy::kParallelGraph) {
graphs[0] = member_->PrepareGCAndRefCnts(
std::move(graphs[0]), static_cast<size_t>(max_memory_size));
}
// Step 3. Create vars in each scope. Passes may also create new vars.
// skip control vars and empty vars
std::vector<details::VariableInfo> var_infos;
......
......@@ -27,6 +27,7 @@ limitations under the License. */
namespace paddle {
namespace framework {
struct ExceptionHandler {
mutable std::future<std::unique_ptr<platform::EnforceNotMet>> future_;
explicit ExceptionHandler(
......
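The only change in this hunk is marking the stored future mutable. The likely reason: std::future::get() is a non-const member function, so a const call operator (or other const method) on the handler could not otherwise consume the result. A self-contained illustration of that rule in plain C++ (not Paddle code):

#include <future>
#include <iostream>

struct Handler {
  mutable std::future<int> future_;  // without `mutable`, operator() below fails to compile

  void operator()() const {
    // get() moves the result out of the shared state; it needs a non-const
    // future, which `mutable` provides inside a const member function.
    std::cout << "result: " << future_.get() << std::endl;
  }
};

int main() {
  Handler h{std::async(std::launch::async, [] { return 42; })};
  h();  // prints "result: 42"
}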
......@@ -46,6 +46,7 @@ class CreateDoubleBufferReaderOp : public framework::OperatorBase {
sin >> num;
place = platform::CUDAPlace(static_cast<int>(num));
}
out->Reset(framework::MakeDecoratedReader<BufferedReader>(underlying_reader,
place, 2));
}
......
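The Reset above rebuilds the decorated reader as a BufferedReader with a buffer size of 2, i.e. classic double buffering: one batch is consumed while the next is prefetched (possibly onto the CUDA place parsed just before). A generic sketch of that shape, independent of Paddle's reader classes (the BoundedQueue name is made up):

#include <condition_variable>
#include <mutex>
#include <queue>
#include <utility>

template <typename T>
class BoundedQueue {
 public:
  explicit BoundedQueue(size_t capacity) : capacity_(capacity) {}

  void Push(T item) {
    std::unique_lock<std::mutex> lock(mu_);
    not_full_.wait(lock, [this] { return queue_.size() < capacity_; });
    queue_.push(std::move(item));
    not_empty_.notify_one();
  }

  T Pop() {
    std::unique_lock<std::mutex> lock(mu_);
    not_empty_.wait(lock, [this] { return !queue_.empty(); });
    T item = std::move(queue_.front());
    queue_.pop();
    not_full_.notify_one();
    return item;
  }

 private:
  const size_t capacity_;
  std::mutex mu_;
  std::condition_variable not_empty_, not_full_;
  std::queue<T> queue_;
};

// Usage: a producer thread Push()es prefetched batches into a
// BoundedQueue<Batch> buffer(2) while the consumer Pop()s them, so reading
// the next batch overlaps with computing on the current one.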
......@@ -82,15 +82,12 @@ struct NCCLContext {
struct NCCLContextMap {
std::unordered_map<int, NCCLContext> contexts_;
std::vector<int> order_;
bool need_group_call_;
explicit NCCLContextMap(const std::vector<platform::Place> &places,
ncclUniqueId *nccl_id = nullptr,
size_t num_trainers = 1, size_t trainer_id = 0,
bool need_group_call = true) {
size_t num_trainers = 1, size_t trainer_id = 0) {
PADDLE_ENFORCE(!places.empty());
order_.reserve(places.size());
need_group_call_ = need_group_call;
for (auto &p : places) {
int dev_id = boost::get<CUDAPlace>(p).device;
order_.emplace_back(dev_id);
......
......@@ -123,7 +123,7 @@ class TestMNIST(TestParallelExecutorBase):
self.check_simple_fc_convergence(False)
def test_simple_fc_with_new_strategy(self):
# use_cuda, use_reducea
# use_cuda, use_reduce
self._compare_reduce_and_allreduce(simple_fc_net, True)
self._compare_reduce_and_allreduce(simple_fc_net, False)
......