diff --git a/doc/design/parallel_executor.md b/doc/design/parallel_executor.md
new file mode 100644
index 0000000000000000000000000000000000000000..567eede1bd59bfb519bdb0b96de1684cfea6c61b
--- /dev/null
+++ b/doc/design/parallel_executor.md
@@ -0,0 +1,52 @@
+# ParallelExecutor Design Doc
+
+## Introduction
+
+We introduce `ParallelExecutor` to run multi-GPU training in PaddlePaddle Fluid. It supports
+1. keeping a copy of the parameters on each GPU
+1. allreduce on a separate stream, allowing computation and communication to overlap
+
+An example of switching from single-GPU training to multi-GPU training:
+```python
+cost = your_neural_network()
+opt = fluid.optimizer.SGDOptimizer(learning_rate=0.01)
+opt.minimize(cost)
+
+# change Executor -> ParallelExecutor
+exe = fluid.ParallelExecutor(gpu_list=[0, 1])
+
+for i in xrange(iter_num):
+    exe.run()
+```
+
+## Design
+
+The constructor takes the list of parameters whose gradients need to be allreduced.
+
+At runtime, `ParallelExecutor` starts `#gpu` threads, one `Executor` per GPU. For every
+operator run on each GPU, it automatically syncs with the other streams when necessary, as in the
+pseudocode below; a thread-level sketch of `Run` follows it.
+
+```c++
+// if op's input is params' grad:
+//   sync with allreduce stream
+//   e.g. sgd should wait for allreduce to be finished
+SyncMultipleStreams(op);
+
+op->Run(*local_scope, place_);
+
+// if op's output is params' grad:
+//   sync with computation stream
+//   e.g. allreduce should wait for fc_grad to be finished
+SyncMultipleStreams(op);
+```
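+
+A minimal sketch of how `ParallelExecutor::Run` could drive the per-GPU threads. It assumes one
+`Executor`/`Scope` pair per GPU, following the `parallel_executor.h` skeleton added by this patch,
+and omits the stream-sync callback wiring, so it is illustrative rather than final:
+
+```c++
+void ParallelExecutor::Run(const ProgramDesc& prog, Scope* scope, int block_id,
+                           bool create_local_scope, bool create_vars) {
+  std::vector<std::thread> workers;
+  for (size_t i = 0; i < exes_.size(); ++i) {
+    // Each GPU runs the whole program on its own Executor and Scope;
+    // gradients are exchanged at the sync points shown above.
+    workers.emplace_back([&, i] {
+      exes_[i].Run(prog, scopes_[i], block_id, create_local_scope, create_vars);
+    });
+  }
+  for (auto& worker : workers) {
+    worker.join();
+  }
+}
+```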
+
+## API
+
+`ParallelExecutor.run` has a similar interface to `Executor.run`, with the following differences:
+1. Scope: we don't expose `scope` in `ParallelExecutor.run`, since `ParallelExecutor` keeps its
+own scope to maintain the NCCL communicator.
+1. Feed: we don't expose `feed` in the API either, because the whole point of `ParallelExecutor`
+is speed; the input to the network should come from a reader OP.
+1. Fetch: we return the fetched values on all GPUs as a list (e.g. `exe.run(..., fetch=loss)`
+will return `[loss_on_gpu0, loss_on_gpu1]`).
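+
+For reference, the second sync point above (allreduce on a parameter's gradient) maps to the
+`AllReduceCallBack` hook declared in `paddle/fluid/framework/parallel_executor.h`. A rough sketch,
+leaving the actual NCCL launch and the stream plumbing out of this patch:
+
+```c++
+void AllReduceCallBack::operator()(framework::OperatorBase* op) {
+  for (auto& name : op->OutputVars(/*has_intermediate=*/false)) {
+    if (param_grad_names_.count(name) == 0) continue;
+    // The gradient was produced on the computation stream; wait for it
+    // before issuing the allreduce on the separate allreduce stream.
+    dev_ctx.Wait();
+    // Launch ncclAllReduce on this gradient's buffer here (omitted).
+  }
+}
+```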
diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index 15e5574ecfd406b87db8370948352b7e736937ea..934bb43ffea456d528dddf61dbc04f760246dfe6 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -86,6 +86,8 @@ cc_library(feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glo
 
 cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto backward glog lod_rank_table feed_fetch_method)
 
+cc_library(parallel_executor SRCS parallel_executor.cc DEPS op_registry device_context scope
+            framework_proto backward glog lod_rank_table feed_fetch_method executor)
 cc_library(prune SRCS prune.cc DEPS framework_proto)
 cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context)
diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc
index 5cae38b2a857b2037f0e5ae4da50d1591da0c11a..6ee3f18dd42ef2d22c165907fdcf4c90a66425a4 100644
--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -305,10 +305,23 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
   }  // if (create_vars)
 
   for (auto& op : ctx->ops_) {
+    // TODO(ty):
+    //   if op's input is params' grad:
+    //     sync with allreduce stream
+    //     e.g. sgd should wait for allreduce to be finished
+    // SyncMultipleStreams(op);
+
     VLOG(4) << place_ << " " << op->DebugStringEx(local_scope);
     op->Run(*local_scope, place_);
     VLOG(3) << place_ << " " << op->DebugStringEx(local_scope);
 
+    // TODO(ty):
+    //   if op's output is params' grad:
+    //     sync with computation stream
+    //     apply allreduce on allreduce stream
+    //     e.g. allreduce should wait for fc_grad to be finished
+    // SyncMultipleStreams(op);
+
     if (FLAGS_benchmark) {
       VLOG(2) << "Memory used after operator " + op->Type() + " running: "
               << memory::memory_usage(place_);
diff --git a/paddle/fluid/framework/executor.h b/paddle/fluid/framework/executor.h
index 28ce3315154cea45412984df4daf7385ce2cf572..8d8a7cf4db6907f98b0ad200c40400b1643766f3 100644
--- a/paddle/fluid/framework/executor.h
+++ b/paddle/fluid/framework/executor.h
@@ -47,6 +47,7 @@ class Executor {
                        const std::string& feed_holder_name = "feed",
                        const std::string& fetch_holder_name = "fetch");
 
+ private:
   static ExecutorPrepareContext* Prepare(const ProgramDesc& program,
                                          int block_id);
 
diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e9f213ae2cff61ccdd0b87fd84ac1b68706f11b0
--- /dev/null
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -0,0 +1,19 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/parallel_executor.h"
+
+namespace paddle {
+namespace framework {}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h
new file mode 100644
index 0000000000000000000000000000000000000000..47e0005e58d916c15c87a2e9d3cf5f8e8c550102
--- /dev/null
+++ b/paddle/fluid/framework/parallel_executor.h
@@ -0,0 +1,61 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <unordered_set>
+#include <vector>
+
+#include "paddle/fluid/framework/executor.h"
+#include "paddle/fluid/framework/op_info.h"
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/framework/tensor.h"
+
+#include "paddle/fluid/operators/nccl/nccl_gpu_common.h"
+#include "paddle/fluid/platform/device_context.h"
+
+namespace paddle {
+namespace framework {
+
+struct AllReduceCallBack {
+  void operator()(framework::OperatorBase* op);
+
+  std::unordered_set<std::string> param_grad_names_;
+  platform::DeviceContext dev_ctx;
+};
+
+class ParallelExecutor {
+ public:
+  explicit ParallelExecutor(const std::vector<platform::Place>& places,
+                            const std::unordered_set<std::string>& params);
+
+  /* @Brief
+   * Runtime evaluation of the given ProgramDesc under certain Scope
+   *
+   * @param
+   *  ProgramDesc
+   *  Scope
+   */
+  void Run(const ProgramDesc& prog, Scope* scope, int block_id,
+           bool create_local_scope = true, bool create_vars = true);
+
+ private:
+  std::vector<Executor> exes_;
+  std::vector<Scope*> scopes_;
+  AllReduceCallBack all_reduce_callbacks_;
+  std::unordered_set<std::string> params_;  // where to initialize it?
+  platform::Communicator nccl_com_;
+};
+
+}  // namespace framework
+}  // namespace paddle