Support testing during training by ParallelExecutor. (#9738)

* Support testing during training by ParallelExecutor. * Add unit test. * Improve the interface. * Follow comments.

Support testing during training by ParallelExecutor. (#9738)
* Support testing during training by ParallelExecutor. * Add unit test. * Improve the interface. * Follow comments.
2b7e5bd3 · qingqing01 · GitHub · 9a4ce6f1 · 2b7e5bd3 · 2b7e5bd3
5 changed file
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/framework/parallel_executor.h"
-#include "paddle/fluid/platform/profiler.h"

 #include <string>
 #include <vector>
@@ -24,6 +23,7 @@ limitations under the License. */

 #include "paddle/fluid/framework/details/multi_devices_graph_builder.h"
 #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
+#include "paddle/fluid/platform/profiler.h"

 namespace paddle {
 namespace framework {
@@ -43,30 +43,40 @@ class ParallelExecutorPrivate {
 #endif
 };

+std::vector<Scope *> &ParallelExecutor::GetLocalScopes() {
+  return member_->local_scopes_;
+}
+
 ParallelExecutor::ParallelExecutor(
    size_t num_threads, bool use_event,
    const std::vector<platform::Place> &places,
    const std::unordered_set<std::string> &params,
-    const ProgramDesc &startup_program, const ProgramDesc &main_program,
-    const std::string &loss_var_name, Scope *scope, bool allow_op_delay)
+    const std::unordered_set<std::string> &bcast_vars,
+    const ProgramDesc &main_program, const std::string &loss_var_name,
+    Scope *scope, const std::vector<Scope *> &local_scopes, bool allow_op_delay)
    : member_(new ParallelExecutorPrivate(places)) {
  member_->global_scope_ = scope;

-  // Step 1. RunStartupProgram and Bcast the params to devs.
-  Executor exe(places[0]);
-  exe.Run(startup_program, scope, 0);
+  // Step 1. Bcast the params to devs.
  // Create local scopes
+  if (local_scopes.empty()) {
    for (size_t i = 0; i < member_->places_.size(); ++i) {
      member_->local_scopes_.push_back(&scope->NewScope());
    }
+  } else {
+    PADDLE_ENFORCE_EQ(member_->places_.size(), local_scopes.size());
+    for (size_t i = 0; i < member_->places_.size(); ++i) {
+      member_->local_scopes_.push_back(local_scopes[i]);
+    }
+  }

 // Bcast Parameters to all GPUs
 #ifdef PADDLE_WITH_CUDA
  member_->nccl_ctxs_.reset(new platform::NCCLContextMap(member_->places_));
 #endif
-  if (platform::is_gpu_place(places[0]) &&
-      member_->local_scopes_.size() != 1) {  // Is CUDA
-    BCastParamsToGPUs(startup_program);
+  if (platform::is_gpu_place(places[0]) && member_->local_scopes_.size() != 1 &&
+      local_scopes.empty()) {  // Is CUDA
+    BCastParamsToGPUs(bcast_vars);
  }
 // Startup Program has been run. All local scopes has correct parameters.

@@ -99,16 +109,17 @@ ParallelExecutor::ParallelExecutor(
 }

 void ParallelExecutor::BCastParamsToGPUs(
-    const ProgramDesc &startup_program) const {
+    const std::unordered_set<std::string> &vars) const {
 #ifdef PADDLE_WITH_CUDA
  auto *main_scope = member_->local_scopes_[0];

-  for (auto *var_desc : startup_program.Block(0).AllVars()) {
-    size_t idx = var_desc->Name().find("@GRAD");
-    if (idx != std::string::npos) continue;
-    if (var_desc->GetType() == proto::VarType::LOD_TENSOR) {
-      auto &main_tensor =
-          main_scope->FindVar(var_desc->Name())->Get<LoDTensor>();
+  for (auto &var : vars) {
+    auto *main_var = main_scope->FindVar(var);
+    if (!main_var->IsType<LoDTensor>()) {
+      continue;
+    }
+
+    auto &main_tensor = main_var->Get<LoDTensor>();

    auto &dims = main_tensor.dims();

@@ -123,8 +134,7 @@ void ParallelExecutor::BCastParamsToGPUs(
          buffer = const_cast<void *>(main_tensor.data<void>());
        } else {
          auto local_scope = member_->local_scopes_[i];
-            auto *t =
-                local_scope->Var(var_desc->Name())->GetMutable<LoDTensor>();
+          auto *t = local_scope->Var(var)->GetMutable<LoDTensor>();
          t->Resize(dims);
          buffer = t->mutable_data(place, main_tensor.type());
        }
@@ -136,13 +146,12 @@ void ParallelExecutor::BCastParamsToGPUs(
      platform::CPUPlace cpu;
      for (size_t i = 1; i < member_->places_.size(); ++i) {
        auto local_scope = member_->local_scopes_[i];
-          auto *t = local_scope->Var(var_desc->Name())->GetMutable<LoDTensor>();
+        auto *t = local_scope->Var(var)->GetMutable<LoDTensor>();
        t->Resize(dims);
        t->mutable_data(cpu, main_tensor.type());
        paddle::framework::TensorCopy(main_tensor, cpu, t);
      }
    }
-    }
    member_->nccl_ctxs_->WaitAll();
  }
 #else

--- a/paddle/fluid/framework/parallel_executor.h
+++ b/paddle/fluid/framework/parallel_executor.h
@@ -36,11 +36,14 @@ class ParallelExecutor {
  explicit ParallelExecutor(size_t num_threads, bool use_event,
                            const std::vector<platform::Place>& places,
                            const std::unordered_set<std::string>& params,
-                            const ProgramDesc& startup_program,
+                            const std::unordered_set<std::string>& bcast_vars,
                            const ProgramDesc& main_program,
                            const std::string& loss_var_name, Scope* scope,
+                            const std::vector<Scope*>& local_scopes,
                            bool allow_op_delay);

+  std::vector<Scope*>& GetLocalScopes();
+
  void Run(const std::vector<std::string>& fetch_tensors,
           const std::string& fetched_var_name,
           const std::unordered_map<std::string, LoDTensor>& feed_tensors);
@@ -51,7 +54,7 @@ class ParallelExecutor {

  ParallelExecutorPrivate* member_;

-  void BCastParamsToGPUs(const ProgramDesc& startup_program) const;
+  void BCastParamsToGPUs(const std::unordered_set<std::string>& vars) const;
 };

 }  // namespace framework

--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -544,13 +544,20 @@ All parameter, weight, gradient are variables in Paddle.
           [](ParallelExecutor &self, size_t num_threads, bool use_event,
              const std::vector<platform::Place> &places,
              const std::unordered_set<std::string> &params,
-              const ProgramDesc &startup_program,
+              const std::unordered_set<std::string> &bcast_vars,
              const ProgramDesc &main_program, const std::string &loss_var_name,
-              Scope *scope, bool allow_op_delay) {
-             new (&self) ParallelExecutor(num_threads, use_event, places,
-                                          params, startup_program, main_program,
-                                          loss_var_name, scope, allow_op_delay);
+              Scope *scope, std::vector<Scope *> &local_scopes,
+              bool allow_op_delay) {
+             new (&self)
+                 ParallelExecutor(num_threads, use_event, places, params,
+                                  bcast_vars, main_program, loss_var_name,
+                                  scope, local_scopes, allow_op_delay);
           })
+      .def("local_scopes",
+           [](ParallelExecutor &self) -> std::vector<Scope *> * {
+             return &self.GetLocalScopes();
+           },
+           py::return_value_policy::reference)
      .def("run", &ParallelExecutor::Run);

  BindRecordIOWriter(&m);

--- a/python/paddle/fluid/parallel_executor.py
+++ b/python/paddle/fluid/parallel_executor.py
@@ -22,10 +22,49 @@ __all__ = ['ParallelExecutor']

 class ParallelExecutor(object):
    def __init__(self,
-                 loss_name,
                 use_cuda,
+                 loss_name=None,
+                 main_program=None,
                 num_threads=None,
-                 allow_op_delay=False):
+                 allow_op_delay=False,
+                 share_vars_from=None):
+        """
+        ParallelExecutor can run program in parallel.
+
+        Args:
+            use_cuda(bool): Whether to use CUDA or not.
+            loss_name(str, default None): The loss name must set in training.
+            main_program(Program, default None): The program that need to run,
+                if not provided, then default_main_program will be used.
+            num_threads(int, default None): How many threads are used for
+                training.
+            allow_op_delay(bool, default False): Whether to delay and buffer
+                some operators together for scheduling or not, which may
+                improve performance in some cases, defalut False.
+            share_vars_from(ParallelExecutor, default None): If provied,
+                it will share variables from the specified ParallelExecutor.
+
+        Returns:
+            A ParallelExecutor object.
+
+        Raises:
+            TypeError: If share_vars_from is provided, but not ParallelExecutor
+                object.
+
+        Examples:
+            .. code-block:: python
+
+              train_exe = fluid.ParallelExecutor(
+                  use_cuda=True, loss_name=loss.name)
+              test_exe = fluid.ParallelExecutor(
+                  use_cuda=True,
+                  main_program=test_program,
+                  share_vars_from=train_exe)
+
+              train_loss, = train_exe.run([loss.name], feed_dict=feed_dict)
+              test_loss, = test_exe.run([loss.name], feed_dict=feed_dict)
+        """
+
        self._places = []
        self._act_places = []
        if use_cuda:
@@ -50,10 +89,21 @@ class ParallelExecutor(object):
            else:
                min(len(self._places) * 2, multiprocessing.cpu_count())

-        startup = framework.default_startup_program()
-        main = framework.default_main_program()
+        main = main_program
+        main = main if main else framework.default_main_program()
        scope = executor.global_scope()

+        if share_vars_from and not isinstance(share_vars_from,
+                                              ParallelExecutor):
+            raise TypeError("share_vars_from must be ParallelExecutor.")
+        local_scopes = share_vars_from.executor.local_scopes(
+        ) if share_vars_from else []
+
+        persistable_vars = [
+            v.name
+            for v in filter(lambda var: var.persistable, main.list_vars())
+        ]
+
        self.executor = core.ParallelExecutor(
            num_threads,
            True if use_cuda else False,  # use_event
@@ -62,10 +112,11 @@ class ParallelExecutor(object):
                p.name for p in main.global_block().iter_parameters()
                if not p.stop_gradient
            ]),
-            startup.desc,
+            set(persistable_vars),
            main.desc,
-            loss_name,
+            loss_name if loss_name else '',
            scope,
+            local_scopes,
            allow_op_delay)
        self.scope = scope


--- a/python/paddle/fluid/tests/unittests/test_parallel_executor.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor.py
@@ -207,7 +207,11 @@ class TestParallelExecutorBase(unittest.TestCase):
            if memory_opt:
                fluid.memory_optimize(main)

-            exe = fluid.ParallelExecutor(loss_name=loss.name, use_cuda=True)
+            place = fluid.CUDAPlace(0)
+            startup_exe = fluid.Executor(place)
+            startup_exe.run(startup)
+
+            exe = fluid.ParallelExecutor(True, loss_name=loss.name)
            if batch_size is not None:
                batch_size *= fluid.core.get_cuda_device_count()
            begin = time.time()
@@ -453,3 +457,41 @@ class TestTransformer(TestParallelExecutorBase):
    @unittest.skip("transformer is buggy in multi gpu")
    def test_main(self):
        self.check_network_convergence(transformer)
+
+
+class ParallelExecutorTestingDuringTraining(unittest.TestCase):
+    def test_parallel_testing(self):
+        main = fluid.Program()
+        startup = fluid.Program()
+        with fluid.program_guard(main, startup):
+            loss = simple_fc_net(True)
+            test_program = main.clone(for_test=True)
+
+            opt = fluid.optimizer.SGD(learning_rate=0.0001)
+            opt.minimize(loss)
+
+            batch_size = 32
+            image = numpy.random.normal(size=(batch_size,
+                                              784)).astype('float32')
+            label = numpy.random.randint(0, 10, (batch_size, 1), dtype="int64")
+
+            place = fluid.CUDAPlace(0)
+            exe = fluid.Executor(place)
+            exe.run(startup)
+            feed_dict = {'image': image, 'label': label}
+
+            train_exe = fluid.ParallelExecutor(
+                use_cuda=True, loss_name=loss.name, main_program=main)
+
+            test_exe = fluid.ParallelExecutor(
+                use_cuda=True,
+                main_program=test_program,
+                share_vars_from=train_exe)
+
+            for i in xrange(5):
+                test_loss, = test_exe.run([loss.name], feed_dict=feed_dict)
+                test_loss = numpy.array(test_loss)
+
+                train_loss, = train_exe.run([loss.name], feed_dict=feed_dict)
+                train_loss = numpy.array(train_loss)
+                self.assertTrue(numpy.allclose(train_loss, test_loss))