From c28beb8a3ce3471cd19cd96e69f6e0d2eb13f008 Mon Sep 17 00:00:00 2001
From: Yu Yang <reyoung@126.com>
Date: Wed, 7 Nov 2018 15:50:28 +0800
Subject: [PATCH] test(Pe): add dry run tests for pe (#14254)

Dry run tests will skip `Op.Run` and just perform job scheduling. It helps to analysis dead lock in PE.

test=develop
---
 .../framework/details/execution_strategy.h    |  2 +
 .../fast_threaded_ssa_graph_executor.cc       |  4 +-
 .../details/threaded_ssa_graph_executor.cc    |  4 +-
 .../details/threaded_ssa_graph_executor.h     |  2 +-
 paddle/fluid/framework/parallel_executor.cc   | 23 +++---
 paddle/fluid/pybind/pybind.cc                 |  7 +-
 python/paddle/fluid/layers/io.py              |  2 +-
 .../test_parallel_executor_dry_run.py         | 80 +++++++++++++++++++
 8 files changed, 108 insertions(+), 16 deletions(-)
 create mode 100644 python/paddle/fluid/tests/unittests/test_parallel_executor_dry_run.py
diff --git a/paddle/fluid/framework/details/execution_strategy.h b/paddle/fluid/framework/details/execution_strategy.h
index 5183be878..15c496130 100644
--- a/paddle/fluid/framework/details/execution_strategy.h
+++ b/paddle/fluid/framework/details/execution_strategy.h
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #pragma once
+#include <cstddef>  // for size_t
 
 namespace paddle {
 namespace framework {
@@ -26,6 +27,7 @@ struct ExecutionStrategy {
   bool allow_op_delay_{false};
   size_t num_iteration_per_drop_scope_{100};
   ExecutorType type_{kDefault};
+  bool dry_run_{false};
 };
 
 }  //  namespace details
diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
index 98fc390e7..2b2329b96 100644
--- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
@@ -128,7 +128,9 @@ void FastThreadedSSAGraphExecutor::RunOpAsync(
     size_t complete = 0;
     while (op_to_run != nullptr) {
       try {
-        op_to_run->Run(strategy_.use_cuda_);
+        if (LIKELY(!strategy_.dry_run_)) {
+          op_to_run->Run(strategy_.use_cuda_);
+        }
         ++complete;
       } catch (...) {
         exception_.Catch(std::current_exception());
diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
index dc63effd1..2d2bdb604 100644
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
@@ -211,7 +211,9 @@ void ThreadedSSAGraphExecutor::RunOp(
       if (VLOG_IS_ON(10)) {
         VLOG(10) << op << " " << op->Name() << " : " << op->DebugString();
       }
-      op->Run(strategy_.use_cuda_);
+      if (LIKELY(!strategy_.dry_run_)) {
+        op->Run(strategy_.use_cuda_);
+      }
       VLOG(10) << op << " " << op->Name() << " Done ";
       running_ops_--;
       ready_var_q->Extend(op->Outputs());
diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
index dbb0b498d..5c0bc169e 100644
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
@@ -48,7 +48,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
   // Use topological sort algorithm
   FeedFetchList Run(const std::vector<std::string> &fetch_tensors) override;
 
-  ~ThreadedSSAGraphExecutor() {}
+  ~ThreadedSSAGraphExecutor() final = default;
 
  private:
   void RunOp(const std::shared_ptr<BlockingQueue<VarHandleBase *>> &ready_var_q,
diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index a45b9ec7a..dfb107688 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -38,9 +38,20 @@ class ParallelExecutorPrivate {
   explicit ParallelExecutorPrivate(const std::vector<platform::Place> &places)
       : places_(places) {}
 
+  ~ParallelExecutorPrivate() {
+    if (own_local_scope_) {
+      for (size_t i = 1; i < local_scopes_.size(); ++i) {
+        // Skip the first scope, since it is the global scope.
+        Scope *local_scope = local_scopes_[i];
+        if (global_scope_->HasKid(local_scope)) {
+          global_scope_->DeleteScope(local_scope);
+        }
+      }
+    }
+  }
   std::vector<platform::Place> places_;
   std::vector<Scope *> local_scopes_;
-  Scope *global_scope_;
+  Scope *global_scope_;  // not owned
   std::unique_ptr<details::SSAGraphExecutor> executor_;
 
 #ifdef PADDLE_WITH_CUDA
@@ -306,16 +317,6 @@ ParallelExecutor::~ParallelExecutor() {
   for (auto &p : member_->places_) {
     platform::DeviceContextPool::Instance().Get(p)->Wait();
   }
-
-  if (member_->own_local_scope_) {
-    for (size_t i = 1; i < member_->local_scopes_.size(); ++i) {
-      Scope *local_scope = member_->local_scopes_[i];
-      if (member_->global_scope_->HasKid(local_scope)) {
-        member_->global_scope_->DeleteScope(local_scope);
-      }
-    }
-  }
-
   // member_ must be destructed before gcs_ since the destructor of
   // ReferenceCountOpHandle use raw pointers of gcs_ inside.
   member_.reset();
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index fc821e04a..238cc1918 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -742,7 +742,12 @@ All parameter, weight, gradient are variables in Paddle.
                        will clean up the temp variables at the end of the current iteration.
                     2. In some NLP model, it may cause the GPU memory is insufficient,
                        in this case, you should reduce `num_iteration_per_drop_scope`.
-              )DOC");
+              )DOC")
+      .def_property("_dry_run",
+                    [](const ExecutionStrategy &self) { return self.dry_run_; },
+                    [](ExecutionStrategy &self, bool dry_run) {
+                      self.dry_run_ = dry_run;
+                    });
 
   exec_strategy.def_property(
       "use_experimental_executor",
diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py
index 80b50022d..d1c926c4e 100644
--- a/python/paddle/fluid/layers/io.py
+++ b/python/paddle/fluid/layers/io.py
@@ -60,7 +60,7 @@ def data(name,
             For example if shape=[1], the resulting shape is [-1, 1].
           2. If shape contains -1, such as shape=[1, -1],
             append_batch_size will be enforced to be be False (ineffective).
-       dtype(int|float): The type of data : float32, float_16, int etc
+       dtype(basestring): The type of data : float32, float_16, int etc
        type(VarType): The output type. By default it is LOD_TENSOR.
        lod_level(int): The LoD Level. 0 means the input data is not a sequence.
        stop_gradient(bool): A boolean that mentions whether gradient should flow.
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_dry_run.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_dry_run.py
new file mode 100644
index 000000000..c93740669
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_dry_run.py
@@ -0,0 +1,80 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.fluid as fluid
+import unittest
+import logging
+import six
+
+
+class TestBase(unittest.TestCase):
+    def main(self,
+             network_func,
+             iter=100,
+             iter_per_pe=100,
+             use_gpu=True,
+             use_experimental_executor=False):
+        if use_gpu and not fluid.core.is_compiled_with_cuda():
+            logging.warning(
+                "Paddle is not compiled with CUDA, skip GPU unittests")
+            return
+
+        main_prog = fluid.Program()
+        startup_prog = fluid.Program()
+        scope = fluid.Scope()
+        with fluid.program_guard(main_prog, startup_prog):
+            with fluid.scope_guard(scope):
+                loss = network_func()
+                fluid.Executor(
+                    fluid.CUDAPlace(0)
+                    if use_gpu else fluid.CPUPlace()).run(startup_prog)
+
+        for _ in six.moves.xrange(iter):
+            exe_strategy = fluid.ExecutionStrategy()
+            exe_strategy._dry_run = True
+            exe_strategy.use_experimental_executor = use_experimental_executor
+            pe = fluid.ParallelExecutor(
+                use_cuda=True,
+                loss_name=loss.name,
+                main_program=main_prog,
+                exec_strategy=exe_strategy)
+            for _ in six.moves.xrange(iter_per_pe):
+                pe.run([])
+
+
+class TestMNISTDryRun(TestBase):
+    def test_mnist_dry_run(self):
+        for use_gpu in (False, True):
+            for use_experimental_executor in (False, True):
+                self.main(
+                    network_func=TestMNISTDryRun.network_func,
+                    use_gpu=use_gpu,
+                    use_experimental_executor=use_experimental_executor)
+
+    @staticmethod
+    def network_func():
+        img = fluid.layers.data(name='img', shape=[784], dtype='float32')
+        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+        hidden = img
+        for _ in six.moves.xrange(10):
+            hidden = fluid.layers.fc(input=img, size=200, act='tanh')
+        prediction = fluid.layers.fc(input=hidden, size=10, act='softmax')
+        loss = fluid.layers.cross_entropy(input=prediction, label=label)
+        avg_loss = fluid.layers.mean(loss)
+        fluid.optimizer.Adam().minimize(avg_loss)
+        return avg_loss
+
+
+if __name__ == '__main__':
+    unittest.main()
-- 
GitLab