add some debug info

02dab46a · Qiao Longfei · 7e145b7c · 02dab46a · 02dab46a · 02dab46a
4 changed files
--- a/paddle/fluid/framework/details/async_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/async_ssa_graph_executor.cc
@@ -84,6 +84,8 @@ FeedFetchList AsyncSSAGraphExecutor::Run(
  }

  if (exception_holder_.IsCaught()) {
+    VLOG(3) << "caught exception " << exception_holder_.Type()
+            << ", rethrow it";
    exception_holder_.ReThrow();
  }


--- a/paddle/fluid/framework/details/exception_holder.h
+++ b/paddle/fluid/framework/details/exception_holder.h
@@ -14,6 +14,8 @@

 #pragma once

+#include <string>
+
 #include "glog/logging.h"
 #include "paddle/fluid/platform/enforce.h"

@@ -64,6 +66,21 @@ class ExceptionHolder {
    ClearImpl();
  }

+  std::string Type() {  // Readable name of the held exception kind.
+    std::lock_guard<std::mutex> lock(mu_);  // hold mu_ while reading type_
+    switch (type_) {
+      case kNone:
+        return "None";
+      case kEnforceNotMet: {
+        return "EnforceNotMet";
+      }
+      case kEOF: {
+        return "EOF";
+      }
+    }
+    return "unknown";  // type_ matched none of the cases above
+  }
+
 private:
  void ClearImpl() {
    exception_.reset();

--- a/paddle/fluid/operators/reader/blocking_queue.h
+++ b/paddle/fluid/operators/reader/blocking_queue.h
@@ -79,6 +79,7 @@ class BlockingQueue {
      return true;
    } else {
      PADDLE_ENFORCE(closed_);
+      VLOG(3) << "queue is closed! return nothing.";
      return false;
    }
  }

--- a/python/paddle/fluid/tests/unittests/test_async_ssa_graph_executor_mnist.py
+++ b/python/paddle/fluid/tests/unittests/test_async_ssa_graph_executor_mnist.py
@@ -59,6 +59,13 @@ def train(use_cuda, thread_num, cpu_num):
    img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32')
    label = fluid.layers.data(name='label', shape=[1], dtype='int64')

+    py_reader = fluid.layers.create_py_reader_by_data(
+        capacity=64,
+        feed_list=[img, label],
+        name='py_reader',
+        use_double_buffer=True)
+    img, label = fluid.layers.read_file(py_reader)
+
    prediction, avg_loss, acc = convolutional_neural_network(img, label)

    test_program = fluid.default_main_program().clone(for_test=True)
@@ -103,7 +110,7 @@ def train(use_cuda, thread_num, cpu_num):

    exec_strategy = fluid.ExecutionStrategy()
    exec_strategy.num_threads = thread_num
-    exec_strategy.num_iteration_per_run = 2
+    exec_strategy.num_iteration_per_run = 1

    main_program = fluid.default_main_program()
    pe = fluid.ParallelExecutor(
@@ -113,6 +120,22 @@ def train(use_cuda, thread_num, cpu_num):
        build_strategy=build_strategy,
        exec_strategy=exec_strategy)

+    py_reader.decorate_paddle_reader(train_reader)
+    py_reader.start()
+
+    step = 0
+    try:
+        while True:
+            print("step %d in" % step)
+            loss_val = pe.run(fetch_list=[avg_loss.name])
+            loss_val = numpy.mean(loss_val)
+            if step % 1 == 0:
+                print("Batch %d, Cost %f, queue size %d" %
+                      (step, loss_val, py_reader.queue.size()))
+            step += 1
+    except fluid.core.EOFException:
+        py_reader.reset()
+    """
    step = 0
    for step_id, data in enumerate(train_reader()):
        loss_val = pe.run(feed=feeder.feed(data), fetch_list=[avg_loss.name])
@@ -120,6 +143,8 @@ def train(use_cuda, thread_num, cpu_num):
        if step % 100 == 0:
            print("Batch %d, Cost %f" % (step, loss_val))
        step += 1
+    """
+
    # test for epoch
    avg_loss_val, acc_val = train_test(
        train_test_program=test_program,