From 02dab46ab8101873663a63614f88931ead7846d9 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Mon, 28 Jan 2019 16:23:06 +0800 Subject: [PATCH] add some debug info --- .../details/async_ssa_graph_executor.cc | 2 ++ .../framework/details/exception_holder.h | 17 ++++++++++++ .../fluid/operators/reader/blocking_queue.h | 1 + .../test_async_ssa_graph_executor_mnist.py | 27 ++++++++++++++++++- 4 files changed, 46 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/framework/details/async_ssa_graph_executor.cc b/paddle/fluid/framework/details/async_ssa_graph_executor.cc index c259ff4f747..e21d5fb96dc 100644 --- a/paddle/fluid/framework/details/async_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/async_ssa_graph_executor.cc @@ -84,6 +84,8 @@ FeedFetchList AsyncSSAGraphExecutor::Run( } if (exception_holder_.IsCaught()) { + VLOG(3) << "caught exception " << exception_holder_.Type() + << ", rethrow it"; exception_holder_.ReThrow(); } diff --git a/paddle/fluid/framework/details/exception_holder.h b/paddle/fluid/framework/details/exception_holder.h index 1b1afce04eb..77ca03b86e6 100644 --- a/paddle/fluid/framework/details/exception_holder.h +++ b/paddle/fluid/framework/details/exception_holder.h @@ -14,6 +14,8 @@ #pragma once +#include <string> + #include "glog/logging.h" #include "paddle/fluid/platform/enforce.h" @@ -64,6 +66,21 @@ class ExceptionHolder { ClearImpl(); } + std::string Type() { + std::lock_guard<std::mutex> lock(mu_); + switch (type_) { + case kNone: + return "None"; + case kEnforceNotMet: { + return "EnforceNotMet"; + } + case kEOF: { + return "EOF"; + } + } + return "unknown"; + } + private: void ClearImpl() { exception_.reset(); diff --git a/paddle/fluid/operators/reader/blocking_queue.h b/paddle/fluid/operators/reader/blocking_queue.h index 51b980acb5a..45c3ad802fc 100644 --- a/paddle/fluid/operators/reader/blocking_queue.h +++ b/paddle/fluid/operators/reader/blocking_queue.h @@ -79,6 +79,7 @@ class BlockingQueue { return true; } else { PADDLE_ENFORCE(closed_); 
+ VLOG(3) << "queue is closed! return nothing."; return false; } } diff --git a/python/paddle/fluid/tests/unittests/test_async_ssa_graph_executor_mnist.py b/python/paddle/fluid/tests/unittests/test_async_ssa_graph_executor_mnist.py index 03d7df8852e..6a2f829654c 100644 --- a/python/paddle/fluid/tests/unittests/test_async_ssa_graph_executor_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_async_ssa_graph_executor_mnist.py @@ -59,6 +59,13 @@ def train(use_cuda, thread_num, cpu_num): img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32') label = fluid.layers.data(name='label', shape=[1], dtype='int64') + py_reader = fluid.layers.create_py_reader_by_data( + capacity=64, + feed_list=[img, label], + name='py_reader', + use_double_buffer=True) + img, label = fluid.layers.read_file(py_reader) + prediction, avg_loss, acc = convolutional_neural_network(img, label) test_program = fluid.default_main_program().clone(for_test=True) @@ -103,7 +110,7 @@ def train(use_cuda, thread_num, cpu_num): exec_strategy = fluid.ExecutionStrategy() exec_strategy.num_threads = thread_num - exec_strategy.num_iteration_per_run = 2 + exec_strategy.num_iteration_per_run = 1 main_program = fluid.default_main_program() pe = fluid.ParallelExecutor( @@ -113,6 +120,22 @@ def train(use_cuda, thread_num, cpu_num): build_strategy=build_strategy, exec_strategy=exec_strategy) + py_reader.decorate_paddle_reader(train_reader) + py_reader.start() + + step = 0 + try: + while True: + print("step %d in" % step) + loss_val = pe.run(fetch_list=[avg_loss.name]) + loss_val = numpy.mean(loss_val) + if step % 1 == 0: + print("Batch %d, Cost %f, queue size %d" % + (step, loss_val, py_reader.queue.size())) + step += 1 + except fluid.core.EOFException: + py_reader.reset() + """ step = 0 for step_id, data in enumerate(train_reader()): loss_val = pe.run(feed=feeder.feed(data), fetch_list=[avg_loss.name]) @@ -120,6 +143,8 @@ def train(use_cuda, thread_num, cpu_num): if step % 100 == 0: print("Batch %d, 
Cost %f" % (step, loss_val)) step += 1 + """ + # test for epoch avg_loss_val, acc_val = train_test( train_test_program=test_program, -- GitLab