From 02dab46ab8101873663a63614f88931ead7846d9 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Mon, 28 Jan 2019 16:23:06 +0800 Subject: [PATCH] add some debug info --- .../details/async_ssa_graph_executor.cc | 2 ++ .../framework/details/exception_holder.h | 17 ++++++++++++ .../fluid/operators/reader/blocking_queue.h | 1 + .../test_async_ssa_graph_executor_mnist.py | 27 ++++++++++++++++++- 4 files changed, 46 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/framework/details/async_ssa_graph_executor.cc b/paddle/fluid/framework/details/async_ssa_graph_executor.cc index c259ff4f747..e21d5fb96dc 100644 --- a/paddle/fluid/framework/details/async_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/async_ssa_graph_executor.cc @@ -84,6 +84,8 @@ FeedFetchList AsyncSSAGraphExecutor::Run( } if (exception_holder_.IsCaught()) { + VLOG(3) << "caught exception " << exception_holder_.Type() + << ", rethrow it"; exception_holder_.ReThrow(); } diff --git a/paddle/fluid/framework/details/exception_holder.h b/paddle/fluid/framework/details/exception_holder.h index 1b1afce04eb..77ca03b86e6 100644 --- a/paddle/fluid/framework/details/exception_holder.h +++ b/paddle/fluid/framework/details/exception_holder.h @@ -14,6 +14,8 @@ #pragma once +#include <string> + #include "glog/logging.h" #include "paddle/fluid/platform/enforce.h" @@ -64,6 +66,21 @@ class ExceptionHolder { ClearImpl(); } + std::string Type() { + std::lock_guard<std::mutex> lock(mu_); + switch (type_) { + case kNone: + return "None"; + case kEnforceNotMet: { + return "EnforceNotMet"; + } + case kEOF: { + return "EOF"; + } + } + return "unknown"; + } + private: void ClearImpl() { exception_.reset(); diff --git a/paddle/fluid/operators/reader/blocking_queue.h b/paddle/fluid/operators/reader/blocking_queue.h index 51b980acb5a..45c3ad802fc 100644 --- a/paddle/fluid/operators/reader/blocking_queue.h +++ b/paddle/fluid/operators/reader/blocking_queue.h @@ -79,6 +79,7 @@ class BlockingQueue { return true; } else { PADDLE_ENFORCE(closed_); 
+ VLOG(3) << "queue is closed! return nothing."; return false; } } diff --git a/python/paddle/fluid/tests/unittests/test_async_ssa_graph_executor_mnist.py b/python/paddle/fluid/tests/unittests/test_async_ssa_graph_executor_mnist.py index 03d7df8852e..6a2f829654c 100644 --- a/python/paddle/fluid/tests/unittests/test_async_ssa_graph_executor_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_async_ssa_graph_executor_mnist.py @@ -59,6 +59,13 @@ def train(use_cuda, thread_num, cpu_num): img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32') label = fluid.layers.data(name='label', shape=[1], dtype='int64') + py_reader = fluid.layers.create_py_reader_by_data( + capacity=64, + feed_list=[img, label], + name='py_reader', + use_double_buffer=True) + img, label = fluid.layers.read_file(py_reader) + prediction, avg_loss, acc = convolutional_neural_network(img, label) test_program = fluid.default_main_program().clone(for_test=True) @@ -103,7 +110,7 @@ def train(use_cuda, thread_num, cpu_num): exec_strategy = fluid.ExecutionStrategy() exec_strategy.num_threads = thread_num - exec_strategy.num_iteration_per_run = 2 + exec_strategy.num_iteration_per_run = 1 main_program = fluid.default_main_program() pe = fluid.ParallelExecutor( @@ -113,6 +120,22 @@ def train(use_cuda, thread_num, cpu_num): build_strategy=build_strategy, exec_strategy=exec_strategy) + py_reader.decorate_paddle_reader(train_reader) + py_reader.start() + + step = 0 + try: + while True: + print("step %d in" % step) + loss_val = pe.run(fetch_list=[avg_loss.name]) + loss_val = numpy.mean(loss_val) + if step % 1 == 0: + print("Batch %d, Cost %f, queue size %d" % + (step, loss_val, py_reader.queue.size())) + step += 1 + except fluid.core.EOFException: + py_reader.reset() + """ step = 0 for step_id, data in enumerate(train_reader()): loss_val = pe.run(feed=feeder.feed(data), fetch_list=[avg_loss.name]) @@ -120,6 +143,8 @@ def train(use_cuda, thread_num, cpu_num): if step % 100 == 0: print("Batch %d, 
Cost %f" % (step, loss_val)) step += 1 + """ + # test for epoch avg_loss_val, acc_val = train_test( train_test_program=test_program, -- GitLab