From 4061aa6488ae8a84d7b70d3875bb0bcee62f69fd Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Fri, 10 Jul 2020 14:36:53 +0800 Subject: [PATCH] Polish ParallelExecutor exception process logic (#25449) * polish pe exception process logic, test=develop * fix unittest, test=develop * add unittests, test=develop --- .../framework/details/exception_holder.h | 30 +++++++--- .../details/exception_holder_test.cc | 56 ++++++++++++++++++- .../fast_threaded_ssa_graph_executor.cc | 9 ++- .../fluid/framework/details/op_handle_base.cc | 2 +- 4 files changed, 83 insertions(+), 14 deletions(-) diff --git a/paddle/fluid/framework/details/exception_holder.h b/paddle/fluid/framework/details/exception_holder.h index 25c62877bf..f378566b60 100644 --- a/paddle/fluid/framework/details/exception_holder.h +++ b/paddle/fluid/framework/details/exception_holder.h @@ -107,21 +107,31 @@ class ExceptionHolder { type_ = kNone; } + // NOTE: currently in PE, multiple exceptions may occured in multiple + // threads, and the exception that occur later will overwrite that + // occur earlier, but what we want should be the first triggered exception. + // However, EOF exception is lower priority exception and can be overwritten, + // but other exceptions should not be prioritized. void Catch(const platform::EnforceNotMet& exp) { std::lock_guard lock(mu_); - exception_.reset(new platform::EnforceNotMet(exp)); - type_ = kEnforceNotMet; + if (exception_.get() == nullptr || type_ == kEOF) { + exception_.reset(new platform::EnforceNotMet(exp)); + type_ = kEnforceNotMet; + } else { + VLOG(2) << "Non-first exception is discarded, the error message is" + << exception_->what(); + } } void Catch(const memory::allocation::BadAlloc& exp) { std::lock_guard lock(mu_); - // BadAlloc have the highest priority - if (exception_.get() != nullptr) { - VLOG(2) << "exception is reset by BadAlloc, the original error message is" + if (exception_.get() == nullptr || type_ == kEOF) { + exception_.reset(new paddle::memory::allocation::BadAlloc(exp)); + type_ = kBadAlloc; + } else { + VLOG(2) << "Non-first exception is discarded, the error message is" << exception_->what(); } - exception_.reset(new paddle::memory::allocation::BadAlloc(exp)); - type_ = kBadAlloc; } void Catch(const platform::EOFException& exp) { @@ -138,10 +148,12 @@ class ExceptionHolder { void Catch(const std::exception& exp) { std::lock_guard lock(mu_); - // std::exception will not cover anything - if (exception_.get() == nullptr) { + if (exception_.get() == nullptr || type_ == kEOF) { exception_.reset(new std::exception(exp)); type_ = kBaseException; + } else { + VLOG(2) << "Non-first exception is discarded, the error message is" + << exception_->what(); } } diff --git a/paddle/fluid/framework/details/exception_holder_test.cc b/paddle/fluid/framework/details/exception_holder_test.cc index 48a250a331..c20563a086 100644 --- a/paddle/fluid/framework/details/exception_holder_test.cc +++ b/paddle/fluid/framework/details/exception_holder_test.cc @@ -24,6 +24,29 @@ namespace details { namespace f = paddle::framework; namespace p = paddle::platform; +TEST(ExceptionHolderTester, TestEnforceNotMetCatch) { + ExceptionHolder exception_holder; + + try { + throw platform::EnforceNotMet("enforce not met test", "test_file", 0); + } catch (...) { + exception_holder.Catch(std::current_exception()); + } + ASSERT_TRUE(exception_holder.IsCaught()); + ASSERT_EQ(exception_holder.Type(), "EnforceNotMet"); + + bool catch_enforce_not_met = false; + try { + exception_holder.ReThrow(); + } catch (platform::EnforceNotMet& ex) { + catch_enforce_not_met = true; + } catch (...) { + catch_enforce_not_met = false; + } + + ASSERT_TRUE(catch_enforce_not_met); +} + TEST(ExceptionHolderTester, TestBadAllocCatch) { ExceptionHolder exception_holder; @@ -70,15 +93,24 @@ TEST(ExceptionHolderTester, TestBaseExpceptionCatch) { ASSERT_TRUE(catch_base_exception); } -TEST(ExceptionHolderTester, TestBadAllocCatchReplace) { +TEST(ExceptionHolderTester, TestExceptionReplace) { ExceptionHolder exception_holder; + + try { + throw platform::EnforceNotMet("enforce not met test", "test_file", 0); + } catch (...) { + exception_holder.Catch(std::current_exception()); + } + ASSERT_TRUE(exception_holder.IsCaught()); + ASSERT_EQ(exception_holder.Type(), "EnforceNotMet"); + try { throw std::exception(); } catch (...) { exception_holder.Catch(std::current_exception()); } ASSERT_TRUE(exception_holder.IsCaught()); - ASSERT_EQ(exception_holder.Type(), "BaseException"); + ASSERT_EQ(exception_holder.Type(), "EnforceNotMet"); try { throw memory::allocation::BadAlloc("bad alloc test", "test_file", 0); @@ -86,13 +118,31 @@ TEST(ExceptionHolderTester, TestBadAllocCatchReplace) { exception_holder.Catch(std::current_exception()); } ASSERT_TRUE(exception_holder.IsCaught()); - ASSERT_EQ(exception_holder.Type(), "BadAlloc"); + ASSERT_EQ(exception_holder.Type(), "EnforceNotMet"); try { throw platform::EOFException("eof test", "test_file", 0); } catch (...) { exception_holder.Catch(std::current_exception()); } + ASSERT_EQ(exception_holder.Type(), "EnforceNotMet"); + + exception_holder.Clear(); + + try { + throw memory::allocation::BadAlloc("bad alloc test", "test_file", 0); + } catch (...) { + exception_holder.Catch(std::current_exception()); + } + ASSERT_TRUE(exception_holder.IsCaught()); + ASSERT_EQ(exception_holder.Type(), "BadAlloc"); + + try { + throw platform::EnforceNotMet("enforce not met test", "test_file", 0); + } catch (...) { + exception_holder.Catch(std::current_exception()); + } + ASSERT_TRUE(exception_holder.IsCaught()); ASSERT_EQ(exception_holder.Type(), "BadAlloc"); } diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc index 9d1395c035..f5ec78f44b 100644 --- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc @@ -269,7 +269,14 @@ void FastThreadedSSAGraphExecutor::RecordOps(OpHandleBase *op) { void FastThreadedSSAGraphExecutor::ExecutionFinal( std::vector *fetch_ops) { VLOG(3) << "caught exception " << exception_.Type() << ", rethrow it"; - ClearFetchOp(graph_, fetch_ops); + // NOTE: If a new exception occurs in this ClearFetchOp operation, it will + // cause the loss of exception triggered firstly not thrown. + // Instead, the cleanup operation should only be performed when an EOF + // exception is caught. If other exceptions are triggered, the ClearFetchOp + // should not be continued. + if (exception_.Type() == "EOF") { + ClearFetchOp(graph_, fetch_ops); + } exception_.ReThrow(); } diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc index 39303447d2..35fe5d631f 100644 --- a/paddle/fluid/framework/details/op_handle_base.cc +++ b/paddle/fluid/framework/details/op_handle_base.cc @@ -36,7 +36,7 @@ OpHandleBase::~OpHandleBase() PADDLE_MAY_THROW { #ifdef PADDLE_WITH_CUDA for (auto &ev : events_) { if (ev.second) { - PADDLE_ENFORCE(cudaEventDestroy(ev.second)); + PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(ev.second)); } } #endif -- GitLab