Commit e6a8b025 authored by Megvii Engine Team

fix(mge): ignore errors caused by earlier async errors

GitOrigin-RevId: ce2028d38acba75cf7e31ed5eac7de38f3204b45
Parent 0708bc78
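In effect, an error raised asynchronously on the interpreter worker no longer aborts unrelated work: the failing tensor, and any tensor computed from it, raises an error when it is synchronized, while tensors that do not depend on it stay usable. A minimal sketch of the intended behavior (assuming a build that includes this patch; it mirrors the regression test added below and uses the `_simulate_error` helper introduced in this commit):

    import megengine.functional as F

    bad = F.utils._simulate_error()  # enqueue an op that is guaranteed to fail on the worker
    good = F.zeros(16)               # an unrelated, valid tensor
    try:
        bad.numpy()                  # synchronization point: the earlier worker error surfaces here
    except RuntimeError as e:        # the C++ AsyncError reaches Python as a RuntimeError
        print("async error:", e)
    (good + good).numpy()            # computation that does not depend on `bad` still works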
@@ -12,6 +12,7 @@ from ..core.ops.builtin import AssertEqual
 from ..tensor import Tensor
 from ..utils.deprecation import deprecated_func
 from .elemwise import abs, maximum, minimum
+from .tensor import ones, zeros
 __all__ = ["topk_accuracy"]
@@ -59,6 +60,13 @@ def _assert_equal(
     return result
+def _simulate_error():
+    x1 = zeros(100)
+    x2 = ones(100)
+    (ret,) = apply(AssertEqual(maxerr=0, verbose=False), x1, x2, x2)
+    return ret
 topk_accuracy = deprecated_func(
     "1.3", "megengine.functional.metric", "topk_accuracy", True
 )
...
@@ -90,3 +90,11 @@ with megengine.core.option("enable_host_compute", 0):
     y.numpy()
 """
     subprocess.check_call([sys.executable, "-c", prog])
+def test_regression_2870():
+    x = F.zeros(1000)
+    y = F.utils._simulate_error()
+    with pytest.raises(RuntimeError):
+        y.numpy()
+    (x + x).numpy()
@@ -373,7 +373,7 @@ SmallVector<Handle> ChannelImpl::apply_op_impl(
     MGB_LOCK_GUARD(m_mutex);
     for (auto i : inputs) {
         auto info = reinterpret_cast<TensorInfo*>(i);
-        mgb_assert(!info->invalid, "Invalid tensor, unable to apply_op!");
+        mgb_assert(!info->invalid, "an input tensor is unusable due to previous error");
         input_infos.push_back(info);
         input_descs.push_back(info->desc);
     }
@@ -403,7 +403,7 @@ HostTensorND ChannelImpl::get_value(Handle handle) {
             "invalid handle: %p", handle);
     auto info = reinterpret_cast<TensorInfo*>(handle);
     // donnot use info->value_fetched, it's unsafe
-    mgb_assert(!info->invalid, "Invalid tensor, unable to get_value!");
+    mgb_assert(!info->invalid, "tensor is unusable due to previous error");
     return wait_tensor(info, TensorProp::HostValue)->get_value();
 }
@@ -776,7 +776,7 @@ void ChannelImpl::do_apply_op(const ApplyOp& cmd) {
     RECORD_EVENT(OpExecuteFinishEvent, apply_id);
     // End profiling operator
 }
 void ChannelImpl::flush_apply_stack() {
     m_applying = true;
     auto& state = get_worker_state();
@@ -1002,7 +1002,7 @@ std::tuple<SmallVector<MemoryDesc>, SmallVector<TensorPtr>, SmallVector<TensorPt
         }
         return tensors;
     };
     return {outputs_desc, alloc_storage(outputs_desc), alloc_storage(workspaces_desc)};
 }
@@ -1021,6 +1021,15 @@ void ChannelImpl::process_one_task(IdentifiedCommand& icmd) {
             RECORD_EVENT(TensorCommandFinishEvent, cmd.dest->id, TensorCommandFinishEvent::Put);
             sample_on_device(cmd.dest->desc.comp_node, false);
         } else if constexpr (std::is_same_v<T, ApplyOp>) {
+            for (auto& i : cmd.inputs) {
+                if (i->invalid) {
+                    MGB_LOCK_GUARD(m_mutex);
+                    for (auto& i : cmd.outputs) {
+                        i->invalid = true;
+                    }
+                    return;
+                }
+            }
             m_apply_stack.push({cmd, 0, nullptr});
             flush_apply_stack();
             for (size_t i = 0; i < cmd.outputs.size(); ++i) {
@@ -1085,21 +1094,23 @@ void ChannelImpl::process_one_task(IdentifiedCommand& icmd) {
             RECORD_EVENT(TensorCommandFinishEvent, tensor_id, TensorCommandFinishEvent::Del);
             sample_on_device(device, false);
         } else if constexpr (std::is_same_v<T, GetValue>) {
+            if (cmd.dest->invalid) return;
             imperative_log_profile_begin("GetValue");
             if (!cmd.dest->ptr && cmd.dest->evict_type != EvictType::NONE) {
                 regenerate(cmd.dest);
             }
-            mgb_assert(cmd.dest->ptr, "Invalid tensor ptr!");
             cmd.dest->ptr->fetch_value();
             MGB_LOCK_GUARD(m_mutex);
             notify_tensor_unsafe(cmd.dest);
             imperative_log_profile_end("GetValue");
         } else if constexpr (std::is_same_v<T, SwapIn>) {
+            if (cmd.dest->invalid) return;
             RECORD_EVENT(TensorCommandEvent, cmd.dest->id, TensorCommandEvent::SwapIn);
             produce_tensor(cmd.dest, Tensor::make(cmd.dest->h_value));
             RECORD_EVENT(TensorCommandFinishEvent, cmd.dest->id, TensorCommandFinishEvent::SwapIn);
             sample_on_device(cmd.dest->desc.comp_node, false);
         } else if constexpr (std::is_same_v<T, SwapOut>) {
+            if (cmd.dest->invalid) return;
             RECORD_EVENT(TensorCommandEvent, cmd.dest->id, TensorCommandEvent::SwapOut);
             cmd.dest->h_value = cmd.dest->ptr->get_value();
             if (cmd.dest->evict_type == EvictType::NONE) {
@@ -1110,6 +1121,7 @@ void ChannelImpl::process_one_task(IdentifiedCommand& icmd) {
             RECORD_EVENT(TensorCommandFinishEvent, cmd.dest->id, TensorCommandFinishEvent::SwapOut);
             sample_on_device(cmd.dest->desc.comp_node, false);
         } else if constexpr (std::is_same_v<T, Drop>) {
+            if (cmd.dest->invalid) return;
             RECORD_EVENT(TensorCommandEvent, cmd.dest->id, TensorCommandEvent::Drop);
             do_drop(cmd.dest, true);
             RECORD_EVENT(TensorCommandFinishEvent, cmd.dest->id, TensorCommandFinishEvent::Drop);
@@ -1186,7 +1198,11 @@ void ChannelImpl::check_worker_exc_unsafe() {
         m_waitee = nullptr;
         std::exception_ptr exc;
         std::swap(exc, m_worker_exc);
-        std::rethrow_exception(exc);
+        try {
+            std::rethrow_exception(exc);
+        } catch (...) {
+            throw AsyncError();
+        }
     }
 }
...
@@ -16,6 +16,17 @@
 namespace mgb::imperative::interpreter {
+struct AsyncError : std::nested_exception, std::exception {
+    const char* what() const noexcept {
+        try {
+            rethrow_nested();
+        } catch (const std::exception& e) {
+            return e.what();
+        } catch (...) {}
+        return "unknown async error";
+    }
+};
 struct Interpreter {
     using Handle = void*;
...