提交 a92aea1f 编写于 作者: M Megvii Engine Team

fix(imperative/dtr): fix dtr crash issue

GitOrigin-RevId: 3de477593690838ffc2b5281aba44ab2b9facb7f
上级 2809316e
......@@ -149,11 +149,17 @@ def test_dtr_resnet1202():
assert p.exitcode == 0
# FIXME: fix dtr crash
# @pytest.mark.require_ngpu(1)
# @pytest.mark.isolated_distributed
# def test_dtr_drop_copy_dev_tensor():
# p = mp.Process(target=run_dtr_drop_copy_dev_tensor)
# p.start()
# p.join()
# assert p.exitcode == 0
@pytest.mark.require_ngpu(1)
@pytest.mark.isolated_distributed
def test_dtr_drop_copy_dev_tensor():
    # Run the DTR drop/copy-dev-tensor scenario in a child process so that a
    # crash in the scenario cannot bring down the whole test runner.
    worker = mp.Process(target=run_dtr_drop_copy_dev_tensor)
    worker.start()
    worker.join()
    # An exit code of 0 means the child process finished without crashing.
    assert worker.exitcode == 0
@pytest.mark.require_ngpu(1)
@pytest.mark.isolated_distributed
def test_dtr_drop_tensor():
    # Repeat the drop/copy scenario many times — presumably to surface an
    # intermittent crash that a single run might miss (TODO confirm intent).
    for _ in range(50):
        test_dtr_drop_copy_dev_tensor()
......@@ -493,7 +493,18 @@ HostTensorND ChannelImpl::get_value(Handle handle) {
auto info = reinterpret_cast<TensorInfo*>(handle);
// donnot use info->value_fetched, it's unsafe
mgb_assert(!info->invalid, "tensor is unusable due to previous error");
return wait_tensor(info, TensorProp::HostValue)->get_value();
// pin
SmallVector<TensorInfo*> vec({info});
m_dtr.pin(vec);
auto ret = wait_tensor(info, TensorProp::HostValue)->get_value();
// unpin
auto& state = get_channel_state();
auto dtr_evictee_minimum_size = state.options.dtr_evictee_minimum_size;
m_dtr.unpin(vec, dtr_evictee_minimum_size);
return ret;
}
TensorShape ChannelImpl::get_shape(Handle handle) {
......@@ -916,7 +927,9 @@ void ChannelImpl::do_apply_op(const ApplyOp& cmd, std::string reason) {
i->compute_time = estimate_compute_time;
}
}
m_dtr.unpin(cmd.inputs, state);
auto& state = get_worker_state();
auto dtr_evictee_minimum_size = state.options.dtr_evictee_minimum_size;
m_dtr.unpin(cmd.inputs, dtr_evictee_minimum_size);
}
MGB_RECORD_EVENT(OpExecuteFinishEvent, apply_id, {}, reason);
// End profiling operator
......@@ -1098,11 +1111,12 @@ TensorPtr ChannelImpl::wait_tensor(TensorInfo* info, TensorProp prop) {
return require_host ? host_available() : static_cast<bool>(info->ptr);
});
}
auto ptr = info->ptr;
MGB_RECORD_EVENT(
TensorWaitPropFinishEvent, info->id, m_waitee_id, prop, backtrace_getter);
m_waitee = nullptr;
if (wait_host) {
auto err = info->ptr->comp_node().check_async_error();
auto err = ptr->comp_node().check_async_error();
mgb_assert(!err, "%s", err->what());
}
if (wait_regen) {
......@@ -1119,7 +1133,7 @@ TensorPtr ChannelImpl::wait_tensor(TensorInfo* info, TensorProp prop) {
}
lock.lock();
}
return info->ptr;
return ptr;
}
void ChannelImpl::notify_tensor_unsafe(TensorInfo* info) {
......@@ -1556,11 +1570,10 @@ void ChannelImpl::DynamicSublinear::pin(const SmallVector<TensorInfo*>& vec) {
}
void ChannelImpl::DynamicSublinear::unpin(
const SmallVector<TensorInfo*>& vec, WorkerState& state) {
const SmallVector<TensorInfo*>& vec, size_t& dtr_evictee_minimum_size) {
for (auto i : vec) {
i->unpin();
if (i->pinned == 0 &&
i->size_exceeds_thd(state.options.dtr_evictee_minimum_size) &&
if (i->pinned == 0 && i->size_exceeds_thd(dtr_evictee_minimum_size) &&
i->cand_index == UINT_MAX) {
insert_candidate(i);
}
......
......@@ -265,7 +265,8 @@ private:
/*!
* \brief unpin the tensors in vec
*/
void unpin(const SmallVector<TensorInfo*>& vec, WorkerState& state);
void unpin(
const SmallVector<TensorInfo*>& vec, size_t& dtr_evictee_minimum_size);
/*!
* \brief add the tensor to the candidate set
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册