提交 a92aea1f 编写于 作者: M Megvii Engine Team

fix(imperative/dtr): fix dtr crash issue

GitOrigin-RevId: 3de477593690838ffc2b5281aba44ab2b9facb7f
上级 2809316e
......@@ -149,11 +149,17 @@ def test_dtr_resnet1202():
assert p.exitcode == 0
# FIXME: fix dtr crash
# @pytest.mark.require_ngpu(1)
# @pytest.mark.isolated_distributed
# def test_dtr_drop_copy_dev_tensor():
# p = mp.Process(target=run_dtr_drop_copy_dev_tensor)
# p.start()
# p.join()
# assert p.exitcode == 0
@pytest.mark.require_ngpu(1)
@pytest.mark.isolated_distributed
def test_dtr_drop_copy_dev_tensor():
    # Run the DTR drop/copy-dev-tensor scenario in a child process so that a
    # crash in the scenario cannot bring down the whole test runner.
    worker = mp.Process(target=run_dtr_drop_copy_dev_tensor)
    worker.start()
    worker.join()
    # An exit code of 0 means the child process finished without crashing.
    assert worker.exitcode == 0
@pytest.mark.require_ngpu(1)
@pytest.mark.isolated_distributed
def test_dtr_drop_tensor():
    # Repeat the drop/copy scenario many times — presumably to surface an
    # intermittent crash that a single run might miss (TODO confirm intent).
    for _ in range(50):
        test_dtr_drop_copy_dev_tensor()
......@@ -493,7 +493,18 @@ HostTensorND ChannelImpl::get_value(Handle handle) {
auto info = reinterpret_cast<TensorInfo*>(handle);
// donnot use info->value_fetched, it's unsafe
mgb_assert(!info->invalid, "tensor is unusable due to previous error");
return wait_tensor(info, TensorProp::HostValue)->get_value();
// pin
SmallVector<TensorInfo*> vec({info});
m_dtr.pin(vec);
auto ret = wait_tensor(info, TensorProp::HostValue)->get_value();
// unpin
auto& state = get_channel_state();
auto dtr_evictee_minimum_size = state.options.dtr_evictee_minimum_size;
m_dtr.unpin(vec, dtr_evictee_minimum_size);
return ret;
}
TensorShape ChannelImpl::get_shape(Handle handle) {
......@@ -916,7 +927,9 @@ void ChannelImpl::do_apply_op(const ApplyOp& cmd, std::string reason) {
i->compute_time = estimate_compute_time;
}
}
m_dtr.unpin(cmd.inputs, state);
auto& state = get_worker_state();
auto dtr_evictee_minimum_size = state.options.dtr_evictee_minimum_size;
m_dtr.unpin(cmd.inputs, dtr_evictee_minimum_size);
}
MGB_RECORD_EVENT(OpExecuteFinishEvent, apply_id, {}, reason);
// End profiling operator
......@@ -1098,11 +1111,12 @@ TensorPtr ChannelImpl::wait_tensor(TensorInfo* info, TensorProp prop) {
return require_host ? host_available() : static_cast<bool>(info->ptr);
});
}
auto ptr = info->ptr;
MGB_RECORD_EVENT(
TensorWaitPropFinishEvent, info->id, m_waitee_id, prop, backtrace_getter);
m_waitee = nullptr;
if (wait_host) {
auto err = info->ptr->comp_node().check_async_error();
auto err = ptr->comp_node().check_async_error();
mgb_assert(!err, "%s", err->what());
}
if (wait_regen) {
......@@ -1119,7 +1133,7 @@ TensorPtr ChannelImpl::wait_tensor(TensorInfo* info, TensorProp prop) {
}
lock.lock();
}
return info->ptr;
return ptr;
}
void ChannelImpl::notify_tensor_unsafe(TensorInfo* info) {
......@@ -1556,11 +1570,10 @@ void ChannelImpl::DynamicSublinear::pin(const SmallVector<TensorInfo*>& vec) {
}
void ChannelImpl::DynamicSublinear::unpin(
const SmallVector<TensorInfo*>& vec, WorkerState& state) {
const SmallVector<TensorInfo*>& vec, size_t& dtr_evictee_minimum_size) {
for (auto i : vec) {
i->unpin();
if (i->pinned == 0 &&
i->size_exceeds_thd(state.options.dtr_evictee_minimum_size) &&
if (i->pinned == 0 && i->size_exceeds_thd(dtr_evictee_minimum_size) &&
i->cand_index == UINT_MAX) {
insert_candidate(i);
}
......
......@@ -265,7 +265,8 @@ private:
/*!
* \brief unpin the tensors in vec
*/
void unpin(const SmallVector<TensorInfo*>& vec, WorkerState& state);
void unpin(
const SmallVector<TensorInfo*>& vec, size_t& dtr_evictee_minimum_size);
/*!
* \brief add the tensor to the candidate set
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册