diff --git a/src/tensorrt/impl/tensorrt_opr.cpp b/src/tensorrt/impl/tensorrt_opr.cpp
index f0a2b56af5039dc2aaac7178ce37b010fd854505..eecba29bc4cebe290770ac15fee0a47ce7f2c157 100644
--- a/src/tensorrt/impl/tensorrt_opr.cpp
+++ b/src/tensorrt/impl/tensorrt_opr.cpp
@@ -156,15 +156,20 @@ void TensorRTOpr::GpuAllocator::free(void* memory) {
 void TensorRTManager::create_trt_context(
         mgb::CompNode cn, const TensorShapeArray& inp_shape,
         nvinfer1::ICudaEngine* engine) {
-    if (!m_context) {
+    bool has_no_context = (!m_context);
+    if (has_no_context) {
         m_context = {engine->createExecutionContextWithoutDeviceMemory(), {}};
-        MGB_MARK_USED_VAR(cn);
+    }
+    MGB_MARK_USED_VAR(cn);
 #if NV_TENSOR_RT_VERSION >= 6001
-        auto profile_num = engine->getNbOptimizationProfiles();
-        auto bindings_per_profile = engine->getNbBindings() / profile_num;
-        // choose nearest profile
-        int profile_idx = 0;
+    auto profile_num = engine->getNbOptimizationProfiles();
+    auto bindings_per_profile = engine->getNbBindings() / profile_num;
+    // choose nearest profile
 #if NV_TENSOR_RT_VERSION >= 7200
+    bool has_select_profile = false;
+    if (has_no_context) {
+        has_select_profile = true;
+        int profile_idx = 0;
         if (profile_num > 1) {
             double dist = DBL_MAX;
             for (int i = 0; i < profile_num; i++) {
@@ -207,59 +212,76 @@ void TensorRTManager::create_trt_context(
             auto&& env = mgb::CompNodeEnv::from_comp_node(cn);
             m_context->setOptimizationProfileAsync(profile_idx, env.cuda_env().stream);
         }
-#endif
         m_offset = profile_idx * bindings_per_profile;
-    for (size_t i = m_offset; i < m_offset + inp_shape.size(); ++i) {
-        auto dims = m_context->getBindingDimensions(i);
-        for (int j = 0; j < dims.nbDims; j++) {
-            if (dims.d[j] == -1) {
-                dims.d[j] = inp_shape.at(i - m_offset)[j];
-            }
+    }
+#endif
+    bool is_set_correct = true;
+    for (size_t i = m_offset; i < m_offset + inp_shape.size(); ++i) {
+        auto dims = m_context->getBindingDimensions(i);
+        auto dims_check = engine->getBindingDimensions(i);
+        for (int j = 0; j < dims.nbDims; j++) {
+            if (dims_check.d[j] == -1) {
+                dims.d[j] = inp_shape.at(i - m_offset)[j];
             }
-            m_context->setBindingDimensions(m_offset, dims);
         }
-        // check if input shape is set correctly
-        for (int i = m_offset + inp_shape.size(); i < m_offset + bindings_per_profile;
-             ++i) {
-            auto dims = m_context->getBindingDimensions(i);
-            if (dims.nbDims == -1) {
-                for (int j = 0; j < profile_num; j++) {
-                    mgb_log_error("TensorRT profile %d:\n", j);
-                    for (size_t k = m_offset; k < m_offset + inp_shape.size(); k++) {
-                        mgb_log_error(
-                                "input[%zu]'s minimum shape is: %s\n", k - m_offset,
-                                TensorRTOpr::dims2shape(
-                                        engine->getProfileDimensions(
-                                                k, j,
-                                                nvinfer1::OptProfileSelector::kMIN))
-                                        .to_string()
-                                        .c_str());
-                        mgb_log_error(
-                                "input[%zu]'s optimum shape is: %s\n", k - m_offset,
-                                TensorRTOpr::dims2shape(
-                                        engine->getProfileDimensions(
-                                                k, j,
-                                                nvinfer1::OptProfileSelector::kOPT))
-                                        .to_string()
-                                        .c_str());
-                        mgb_log_error(
-                                "input[%zu]'s maximum shape is: %s\n", k - m_offset,
-                                TensorRTOpr::dims2shape(
-                                        engine->getProfileDimensions(
-                                                k, j,
-                                                nvinfer1::OptProfileSelector::kMAX))
-                                        .to_string()
-                                        .c_str());
-                    }
+        is_set_correct &= m_context->setBindingDimensions(i, dims);
+    }
+    // check if input shape is set correctly
+    if (!is_set_correct) {
+#if NV_TENSOR_RT_VERSION >= 7200
+        if (has_select_profile) {
+#endif
+            for (size_t j = 0; j < inp_shape.size(); ++j) {
+                mgb_log_error(
+                        "TensorRT input[%zu]'s shape is %s\n", j,
+                        inp_shape.at(j).to_string().c_str());
+            }
+            mgb_log_error(
+                    "The selected profile's idx is %d\n",
+                    m_offset / bindings_per_profile);
+            for (int j = 0; j < profile_num; j++) {
+                mgb_log_error("TensorRT profile %d:\n", j);
+                for (size_t k = m_offset; k < m_offset + inp_shape.size(); k++) {
+                    mgb_log_error(
+                            "input[%zu]'s minimum shape is: %s\n", k - m_offset,
+                            TensorRTOpr::dims2shape(
+                                    engine->getProfileDimensions(
+                                            k, j, nvinfer1::OptProfileSelector::kMIN))
+                                    .to_string()
+                                    .c_str());
+                    mgb_log_error(
+                            "input[%zu]'s optimum shape is: %s\n", k - m_offset,
+                            TensorRTOpr::dims2shape(
+                                    engine->getProfileDimensions(
+                                            k, j, nvinfer1::OptProfileSelector::kOPT))
+                                    .to_string()
+                                    .c_str());
+                    mgb_log_error(
+                            "input[%zu]'s maximum shape is: %s\n", k - m_offset,
+                            TensorRTOpr::dims2shape(
+                                    engine->getProfileDimensions(
+                                            k, j, nvinfer1::OptProfileSelector::kMAX))
+                                    .to_string()
+                                    .c_str());
                 }
-                mgb_throw(
-                        MegBrainError,
-                        "Invalid network output, this might be caused by inconsistent "
-                        "input shapes.Correct input optimization profiles as above.");
             }
+            mgb_throw(
+                    MegBrainError,
+                    "Invalid network output, this might be caused by "
+                    "inconsistent "
+                    "input shapes.Correct input optimization profiles as "
+                    "above.");
+#if NV_TENSOR_RT_VERSION >= 7200
+        } else {
+            // must clear context before create a new context, because
+            // setOptimizationProfileAsync() must be called before calling
+            // setBindingDimensions()
+            clear_trt_context();
+            create_trt_context(cn, inp_shape, engine);
         }
 #endif
     }
+#endif
 }
 
 #if NV_TENSOR_RT_VERSION >= 6001
@@ -284,12 +306,15 @@ void TensorRTManager::exec(
     }
     auto workspace_ptr = opr->output().back()->dev_tensor().raw_ptr();
     bool should_reinit_device_memory =
-            !m_context || m_device_workspace_memory_ptr != workspace_ptr;
-    TensorShapeArray arr;
-    for (auto&& i : opr->input()) {
-        arr.push_back(i->shape());
+            !m_context || (m_device_workspace_memory_ptr != workspace_ptr) ||
+            (workspace_ptr == nullptr);
+    if (!m_context) {
+        TensorShapeArray arr;
+        for (auto&& i : opr->input()) {
+            arr.push_back(i->shape());
+        }
+        create_trt_context(comp_node, arr, engine);
     }
-    create_trt_context(comp_node, arr, engine);
     m_trt_iobuf.resize(engine->getNbBindings());
     bool is_trt_opr = false;
     if (opr->same_type<TensorRTOpr>()) {
diff --git a/src/tensorrt/impl/tensorrt_opr.sereg.h b/src/tensorrt/impl/tensorrt_opr.sereg.h
index 1db3513c994ea79e2aa59210521e921cf5601704..d534dda542a557cf1bbb74894c04a4f953a09ebf 100644
--- a/src/tensorrt/impl/tensorrt_opr.sereg.h
+++ b/src/tensorrt/impl/tensorrt_opr.sereg.h
@@ -42,6 +42,7 @@ cg::OperatorNodeBase* opr_shallow_copy_tensor_rt_runtime_opr(
         const cg::OperatorNodeBase& opr_, const VarNodeArray& inputs,
         const OperatorNodeConfig& config) {
     auto&& opr = opr_.cast_final_safe<TensorRTRuntimeOpr>();
+    opr.clear_trt_context();
     return TensorRTRuntimeOpr::make(
                    opr.trt_cuda_engine(), opr.trt_gpu_allocator(),
                    cg::to_symbol_var_array(inputs), config)
diff --git a/src/tensorrt/impl/tensorrt_runtime_opr.cpp b/src/tensorrt/impl/tensorrt_runtime_opr.cpp
index 822116bb49c8bfaf6c4a3e40a17392477f486b07..f1864ee9d169fe95e7a6ea0eb6848aefe6d2822e 100644
--- a/src/tensorrt/impl/tensorrt_runtime_opr.cpp
+++ b/src/tensorrt/impl/tensorrt_runtime_opr.cpp
@@ -107,7 +107,6 @@ TensorRTRuntimeOpr::TensorRTRuntimeOpr(
 void TensorRTRuntimeOpr::get_output_var_shape(
         const TensorShapeArray& inp_shape, TensorShapeArray& out_shape) const {
     auto batch = inp_shape.at(0)[0];
-    m_manager.clear_trt_context();
     m_manager.create_trt_context(this->comp_node(), inp_shape, m_engine.get());
     auto get_mgb_shape = [&](int binding_idx) -> TensorShape {
         auto dims = m_engine->getBindingDimensions(binding_idx);
@@ -160,8 +159,6 @@ void TensorRTRuntimeOpr::get_output_var_shape(
         out_shape[i] = get_mgb_shape(i + input().size());
     }
     out_shape.back() = {intl::workspace_size(m_engine.get())};
-    // must clear context, otherwise it may cause unknwon error.
-    m_manager.clear_trt_context();
 }
 
 void TensorRTRuntimeOpr::add_input_layout_constraint() {
diff --git a/src/tensorrt/include/megbrain/tensorrt/tensorrt_opr.h b/src/tensorrt/include/megbrain/tensorrt/tensorrt_opr.h
index baa6702f12e85ed8be1d82cbc33a283fad39e849..bf0a742da46a3ca4f134e14c9e63b115c2491f11 100644
--- a/src/tensorrt/include/megbrain/tensorrt/tensorrt_opr.h
+++ b/src/tensorrt/include/megbrain/tensorrt/tensorrt_opr.h
@@ -50,7 +50,7 @@ class TensorRTManager {
     std::vector<void*> m_trt_iobuf;
     TensorRTUniquePtr<nvinfer1::IExecutionContext> m_context;
     void* m_device_workspace_memory_ptr;
-    int m_offset;
+    int m_offset = 0;
 
 public:
     void create_trt_context(
diff --git a/src/tensorrt/include/megbrain/tensorrt/tensorrt_runtime_opr.h b/src/tensorrt/include/megbrain/tensorrt/tensorrt_runtime_opr.h
index 8b72022965b091322f2ce56e6a4107bf9ab1098c..ae45f0871c788ef165634008fdaf827602503d5c 100644
--- a/src/tensorrt/include/megbrain/tensorrt/tensorrt_runtime_opr.h
+++ b/src/tensorrt/include/megbrain/tensorrt/tensorrt_runtime_opr.h
@@ -82,6 +82,9 @@ public:
         return m_gpu_allocator;
     }
 
+    //! used in shallow copy to avoid create context twice error
+    void clear_trt_context() const { m_manager.clear_trt_context(); }
+
 private:
     // note: gpu allocator must be released after other trt objects
     std::shared_ptr<GpuAllocator> m_gpu_allocator;
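
Note on the binding protocol that the create_trt_context() hunks rely on:
with a dynamic-shape engine, an optimization profile must be selected via
setOptimizationProfileAsync() before any setBindingDimensions() call, and
setBindingDimensions() returns false when the requested shape falls outside
the selected profile. The standalone sketch below illustrates that flow
against the plain TensorRT API (version >= 7.2, matching the patch's guard);
it is not MegBrain code, and bind_input_shape() and its parameters are
illustrative names.

#include <NvInfer.h>
#include <cuda_runtime_api.h>

// Bind one dynamic input shape on a context created from a (possibly
// multi-profile) engine. Returns false if the shape violates the selected
// profile, which is the condition the patch accumulates into is_set_correct.
bool bind_input_shape(
        nvinfer1::ICudaEngine* engine, nvinfer1::IExecutionContext* context,
        int profile_idx, const nvinfer1::Dims& input_shape, cudaStream_t stream) {
    // Profile selection must precede setBindingDimensions(); this ordering
    // is why the patched code clears and recreates the context, rather than
    // switching profiles on a live context, when the cached profile no
    // longer matches the incoming shapes.
    if (!context->setOptimizationProfileAsync(profile_idx, stream))
        return false;
    int bindings_per_profile =
            engine->getNbBindings() / engine->getNbOptimizationProfiles();
    int binding = profile_idx * bindings_per_profile;  // first input binding
    // Only dimensions the engine reports as -1 are dynamic and may be
    // overwritten; static dimensions stay as built (the dims_check logic
    // added in the first hunk).
    nvinfer1::Dims dims = context->getBindingDimensions(binding);
    nvinfer1::Dims built = engine->getBindingDimensions(binding);
    for (int j = 0; j < dims.nbDims; ++j) {
        if (built.d[j] == -1)
            dims.d[j] = input_shape.d[j];
    }
    return context->setBindingDimensions(binding, dims);
}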
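
A similar sketch for the exec() hunk, under the same assumptions and again
with illustrative names: a context created with
createExecutionContextWithoutDeviceMemory() owns no workspace, so the caller
must provide one through setDeviceMemory() and must provide it again whenever
the workspace pointer changes; the patch additionally treats a null cached
pointer as "rebind needed".

// Rebind the externally managed workspace when it moved or was never set.
// The buffer must be at least ICudaEngine::getDeviceMemorySize() bytes.
void maybe_rebind_workspace(
        nvinfer1::IExecutionContext* context, void*& cached_ptr,
        void* workspace_ptr) {
    bool should_reinit =
            (cached_ptr != workspace_ptr) || (workspace_ptr == nullptr);
    if (should_reinit) {
        context->setDeviceMemory(workspace_ptr);
        cached_ptr = workspace_ptr;
    }
}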