diff --git a/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc b/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc
index 01d7f700da9cc67d0ebbd3d9649e3823f58a8811..c5a413221ebff6b9be114151dbb93fd23a148440 100644
--- a/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc
@@ -29,9 +29,9 @@ TEST(OpConverter, ConvertBlock) {
   // init trt engine
   cudaStream_t stream_;
   std::unique_ptr<TensorRTEngine> engine_;
-  engine_.reset(new TensorRTEngine(5, 1 << 15, &stream_));
-  engine_->InitNetwork();
   PADDLE_ENFORCE_EQ(cudaStreamCreate(&stream_), 0);
+  engine_.reset(new TensorRTEngine(5, 1 << 15, stream_));
+  engine_->InitNetwork();
 
   engine_->DeclareInput("conv2d-X", nvinfer1::DataType::kFLOAT,
                         nvinfer1::Dims3(2, 5, 5));
diff --git a/paddle/fluid/inference/tensorrt/convert/ut_helper.h b/paddle/fluid/inference/tensorrt/convert/ut_helper.h
index f313beb73bb0d21cab1d62859a46fcc76a373548..e83961f3d7bda03a7659f175c59105dcb60708e9 100644
--- a/paddle/fluid/inference/tensorrt/convert/ut_helper.h
+++ b/paddle/fluid/inference/tensorrt/convert/ut_helper.h
@@ -78,11 +78,9 @@ class TRTConvertValidation {
         scope_(scope),
         if_add_batch_(if_add_batch),
         max_batch_size_(max_batch_size) {
-    // create engine.
-    engine_.reset(new TensorRTEngine(max_batch_size, workspace_size, &stream_));
-    engine_->InitNetwork();
-
     PADDLE_ENFORCE_EQ(cudaStreamCreate(&stream_), 0);
+    engine_.reset(new TensorRTEngine(max_batch_size, workspace_size, stream_));
+    engine_->InitNetwork();
   }
 
   // Declare a Variable as input with random initialization.
@@ -175,7 +173,7 @@ class TRTConvertValidation {
     op_->Run(scope_, place);
     // Execute TRT.
     engine_->Execute(batch_size);
-    cudaStreamSynchronize(*engine_->stream());
+    cudaStreamSynchronize(engine_->stream());
 
     ASSERT_FALSE(op_desc_->OutputArgumentNames().empty());
     const size_t output_space_size = 3000;
@@ -184,7 +182,7 @@ class TRTConvertValidation {
       std::vector<float> fluid_out;
       std::vector<float> trt_out(output_space_size);
       engine_->GetOutputInCPU(output, &trt_out[0], output_space_size);
-      cudaStreamSynchronize(*engine_->stream());
+      cudaStreamSynchronize(engine_->stream());
 
       auto* var = scope_.FindVar(output);
       auto tensor = var->GetMutable<framework::LoDTensor>();
diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc
index f739752cbc44805cb0fb3246385609cf16ba744a..78b590f15d639f7b21b403413760948c6343d998 100644
--- a/paddle/fluid/inference/tensorrt/engine.cc
+++ b/paddle/fluid/inference/tensorrt/engine.cc
@@ -42,14 +42,13 @@ void TensorRTEngine::Execute(int batch_size) {
     PADDLE_ENFORCE(buf.device == DeviceType::GPU);
     buffers.push_back(buf.buffer);
   }
-  PADDLE_ENFORCE_NOT_NULL(stream_);
-  infer_context_->enqueue(batch_size, buffers.data(), *stream_, nullptr);
-  cudaStreamSynchronize(*stream_);
+  infer_context_->enqueue(batch_size, buffers.data(), stream_, nullptr);
+  cudaStreamSynchronize(stream_);
   SetRuntimeBatch(batch_size);
 }
 
 TensorRTEngine::~TensorRTEngine() {
-  cudaStreamSynchronize(*stream_);
+  cudaStreamSynchronize(stream_);
   // clean buffer
   for (auto &buf : buffers_) {
     if (buf.device == DeviceType::GPU && buf.buffer != nullptr) {
@@ -173,7 +172,7 @@ void TensorRTEngine::GetOutputInGPU(const std::string &name, void *dst,
   auto &buf = buffer(name);
   PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated before");
   PADDLE_ENFORCE_EQ(cudaMemcpyAsync(dst, buf.buffer, dst_size,
-                                    cudaMemcpyDeviceToDevice, *stream_),
+                                    cudaMemcpyDeviceToDevice, stream_),
                     0);
 }
 
@@ -194,7 +193,7 @@ void TensorRTEngine::GetOutputInCPU(const std::string &name, void *dst,
   auto &buf = buffer(name);
   PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated before");
   PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(dst, buf.buffer, dst_size,
-                                       cudaMemcpyDeviceToHost, *stream_));
+                                       cudaMemcpyDeviceToHost, stream_));
 }
 
 Buffer &TensorRTEngine::buffer(const std::string &name) {
@@ -211,12 +210,11 @@ void TensorRTEngine::SetInputFromCPU(const std::string &name, const void *data,
   auto &buf = buffer(name);
   PADDLE_ENFORCE_NOT_NULL(buf.buffer);
   PADDLE_ENFORCE_NOT_NULL(data);
-  PADDLE_ENFORCE_NOT_NULL(stream_);
   PADDLE_ENFORCE_LE(size, buf.max_size, "buffer is too small");
   PADDLE_ENFORCE(buf.device == DeviceType::GPU);
   buf.size = size;
   PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(buf.buffer, data, size,
-                                       cudaMemcpyHostToDevice, *stream_));
+                                       cudaMemcpyHostToDevice, stream_));
 }
 
 void TensorRTEngine::SetInputFromGPU(const std::string &name, const void *data,
@@ -227,7 +225,7 @@ void TensorRTEngine::SetInputFromGPU(const std::string &name, const void *data,
   PADDLE_ENFORCE_LE(size, buf.max_size, "buffer is too small");
   PADDLE_ENFORCE(buf.device == DeviceType::GPU);
   PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(buf.buffer, data, size,
-                                       cudaMemcpyDeviceToDevice, *stream_));
+                                       cudaMemcpyDeviceToDevice, stream_));
 }
 
 void TensorRTEngine::SetITensor(const std::string &name,
diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h
index f5b2c28ba9e6fefc1d6c14640d696c3bf3ac8249..65ab7f3caaa746cf339de67706939070a0b7d87d 100644
--- a/paddle/fluid/inference/tensorrt/engine.h
+++ b/paddle/fluid/inference/tensorrt/engine.h
@@ -54,17 +54,14 @@ class TensorRTEngine : public EngineBase {
     nvinfer1::Weights w_;
   };
 
-  TensorRTEngine(int max_batch, int max_workspace,
-                 cudaStream_t* stream = nullptr, int device = 0,
+  TensorRTEngine(int max_batch, int max_workspace, cudaStream_t stream,
+                 int device = 0,
                  nvinfer1::ILogger& logger = NaiveLogger::Global())
       : max_batch_(max_batch),
         max_workspace_(max_workspace),
-        stream_(stream ? stream : &default_stream_),
+        stream_(stream),
         logger_(logger),
-        device_(device) {
-    freshDeviceId();
-    cudaStreamCreate(stream_);
-  }
+        device_(device) {}
 
   virtual ~TensorRTEngine();
 
@@ -102,7 +99,7 @@ class TensorRTEngine : public EngineBase {
   // NOTE this should be used after calling `FreezeNetwork`.
   Buffer& buffer(const std::string& name) override;
 
-  cudaStream_t* stream() { return stream_; }
+  cudaStream_t stream() { return stream_; }
 
   // Fill an input from CPU memory with name and size.
   void SetInputFromCPU(const std::string& name, const void* data, size_t size);
@@ -158,9 +155,8 @@ class TensorRTEngine : public EngineBase {
   // batch size of the current data, will be updated each Executation.
   int batch_size_{-1};
 
-  cudaStream_t* stream_;
-  // If stream_ is not set from outside, hold its own stream.
-  cudaStream_t default_stream_;
+  cudaStream_t stream_;
+
   nvinfer1::ILogger& logger_;
   std::vector<Buffer> buffers_;
 
@@ -208,38 +204,6 @@ class TensorRTEngine : public EngineBase {
 #define TRT_ENGINE_ADD_LAYER(engine__, layer__, ARGS...) \
   engine__->network()->add##layer__(ARGS);
 
-/*
- * Helper to control the TensorRT engine's creation and deletion.
- */
-class TRT_EngineManager {
- public:
-  bool HasEngine(const std::string& name) const {
-    return engines_.count(name) != 0;
-  }
-
-  // Get an engine called `name`.
-  TensorRTEngine* Get(const std::string& name) const {
-    return engines_.at(name).get();
-  }
-
-  // Create or get an engine called `name`
-  TensorRTEngine* Create(int max_batch, int max_workspace, cudaStream_t* stream,
-                         const std::string& name, int gpu_device = 0) {
-    auto* p = new TensorRTEngine(max_batch, max_workspace, stream, gpu_device);
-    engines_[name].reset(p);
-    return p;
-  }
-
-  void DeleteALl() {
-    for (auto& item : engines_) {
-      item.second.reset(nullptr);
-    }
-  }
-
- private:
-  std::unordered_map<std::string, std::unique_ptr<TensorRTEngine>> engines_;
-};
-
 }  // namespace tensorrt
 }  // namespace inference
 }  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/test_engine.cc b/paddle/fluid/inference/tensorrt/test_engine.cc
index da1f6535cb3b2476cd475797861d6d2bb6d88856..9eed0f6ee9ce4d9e35bec718dc8e8435921dbd81 100644
--- a/paddle/fluid/inference/tensorrt/test_engine.cc
+++ b/paddle/fluid/inference/tensorrt/test_engine.cc
@@ -27,8 +27,8 @@ namespace tensorrt {
 class TensorRTEngineTest : public ::testing::Test {
  protected:
   void SetUp() override {
-    // ASSERT_EQ(0, cudaStreamCreate(&stream_));
-    engine_ = new TensorRTEngine(10, 1 << 10, &stream_);
+    ASSERT_EQ(0, cudaStreamCreate(&stream_));
+    engine_ = new TensorRTEngine(10, 1 << 10, stream_);
     engine_->InitNetwork();
   }
 
diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h
index d2ca1d0b0098fd377725cc0fdf002d8d2e3539ec..b1f7a3464ac6027faffe283bccaf9793eae939e1 100644
--- a/paddle/fluid/inference/tests/api/tester_helper.h
+++ b/paddle/fluid/inference/tests/api/tester_helper.h
@@ -56,6 +56,13 @@ DECLARE_int32(paddle_num_threads);
 namespace paddle {
 namespace inference {
 
+float Random(float low, float high) {
+  static std::random_device rd;
+  static std::mt19937 mt(rd());
+  std::uniform_real_distribution<double> dist(low, high);
+  return dist(mt);
+}
+
 void PrintConfig(const PaddlePredictor::Config *config, bool use_analysis) {
   const auto *analysis_config =
       reinterpret_cast<const contrib::AnalysisConfig *>(config);
@@ -176,7 +183,7 @@ void SetFakeImageInput(std::vector<std::vector<PaddleTensor>> *inputs,
     float *input_data = static_cast<float *>(input.data.data());
     // fill input data, for profile easily, do not use random data here.
     for (size_t j = 0; j < len; ++j) {
-      *(input_data + j) = static_cast<float>(j) / len;
+      *(input_data + j) = Random(0.0, 1.0) / 10.;
     }
   }
   (*inputs).emplace_back(input_slots);
 }
@@ -344,6 +351,16 @@ void CompareNativeAndAnalysis(
   CompareResult(analysis_outputs, native_outputs);
 }
 
+void CompareNativeAndAnalysis(
+    PaddlePredictor *native_pred, PaddlePredictor *analysis_pred,
+    const std::vector<std::vector<PaddleTensor>> &inputs) {
+  int batch_size = FLAGS_batch_size;
+  std::vector<PaddleTensor> native_outputs, analysis_outputs;
+  native_pred->Run(inputs[0], &native_outputs, batch_size);
+  analysis_pred->Run(inputs[0], &analysis_outputs, batch_size);
+  CompareResult(analysis_outputs, native_outputs);
+}
+
 template <typename T>
 std::string LoDTensorSummary(const framework::LoDTensor &tensor) {
   std::stringstream ss;
diff --git a/paddle/fluid/inference/tests/api/trt_models_tester.cc b/paddle/fluid/inference/tests/api/trt_models_tester.cc
index 5aca807ee3aee1bb323abe6d5c3700dfc08e30b4..db7109b7505d4fe4dcfcf88f303aa262bc5b44fb 100644
--- a/paddle/fluid/inference/tests/api/trt_models_tester.cc
+++ b/paddle/fluid/inference/tests/api/trt_models_tester.cc
@@ -107,6 +107,27 @@ void compare(std::string model_dir, bool use_tensorrt) {
                            inputs_all);
 }
 
+void compare_continuous_input(std::string model_dir, bool use_tensorrt) {
+  contrib::AnalysisConfig analysis_config;
+  SetConfig<contrib::AnalysisConfig>(&analysis_config, model_dir, true,
+                                     use_tensorrt, FLAGS_batch_size);
+  auto config =
+      reinterpret_cast<const PaddlePredictor::Config *>(&analysis_config);
+  auto native_pred = CreateTestPredictor(config, false);
+  auto analysis_pred = CreateTestPredictor(config, true);
+  for (int i = 0; i < 100; i++) {
+    std::vector<std::vector<PaddleTensor>> inputs_all;
+    if (!FLAGS_prog_filename.empty() && !FLAGS_param_filename.empty()) {
+      SetFakeImageInput(&inputs_all, model_dir, true, FLAGS_prog_filename,
+                        FLAGS_param_filename);
+    } else {
+      SetFakeImageInput(&inputs_all, model_dir, false, "__model__", "");
+    }
+    CompareNativeAndAnalysis(native_pred.get(), analysis_pred.get(),
+                             inputs_all);
+  }
+}
+
 TEST(TensorRT_mobilenet, compare) {
   std::string model_dir = FLAGS_infer_model + "/mobilenet";
   compare(model_dir, /* use_tensorrt */ true);
@@ -162,5 +183,15 @@ TEST(TensorRT_mobilenet, profile) {
   profile(model_dir, true, false);
 }
 
+TEST(resnet50, compare_continuous_input) {
+  std::string model_dir = FLAGS_infer_model + "/resnet50";
+  compare_continuous_input(model_dir, true);
+}
+
+TEST(resnet50, compare_continuous_input_native) {
+  std::string model_dir = FLAGS_infer_model + "/resnet50";
+  compare_continuous_input(model_dir, false);
+}
+
 }  // namespace inference
 }  // namespace paddle
diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
index 88c4f508474e66953b79fb92ff1eb0b53a539f07..e7e990f759ba411f6954c51fb697a6befbad31b1 100644
--- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
+++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
@@ -96,9 +96,13 @@ class TensorRTEngineOp : public framework::OperatorBase {
   void RunTrt(const framework::Scope &scope,
               const platform::Place &dev_place) const {
     int runtime_batch = 1;
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(dev_place);
+    auto stream =
+        reinterpret_cast<const platform::CUDADeviceContext &>(dev_ctx).stream();
     if (trt_engine_.get() == nullptr) {
       trt_engine_.reset(new TensorRTEngine(
-          max_batch_size_, workspace_size_, nullptr,
+          max_batch_size_, workspace_size_, stream,
           boost::get<platform::CUDAPlace>(dev_place).device));
       Prepare(scope, dev_place, trt_engine_.get());
     }
@@ -126,6 +130,7 @@ class TensorRTEngineOp : public framework::OperatorBase {
       }
     }
 
+    cudaStreamSynchronize(stream);
     PADDLE_ENFORCE_LE(runtime_batch, max_batch_size_);
     // Execute the engine.
     engine->Execute(runtime_batch);
@@ -163,7 +168,7 @@ class TensorRTEngineOp : public framework::OperatorBase {
       output_index += 1;
     }
 
-    cudaStreamSynchronize(*engine->stream());
+    cudaStreamSynchronize(stream);
   }
 
   void Prepare(const framework::Scope &scope, const platform::Place &dev_place,
diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc
index 287b0edc96e5e312b0ff1725ee188ff319d44d23..391e7a1c070e040f6e90f820634c0d8b7cd40a96 100644
--- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc
+++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc
@@ -99,7 +99,7 @@ TEST(TensorRTEngineOp, manual) {
   SetAttr<std::string>(engine_op_desc.Proto(), "subgraph",
                        block_->SerializeAsString());
   SetAttr<int>(engine_op_desc.Proto(), "max_batch_size", 2);
-  SetAttr<int>(engine_op_desc.Proto(), "workspace_size", 2 << 10);
+  SetAttr<int>(engine_op_desc.Proto(), "workspace_size", 1 << 20);
   SetAttr<std::string>(engine_op_desc.Proto(), "engine_uniq_key", "a_engine");
   SetAttr<std::vector<std::string>>(engine_op_desc.Proto(), "parameters",
                                     std::vector<std::string>({}));
@@ -193,7 +193,7 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) {
   SetAttr<std::string>(engine_op_desc.Proto(), "subgraph",
                        block_->SerializeAsString());
   SetAttr<int>(engine_op_desc.Proto(), "max_batch_size", batch_size);
-  SetAttr<int>(engine_op_desc.Proto(), "workspace_size", 2 << 10);
+  SetAttr<int>(engine_op_desc.Proto(), "workspace_size", 1 << 20);
   SetAttr<std::vector<std::string>>(
       engine_op_desc.Proto(), "parameters",
       std::vector<std::string>({"y0", "y1", "y2", "y3"}));