diff --git a/paddle/fluid/inference/analysis/dot.h b/paddle/fluid/inference/analysis/dot.h
index 3359987874f2d74d7e4646baa38790431c4b28fd..4bf1840fdda8508b52d7274a338c5b1c95baf354 100644
--- a/paddle/fluid/inference/analysis/dot.h
+++ b/paddle/fluid/inference/analysis/dot.h
@@ -21,6 +21,7 @@
 
 #include <glog/logging.h>
 #include <sstream>
+#include <string>
 #include <unordered_map>
 #include <vector>
 
diff --git a/paddle/fluid/inference/engine.h b/paddle/fluid/inference/engine.h
index de0375551e16ec53b90414c7446234fda98bf706..ce2b8161715a3fa2278ce950dbac82c6d0042bef 100644
--- a/paddle/fluid/inference/engine.h
+++ b/paddle/fluid/inference/engine.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 
+#include <cstddef>
 #include "paddle/fluid/framework/framework.pb.h"
 
 namespace paddle {
@@ -58,8 +59,8 @@ class EngineBase {
 
 struct Buffer {
   void* buffer{nullptr};  // buffer should be allocated only once.
-  int max_size;           // buffer allocated space.
-  int size;               // data size.
+  size_t max_size;        // buffer allocated space.
+  size_t size;            // data size.
   DeviceType device{DeviceType::UNK};  // tells which device this buffer is on.
 };
 
diff --git a/paddle/fluid/inference/tensorrt/CMakeLists.txt b/paddle/fluid/inference/tensorrt/CMakeLists.txt
index 677b3e04af8e7f5662a15fb32e3b03f45d262733..b52d083f280e5e7713600a7b748dedd37aca0a1e 100644
--- a/paddle/fluid/inference/tensorrt/CMakeLists.txt
+++ b/paddle/fluid/inference/tensorrt/CMakeLists.txt
@@ -1,5 +1,4 @@
 nv_library(tensorrt_engine SRCS engine.cc DEPS framework_proto)
 nv_test(test_tensorrt SRCS test_tensorrt.cc DEPS dynload_cuda device_context dynamic_loader)
 nv_test(test_tensorrt_engine SRCS test_engine.cc DEPS dynload_cuda tensorrt_engine)
-
 add_subdirectory(convert)
diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
index 286abf736e8ff8a357482419e85ad1258a6c6acd..4fb4511d99179e4ea14cde66feb13bc9e114581a 100644
--- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
+++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
@@ -1,4 +1,4 @@
-nv_test(test_op_converter SRCS test_op_converter.cc mul_op.cc conv2d_op.cc op_converter.h DEPS ${FLUID_CORE_MODULES})
-nv_test(test_trt_activation_op SRCS test_activation_op.cc activation_op.cc
+nv_test(test_op_converter SRCS test_op_converter.cc mul_op.cc conv2d_op.cc DEPS ${FLUID_CORE_MODULES})
+nv_test(test_trt_activation_op SRCS test_activation_op.cc activation_op.cc io_converter.cc
         DEPS ${FLUID_CORE_MODULES} activation_op tensorrt_engine)
 nv_test(test_io_converter SRCS test_io_converter.cc io_converter.cc DEPS dynload_cuda dynamic_loader lod_tensor)
diff --git a/paddle/fluid/inference/tensorrt/convert/io_converter.cc b/paddle/fluid/inference/tensorrt/convert/io_converter.cc
index 32e8631fde3f748669d2008b4a060455a37e154e..854f434d93e81237dc85c5df62debcf3b3824b78 100644
--- a/paddle/fluid/inference/tensorrt/convert/io_converter.cc
+++ b/paddle/fluid/inference/tensorrt/convert/io_converter.cc
@@ -23,26 +23,42 @@ namespace tensorrt {
 using platform::is_gpu_place;
 using platform::is_cpu_place;
 
-class DefaultInputConverter : public EngineInputConverter {
+class DefaultIOConverter : public EngineIOConverter {
  public:
-  DefaultInputConverter() {}
+  DefaultIOConverter() {}
   // NOTE out is GPU memory.
   virtual void operator()(const LoDTensor& in, void* out,
                           size_t max_size) override {
     PADDLE_ENFORCE(out != nullptr);
-    PADDLE_ENFORCE_LE(in.memory_size(), max_size);
+    PADDLE_ENFORCE(stream_ != nullptr);
     const auto& place = in.place();
+    size_t size = in.memory_size();
+    PADDLE_ENFORCE_LE(size, max_size);
     if (is_cpu_place(place)) {
-      PADDLE_ENFORCE(stream_ != nullptr);
-      PADDLE_ENFORCE_EQ(0,
-                        cudaMemcpyAsync(out, in.data<float>(), in.memory_size(),
-                                        cudaMemcpyHostToDevice, *stream_));
-
+      PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(out, in.data<float>(), size,
+                                           cudaMemcpyHostToDevice, *stream_));
     } else if (is_gpu_place(place)) {
-      PADDLE_ENFORCE_EQ(0,
-                        cudaMemcpyAsync(out, in.data<float>(), in.memory_size(),
-                                        cudaMemcpyHostToHost, *stream_));
-
+      PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(out, in.data<float>(), size,
+                                           cudaMemcpyDeviceToDevice, *stream_));
+    } else {
+      PADDLE_THROW("Unknown device for converter");
+    }
+    cudaStreamSynchronize(*stream_);
+  }
+  // NOTE in is GPU memory.
+  virtual void operator()(const void* in, LoDTensor* out,
+                          size_t max_size) override {
+    PADDLE_ENFORCE(in != nullptr);
+    PADDLE_ENFORCE(stream_ != nullptr);
+    const auto& place = out->place();
+    size_t size = out->memory_size();
+    PADDLE_ENFORCE_LE(size, max_size);
+    if (is_cpu_place(place)) {
+      PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(out->data<float>(), in, size,
+                                           cudaMemcpyDeviceToHost, *stream_));
+    } else if (is_gpu_place(place)) {
+      PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(out->data<float>(), in, size,
+                                           cudaMemcpyDeviceToDevice, *stream_));
     } else {
       PADDLE_THROW("Unknown device for converter");
     }
@@ -50,7 +66,8 @@ class DefaultInputConverter : public EngineInputConverter {
   }
 };
 
-REGISTER_TENSORRT_INPUT_CONVERTER(default, DefaultInputConverter);
+// Fluid LoDTensor <-> TensorRT ITensor
+REGISTER_TENSORRT_IO_CONVERTER(default, DefaultIOConverter);
 
 }  // namespace tensorrt
 }  // namespace inference
diff --git a/paddle/fluid/inference/tensorrt/convert/io_converter.h b/paddle/fluid/inference/tensorrt/convert/io_converter.h
index 8972dae92be2c2d261a13c48d98e675f64e51d31..71c48e085d25d2bc6720d93735f661f9e3af7b40 100644
--- a/paddle/fluid/inference/tensorrt/convert/io_converter.h
+++ b/paddle/fluid/inference/tensorrt/convert/io_converter.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 
+#include <string>
 #include <unordered_map>
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/inference/utils/singleton.h"
@@ -25,43 +26,57 @@ namespace tensorrt {
 
 using framework::LoDTensor;
 
 /*
- * Convert Input from Fluid to an Engine.
- * TensorRT's ITensor follows row major, NCHW. Fluid is also row major, so in
- * most cases just need to copy the data.
+ * Convert Input from Fluid to TensorRT Engine.
+ * Convert Output from TensorRT Engine to Fluid.
+ *
+ * Note that TensorRT's ITensor follows row major, NCHW. Fluid is also
+ * row major, so in the default case we just need to copy
+ * the data.
  */
-class EngineInputConverter {
+class EngineIOConverter {
  public:
-  EngineInputConverter() {}
+  EngineIOConverter() {}
 
   virtual void operator()(const LoDTensor& in, void* out, size_t max_size) {}
+  virtual void operator()(const void* in, LoDTensor* out, size_t max_size) {}
 
   void SetStream(cudaStream_t* stream) { stream_ = stream; }
 
-  static void Run(const std::string& in_op_type, const LoDTensor& in, void* out,
-                  size_t max_size, cudaStream_t* stream) {
+  static void ConvertInput(const std::string& op_type, const LoDTensor& in,
+                           void* out, size_t max_size, cudaStream_t* stream) {
     PADDLE_ENFORCE(stream != nullptr);
-    auto* converter = Registry<EngineInputConverter>::Lookup(
-        in_op_type, "default" /* default_type */);
+    auto* converter = Registry<EngineIOConverter>::Lookup(
+        op_type, "default" /* default_type */);
     PADDLE_ENFORCE_NOT_NULL(converter);
     converter->SetStream(stream);
     (*converter)(in, out, max_size);
   }
 
-  virtual ~EngineInputConverter() {}
+  static void ConvertOutput(const std::string& op_type, const void* in,
+                            LoDTensor* out, size_t max_size,
+                            cudaStream_t* stream) {
+    PADDLE_ENFORCE(stream != nullptr);
+    auto* converter = Registry<EngineIOConverter>::Lookup(
+        op_type, "default" /* default_type */);
+    PADDLE_ENFORCE_NOT_NULL(converter);
+    converter->SetStream(stream);
+    (*converter)(in, out, max_size);
+  }
+
+  virtual ~EngineIOConverter() {}
 
  protected:
   cudaStream_t* stream_{nullptr};
 };
 
+#define REGISTER_TENSORRT_IO_CONVERTER(op_type__, Converter__)        \
+  struct trt_io_##op_type__##_converter {                             \
+    trt_io_##op_type__##_converter() {                                \
+      Registry<EngineIOConverter>::Register<Converter__>(#op_type__); \
+    }                                                                 \
+  };                                                                  \
+  trt_io_##op_type__##_converter trt_io_##op_type__##_converter__;
+
 }  // namespace tensorrt
 }  // namespace inference
 }  // namespace paddle
-
-#define REGISTER_TENSORRT_INPUT_CONVERTER(in_op_type__, Converter__) \
-  struct trt_input_##in_op_type__##_converter {                      \
-    trt_input_##in_op_type__##_converter() {                         \
-      ::paddle::inference::Registry<EngineInputConverter>::Register< \
-          Converter__>(#in_op_type__);                               \
-    }                                                                \
-  };                                                                 \
-  trt_input_##in_op_type__##_converter trt_input_##in_op_type__##_converter__;
diff --git a/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc b/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc
index 669fba1eb81c5caacea039522ea70a2d0523d022..ec33f97c8240dfc09a203d68599bffe78a4abb12 100644
--- a/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc
@@ -16,6 +16,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/inference/tensorrt/convert/io_converter.h"
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/place.h"
@@ -26,7 +27,7 @@ namespace paddle {
 namespace inference {
 namespace tensorrt {
 
-void Compare(float input, float expect) {
+void Compare(const std::string op_type, float input, float expect) {
   framework::Scope scope;
   platform::CUDAPlace place;
   platform::CUDADeviceContext ctx(place);
@@ -35,6 +36,7 @@ void Compare(float input, float expect) {
   auto x_var = scope.Var("X");
   auto x_tensor = x_var->GetMutable<framework::LoDTensor>();
   x_tensor->Resize({1, 1});
+  x_tensor->mutable_data<float>(place);
   std::vector<float> init;
   init.push_back(input);
   framework::TensorFromVector(init, ctx, x_tensor);
@@ -45,14 +47,15 @@ void Compare(float input, float expect) {
   out_tensor->mutable_data<float>(place);
 
   framework::OpDesc op_desc;
-  op_desc.SetType("relu");
+  op_desc.SetType(op_type);
   op_desc.SetInput("X", {"X"});
   op_desc.SetOutput("Out", {"Out"});
 
-  auto relu_op = framework::OpRegistry::CreateOp(*op_desc.Proto());
+  auto op = framework::OpRegistry::CreateOp(*op_desc.Proto());
 
   // run fluid op
-  relu_op->Run(scope, place);
+  op->Run(scope, place);
+  // get fluid output
   std::vector<float> out1;
   framework::TensorToVector(*out_tensor, ctx, &out1);
 
@@ -63,21 +66,28 @@ void Compare(float input, float expect) {
   engine->InitNetwork();
   engine->DeclareInput("X", nvinfer1::DataType::kFLOAT,
                        nvinfer1::DimsCHW{1, 1, 1});
-
+  // convert op
   OpConverter op_converter;
   op_converter.ConvertOp(*op_desc.Proto(), engine);
 
   engine->DeclareOutput("Out");
   engine->FreezeNetwork();
-  engine->SetInputFromCPU("X", &input, 1 * sizeof(float));
 
-  // run tensorrt op
+  // convert LoDTensor to ITensor
+  size_t size = x_tensor->memory_size();
+  EngineIOConverter::ConvertInput(op_type, *x_tensor,
+                                  engine->buffer("X").buffer, size, &stream);
+  // run tensorrt op
   engine->Execute(1);
-
-  float out2;
-  engine->GetOutputInCPU("Out", &out2, 1 * sizeof(float));
-
-  ASSERT_EQ(out1[0], out2);
+  // convert ITensor to LoDTensor
+  EngineIOConverter::ConvertOutput(op_type, engine->buffer("Out").buffer,
+                                   out_tensor, size, &stream);
+  // get tensorrt output
+  std::vector<float> out2;
+  framework::TensorToVector(*out_tensor, ctx, &out2);
+
+  // compare
+  ASSERT_EQ(out1[0], out2[0]);
   ASSERT_EQ(out1[0], expect);
 
   delete engine;
@@ -85,8 +95,8 @@ void Compare(float input, float expect) {
 }
 
 TEST(OpConverter, ConvertRelu) {
-  Compare(1, 1);   // relu(1) = 1
-  Compare(-5, 0);  // relu(-5) = 0
+  Compare("relu", 1, 1);   // relu(1) = 1
+  Compare("relu", -5, 0);  // relu(-5) = 0
 }
 
 }  // namespace tensorrt
diff --git a/paddle/fluid/inference/tensorrt/convert/test_io_converter.cc b/paddle/fluid/inference/tensorrt/convert/test_io_converter.cc
index afcc516e6b76d58e37ce0e60746704cf3933fac7..8f91309a0a00d5131268f026c319e25ba3cb964a 100644
--- a/paddle/fluid/inference/tensorrt/convert/test_io_converter.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_io_converter.cc
@@ -12,40 +12,63 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include <gtest/gtest.h>
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/inference/tensorrt/convert/io_converter.h"
 
-#include <gtest/gtest.h>
-
 namespace paddle {
 namespace inference {
 namespace tensorrt {
 
-class EngineInputConverterTester : public ::testing::Test {
- public:
-  void SetUp() override { tensor.Resize({10, 10}); }
+void IOConverterTester(const platform::DeviceContext& ctx) {
+  cudaStream_t stream;
+  ASSERT_EQ(0, cudaStreamCreate(&stream));
 
-  framework::LoDTensor tensor;
-};
+  // init fluid in_tensor
+  framework::LoDTensor in_tensor;
+  in_tensor.Resize({10, 10});
+  auto place = ctx.GetPlace();
+  in_tensor.mutable_data<float>(place);
+  std::vector<float> init;
+  for (int64_t i = 0; i < 10 * 10; ++i) {
+    init.push_back(i);
+  }
+  framework::TensorFromVector(init, ctx, &in_tensor);
 
-TEST_F(EngineInputConverterTester, DefaultCPU) {
+  // init tensorrt buffer
   void* buffer;
-  tensor.mutable_data<float>(platform::CPUPlace());
-  ASSERT_EQ(cudaMalloc(&buffer, tensor.memory_size()), 0);
+  size_t size = in_tensor.memory_size();
+  ASSERT_EQ(cudaMalloc(&buffer, size), 0);
 
-  cudaStream_t stream;
-  EngineInputConverter::Run("test", tensor, buffer, tensor.memory_size(),
-                            &stream);
+  // convert fluid in_tensor to tensorrt buffer
+  EngineIOConverter::ConvertInput("test", in_tensor, buffer, size, &stream);
+
+  // convert tensorrt buffer to fluid out_tensor
+  framework::LoDTensor out_tensor;
+  out_tensor.Resize({10, 10});
+  out_tensor.mutable_data<float>(place);
+  EngineIOConverter::ConvertOutput("test", buffer, &out_tensor, size, &stream);
+
+  // compare in_tensor and out_tensor
+  std::vector<float> result;
+  framework::TensorToVector(out_tensor, ctx, &result);
+  EXPECT_EQ(init.size(), result.size());
+  for (size_t i = 0; i < init.size(); i++) {
+    EXPECT_EQ(init[i], result[i]);
+  }
+  cudaStreamDestroy(stream);
 }
 
-TEST_F(EngineInputConverterTester, DefaultGPU) {
-  void* buffer;
-  tensor.mutable_data<float>(platform::CUDAPlace());
-  ASSERT_EQ(cudaMalloc(&buffer, tensor.memory_size()), 0);
+TEST(EngineIOConverterTester, DefaultCPU) {
+  platform::CPUPlace place;
+  platform::CPUDeviceContext ctx(place);
+  IOConverterTester(ctx);
+}
 
-  cudaStream_t stream;
-  EngineInputConverter::Run("test", tensor, buffer, tensor.memory_size(),
-                            &stream);
+TEST(EngineIOConverterTester, DefaultGPU) {
+  platform::CUDAPlace place;
+  platform::CUDADeviceContext ctx(place);
+  IOConverterTester(ctx);
 }
 
 }  // namespace tensorrt
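
Usage note: this patch replaces the one-way EngineInputConverter::Run with the symmetric EngineIOConverter::ConvertInput / ConvertOutput pair. The sketch below shows the intended round trip through a device buffer, following the tests in this patch; the RoundTrip helper is hypothetical and the "test" key simply falls back to the "default" converter, with PADDLE_ENFORCE-style error checks omitted for brevity.

#include <cuda_runtime.h>
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/inference/tensorrt/convert/io_converter.h"

// Hypothetical helper: copies a LoDTensor into a device buffer and back,
// mirroring what engine->Execute() would be wrapped with in practice.
void RoundTrip(const paddle::framework::LoDTensor& in,
               paddle::framework::LoDTensor* out) {
  using paddle::inference::tensorrt::EngineIOConverter;
  cudaStream_t stream;
  cudaStreamCreate(&stream);

  // Device-side buffer standing in for the engine's input/output binding.
  void* buffer = nullptr;
  size_t size = in.memory_size();
  cudaMalloc(&buffer, size);

  // Fluid LoDTensor -> TensorRT buffer ("test" resolves to DefaultIOConverter).
  EngineIOConverter::ConvertInput("test", in, buffer, size, &stream);
  // ... engine->Execute(batch_size) would run here ...
  // TensorRT buffer -> Fluid LoDTensor; the caller must Resize() and
  // mutable_data<float>() `out` beforehand, as the tests above do.
  EngineIOConverter::ConvertOutput("test", buffer, out, size, &stream);

  cudaFree(buffer);
  cudaStreamDestroy(stream);
}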