diff --git a/CMakeLists.txt b/CMakeLists.txt
index c649aafeddaf9f28c213d086236c3779d3137d92..de47086dbd6a440cd413c7843c83b1c69d9841b2 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -39,6 +39,7 @@ option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_F
 option(WITH_AMD_GPU "Compile PaddlePaddle with AMD GPU" OFF)
 option(WITH_AVX "Compile PaddlePaddle with AVX intrinsics" ${AVX_FOUND})
 option(WITH_MKL "Compile PaddlePaddle with MKL support." ${AVX_FOUND})
+option(WITH_TENSORRT "Compile PaddlePaddle with TensorRT support." OFF)
 option(WITH_DSO "Compile PaddlePaddle with dynamic linked CUDA" ON)
 option(WITH_TESTING "Compile PaddlePaddle with unit testing" OFF)
 option(WITH_SWIG_PY "Compile PaddlePaddle with inference api" ON)
@@ -181,6 +182,11 @@ if(WITH_GPU)
     include(cuda)
 endif(WITH_GPU)
 
+# TensorRT depends on GPU.
+if (NOT WITH_GPU)
+  set(WITH_TENSORRT OFF)
+endif()
+
 if(WITH_AMD_GPU)
     find_package(HIP)
     include(hip)
diff --git a/Dockerfile b/Dockerfile
index 0f13acabc3e5e1f6a46c5712ca6ad199266dd5ed..9097bb657d2366997112ec7662762a93358aa647 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -45,6 +45,13 @@ ENV PATH=${PATH}:${GOROOT}/bin:${GOPATH}/bin
 # install glide
 RUN curl -s -q https://glide.sh/get | sh
 
+# Install TensorRT
+# Unnecessary files have been removed to keep the library small.
+RUN wget -qO- http://paddlepaddledeps.bj.bcebos.com/TensorRT-4.0.0.3.Ubuntu-16.04.4.x86_64-gnu.cuda-8.0.cudnn7.0.tar.gz | \
+    tar -xz -C /usr/local && \
+    cp -rf /usr/local/TensorRT/include /usr && \
+    cp -rf /usr/local/TensorRT/lib /usr
+
 # git credential to skip password typing
 RUN git config --global credential.helper store
 
diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt
index e53bcf2384e54e21c7dd5638f3b7469a35b571bf..8494edee6c2c714c285c45bbb4fe1d8cb1a524aa 100644
--- a/paddle/fluid/inference/CMakeLists.txt
+++ b/paddle/fluid/inference/CMakeLists.txt
@@ -21,4 +21,7 @@ endif()
 
 if(WITH_TESTING)
   add_subdirectory(tests/book)
+  if (WITH_TENSORRT)
+    add_subdirectory(tensorrt)
+  endif()
 endif()
diff --git a/paddle/fluid/inference/tensorrt/CMakeLists.txt b/paddle/fluid/inference/tensorrt/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..e39c0daac76e0993382868289f66351da3d16f8f
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/CMakeLists.txt
@@ -0,0 +1 @@
+nv_test(test_tensorrt SRCS test_tensorrt.cc DEPS dynload_cuda device_context dynamic_loader)
diff --git a/paddle/fluid/inference/tensorrt/test_tensorrt.cc b/paddle/fluid/inference/tensorrt/test_tensorrt.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a81a708e7a79225fd52c4b8e081afdcd8fe7e9ad
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/test_tensorrt.cc
@@ -0,0 +1,155 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include <glog/logging.h>
+#include <gtest/gtest.h>
+#include "NvInfer.h"
+#include "cuda.h"
+#include "cuda_runtime_api.h"
+#include "paddle/fluid/platform/dynload/tensorrt.h"
+
+namespace dy = paddle::platform::dynload;
+
+class Logger : public nvinfer1::ILogger {
+ public:
+  void log(nvinfer1::ILogger::Severity severity, const char* msg) override {
+    switch (severity) {
+      case Severity::kINFO:
+        LOG(INFO) << msg;
+        break;
+      case Severity::kWARNING:
+        LOG(WARNING) << msg;
+        break;
+      case Severity::kINTERNAL_ERROR:
+      case Severity::kERROR:
+        LOG(ERROR) << msg;
+        break;
+      default:
+        break;
+    }
+  }
+};
+
+class ScopedWeights {
+ public:
+  ScopedWeights(float value) : value_(value) {
+    w.type = nvinfer1::DataType::kFLOAT;
+    w.values = &value_;
+    w.count = 1;
+  }
+  const nvinfer1::Weights& get() { return w; }
+
+ private:
+  float value_;
+  nvinfer1::Weights w;
+};
+
+// The following two APIs are implemented in TensorRT's header file and cannot
+// be loaded from the dynamic library, so we define thin wrappers here that
+// call the corresponding entry points in the dynamic library directly.
+nvinfer1::IBuilder* createInferBuilder(nvinfer1::ILogger& logger) {
+  return static_cast<nvinfer1::IBuilder*>(
+      dy::createInferBuilder_INTERNAL(&logger, NV_TENSORRT_VERSION));
+}
+nvinfer1::IRuntime* createInferRuntime(nvinfer1::ILogger& logger) {
+  return static_cast<nvinfer1::IRuntime*>(
+      dy::createInferRuntime_INTERNAL(&logger, NV_TENSORRT_VERSION));
+}
+
+const char* kInputTensor = "input";
+const char* kOutputTensor = "output";
+
+// Creates a network to compute y = 2x + 3.
+nvinfer1::IHostMemory* CreateNetwork() {
+  Logger logger;
+  // Create the engine.
+  nvinfer1::IBuilder* builder = createInferBuilder(logger);
+  ScopedWeights weights(2.);
+  ScopedWeights bias(3.);
+
+  nvinfer1::INetworkDefinition* network = builder->createNetwork();
+  // Add the input.
+  auto input = network->addInput(kInputTensor, nvinfer1::DataType::kFLOAT,
+                                 nvinfer1::DimsCHW{1, 1, 1});
+  EXPECT_NE(input, nullptr);
+  // Add the hidden layer.
+  auto layer = network->addFullyConnected(*input, 1, weights.get(), bias.get());
+  EXPECT_NE(layer, nullptr);
+  // Mark the output.
+  auto output = layer->getOutput(0);
+  output->setName(kOutputTensor);
+  network->markOutput(*output);
+  // Build the engine.
+  builder->setMaxBatchSize(1);
+  builder->setMaxWorkspaceSize(1 << 10);
+  auto engine = builder->buildCudaEngine(*network);
+  EXPECT_NE(engine, nullptr);
+  // Serialize the engine to create a model, then close everything.
+  nvinfer1::IHostMemory* model = engine->serialize();
+  network->destroy();
+  engine->destroy();
+  builder->destroy();
+  return model;
+}
+
+void Execute(nvinfer1::IExecutionContext& context, const float* input,
+             float* output) {
+  const nvinfer1::ICudaEngine& engine = context.getEngine();
+  // Two bindings: input and output.
+  ASSERT_EQ(engine.getNbBindings(), 2);
+  const int input_index = engine.getBindingIndex(kInputTensor);
+  const int output_index = engine.getBindingIndex(kOutputTensor);
+  // Create GPU buffers and a stream.
+  void* buffers[2];
+  ASSERT_EQ(0, cudaMalloc(&buffers[input_index], sizeof(float)));
+  ASSERT_EQ(0, cudaMalloc(&buffers[output_index], sizeof(float)));
+  cudaStream_t stream;
+  ASSERT_EQ(0, cudaStreamCreate(&stream));
+  // Copy the input to the GPU, execute the network, and copy the output back.
+  ASSERT_EQ(0, cudaMemcpyAsync(buffers[input_index], input, sizeof(float),
+                               cudaMemcpyHostToDevice, stream));
+  context.enqueue(1, buffers, stream, nullptr);
+  ASSERT_EQ(0, cudaMemcpyAsync(output, buffers[output_index], sizeof(float),
+                               cudaMemcpyDeviceToHost, stream));
+  cudaStreamSynchronize(stream);
+
+  // Release the stream and the buffers.
+  cudaStreamDestroy(stream);
+  ASSERT_EQ(0, cudaFree(buffers[input_index]));
+  ASSERT_EQ(0, cudaFree(buffers[output_index]));
+}
+
+TEST(TensorrtTest, BasicFunction) {
+  // Create the network serialized model.
+  nvinfer1::IHostMemory* model = CreateNetwork();
+
+  // Use the model to create an engine and an execution context.
+  Logger logger;
+  nvinfer1::IRuntime* runtime = createInferRuntime(logger);
+  nvinfer1::ICudaEngine* engine =
+      runtime->deserializeCudaEngine(model->data(), model->size(), nullptr);
+  model->destroy();
+  nvinfer1::IExecutionContext* context = engine->createExecutionContext();
+
+  // Execute the network.
+  float input = 1234;
+  float output;
+  Execute(*context, &input, &output);
+  EXPECT_EQ(output, input * 2 + 3);
+
+  // Destroy the engine.
+  context->destroy();
+  engine->destroy();
+  runtime->destroy();
+}
diff --git a/paddle/fluid/platform/dynload/CMakeLists.txt b/paddle/fluid/platform/dynload/CMakeLists.txt
index 84dac2937de02b3374156ebc83e19dac9f9a3e7a..b93b925a72a55442c105e4280a3580f4ea5b93a1 100644
--- a/paddle/fluid/platform/dynload/CMakeLists.txt
+++ b/paddle/fluid/platform/dynload/CMakeLists.txt
@@ -1,6 +1,11 @@
 cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags enforce)
 
 list(APPEND CUDA_SRCS cublas.cc cudnn.cc curand.cc nccl.cc)
+if (WITH_TENSORRT)
+  list(APPEND CUDA_SRCS tensorrt.cc)
+endif()
+
+
 configure_file(cupti_lib_path.h.in ${CMAKE_CURRENT_BINARY_DIR}/cupti_lib_path.h)
 if (CUPTI_FOUND)
   list(APPEND CUDA_SRCS cupti.cc)
diff --git a/paddle/fluid/platform/dynload/dynamic_loader.cc b/paddle/fluid/platform/dynload/dynamic_loader.cc
index 3c1ccc7445ed27c711ab250aa223c66ae0da45dc..19c01dc5a968c7e1d2b0f15cf9a0e8427004e58b 100644
--- a/paddle/fluid/platform/dynload/dynamic_loader.cc
+++ b/paddle/fluid/platform/dynload/dynamic_loader.cc
@@ -45,6 +45,10 @@ DEFINE_string(nccl_dir, "",
 
 DEFINE_string(cupti_dir, "", "Specify path for loading cupti.so.");
 
+DEFINE_string(
+    tensorrt_dir, "",
+    "Specify path for loading tensorrt library, such as libnvinfer.so.");
+
 namespace paddle {
 namespace platform {
 namespace dynload {
@@ -194,6 +198,14 @@ void* GetNCCLDsoHandle() {
 #endif
 }
 
+void* GetTensorRtDsoHandle() {
+#if defined(__APPLE__) || defined(__OSX__)
+  return GetDsoHandleFromSearchPath(FLAGS_tensorrt_dir, "libnvinfer.dylib");
+#else
+  return GetDsoHandleFromSearchPath(FLAGS_tensorrt_dir, "libnvinfer.so");
+#endif
+}
+
 }  // namespace dynload
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/fluid/platform/dynload/dynamic_loader.h b/paddle/fluid/platform/dynload/dynamic_loader.h
index 4c85093a43e0e8d75b64c5b29d1ec68db1b44909..0de3559b6088086cb52c254535b6ec42da7dd724 100644
--- a/paddle/fluid/platform/dynload/dynamic_loader.h
+++ b/paddle/fluid/platform/dynload/dynamic_loader.h
@@ -25,6 +25,7 @@ void* GetCurandDsoHandle();
 void* GetWarpCTCDsoHandle();
 void* GetLapackDsoHandle();
 void* GetNCCLDsoHandle();
+void* GetTensorRtDsoHandle();
 
 }  // namespace dynload
 }  // namespace platform
diff --git a/paddle/fluid/platform/dynload/tensorrt.cc b/paddle/fluid/platform/dynload/tensorrt.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f3c8e27944ca9b6419de87d752df3a83751039b1
--- /dev/null
+++ b/paddle/fluid/platform/dynload/tensorrt.cc
@@ -0,0 +1,30 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/fluid/platform/dynload/tensorrt.h"
+
+namespace paddle {
+namespace platform {
+namespace dynload {
+
+std::once_flag tensorrt_dso_flag;
+void *tensorrt_dso_handle;
+
+#define DEFINE_WRAP(__name) DynLoad__##__name __name
+
+TENSORRT_RAND_ROUTINE_EACH(DEFINE_WRAP);
+
+}  // namespace dynload
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/fluid/platform/dynload/tensorrt.h b/paddle/fluid/platform/dynload/tensorrt.h
new file mode 100644
index 0000000000000000000000000000000000000000..f584a49da0fefe0b064b5fb55b01ec132225ce5e
--- /dev/null
+++ b/paddle/fluid/platform/dynload/tensorrt.h
@@ -0,0 +1,69 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+
+#include <NvInfer.h>
+#include <dlfcn.h>
+
+#include <mutex>  // NOLINT
+
+#include "paddle/fluid/platform/dynload/dynamic_loader.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace platform {
+namespace dynload {
+
+extern std::once_flag tensorrt_dso_flag;
+extern void* tensorrt_dso_handle;
+
+#ifdef PADDLE_USE_DSO
+
+#define DECLARE_DYNAMIC_LOAD_TENSORRT_WRAP(__name)                      \
+  struct DynLoad__##__name {                                            \
+    template <typename... Args>                                         \
+    auto operator()(Args... args) -> decltype(__name(args...)) {        \
+      using tensorrt_func = decltype(__name(args...)) (*)(Args...);     \
+      std::call_once(tensorrt_dso_flag, []() {                          \
+        tensorrt_dso_handle =                                           \
+            paddle::platform::dynload::GetTensorRtDsoHandle();          \
+        PADDLE_ENFORCE(tensorrt_dso_handle, "load tensorrt so failed"); \
+      });                                                               \
+      void* p_##__name = dlsym(tensorrt_dso_handle, #__name);           \
+      PADDLE_ENFORCE(p_##__name, "load %s failed", #__name);            \
+      return reinterpret_cast<tensorrt_func>(p_##__name)(args...);      \
+    }                                                                   \
+  };                                                                    \
+  extern DynLoad__##__name __name
+
+#else
+#define DECLARE_DYNAMIC_LOAD_TENSORRT_WRAP(__name)               \
+  struct DynLoad__##__name {                                     \
+    template <typename... Args>                                  \
+    auto operator()(Args... args) -> decltype(__name(args...)) { \
+      return __name(args...);                                    \
+    }                                                             \
+  };                                                              \
+  extern DynLoad__##__name __name
+#endif
+
+#define TENSORRT_RAND_ROUTINE_EACH(__macro) \
+  __macro(createInferBuilder_INTERNAL);     \
+  __macro(createInferRuntime_INTERNAL);
+
+TENSORRT_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_TENSORRT_WRAP)
+
+}  // namespace dynload
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/utils/DynamicLoader.cpp b/paddle/utils/DynamicLoader.cpp
index 5604a90038b06d2c1a4d9db70e4185cddfd25d3e..9ac4a56c6e300d299467630b39a32567af72cf40 100644
--- a/paddle/utils/DynamicLoader.cpp
+++ b/paddle/utils/DynamicLoader.cpp
@@ -32,6 +32,8 @@ DEFINE_string(warpctc_dir, "", "Specify path for loading libwarpctc.so.");
 
 DEFINE_string(lapack_dir, "", "Specify path for loading liblapack.so.");
 
+DEFINE_string(tensorrt_dir, "", "Specify path for loading libnvinfer.so.");
+
 static inline std::string join(const std::string& part1,
                                const std::string& part2) {
   // directory separator
@@ -157,3 +159,12 @@ void GetLapackDsoHandle(void** dso_handle) {
   GetDsoHandleFromSearchPath(FLAGS_lapack_dir, "liblapacke.so", dso_handle);
 #endif
 }
+
+void GetTensorRtDsoHandle(void** dso_handle) {
+#if defined(__APPLE__) || defined(__OSX__)
+  GetDsoHandleFromSearchPath(
+      FLAGS_tensorrt_dir, "libnvinfer.dylib", dso_handle);
+#else
+  GetDsoHandleFromSearchPath(FLAGS_tensorrt_dir, "libnvinfer.so", dso_handle);
+#endif
+}
diff --git a/paddle/utils/DynamicLoader.h b/paddle/utils/DynamicLoader.h
index 2e5ff76a06152b6a12818f06baaeaa6a69726ba8..02f519de4b3988fb6aca323aaa1751ee2c4bd738 100644
--- a/paddle/utils/DynamicLoader.h
+++ b/paddle/utils/DynamicLoader.h
@@ -58,3 +58,11 @@ void GetWarpCTCDsoHandle(void** dso_handle);
  *
  */
 void GetLapackDsoHandle(void** dso_handle);
+
+/**
+ * @brief load the DSO of tensorrt
+ *
+ * @param **dso_handle dso handler
+ *
+ */
+void GetTensorRtDsoHandle(void** dso_handle);
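Note (not part of the patch): for readers unfamiliar with the dynload machinery, the sketch below shows roughly what `DECLARE_DYNAMIC_LOAD_TENSORRT_WRAP` boils down to for a single entry point such as `createInferBuilder_INTERNAL`: the library is opened once, the symbol is resolved with `dlsym`, and the call is forwarded through a typed function pointer. The wrapper name, the hard-coded `libnvinfer.so` name, and the plain asserts are illustrative simplifications; the real code resolves the library through `GetTensorRtDsoHandle`, honors `FLAGS_tensorrt_dir`, and reports errors with `PADDLE_ENFORCE`.

```cpp
// Illustrative sketch only: the lazy dlopen/dlsym pattern that the
// DECLARE_DYNAMIC_LOAD_TENSORRT_WRAP macro generates per symbol.
#include <dlfcn.h>

#include <cassert>
#include <mutex>

namespace {

std::once_flag trt_dso_flag;
void* trt_dso_handle = nullptr;

// Open libnvinfer.so once and resolve a symbol from it by name.
void* LoadTensorRtSymbol(const char* name) {
  std::call_once(trt_dso_flag, [] {
    // The real code searches FLAGS_tensorrt_dir via GetDsoHandleFromSearchPath.
    trt_dso_handle = dlopen("libnvinfer.so", RTLD_LAZY | RTLD_GLOBAL);
    assert(trt_dso_handle != nullptr && "load tensorrt so failed");
  });
  void* sym = dlsym(trt_dso_handle, name);
  assert(sym != nullptr && "load symbol failed");
  return sym;
}

}  // namespace

// Hypothetical typed wrapper mirroring dy::createInferBuilder_INTERNAL:
// forwards (logger, version) to the dynamically resolved entry point, which
// returns a void* that callers cast to nvinfer1::IBuilder*.
void* CallCreateInferBuilderInternal(void* logger, int version) {
  using func_t = void* (*)(void*, int);
  static func_t fn = reinterpret_cast<func_t>(
      LoadTensorRtSymbol("createInferBuilder_INTERNAL"));
  return fn(logger, version);
}
```

The macro packages exactly this per-symbol boilerplate, so exposing another TensorRT entry point only requires appending it to `TENSORRT_RAND_ROUTINE_EACH`.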