From f5fc9c3bc15397b3e2368a943184f4f490d5230a Mon Sep 17 00:00:00 2001
From: Yan Chunwei
Date: Tue, 29 May 2018 13:06:41 +0800
Subject: [PATCH] feature/mul converter (#10841)

---
 paddle/fluid/inference/analysis/helper.h           |  11 +-
 .../inference/tensorrt/convert/CMakeLists.txt      |   5 +-
 .../inference/tensorrt/convert/mul_op.cc           |  16 +-
 .../tensorrt/convert/test_activation_op.cc         |   2 +
 .../inference/tensorrt/convert/test_mul_op.cc      |  47 ++++++
 .../tensorrt/convert/test_op_converter.cc          |   2 -
 .../inference/tensorrt/convert/ut_helper.h         | 156 ++++++++++++++++++
 paddle/fluid/inference/tensorrt/engine.cc          |  15 +-
 paddle/fluid/inference/tensorrt/helper.h           |   9 -
 9 files changed, 242 insertions(+), 21 deletions(-)
 create mode 100644 paddle/fluid/inference/tensorrt/convert/test_mul_op.cc
 create mode 100644 paddle/fluid/inference/tensorrt/convert/ut_helper.h

diff --git a/paddle/fluid/inference/analysis/helper.h b/paddle/fluid/inference/analysis/helper.h
index ea39ba4ddb5..24ea9a4bae7 100644
--- a/paddle/fluid/inference/analysis/helper.h
+++ b/paddle/fluid/inference/analysis/helper.h
@@ -24,6 +24,15 @@ namespace paddle {
 namespace inference {
 namespace analysis {
 
+template <typename Vec>
+int AccuDims(Vec &&vec, int size) {
+  int res = 1;
+  for (int i = 0; i < size; i++) {
+    res *= std::forward<Vec>(vec)[i];
+  }
+  return res;
+}
+
 #define SET_TYPE(type__) dic_[typeid(type__).hash_code()] = #type__;
 /*
  * Map typeid to representation.
@@ -101,7 +110,5 @@ class OrderedRegistry {
 }  // namespace paddle
 
 #define PADDLE_DISALLOW_COPY_AND_ASSIGN(type__) \
-  \
   type__(const type__ &) = delete;             \
-  \
   void operator=(const type__ &) = delete;
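The new AccuDims helper folds the first size entries of any indexable dimension container into an element count, which is why the same template serves framework::DDim, the raw nvinfer1::Dims::d array, and plain arrays alike. A minimal usage sketch (the shape is made up for illustration):

    // Element count of a hypothetical 3 x 4 x 5 shape: 3 * 4 * 5 = 60.
    int dims[] = {3, 4, 5};
    int num_elements = paddle::inference::analysis::AccuDims(dims, 3);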
+ */ class MulOpConverter : public OpConverter { public: MulOpConverter() {} void operator()(const framework::proto::OpDesc& op) override { - LOG(INFO) << "convert a fluid mul op to tensorrt fc layer without bias"; + VLOG(4) << "convert a fluid mul op to tensorrt fc layer without bias"; + + framework::OpDesc op_desc(op, nullptr, nullptr); + // Declare inputs + auto* input1 = engine_->GetITensor(op_desc.Input("X")[0]); + auto* input2 = engine_->GetITensor(op_desc.Input("Y")[0]); + // Both the input1 and input2 do not need transpose. + auto* layer = TRT_ENGINE_ADD_LAYER( + engine_, MatrixMultiply, *const_cast(input1), false, + *const_cast(input2), false); + + engine_->DeclareOutput(layer, 0, op_desc.Output("Out")[0]); } }; diff --git a/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc b/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc index ec33f97c824..86ca2ca08eb 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc @@ -102,3 +102,5 @@ TEST(OpConverter, ConvertRelu) { } // namespace tensorrt } // namespace inference } // namespace paddle + +USE_OP(activation); diff --git a/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc b/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc new file mode 100644 index 00000000000..d8b61d5f08f --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc @@ -0,0 +1,47 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc b/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc
index ec33f97c824..86ca2ca08eb 100644
--- a/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc
@@ -102,3 +102,5 @@ TEST(OpConverter, ConvertRelu) {
 }  // namespace tensorrt
 }  // namespace inference
 }  // namespace paddle
+
+USE_OP(activation);
diff --git a/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc b/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc
new file mode 100644
index 00000000000..d8b61d5f08f
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc
@@ -0,0 +1,47 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+TEST(MulOpConverter, main) {
+  TRTConvertValidation validator(10, 1000);
+  validator.DeclInputVar("mul-X", nvinfer1::Dims2(10, 6));
+  validator.DeclInputVar("mul-Y", nvinfer1::Dims2(6, 10));
+  validator.DeclOutputVar("mul-Out", nvinfer1::Dims2(10, 10));
+
+  // Prepare Op description.
+  framework::OpDesc desc;
+  desc.SetType("mul");
+  desc.SetInput("X", {"mul-X"});
+  desc.SetInput("Y", {"mul-Y"});
+  desc.SetOutput("Out", {"mul-Out"});
+
+  LOG(INFO) << "set OP";
+  validator.SetOp(*desc.Proto());
+  LOG(INFO) << "execute";
+
+  validator.Execute(10);
+}
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
+
+USE_OP(mul);
+ */ +class TRTConvertValidation { + public: + TRTConvertValidation() = delete; + + TRTConvertValidation(int batch_size, int workspace_size = 1 << 10) { + // create engine. + engine_.reset(new TensorRTEngine(10, 1 << 10, &stream_)); + engine_->InitNetwork(); + + PADDLE_ENFORCE_EQ(cudaStreamCreate(&stream_), 0); + } + + // Declare a Variable as input with random initialization. + void DeclInputVar(const std::string& name, const nvinfer1::Dims& dims) { + DeclVar(name, dims); + // Declare TRT inputs. + engine_->DeclareInput(name, nvinfer1::DataType::kFLOAT, dims); + } + + void DeclOutputVar(const std::string& name, const nvinfer1::Dims& dims) { + DeclVar(name, dims); + } + + void DeclVar(const std::string& name, const nvinfer1::Dims& dims) { + platform::CPUPlace place; + platform::CPUDeviceContext ctx(place); + + // Init Fluid tensor. + std::vector dim_vec(dims.nbDims); + for (int i = 0; i < dims.nbDims; i++) { + dim_vec[i] = dims.d[i]; + } + auto* x = scope_.Var(name); + auto* x_tensor = x->GetMutable(); + x_tensor->Resize(framework::make_ddim(dim_vec)); + RandomizeTensor(x_tensor, place, ctx); + } + + void SetOp(const framework::proto::OpDesc& desc) { + op_ = framework::OpRegistry::CreateOp(desc); + + OpConverter op_converter; + op_converter.ConvertOp(desc, engine_.get()); + + engine_->FreezeNetwork(); + + // Declare outputs. + op_desc_.reset(new framework::OpDesc(desc, nullptr, nullptr)); + + // Set Inputs. + for (const auto& input : op_desc_->InputArgumentNames()) { + auto* var = scope_.FindVar(input); + PADDLE_ENFORCE(var); + auto tensor = var->GetMutable(); + engine_->SetInputFromCPU( + input, static_cast(tensor->data()), + sizeof(float) * + analysis::AccuDims(tensor->dims(), tensor->dims().size())); + } + } + + void Execute(int batch_size) { + // Execute Fluid Op + // Execute TRT + platform::CPUPlace place; + platform::CPUDeviceContext ctx(place); + engine_->Execute(batch_size); + + op_->Run(scope_, place); + + ASSERT_FALSE(op_desc_->OutputArgumentNames().empty()); + for (const auto& output : op_desc_->OutputArgumentNames()) { + std::vector fluid_out; + std::vector trt_out(200); + engine_->GetOutputInCPU(output, &trt_out[0], 200 * sizeof(float)); + + auto* var = scope_.FindVar(output); + auto tensor = var->GetMutable(); + framework::TensorToVector(*tensor, ctx, &fluid_out); + // Compare two output + ASSERT_FALSE(fluid_out.empty()); + for (size_t i = 0; i < fluid_out.size(); i++) { + EXPECT_LT(std::abs(fluid_out[i] - trt_out[i]), 0.001); + } + } + } + + framework::Scope& scope() { return scope_; } + + private: + std::unique_ptr engine_; + cudaStream_t stream_; + framework::Scope scope_; + std::unique_ptr op_; + std::unique_ptr op_desc_; +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc index 1c296e33a61..fb27c8394c1 100644 --- a/paddle/fluid/inference/tensorrt/engine.cc +++ b/paddle/fluid/inference/tensorrt/engine.cc @@ -18,6 +18,7 @@ limitations under the License. 
diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc
index 1c296e33a61..fb27c8394c1 100644
--- a/paddle/fluid/inference/tensorrt/engine.cc
+++ b/paddle/fluid/inference/tensorrt/engine.cc
@@ -18,6 +18,7 @@ limitations under the License. */
 #include <NvInfer.h>
 #include <cuda.h>
 #include <glog/logging.h>
+#include "paddle/fluid/inference/analysis/helper.h"
 #include "paddle/fluid/inference/tensorrt/helper.h"
 #include "paddle/fluid/platform/enforce.h"
 
@@ -71,9 +72,10 @@ void TensorRTEngine::FreezeNetwork() {
   for (auto& item : buffer_sizes_) {
     if (item.second == 0) {
       auto slot_offset = infer_engine_->getBindingIndex(item.first.c_str());
+      auto dims = infer_engine_->getBindingDimensions(slot_offset);
       item.second = kDataTypeSize[static_cast<int>(
                         infer_engine_->getBindingDataType(slot_offset))] *
-                    AccumDims(infer_engine_->getBindingDimensions(slot_offset));
+                    analysis::AccuDims(dims.d, dims.nbDims);
     }
     auto& buf = buffer(item.first);
     CHECK(buf.buffer == nullptr);  // buffer should be allocated only once.
@@ -85,14 +87,15 @@ void TensorRTEngine::FreezeNetwork() {
 
 nvinfer1::ITensor* TensorRTEngine::DeclareInput(const std::string& name,
                                                 nvinfer1::DataType dtype,
-                                                const nvinfer1::Dims& dim) {
+                                                const nvinfer1::Dims& dims) {
   PADDLE_ENFORCE_EQ(0, buffer_sizes_.count(name), "duplicate input name %s",
                     name);
 
   PADDLE_ENFORCE(infer_network_ != nullptr, "should InitNetwork first");
-  auto* input = infer_network_->addInput(name.c_str(), dtype, dim);
+  auto* input = infer_network_->addInput(name.c_str(), dtype, dims);
   PADDLE_ENFORCE(input, "infer network add input %s failed", name);
-  buffer_sizes_[name] = kDataTypeSize[static_cast<int>(dtype)] * AccumDims(dim);
+  buffer_sizes_[name] = kDataTypeSize[static_cast<int>(dtype)] *
+                        analysis::AccuDims(dims.d, dims.nbDims);
   TensorRTEngine::SetITensor(name, input);
   return input;
 }
@@ -162,13 +165,13 @@ void TensorRTEngine::SetInputFromCPU(const std::string& name, void* data,
 void TensorRTEngine::SetITensor(const std::string& name,
                                 nvinfer1::ITensor* tensor) {
   PADDLE_ENFORCE(tensor != nullptr);
-  PADDLE_ENFORCE_EQ(0, itensor_map_.count(name), "duplicate itensor name %s",
+  PADDLE_ENFORCE_EQ(0, itensor_map_.count(name), "duplicate ITensor name %s",
                     name);
   itensor_map_[name] = tensor;
 }
 
 nvinfer1::ITensor* TensorRTEngine::GetITensor(const std::string& name) {
-  PADDLE_ENFORCE(itensor_map_.count(name), "no itensor %s", name);
+  PADDLE_ENFORCE(itensor_map_.count(name), "no ITensor %s", name);
   return itensor_map_[name];
 }
 
diff --git a/paddle/fluid/inference/tensorrt/helper.h b/paddle/fluid/inference/tensorrt/helper.h
index 2b402cce607..b6e79681084 100644
--- a/paddle/fluid/inference/tensorrt/helper.h
+++ b/paddle/fluid/inference/tensorrt/helper.h
@@ -26,15 +26,6 @@ namespace tensorrt {
 
 namespace dy = paddle::platform::dynload;
 
-static size_t AccumDims(nvinfer1::Dims dims) {
-  size_t num = dims.nbDims == 0 ? 0 : 1;
-  for (int i = 0; i < dims.nbDims; i++) {
-    PADDLE_ENFORCE_GT(dims.d[i], 0);
-    num *= dims.d[i];
-  }
-  return num;
-}
-
 // TensorRT data type to size
 const int kDataTypeSize[] = {
     4,  // kFLOAT
--
GitLab
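With the old AccumDims removed, FreezeNetwork sizes each binding buffer as the element size times the product of the binding's dimensions, both queried from the built engine. For a hypothetical kFLOAT binding of shape (10, 6), the computation in the hunk above works out as follows:

    // 4 bytes per float * 10 * 6 = 240 bytes reserved for this binding.
    auto dims = infer_engine_->getBindingDimensions(slot_offset);
    size_t bytes = kDataTypeSize[static_cast<int>(
                       infer_engine_->getBindingDataType(slot_offset))] *
                   analysis::AccuDims(dims.d, dims.nbDims);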