Unverified commit 26ae6d49, authored by Zhaolong Xing, committed via GitHub

Update trt5 for paddle-trt (#18645)

* Update Paddle-TRT:
    1. Fix a bug where the split plugin core-dumped when batch > 2.
    2. Add TRT 5 leaky_relu support (YOLOv3 inference time drops from 65 ms to 42 ms).
    3. Support the new dropout_implementation attribute in the dropout converter.
    4. Add shuffle_channel, swish, and relu6 support.
    test=develop

* Fix CI.
test=develop
Parent commit: d8396281
@@ -1880,6 +1880,9 @@ void patterns::QuantDequantOpFuse::operator()(PDNode *quant_op_input,
 void patterns::ShuffleChannelPattern::operator()(PDNode *reshape1_in) {
   auto reshape1_op =
       pattern->NewNode(reshape1_op_repr())->assert_is_op("reshape2");
+  reshape1_op->assert_more([&](Node *x) {
+    return boost::get<std::vector<int>>(x->Op()->GetAttr("shape")).size() == 5;
+  });
   auto reshape1_out = pattern->NewNode(reshape1_out_repr())
                           ->assert_is_op_output("reshape2", "Out")
......
@@ -968,6 +968,8 @@ USE_TRT_CONVERTER(split);
 USE_TRT_CONVERTER(prelu);
 USE_TRT_CONVERTER(conv2d_transpose);
 USE_TRT_CONVERTER(leaky_relu);
+USE_TRT_CONVERTER(shuffle_channel);
+USE_TRT_CONVERTER(swish);
 #endif
 #if PADDLE_WITH_ANAKIN
......
@@ -74,6 +74,7 @@ const std::vector<std::string> kTRTSubgraphPasses({
   "infer_clean_graph_pass",                    //
   "conv_affine_channel_fuse_pass",             //
   "conv_eltwiseadd_affine_channel_fuse_pass",  //
+  "shuffle_channel_detect_pass",               //
   "quant_conv2d_dequant_fuse_pass",            //
   "delete_quant_dequant_op_pass",              //
   // "fc_fuse_pass",                        //
......
@@ -3,6 +3,7 @@ nv_library(tensorrt_converter
       SRCS mul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc elementwise_op.cc
       batch_norm_op.cc activation_op.cc softmax_op.cc concat_op.cc dropout_op.cc
       pad_op.cc split_op.cc prelu_op.cc leaky_relu_op.cc
+      shuffle_channel_op.cc swish_op.cc
       DEPS tensorrt_engine tensorrt_plugin operator scope framework_proto op_registry)

 nv_test(test_op_converter SRCS test_op_converter.cc DEPS
@@ -42,3 +43,9 @@ nv_test(test_op_converter SRCS test_op_converter.cc DEPS
 #           prelu_op)
 #nv_test(test_trt_leaky_relu_op SRCS test_leaky_relu_op.cc leaky_relu_op.cc
 #          DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine activation_op)
+#nv_test(test_shuffle_channel_op SRCS test_shuffle_channel_op.cc shuffle_channel_op.cc
+#          DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine shuffle_channel_op)
+#nv_test(test_swish_op SRCS test_swish_op.cc swish_op.cc
+#          DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine activation_op tensorrt_plugin)
@@ -42,11 +42,20 @@ class ActivationOpConverter : public OpConverter {
     nvinfer1::IActivationLayer* layer = TRT_ENGINE_ADD_LAYER(
         engine_, Activation, *const_cast<nvinfer1::ITensor*>(input_tensor),
         op_pair->second);
+#if IS_TRT_VERSION_GE(5130)
+    // max(alpha, min(beta, x))
+    if (op_type_ == "relu6") {
+      layer->setAlpha(0.);
+      layer->setBeta(6.);
+    }
+#endif
     auto output_name = op_desc.Output("Out")[0];
     RreplenishLayerAndOutput(layer, op_type_, {output_name}, test_mode);
     if (op_desc.HasAttr("out_scale")) {
-#if IS_TRT_VERSION_GE(5000)
+#if IS_TRT_VERSION_GE(5130)
       float out_scale = boost::get<float>(op_desc.GetAttr("out_scale"));
       engine_->SetTensorDynamicRange(layer->getOutput(0), out_scale);
 #endif
@@ -63,6 +72,9 @@ const std::unordered_map<std::string, nvinfer1::ActivationType>
     {"relu", nvinfer1::ActivationType::kRELU},
     {"sigmoid", nvinfer1::ActivationType::kSIGMOID},
     {"tanh", nvinfer1::ActivationType::kTANH},
+#if IS_TRT_VERSION_GE(5130)
+    {"relu6", nvinfer1::ActivationType::kCLIP},
+#endif
 };

 class ReluOpConverter : public ActivationOpConverter {
@@ -80,6 +92,11 @@ class TanhOpConverter : public ActivationOpConverter {
   TanhOpConverter() { op_type_ = "tanh"; }
 };

+class Relu6OpConverter : public ActivationOpConverter {
+ public:
+  Relu6OpConverter() { op_type_ = "relu6"; }
+};
+
 }  // namespace tensorrt
 }  // namespace inference
 }  // namespace paddle
@@ -87,3 +104,4 @@ class TanhOpConverter : public ActivationOpConverter {
 REGISTER_TRT_OP_CONVERTER(relu, ReluOpConverter);
 REGISTER_TRT_OP_CONVERTER(sigmoid, SigmoidOpConverter);
 REGISTER_TRT_OP_CONVERTER(tanh, TanhOpConverter);
+REGISTER_TRT_OP_CONVERTER(relu6, Relu6OpConverter);
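(Note: TensorRT's kCLIP activation computes max(alpha, min(beta, x)), so alpha = 0 and beta = 6 reproduce relu6 exactly. A minimal reference of the same math, with a hypothetical helper name, illustration only:)

#include <algorithm>

// relu6(x) = max(0, min(6, x)) -- what kCLIP computes with
// setAlpha(0.f) and setBeta(6.f) in the converter above.
float relu6_reference(float x) {
  return std::max(0.0f, std::min(6.0f, x));
}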
@@ -31,6 +31,20 @@ class DropoutOpConverter : public OpConverter {
     auto* input1 = engine_->GetITensor(op_desc.Input("X")[0]);
     float dropout_prob = boost::get<float>(op_desc.GetAttr("dropout_prob"));

+    std::string downgrade_in_infer = "";
+    if (op_desc.HasAttr("dropout_implementation")) {
+      downgrade_in_infer =
+          boost::get<std::string>(op_desc.GetAttr("dropout_implementation"));
+    }
+
+    if (!downgrade_in_infer.empty() &&
+        downgrade_in_infer == "upscale_in_train") {
+      auto* layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input1);
+      auto output_name = op_desc.Output("Out")[0];
+      RreplenishLayerAndOutput(layer, "dropout", {output_name}, test_mode);
+      return;
+    }
+
     platform::CPUPlace cpu_place;
     std::unique_ptr<framework::LoDTensor> weight_tensor(
         new framework::LoDTensor());
......
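(Note on the new branch above: with dropout_implementation = "upscale_in_train", Paddle's dropout already divides by (1 - dropout_prob) during training, so at inference the op is a pure identity and can be lowered to a pass-through Shuffle layer; the legacy "downgrade_in_infer" mode still multiplies by (1 - dropout_prob) at inference, which the scale-weight path kept below handles. A small sketch of the two inference-time behaviors, hypothetical helper name:)

#include <string>

// Inference-time semantics of Paddle's dropout for the two
// dropout_implementation modes.
float dropout_infer_reference(float x, float dropout_prob,
                              const std::string& impl) {
  if (impl == "upscale_in_train") {
    return x;  // scaling already happened during training -> identity here
  }
  return x * (1.0f - dropout_prob);  // "downgrade_in_infer" (default)
}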
@@ -35,7 +35,14 @@ class LeakyReluOpConverter : public OpConverter {
     PADDLE_ENFORCE(output_num == 1);
     // Get attrs
     float alpha = boost::get<float>(op_desc.GetAttr("alpha"));
+    nvinfer1::ILayer* output_layer = nullptr;
+
+#if IS_TRT_VERSION_GE(5100)
+    nvinfer1::IActivationLayer* layer = TRT_ENGINE_ADD_LAYER(
+        engine_, Activation, *input, nvinfer1::ActivationType::kLEAKY_RELU);
+    layer->setAlpha(alpha);
+    output_layer = layer;
+#else
     platform::CPUPlace place;
     std::unique_ptr<framework::LoDTensor> alpha_tensor(
         new framework::LoDTensor());
@@ -65,7 +72,7 @@ class LeakyReluOpConverter : public OpConverter {
                              nvinfer1::ScaleMode::kUNIFORM, shift.get(),
                              sub_scale.get(), power.get());
     PADDLE_ENFORCE(nullptr != scale_relu_layer);
-    auto* output_layer =
+    output_layer =
         TRT_ENGINE_ADD_LAYER(engine_, ElementWise, *(scale_layer->getOutput(0)),
                              *(scale_relu_layer->getOutput(0)),
                              nvinfer1::ElementWiseOperation::kSUM);
@@ -75,7 +82,7 @@ class LeakyReluOpConverter : public OpConverter {
     PADDLE_ENFORCE(engine_->weight_map.find(alpha_name) ==
                    engine_->weight_map.end());
     engine_->weight_map[alpha_name] = std::move(alpha_tensor);
+#endif
     auto output_name = op_desc.Output("Out")[0];
     RreplenishLayerAndOutput(output_layer, "leaky_relu", {output_name},
                              test_mode);
......
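(Note: on TensorRT >= 5.1 the converter now uses the native kLEAKY_RELU activation with setAlpha(alpha); the pre-5.1 fallback kept under #else assembles the same function from Scale and ElementWise layers, roughly alpha*x + (1 - alpha)*relu(x). A one-line reference of that identity, hypothetical helper name, illustration only:)

#include <algorithm>

// leaky_relu(x) = x for x >= 0, alpha*x for x < 0,
// which equals alpha*x + (1 - alpha)*max(x, 0).
float leaky_relu_reference(float x, float alpha) {
  return alpha * x + (1.0f - alpha) * std::max(x, 0.0f);
}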
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"

namespace paddle {
namespace inference {
namespace tensorrt {

/*
 * ShuffleChannelOp, built from two TensorRT Shuffle layers
 * (reshape -> transpose -> reshape).
 */
class ShuffleChannelOpConverter : public OpConverter {
 public:
  void operator()(const framework::proto::OpDesc& op,
                  const framework::Scope& scope, bool test_mode) override {
    framework::OpDesc op_desc(op, nullptr);
    // Declare inputs
    auto* input = engine_->GetITensor(op_desc.Input("X")[0]);
    auto input_dims = input->getDimensions();
    PADDLE_ENFORCE(input_dims.nbDims == 3);
    int c = input_dims.d[0];
    int h = input_dims.d[1];
    int w = input_dims.d[2];
    int group = boost::get<int>(op_desc.GetAttr("group"));

    auto* layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input);
    nvinfer1::Dims4 reshape_dim(group, c / group, h, w);
    layer->setReshapeDimensions(reshape_dim);
    layer->setSecondTranspose({1, 0, 2, 3});
    auto* output = layer->getOutput(0);

    auto* reshape_layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *output);
    nvinfer1::DimsCHW reshape_dim2(c, h, w);
    reshape_layer->setReshapeDimensions(reshape_dim2);

    auto output_name = op_desc.Output("Out")[0];
    RreplenishLayerAndOutput(reshape_layer, "concat", {output_name}, test_mode);
  }
};

}  // namespace tensorrt
}  // namespace inference
}  // namespace paddle

REGISTER_TRT_OP_CONVERTER(shuffle_channel, ShuffleChannelOpConverter);
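(Note: the two Shuffle layers above implement channel shuffle: view the C channels as (group, C/group), swap those two axes, and flatten back, i.e. input channel ch moves to (ch % (C/group)) * group + ch / (C/group). A CPU sketch of that mapping on CHW data, hypothetical helper name, illustration only:)

#include <vector>

// CPU reference of channel shuffle: reshape to (group, C/group, H, W),
// transpose the first two axes, reshape back to (C, H, W).
std::vector<float> shuffle_channel_reference(const std::vector<float>& in,
                                             int c, int h, int w, int group) {
  std::vector<float> out(in.size());
  int group_size = c / group;
  for (int ch = 0; ch < c; ++ch) {
    int g_idx = ch / group_size;  // position in the (group, C/group) view ...
    int k = ch % group_size;
    int new_ch = k * group + g_idx;  // ... after swapping the two axes
    for (int i = 0; i < h * w; ++i) {
      out[new_ch * h * w + i] = in[ch * h * w + i];
    }
  }
  return out;
}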
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.h"

namespace paddle {
namespace inference {
namespace tensorrt {

class SwishOpConverter : public OpConverter {
 public:
  void operator()(const framework::proto::OpDesc& op,
                  const framework::Scope& scope, bool test_mode) override {
    VLOG(4) << "convert fluid swish op to tensorrt layer";

    framework::OpDesc op_desc(op, nullptr);
    // Declare inputs
    int input_num = op_desc.Input("X").size();
    PADDLE_ENFORCE(input_num == 1);
    auto* input = engine_->GetITensor(op_desc.Input("X")[0]);
    // Get output
    size_t output_num = op_desc.Output("Out").size();
    PADDLE_ENFORCE(output_num == 1);
    // Get attrs
    float beta = boost::get<float>(op_desc.GetAttr("beta"));

    plugin::SwishPlugin* plugin = new plugin::SwishPlugin(beta);
    nvinfer1::IPluginLayer* layer =
        engine_->AddPlugin(&input, input_num, plugin);

    auto output_name = op_desc.Output("Out")[0];
    RreplenishLayerAndOutput(layer, "swish", {output_name}, test_mode);
  }
};

}  // namespace tensorrt
}  // namespace inference
}  // namespace paddle

REGISTER_TRT_OP_CONVERTER(swish, SwishOpConverter);
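(Note: swish(x) = x * sigmoid(beta * x); since this TensorRT version has no built-in swish activation, the converter delegates to the SwishPlugin added later in this patch. Reference of the math, hypothetical helper name, illustration only:)

#include <cmath>

// swish(x) = x * sigmoid(beta * x) = x / (1 + exp(-beta * x))
float swish_reference(float x, float beta) {
  return x / (1.0f + std::exp(-beta * x));
}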
@@ -46,6 +46,8 @@ TEST(SigmoidOpConverter, main) { test_activation("sigmoid"); }
 TEST(TanhOpConverter, main) { test_activation("tanh"); }

+TEST(Relu6OpConverter, main) { test_activation("relu6"); }
+
 }  // namespace tensorrt
 }  // namespace inference
 }  // namespace paddle
@@ -53,3 +55,4 @@ TEST(TanhOpConverter, main) { test_activation("tanh"); }
 USE_OP(relu);
 USE_OP(sigmoid);
 USE_OP(tanh);
+USE_OP(relu6);
@@ -34,6 +34,7 @@ TEST(DropoutOpConverter, main) {
   framework::OpDesc desc;
   int is_test = 1;
   float dropout_prob = 0.4;
+  std::string dropout_implementation = "upscale_in_train";

   desc.SetType("dropout");
   desc.SetInput("X", {"dropout-X"});
@@ -42,6 +43,8 @@ TEST(DropoutOpConverter, main) {
   desc.SetAttr("is_test", is_test);
   desc.SetAttr("dropout_prob", dropout_prob);
+  desc.SetAttr("dropout_implementation", dropout_implementation);
+
   LOG(INFO) << "set OP";
   validator.SetOp(*desc.Proto());
   LOG(INFO) << "execute";
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include <gtest/gtest.h>
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"

namespace paddle {
namespace inference {
namespace tensorrt {

TEST(shuffle_channel_op, test_shuffle_channel) {
  std::unordered_set<std::string> parameters;
  framework::Scope scope;
  TRTConvertValidation validator(10, parameters, scope, 1000);
  validator.DeclInputVar("sc_input", nvinfer1::DimsCHW(4, 2, 2));
  validator.DeclOutputVar("sc_out", nvinfer1::DimsCHW(4, 2, 2));

  // Prepare Op description
  framework::OpDesc desc;
  desc.SetType("shuffle_channel");
  desc.SetInput("X", {"sc_input"});
  desc.SetOutput("Out", {"sc_out"});

  int group = 2;
  desc.SetAttr("group", group);

  validator.SetOp(*desc.Proto());
  validator.Execute(1);
}

}  // namespace tensorrt
}  // namespace inference
}  // namespace paddle

USE_OP(shuffle_channel);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include <gtest/gtest.h>
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"

namespace paddle {
namespace inference {
namespace tensorrt {

TEST(swish_op, test_swish) {
  std::unordered_set<std::string> parameters;
  framework::Scope scope;
  TRTConvertValidation validator(10, parameters, scope, 1000);
  validator.DeclInputVar("sw_input", nvinfer1::DimsCHW(3, 2, 2));
  validator.DeclOutputVar("sw_out", nvinfer1::DimsCHW(3, 2, 2));

  // Prepare Op description
  framework::OpDesc desc;
  desc.SetType("swish");
  desc.SetInput("X", {"sw_input"});
  desc.SetOutput("Out", {"sw_out"});
  desc.SetAttr("beta", 2.0f);

  validator.SetOp(*desc.Proto());
  validator.Execute(1);
}

}  // namespace tensorrt
}  // namespace inference
}  // namespace paddle

USE_OP(swish);
@@ -20,7 +20,11 @@ namespace tensorrt {
 // Just tell by the op_types.
 struct SimpleOpTypeSetTeller : public Teller {
-  SimpleOpTypeSetTeller() {}
+  SimpleOpTypeSetTeller() {
+#if IS_TRT_VERSION_GE(5130)
+    teller_set.insert("relu6");
+#endif
+  }

   bool operator()(const std::string& op_type,
                   const framework::OpDesc& desc) override {
@@ -28,11 +32,27 @@ struct SimpleOpTypeSetTeller : public Teller {
   }

  private:
-  std::unordered_set<std::string> teller_set{
-      {"mul", "conv2d", "pool2d", "relu", "softmax", "sigmoid",
-       "depthwise_conv2d", "batch_norm", "concat", "tanh", "pad",
-       "elementwise_add", "elementwise_mul", "dropout", "prelu",
-       "conv2d_transpose", "leaky_relu", "fc"}};
+  std::unordered_set<std::string> teller_set{{"mul",
+                                              "conv2d",
+                                              "pool2d",
+                                              "relu",
+                                              "softmax",
+                                              "sigmoid",
+                                              "depthwise_conv2d",
+                                              "batch_norm",
+                                              "concat",
+                                              "tanh",
+                                              "pad",
+                                              "elementwise_add",
+                                              "elementwise_mul",
+                                              "dropout",
+                                              "prelu",
+                                              "conv2d_transpose",
+                                              "leaky_relu",
+                                              "fc",
+                                              "shuffle_channel",
+                                              "swish",
+                                              "split"}};
 };

 bool OpTeller::Tell(const std::string& op_type, const framework::OpDesc& desc) {
......
@@ -18,6 +18,7 @@
 #include <unordered_set>
 #include <vector>
 #include "paddle/fluid/framework/op_desc.h"
+#include "paddle/fluid/inference/tensorrt/engine.h"

 namespace paddle {
 namespace inference {
......
 nv_library(tensorrt_plugin
   SRCS trt_plugin.cc split_op_plugin.cu elementwise_op_plugin.cu
   prelu_op_plugin.cu trt_plugin_factory.cc
-  avg_pool_op_plugin.cu
+  avg_pool_op_plugin.cu swish_op_plugin.cu
   DEPS enforce tensorrt_engine prelu)
@@ -34,6 +34,7 @@ int PReluPlugin::initialize() {
   cudaMalloc(&p_gpu_weight_, sizeof(float) * weight_.size());
   cudaMemcpy(p_gpu_weight_, weight_.data(), weight_.size() * sizeof(float),
              cudaMemcpyHostToDevice);
+  return 0;
 }

 nvinfer1::Dims PReluPlugin::getOutputDimensions(int index,
......
@@ -27,50 +27,20 @@ SplitPlugin* CreateSplitPluginDeserialize(const void* buffer, size_t length) {
 }
 REGISTER_TRT_PLUGIN("split_plugin", CreateSplitPluginDeserialize);

-// copied from operators::math::SplitFunctor
 template <typename T>
-__global__ void SplitKernel(const T* input_data, const int in_row,
-                            const int in_col, const int* out_cols,
-                            int out_cols_size, T** outputs_data) {
-  int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
-  int curr_segment = 0;
-  int curr_offset = out_cols[0];
-  for (; tid_x < in_col; tid_x += blockDim.x * gridDim.x) {
-    int curr_col_offset = out_cols[curr_segment + 1];
-    while (curr_col_offset <= tid_x) {
-      curr_offset = curr_col_offset;
-      ++curr_segment;
-      curr_col_offset = out_cols[curr_segment + 1];
-    }
-    int local_col = tid_x - curr_offset;
-    int segment_width = curr_col_offset - curr_offset;
-    T* output_ptr = outputs_data[curr_segment];
-    if (output_ptr != nullptr) {
-      int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
-      for (; tid_y < in_row; tid_y += blockDim.y * gridDim.y)
-        output_ptr[tid_y * segment_width + local_col] =
-            input_data[tid_y * in_col + tid_x];
-    }
-  }
-}
-
-template <typename T>
-__global__ void SplitKernel(const T* input_data, const int in_row,
-                            const int in_col, const int fixed_out_col,
-                            T** outputs_data) {
-  int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
-  for (; tid_x < in_col; tid_x += blockDim.x * gridDim.x) {
-    int split = tid_x / fixed_out_col;
-    int in_offset = tid_x - split * fixed_out_col;
-    T* output_ptr = outputs_data[split];
-    if (output_ptr != nullptr) {
-      int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
-      for (; tid_y < in_row; tid_y += blockDim.y * gridDim.y)
-        output_ptr[tid_y * fixed_out_col + in_offset] =
-            input_data[tid_y * in_col + tid_x];
-    }
+__device__ int upper_bound(T const* vals, int n, T const& key) {
+  int i = 0;
+  while (n > 0) {
+    int m = n / 2;
+    int j = i + m;
+    if (!(key < vals[j])) {
+      i = j + 1;
+      n -= m + 1;
+    } else {
+      n = m;
+    }
   }
+  return i;
 }

 nvinfer1::Dims SplitPlugin::getOutputDimensions(
@@ -101,80 +101,61 @@ int SplitPlugin::initialize() {
     if (output_length_[i] != output_length_[0]) {
       same_shape_ = false;
     }
-    segment_offsets.push_back(segment_offsets.back() +
-                              output_length_[i] * inner_cols_);
+    segment_offsets.push_back(segment_offsets.back() + output_length_[i]);
   }
-  inner_cols_ *= dims.d[axis_];
+  axis_shape_ = dims.d[axis_];
   d_segment_offsets_ = segment_offsets;
   segment_offsets_ = std::move(segment_offsets);
   d_output_ptrs_.resize(this->getNbOutputs(), nullptr);
   return 0;
 }

+// The following part of the code refers to onnx-tensorrt
+// https://github.com/onnx/onnx-tensorrt/blob/master/Split.cu
 template <typename T>
-inline void Split(cudaStream_t stream, const bool same_shape,
-                  const int outer_rows, const int inner_cols,
-                  const std::vector<int>& segment_offsets,
-                  const int* d_segment_offsets, const T* input, T** outputs) {
-  const int kThreadsPerBlock = 1024;
-  const int kMaxBlocks = 65535;
-  int block_cols = kThreadsPerBlock;
-  if (inner_cols < kThreadsPerBlock) {  // block_cols is aligned by 32.
-    block_cols = ((inner_cols + 31) >> 5) << 5;
-  }
-  int block_rows = kThreadsPerBlock / block_cols;
-  dim3 block_size = dim3(block_cols, block_rows, 1);
-
-  int grid_cols =
-      std::min((inner_cols + block_cols - 1) / block_cols, kMaxBlocks);
-  int grid_rows =
-      std::min(kMaxBlocks / grid_cols, std::max(outer_rows / block_rows, 1));
-  dim3 grid_size = dim3(grid_cols, grid_rows, 1);
-  if (same_shape) {
-    SplitKernel<<<grid_size, block_size, 0, stream>>>(
-        input, outer_rows, inner_cols, segment_offsets[1], outputs);
-  } else {
-    SplitKernel<<<grid_size, block_size, 0, stream>>>(
-        input, outer_rows, inner_cols, d_segment_offsets,
-        static_cast<int>(segment_offsets.size()), outputs);
+__global__ void split_kernel(int nsegment,
+                             int const* __restrict__ segment_offsets,
+                             T const* __restrict__ idata, T* const* odatas,
+                             int inner_cols, int axis_shape, int outer_rows) {
+  int x0 = threadIdx.x + blockIdx.x * blockDim.x;
+  int src_y0 = threadIdx.y + blockIdx.y * blockDim.y;
+  int z0 = threadIdx.z + blockIdx.z * blockDim.z;
+  for (int z = z0; z < outer_rows; z += blockDim.z * gridDim.z) {
+    for (int src_y = src_y0; src_y < axis_shape;
+         src_y += blockDim.y * gridDim.y) {
+      for (int x = x0; x < inner_cols; x += blockDim.x * gridDim.x) {
+        int segment = upper_bound(segment_offsets, nsegment, src_y) - 1;
+        int dst_y = src_y - segment_offsets[segment];
+        int dst_ny = segment_offsets[segment + 1] - segment_offsets[segment];
+        odatas[segment][x + inner_cols * (dst_y + dst_ny * z)] =
+            idata[x + inner_cols * (src_y + axis_shape * z)];
+      }
+    }
   }
 }

 int SplitPlugin::enqueue(int batchSize, const void* const* inputs,
                          void** outputs, void* workspace, cudaStream_t stream) {
+  const int* d_segment_offsets_ptr =
+      thrust::raw_pointer_cast(&d_segment_offsets_[0]);
   float const* input_ptr = reinterpret_cast<float const*>(inputs[0]);
-  if (((batchSize == 1 && axis_ == 0) || axis_ == -1) &&
-      this->getNbOutputs() < 10) {
-    float** output_ptrs = reinterpret_cast<float**>(outputs);
-    int data_type_size = (this->getDataType() == nvinfer1::DataType::kFLOAT)
-                             ? sizeof(float)
-                             : sizeof(__half);
-    for (int i = 0; i < this->getNbOutputs(); ++i) {
-      PADDLE_ENFORCE(
-          cudaMemcpyAsync(
-              output_ptrs[i], input_ptr + segment_offsets_[i],
-              (segment_offsets_[i + 1] - segment_offsets_[i]) * data_type_size,
-              cudaMemcpyDeviceToDevice, stream) == cudaSuccess);
-    }
-  } else {
-    outer_rows_ *= batchSize;
-    const int* d_segment_offsets_ptr =
-        thrust::raw_pointer_cast(&d_segment_offsets_[0]);
-    float** output_ptrs = thrust::raw_pointer_cast(&d_output_ptrs_[0]);
-    PADDLE_ENFORCE(cudaMemcpyAsync(output_ptrs, outputs,
-                                   this->getNbOutputs() * sizeof(float*),
-                                   cudaMemcpyHostToDevice,
-                                   stream) == cudaSuccess);
-    if (this->getDataType() == nvinfer1::DataType::kFLOAT) {
-      Split(stream, same_shape_, outer_rows_, inner_cols_, segment_offsets_,
-            d_segment_offsets_ptr, input_ptr, output_ptrs);
-    } else {
-      Split(stream, same_shape_, outer_rows_, inner_cols_, segment_offsets_,
-            d_segment_offsets_ptr, (__half*)input_ptr,  // NOLINT
-            (__half**)output_ptrs);                     // NOLINT
-    }
-  }
+  float* const* h_odatas = reinterpret_cast<float* const*>(outputs);
+  float** output_ptrs = thrust::raw_pointer_cast(&d_output_ptrs_[0]);
+  PADDLE_ENFORCE(cudaMemcpyAsync(output_ptrs, h_odatas,
+                                 d_output_ptrs_.size() * sizeof(float*),
+                                 cudaMemcpyHostToDevice,
+                                 stream) == cudaSuccess);
+
+  int outer_rows = outer_rows_ * batchSize;
+
+  dim3 block(32, 16);
+  dim3 grid(std::min((inner_cols_ - 1) / block.x + 1, 65535u),
+            std::min((axis_shape_ - 1) / block.y + 1, 65535u),
+            std::min((outer_rows_ - 1) / block.z + 1, 65535u));
+
+  split_kernel<<<grid, block, 0, stream>>>(
+      d_segment_offsets_.size(), d_segment_offsets_ptr, input_ptr, output_ptrs,
+      inner_cols_, axis_shape_, outer_rows);
   return cudaGetLastError() != cudaSuccess;
 }
......
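(Note: the rewritten plugin views the input as (outer_rows, axis_shape, inner_cols) and lets a single 3-D grid-stride kernel write every output at once; for each index along the split axis it binary-searches segment_offsets with upper_bound to find the destination output, replacing the old batch-dependent fast path — presumably this is the rewrite the commit message credits for the batch > 2 crash fix. A CPU sketch of the per-index segment lookup, hypothetical helper name, illustration only:)

#include <vector>

// CPU reference of the lookup done per src_y in split_kernel.
// segment_offsets = {0, len0, len0+len1, ...}; returns which output
// tensor (segment) a given index along the split axis falls into.
int find_segment(const std::vector<int>& segment_offsets, int src_y) {
  int segment = 0;
  while (segment + 1 < static_cast<int>(segment_offsets.size()) &&
         segment_offsets[segment + 1] <= src_y) {
    ++segment;
  }
  return segment;  // equivalent to upper_bound(offsets, n, src_y) - 1
}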
@@ -66,6 +66,7 @@ class SplitPlugin : public PluginTensorRT {
   int axis_;
   int outer_rows_;
   int inner_cols_;
+  int axis_shape_;
   bool same_shape_;
   std::vector<int> output_length_;
   std::vector<int> segment_offsets_;
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <stdio.h>
#include <cassert>
#include <vector>
#include "glog/logging.h"
#include "paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.h"
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h"

namespace paddle {
namespace inference {
namespace tensorrt {
namespace plugin {

SwishPlugin *CreateSwishPluginDeserialize(const void *buffer, size_t length) {
  return new SwishPlugin(buffer, length);
}
REGISTER_TRT_PLUGIN("swish_plugin", CreateSwishPluginDeserialize);

int SwishPlugin::initialize() { return 0; }

nvinfer1::Dims SwishPlugin::getOutputDimensions(int index,
                                                const nvinfer1::Dims *inputDims,
                                                int nbInputs) {
  assert(nbInputs == 1);
  assert(index < this->getNbOutputs());
  nvinfer1::Dims const &input_dims = inputDims[0];
  nvinfer1::Dims output_dims = input_dims;
  return output_dims;
}

__global__ void swish_kernel(int num, const float *input, float *output,
                             float beta) {
  int index = blockIdx.x * blockDim.x + threadIdx.x;
  if (index < num) {
#if __CUDA_ARCH__ >= 350
    output[index] =
        __ldg(input + index) / (1.0f + expf(-beta * __ldg(input + index)));
#else
    output[index] = input[index] / (1.0f + expf(-beta * input[index]));
#endif
  }
}

int SwishPlugin::enqueue(int batch_size, const void *const *inputs,
                         void **outputs, void *workspace, cudaStream_t stream) {
  // input dims is CHW.
  const auto &input_dims = this->getInputDims(0);
  const float *input = reinterpret_cast<const float *>(inputs[0]);
  float *output = reinterpret_cast<float **>(outputs)[0];
  int num = batch_size;
  for (int i = 0; i < input_dims.nbDims; i++) {
    num *= input_dims.d[i];
  }
  int threads = 1024;
  int blocks = (num + threads - 1) / threads;
  swish_kernel<<<blocks, threads, 0, stream>>>(num, input, output, beta_);

  return cudaGetLastError() != cudaSuccess;
}

}  // namespace plugin
}  // namespace tensorrt
}  // namespace inference
}  // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include <algorithm>
#include <string>
#include <vector>
#include "paddle/fluid/inference/tensorrt/engine.h"
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"

namespace paddle {
namespace inference {
namespace tensorrt {
namespace plugin {

class SwishPlugin : public PluginTensorRT {
 private:
  float beta_;

 protected:
  size_t getSerializationSize() override {
    return getBaseSerializationSize() + SerializedSize(beta_);
  }

  // TRT will call this func when we need to serialize the configuration of
  // tensorrt. It should not be called by users.
  void serialize(void *buffer) override {
    SerializeValue(&buffer, getPluginType());
    serializeBase(buffer);
    SerializeValue(&buffer, beta_);
  }

 public:
  explicit SwishPlugin(const float beta) : beta_(beta) {}

  // It was used for tensorrt deserialization.
  // It should not be called by users.
  SwishPlugin(void const *serialData, size_t serialLength) {
    deserializeBase(serialData, serialLength);
    DeserializeValue(&serialData, &serialLength, &beta_);
  }

  ~SwishPlugin() {}
  int initialize() override;

  SwishPlugin *clone() const override { return new SwishPlugin(beta_); }

  const char *getPluginType() const override { return "swish_plugin"; }
  int getNbOutputs() const override { return 1; }
  nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims *inputs,
                                     int nbInputDims) override;
  int enqueue(int batchSize, const void *const *inputs, void **outputs,
              void *workspace, cudaStream_t stream) override;
};

}  // namespace plugin
}  // namespace tensorrt
}  // namespace inference
}  // namespace paddle
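(Note: getSerializationSize(), serialize(), and the deserializing constructor must stay in lockstep — whatever serialize() appends (plugin type, base fields, then beta_) is exactly what the factory registered via REGISTER_TRT_PLUGIN("swish_plugin", ...) reads back. A generic sketch of that symmetric write/read contract, using hypothetical helpers unrelated to the TensorRT API:)

#include <cstring>
#include <vector>

// Minimal symmetric writer/reader: bytes must be consumed in the same
// order they were produced, here a single float standing in for beta_.
void write_float(std::vector<char>* buf, float v) {
  const char* p = reinterpret_cast<const char*>(&v);
  buf->insert(buf->end(), p, p + sizeof(float));
}

float read_float(const char** cursor) {
  float v;
  std::memcpy(&v, *cursor, sizeof(float));
  *cursor += sizeof(float);
  return v;
}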