Unverified commit 3acb047a, authored by Zhaolong Xing, committed via GitHub

[Paddle-TRT]: add eltwise, pool2d, prelu, scale, concat, gelu dynamic shape support (#23396)

* add elementwise, pool2d, prelu, shuffle channel
test=develop

* add scale converter and refine the concat and eltwise converters
test=develop

* refine elementwise converter
test=develop

* refine unit tests and error enforcement
test=develop

* modify const cast
test=develop
Parent commit c1133028
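As context for the feature this commit adds (not part of the diff itself): the dynamic-shape converters below are only exercised when the predictor config declares min/max/opt input shapes, as the updated trt_dynamic_shape_test further down does. A minimal configuration sketch, assuming the standalone inference API header and an input named "image" (both placeholders):

#include <map>
#include <string>
#include <vector>
#include "paddle_inference_api.h"  // header name assumed; adjust to your install

// Sketch: set up an AnalysisConfig so the TensorRT subgraph runs in
// dynamic-shape (explicit batch) mode; shapes and names are placeholders.
void ConfigureTrtDynamicShape(paddle::AnalysisConfig* config,
                              const std::string& model_dir) {
  config->EnableUseGpu(100, 0);
  config->SetModel(model_dir + "/model", model_dir + "/params");
  config->SwitchUseFeedFetchOps(false);
  // workspace, max_batch, min_subgraph_size, precision, use_static, use_calib_mode
  config->EnableTensorRtEngine(1 << 30, 1, 0,
                               paddle::AnalysisConfig::Precision::kFloat32,
                               false, true);
  // Declaring per-input min/max/opt shapes is what switches the engine
  // into dynamic-shape mode and selects the converters in this commit.
  std::map<std::string, std::vector<int>> min_shape{{"image", {1, 3, 3, 3}}};
  std::map<std::string, std::vector<int>> max_shape{{"image", {1, 3, 10, 10}}};
  std::map<std::string, std::vector<int>> opt_shape{{"image", {1, 3, 5, 5}}};
  config->SetTRTDynamicShapeInfo(min_shape, max_shape, opt_shape);
  // auto predictor = paddle::CreatePaddlePredictor(*config);
}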
@@ -973,4 +973,5 @@ USE_TRT_CONVERTER(gelu);
USE_TRT_CONVERTER(multihead_matmul);
USE_TRT_CONVERTER(fused_embedding_eltwise_layernorm);
USE_TRT_CONVERTER(skip_layernorm);
USE_TRT_CONVERTER(scale);
#endif
@@ -3,7 +3,8 @@ nv_library(tensorrt_converter
SRCS mul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc elementwise_op.cc
batch_norm_op.cc activation_op.cc softmax_op.cc concat_op.cc dropout_op.cc
pad_op.cc split_op.cc prelu_op.cc leaky_relu_op.cc gelu_op.cc layer_norm_op.cc multihead_matmul_op.cc
- shuffle_channel_op.cc swish_op.cc instance_norm_op.cc emb_eltwise_layernorm.cc skip_layernorm.cc
shuffle_channel_op.cc swish_op.cc instance_norm_op.cc
emb_eltwise_layernorm.cc skip_layernorm.cc scale_op.cc
DEPS tensorrt_engine tensorrt_plugin operator scope framework_proto op_registry)
nv_test(test_op_converter SRCS test_op_converter.cc DEPS
......
@@ -39,7 +39,9 @@ class ConcatOpConverter : public OpConverter {
auto* layer = TRT_ENGINE_ADD_LAYER(engine_, Concatenation, itensors.data(),
itensors.size());
- axis = axis - 1;  // Remove batch dim
if (!engine_->with_dynamic_shape()) {
axis = axis - 1;  // Remove batch dim
}
layer->setAxis(axis);
auto output_name = op_desc.Output("Out")[0];
RreplenishLayerAndOutput(layer, "concat", {output_name}, test_mode);
......
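A note on the axis handling above (illustrative, not part of the patch): with an implicit-batch (static shape) network, TensorRT tensors omit the batch dimension, so Paddle's concat axis has to be shifted down by one; with dynamic shape the batch dimension is explicit and the axis is used unchanged. A small sketch of that mapping:

// Sketch only: map a Paddle concat axis to the TensorRT axis.
// Example: concatenating over the channels of NCHW is axis 1 in Paddle;
// it becomes axis 0 for an implicit-batch network and stays 1 with dynamic shape.
int MapConcatAxis(int paddle_axis, bool with_dynamic_shape) {
  return with_dynamic_shape ? paddle_axis : paddle_axis - 1;
}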
@@ -48,20 +48,68 @@ class ElementwiseWeightOpConverter : public OpConverter {
PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
auto* X = engine_->GetITensor(op_desc.Input("X").front());
- nvinfer1::Dims dims_x = X->getDimensions();
- std::vector<int> no_batch_dims;
- int start_index = 0;
- if (engine_->with_dynamic_shape()) start_index = 1;
- for (; start_index < dims_x.nbDims; start_index++)
-   no_batch_dims.push_back(dims_x.d[start_index]);
auto* Y_v = scope.FindVar(op_desc.Input("Y").front());
PADDLE_ENFORCE_NOT_NULL(Y_v);
auto* Y_t = Y_v->GetMutable<framework::LoDTensor>();
float* weight_data = nullptr;
weight_data =
engine_->GetWeightCPUData(op_desc.Input("Y").front(), Y_t, false);
nvinfer1::Dims dims_x = X->getDimensions();
auto regist_eltwise_weight = [&](nvinfer1::ScaleMode scale_mode) {
TensorRTEngine::Weight shift_weights{nvinfer1::DataType::kFLOAT,
static_cast<void*>(weight_data),
static_cast<size_t>(Y_t->numel())};
TensorRTEngine::Weight scale_weights{nvinfer1::DataType::kFLOAT, nullptr,
0};
TensorRTEngine::Weight power_weights{nvinfer1::DataType::kFLOAT, nullptr,
0};
if (op_type_ == "add") {
nvinfer1::IScaleLayer* scale_layer = TRT_ENGINE_ADD_LAYER(
engine_, Scale, *X, scale_mode, shift_weights.get(),
scale_weights.get(), power_weights.get());
layer = scale_layer;
} else if (op_type_ == "mul") {
nvinfer1::IScaleLayer* scale_layer = TRT_ENGINE_ADD_LAYER(
engine_, Scale, *X, scale_mode, scale_weights.get(),
shift_weights.get(), power_weights.get());
layer = scale_layer;
}
auto output_name = op_desc.Output("Out")[0];
RreplenishLayerAndOutput(layer, "elementwise_" + op_type_, {output_name},
test_mode);
if (op_desc.HasAttr("enable_int8")) {
#if IS_TRT_VERSION_GE(5000)
CHECK(op_desc.HasAttr("X_scale"));
float x_scale = boost::get<float>(op_desc.GetAttr("X_scale"));
engine_->SetTensorDynamicRange(X, x_scale);
#endif
}
};
if (engine_->with_dynamic_shape()) {
if (Y_t->dims().size() == 1) {
auto scale_mode = nvinfer1::ScaleMode::kCHANNEL;
PADDLE_ENFORCE_EQ(Y_t->dims()[0], dims_x.d[1],
platform::errors::InvalidArgument(
"The Bias's size(%d) should be equal to the "
"first dim(%d) of the Input.",
Y_t->dims()[0], dims_x.d[1]));
regist_eltwise_weight(scale_mode);
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"TensorRT Dynamic shape unsupported weight shape for Elementwise "
"op!"));
}
return;
}
std::vector<int> no_batch_dims;
int start_index = 0;
for (; start_index < dims_x.nbDims; start_index++)
no_batch_dims.push_back(dims_x.d[start_index]);
auto scale_mode = nvinfer1::ScaleMode::kELEMENTWISE;
@@ -91,36 +139,7 @@ class ElementwiseWeightOpConverter : public OpConverter {
} else {
PADDLE_THROW("TensorRT unsupported weight Shape for Elementwise op!");
}
regist_eltwise_weight(scale_mode);
- TensorRTEngine::Weight shift_weights{nvinfer1::DataType::kFLOAT,
-                                      static_cast<void*>(weight_data),
-                                      static_cast<size_t>(Y_t->numel())};
- TensorRTEngine::Weight scale_weights{nvinfer1::DataType::kFLOAT, nullptr,
-                                      0};
- TensorRTEngine::Weight power_weights{nvinfer1::DataType::kFLOAT, nullptr,
-                                      0};
- if (op_type_ == "add") {
-   nvinfer1::IScaleLayer* scale_layer = TRT_ENGINE_ADD_LAYER(
-       engine_, Scale, *X, scale_mode, shift_weights.get(),
-       scale_weights.get(), power_weights.get());
-   layer = scale_layer;
- } else if (op_type_ == "mul") {
-   nvinfer1::IScaleLayer* scale_layer = TRT_ENGINE_ADD_LAYER(
-       engine_, Scale, *X, scale_mode, scale_weights.get(),
-       shift_weights.get(), power_weights.get());
-   layer = scale_layer;
- }
- auto output_name = op_desc.Output("Out")[0];
- RreplenishLayerAndOutput(layer, "elementwise_" + op_type_, {output_name},
-                          test_mode);
- if (op_desc.HasAttr("enable_int8")) {
- #if IS_TRT_VERSION_GE(5000)
-   CHECK(op_desc.HasAttr("X_scale"));
-   float x_scale = boost::get<float>(op_desc.GetAttr("X_scale"));
-   engine_->SetTensorDynamicRange(X, x_scale);
- #endif
- }
}

protected:
@@ -146,44 +165,62 @@ class ElementwiseTensorOpConverter : public OpConverter {
auto* X = engine_->GetITensor(op_desc.Input("X").front());
auto* Y = engine_->GetITensor(op_desc.Input("Y").front());
std::vector<nvinfer1::ITensor*> itensors;
itensors.push_back(X);
itensors.push_back(Y);
nvinfer1::Dims dims_x = X->getDimensions();
nvinfer1::Dims dims_y = Y->getDimensions();
int axis = boost::get<int>(op_desc.GetAttr("axis"));
auto output_name = op_desc.Output("Out")[0];
auto common_func = [&](nvinfer1::ILayer* layer) {
RreplenishLayerAndOutput(layer, "elementwise", {output_name}, test_mode);
if (op_desc.HasAttr("enable_int8")) {
#if IS_TRT_VERSION_GE(5000)
CHECK(op_desc.HasAttr("X_scale"));
CHECK(op_desc.HasAttr("Y_scale"));
float x_scale = boost::get<float>(op_desc.GetAttr("X_scale"));
float y_scale = boost::get<float>(op_desc.GetAttr("Y_scale"));
engine_->SetTensorDynamicRange(X, x_scale);
engine_->SetTensorDynamicRange(Y, y_scale);
#endif
}
};
if (CheckDims(dims_x, dims_y)) {
// The two input tensor should have the same dims
VLOG(3) << "Convert a fluid elementwise op to TensorRT IElementWiseLayer";
- nvinfer1::IElementWiseLayer* elet_layer = TRT_ENGINE_ADD_LAYER(
-     engine_, ElementWise, *const_cast<nvinfer1::ITensor*>(X),
-     *const_cast<nvinfer1::ITensor*>(Y), op_pair->second);
nvinfer1::IElementWiseLayer* elet_layer =
TRT_ENGINE_ADD_LAYER(engine_, ElementWise, *X, *Y, op_pair->second);
layer = elet_layer;
} else {
VLOG(3) << "Convert a fluid elementwise op to TensorRT "
"ElementWisePluginLayer";
if (engine_->with_dynamic_shape()) {
- plugin::ElementWisePlugin* plugin =
-     new plugin::ElementWisePlugin(op_type_, dims_x, dims_y, axis);
- plugin->AddInput(X);
- plugin->AddInput(Y);
- nvinfer1::IPluginLayer* plugin_layer = engine_->AddPlugin(
-     const_cast<nvinfer1::ITensor* const*>(plugin->GetInputs().data()), 2,
-     reinterpret_cast<plugin::PluginTensorRT*>(plugin));
#if IS_TRT_VERSION_GE(6000)
plugin::ElementwisePluginDynamic* plugin =
new plugin::ElementwisePluginDynamic(op_type_, axis);
layer = engine_->AddPluginV2(itensors.data(), 2, plugin);
#else
PADDLE_THROW(platform::errors::Fatal(
"You are running the TRT Dynamic Shape mode, need to confirm that "
"your TRT version is no less than 6.0"));
- layer = plugin_layer;
- }
- RreplenishLayerAndOutput(layer, "elementwise", {output_name}, test_mode);
- if (op_desc.HasAttr("enable_int8")) {
- #if IS_TRT_VERSION_GE(5000)
-   CHECK(op_desc.HasAttr("X_scale"));
-   CHECK(op_desc.HasAttr("Y_scale"));
-   float x_scale = boost::get<float>(op_desc.GetAttr("X_scale"));
-   float y_scale = boost::get<float>(op_desc.GetAttr("Y_scale"));
-   engine_->SetTensorDynamicRange(X, x_scale);
-   engine_->SetTensorDynamicRange(Y, y_scale);
#endif
} else {
plugin::ElementWisePlugin* plugin =
new plugin::ElementWisePlugin(op_type_, dims_x, dims_y, axis);
plugin->AddInput(X);
plugin->AddInput(Y);
nvinfer1::IPluginLayer* plugin_layer = engine_->AddPlugin(
plugin->GetInputs().data(), 2,
reinterpret_cast<plugin::PluginTensorRT*>(plugin));
layer = plugin_layer;
}
}
common_func(layer);
}

protected:
......
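For the weight path of ElementwiseWeightOpConverter above, dynamic shape only accepts a 1-D Y whose length matches the channel dimension, applied with nvinfer1::ScaleMode::kCHANNEL. What that broadcast means for NCHW data, as a plain CPU reference (illustrative only, not the converter's code):

#include <vector>

// out[n][c][h][w] = x[n][c][h][w] + y[c]  -- y has length C (kCHANNEL semantics).
void ChannelWiseAdd(const std::vector<float>& x, const std::vector<float>& y,
                    std::vector<float>* out, int n, int c, int h, int w) {
  out->resize(x.size());
  for (int i = 0; i < n; ++i)
    for (int j = 0; j < c; ++j)
      for (int k = 0; k < h * w; ++k) {
        int idx = (i * c + j) * h * w + k;
        (*out)[idx] = x[idx] + y[j];  // per-channel bias; "mul" would multiply instead
      }
}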
@@ -19,10 +19,10 @@ namespace paddle {
namespace inference {
namespace tensorrt {

- void DealCeilMode(const nvinfer1::Dims &input_shape, std::vector<int> ksize,
-                   std::vector<int> strides, std::vector<int> paddings,
-                   nvinfer1::DimsHW *pre_pad, nvinfer1::DimsHW *post_pad,
-                   int input_dims) {
inline void DealCeilMode(const nvinfer1::Dims &input_shape,
std::vector<int> ksize, std::vector<int> strides,
std::vector<int> paddings, nvinfer1::DimsHW *pre_pad,
nvinfer1::DimsHW *post_pad, int input_dims) {
int input_height = input_shape.d[input_dims - 2];
int input_width = input_shape.d[input_dims - 1];
int floor_h_output_size =
@@ -112,6 +112,31 @@ class Pool2dOpConverter : public OpConverter {
#endif
}
if (engine_->with_dynamic_shape()) {
if (!adaptive && pool_type == "max" && !global_pooling && !ceil_mode) {
auto *pool_layer = TRT_ENGINE_ADD_LAYER(engine_, Pooling, *input1,
nv_pool_type, nv_ksize);
pool_layer->setStride(nv_strides);
pool_layer->setPadding(nv_paddings);
layer = pool_layer;
} else {
#if IS_TRT_VERSION_GE(6000)
plugin::PoolPluginDynamic *plugin =
new plugin::PoolPluginDynamic(ceil_mode, pool_type, adaptive, ksize,
strides, paddings, global_pooling);
layer = engine_->AddPluginV2(&input1, 1, plugin);
#endif
}
auto output_name = op_desc.Output("Out")[0];
layer->setName(("pool2d (Output: " + output_name + ")").c_str());
layer->getOutput(0)->setName(output_name.c_str());
engine_->SetITensor(output_name, layer->getOutput(0));
if (test_mode) {
engine_->DeclareOutput(output_name);
}
return;
}
if (global_pooling == true) {
nv_ksize.d[0] = input_shape.d[input_dims - 2];
nv_ksize.d[1] = input_shape.d[input_dims - 1];
......
@@ -50,10 +50,22 @@ class PReluOpConverter : public OpConverter {
TensorCopySync(*alpha_tensor, cpu_place, alpha_tensor_temp.get());
float* alpha_data = alpha_tensor_temp->mutable_data<float>(cpu_place);
- plugin::PReluPlugin* plugin =
-     new plugin::PReluPlugin(alpha_data, alpha_tensor_temp->numel(), mode);
- nvinfer1::IPluginLayer* layer =
-     engine_->AddPlugin(&input, input_num, plugin);
nvinfer1::ILayer* layer = nullptr;
if (engine_->with_dynamic_shape()) {
#if IS_TRT_VERSION_GE(6000)
plugin::PReluPluginDynamic* plugin = new plugin::PReluPluginDynamic(
alpha_data, alpha_tensor_temp->numel(), mode);
layer = engine_->AddPluginV2(&input, input_num, plugin);
#else
PADDLE_THROW(platform::errors::Fatal(
"You are running the TRT Dynamic Shape mode, need to confirm that "
"your TRT version is no less than 6.0"));
#endif
} else {
plugin::PReluPlugin* plugin =
new plugin::PReluPlugin(alpha_data, alpha_tensor_temp->numel(), mode);
layer = engine_->AddPlugin(&input, input_num, plugin);
}
// keep alpha tensor to avoid release it's memory
engine_->SetWeights(op_desc.Input("Alpha")[0],
std::move(alpha_tensor_temp));
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
namespace paddle {
namespace inference {
namespace tensorrt {
/*
* ScaleOp
*/
class ScaleOpConverter : public OpConverter {
public:
void operator()(const framework::proto::OpDesc& op,
const framework::Scope& scope, bool test_mode) override {
VLOG(3) << "convert a fluid scale op to tensorrt mul layer without bias";
framework::OpDesc op_desc(op, nullptr);
// Declare inputs
std::vector<nvinfer1::ITensor*> itensors;
std::string input_name = op_desc.Input("X").front();
std::string out_name = op_desc.Output("Out").front();
auto input = engine_->GetITensor(input_name);
bool bias_after_scale =
boost::get<bool>(op_desc.GetAttr("bias_after_scale"));
float bias = boost::get<float>(op_desc.GetAttr("bias"));
float scale = boost::get<float>(op_desc.GetAttr("scale"));
auto create_weights = [&](float data, std::string type) -> float* {
std::unique_ptr<framework::Tensor> tmp_tensor(new framework::Tensor());
tmp_tensor->Resize({1});
auto* tmp_data = tmp_tensor->mutable_data<float>(platform::CPUPlace());
tmp_data[0] = data;
engine_->SetWeights(out_name + "_scale_op_" + type,
std::move(tmp_tensor));
return tmp_data;
};
float* bias_ptr = create_weights(bias, "bias");
float* scale_ptr = create_weights(scale, "scale");
TensorRTEngine::Weight scale_weights{nvinfer1::DataType::kFLOAT,
static_cast<void*>(scale_ptr), 1};
TensorRTEngine::Weight shift_weights{nvinfer1::DataType::kFLOAT,
static_cast<void*>(bias_ptr), 1};
TensorRTEngine::Weight power_weights{nvinfer1::DataType::kFLOAT, nullptr,
0};
nvinfer1::ILayer* layer = nullptr;
if (bias_after_scale) {
layer = TRT_ENGINE_ADD_LAYER(
engine_, Scale, *input, nvinfer1::ScaleMode::kUNIFORM,
shift_weights.get(), scale_weights.get(), power_weights.get());
} else {
// add bias
layer = TRT_ENGINE_ADD_LAYER(
engine_, Scale, *(input), nvinfer1::ScaleMode::kUNIFORM,
shift_weights.get(), power_weights.get(), power_weights.get());
// mul scale
layer = TRT_ENGINE_ADD_LAYER(
engine_, Scale, *(layer->getOutput(0)), nvinfer1::ScaleMode::kUNIFORM,
power_weights.get(), scale_weights.get(), power_weights.get());
}
RreplenishLayerAndOutput(layer, "scale", {out_name}, test_mode);
}
};
} // namespace tensorrt
} // namespace inference
} // namespace paddle
REGISTER_TRT_OP_CONVERTER(scale, ScaleOpConverter);
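A note on the mapping used by ScaleOpConverter above: with bias_after_scale the single IScaleLayer in kUNIFORM mode computes out = scale * x + bias; otherwise two layers are chained so that out = scale * (x + bias). A compile-time check of the two orderings (illustrative only):

constexpr float ScaleOp(float x, float scale, float bias, bool bias_after_scale) {
  return bias_after_scale ? scale * x + bias : scale * (x + bias);
}
// Example: x = 2, scale = 3, bias = 1.
static_assert(ScaleOp(2.f, 3.f, 1.f, true) == 7.f, "scale first, then add bias");
static_assert(ScaleOp(2.f, 3.f, 1.f, false) == 9.f, "add bias first, then scale");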
@@ -40,6 +40,12 @@ class ShuffleChannelOpConverter : public OpConverter {
int w = input_dims.d[2];
int group = boost::get<int>(op_desc.GetAttr("group"));
if (engine_->with_dynamic_shape()) {
PADDLE_THROW(platform::errors::Fatal(
"You are running the TRT Dynamic Shape mode, "
"the shuffle_channel op does not support dynamic shape yet"));
}
auto* layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input);
nvinfer1::Dims4 reshape_dim(group, c / group, h, w);
layer->setReshapeDimensions(reshape_dim);
......
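For context on the restriction added above (illustrative, not part of the patch): shuffle_channel reshapes NCHW to (N, group, C/group, H, W), swaps the two channel factors and flattens back, and the static converter bakes C, H and W into fixed reshape dimensions, which is why it is rejected under dynamic shape for now. The per-channel permutation it performs:

// Output channel j of a channel shuffle with `group` groups reads from
// input channel (j % group) * (c / group) + j / group.
int ShuffleSourceChannel(int j, int c, int group) {
  return (j % group) * (c / group) + j / group;
}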
@@ -72,6 +72,7 @@ struct SimpleOpTypeSetTeller : public Teller {
"instance_norm",
"gelu",
"layer_norm",
"scale",
};
};
......
@@ -21,64 +21,41 @@ namespace inference {
namespace tensorrt {
namespace plugin {

ElementWisePlugin *CreateElementWisePluginDeserialize(const void *buffer,
size_t length) {
return new ElementWisePlugin(buffer, length);
}
REGISTER_TRT_PLUGIN("elementwise_plugin", CreateElementWisePluginDeserialize);

namespace details {
template <typename T>
struct Add {
__device__ T operator()(const T &a, const T &b) const { return a + b; }
};

template <typename T>
struct Mul {
__device__ T operator()(const T &a, const T &b) const { return a * b; }
};
}  // namespace details

template <typename T, typename Operator>
__global__ void elementwise_kernel(const size_t total, const T *x_data,
const T *y_data, T *out_data, int pre, int n,
int post, Operator op) {
int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid < total) {
int idx = tid / post % n;
#if __CUDA_ARCH__ >= 350
out_data[tid] = op(__ldg(x_data + tid), __ldg(y_data + idx));
#else
out_data[tid] = op(x_data[tid], y_data[idx]);
#endif
}
}

- template <typename T, typename Operator>
- __global__ void ColumnWiseKernel(Operator op, const T* x, const T* y, T* out,
-                                  int batch_size, int num_rows, int num_cols) {
-   for (int batch_id = 0; batch_id < batch_size; ++batch_id) {
-     int row = blockIdx.x;
-     for (; row < num_rows; row += gridDim.x) {
-       T value_y = y[batch_id * num_rows + row];
-       int col = threadIdx.x;
-       int offset = (batch_id * num_rows + row) * num_cols;
-       for (; col < num_cols; col += blockDim.x) {
-         T value_x = x[offset + col];
-         out[offset + col] = op(value_x, value_y);
-       }
-     }
-   }
- }
-
- template <typename T, typename Operator>
- static void ElementWise(Operator op, const T* x, const T* y, T* out,
-                         int batch_size, int prev, int midd, int post,
-                         cudaStream_t stream) {
-   const int kThreadsPerBlock = 1024;
-   const int kMaximumBlocks = 65535;
-   if (prev == 1) {
-     int num_threads = (post > kThreadsPerBlock) ? kThreadsPerBlock
-                                                 : (((post + 31) >> 5) << 5);
-     int num_blocks = (midd < kMaximumBlocks) ? midd : kMaximumBlocks;
-     ColumnWiseKernel<<<num_blocks, num_threads, 0, stream>>>(
-         op, x, y, out, batch_size, midd, post);
-   } else if (post == 1) {
-     PADDLE_THROW("Not implemented.");
-   } else {
-     PADDLE_THROW("Not implemented.");
-   }
- }

nvinfer1::Dims ElementWisePlugin::getOutputDimensions(
int index, const nvinfer1::Dims *input_dims, int num_inputs) {
PADDLE_ENFORCE_EQ(index, 0);
PADDLE_ENFORCE_EQ(num_inputs, 2);
PADDLE_ENFORCE_NOT_NULL(input_dims);
@@ -119,25 +96,137 @@ int ElementWisePlugin::initialize() {
return 0;
}

int ElementWisePlugin::enqueue(int batch_size, const void *const *inputs,
void **outputs, void *workspace,
cudaStream_t stream) {
const float *x = reinterpret_cast<const float *>(inputs[0]);
const float *y = reinterpret_cast<const float *>(inputs[1]);
float *out = reinterpret_cast<float *>(outputs[0]);
- if (type_ == "add") {
-   details::ElementWise(details::Add<float>(), x, y, out, batch_size,
-                        prev_size_, midd_size_, post_size_, stream);
- } else if (type_ == "mul") {
-   details::ElementWise(details::Mul<float>(), x, y, out, batch_size,
-                        prev_size_, midd_size_, post_size_, stream);
- } else {
-   PADDLE_THROW("Not implemented.");
- }
int num = batch_size * prev_size_ * midd_size_ * post_size_;
int thread = 256;
int block = (num + thread - 1) / thread;
if (type_ == "add") {
elementwise_kernel<<<block, thread, 0, stream>>>(
num, x, y, out, prev_size_, batch_size * midd_size_, post_size_,
details::Add<float>());
} else if (type_ == "mul") {
elementwise_kernel<<<block, thread, 0, stream>>>(
num, x, y, out, prev_size_, batch_size * midd_size_, post_size_,
details::Mul<float>());
} else {
PADDLE_THROW(platform::errors::Fatal(
"The %s type elementwise is not implemented in trt plugin.", type_));
}
return cudaGetLastError() != cudaSuccess;
}
// Dynamic Plugin below.
#if IS_TRT_VERSION_GE(6000)
int ElementwisePluginDynamic::initialize() { return 0; }
size_t ElementwisePluginDynamic::getSerializationSize() const { return 0; }
void ElementwisePluginDynamic::serialize(void *buffer) const {}
nvinfer1::DimsExprs ElementwisePluginDynamic::getOutputDimensions(
int output_index, const nvinfer1::DimsExprs *inputs, int nb_inputs,
nvinfer1::IExprBuilder &expr_builder) {
return inputs[0];
}
bool ElementwisePluginDynamic::supportsFormatCombination(
int pos, const nvinfer1::PluginTensorDesc *in_out, int nb_inputs,
int nb_outputs) {
PADDLE_ENFORCE_NOT_NULL(
in_out, platform::errors::InvalidArgument(
"The input of swish plugin shoule not be nullptr."));
PADDLE_ENFORCE_LT(
pos, nb_inputs + nb_outputs,
platform::errors::InvalidArgument("The pos(%d) should be less than the "
"num(%d) of the input and the output.",
pos, nb_inputs + nb_outputs));
(in_out && pos < (nb_inputs + nb_outputs));
const nvinfer1::PluginTensorDesc &in = in_out[pos];
if (pos == 0) {
return (in.type == nvinfer1::DataType::kFLOAT) &&
(in.format == nvinfer1::TensorFormat::kLINEAR);
}
const nvinfer1::PluginTensorDesc &prev = in_out[pos - 1];
// output
return in.type == prev.type && in.format == prev.format;
}
nvinfer1::DataType ElementwisePluginDynamic::getOutputDataType(
int index, const nvinfer1::DataType *input_types, int nb_inputs) const {
PADDLE_ENFORCE_EQ(index, 0,
platform::errors::InvalidArgument(
"The Elementwise Plugin only has one input, so the "
"index value should be 0, but get %d.",
index));
return input_types[0];
}
int ElementwisePluginDynamic::enqueue(
const nvinfer1::PluginTensorDesc *input_desc,
const nvinfer1::PluginTensorDesc *output_desc, const void *const *inputs,
void *const *outputs, void *workspace, cudaStream_t stream) {
auto x_dims = input_desc[0].dims;
auto y_dims = input_desc[1].dims;
int axis = (axis_ == -1) ? x_dims.nbDims - y_dims.nbDims : axis_;
int batch_size = x_dims.d[0];
int prev_size = 1;
int midd_size = 1;
int post_size = 1;
for (int i = 0; i < axis; ++i) {
prev_size *= x_dims.d[i];
}
int trimed_nb_dims = y_dims.nbDims;
for (; trimed_nb_dims > 0; --trimed_nb_dims) {
if (y_dims.d[trimed_nb_dims - 1] != 1) {
break;
}
}
for (int i = 0; i < trimed_nb_dims; ++i) {
PADDLE_ENFORCE_EQ(x_dims.d[i + axis], y_dims.d[i],
platform::errors::InvalidArgument(
"Broadcast dimension mismatch found in trt "
"elementwise plugin's x and y input."));
midd_size *= y_dims.d[i];
}
for (int i = axis + trimed_nb_dims; i < x_dims.nbDims; ++i) {
post_size *= x_dims.d[i];
}
const float *x = static_cast<const float *>(inputs[0]);
const float *y = static_cast<const float *>(inputs[1]);
float *out = static_cast<float *>(outputs[0]);
int num = prev_size * midd_size * post_size;
int thread = 256;
int block = (num + thread - 1) / thread;
if (type_ == "add") { if (type_ == "add") {
details::ElementWise(details::Add<float>(), x, y, out, batch_size, elementwise_kernel<<<block, thread, 0, stream>>>(
prev_size_, midd_size_, post_size_, stream); num, x, y, out, prev_size, midd_size, post_size, details::Add<float>());
} else if (type_ == "mul") { } else if (type_ == "mul") {
details::ElementWise(details::Mul<float>(), x, y, out, batch_size, elementwise_kernel<<<block, thread, 0, stream>>>(
prev_size_, midd_size_, post_size_, stream); num, x, y, out, prev_size, midd_size, post_size, details::Mul<float>());
} else { } else {
PADDLE_THROW("Not implemented."); PADDLE_THROW("Not implemented.");
} }
return cudaGetLastError() != cudaSuccess; return cudaGetLastError() != cudaSuccess;
} }
#endif
}  // namespace plugin
}  // namespace tensorrt
......
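A note on the indexing used by elementwise_kernel above (illustrative): x is treated as a flat array of prev * n * post elements, where prev is the product of the dimensions before the broadcast axis, n the product of Y's trimmed dimensions and post the product of the trailing dimensions; element i of x is paired with y[(i / post) % n]. For example, x of shape (2, 3, 4, 5) with y of shape (3, 4) and axis 1 gives prev = 2, n = 12, post = 5. A CPU reference of the same pairing (not the CUDA code itself):

#include <functional>

// Op is e.g. std::plus<float> or std::multiplies<float>.
template <typename Op>
void ElementwiseBroadcastRef(const float* x, const float* y, float* out,
                             int prev, int n, int post, Op op) {
  const int total = prev * n * post;
  for (int i = 0; i < total; ++i) {
    const int y_idx = i / post % n;  // same index formula as the kernel
    out[i] = op(x[i], y[y_idx]);
  }
}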
@@ -25,8 +25,8 @@ namespace plugin {
class ElementWisePlugin : public PluginTensorRT {
 public:
ElementWisePlugin(std::string type, nvinfer1::Dims const& dims_x,
nvinfer1::Dims const& dims_y, int axis)
: type_(type),
dims_x_(dims_x),
dims_y_(dims_y),
@@ -35,9 +35,9 @@ class ElementWisePlugin : public PluginTensorRT {
midd_size_(1),
post_size_(1) {}

ElementWisePlugin(void const* serial_data, size_t serial_length) {
deserializeBase(serial_data, serial_length);
const char* elementwise_type;
DeserializeValue(&serial_data, &serial_length, &elementwise_type);
type_ = std::string(elementwise_type);
DeserializeValue(&serial_data, &serial_length, &axis_);
@@ -45,22 +45,22 @@ class ElementWisePlugin : public PluginTensorRT {
DeserializeValue(&serial_data, &serial_length, &dims_y_);
}

ElementWisePlugin* clone() const override {
// return new ElementWisePlugin(dims_x_, dims_y_, axis_);
return nullptr;
}

const char* getPluginType() const override { return "elementwise_plugin"; }

nvinfer1::Dims getOutputDimensions(int index,
const nvinfer1::Dims* input_dims,
int num_inputs) override;
int initialize() override;

// execute the layer
int enqueue(int batch_size, const void* const* inputs, void** outputs,
void* workspace, cudaStream_t stream);

protected:
size_t getSerializationSize() override {
@@ -69,7 +69,7 @@ class ElementWisePlugin : public PluginTensorRT {
getBaseSerializationSize();
}

void serialize(void* buffer) override {
SerializeValue(&buffer, getPluginType());
serializeBase(buffer);
SerializeValue(&buffer, type_.c_str());
@@ -87,6 +87,59 @@ class ElementWisePlugin : public PluginTensorRT {
int post_size_;
};
#if IS_TRT_VERSION_GE(6000)
class ElementwisePluginDynamic : public DynamicPluginTensorRT {
public:
explicit ElementwisePluginDynamic(const std::string& type, int axis)
: type_(type), axis_(axis) {}
ElementwisePluginDynamic(void const* serialData, size_t serialLength) {}
nvinfer1::IPluginV2DynamicExt* clone() const override {
return new ElementwisePluginDynamic(type_, axis_);
}
const char* getPluginType() const override { return "elementwise_plugin"; }
int getNbOutputs() const override { return 1; }
int initialize() override;
size_t getSerializationSize() const override;
void serialize(void* buffer) const override;
nvinfer1::DimsExprs getOutputDimensions(
int output_index, const nvinfer1::DimsExprs* inputs, int nb_inputs,
nvinfer1::IExprBuilder& expr_builder) override;
bool supportsFormatCombination(int pos,
const nvinfer1::PluginTensorDesc* inOut,
int nbInputs, int nbOutputs) override;
void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in,
int nbInputs,
const nvinfer1::DynamicPluginTensorDesc* out,
int nbOutputs) override {}
size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs,
int nbInputs,
const nvinfer1::PluginTensorDesc* outputs,
int nbOutputs) const override {
return 0;
}
int enqueue(const nvinfer1::PluginTensorDesc* inputDesc,
const nvinfer1::PluginTensorDesc* outputDesc,
const void* const* inputs, void* const* outputs, void* workspace,
cudaStream_t stream) override;
nvinfer1::DataType getOutputDataType(int index,
const nvinfer1::DataType* inputTypes,
int nbInputs) const override;
void destroy() override { delete this; }
private:
std::string type_;
int axis_;
};
#endif
}  // namespace plugin
}  // namespace tensorrt
}  // namespace inference
......
@@ -194,8 +194,9 @@ int GeluPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* input_desc,
if (input_type == nvinfer1::DataType::kFLOAT) {
const float* input = static_cast<const float*>(inputs[0]);
float* output = static_cast<float*>(outputs[0]);
- gelu_kernel<float, block_size><<<grid_size, block_size, 0, stream>>>(
-     kA, num, input, output);
no_exact_gelu_kernel<float,
block_size><<<grid_size, block_size, 0, stream>>>(
kAT, kBT, kCT, num, input, output);
} else if (input_type == nvinfer1::DataType::kHALF) {
#ifdef SUPPORTS_CUDA_FP16
const half* input = static_cast<const half*>(inputs[0]);
......
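The float path above now calls no_exact_gelu_kernel, i.e. the tanh approximation of GELU, instead of the erf-based gelu_kernel. Assuming kAT, kBT and kCT carry the usual constants of that approximation (they are defined elsewhere in this plugin, not in this hunk), the function evaluated per element is roughly:

#include <cmath>

// Tanh approximation of GELU (a sketch of what the kernel computes):
// gelu(x) ~= 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
float GeluTanhApprox(float x) {
  const float kAlpha = 0.7978845608028654f;  // sqrt(2/pi)
  const float kBeta = 0.044715f;
  return 0.5f * x * (1.0f + std::tanh(kAlpha * (x + kBeta * x * x * x)));
}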
@@ -21,18 +21,18 @@ namespace inference {
namespace tensorrt {
namespace plugin {

PoolPlugin *CreatePoolPluginDeserialize(const void *buffer, size_t length) {
return new PoolPlugin(buffer, length);
}
REGISTER_TRT_PLUGIN("pool_plugin", CreatePoolPluginDeserialize);

nvinfer1::Dims PoolPlugin::getOutputDimensions(int index,
const nvinfer1::Dims *inputDims,
int nbInputs) {
assert(nbInputs == 1);
assert(index == 0);
assert(inputDims[0].nbDims == 3);
nvinfer1::Dims const &input_dims = inputDims[0];

nvinfer1::Dims output_dims = input_dims;
@@ -41,12 +41,12 @@ nvinfer1::Dims PoolPlugin::getOutputDimensions(int index,
return output_dims;
}

int PoolPlugin::enqueue(int batchSize, const void *const *inputs,
void **outputs, void *workspace, cudaStream_t stream) {
auto const &input_dims = this->getInputDims(0);
int input_size = 0;
float const *idata = reinterpret_cast<float const *>(inputs[0]);
float **odatas = reinterpret_cast<float **>(outputs);

std::vector<int> input_shape = input_shape_;
std::vector<int> output_shape = output_shape_;
@@ -72,6 +72,153 @@ int PoolPlugin::enqueue(int batchSize, const void* const* inputs,
return cudaGetLastError() != cudaSuccess;
}
// Dynamic Plugin below.
#if IS_TRT_VERSION_GE(6000)
size_t PoolPluginDynamic::getSerializationSize() const { return 0; }
void PoolPluginDynamic::serialize(void *buffer) const {}
nvinfer1::DimsExprs PoolPluginDynamic::getOutputDimensions(
int output_index, const nvinfer1::DimsExprs *inputs, int nb_inputs,
nvinfer1::IExprBuilder &expr_builder) {
PADDLE_ENFORCE_EQ(nb_inputs, 1,
platform::errors::InvalidArgument(
"The Split plugin should be only one input."));
PADDLE_ENFORCE_EQ(
inputs[0].d[1]->isConstant(), true,
platform::errors::InvalidArgument("The channel dimension should be "
"static, but we found it's dynamic."));
nvinfer1::DimsExprs output(inputs[0]);
if (is_global_) {
output.d[2] = expr_builder.constant(1);
output.d[3] = expr_builder.constant(1);
return output;
}
if (adaptive_) {
output.d[2] = expr_builder.constant(ksize_[0]);
output.d[3] = expr_builder.constant(ksize_[1]);
return output;
}
auto stri_0 = expr_builder.constant(strides_[0]);
auto stri_1 = expr_builder.constant(strides_[1]);
auto tmp1_0 =
expr_builder.constant((-ksize_[0] + 2 * paddings_[0]) / strides_[0] + 1);
auto tmp1_1 =
expr_builder.constant((-ksize_[1] + 2 * paddings_[1]) / strides_[1] + 1);
auto tmp2_0 = expr_builder.constant(
(-ksize_[0] + 2 * paddings_[0] + strides_[0] - 1) / strides_[0] + 1);
auto tmp2_1 = expr_builder.constant(
(-ksize_[1] + 2 * paddings_[1] + strides_[1] - 1) / strides_[1] + 1);
auto *a_d = expr_builder.operation(nvinfer1::DimensionOperation::kCEIL_DIV,
*inputs[0].d[2], *stri_0);
auto *b_d = expr_builder.operation(nvinfer1::DimensionOperation::kCEIL_DIV,
*inputs[0].d[3], *stri_1);
if (!ceil_mode_) {
output.d[2] = expr_builder.operation(nvinfer1::DimensionOperation::kSUM,
*a_d, *tmp1_0);
output.d[3] = expr_builder.operation(nvinfer1::DimensionOperation::kSUM,
*b_d, *tmp1_1);
} else {
output.d[2] = expr_builder.operation(nvinfer1::DimensionOperation::kSUM,
*a_d, *tmp2_0);
output.d[3] = expr_builder.operation(nvinfer1::DimensionOperation::kSUM,
*b_d, *tmp2_1);
}
return output;
}
bool PoolPluginDynamic::supportsFormatCombination(
int pos, const nvinfer1::PluginTensorDesc *in_out, int nb_inputs,
int nb_outputs) {
PADDLE_ENFORCE_NOT_NULL(
in_out, platform::errors::InvalidArgument(
"The input of swish plugin shoule not be nullptr."));
PADDLE_ENFORCE_LT(
pos, nb_inputs + nb_outputs,
platform::errors::InvalidArgument("The pos(%d) should be less than the "
"num(%d) of the input and the output.",
pos, nb_inputs + nb_outputs));
(in_out && pos < (nb_inputs + nb_outputs));
return ((in_out[pos].type == nvinfer1::DataType::kFLOAT) &&
in_out[pos].format == nvinfer1::PluginFormat::kNCHW);
}
nvinfer1::DataType PoolPluginDynamic::getOutputDataType(
int index, const nvinfer1::DataType *input_types, int nb_inputs) const {
PADDLE_ENFORCE_EQ(index, 0, platform::errors::InvalidArgument(
"The Pool Plugin only has one input, so the "
"index value should be 0, but get %d.",
index));
PADDLE_ENFORCE_EQ((input_types[0] == nvinfer1::DataType::kFLOAT), true,
platform::errors::InvalidArgument(
"The input type should be half or float"));
return input_types[0];
}
int PoolPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc *input_desc,
const nvinfer1::PluginTensorDesc *output_desc,
const void *const *inputs, void *const *outputs,
void *workspace, cudaStream_t stream) {
auto input_dims = input_desc[0].dims;
int n = input_dims.d[0];
int c = input_dims.d[1];
int h = input_dims.d[2];
int w = input_dims.d[3];
const float *input = static_cast<const float *>(inputs[0]);
float *output = static_cast<float *>(outputs[0]);
std::vector<int> input_shape, output_shape;
for (int i = 0; i < input_dims.nbDims; i++)
input_shape.push_back(input_dims.d[i]);
output_shape = input_shape;
std::vector<int> ksize = ksize_;
std::vector<int> paddings = paddings_;
if (is_global_) {
ksize[0] = h;
ksize[1] = w;
paddings[0] = 0;
paddings[1] = 0;
output_shape[2] = 1;
output_shape[3] = 1;
} else {
auto data_dim = CalcOutputSize({h, w}, ceil_mode_, adaptive_, ksize_,
strides_, paddings_);
output_shape[2] = data_dim[0];
output_shape[3] = data_dim[1];
}
if (pool_type_ == "max") {
paddle::operators::math::MaxPool<float> pool_process;
paddle::operators::math::Pool2dDirectCUDAFunctor<
paddle::operators::math::MaxPool<float>, float>
pool2d_forward;
pool2d_forward(input, input_shape, output_shape, ksize, strides_, paddings,
pool_process, true, adaptive_, output, stream);
} else if (pool_type_ == "avg") {
paddle::operators::math::AvgPool<float> pool_process;
paddle::operators::math::Pool2dDirectCUDAFunctor<
paddle::operators::math::AvgPool<float>, float>
pool2d_forward;
pool2d_forward(input, input_shape, output_shape, ksize, strides_, paddings,
pool_process, true, adaptive_, output, stream);
}
return cudaGetLastError() != cudaSuccess;
}
#endif
}  // namespace plugin
}  // namespace tensorrt
}  // namespace inference
......
@@ -24,6 +24,37 @@ namespace inference {
namespace tensorrt {
namespace plugin {
static std::vector<int> CalcOutputSize(const std::vector<int>& input_shape,
const bool& ceil_mode,
const bool& adaptive,
const std::vector<int>& ksize,
const std::vector<int>& strides,
const std::vector<int>& paddings) {
std::vector<int> output_shape = input_shape;
if (adaptive) {
output_shape[0] = ksize[0];
output_shape[1] = ksize[1];
} else {
int output_h, output_w;
if (!ceil_mode) {
output_h = (input_shape[0] - ksize[0] + 2 * paddings[0]) / strides[0] + 1;
output_w = (input_shape[1] - ksize[1] + 2 * paddings[1]) / strides[1] + 1;
} else {
output_h =
(input_shape[0] - ksize[0] + 2 * paddings[0] + strides[0] - 1) /
strides[0] +
1;
output_w =
(input_shape[1] - ksize[1] + 2 * paddings[1] + strides[1] - 1) /
strides[1] +
1;
}
output_shape[0] = output_h;
output_shape[1] = output_w;
}
return output_shape;
}
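CalcOutputSize above is the standard pooling output-size arithmetic: floor mode uses (in - k + 2 * p) / s + 1, ceil mode adds s - 1 to the numerator. For example, in = 8, k = 3, p = 0, s = 2 gives 3 in floor mode and 4 in ceil mode. A small compile-time check of those two numbers (illustrative only):

constexpr int PoolOut(int in, int k, int p, int s, bool ceil_mode) {
  // ceil mode rounds the division up by adding s - 1 to the numerator.
  return ceil_mode ? (in - k + 2 * p + s - 1) / s + 1 : (in - k + 2 * p) / s + 1;
}
static_assert(PoolOut(8, 3, 0, 2, false) == 3, "floor mode");
static_assert(PoolOut(8, 3, 0, 2, true) == 4, "ceil mode");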
class PoolPlugin : public PluginTensorRT {
 protected:
size_t getSerializationSize() override {
@@ -36,7 +67,7 @@ class PoolPlugin : public PluginTensorRT {
// TRT will call this func when we need to serialize the configuration of
// tensorrt.
void serialize(void* buffer) override {
SerializeValue(&buffer, getPluginType());
serializeBase(buffer);
SerializeValue(&buffer, ceil_mode_);
@@ -66,34 +97,16 @@ class PoolPlugin : public PluginTensorRT {
paddings_(paddings),
input_shape_(input_shape) {
output_shape_ = input_shape_;
- if (adaptive_) {
-   output_shape_[1] = ksize[0];
-   output_shape_[2] = ksize[1];
- } else {
-   int output_h, output_w;
-   if (!ceil_mode_) {
-     output_h =
-         (input_shape[1] - ksize_[0] + 2 * paddings_[0]) / strides_[0] + 1;
-     output_w =
-         (input_shape[2] - ksize_[1] + 2 * paddings_[1]) / strides_[1] + 1;
-   } else {
-     output_h =
-         (input_shape[1] - ksize_[0] + 2 * paddings_[0] + strides_[0] - 1) /
-             strides_[0] +
-         1;
-     output_w =
-         (input_shape[2] - ksize_[1] + 2 * paddings_[1] + strides_[1] - 1) /
-             strides_[1] +
-         1;
-   }
-   output_shape_[1] = output_h;
-   output_shape_[2] = output_w;
- }
std::vector<int> output_shape =
CalcOutputSize({input_shape_[1], input_shape_[2]}, ceil_mode_,
adaptive_, ksize_, strides_, paddings_);
output_shape_[1] = output_shape[0];
output_shape_[2] = output_shape[1];
}
// It was used for tensorrt deserialization.
// It should not be called by users.
PoolPlugin(void const* serialData, size_t serialLength) {
deserializeBase(serialData, serialLength);
DeserializeValue(&serialData, &serialLength, &ceil_mode_);
DeserializeValue(&serialData, &serialLength, &pool_type_);
@@ -105,18 +118,18 @@ class PoolPlugin : public PluginTensorRT {
DeserializeValue(&serialData, &serialLength, &output_shape_);
}

PoolPlugin* clone() const override {
return new PoolPlugin(ceil_mode_, pool_type_, adaptive_, ksize_, strides_,
paddings_, input_shape_);
}

const char* getPluginType() const override { return "pool_plugin"; }
int getNbOutputs() const override { return 1; }
nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs,
int nbInputDims) override;
int initialize() override { return 0; }
int enqueue(int batchSize, const void* const* inputs, void** outputs,
void* workspace, cudaStream_t stream) override;

 private:
bool ceil_mode_;
@@ -129,6 +142,88 @@ class PoolPlugin : public PluginTensorRT {
std::vector<int> output_shape_;
};
#if IS_TRT_VERSION_GE(6000)
class PoolPluginDynamic : public DynamicPluginTensorRT {
public:
PoolPluginDynamic() {}
PoolPluginDynamic(const bool& ceil_mode, const std::string& pool_type,
const bool& adaptive, const std::vector<int>& ksize,
const std::vector<int>& strides,
const std::vector<int>& paddings, const bool& is_global)
: ceil_mode_(ceil_mode),
pool_type_(pool_type),
adaptive_(adaptive),
ksize_(ksize),
strides_(strides),
paddings_(paddings),
is_global_(is_global) {}
PoolPluginDynamic(void const* serialData, size_t serialLength) {
deserializeBase(serialData, serialLength);
DeserializeValue(&serialData, &serialLength, &ceil_mode_);
const char* pool_type;
DeserializeValue(&serialData, &serialLength, &pool_type);
pool_type_ = std::string(pool_type);
DeserializeValue(&serialData, &serialLength, &adaptive_);
DeserializeValue(&serialData, &serialLength, &ksize_);
DeserializeValue(&serialData, &serialLength, &strides_);
DeserializeValue(&serialData, &serialLength, &paddings_);
DeserializeValue(&serialData, &serialLength, &is_global_);
}
~PoolPluginDynamic() {}
nvinfer1::IPluginV2DynamicExt* clone() const override {
return new PoolPluginDynamic(ceil_mode_, pool_type_, adaptive_, ksize_,
strides_, paddings_, is_global_);
}
const char* getPluginType() const override { return "pool_plugin"; }
int getNbOutputs() const override { return 1; }
int initialize() override { return 0; }
size_t getSerializationSize() const override;
void serialize(void* buffer) const override;
nvinfer1::DimsExprs getOutputDimensions(
int output_index, const nvinfer1::DimsExprs* inputs, int nb_inputs,
nvinfer1::IExprBuilder& expr_builder) override;
bool supportsFormatCombination(int pos,
const nvinfer1::PluginTensorDesc* inOut,
int nbInputs, int nbOutputs) override;
void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in,
int nbInputs,
const nvinfer1::DynamicPluginTensorDesc* out,
int nbOutputs) override {}
size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs,
int nbInputs,
const nvinfer1::PluginTensorDesc* outputs,
int nbOutputs) const override {
return 0;
}
int enqueue(const nvinfer1::PluginTensorDesc* inputDesc,
const nvinfer1::PluginTensorDesc* outputDesc,
const void* const* inputs, void* const* outputs, void* workspace,
cudaStream_t stream) override;
nvinfer1::DataType getOutputDataType(int index,
const nvinfer1::DataType* inputTypes,
int nbInputs) const override;
void destroy() override { delete this; }
private:
bool ceil_mode_;
std::string pool_type_;
bool adaptive_;
std::vector<int> ksize_;
std::vector<int> strides_;
std::vector<int> paddings_;
bool is_global_;
};
#endif
}  // namespace plugin
}  // namespace tensorrt
}  // namespace inference
......
@@ -77,6 +77,84 @@ int PReluPlugin::enqueue(int batch_size, const void *const *inputs,
return cudaGetLastError() != cudaSuccess;
}
#if IS_TRT_VERSION_GE(6000)
int PReluPluginDynamic::initialize() {
cudaMalloc(&p_gpu_weight_, sizeof(float) * weight_.size());
cudaMemcpy(p_gpu_weight_, weight_.data(), weight_.size() * sizeof(float),
cudaMemcpyHostToDevice);
return 0;
}
size_t PReluPluginDynamic::getSerializationSize() const { return 0; }
void PReluPluginDynamic::serialize(void *buffer) const {}
nvinfer1::DimsExprs PReluPluginDynamic::getOutputDimensions(
int output_index, const nvinfer1::DimsExprs *inputs, int nb_inputs,
nvinfer1::IExprBuilder &expr_builder) {
return inputs[0];
}
bool PReluPluginDynamic::supportsFormatCombination(
int pos, const nvinfer1::PluginTensorDesc *in_out, int nb_inputs,
int nb_outputs) {
PADDLE_ENFORCE_NOT_NULL(
in_out, platform::errors::InvalidArgument(
"The input of swish plugin shoule not be nullptr."));
PADDLE_ENFORCE_LT(
pos, nb_inputs + nb_outputs,
platform::errors::InvalidArgument("The pos(%d) should be less than the "
"num(%d) of the input and the output.",
pos, nb_inputs + nb_outputs));
(in_out && pos < (nb_inputs + nb_outputs));
return ((in_out[pos].type == nvinfer1::DataType::kFLOAT) &&
in_out[pos].format == nvinfer1::PluginFormat::kNCHW);
}
nvinfer1::DataType PReluPluginDynamic::getOutputDataType(
int index, const nvinfer1::DataType *input_types, int nb_inputs) const {
PADDLE_ENFORCE_EQ(index, 0, platform::errors::InvalidArgument(
"The PRelu Plugin only has one input, so the "
"index value should be 0, but get %d.",
index));
PADDLE_ENFORCE_EQ((input_types[0] == nvinfer1::DataType::kFLOAT), true,
platform::errors::InvalidArgument(
"The input type should be half or float"));
return input_types[0];
}
int PReluPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc *input_desc,
const nvinfer1::PluginTensorDesc *output_desc,
const void *const *inputs, void *const *outputs,
void *workspace, cudaStream_t stream) {
auto input_dims = input_desc[0].dims;
const float *alpha = p_gpu_weight_;
const float *input = static_cast<const float *>(inputs[0]);
float *output = static_cast<float *>(outputs[0]);
std::vector<int> input_shape;
for (int i = 0; i < input_dims.nbDims; i++) {
input_shape.push_back(input_dims.d[i]);
}
if (mode_ == "channel") {
operators::math::PreluChannelWiseDirectCUDAFunctor<float>
prelu_channel_wise;
prelu_channel_wise(stream, input, alpha, output, input_shape);
} else if (mode_ == "element") {
operators::math::PreluElementWiseDirectCUDAFunctor<float>
prelu_element_wise;
prelu_element_wise(stream, input, alpha, output, input_shape);
} else {
operators::math::PreluScalarDirectCUDAFunctor<float> prelu_scalar;
prelu_scalar(stream, input, alpha, output, input_shape);
}
return cudaGetLastError() != cudaSuccess;
}
#endif
}  // namespace plugin
}  // namespace tensorrt
}  // namespace inference
......
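The dynamic PRelu plugin above dispatches on mode_ ("channel", "element", or scalar) only to choose how alpha is broadcast over the input; the activation itself is the same in all three cases. A scalar reference (illustrative only):

// prelu(x) = x           if x > 0
//          = alpha * x   otherwise
float PRelu(float x, float alpha) { return x > 0.0f ? x : alpha * x; }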
@@ -30,7 +30,7 @@ namespace plugin {
class PReluPlugin : public PluginTensorRT {
std::vector<float> weight_;
float* p_gpu_weight_;
std::string mode_;

 protected:
@@ -42,7 +42,7 @@ class PReluPlugin : public PluginTensorRT {
// TRT will call this func when we need to serialize the configuration of
// tensorrt.
// It should not be called by users.
void serialize(void* buffer) override {
SerializeValue(&buffer, getPluginType());
serializeBase(buffer);
SerializeValue(&buffer, weight_);
@@ -50,8 +50,8 @@ class PReluPlugin : public PluginTensorRT {
}

 public:
PReluPlugin(const float* weight, const int weight_num,
std::string const& mode)
: mode_(mode) {
weight_.resize(weight_num);
std::copy(weight, weight + weight_num, weight_.data());
@@ -59,28 +59,96 @@ class PReluPlugin : public PluginTensorRT {
// It was used for tensorrt deserialization.
// It should not be called by users.
PReluPlugin(void const* serialData, size_t serialLength) {
deserializeBase(serialData, serialLength);
DeserializeValue(&serialData, &serialLength, &weight_);
const char* prelu_mode;
DeserializeValue(&serialData, &serialLength, &prelu_mode);
mode_ = std::string(prelu_mode);
}

~PReluPlugin() { cudaFree(p_gpu_weight_); }
int initialize() override;

PReluPlugin* clone() const override {
return new PReluPlugin(weight_.data(), weight_.size(), mode_);
}

const char* getPluginType() const override { return "prelu_plugin"; }
int getNbOutputs() const override { return 1; }
nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs,
int nbInputDims) override;
int enqueue(int batchSize, const void* const* inputs, void** outputs,
void* workspace, cudaStream_t stream) override;
};
#if IS_TRT_VERSION_GE(6000)
class PReluPluginDynamic : public DynamicPluginTensorRT {
public:
PReluPluginDynamic(const float* weight, const int weight_num,
std::string const& mode)
: mode_(mode) {
weight_.resize(weight_num);
std::copy(weight, weight + weight_num, weight_.data());
}
// It was used for tensorrt deserialization.
// It should not be called by users.
PReluPluginDynamic(void const* serialData, size_t serialLength) {
deserializeBase(serialData, serialLength);
DeserializeValue(&serialData, &serialLength, &weight_);
const char* prelu_mode;
DeserializeValue(&serialData, &serialLength, &prelu_mode);
mode_ = std::string(prelu_mode);
}
~PReluPluginDynamic() { cudaFree(p_gpu_weight_); }
nvinfer1::IPluginV2DynamicExt* clone() const override {
return new PReluPluginDynamic(weight_.data(), weight_.size(), mode_);
}
const char* getPluginType() const override { return "prelu_plugin"; }
int getNbOutputs() const override { return 1; }
int initialize() override;
size_t getSerializationSize() const override;
void serialize(void* buffer) const override;
nvinfer1::DimsExprs getOutputDimensions(
int output_index, const nvinfer1::DimsExprs* inputs, int nb_inputs,
nvinfer1::IExprBuilder& expr_builder) override;
bool supportsFormatCombination(int pos,
const nvinfer1::PluginTensorDesc* inOut,
int nbInputs, int nbOutputs) override;
void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in,
int nbInputs,
const nvinfer1::DynamicPluginTensorDesc* out,
int nbOutputs) override {}
size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs,
int nbInputs,
const nvinfer1::PluginTensorDesc* outputs,
int nbOutputs) const override {
return 0;
}
int enqueue(const nvinfer1::PluginTensorDesc* inputDesc,
const nvinfer1::PluginTensorDesc* outputDesc,
const void* const* inputs, void* const* outputs, void* workspace,
cudaStream_t stream) override;
nvinfer1::DataType getOutputDataType(int index,
const nvinfer1::DataType* inputTypes,
int nbInputs) const override;
void destroy() override { delete this; }
private:
std::vector<float> weight_;
float* p_gpu_weight_;
std::string mode_;
};
#endif
}  // namespace plugin
}  // namespace tensorrt
}  // namespace inference
......
@@ -373,13 +373,18 @@ if(WITH_GPU AND TENSORRT_FOUND)
EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
ARGS --infer_model=${TRT_MODEL_QUANT_RESNET_DIR})
set(TEST_TRT_DYNAMIC_MODEL2 "${TRT_MODEL_INSTALL_DIR}/complex_model_dynamic")
if (NOT EXISTS ${TEST_TRT_DYNAMIC_MODEL2})
inference_download_and_uncompress(${TEST_TRT_DYNAMIC_MODEL2} ${INFERENCE_URL}/tensorrt_test "complex_model_dynamic2.tar.gz")
endif()
set(TEST_TRT_DYNAMIC_MODEL "${TRT_MODEL_INSTALL_DIR}/conv_bn_swish_split_gelu")
if (NOT EXISTS ${TEST_TRT_DYNAMIC_MODEL})
inference_download_and_uncompress(${TEST_TRT_DYNAMIC_MODEL} ${INFERENCE_URL}/tensorrt_test "conv_bn_swish_split_gelu.tar.gz")
endif()

inference_analysis_test(trt_dynamic_shape_test SRCS trt_dynamic_shape_test.cc
EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
- ARGS --infer_model=${TEST_TRT_DYNAMIC_MODEL})
ARGS --infer_model=${TRT_MODEL_INSTALL_DIR})

set(TEST_TRT_ERNIE_MODEL "${TRT_MODEL_INSTALL_DIR}/ernie_test")
if (NOT EXISTS ${TEST_TRT_ERNIE_MODEL})
......
@@ -22,7 +22,8 @@ namespace paddle {
namespace inference {

void TestDynamic(bool with_dynamic = true) {
- std::string model_dir = FLAGS_infer_model + "/conv_bn_swish_split_gelu";
std::string model_dir =
FLAGS_infer_model + "/conv_bn_swish_split_gelu/conv_bn_swish_split_gelu";
AnalysisConfig config;
config.EnableUseGpu(100, 0);
config.SetModel(model_dir + "/model", model_dir + "/params");
@@ -67,8 +68,69 @@ void TestDynamic(bool with_dynamic = true) {
output_t->copy_to_cpu(out_data.data());
}
void TestDynamic2() {
std::string model_dir =
FLAGS_infer_model + "/complex_model_dynamic/complex_model_dynamic2";
AnalysisConfig config;
config.EnableUseGpu(100, 0);
config.SetModel(model_dir + "/model", model_dir + "/params");
config.SwitchUseFeedFetchOps(false);
// Set the input's min, max, opt shape
int batch_size = 1;
std::map<std::string, std::vector<int>> min_input_shape = {
{"image", {1, 3, 3, 3}}, {"in1", {1, 2, 1, 1}}, {"in2", {1, 2, 1, 1}}};
std::map<std::string, std::vector<int>> max_input_shape = {
{"image", {1, 3, 10, 10}}, {"in1", {1, 2, 1, 1}}, {"in2", {1, 2, 1, 1}}};
std::map<std::string, std::vector<int>> opt_input_shape = {
{"image", {1, 3, 5, 5}}, {"in1", {1, 2, 1, 1}}, {"in2", {1, 2, 1, 1}}};
config.EnableTensorRtEngine(1 << 30, batch_size, 0,
AnalysisConfig::Precision::kFloat32, false, true);
config.SetTRTDynamicShapeInfo(min_input_shape, max_input_shape,
opt_input_shape);
auto predictor = CreatePaddlePredictor(config);
int channels = 3;
int height = 5;
int width = 5;
int input_num = channels * height * width * 1;
float *input = new float[input_num];
memset(input, 0, input_num * sizeof(float));
auto input_names = predictor->GetInputNames();
auto input_t = predictor->GetInputTensor(input_names[0]);
input_t->Reshape({batch_size, channels, height, width});
input_t->copy_from_cpu(input);
auto input_t1 = predictor->GetInputTensor(input_names[1]);
input_t1->Reshape({batch_size, 2, 1, 1});
std::vector<float> first;
for (int i = 0; i < batch_size * 2; i++) first.push_back(1.0);
input_t1->copy_from_cpu(first.data());
auto input_t2 = predictor->GetInputTensor(input_names[2]);
input_t2->Reshape({batch_size, 2, 1, 1});
input_t2->copy_from_cpu(first.data());
ASSERT_TRUE(predictor->ZeroCopyRun());
std::vector<float> out_data;
auto output_names = predictor->GetOutputNames();
auto output_t = predictor->GetOutputTensor(output_names[0]);
std::vector<int> output_shape = output_t->shape();
int out_num = std::accumulate(output_shape.begin(), output_shape.end(), 1,
std::multiplies<int>());
out_data.resize(out_num);
output_t->copy_to_cpu(out_data.data());
std::vector<float> result = {0.617728, 1.63504, 2.15771, 0.535556};
for (size_t i = 0; i < out_data.size(); i++) {
EXPECT_NEAR(result[i], out_data[i], 1e-6);
}
}
TEST(AnalysisPredictor, trt_dynamic) { TestDynamic(true); }
TEST(AnalysisPredictor, trt_static) { TestDynamic(false); }
TEST(AnalysisPredictor, trt_dynamic2) { TestDynamic2(); }
}  // namespace inference
}  // namespace paddle