Unverified commit b9e76a01, authored by Shang Zhizhou, committed by GitHub

Detect TensorRT plugin fp16 at runtime (#27933)

* remove -DSUPPORTS_CUDA_FP16 in cuda.cmake

* compile with cuda9

* add some unittest

* notest;test=coverage

* add unittest for trt plugin swish && split

* update ernie unittest

* fix some error message

* remove repeated judgement of CUDA version in mbEltwiseLayerNormOpConverter

* fix compile error when CUDA_ARCH_NAME < Pascal

* fix compile error

* update unittest timeout

* compile with cuda9

* update error msg

* fix code style

* add some comments

* add define IF_CUDA_ARCH_SUPPORT_FP16

* rename IF_CUDA_ARCH_SUPPORT_FP16 to CUDA_ARCH_FP16_SUPPORTED
Parent: c39da29d
@@ -103,19 +103,10 @@ function(select_nvcc_arch_flags out_variable)
   elseif(${CUDA_ARCH_NAME} STREQUAL "Maxwell")
     set(cuda_arch_bin "50")
   elseif(${CUDA_ARCH_NAME} STREQUAL "Pascal")
-    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} LESS 10.0)
-      add_definitions("-DSUPPORTS_CUDA_FP16")
-    endif()
     set(cuda_arch_bin "60 61")
   elseif(${CUDA_ARCH_NAME} STREQUAL "Volta")
-    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} LESS 10.0)
-      add_definitions("-DSUPPORTS_CUDA_FP16")
-    endif()
     set(cuda_arch_bin "70")
   elseif(${CUDA_ARCH_NAME} STREQUAL "Turing")
-    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} LESS 10.0)
-      add_definitions("-DSUPPORTS_CUDA_FP16")
-    endif()
     set(cuda_arch_bin "75")
   elseif(${CUDA_ARCH_NAME} STREQUAL "All")
     set(cuda_arch_bin ${paddle_known_gpu_archs})
@@ -194,6 +185,10 @@ elseif (${CMAKE_CUDA_COMPILER_VERSION} LESS 12.0) # CUDA 11.x
   set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D__STRICT_ANSI__")
 endif()
+if (NOT ${CMAKE_CUDA_COMPILER_VERSION} LESS 10.0)
+  add_definitions("-DTRT_PLUGIN_FP16_AVALIABLE")
+endif()
 add_definitions("-DCUDA_VERSION_MAJOR=\"${CUDA_VERSION_MAJOR}\"")
 add_definitions("-DCUDA_VERSION_MINOR=\"${CUDA_VERSION_MINOR}\"")
 add_definitions("-DCUDA_TOOLKIT_ROOT_DIR=\"${CUDA_TOOLKIT_ROOT_DIR}\"")
......
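With the per-architecture -DSUPPORTS_CUDA_FP16 definitions gone, the new TRT_PLUGIN_FP16_AVALIABLE macro only records that the CUDA toolkit is recent enough (>= 10.0); whether a particular GPU architecture can actually execute half math is checked in device code through the CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__) guard that appears later in this diff. A minimal CUDA sketch of that guard pattern, assuming a ">= 600" definition for the macro (the real definition lives outside the files shown here and was renamed from IF_CUDA_ARCH_SUPPORT_FP16 per the commit messages):

#include <cuda_fp16.h>

// Assumed definition, for illustration only.
#ifndef CUDA_ARCH_FP16_SUPPORTED
#define CUDA_ARCH_FP16_SUPPORTED(CUDA_ARCH) (CUDA_ARCH >= 600)
#endif

__global__ void half_add_one(const half* in, half* out, int n) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__)
  // half arithmetic is emitted only for architectures that support it, so the
  // same translation unit still compiles when older arches are in the fat binary.
  if (idx < n) out[idx] = __hadd(in[idx], __float2half(1.0f));
#endif
}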
@@ -93,11 +93,12 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter {
     if (engine_->with_dynamic_shape()) {
       if (engine_->use_oss()) {
         int output_fp16 = static_cast<int>((engine_->WithFp16() == 1) ? 1 : 0);
-        PADDLE_ENFORCE_EQ(output_fp16, 1,
-                          platform::errors::InvalidArgument(
+        PADDLE_ENFORCE_EQ(
+            output_fp16, 1,
+            platform::errors::InvalidArgument(
                 "Only Precision::KHalf(fp16) is supported when infering "
                 "ernie(bert) model with config.EnableTensorRtOSS(). "
                 "But Precision::KFloat32 is setted."));
         const std::vector<nvinfer1::PluginField> fields{
             {"bert_embeddings_layernorm_beta", bias,
              nvinfer1::PluginFieldType::kFLOAT32,
@@ -135,21 +136,23 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter {
         plugin_inputs.emplace_back(engine_->GetITensor(
             engine_->network()->getInput(2)->getName()));  // cu_seqlens,
                                                             // eval_placeholder_2
-        auto max_seqlen_tensor = engine_->GetITensor(
-            engine_->network()->getInput(3)->getName());
+        auto max_seqlen_tensor =
+            engine_->GetITensor(engine_->network()->getInput(3)->getName());
         auto* shuffle_layer = TRT_ENGINE_ADD_LAYER(
-            engine_, Shuffle, *const_cast<nvinfer1::ITensor*>(max_seqlen_tensor));
+            engine_, Shuffle,
+            *const_cast<nvinfer1::ITensor*>(max_seqlen_tensor));
         nvinfer1::Dims shape_dim;
         shape_dim.nbDims = 1;
         shape_dim.d[0] = -1;
         shuffle_layer->setReshapeDimensions(shape_dim);
-        plugin_inputs.emplace_back(shuffle_layer->getOutput(0));  // max_seqlen, eval_placeholder_3
+        plugin_inputs.emplace_back(
+            shuffle_layer->getOutput(0));  // max_seqlen, eval_placeholder_3
         auto creator = GetPluginRegistry()->getPluginCreator(
             "CustomEmbLayerNormPluginDynamic", "2");
-        auto plugin_obj =
-            creator->createPlugin("CustomEmbLayerNormPluginDynamic", plugin_ptr);
+        auto plugin_obj = creator->createPlugin(
+            "CustomEmbLayerNormPluginDynamic", plugin_ptr);
         auto plugin_layer = engine_->network()->addPluginV2(
             plugin_inputs.data(), plugin_inputs.size(), *plugin_obj);
         layer = plugin_layer;
@@ -159,12 +162,13 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter {
                                {output_name, std::string("qkv_plugin_mask")},
                                test_mode);
       } else {
-        bool use_fp16 = engine_->WithFp16();
+        bool with_fp16 =
+            engine_->WithFp16() && !engine_->disable_trt_plugin_fp16();
         float eps = BOOST_GET_CONST(float, op_desc.GetAttr("epsilon"));
         plugin::DynamicPluginTensorRT* plugin = nullptr;
         plugin = new plugin::EmbEltwiseLayernormPluginDynamic(
             input_embs, bias, scale, emb_sizes, bias_size, scale_size, hidden,
-            eps, use_fp16);
+            eps, with_fp16);
         layer = engine_->AddPluginV2(input_ids.data(), input_num, plugin);
         auto output_name = op_desc.Output("Out")[0];
         RreplenishLayerAndOutput(layer, "emb_eltwise_layernorm", {output_name},
......
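The converter-side pattern repeated in the hunks above and below replaces the old compile-time switch with a runtime decision: a plugin is built in fp16 mode only when the engine was configured for fp16 and the user did not opt out of plugin fp16. A minimal sketch of that predicate, using a stand-in struct rather than the real TensorRTEngine API:

// Stand-in type for illustration; the real calls are engine_->WithFp16() and
// engine_->disable_trt_plugin_fp16() as shown in the converters in this diff.
struct EngineFlags {
  bool with_fp16;                // precision requested when building the engine
  bool disable_trt_plugin_fp16;  // user opt-out for plugin fp16
};

// Mirrors: bool with_fp16 = engine_->WithFp16() && !engine_->disable_trt_plugin_fp16();
inline bool PluginRunsInFp16(const EngineFlags& e) {
  return e.with_fp16 && !e.disable_trt_plugin_fp16;
}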
@@ -59,7 +59,10 @@ class GeluOpConverter : public OpConverter {
     nvinfer1::ILayer* layer = nullptr;
     if (engine_->with_dynamic_shape()) {
 #if IS_TRT_VERSION_GE(6000)
-      plugin::GeluPluginDynamic* plugin = new plugin::GeluPluginDynamic();
+      bool with_fp16 =
+          engine_->WithFp16() && !engine_->disable_trt_plugin_fp16();
+      plugin::GeluPluginDynamic* plugin =
+          new plugin::GeluPluginDynamic(with_fp16);
       layer = engine_->AddPluginV2(&input, input_num, plugin);
 #else
       PADDLE_THROW(platform::errors::Fatal(
@@ -67,7 +70,9 @@ class GeluOpConverter : public OpConverter {
           "your TRT version is no less than 6.0"));
 #endif
     } else {
-      plugin::GeluPlugin* plugin = new plugin::GeluPlugin();
+      bool with_fp16 =
+          engine_->WithFp16() && !engine_->disable_trt_plugin_fp16();
+      plugin::GeluPlugin* plugin = new plugin::GeluPlugin(with_fp16);
       layer = engine_->AddPlugin(&input, input_num, plugin);
     }
     auto output_name = op_desc.Output("Out")[0];
......
@@ -87,7 +87,8 @@ class MultiheadMatMulOpConverter : public OpConverter {
           }
         };
         // [3, N, H] -> [N, 3, H]
-        auto transpose_bias_v2 = [](const float* src, float* dst, int N, int H) {
+        auto transpose_bias_v2 = [](const float* src, float* dst, int N,
+                                    int H) {
           for (int i = 0; i < 3; ++i) {
             for (int n = 0; n < N; ++n) {
               for (int h = 0; h < H; ++h) {
@@ -106,15 +107,16 @@ class MultiheadMatMulOpConverter : public OpConverter {
         std::vector<float> bias_data_tmp;
         bias_data_tmp.reserve(bias_t->numel());
-        memcpy(bias_data_tmp.data(), bias_data, bias_t->numel() * sizeof(float));
+        memcpy(bias_data_tmp.data(), bias_data,
+               bias_t->numel() * sizeof(float));
         transpose_bias_v2(bias_data_tmp.data(), bias_data, head_number,
                           head_size);
         nvinfer1::Weights bias{nvinfer1::DataType::kFLOAT,
                                static_cast<void*>(bias_data),
                                static_cast<int32_t>(bias_t->numel())};
-        auto* fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *input, n,
-                                              weight, bias);
+        auto* fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *input,
+                                              n, weight, bias);
         auto mask_tensor = engine_->GetITensor("qkv_plugin_mask");
@@ -151,15 +153,17 @@ class MultiheadMatMulOpConverter : public OpConverter {
         plugin_inputs.emplace_back(engine_->GetITensor(
             engine_->network()->getInput(2)->getName()));  // cu_seqlens,
                                                             // eval_placeholder_2
-        auto max_seqlen_tensor = engine_->GetITensor(
-            engine_->network()->getInput(3)->getName());
+        auto max_seqlen_tensor =
+            engine_->GetITensor(engine_->network()->getInput(3)->getName());
         auto* shuffle_layer = TRT_ENGINE_ADD_LAYER(
-            engine_, Shuffle, *const_cast<nvinfer1::ITensor*>(max_seqlen_tensor));
+            engine_, Shuffle,
+            *const_cast<nvinfer1::ITensor*>(max_seqlen_tensor));
         nvinfer1::Dims shape_dim;
         shape_dim.nbDims = 1;
         shape_dim.d[0] = -1;
         shuffle_layer->setReshapeDimensions(shape_dim);
-        plugin_inputs.emplace_back(shuffle_layer->getOutput(0));  // max_seqlen, eval_placeholder_3
+        plugin_inputs.emplace_back(
+            shuffle_layer->getOutput(0));  // max_seqlen, eval_placeholder_3
         auto plugin_layer = engine_->network()->addPluginV2(
             plugin_inputs.data(), plugin_inputs.size(), *plugin);
@@ -178,8 +182,8 @@ class MultiheadMatMulOpConverter : public OpConverter {
                                   static_cast<void*>(bias_data),
                                   static_cast<size_t>(bias_t->numel())};
-      auto* fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *input, n,
-                                            weight.get(), bias.get());
+      auto* fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *input,
+                                            n, weight.get(), bias.get());
       auto* fc_out = fc_layer->getOutput(0);
       // add qkv to context
       int head_size = all_head_size / head_number;
@@ -188,10 +192,11 @@ class MultiheadMatMulOpConverter : public OpConverter {
       std::vector<nvinfer1::ITensor*> plugin_inputs;
       plugin_inputs.push_back(fc_out);
       plugin_inputs.push_back(input_bias_qk);
-      bool ban_fp16 = engine_->disable_trt_plugin_fp16();
+      bool with_fp16 =
+          engine_->WithFp16() && !engine_->disable_trt_plugin_fp16();
       plugin::DynamicPluginTensorRT* plugin =
-          new plugin::QkvToContextPluginDynamic(hidden, head_number, head_size,
-                                                scale, ban_fp16);
+          new plugin::QkvToContextPluginDynamic(hidden, head_number,
                                                head_size, scale, with_fp16);
       layer = engine_->AddPluginV2(plugin_inputs.data(), 2, plugin);
     }
   } else {
......
@@ -76,8 +76,8 @@ class SkipLayerNormOpConverter : public OpConverter {
       pluginPtr->nbFields = static_cast<int>(fields.size());
       pluginPtr->fields = fields.data();
-      auto pluginObj =
-          creator->createPlugin("CustomSkipLayerNormPluginDynamic", pluginPtr);
+      auto pluginObj = creator->createPlugin(
+          "CustomSkipLayerNormPluginDynamic", pluginPtr);
       auto plugin_layer = engine_->network()->addPluginV2(
           inputs.data(), inputs.size(), *pluginObj);
@@ -85,10 +85,11 @@ class SkipLayerNormOpConverter : public OpConverter {
       layer = plugin_layer;
     } else {
       float eps = BOOST_GET_CONST(float, op_desc.GetAttr("epsilon"));
-      bool ban_fp16 = engine_->disable_trt_plugin_fp16();
+      bool with_fp16 =
+          engine_->WithFp16() && !engine_->disable_trt_plugin_fp16();
       plugin::SkipLayerNormPluginDynamic* plugin =
           new plugin::SkipLayerNormPluginDynamic(bias, scale, bias_size,
-                                                 scale_size, eps, ban_fp16);
+                                                 scale_size, eps, with_fp16);
       layer = engine_->AddPluginV2(inputs.data(), 2, plugin);
     }
   } else {
......
@@ -93,9 +93,10 @@ class SliceOpConverter : public OpConverter {
       layer = engine_->AddPluginV2(plugin_inputs.data(), plugin_inputs.size(),
                                    plugin);
     } else {
-      bool ban_fp16 = engine_->disable_trt_plugin_fp16();
+      bool with_fp16 =
+          engine_->WithFp16() && !engine_->disable_trt_plugin_fp16();
       plugin::SlicePluginDynamic* plugin =
-          new plugin::SlicePluginDynamic(starts, ends, axes, ban_fp16);
+          new plugin::SlicePluginDynamic(starts, ends, axes, with_fp16);
       layer = engine_->AddPluginV2(&input, 1, plugin);
     }
 #else
@@ -104,9 +105,10 @@ class SliceOpConverter : public OpConverter {
         "your TRT version is no less than 6.0"));
 #endif
   } else {
-    bool ban_fp16 = engine_->disable_trt_plugin_fp16();
+    bool with_fp16 =
+        engine_->WithFp16() && !engine_->disable_trt_plugin_fp16();
     plugin::SlicePlugin* plugin =
-        new plugin::SlicePlugin(starts, ends, axes, ban_fp16);
+        new plugin::SlicePlugin(starts, ends, axes, with_fp16);
     layer = engine_->AddPlugin(&input, 1, plugin);
   }
......
@@ -86,8 +86,10 @@ class SplitOpConverter : public OpConverter {
     nvinfer1::ILayer* layer = nullptr;
     if (engine_->with_dynamic_shape()) {
 #if IS_TRT_VERSION_GE(6000)
+      bool with_fp16 =
+          engine_->WithFp16() && !engine_->disable_trt_plugin_fp16();
       plugin::SplitPluginDynamic* plugin =
-          new plugin::SplitPluginDynamic(axis, output_lengths);
+          new plugin::SplitPluginDynamic(axis, output_lengths, with_fp16);
       layer = engine_->AddPluginV2(&input, input_num, plugin);
 #else
       PADDLE_THROW(platform::errors::Fatal(
@@ -95,8 +97,10 @@ class SplitOpConverter : public OpConverter {
           "your TRT version is no less than 6.0"));
 #endif
     } else {
+      bool with_fp16 =
+          engine_->WithFp16() && !engine_->disable_trt_plugin_fp16();
       plugin::SplitPlugin* plugin =
-          new plugin::SplitPlugin(axis, output_lengths);
+          new plugin::SplitPlugin(axis, output_lengths, with_fp16);
       layer = engine_->AddPlugin(&input, input_num, plugin);
     }
......
@@ -46,8 +46,10 @@ class StackOpConverter : public OpConverter {
     nvinfer1::ILayer* layer = nullptr;
     if (engine_->with_dynamic_shape()) {
 #if IS_TRT_VERSION_GE(6000)
+      bool with_fp16 =
+          engine_->WithFp16() && !engine_->disable_trt_plugin_fp16();
       plugin::StackPluginDynamic* plugin =
-          new plugin::StackPluginDynamic(axis, input_num);
+          new plugin::StackPluginDynamic(axis, input_num, with_fp16);
       layer = engine_->AddPluginV2(inputs, input_num, plugin);
       assert(layer != nullptr);
 #else
......
@@ -60,7 +60,10 @@ class SwishOpConverter : public OpConverter {
     nvinfer1::ILayer* layer = nullptr;
     if (engine_->with_dynamic_shape()) {
 #if IS_TRT_VERSION_GE(6000)
-      plugin::SwishPluginDynamic* plugin = new plugin::SwishPluginDynamic(beta);
+      bool with_fp16 =
+          engine_->WithFp16() && !engine_->disable_trt_plugin_fp16();
+      plugin::SwishPluginDynamic* plugin =
+          new plugin::SwishPluginDynamic(beta, with_fp16);
       layer = engine_->AddPluginV2(&input, input_num, plugin);
 #else
       PADDLE_THROW(platform::errors::Fatal(
@@ -68,7 +71,9 @@ class SwishOpConverter : public OpConverter {
           "your TRT version is no less than 6.0"));
 #endif
     } else {
-      plugin::SwishPlugin* plugin = new plugin::SwishPlugin(beta);
+      bool with_fp16 =
+          engine_->WithFp16() && !engine_->disable_trt_plugin_fp16();
+      plugin::SwishPlugin* plugin = new plugin::SwishPlugin(beta, with_fp16);
       layer = engine_->AddPlugin(&input, input_num, plugin);
     }
......
@@ -160,9 +160,9 @@ int EmbEltwiseLayernormPluginDynamicImpl<T>::enqueue(
 }
 template class EmbEltwiseLayernormPluginDynamicImpl<float>;
-#ifdef SUPPORTS_CUDA_FP16
+#ifdef TRT_PLUGIN_FP16_AVALIABLE
 template class EmbEltwiseLayernormPluginDynamicImpl<half>;
-#endif  // SUPPORTS_CUDA_FP16
+#endif
 int EmbEltwiseLayernormPluginDynamic::initialize() {
   impl_->initialize();
......
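The fp32 impl is always explicitly instantiated, while the half specialization is compiled only when the toolkit can handle it, so requesting fp16 on an older toolkit fails at plugin construction rather than at compile time. The pattern in isolation, as a hedged sketch (PluginImpl is a placeholder name, not the real class):

#include <cuda_fp16.h>

template <typename T>
class PluginImpl { /* kernels templated on the compute type T */ };

template class PluginImpl<float>;
#ifdef TRT_PLUGIN_FP16_AVALIABLE
template class PluginImpl<half>;  // only when built with CUDA >= 10.0
#endif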
@@ -8,7 +8,7 @@
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
 // See the License for the specific language governing permissions and
 // limitations under the License.
@@ -105,18 +105,24 @@ class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT {
         scale_size_(scale_size),
         hidden_size_(hidden_size),
         eps_(eps),
-        with_fp16_(with_fp16),
         own_host_buff_(false) {
-    if (with_fp16) {
-#ifdef SUPPORTS_CUDA_FP16
+    with_fp16_ = with_fp16;
+    if (with_fp16_) {
+#ifdef TRT_PLUGIN_FP16_AVALIABLE
+      VLOG(1) << "TRT Plugin DataType selected. EmbEltwiseLayerNorm-->fp16";
       impl_ = new EmbEltwiseLayernormPluginDynamicImpl<half>(
           embs_, bias_, scale_, emb_sizes_, bias_size_, scale_size_,
          hidden_size_, eps_);
 #else
       PADDLE_THROW(platform::errors::Fatal(
-          "Unsupported data type, current GPU doesn't support half."));
-#endif  // SUPPORTS_CUDA_FP16
+          "The Ernie(Bert) tensorRT plugin should be "
+          "complied with CUDA version >= 10.0 when running with fp16. "
+          "Please recomplie it or try to use fp32 by set "
+          "config.EnableTensorRtEngine(1 << 30, 1, 5, "
+          "AnalysisConfig::Precision::kFloat32, false, false) "));
+#endif
     } else {
+      VLOG(1) << "TRT Plugin DataType selected. EmbEltwiseLayerNorm-->fp32";
       impl_ = new EmbEltwiseLayernormPluginDynamicImpl<float>(
           embs_, bias_, scale_, emb_sizes_, bias_size_, scale_size_,
          hidden_size_, eps_);
@@ -160,14 +166,18 @@ class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT {
     DeserializeValue(&serial_data, &serial_length, &with_fp16_);
     if (with_fp16_) {
-#ifdef SUPPORTS_CUDA_FP16
+#ifdef TRT_PLUGIN_FP16_AVALIABLE
       impl_ = new EmbEltwiseLayernormPluginDynamicImpl<half>(
           embs_, bias_, scale_, emb_sizes_, bias_size_, scale_size_,
           hidden_size_, eps_);
 #else
       PADDLE_THROW(platform::errors::Fatal(
-          "Unsupported data type, current GPU doesn't support half."));
-#endif  // SUPPORTS_CUDA_FP16
+          "The Ernie(Bert) tensorRT plugin should be "
+          "complied with CUDA version >= 10.0 when running with fp16. "
+          "Please recomplie it or try to use fp32 by set "
+          "config.EnableTensorRtEngine(1 << 30, 1, 5, "
+          "AnalysisConfig::Precision::kFloat32, false, false) "));
+#endif
     } else {
       impl_ = new EmbEltwiseLayernormPluginDynamicImpl<float>(
           embs_, bias_, scale_, emb_sizes_, bias_size_, scale_size_,
@@ -283,7 +293,6 @@ class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT {
   int hidden_size_;
   float eps_;
-  bool with_fp16_;
   bool own_host_buff_{false};
   EmbEltwiseLayernormPluginDynamicImplBase* impl_{nullptr};
 };
......
@@ -17,6 +17,7 @@
 #include <vector>
 #include "paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.h"
 #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h"
+#include "paddle/fluid/platform/float16.h"
 namespace paddle {
 namespace inference {
@@ -38,14 +39,14 @@ REGISTER_TRT_PLUGIN("gelu_plugin", CreateGeluPluginDeserialize);
 bool GeluPlugin::supportsFormat(nvinfer1::DataType type,
                                 nvinfer1::PluginFormat format) const {
-#ifdef SUPPORTS_CUDA_FP16
+  if (with_fp16_) {
     return ((type == nvinfer1::DataType::kFLOAT ||
              type == nvinfer1::DataType::kHALF) &&
             (format == nvinfer1::PluginFormat::kNCHW));
-#else
+  } else {
     return ((type == nvinfer1::DataType::kFLOAT) &&
             (format == nvinfer1::PluginFormat::kNCHW));
-#endif
+  }
 }
 nvinfer1::Dims GeluPlugin::getOutputDimensions(int index,
@@ -87,6 +88,7 @@ __device__ half do_tanh<half>(half a) {
 template <typename T, unsigned TPB>
 __global__ void no_exact_gelu_kernel(const T a, const T b, const T c, int n,
                                      const T* input, T* output) {
+#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__)
   const int idx = blockIdx.x * TPB + threadIdx.x;
   if (idx < n) {
     const T in = input[idx];
@@ -94,6 +96,7 @@ __global__ void no_exact_gelu_kernel(const T a, const T b, const T c, int n,
     const T cdf = a + a * do_tanh<T>(tmp);
     output[idx] = in * cdf;
   }
+#endif
 }
 int GeluPlugin::enqueue(int batch_size, const void* const* inputs,
@@ -108,21 +111,18 @@ int GeluPlugin::enqueue(int batch_size, const void* const* inputs,
   auto type = getDataType();
   if (type == nvinfer1::DataType::kFLOAT) {
+    VLOG(1) << "TRT Plugin DataType selected. Gelu-->fp32";
     const float* input = static_cast<const float*>(inputs[0]);
     float* output = static_cast<float*>(outputs[0]);
     gelu_kernel<float, block_size><<<grid_size, block_size, 0, stream>>>(
         kA, num, input, output);
   } else if (type == nvinfer1::DataType::kHALF) {
-#ifdef SUPPORTS_CUDA_FP16
+    VLOG(1) << "TRT Plugin DataType selected. Gelu-->fp16";
     const half* input = static_cast<const half*>(inputs[0]);
     half* output = static_cast<half*>(outputs[0]);
     no_exact_gelu_kernel<half,
                          block_size><<<grid_size, block_size, 0, stream>>>(
         kAT, kBT, kCT, num, input, output);
-#else
-    PADDLE_THROW(platform::errors::Fatal(
-        "The cuda archs you specific should greater than 600."));
-#endif
   } else {
     PADDLE_THROW(platform::errors::InvalidArgument(
         "The Gelu TRT Plugin's input type should be float or half."));
@@ -155,14 +155,14 @@ bool GeluPluginDynamic::supportsFormatCombination(
   const nvinfer1::PluginTensorDesc& in = in_out[pos];
   if (pos == 0) {
-#ifdef SUPPORTS_CUDA_FP16
+    if (with_fp16_) {
       return (in.type == nvinfer1::DataType::kFLOAT ||
               in.type == nvinfer1::DataType::kHALF) &&
              (in.format == nvinfer1::TensorFormat::kLINEAR);
-#else
+    } else {
       return (in.type == nvinfer1::DataType::kFLOAT) &&
              (in.format == nvinfer1::TensorFormat::kLINEAR);
-#endif
+    }
   }
   const nvinfer1::PluginTensorDesc& prev = in_out[pos - 1];
   // output
@@ -189,21 +189,18 @@ int GeluPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* input_desc,
   auto input_type = input_desc[0].type;
   if (input_type == nvinfer1::DataType::kFLOAT) {
+    VLOG(1) << "TRT Plugin DataType selected. Gelu-->fp32";
     const float* input = static_cast<const float*>(inputs[0]);
     float* output = static_cast<float*>(outputs[0]);
     gelu_kernel<float, block_size><<<grid_size, block_size, 0, stream>>>(
         kA, num, input, output);
   } else if (input_type == nvinfer1::DataType::kHALF) {
-#ifdef SUPPORTS_CUDA_FP16
+    VLOG(1) << "TRT Plugin DataType selected. Gelu-->fp16";
     const half* input = static_cast<const half*>(inputs[0]);
     half* output = static_cast<half*>(outputs[0]);
     no_exact_gelu_kernel<half,
                          block_size><<<grid_size, block_size, 0, stream>>>(
         kAT, kBT, kCT, num, input, output);
-#else
-    PADDLE_THROW(platform::errors::Fatal(
-        "The cuda archs you specific should greater than 600."));
-#endif
   } else {
     PADDLE_THROW(platform::errors::InvalidArgument(
         "The Gelu TRT Plugin's input type should be float or half."));
......
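The format-support queries above now branch on the stored member flag instead of a preprocessor switch, so a single binary can advertise kHALF support only when the plugin instance was created with fp16 enabled. A simplified free-function sketch of that check (the nvinfer1 enums come from the TensorRT headers; the function name is illustrative):

#include <NvInfer.h>

bool SupportsInputFormat(bool with_fp16, nvinfer1::DataType type,
                         nvinfer1::TensorFormat format) {
  if (with_fp16) {
    // fp16 build: accept both float and half inputs in linear layout.
    return (type == nvinfer1::DataType::kFLOAT ||
            type == nvinfer1::DataType::kHALF) &&
           format == nvinfer1::TensorFormat::kLINEAR;
  }
  // fp32 build: only float inputs are advertised.
  return type == nvinfer1::DataType::kFLOAT &&
         format == nvinfer1::TensorFormat::kLINEAR;
}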
@@ -26,7 +26,7 @@ namespace plugin {
 class GeluPlugin : public PluginTensorRT {
  public:
-  GeluPlugin() {}
+  explicit GeluPlugin(const bool with_fp16) { with_fp16_ = with_fp16; }
   // It was used for tensorrt deserialization.
   // It should not be called by users.
@@ -35,7 +35,7 @@ class GeluPlugin : public PluginTensorRT {
   }
   ~GeluPlugin() {}
-  GeluPlugin* clone() const override { return new GeluPlugin(); }
+  GeluPlugin* clone() const override { return new GeluPlugin(with_fp16_); }
   const char* getPluginType() const override { return "gelu_plugin"; }
   int getNbOutputs() const override { return 1; }
@@ -63,20 +63,26 @@ class GeluPlugin : public PluginTensorRT {
 #if IS_TRT_VERSION_GE(6000)
 class GeluPluginDynamic : public DynamicPluginTensorRT {
  public:
-  GeluPluginDynamic() {}
-  GeluPluginDynamic(void const* serial_data, size_t serial_length) {}
+  explicit GeluPluginDynamic(const bool with_fp16) { with_fp16_ = with_fp16; }
+  GeluPluginDynamic(void const* serial_data, size_t serial_length) {
+    DeserializeValue(&serial_data, &serial_length, &with_fp16_);
+  }
   ~GeluPluginDynamic() {}
   nvinfer1::IPluginV2DynamicExt* clone() const override {
-    return new GeluPluginDynamic();
+    return new GeluPluginDynamic(with_fp16_);
   }
   const char* getPluginType() const override { return "gelu_plugin"; }
   int getNbOutputs() const override { return 1; }
   int initialize() override { return 0; }
-  size_t getSerializationSize() const override { return 0; }
-  void serialize(void* buffer) const override {}
+  size_t getSerializationSize() const override {
+    return SerializedSize(with_fp16_);
+  }
+  void serialize(void* buffer) const override {
+    SerializeValue(&buffer, with_fp16_);
+  }
   nvinfer1::DimsExprs getOutputDimensions(
       int output_index, const nvinfer1::DimsExprs* inputs, int nb_inputs,
......
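Because GeluPluginDynamic now carries per-instance state, the with_fp16_ flag has to survive engine serialization; getSerializationSize and serialize above were previously empty. A toy sketch of what round-tripping a single bool amounts to (the real SerializedSize/SerializeValue/DeserializeValue helpers in Paddle handle this generically, so these free functions are illustrative only):

#include <cstddef>
#include <cstring>

size_t GetSerializationSize() { return sizeof(bool); }

void Serialize(void* buffer, bool with_fp16) {
  std::memcpy(buffer, &with_fp16, sizeof(bool));
}

bool Deserialize(const void* serial_data, size_t serial_length) {
  bool with_fp16 = false;
  if (serial_length >= sizeof(bool)) {
    std::memcpy(&with_fp16, serial_data, sizeof(bool));
  }
  return with_fp16;
}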
@@ -109,7 +109,6 @@ inline void TransposeQKV(const int batch, const int seq_len,
   }
 }
-#ifdef SUPPORTS_CUDA_FP16
 inline void TransposeQKV(const int batch, const int seq_len,
                          const int head_size, const int head_num,
                          const half *input, half *output, cudaStream_t stream) {
@@ -148,7 +147,6 @@ inline void TransposeQKV(const int batch, const int seq_len,
         output);
   }
 }
-#endif
 int QkvToContextPluginDynamic::initialize() { return 0; }
@@ -195,19 +193,19 @@ bool QkvToContextPluginDynamic::supportsFormatCombination(
   const nvinfer1::PluginTensorDesc &in = in_out[pos];
   if (pos == 0) {
-#ifdef SUPPORTS_CUDA_FP16
-    if (ban_fp16_) {
-      return (in.type == nvinfer1::DataType::kFLOAT) &&
-             (in.format == nvinfer1::TensorFormat::kLINEAR);
-    } else {
+    if (with_fp16_) {
+#ifdef TRT_PLUGIN_FP16_AVALIABLE
       return (in.type == nvinfer1::DataType::kFLOAT ||
               in.type == nvinfer1::DataType::kHALF) &&
              (in.format == nvinfer1::TensorFormat::kLINEAR);
-    }
 #else
       return (in.type == nvinfer1::DataType::kFLOAT) &&
              (in.format == nvinfer1::TensorFormat::kLINEAR);
 #endif
+    } else {
+      return (in.type == nvinfer1::DataType::kFLOAT) &&
+             (in.format == nvinfer1::TensorFormat::kLINEAR);
+    }
   }
   const nvinfer1::PluginTensorDesc &prev = in_out[pos - 1];
@@ -247,6 +245,7 @@ int QkvToContextPluginDynamic::enqueue(
   auto input_type = input_desc[0].type;
   if (input_type == nvinfer1::DataType::kFLOAT) {
+    VLOG(1) << "TRT Plugin DataType selected. QkvToContext-->fp32";
     auto *multihead_temp_data = multihead_temp_tensor.mutable_data<float>(
         platform::CUDAPlace(device_id));
     auto *qkptr = multihead_temp_data;
@@ -275,7 +274,8 @@ int QkvToContextPluginDynamic::enqueue(
                                           head_number_, head_size_);
   } else if (input_type == nvinfer1::DataType::kHALF) {
-#ifdef SUPPORTS_CUDA_FP16
+#ifdef TRT_PLUGIN_FP16_AVALIABLE
+    VLOG(1) << "TRT Plugin DataType selected. QkvToContext-->fp16";
     auto *multihead_temp_data =
         multihead_temp_tensor.mutable_data<int16_t>(  // NOLINT
             platform::CUDAPlace(device_id));
@@ -305,7 +305,11 @@ int QkvToContextPluginDynamic::enqueue(
                                          head_number_, head_size_);
 #else
     PADDLE_THROW(platform::errors::Fatal(
-        "The cuda archs you specific should greater than 600."));
+        "The Ernie(Bert) TensorRT Plugin should be "
+        "complied with CUDA version >= 10.0 when running with fp16. "
+        "Please recomplie it or try to use fp32 by set "
+        "config.SetTRTDynamicShapeInfo(min_input_shape, "
+        "max_input_shape, opt_input_shape, true"));
 #endif
   } else {
     PADDLE_THROW(platform::errors::Fatal(
......
@@ -44,23 +44,24 @@ namespace plugin {
 class QkvToContextPluginDynamic : public DynamicPluginTensorRT {
  public:
   explicit QkvToContextPluginDynamic(int hidden, int head_number, int head_size,
-                                     float scale, bool ban_fp16)
+                                     float scale, bool with_fp16)
       : hidden_(hidden),
         head_number_(head_number),
         head_size_(head_size),
-        scale_(scale),
-        ban_fp16_(ban_fp16) {}
+        scale_(scale) {
+    with_fp16_ = with_fp16;
+  }
   QkvToContextPluginDynamic(void const* serial_data, size_t serial_length) {
     DeserializeValue(&serial_data, &serial_length, &hidden_);
     DeserializeValue(&serial_data, &serial_length, &head_number_);
     DeserializeValue(&serial_data, &serial_length, &head_size_);
     DeserializeValue(&serial_data, &serial_length, &scale_);
-    DeserializeValue(&serial_data, &serial_length, &ban_fp16_);
+    DeserializeValue(&serial_data, &serial_length, &with_fp16_);
   }
   nvinfer1::IPluginV2DynamicExt* clone() const override {
     return new QkvToContextPluginDynamic(hidden_, head_number_, head_size_,
-                                         scale_, ban_fp16_);
+                                         scale_, with_fp16_);
   }
   const char* getPluginType() const override { return "qkv_to_context_plugin"; }
@@ -70,14 +71,14 @@ class QkvToContextPluginDynamic : public DynamicPluginTensorRT {
   size_t getSerializationSize() const override {
     return SerializedSize(hidden_) + SerializedSize(head_number_) +
            SerializedSize(head_size_) + SerializedSize(scale_) +
-           SerializedSize(ban_fp16_);
+           SerializedSize(with_fp16_);
   }
   void serialize(void* buffer) const override {
     SerializeValue(&buffer, hidden_);
     SerializeValue(&buffer, head_number_);
     SerializeValue(&buffer, head_size_);
     SerializeValue(&buffer, scale_);
-    SerializeValue(&buffer, ban_fp16_);
+    SerializeValue(&buffer, with_fp16_);
   }
   nvinfer1::DimsExprs getOutputDimensions(
@@ -115,7 +116,6 @@ class QkvToContextPluginDynamic : public DynamicPluginTensorRT {
   int head_number_;
   int head_size_;
   float scale_;
-  bool ban_fp16_;
 };
 class QkvToContextPluginV2Creator : public nvinfer1::IPluginCreator {
......
@@ -66,19 +66,19 @@ bool SkipLayerNormPluginDynamic::supportsFormatCombination(
   const nvinfer1::PluginTensorDesc &in = in_out[pos];
   if (pos == 0) {
-#ifdef SUPPORTS_CUDA_FP16
-    if (ban_fp16_) {
-      return (in.type == nvinfer1::DataType::kFLOAT) &&
-             (in.format == nvinfer1::TensorFormat::kLINEAR);
-    } else {
+    if (with_fp16_) {
+#ifdef TRT_PLUGIN_FP16_AVALIABLE
      return (in.type == nvinfer1::DataType::kFLOAT ||
              in.type == nvinfer1::DataType::kHALF) &&
             (in.format == nvinfer1::TensorFormat::kLINEAR);
-    }
 #else
      return (in.type == nvinfer1::DataType::kFLOAT) &&
             (in.format == nvinfer1::TensorFormat::kLINEAR);
 #endif
+    } else {
+      return (in.type == nvinfer1::DataType::kFLOAT) &&
+             (in.format == nvinfer1::TensorFormat::kLINEAR);
+    }
   }
   const nvinfer1::PluginTensorDesc &prev = in_out[pos - 1];
@@ -114,6 +114,7 @@ int SkipLayerNormPluginDynamic::enqueue(
   auto input_type = input_desc[0].type;
   if (input_type == nvinfer1::DataType::kFLOAT) {
+    VLOG(1) << "TRT Plugin DataType selected. SkipLayerNorm-->fp32";
     const float *input1 = static_cast<const float *>(inputs[0]);
     const float *input2 = static_cast<const float *>(inputs[1]);
     float *output = static_cast<float *>(outputs[0]);
@@ -121,7 +122,8 @@ int SkipLayerNormPluginDynamic::enqueue(
     skip_layer_norm_func(num, hidden, input1, input2, scale_gpu_, bias_gpu_,
                          output, eps_, stream);
   } else if (input_type == nvinfer1::DataType::kHALF) {
-#ifdef SUPPORTS_CUDA_FP16
+#ifdef TRT_PLUGIN_FP16_AVALIABLE
+    VLOG(1) << "TRT Plugin DataType selected. SkipLayerNorm-->fp16";
     const half *input1 = static_cast<const half *>(inputs[0]);
     const half *input2 = static_cast<const half *>(inputs[1]);
     half *output = static_cast<half *>(outputs[0]);
@@ -130,7 +132,11 @@ int SkipLayerNormPluginDynamic::enqueue(
                          output, static_cast<half>(eps_), stream);
 #else
     PADDLE_THROW(platform::errors::Fatal(
-        "The cuda archs you specific should greater than 600."));
+        "The Ernie(Bert) tensorRT plugin should be "
+        "complied with CUDA version >= 10.0 when running with fp16. "
+        "Please recomplie it or try to use fp32 by set "
+        "config.SetTRTDynamicShapeInfo(min_input_shape, "
+        "max_input_shape, opt_input_shape, true"));
 #endif
   } else {
     PADDLE_THROW(platform::errors::Fatal(
......
@@ -31,11 +31,9 @@ class SkipLayerNormPluginDynamic : public DynamicPluginTensorRT {
  public:
   explicit SkipLayerNormPluginDynamic(const float* bias, const float* scale,
                                       int bias_size, int scale_size,
-                                      const float eps, bool ban_fp16)
-      : bias_size_(bias_size),
-        scale_size_(scale_size),
-        eps_(eps),
-        ban_fp16_(ban_fp16) {
+                                      const float eps, bool with_fp16)
+      : bias_size_(bias_size), scale_size_(scale_size), eps_(eps) {
+    with_fp16_ = with_fp16;
     bias_.resize(bias_size);
     scale_.resize(scale_size);
     std::copy(bias, bias + bias_size, bias_.data());
@@ -47,12 +45,12 @@ class SkipLayerNormPluginDynamic : public DynamicPluginTensorRT {
     DeserializeValue(&serial_data, &serial_length, &bias_size_);
     DeserializeValue(&serial_data, &serial_length, &scale_size_);
     DeserializeValue(&serial_data, &serial_length, &eps_);
-    DeserializeValue(&serial_data, &serial_length, &ban_fp16_);
+    DeserializeValue(&serial_data, &serial_length, &with_fp16_);
   }
   nvinfer1::IPluginV2DynamicExt* clone() const override {
     auto ptr = new SkipLayerNormPluginDynamic(
-        bias_.data(), scale_.data(), bias_size_, scale_size_, eps_, ban_fp16_);
+        bias_.data(), scale_.data(), bias_size_, scale_size_, eps_, with_fp16_);
     ptr->bias_gpu_ = bias_gpu_;
     ptr->scale_gpu_ = scale_gpu_;
     return ptr;
@@ -65,7 +63,7 @@ class SkipLayerNormPluginDynamic : public DynamicPluginTensorRT {
   size_t getSerializationSize() const override {
     size_t ser_size = SerializedSize(bias_) + SerializedSize(scale_) +
                       SerializedSize(bias_size_) + SerializedSize(scale_size_) +
-                      SerializedSize(eps_) + SerializedSize(eps_);
+                      SerializedSize(eps_) + SerializedSize(with_fp16_);
     return ser_size;
   }
   void serialize(void* buffer) const override {
@@ -74,7 +72,7 @@ class SkipLayerNormPluginDynamic : public DynamicPluginTensorRT {
     SerializeValue(&buffer, bias_size_);
     SerializeValue(&buffer, scale_size_);
     SerializeValue(&buffer, eps_);
-    SerializeValue(&buffer, ban_fp16_);
+    SerializeValue(&buffer, with_fp16_);
   }
   nvinfer1::DimsExprs getOutputDimensions(
@@ -118,7 +116,6 @@ class SkipLayerNormPluginDynamic : public DynamicPluginTensorRT {
   int scale_size_;
   float eps_;
-  bool ban_fp16_;
 };
 class SkipLayerNormPluginV2Creator : public nvinfer1::IPluginCreator {
......
@@ -59,8 +59,9 @@ __global__ void SliceKernel(int num, int dims, const T *input,
 }
 SlicePlugin::SlicePlugin(std::vector<int> starts, std::vector<int> ends,
-                         std::vector<int> axes, bool ban_fp16)
-    : starts_(starts), ends_(ends), axes_(axes), ban_fp16_(ban_fp16) {
+                         std::vector<int> axes, bool with_fp16)
+    : starts_(starts), ends_(ends), axes_(axes) {
+  with_fp16_ = with_fp16;
   cudaEventCreate(&copy_event_);
   cudaStreamCreate(&copy_stream_);
 }
@@ -70,7 +71,6 @@ SlicePlugin::SlicePlugin(void const *serial_data, size_t serial_length) {
   DeserializeValue(&serial_data, &serial_length, &starts_);
   DeserializeValue(&serial_data, &serial_length, &ends_);
   DeserializeValue(&serial_data, &serial_length, &axes_);
-  DeserializeValue(&serial_data, &serial_length, &ban_fp16_);
   cudaEventCreate(&copy_event_);
   cudaStreamCreate(&copy_stream_);
 }
@@ -82,19 +82,19 @@ SlicePlugin::~SlicePlugin() {
 }
 SlicePlugin *SlicePlugin::clone() const {
-  return new SlicePlugin(starts_, ends_, axes_, ban_fp16_);
+  return new SlicePlugin(starts_, ends_, axes_, with_fp16_);
 }
 bool SlicePlugin::supportsFormat(nvinfer1::DataType type,
                                  nvinfer1::PluginFormat format) const {
-#ifdef SUPPORTS_CUDA_FP16
+  if (with_fp16_) {
    return ((type == nvinfer1::DataType::kFLOAT ||
             type == nvinfer1::DataType::kHALF) &&
            (format == nvinfer1::PluginFormat::kNCHW));
-#else
+  } else {
    return ((type == nvinfer1::DataType::kFLOAT) &&
            (format == nvinfer1::PluginFormat::kNCHW));
-#endif
+  }
 }
 nvinfer1::Dims SlicePlugin::getOutputDimensions(int index,
@@ -170,20 +170,17 @@ int SlicePlugin::enqueue(int batch_size, const void *const *inputs,
   int blocks = (out_num + threads - 1) / threads;
   auto input_type = getDataType();
   if (input_type == nvinfer1::DataType::kFLOAT) {
+    VLOG(1) << "TRT Plugin DataType selected. Slice-->fp32";
     const float *input1 = static_cast<const float *>(inputs[0]);
     float *output = static_cast<float *>(outputs[0]);
     SliceKernel<float><<<blocks, threads, 3 * num_dims * sizeof(int), stream>>>(
         out_num, num_dims, input1, offset_temp_data_, output);
   } else if (input_type == nvinfer1::DataType::kHALF) {
-#ifdef SUPPORTS_CUDA_FP16
+    VLOG(1) << "TRT Plugin DataType selected. Slice-->fp16";
     const half *input1 = static_cast<const half *>(inputs[0]);
     half *output = static_cast<half *>(outputs[0]);
     SliceKernel<half><<<blocks, threads, 3 * num_dims * sizeof(int), stream>>>(
         out_num, num_dims, input1, offset_temp_data_, output);
-#else
-    PADDLE_THROW(platform::errors::Fatal(
-        "The cuda archs you specific should greater than 600."));
-#endif
   } else {
     PADDLE_THROW(platform::errors::Fatal(
         "The Slice TRT Plugin's input type should be float or half."));
@@ -194,7 +191,7 @@ int SlicePlugin::enqueue(int batch_size, const void *const *inputs,
 size_t SlicePlugin::getSerializationSize() {
   return getBaseSerializationSize() + SerializedSize(getPluginType()) +
          SerializedSize(starts_) + SerializedSize(ends_) +
-         SerializedSize(axes_) + SerializedSize(ban_fp16_);
+         SerializedSize(axes_);
 }
 void SlicePlugin::serialize(void *buffer) {
@@ -203,15 +200,15 @@ void SlicePlugin::serialize(void *buffer) {
   SerializeValue(&buffer, starts_);
   SerializeValue(&buffer, ends_);
   SerializeValue(&buffer, axes_);
-  SerializeValue(&buffer, ban_fp16_);
 }
 // Dynamic Plugin below.
 #if IS_TRT_VERSION_GE(6000)
 SlicePluginDynamic::SlicePluginDynamic(std::vector<int> starts,
                                        std::vector<int> ends,
-                                       std::vector<int> axes, bool ban_fp16)
-    : starts_(starts), ends_(ends), axes_(axes), ban_fp16_(ban_fp16) {
+                                       std::vector<int> axes, bool with_fp16)
+    : starts_(starts), ends_(ends), axes_(axes) {
+  with_fp16_ = with_fp16;
   cudaEventCreate(&copy_event_);
   cudaStreamCreate(&copy_stream_);
 }
@@ -221,7 +218,7 @@ SlicePluginDynamic::SlicePluginDynamic(void const *serialData,
   DeserializeValue(&serialData, &serialLength, &starts_);
   DeserializeValue(&serialData, &serialLength, &ends_);
   DeserializeValue(&serialData, &serialLength, &axes_);
-  DeserializeValue(&serialData, &serialLength, &ban_fp16_);
+  DeserializeValue(&serialData, &serialLength, &with_fp16_);
   cudaEventCreate(&copy_event_);
   cudaStreamCreate(&copy_stream_);
 }
@@ -237,7 +234,7 @@ int SlicePluginDynamic::initialize() { return 0; }
 size_t SlicePluginDynamic::getSerializationSize() const {
   size_t size = SerializedSize(starts_) + SerializedSize(ends_) +
-                SerializedSize(axes_) + SerializedSize(ban_fp16_);
+                SerializedSize(axes_) + SerializedSize(with_fp16_);
   return size;
 }
@@ -246,7 +243,7 @@ void SlicePluginDynamic::serialize(void *buffer) const {
   SerializeValue(&buffer, starts_);
   SerializeValue(&buffer, ends_);
   SerializeValue(&buffer, axes_);
-  SerializeValue(&buffer, ban_fp16_);
+  SerializeValue(&buffer, with_fp16_);
 }
 nvinfer1::DimsExprs SlicePluginDynamic::getOutputDimensions(
@@ -278,19 +275,14 @@ bool SlicePluginDynamic::supportsFormatCombination(
   const nvinfer1::PluginTensorDesc &in = in_out[pos];
   if (pos == 0) {
-#ifdef SUPPORTS_CUDA_FP16
-    if (ban_fp16_) {
-      return (in.type == nvinfer1::DataType::kFLOAT) &&
-             (in.format == nvinfer1::TensorFormat::kLINEAR);
-    } else {
+    if (with_fp16_) {
      return (in.type == nvinfer1::DataType::kFLOAT ||
              in.type == nvinfer1::DataType::kHALF) &&
             (in.format == nvinfer1::TensorFormat::kLINEAR);
+    } else {
+      return (in.type == nvinfer1::DataType::kFLOAT) &&
+             (in.format == nvinfer1::TensorFormat::kLINEAR);
     }
-#else
-    return (in.type == nvinfer1::DataType::kFLOAT) &&
-           (in.format == nvinfer1::TensorFormat::kLINEAR);
-#endif
   }
   const nvinfer1::PluginTensorDesc &prev = in_out[pos - 1];
   // output
@@ -362,20 +354,17 @@ int SlicePluginDynamic::enqueue(const nvinfer1::PluginTensorDesc *input_desc,
   int blocks = (out_num + threads - 1) / threads;
   auto input_type = input_desc[0].type;
   if (input_type == nvinfer1::DataType::kFLOAT) {
+    VLOG(1) << "TRT Plugin DataType selected. Slice-->fp32";
     const float *input1 = static_cast<const float *>(inputs[0]);
     float *output = static_cast<float *>(outputs[0]);
     SliceKernel<float><<<blocks, threads, 3 * num_dims * sizeof(int), stream>>>(
         out_num, num_dims, input1, offset_temp_data_, output);
   } else if (input_type == nvinfer1::DataType::kHALF) {
-#ifdef SUPPORTS_CUDA_FP16
+    VLOG(1) << "TRT Plugin DataType selected. Slice-->fp16";
     const half *input1 = static_cast<const half *>(inputs[0]);
     half *output = static_cast<half *>(outputs[0]);
     SliceKernel<half><<<blocks, threads, 3 * num_dims * sizeof(int), stream>>>(
         out_num, num_dims, input1, offset_temp_data_, output);
-#else
-    PADDLE_THROW(platform::errors::Fatal(
-        "The cuda archs you specific should greater than 600."));
-#endif
   } else {
     PADDLE_THROW(platform::errors::Fatal(
         "The Slice TRT Plugin's input type should be float or half."));
......
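The change repeated across this plugin and the ones that follow is that compile-time #ifdef SUPPORTS_CUDA_FP16 branches become a runtime with_fp16_ flag consulted in supportsFormatCombination when TensorRT asks which bindings are acceptable. A minimal standalone sketch of that check, assuming TensorRT's NvInfer.h; the surrounding plugin class is omitted and this is an illustration rather than code from this commit:

#include <NvInfer.h>

// Position-0 input: allow fp16 bindings only when the runtime flag says so,
// instead of deciding at compile time with a macro.
static bool InputFormatSupported(const nvinfer1::PluginTensorDesc& in,
                                 bool with_fp16) {
  if (with_fp16) {
    return (in.type == nvinfer1::DataType::kFLOAT ||
            in.type == nvinfer1::DataType::kHALF) &&
           (in.format == nvinfer1::TensorFormat::kLINEAR);
  }
  return (in.type == nvinfer1::DataType::kFLOAT) &&
         (in.format == nvinfer1::TensorFormat::kLINEAR);
}

A single binary can therefore still refuse fp16 bindings when the engine precision or hardware configuration leaves the flag false.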
...@@ -29,7 +29,7 @@ namespace plugin { ...@@ -29,7 +29,7 @@ namespace plugin {
class SlicePlugin : public PluginTensorRT { class SlicePlugin : public PluginTensorRT {
public: public:
explicit SlicePlugin(std::vector<int> starts, std::vector<int> ends, explicit SlicePlugin(std::vector<int> starts, std::vector<int> ends,
std::vector<int> axes, bool ban_fp16); std::vector<int> axes, bool with_fp16);
// It was used for tensorrt deserialization. // It was used for tensorrt deserialization.
// It should not be called by users. // It should not be called by users.
...@@ -58,7 +58,6 @@ class SlicePlugin : public PluginTensorRT { ...@@ -58,7 +58,6 @@ class SlicePlugin : public PluginTensorRT {
std::vector<int> starts_; std::vector<int> starts_;
std::vector<int> ends_; std::vector<int> ends_;
std::vector<int> axes_; std::vector<int> axes_;
bool ban_fp16_{false};
int* offset_temp_data_{nullptr}; int* offset_temp_data_{nullptr};
cudaEvent_t copy_event_; cudaEvent_t copy_event_;
cudaStream_t copy_stream_; cudaStream_t copy_stream_;
...@@ -68,10 +67,10 @@ class SlicePlugin : public PluginTensorRT { ...@@ -68,10 +67,10 @@ class SlicePlugin : public PluginTensorRT {
class SlicePluginDynamic : public DynamicPluginTensorRT { class SlicePluginDynamic : public DynamicPluginTensorRT {
public: public:
explicit SlicePluginDynamic(std::vector<int> starts, std::vector<int> ends, explicit SlicePluginDynamic(std::vector<int> starts, std::vector<int> ends,
std::vector<int> axes, bool ban_fp16); std::vector<int> axes, bool with_fp16);
nvinfer1::IPluginV2DynamicExt* clone() const override { nvinfer1::IPluginV2DynamicExt* clone() const override {
return new SlicePluginDynamic(starts_, ends_, axes_, ban_fp16_); return new SlicePluginDynamic(starts_, ends_, axes_, with_fp16_);
} }
SlicePluginDynamic(void const* serialData, size_t serialLength); SlicePluginDynamic(void const* serialData, size_t serialLength);
...@@ -117,7 +116,6 @@ class SlicePluginDynamic : public DynamicPluginTensorRT { ...@@ -117,7 +116,6 @@ class SlicePluginDynamic : public DynamicPluginTensorRT {
std::vector<int> starts_; std::vector<int> starts_;
std::vector<int> ends_; std::vector<int> ends_;
std::vector<int> axes_; std::vector<int> axes_;
bool ban_fp16_{false};
int* offset_temp_data_{nullptr}; int* offset_temp_data_{nullptr};
cudaEvent_t copy_event_; cudaEvent_t copy_event_;
cudaStream_t copy_stream_; cudaStream_t copy_stream_;
......
...@@ -145,9 +145,16 @@ int SplitPlugin::enqueue(int batchSize, const void* const* inputs, ...@@ -145,9 +145,16 @@ int SplitPlugin::enqueue(int batchSize, const void* const* inputs,
#if IS_TRT_VERSION_GE(6000) #if IS_TRT_VERSION_GE(6000)
int SplitPluginDynamic::initialize() { return 0; } int SplitPluginDynamic::initialize() { return 0; }
size_t SplitPluginDynamic::getSerializationSize() const { return 0; } size_t SplitPluginDynamic::getSerializationSize() const {
return SerializedSize(axis_) + SerializedSize(output_length_) +
SerializedSize(with_fp16_);
}
void SplitPluginDynamic::serialize(void* buffer) const {} void SplitPluginDynamic::serialize(void* buffer) const {
SerializeValue(&buffer, axis_);
SerializeValue(&buffer, output_length_);
SerializeValue(&buffer, with_fp16_);
}
nvinfer1::DimsExprs SplitPluginDynamic::getOutputDimensions( nvinfer1::DimsExprs SplitPluginDynamic::getOutputDimensions(
int output_index, const nvinfer1::DimsExprs* inputs, int nb_inputs, int output_index, const nvinfer1::DimsExprs* inputs, int nb_inputs,
...@@ -183,14 +190,14 @@ bool SplitPluginDynamic::supportsFormatCombination( ...@@ -183,14 +190,14 @@ bool SplitPluginDynamic::supportsFormatCombination(
const nvinfer1::PluginTensorDesc& in = in_out[pos]; const nvinfer1::PluginTensorDesc& in = in_out[pos];
if (pos == 0) { if (pos == 0) {
#ifdef SUPPORTS_CUDA_FP16 if (with_fp16_) {
return (in.type == nvinfer1::DataType::kFLOAT || return (in.type == nvinfer1::DataType::kFLOAT ||
in.type == nvinfer1::DataType::kHALF) && in.type == nvinfer1::DataType::kHALF) &&
(in.format == nvinfer1::TensorFormat::kLINEAR); (in.format == nvinfer1::TensorFormat::kLINEAR);
#else } else {
return (in.type == nvinfer1::DataType::kFLOAT) && return (in.type == nvinfer1::DataType::kFLOAT) &&
(in.format == nvinfer1::TensorFormat::kLINEAR); (in.format == nvinfer1::TensorFormat::kLINEAR);
#endif }
} }
const nvinfer1::PluginTensorDesc& prev = in_out[pos - 1]; const nvinfer1::PluginTensorDesc& prev = in_out[pos - 1];
// output // output
...@@ -234,6 +241,7 @@ int SplitPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* input_desc, ...@@ -234,6 +241,7 @@ int SplitPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* input_desc,
auto input_type = input_desc[0].type; auto input_type = input_desc[0].type;
if (input_type == nvinfer1::DataType::kFLOAT) { if (input_type == nvinfer1::DataType::kFLOAT) {
VLOG(1) << "TRT Plugin DataType selected. Split-->fp32";
thrust::device_vector<float*> d_output_ptrs; thrust::device_vector<float*> d_output_ptrs;
d_output_ptrs.resize(this->getNbOutputs(), nullptr); d_output_ptrs.resize(this->getNbOutputs(), nullptr);
...@@ -249,7 +257,7 @@ int SplitPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* input_desc, ...@@ -249,7 +257,7 @@ int SplitPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* input_desc,
d_segment_offsets.size(), d_segment_offsets_ptr, input_ptr, output_ptrs, d_segment_offsets.size(), d_segment_offsets_ptr, input_ptr, output_ptrs,
inner_cols, axis_shape, outer_rows); inner_cols, axis_shape, outer_rows);
} else if (input_type == nvinfer1::DataType::kHALF) { } else if (input_type == nvinfer1::DataType::kHALF) {
#ifdef SUPPORTS_CUDA_FP16 VLOG(1) << "TRT Plugin DataType selected. Split-->fp16";
thrust::device_vector<half*> d_output_ptrs; thrust::device_vector<half*> d_output_ptrs;
d_output_ptrs.resize(this->getNbOutputs(), nullptr); d_output_ptrs.resize(this->getNbOutputs(), nullptr);
...@@ -264,10 +272,6 @@ int SplitPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* input_desc, ...@@ -264,10 +272,6 @@ int SplitPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* input_desc,
split_kernel<<<grid, block, 0, stream>>>( split_kernel<<<grid, block, 0, stream>>>(
d_segment_offsets.size(), d_segment_offsets_ptr, input_ptr, output_ptrs, d_segment_offsets.size(), d_segment_offsets_ptr, input_ptr, output_ptrs,
inner_cols, axis_shape, outer_rows); inner_cols, axis_shape, outer_rows);
#else
PADDLE_THROW(platform::errors::Fatal(
"The cuda archs you specific should greater than 600."));
#endif
} }
return cudaGetLastError() != cudaSuccess; return cudaGetLastError() != cudaSuccess;
} }
......
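Every plugin touched here follows the same serialization contract: getSerializationSize() counts every field, serialize() writes the fields in a fixed order, and the deserializing constructor reads them back in that same order, now with with_fp16_ included. A self-contained toy round trip of that contract; the Toy* helpers are memcpy stand-ins for Paddle's SerializedSize/SerializeValue/DeserializeValue, not the real utilities:

#include <cassert>
#include <cstddef>
#include <cstring>
#include <vector>

template <typename T>
size_t ToySerializedSize(const T&) { return sizeof(T); }

template <typename T>
void ToySerialize(char** buf, const T& v) {  // append one POD field
  std::memcpy(*buf, &v, sizeof(T));
  *buf += sizeof(T);
}

template <typename T>
void ToyDeserialize(const char** buf, size_t* len, T* v) {  // read it back
  assert(*len >= sizeof(T));
  std::memcpy(v, *buf, sizeof(T));
  *buf += sizeof(T);
  *len -= sizeof(T);
}

int main() {
  int axis = 1;
  bool with_fp16 = true;

  // The reported size must cover exactly what serialize() writes.
  std::vector<char> storage(ToySerializedSize(axis) +
                            ToySerializedSize(with_fp16));
  char* w = storage.data();
  ToySerialize(&w, axis);       // write order ...
  ToySerialize(&w, with_fp16);

  const char* r = storage.data();
  size_t len = storage.size();
  int axis2 = 0;
  bool with_fp16_2 = false;
  ToyDeserialize(&r, &len, &axis2);        // ... must equal read order
  ToyDeserialize(&r, &len, &with_fp16_2);
  assert(axis2 == axis && with_fp16_2 == with_fp16);
  return 0;
}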
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
#pragma once #pragma once
#include <thrust/device_vector.h> #include <thrust/device_vector.h>
#include <string>
#include <utility> #include <utility>
#include <vector> #include <vector>
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
...@@ -27,8 +28,10 @@ namespace plugin { ...@@ -27,8 +28,10 @@ namespace plugin {
class SplitPlugin : public PluginTensorRT { class SplitPlugin : public PluginTensorRT {
public: public:
SplitPlugin() {} SplitPlugin() {}
SplitPlugin(int axis, std::vector<int> const& output_lengths) SplitPlugin(int axis, std::vector<int> const& output_lengths, bool with_fp16)
: axis_(axis), same_shape_(true), output_length_(output_lengths) {} : axis_(axis), same_shape_(true), output_length_(output_lengths) {
with_fp16_ = with_fp16;
}
SplitPlugin(void const* serial_data, size_t serial_length) { SplitPlugin(void const* serial_data, size_t serial_length) {
deserializeBase(serial_data, serial_length); deserializeBase(serial_data, serial_length);
...@@ -37,7 +40,7 @@ class SplitPlugin : public PluginTensorRT { ...@@ -37,7 +40,7 @@ class SplitPlugin : public PluginTensorRT {
} }
SplitPlugin* clone() const override { SplitPlugin* clone() const override {
return new SplitPlugin(axis_, output_length_); return new SplitPlugin(axis_, output_length_, with_fp16_);
} }
const char* getPluginType() const override { return "split_plugin"; } const char* getPluginType() const override { return "split_plugin"; }
...@@ -77,13 +80,20 @@ class SplitPlugin : public PluginTensorRT { ...@@ -77,13 +80,20 @@ class SplitPlugin : public PluginTensorRT {
#if IS_TRT_VERSION_GE(6000) #if IS_TRT_VERSION_GE(6000)
class SplitPluginDynamic : public DynamicPluginTensorRT { class SplitPluginDynamic : public DynamicPluginTensorRT {
public: public:
SplitPluginDynamic(int axis, std::vector<int> const& output_lengths) SplitPluginDynamic(int axis, std::vector<int> const& output_lengths,
: axis_(axis), output_length_(output_lengths) {} bool with_fp16)
: axis_(axis), output_length_(output_lengths) {
with_fp16_ = with_fp16;
}
SplitPluginDynamic(void const* serial_data, size_t serial_length) {} SplitPluginDynamic(void const* serial_data, size_t serial_length) {
DeserializeValue(&serial_data, &serial_length, &axis_);
DeserializeValue(&serial_data, &serial_length, &output_length_);
DeserializeValue(&serial_data, &serial_length, &with_fp16_);
}
nvinfer1::IPluginV2DynamicExt* clone() const override { nvinfer1::IPluginV2DynamicExt* clone() const override {
return new SplitPluginDynamic(axis_, output_length_); return new SplitPluginDynamic(axis_, output_length_, with_fp16_);
} }
const char* getPluginType() const override { return "split_plugin"; } const char* getPluginType() const override { return "split_plugin"; }
...@@ -127,6 +137,46 @@ class SplitPluginDynamic : public DynamicPluginTensorRT { ...@@ -127,6 +137,46 @@ class SplitPluginDynamic : public DynamicPluginTensorRT {
int axis_; int axis_;
std::vector<int> output_length_; std::vector<int> output_length_;
}; };
class SplitPluginV2Creator : public nvinfer1::IPluginCreator {
public:
SplitPluginV2Creator() {}
const char* getPluginName() const override { return "split_plugin"; }
const char* getPluginVersion() const override { return "1"; }
const nvinfer1::PluginFieldCollection* getFieldNames() override {
return &field_collection_;
}
nvinfer1::IPluginV2* createPlugin(
const char* name, const nvinfer1::PluginFieldCollection* fc) override {
return nullptr;
}
nvinfer1::IPluginV2* deserializePlugin(const char* name,
const void* serial_data,
size_t serial_length) override {
auto plugin = new SplitPluginDynamic(serial_data, serial_length);
return plugin;
}
void setPluginNamespace(const char* lib_namespace) override {
plugin_namespace_ = lib_namespace;
}
const char* getPluginNamespace() const override {
return plugin_namespace_.c_str();
}
private:
std::string plugin_namespace_;
std::string plugin_name_;
nvinfer1::PluginFieldCollection field_collection_{0, nullptr};
std::vector<nvinfer1::PluginField> plugin_attributes_;
};
REGISTER_TRT_PLUGIN_V2(SplitPluginV2Creator);
#endif #endif
} // namespace plugin } // namespace plugin
......
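createPlugin() in the new SplitPluginV2Creator intentionally returns nullptr; the creator exists so that an engine loaded from disk can rebuild the plugin from its serialized bytes. A hedged sketch of the runtime side of that handshake, assuming the creator is registered under the name and version returned by getPluginName()/getPluginVersion(); this is an illustration, not code from this commit:

#include <NvInfer.h>

// Look the creator up in TensorRT's registry and hand it the bytes written by
// SplitPluginDynamic::serialize(); deserializePlugin() reconstructs the plugin,
// including its with_fp16_ flag.
nvinfer1::IPluginV2* RecoverSplitPlugin(const void* serial_data,
                                        size_t serial_length) {
  auto* creator =
      getPluginRegistry()->getPluginCreator("split_plugin", "1", "");
  if (creator == nullptr) return nullptr;
  return creator->deserializePlugin("split_plugin", serial_data, serial_length);
}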
...@@ -24,19 +24,22 @@ namespace tensorrt { ...@@ -24,19 +24,22 @@ namespace tensorrt {
namespace plugin { namespace plugin {
#if IS_TRT_VERSION_GE(6000) #if IS_TRT_VERSION_GE(6000)
StackPluginDynamic::StackPluginDynamic(int axis, int num_stack) StackPluginDynamic::StackPluginDynamic(int axis, int num_stack, bool with_fp16)
: axis_(axis), num_stack_(num_stack) {} : axis_(axis), num_stack_(num_stack) {
with_fp16_ = with_fp16;
}
StackPluginDynamic::StackPluginDynamic(void const* serial_data, StackPluginDynamic::StackPluginDynamic(void const* serial_data,
size_t serial_length) { size_t serial_length) {
DeserializeValue(&serial_data, &serial_length, &axis_); DeserializeValue(&serial_data, &serial_length, &axis_);
DeserializeValue(&serial_data, &serial_length, &num_stack_); DeserializeValue(&serial_data, &serial_length, &num_stack_);
DeserializeValue(&serial_data, &serial_length, &with_fp16_);
} }
StackPluginDynamic::~StackPluginDynamic() {} StackPluginDynamic::~StackPluginDynamic() {}
nvinfer1::IPluginV2DynamicExt* StackPluginDynamic::clone() const { nvinfer1::IPluginV2DynamicExt* StackPluginDynamic::clone() const {
return new StackPluginDynamic(axis_, num_stack_); return new StackPluginDynamic(axis_, num_stack_, with_fp16_);
} }
const char* StackPluginDynamic::getPluginType() const { return "stack_plugin"; } const char* StackPluginDynamic::getPluginType() const { return "stack_plugin"; }
...@@ -49,12 +52,14 @@ size_t StackPluginDynamic::getSerializationSize() const { ...@@ -49,12 +52,14 @@ size_t StackPluginDynamic::getSerializationSize() const {
size_t serialize_size = 0; size_t serialize_size = 0;
serialize_size += SerializedSize(axis_); serialize_size += SerializedSize(axis_);
serialize_size += SerializedSize(num_stack_); serialize_size += SerializedSize(num_stack_);
serialize_size += SerializedSize(with_fp16_);
return serialize_size; return serialize_size;
} }
void StackPluginDynamic::serialize(void* buffer) const { void StackPluginDynamic::serialize(void* buffer) const {
SerializeValue(&buffer, axis_); SerializeValue(&buffer, axis_);
SerializeValue(&buffer, num_stack_); SerializeValue(&buffer, num_stack_);
SerializeValue(&buffer, with_fp16_);
} }
nvinfer1::DimsExprs StackPluginDynamic::getOutputDimensions( nvinfer1::DimsExprs StackPluginDynamic::getOutputDimensions(
...@@ -99,14 +104,14 @@ bool StackPluginDynamic::supportsFormatCombination( ...@@ -99,14 +104,14 @@ bool StackPluginDynamic::supportsFormatCombination(
const nvinfer1::PluginTensorDesc& in = in_out[pos]; const nvinfer1::PluginTensorDesc& in = in_out[pos];
if (pos == 0) { if (pos == 0) {
#ifdef SUPPORTS_CUDA_FP16 if (with_fp16_) {
return (in.type == nvinfer1::DataType::kFLOAT || return (in.type == nvinfer1::DataType::kFLOAT ||
in.type == nvinfer1::DataType::kHALF) && in.type == nvinfer1::DataType::kHALF) &&
(in.format == nvinfer1::TensorFormat::kLINEAR); (in.format == nvinfer1::TensorFormat::kLINEAR);
#else } else {
return (in.type == nvinfer1::DataType::kFLOAT) && return (in.type == nvinfer1::DataType::kFLOAT) &&
(in.format == nvinfer1::TensorFormat::kLINEAR); (in.format == nvinfer1::TensorFormat::kLINEAR);
#endif }
} }
const nvinfer1::PluginTensorDesc& prev = in_out[pos - 1]; const nvinfer1::PluginTensorDesc& prev = in_out[pos - 1];
// output // output
...@@ -170,20 +175,17 @@ int StackPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* input_desc, ...@@ -170,20 +175,17 @@ int StackPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* input_desc,
auto infer_type = input_desc[0].type; auto infer_type = input_desc[0].type;
if (infer_type == nvinfer1::DataType::kFLOAT) { if (infer_type == nvinfer1::DataType::kFLOAT) {
VLOG(1) << "TRT Plugin DataType selected. Stack-->fp32";
float* output = static_cast<float*>(outputs[0]); float* output = static_cast<float*>(outputs[0]);
StackKernel<float><<<num_blocks, num_threads, 0, stream>>>( StackKernel<float><<<num_blocks, num_threads, 0, stream>>>(
reinterpret_cast<const float* const*>(workspace), output, num_stacks, reinterpret_cast<const float* const*>(workspace), output, num_stacks,
base_unit); base_unit);
} else if (infer_type == nvinfer1::DataType::kHALF) { } else if (infer_type == nvinfer1::DataType::kHALF) {
#ifdef SUPPORTS_CUDA_FP16 VLOG(1) << "TRT Plugin DataType selected. Stack-->fp16";
__half* output = static_cast<__half*>(outputs[0]); __half* output = static_cast<__half*>(outputs[0]);
StackKernel<__half><<<num_blocks, num_threads, 0, stream>>>( StackKernel<__half><<<num_blocks, num_threads, 0, stream>>>(
reinterpret_cast<const __half* const*>(workspace), output, num_stacks, reinterpret_cast<const __half* const*>(workspace), output, num_stacks,
base_unit); base_unit);
#else
PADDLE_THROW(platform::errors::Fatal(
"The cuda archs you specific should greater than 600."));
#endif
} else { } else {
PADDLE_THROW( PADDLE_THROW(
platform::errors::Fatal("The Stack TRT Plugin's input type only " platform::errors::Fatal("The Stack TRT Plugin's input type only "
...@@ -209,6 +211,7 @@ nvinfer1::IPluginV2* StackPluginDynamicCreator::createPlugin( ...@@ -209,6 +211,7 @@ nvinfer1::IPluginV2* StackPluginDynamicCreator::createPlugin(
const char* name, const nvinfer1::PluginFieldCollection* fc) { const char* name, const nvinfer1::PluginFieldCollection* fc) {
int axis = -1; int axis = -1;
int num_stack = -1; int num_stack = -1;
bool with_fp16 = false;
for (int i = 0; i < fc->nbFields; ++i) { for (int i = 0; i < fc->nbFields; ++i) {
const std::string name(fc->fields[i].name); const std::string name(fc->fields[i].name);
...@@ -216,13 +219,15 @@ nvinfer1::IPluginV2* StackPluginDynamicCreator::createPlugin( ...@@ -216,13 +219,15 @@ nvinfer1::IPluginV2* StackPluginDynamicCreator::createPlugin(
axis = static_cast<const int*>(fc->fields[i].data)[0]; axis = static_cast<const int*>(fc->fields[i].data)[0];
} else if (name == "num_stack") { } else if (name == "num_stack") {
num_stack = static_cast<const int*>(fc->fields[i].data)[0]; num_stack = static_cast<const int*>(fc->fields[i].data)[0];
} else if (name == "with_fp16") {
with_fp16 = static_cast<const bool*>(fc->fields[i].data)[0];
} else { } else {
PADDLE_THROW(platform::errors::Fatal("Meet an unknown plugin field '" + PADDLE_THROW(platform::errors::Fatal("Meet an unknown plugin field '" +
name + name +
"' when creating stack op plugin.")); "' when creating stack op plugin."));
} }
} }
return new StackPluginDynamic(axis, num_stack); return new StackPluginDynamic(axis, num_stack, with_fp16);
} }
nvinfer1::IPluginV2* StackPluginDynamicCreator::deserializePlugin( nvinfer1::IPluginV2* StackPluginDynamicCreator::deserializePlugin(
......
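StackPluginDynamicCreator::createPlugin above now also parses a with_fp16 field, so whichever converter builds the plugin has to place that field in the PluginFieldCollection it passes in. A hedged sketch of that caller side; the helper name MakeStackPlugin and the PluginFieldType tags are chosen only for illustration, since createPlugin casts the data pointers directly and does not inspect the tags:

#include <NvInfer.h>

#include <vector>

// Field names must match the strings parsed in createPlugin: "axis",
// "num_stack" and "with_fp16".
nvinfer1::IPluginV2* MakeStackPlugin(nvinfer1::IPluginCreator* creator,
                                     int axis, int num_stack, bool with_fp16) {
  std::vector<nvinfer1::PluginField> fields{
      {"axis", &axis, nvinfer1::PluginFieldType::kINT32, 1},
      {"num_stack", &num_stack, nvinfer1::PluginFieldType::kINT32, 1},
      {"with_fp16", &with_fp16, nvinfer1::PluginFieldType::kINT8, 1},
  };
  nvinfer1::PluginFieldCollection fc;
  fc.nbFields = static_cast<int>(fields.size());
  fc.fields = fields.data();
  return creator->createPlugin("stack_plugin", &fc);
}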
...@@ -28,7 +28,7 @@ namespace plugin { ...@@ -28,7 +28,7 @@ namespace plugin {
#if IS_TRT_VERSION_GE(6000) #if IS_TRT_VERSION_GE(6000)
class StackPluginDynamic : public DynamicPluginTensorRT { class StackPluginDynamic : public DynamicPluginTensorRT {
public: public:
explicit StackPluginDynamic(int axis, int num_stack); explicit StackPluginDynamic(int axis, int num_stack, bool with_fp16);
StackPluginDynamic(void const* serial_data, size_t serial_length); StackPluginDynamic(void const* serial_data, size_t serial_length);
~StackPluginDynamic(); ~StackPluginDynamic();
nvinfer1::IPluginV2DynamicExt* clone() const override; nvinfer1::IPluginV2DynamicExt* clone() const override;
......
...@@ -44,12 +44,12 @@ nvinfer1::Dims SwishPlugin::getOutputDimensions(int index, ...@@ -44,12 +44,12 @@ nvinfer1::Dims SwishPlugin::getOutputDimensions(int index,
template <typename T> template <typename T>
__device__ T math_exp(T a); __device__ T math_exp(T a);
#ifdef SUPPORTS_CUDA_FP16
template <> template <>
__device__ half math_exp<half>(half a) { __device__ half math_exp<half>(half a) {
#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__)
return hexp(a); return hexp(a);
}
#endif #endif
}
template <> template <>
__device__ float math_exp<float>(float a) { __device__ float math_exp<float>(float a) {
...@@ -71,6 +71,19 @@ __global__ void swish_kernel(int num, const T *input, T *output, T beta) { ...@@ -71,6 +71,19 @@ __global__ void swish_kernel(int num, const T *input, T *output, T beta) {
} }
} }
template <>
__global__ void swish_kernel<half>(int num, const half *input, half *output,
half beta) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
if (index < num) {
#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__)
output[index] =
__ldg(input + index) /
(static_cast<half>(1.0) + math_exp<half>(-beta * __ldg(input + index)));
#endif
}
}
int SwishPlugin::enqueue(int batch_size, const void *const *inputs, int SwishPlugin::enqueue(int batch_size, const void *const *inputs,
void **outputs, void *workspace, cudaStream_t stream) { void **outputs, void *workspace, cudaStream_t stream) {
// input dims is CHW. // input dims is CHW.
...@@ -92,14 +105,18 @@ int SwishPlugin::enqueue(int batch_size, const void *const *inputs, ...@@ -92,14 +105,18 @@ int SwishPlugin::enqueue(int batch_size, const void *const *inputs,
#if IS_TRT_VERSION_GE(6000) #if IS_TRT_VERSION_GE(6000)
int SwishPluginDynamic::initialize() { int SwishPluginDynamic::initialize() {
setPluginNamespace("swish");
getPluginNamespace(); getPluginNamespace();
return 0; return 0;
} }
size_t SwishPluginDynamic::getSerializationSize() const { return 0; } size_t SwishPluginDynamic::getSerializationSize() const {
return SerializedSize(beta_) + SerializedSize(with_fp16_);
}
void SwishPluginDynamic::serialize(void *buffer) const {} void SwishPluginDynamic::serialize(void *buffer) const {
SerializeValue(&buffer, beta_);
SerializeValue(&buffer, with_fp16_);
}
nvinfer1::DimsExprs SwishPluginDynamic::getOutputDimensions( nvinfer1::DimsExprs SwishPluginDynamic::getOutputDimensions(
int output_index, const nvinfer1::DimsExprs *inputs, int nb_inputs, int output_index, const nvinfer1::DimsExprs *inputs, int nb_inputs,
...@@ -123,14 +140,14 @@ bool SwishPluginDynamic::supportsFormatCombination( ...@@ -123,14 +140,14 @@ bool SwishPluginDynamic::supportsFormatCombination(
const nvinfer1::PluginTensorDesc &in = in_out[pos]; const nvinfer1::PluginTensorDesc &in = in_out[pos];
if (pos == 0) { if (pos == 0) {
#ifdef SUPPORTS_CUDA_FP16 if (with_fp16_) {
return (in.type == nvinfer1::DataType::kFLOAT || return (in.type == nvinfer1::DataType::kFLOAT ||
in.type == nvinfer1::DataType::kHALF) && in.type == nvinfer1::DataType::kHALF) &&
(in.format == nvinfer1::TensorFormat::kLINEAR); (in.format == nvinfer1::TensorFormat::kLINEAR);
#else } else {
return (in.type == nvinfer1::DataType::kFLOAT) && return (in.type == nvinfer1::DataType::kFLOAT) &&
(in.format == nvinfer1::TensorFormat::kLINEAR); (in.format == nvinfer1::TensorFormat::kLINEAR);
#endif }
} }
const nvinfer1::PluginTensorDesc &prev = in_out[pos - 1]; const nvinfer1::PluginTensorDesc &prev = in_out[pos - 1];
// output // output
...@@ -157,20 +174,17 @@ int SwishPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc *input_desc, ...@@ -157,20 +174,17 @@ int SwishPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc *input_desc,
auto input_type = input_desc[0].type; auto input_type = input_desc[0].type;
if (input_type == nvinfer1::DataType::kFLOAT) { if (input_type == nvinfer1::DataType::kFLOAT) {
VLOG(1) << "TRT Plugin DataType selected. Swish-->fp32";
const float *input = static_cast<const float *>(inputs[0]); const float *input = static_cast<const float *>(inputs[0]);
float *output = static_cast<float *>(outputs[0]); float *output = static_cast<float *>(outputs[0]);
swish_kernel<float><<<blocks, threads, 0, stream>>>(num, input, output, swish_kernel<float><<<blocks, threads, 0, stream>>>(num, input, output,
beta_); beta_);
} else if (input_type == nvinfer1::DataType::kHALF) { } else if (input_type == nvinfer1::DataType::kHALF) {
#ifdef SUPPORTS_CUDA_FP16 VLOG(1) << "TRT Plugin DataType selected. Swish-->fp16";
const half *input = static_cast<const half *>(inputs[0]); const half *input = static_cast<const half *>(inputs[0]);
half *output = static_cast<half *>(outputs[0]); half *output = static_cast<half *>(outputs[0]);
swish_kernel<half><<<blocks, threads, 0, stream>>>( swish_kernel<half><<<blocks, threads, 0, stream>>>(
num, input, output, static_cast<half>(beta_)); num, input, output, static_cast<half>(beta_));
#else
PADDLE_THROW(platform::errors::Fatal(
"The cuda archs you specific should greater than 600."));
#endif
} else { } else {
PADDLE_THROW(platform::errors::InvalidArgument( PADDLE_THROW(platform::errors::InvalidArgument(
"The Swish TRT Plugin's input type should be float or half.")); "The Swish TRT Plugin's input type should be float or half."));
......
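The half specialization of math_exp above compiles its body only when CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__) holds. The macro's definition is not part of this diff; a plausible form, consistent with the __CUDA_ARCH__ >= 600 check that exp_func<half> drops further below, is:

// Assumed definition (not shown in this diff): fp16 device intrinsics such as
// hexp() are only compiled in for sufficiently new target architectures.
#define CUDA_ARCH_FP16_SUPPORTED(CUDA_ARCH) ((CUDA_ARCH) >= 600)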
...@@ -32,7 +32,8 @@ class SwishPlugin : public PluginTensorRT { ...@@ -32,7 +32,8 @@ class SwishPlugin : public PluginTensorRT {
protected: protected:
size_t getSerializationSize() override { size_t getSerializationSize() override {
return getBaseSerializationSize() + SerializedSize(beta_); return SerializedSize(getPluginType()) + getBaseSerializationSize() +
SerializedSize(beta_);
} }
// TRT will call this func when we need to serialize the configuration of // TRT will call this func when we need to serialize the configuration of
...@@ -45,7 +46,9 @@ class SwishPlugin : public PluginTensorRT { ...@@ -45,7 +46,9 @@ class SwishPlugin : public PluginTensorRT {
} }
public: public:
explicit SwishPlugin(const float beta) : beta_(beta) {} explicit SwishPlugin(const float beta, const bool with_fp16) : beta_(beta) {
with_fp16_ = with_fp16;
}
// It was used for tensorrt deserialization. // It was used for tensorrt deserialization.
// It should not be called by users. // It should not be called by users.
...@@ -56,7 +59,9 @@ class SwishPlugin : public PluginTensorRT { ...@@ -56,7 +59,9 @@ class SwishPlugin : public PluginTensorRT {
~SwishPlugin() {} ~SwishPlugin() {}
int initialize() override; int initialize() override;
SwishPlugin* clone() const override { return new SwishPlugin(beta_); } SwishPlugin* clone() const override {
return new SwishPlugin(beta_, with_fp16_);
}
const char* getPluginType() const override { return "swish_plugin"; } const char* getPluginType() const override { return "swish_plugin"; }
int getNbOutputs() const override { return 1; } int getNbOutputs() const override { return 1; }
...@@ -69,10 +74,16 @@ class SwishPlugin : public PluginTensorRT { ...@@ -69,10 +74,16 @@ class SwishPlugin : public PluginTensorRT {
#if IS_TRT_VERSION_GE(6000) #if IS_TRT_VERSION_GE(6000)
class SwishPluginDynamic : public DynamicPluginTensorRT { class SwishPluginDynamic : public DynamicPluginTensorRT {
public: public:
explicit SwishPluginDynamic(const float beta) : beta_(beta) {} explicit SwishPluginDynamic(const float beta, const bool with_fp16)
SwishPluginDynamic(void const* serialData, size_t serialLength) {} : beta_(beta) {
with_fp16_ = with_fp16;
}
SwishPluginDynamic(void const* serialData, size_t serialLength) {
DeserializeValue(&serialData, &serialLength, &beta_);
DeserializeValue(&serialData, &serialLength, &with_fp16_);
}
nvinfer1::IPluginV2DynamicExt* clone() const override { nvinfer1::IPluginV2DynamicExt* clone() const override {
return new SwishPluginDynamic(beta_); return new SwishPluginDynamic(beta_, with_fp16_);
} }
const char* getPluginType() const override { return "swish_plugin"; } const char* getPluginType() const override { return "swish_plugin"; }
...@@ -115,6 +126,46 @@ class SwishPluginDynamic : public DynamicPluginTensorRT { ...@@ -115,6 +126,46 @@ class SwishPluginDynamic : public DynamicPluginTensorRT {
private: private:
float beta_; float beta_;
}; };
class SwishPluginV2Creator : public nvinfer1::IPluginCreator {
public:
SwishPluginV2Creator() {}
const char* getPluginName() const override { return "swish_plugin"; }
const char* getPluginVersion() const override { return "1"; }
const nvinfer1::PluginFieldCollection* getFieldNames() override {
return &field_collection_;
}
nvinfer1::IPluginV2* createPlugin(
const char* name, const nvinfer1::PluginFieldCollection* fc) override {
return nullptr;
}
nvinfer1::IPluginV2* deserializePlugin(const char* name,
const void* serial_data,
size_t serial_length) override {
auto plugin = new SwishPluginDynamic(serial_data, serial_length);
return plugin;
}
void setPluginNamespace(const char* lib_namespace) override {
plugin_namespace_ = lib_namespace;
}
const char* getPluginNamespace() const override {
return plugin_namespace_.c_str();
}
private:
std::string plugin_namespace_;
std::string plugin_name_;
nvinfer1::PluginFieldCollection field_collection_{0, nullptr};
std::vector<nvinfer1::PluginField> plugin_attributes_;
};
REGISTER_TRT_PLUGIN_V2(SwishPluginV2Creator);
#endif #endif
} // namespace plugin } // namespace plugin
......
...@@ -24,6 +24,7 @@ void PluginTensorRT::serializeBase(void*& buffer) { ...@@ -24,6 +24,7 @@ void PluginTensorRT::serializeBase(void*& buffer) {
SerializeValue(&buffer, max_batch_size_); SerializeValue(&buffer, max_batch_size_);
SerializeValue(&buffer, data_type_); SerializeValue(&buffer, data_type_);
SerializeValue(&buffer, data_format_); SerializeValue(&buffer, data_format_);
SerializeValue(&buffer, with_fp16_);
} }
void PluginTensorRT::deserializeBase(void const*& serial_data, void PluginTensorRT::deserializeBase(void const*& serial_data,
...@@ -32,11 +33,13 @@ void PluginTensorRT::deserializeBase(void const*& serial_data, ...@@ -32,11 +33,13 @@ void PluginTensorRT::deserializeBase(void const*& serial_data,
DeserializeValue(&serial_data, &serial_length, &max_batch_size_); DeserializeValue(&serial_data, &serial_length, &max_batch_size_);
DeserializeValue(&serial_data, &serial_length, &data_type_); DeserializeValue(&serial_data, &serial_length, &data_type_);
DeserializeValue(&serial_data, &serial_length, &data_format_); DeserializeValue(&serial_data, &serial_length, &data_format_);
DeserializeValue(&serial_data, &serial_length, &with_fp16_);
} }
size_t PluginTensorRT::getBaseSerializationSize() { size_t PluginTensorRT::getBaseSerializationSize() {
return (SerializedSize(input_dims_) + SerializedSize(max_batch_size_) + return (SerializedSize(input_dims_) + SerializedSize(max_batch_size_) +
SerializedSize(data_type_) + SerializedSize(data_format_)); SerializedSize(data_type_) + SerializedSize(data_format_) +
SerializedSize(with_fp16_));
} }
bool PluginTensorRT::supportsFormat(nvinfer1::DataType type, bool PluginTensorRT::supportsFormat(nvinfer1::DataType type,
......
...@@ -42,7 +42,7 @@ typedef std::function<PluginTensorRT*(void)> PluginConstructFunc; ...@@ -42,7 +42,7 @@ typedef std::function<PluginTensorRT*(void)> PluginConstructFunc;
class PluginTensorRT : public nvinfer1::IPluginExt { class PluginTensorRT : public nvinfer1::IPluginExt {
public: public:
PluginTensorRT() {} PluginTensorRT() : with_fp16_(false) {}
// It was used for TensorRT deserialization. // It was used for TensorRT deserialization.
// It should not be called by users. // It should not be called by users.
PluginTensorRT(const void* serialized_data, size_t length) {} PluginTensorRT(const void* serialized_data, size_t length) {}
...@@ -112,12 +112,13 @@ class PluginTensorRT : public nvinfer1::IPluginExt { ...@@ -112,12 +112,13 @@ class PluginTensorRT : public nvinfer1::IPluginExt {
nvinfer1::PluginFormat data_format_; nvinfer1::PluginFormat data_format_;
std::vector<nvinfer1::ITensor*> inputs_; std::vector<nvinfer1::ITensor*> inputs_;
bool with_fp16_;
}; };
#if IS_TRT_VERSION_GE(6000) #if IS_TRT_VERSION_GE(6000)
class DynamicPluginTensorRT : public nvinfer1::IPluginV2DynamicExt { class DynamicPluginTensorRT : public nvinfer1::IPluginV2DynamicExt {
public: public:
DynamicPluginTensorRT() {} DynamicPluginTensorRT() : with_fp16_(false) {}
DynamicPluginTensorRT(const void* serialized_data, size_t length) {} DynamicPluginTensorRT(const void* serialized_data, size_t length) {}
// The Func in IPluginExt or IpluginExtV2 // The Func in IPluginExt or IpluginExtV2
...@@ -173,6 +174,7 @@ class DynamicPluginTensorRT : public nvinfer1::IPluginV2DynamicExt { ...@@ -173,6 +174,7 @@ class DynamicPluginTensorRT : public nvinfer1::IPluginV2DynamicExt {
size_t& serial_length); // NOLINT size_t& serial_length); // NOLINT
size_t getBaseSerializationSize() const; size_t getBaseSerializationSize() const;
void serializeBase(void*& buffer) const; // NOLINT void serializeBase(void*& buffer) const; // NOLINT
bool with_fp16_;
private: private:
std::string name_space_; std::string name_space_;
......
...@@ -543,10 +543,19 @@ if(WITH_GPU AND TENSORRT_FOUND) ...@@ -543,10 +543,19 @@ if(WITH_GPU AND TENSORRT_FOUND)
inference_download_and_uncompress(${TEST_TRT_ERNIE_MODEL} ${INFERENCE_URL}/tensorrt_test "ernie_model_4_unserialized.tgz") inference_download_and_uncompress(${TEST_TRT_ERNIE_MODEL} ${INFERENCE_URL}/tensorrt_test "ernie_model_4_unserialized.tgz")
endif() endif()
inference_analysis_test(test_trt_dynamic_shape_ernie_ser_deser SRCS trt_dynamic_shape_ernie_deserialize_test.cc inference_analysis_test(test_trt_dynamic_shape_ernie_ser_deser SRCS trt_dynamic_shape_ernie_serialize_deserialize_test.cc
EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
ARGS --infer_model=${TEST_TRT_ERNIE_MODEL}/ernie_model_4_unserialized) ARGS --infer_model=${TEST_TRT_ERNIE_MODEL}/ernie_model_4_unserialized)
set(TEST_TRT_ERNIE_UNSER_FP16_MODEL "${TRT_MODEL_INSTALL_DIR}/ernie_test/ernie_model_4_fp16_unserialized/")
if (NOT EXISTS ${TEST_TRT_ERNIE_UNSER_FP16_MODEL}/ernie_model_4_fp16_unserialized.tgz)
inference_download_and_uncompress(${TEST_TRT_ERNIE_MODEL} ${INFERENCE_URL}/tensorrt_test "ernie_model_4_fp16_unserialized.tgz")
endif()
inference_analysis_test(test_trt_dynamic_shape_ernie_fp16_ser_deser SRCS trt_dynamic_shape_ernie_fp16_serialize_deserialize_test.cc
EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
ARGS --infer_model=${TEST_TRT_ERNIE_MODEL}/ernie_model_4_fp16_unserialized)
endif() endif()
set(LITE_MODEL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/lite") set(LITE_MODEL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/lite")
...@@ -597,6 +606,7 @@ if(WITH_GPU AND TENSORRT_FOUND) ...@@ -597,6 +606,7 @@ if(WITH_GPU AND TENSORRT_FOUND)
set_tests_properties(trt_resnet50_test PROPERTIES TIMEOUT 120) set_tests_properties(trt_resnet50_test PROPERTIES TIMEOUT 120)
set_tests_properties(trt_cascade_rcnn_test PROPERTIES TIMEOUT 120) set_tests_properties(trt_cascade_rcnn_test PROPERTIES TIMEOUT 120)
set_tests_properties(test_trt_dynamic_shape_ernie_ser_deser PROPERTIES TIMEOUT 120) set_tests_properties(test_trt_dynamic_shape_ernie_ser_deser PROPERTIES TIMEOUT 120)
set_tests_properties(test_trt_dynamic_shape_ernie_fp16_ser_deser PROPERTIES TIMEOUT 120)
set_tests_properties(test_trt_dynamic_shape_ernie PROPERTIES TIMEOUT 120) set_tests_properties(test_trt_dynamic_shape_ernie PROPERTIES TIMEOUT 120)
endif() endif()
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <dirent.h>
#include <gflags/gflags.h>
#include <glog/logging.h>
#include <gtest/gtest.h>
#include <unistd.h>
#include "paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_serialize_deserialize_test.h"
namespace paddle {
namespace inference {
TEST(AnalysisPredictor, fp16) {
std::vector<float> result = {0.59923654, 0.21923761, 0.18152587};
trt_ernie(true, result);
}
} // namespace inference
} // namespace paddle
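This test drives trt_ernie(true, result), which builds an AnalysisConfig whose TensorRT subgraph runs at Precision::kHalf. A hedged sketch of such a configuration, assuming AnalysisConfig::EnableTensorRtEngine takes (workspace, max batch, min subgraph size, precision, use_static, use_calib_mode); the numeric values and the MakeFp16TrtConfig helper name are illustrative, not what the shared test helper uses:

#include <string>

#include "paddle/fluid/inference/api/paddle_analysis_config.h"

// use_static = true asks Paddle to serialize the TensorRT engine to disk, which
// is what the ser/deser tests exercise; Precision::kHalf is what turns the
// plugins' with_fp16 flag on at runtime.
paddle::AnalysisConfig MakeFp16TrtConfig(const std::string& model_dir) {
  paddle::AnalysisConfig config;
  config.SetModel(model_dir);
  config.EnableUseGpu(100 /* initial GPU memory pool, MB */, 0 /* device id */);
  config.EnableTensorRtEngine(1 << 30 /* workspace bytes */, 1 /* max batch */,
                              5 /* min subgraph size */,
                              paddle::AnalysisConfig::Precision::kHalf,
                              true /* use_static */, false /* use_calib_mode */);
  return config;
}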
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <dirent.h>
#include <gflags/gflags.h>
#include <glog/logging.h>
#include <gtest/gtest.h>
#include <unistd.h>
#include "paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_serialize_deserialize_test.h"
namespace paddle {
namespace inference {
TEST(AnalysisPredictor, no_fp16) {
std::vector<float> result = {0.597841, 0.219972, 0.182187};
trt_ernie(false, result);
}
} // namespace inference
} // namespace paddle
...@@ -11,19 +11,23 @@ distributed under the License is distributed on an "AS IS" BASIS, ...@@ -11,19 +11,23 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once
#include <dirent.h> #include <dirent.h>
#include <gflags/gflags.h> #include <gflags/gflags.h>
#include <glog/logging.h> #include <glog/logging.h>
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include <unistd.h> #include <unistd.h>
#include <functional>
#include <map>
#include <string>
#include <vector>
#include "paddle/fluid/inference/tests/api/trt_test_helper.h" #include "paddle/fluid/inference/tests/api/trt_test_helper.h"
namespace paddle { namespace paddle {
namespace inference { namespace inference {
int DeleteCache(std::string path) { static int DeleteCache(std::string path) {
DIR* dir = opendir(path.c_str()); DIR* dir = opendir(path.c_str());
if (dir == NULL) return 0; if (dir == NULL) return 0;
struct dirent* ptr; struct dirent* ptr;
...@@ -39,7 +43,7 @@ int DeleteCache(std::string path) { ...@@ -39,7 +43,7 @@ int DeleteCache(std::string path) {
return 0; return 0;
} }
void run(const AnalysisConfig& config, std::vector<float>* out_data) { static void run(const AnalysisConfig& config, std::vector<float>* out_data) {
auto predictor = CreatePaddlePredictor(config); auto predictor = CreatePaddlePredictor(config);
auto input_names = predictor->GetInputNames(); auto input_names = predictor->GetInputNames();
...@@ -101,7 +105,7 @@ void run(const AnalysisConfig& config, std::vector<float>* out_data) { ...@@ -101,7 +105,7 @@ void run(const AnalysisConfig& config, std::vector<float>* out_data) {
output_t->copy_to_cpu(out_data->data()); output_t->copy_to_cpu(out_data->data());
} }
void trt_ernie(bool with_fp16, std::vector<float> result) { static void trt_ernie(bool with_fp16, std::vector<float> result) {
AnalysisConfig config; AnalysisConfig config;
std::string model_dir = FLAGS_infer_model; std::string model_dir = FLAGS_infer_model;
// Delete serialization cache to perform serialization first rather than // Delete serialization cache to perform serialization first rather than
...@@ -155,15 +159,5 @@ void trt_ernie(bool with_fp16, std::vector<float> result) { ...@@ -155,15 +159,5 @@ void trt_ernie(bool with_fp16, std::vector<float> result) {
} }
} }
TEST(AnalysisPredictor, no_fp16) {
std::vector<float> result = {0.597841, 0.219972, 0.182187};
trt_ernie(false, result);
}
#ifdef SUPPORTS_CUDA_FP16
TEST(AnalysisPredictor, fp16) {
std::vector<float> result = {0.59923654, 0.21923761, 0.18152587};
trt_ernie(true, result);
}
#endif // SUPPORTS_CUDA_FP16
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
...@@ -83,7 +83,8 @@ void run(const AnalysisConfig& config, std::vector<float>* out_data) { ...@@ -83,7 +83,8 @@ void run(const AnalysisConfig& config, std::vector<float>* out_data) {
output_t->copy_to_cpu(out_data->data()); output_t->copy_to_cpu(out_data->data());
} }
void trt_ernie(bool with_fp16, std::vector<float> result) { void trt_ernie(bool with_fp16, std::vector<float> result,
float near_tolerance) {
AnalysisConfig config; AnalysisConfig config;
std::string model_dir = FLAGS_infer_model; std::string model_dir = FLAGS_infer_model;
SetConfig(&config, model_dir, true); SetConfig(&config, model_dir, true);
...@@ -126,19 +127,19 @@ void trt_ernie(bool with_fp16, std::vector<float> result) { ...@@ -126,19 +127,19 @@ void trt_ernie(bool with_fp16, std::vector<float> result) {
run(config, &out_data); run(config, &out_data);
for (size_t i = 0; i < out_data.size(); i++) { for (size_t i = 0; i < out_data.size(); i++) {
EXPECT_NEAR(result[i], out_data[i], 1e-5); EXPECT_NEAR(result[i], out_data[i], near_tolerance);
} }
} }
TEST(AnalysisPredictor, no_fp16) { TEST(AnalysisPredictor, no_fp16) {
std::vector<float> result = {0.597841, 0.219972, 0.182187}; std::vector<float> result = {0.597841, 0.219972, 0.182187};
trt_ernie(false, result); trt_ernie(false, result, 1e-5);
} }
TEST(AnalysisPredictor, fp16) { TEST(AnalysisPredictor, fp16) {
#ifdef SUPPORTS_CUDA_FP16 #ifdef TRT_PLUGIN_FP16_AVALIABLE
std::vector<float> result = {0.598336, 0.219558, 0.182106}; std::vector<float> result = {0.598, 0.219, 0.182};
trt_ernie(true, result); trt_ernie(true, result, 3e-3);
#endif #endif
} }
......
...@@ -145,6 +145,50 @@ __global__ void EmbEltwiseLayernormKernel(int hidden, const int64_t *ids, ...@@ -145,6 +145,50 @@ __global__ void EmbEltwiseLayernormKernel(int hidden, const int64_t *ids,
LayerNorm<T, TPB>(thread_data, hidden, out_offset, bias, scale, output, eps); LayerNorm<T, TPB>(thread_data, hidden, out_offset, bias, scale, output, eps);
} }
template <>
__global__ void EmbEltwiseLayernormKernel<half, 256>(
int hidden, const int64_t *ids, const float *scale, const float *bias,
const int64_t *embs, half *output, float eps, int input_num) {
#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__)
cub::Sum pair_sum;
// blockIdx.x: position in the sequence
// blockIdx.y: batch
// gridDim.x: Seq
// gridDim.y: Batch
extern __shared__ int64_t array_id[];
const half rhidden = half(1.f) / half(hidden);
const int64_t seq_pos = blockIdx.y + blockIdx.x * gridDim.y;
if (threadIdx.x == 0) {
for (int i = 0; i < input_num; ++i) {
const int64_t *ids_p = reinterpret_cast<const int64_t *>(ids[i]);
array_id[i] = ids_p[seq_pos];
}
}
__syncthreads();
const int64_t out_offset = seq_pos * hidden;
kvp<half> thread_data(0, 0);
#pragma unroll
for (int it = threadIdx.x; it < hidden; it += 256) {
half val = 0;
for (int i = 0; i < input_num; ++i) {
val += reinterpret_cast<const half *>(embs[i])[array_id[i] * hidden + it];
}
output[out_offset + it] = val;
const half rhiddenval = rhidden * val;
thread_data =
pair_sum(thread_data, kvp<half>(rhiddenval, rhiddenval * val));
}
LayerNorm<half, 256>(thread_data, hidden, out_offset, bias, scale, output,
eps);
#endif
}
template <typename T> template <typename T>
void EmbEltwiseLayerNormFunctor<T>::operator()( void EmbEltwiseLayerNormFunctor<T>::operator()(
int batch, int seq_len, int hidden, const int64_t *ids, const float *scale, int batch, int seq_len, int hidden, const int64_t *ids, const float *scale,
...@@ -160,7 +204,8 @@ void EmbEltwiseLayerNormFunctor<T>::operator()( ...@@ -160,7 +204,8 @@ void EmbEltwiseLayerNormFunctor<T>::operator()(
template class EmbEltwiseLayerNormFunctor<float>; template class EmbEltwiseLayerNormFunctor<float>;
#ifdef SUPPORTS_CUDA_FP16 // device function 'operator()' is not supported until CUDA 10.0
#if CUDA_VERSION >= 10000
template class EmbEltwiseLayerNormFunctor<half>; template class EmbEltwiseLayerNormFunctor<half>;
#endif #endif
...@@ -185,6 +230,28 @@ __global__ void SoftmaxKernelWithEltadd(T *qk_buf_, const T *bias_qk_, ...@@ -185,6 +230,28 @@ __global__ void SoftmaxKernelWithEltadd(T *qk_buf_, const T *bias_qk_,
qk_buf_[threadIdx.x + qk_offset] = (T)(qk_tmp / sum_val); qk_buf_[threadIdx.x + qk_offset] = (T)(qk_tmp / sum_val);
} }
template <>
__global__ void SoftmaxKernelWithEltadd<half>(
half *qk_buf_, const half *bias_qk_, const int batch_size,
const int head_num, const int seq_len, const unsigned mask) {
#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__)
int qk_offset = blockIdx.x * seq_len;
assert(blockDim.x % 32 == 0);
float tmp = threadIdx.x < seq_len
? static_cast<float>(qk_buf_[threadIdx.x + qk_offset] +
bias_qk_[threadIdx.x + qk_offset])
: -1e20f;
float max_val = blockReduceMax<float>(tmp, mask);
float qk_tmp = threadIdx.x < seq_len ? __expf(tmp - max_val) : 0.0f;
float sum_val = blockReduceSum<float>(qk_tmp, mask);
if (threadIdx.x < seq_len)
qk_buf_[threadIdx.x + qk_offset] = (half)(qk_tmp / sum_val);
#endif
}
template <typename T> template <typename T>
__global__ void SoftmaxKernelWithEltadd2(T *qk_buf_, const T *bias_qk_, __global__ void SoftmaxKernelWithEltadd2(T *qk_buf_, const T *bias_qk_,
const int batch_size, const int batch_size,
...@@ -210,6 +277,32 @@ __global__ void SoftmaxKernelWithEltadd2(T *qk_buf_, const T *bias_qk_, ...@@ -210,6 +277,32 @@ __global__ void SoftmaxKernelWithEltadd2(T *qk_buf_, const T *bias_qk_,
} }
} }
template <>
__global__ void SoftmaxKernelWithEltadd2<half2>(
half2 *qk_buf_, const half2 *bias_qk_, const int batch_size,
const int head_num, const int seq_len, const unsigned mask) {
// operator "+" of half only suppotted after cuda version 10.0
#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__) && CUDA_VERSION >= 10000
int qk_offset = blockIdx.x * seq_len;
int idx = threadIdx.x;
assert(blockDim.x % 32 == 0);
float2 tmp = idx < seq_len ? ToFloat2<half2>(qk_buf_[idx + qk_offset] +
bias_qk_[idx + qk_offset])
: make_float2(-1e20f, -1e20f);
float max_val = blockReduceMax<float>(max(tmp.x, tmp.y), mask);
float2 qk_tmp = idx < seq_len ? make_float2(__expf(tmp.x - max_val),
__expf(tmp.y - max_val))
: make_float2(0.f, 0.f);
float sum_val = blockReduceSum<float>(qk_tmp.x + qk_tmp.y, mask) + 1e-6f;
if (idx < seq_len) {
qk_buf_[idx + qk_offset] =
FloatsToPair<half2>(qk_tmp.x / sum_val, qk_tmp.y / sum_val);
}
#endif
}
template <typename T> template <typename T>
inline void MatMulWithHeadQK(const platform::CUDADeviceContext &context, inline void MatMulWithHeadQK(const platform::CUDADeviceContext &context,
int head_num, int seq_len, int size_per_head, int head_num, int seq_len, int size_per_head,
...@@ -241,21 +334,17 @@ inline void MatMulWithHeadQK(const platform::CUDADeviceContext &context, ...@@ -241,21 +334,17 @@ inline void MatMulWithHeadQK(const platform::CUDADeviceContext &context,
seq_len)); seq_len));
if (seq_len % 2 == 0) { if (seq_len % 2 == 0) {
block = (seq_len <= 64) ? 32 : ((seq_len + 63) / 64) * 32; block = (seq_len <= 64) ? 32 : ((seq_len + 63) / 64) * 32;
#ifdef SUPPORTS_CUDA_FP16
if (std::is_same<T, float>::value) { if (std::is_same<T, float>::value) {
#endif
SoftmaxKernelWithEltadd2<float2><<<grid, block, 0, stream>>>( SoftmaxKernelWithEltadd2<float2><<<grid, block, 0, stream>>>(
reinterpret_cast<float2 *>(qk_buf_), reinterpret_cast<float2 *>(qk_buf_),
reinterpret_cast<const float2 *>(bias_qk), batch_size, head_num, reinterpret_cast<const float2 *>(bias_qk), batch_size, head_num,
seq_len / 2, FINAL_MASK); seq_len / 2, FINAL_MASK);
#ifdef SUPPORTS_CUDA_FP16
} else { } else {
SoftmaxKernelWithEltadd2<__half2><<<grid, block, 0, stream>>>( SoftmaxKernelWithEltadd2<__half2><<<grid, block, 0, stream>>>(
reinterpret_cast<__half2 *>(qk_buf_), reinterpret_cast<__half2 *>(qk_buf_),
reinterpret_cast<const __half2 *>(bias_qk), batch_size, head_num, reinterpret_cast<const __half2 *>(bias_qk), batch_size, head_num,
seq_len / 2, FINAL_MASK); seq_len / 2, FINAL_MASK);
} }
#endif
} else { } else {
block = (seq_len <= 32) ? 32 : ((seq_len + 31) / 32) * 32; block = (seq_len <= 32) ? 32 : ((seq_len + 31) / 32) * 32;
SoftmaxKernelWithEltadd<T><<<grid, block, 0, stream>>>( SoftmaxKernelWithEltadd<T><<<grid, block, 0, stream>>>(
...@@ -308,7 +397,8 @@ void MultiHeadGPUComputeFunctor<T>::operator()( ...@@ -308,7 +397,8 @@ void MultiHeadGPUComputeFunctor<T>::operator()(
template class MultiHeadGPUComputeFunctor<float>; template class MultiHeadGPUComputeFunctor<float>;
#ifdef SUPPORTS_CUDA_FP16 // device function 'operator()' is not supported until CUDA 10.0
#if CUDA_VERSION >= 10000
template class MultiHeadGPUComputeFunctor<half>; template class MultiHeadGPUComputeFunctor<half>;
#endif #endif
...@@ -332,6 +422,69 @@ __global__ void SkipLayerNormSmallKernel(int num, int hidden, const T *input1, ...@@ -332,6 +422,69 @@ __global__ void SkipLayerNormSmallKernel(int num, int hidden, const T *input1,
eps); eps);
} }
template <>
__global__ void SkipLayerNormSmallKernel<half, 32>(
int num, int hidden, const half *input1, const half *input2, half *output,
const float *scale, const float *bias, float eps) {
#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__)
const half rld = half(1) / half(hidden);
const int offset = blockIdx.x * hidden;
cub::Sum pair_sum;
kvp<half> thread_data(0, 0);
const int idx = offset + threadIdx.x;
half val = 0;
if (threadIdx.x < hidden) {
val = input1[idx] + input2[idx];
const half rldval = rld * val;
thread_data = pair_sum(thread_data, kvp<half>(rldval, rldval * val));
}
LayerNormSmall<half, 32>(val, thread_data, hidden, idx, bias, scale, output,
eps);
#endif
}
template <>
__global__ void SkipLayerNormSmallKernel<half, 128>(
int num, int hidden, const half *input1, const half *input2, half *output,
const float *scale, const float *bias, float eps) {
#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__)
const half rld = half(1) / half(hidden);
const int offset = blockIdx.x * hidden;
cub::Sum pair_sum;
kvp<half> thread_data(0, 0);
const int idx = offset + threadIdx.x;
half val = 0;
if (threadIdx.x < hidden) {
val = input1[idx] + input2[idx];
const half rldval = rld * val;
thread_data = pair_sum(thread_data, kvp<half>(rldval, rldval * val));
}
LayerNormSmall<half, 128>(val, thread_data, hidden, idx, bias, scale, output,
eps);
#endif
}
template <>
__global__ void SkipLayerNormSmallKernel<half, 384>(
int num, int hidden, const half *input1, const half *input2, half *output,
const float *scale, const float *bias, float eps) {
#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__)
const half rld = half(1) / half(hidden);
const int offset = blockIdx.x * hidden;
cub::Sum pair_sum;
kvp<half> thread_data(0, 0);
const int idx = offset + threadIdx.x;
half val = 0;
if (threadIdx.x < hidden) {
val = input1[idx] + input2[idx];
const half rldval = rld * val;
thread_data = pair_sum(thread_data, kvp<half>(rldval, rldval * val));
}
LayerNormSmall<half, 384>(val, thread_data, hidden, idx, bias, scale, output,
eps);
#endif
}
template <typename T, unsigned TPB> template <typename T, unsigned TPB>
__global__ void SkipLayerNormKernel(int num, int hidden, const T *input1, __global__ void SkipLayerNormKernel(int num, int hidden, const T *input1,
const T *input2, T *output, const T *input2, T *output,
...@@ -352,6 +505,29 @@ __global__ void SkipLayerNormKernel(int num, int hidden, const T *input1, ...@@ -352,6 +505,29 @@ __global__ void SkipLayerNormKernel(int num, int hidden, const T *input1,
LayerNorm<T, TPB>(thread_data, hidden, offset, bias, scale, output, eps); LayerNorm<T, TPB>(thread_data, hidden, offset, bias, scale, output, eps);
} }
template <>
__global__ void SkipLayerNormKernel<half, 256>(int num, int hidden,
const half *input1,
const half *input2, half *output,
const float *scale,
const float *bias, float eps) {
#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__)
const half rld = half(1) / half(hidden);
const int offset = blockIdx.x * hidden;
cub::Sum pair_sum;
kvp<half> thread_data(0, 0);
for (int it = threadIdx.x; it < hidden; it += 256) {
const int idx = offset + it;
const half val = input1[idx] + input2[idx];
const half rldval = rld * val;
thread_data = pair_sum(thread_data, kvp<half>(rldval, rldval * val));
output[idx] = val;
}
LayerNorm<half, 256>(thread_data, hidden, offset, bias, scale, output, eps);
#endif
}
template <typename T, typename T2, unsigned TPB> template <typename T, typename T2, unsigned TPB>
__global__ void SkipLayerNormKernel2(int num, int hidden, const T2 *input1, __global__ void SkipLayerNormKernel2(int num, int hidden, const T2 *input1,
const T2 *input2, T2 *output, const T2 *input2, T2 *output,
...@@ -373,6 +549,30 @@ __global__ void SkipLayerNormKernel2(int num, int hidden, const T2 *input1, ...@@ -373,6 +549,30 @@ __global__ void SkipLayerNormKernel2(int num, int hidden, const T2 *input1,
LayerNorm2<T, T2, TPB>(thread_data, hidden, offset, bias, scale, output, eps); LayerNorm2<T, T2, TPB>(thread_data, hidden, offset, bias, scale, output, eps);
} }
template <>
__global__ void SkipLayerNormKernel2<half, half2, 256>(
int num, int hidden, const half2 *input1, const half2 *input2,
half2 *output, const float2 *scale, const float2 *bias, float eps) {
// operator "+" of half only suppotted after cuda version 10.0
#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__) && CUDA_VERSION >= 10000
const half rld = half(0.5f / hidden); // because hidden is hidden/2
const int offset = blockIdx.x * hidden;
cub::Sum pair_sum;
kvp<half> thread_data(0, 0);
for (int it = threadIdx.x; it < hidden; it += 256) {
const int idx = offset + it;
const half2 val2 = input1[idx] + input2[idx];
thread_data = pair_sum(
thread_data, kvp<half>(rld * (val2.x + val2.y),
rld * val2.x * val2.x + rld * val2.y * val2.y));
output[idx] = val2;
}
LayerNorm2<half, half2, 256>(thread_data, hidden, offset, bias, scale, output,
eps);
#endif
}
template <typename T> template <typename T>
void SkipLayerNormFunctor<T>::operator()(const int num, const int hidden, void SkipLayerNormFunctor<T>::operator()(const int num, const int hidden,
const T *input1, const T *input2, const T *input1, const T *input2,
...@@ -395,9 +595,7 @@ void SkipLayerNormFunctor<T>::operator()(const int num, const int hidden, ...@@ -395,9 +595,7 @@ void SkipLayerNormFunctor<T>::operator()(const int num, const int hidden,
} else { } else {
const int threads = 256; const int threads = 256;
if (hidden % 2 == 0) { if (hidden % 2 == 0) {
#ifdef SUPPORTS_CUDA_FP16
if (std::is_same<T, float>::value) { if (std::is_same<T, float>::value) {
#endif
SkipLayerNormKernel2<float, float2, SkipLayerNormKernel2<float, float2,
threads><<<block, threads, 0, stream>>>( threads><<<block, threads, 0, stream>>>(
num, hidden / 2, reinterpret_cast<const float2 *>(input1), num, hidden / 2, reinterpret_cast<const float2 *>(input1),
...@@ -405,7 +603,6 @@ void SkipLayerNormFunctor<T>::operator()(const int num, const int hidden, ...@@ -405,7 +603,6 @@ void SkipLayerNormFunctor<T>::operator()(const int num, const int hidden,
reinterpret_cast<float2 *>(output), reinterpret_cast<float2 *>(output),
reinterpret_cast<const float2 *>(scale), reinterpret_cast<const float2 *>(scale),
reinterpret_cast<const float2 *>(bias), eps); reinterpret_cast<const float2 *>(bias), eps);
#ifdef SUPPORTS_CUDA_FP16
} else if (std::is_same<T, __half>::value) { } else if (std::is_same<T, __half>::value) {
SkipLayerNormKernel2<__half, __half2, SkipLayerNormKernel2<__half, __half2,
threads><<<block, threads, 0, stream>>>( threads><<<block, threads, 0, stream>>>(
...@@ -418,7 +615,6 @@ void SkipLayerNormFunctor<T>::operator()(const int num, const int hidden, ...@@ -418,7 +615,6 @@ void SkipLayerNormFunctor<T>::operator()(const int num, const int hidden,
assert(false); assert(false);
// should not be here // should not be here
} }
#endif
} else { } else {
SkipLayerNormKernel<T, threads><<<block, threads, 0, stream>>>( SkipLayerNormKernel<T, threads><<<block, threads, 0, stream>>>(
num, hidden, input1, input2, output, scale, bias, eps); num, hidden, input1, input2, output, scale, bias, eps);
...@@ -428,7 +624,8 @@ void SkipLayerNormFunctor<T>::operator()(const int num, const int hidden, ...@@ -428,7 +624,8 @@ void SkipLayerNormFunctor<T>::operator()(const int num, const int hidden,
template class SkipLayerNormFunctor<float>; template class SkipLayerNormFunctor<float>;
#ifdef SUPPORTS_CUDA_FP16 // device function 'operator()' is not supported until CUDA 10.0
#if CUDA_VERSION >= 10000
template class SkipLayerNormFunctor<half>; template class SkipLayerNormFunctor<half>;
#endif #endif
......
...@@ -26,12 +26,10 @@ namespace math { ...@@ -26,12 +26,10 @@ namespace math {
template <typename T> template <typename T>
struct CUDATypeTraits; struct CUDATypeTraits;
#ifdef SUPPORTS_CUDA_FP16
template <> template <>
struct CUDATypeTraits<half> { struct CUDATypeTraits<half> {
typedef platform::float16 TYPE; typedef platform::float16 TYPE;
}; };
#endif
template <> template <>
struct CUDATypeTraits<float> { struct CUDATypeTraits<float> {
......
...@@ -47,12 +47,10 @@ __device__ __forceinline__ float FromFloat<float>(float a) { ...@@ -47,12 +47,10 @@ __device__ __forceinline__ float FromFloat<float>(float a) {
return a; return a;
} }
#ifdef SUPPORTS_CUDA_FP16
template <> template <>
__device__ __forceinline__ half FromFloat<half>(float a) { __device__ __forceinline__ half FromFloat<half>(float a) {
return __float2half(a); return __float2half(a);
} }
#endif
// to_float // to_float
template <> template <>
...@@ -75,7 +73,6 @@ __inline__ __device__ float2 operator+(const float2 &a, const float2 &b) { ...@@ -75,7 +73,6 @@ __inline__ __device__ float2 operator+(const float2 &a, const float2 &b) {
return make_float2(a.x + b.x, a.y + b.y); return make_float2(a.x + b.x, a.y + b.y);
} }
#ifdef SUPPORTS_CUDA_FP16
template <> template <>
__device__ __forceinline__ float ToFloat<half>(half a) { __device__ __forceinline__ float ToFloat<half>(half a) {
return __half2float(a); return __half2float(a);
...@@ -91,23 +88,20 @@ __device__ __forceinline__ __half2 FloatsToPair<__half2>(const float a, ...@@ -91,23 +88,20 @@ __device__ __forceinline__ __half2 FloatsToPair<__half2>(const float a,
const float b) { const float b) {
return __floats2half2_rn(a, b); return __floats2half2_rn(a, b);
} }
#endif
template <> template <>
__device__ __forceinline__ float exp_func<float>(float a) { __device__ __forceinline__ float exp_func<float>(float a) {
return expf(a); return expf(a);
} }
#ifdef SUPPORTS_CUDA_FP16
template <> template <>
__device__ __forceinline__ half exp_func<half>(half a) { __device__ __forceinline__ half exp_func<half>(half a) {
#if __CUDA_ARCH__ >= 600 #if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__)
return hexp(a); return hexp(a);
#else #else
return FromFloat<half>(expf(ToFloat<half>(a))); return FromFloat<half>(expf(ToFloat<half>(a)));
#endif #endif
} }
#endif
template <> template <>
struct KeyValuePair<float> { struct KeyValuePair<float> {
...@@ -129,7 +123,6 @@ struct KeyValuePair<float> { ...@@ -129,7 +123,6 @@ struct KeyValuePair<float> {
} }
}; };
#ifdef SUPPORTS_CUDA_FP16
template <> template <>
struct KeyValuePair<half> { struct KeyValuePair<half> {
__device__ __forceinline__ KeyValuePair() {} __device__ __forceinline__ KeyValuePair() {}
...@@ -144,11 +137,20 @@ struct KeyValuePair<half> { ...@@ -144,11 +137,20 @@ struct KeyValuePair<half> {
operator+(const KeyValuePair &a) const { operator+(const KeyValuePair &a) const {
const half2 a2 = __halves2half2(key, value); const half2 a2 = __halves2half2(key, value);
const half2 b2 = __halves2half2(a.key, a.value); const half2 b2 = __halves2half2(a.key, a.value);
#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__)
const half2 res = __hadd2(a2, b2); const half2 res = __hadd2(a2, b2);
#else
    // no native half2 add on this architecture: fall back to fp32 arithmetic
float a2_1 = __low2float(a2);
float a2_2 = __high2float(a2);
float b2_1 = __low2float(b2);
float b2_2 = __high2float(b2);
float r1 = a2_1 + b2_1;
float r2 = a2_2 + b2_2;
const half2 res = __floats2half2_rn(r1, r2);
#endif
return KeyValuePair(res.x, res.y); return KeyValuePair(res.x, res.y);
} }
}; };
#endif
#define FINAL_MASK 0xffffffff #define FINAL_MASK 0xffffffff
#define HALF_WARP 16 #define HALF_WARP 16
......
...@@ -44,6 +44,8 @@ limitations under the License. */ ...@@ -44,6 +44,8 @@ limitations under the License. */
#define PADDLE_ALIGN(x) __declspec(align(x)) #define PADDLE_ALIGN(x) __declspec(align(x))
#endif #endif
// fp16 arithmetic is treated as supported on compute capability >= 6.0 (Pascal)
#define CUDA_ARCH_FP16_SUPPORTED(CUDA_ARCH) (CUDA_ARCH >= 600)
namespace paddle { namespace paddle {
namespace platform { namespace platform {
......
...@@ -37,6 +37,7 @@ class InferencePassTest(unittest.TestCase): ...@@ -37,6 +37,7 @@ class InferencePassTest(unittest.TestCase):
def __init__(self, methodName='runTest'): def __init__(self, methodName='runTest'):
paddle.enable_static() paddle.enable_static()
super(InferencePassTest, self).__init__(methodName) super(InferencePassTest, self).__init__(methodName)
paddle.enable_static()
self.main_program = fluid.Program() self.main_program = fluid.Program()
self.startup_program = fluid.Program() self.startup_program = fluid.Program()
self.feeds = None self.feeds = None
...@@ -46,6 +47,7 @@ class InferencePassTest(unittest.TestCase): ...@@ -46,6 +47,7 @@ class InferencePassTest(unittest.TestCase):
self.enable_mkldnn_bfloat16 = False self.enable_mkldnn_bfloat16 = False
self.enable_trt = False self.enable_trt = False
self.trt_parameters = None self.trt_parameters = None
self.dynamic_shape_params = None
self.enable_lite = False self.enable_lite = False
self.lite_parameters = None self.lite_parameters = None
self.path = "./inference_pass/" + self.__class__.__name__ + "/" self.path = "./inference_pass/" + self.__class__.__name__ + "/"
...@@ -124,6 +126,14 @@ class InferencePassTest(unittest.TestCase): ...@@ -124,6 +126,14 @@ class InferencePassTest(unittest.TestCase):
self.trt_parameters.precision, self.trt_parameters.precision,
self.trt_parameters.use_static, self.trt_parameters.use_static,
self.trt_parameters.use_calib_mode) self.trt_parameters.use_calib_mode)
if self.dynamic_shape_params:
config.set_trt_dynamic_shape_info(
self.dynamic_shape_params.min_input_shape,
self.dynamic_shape_params.max_input_shape,
self.dynamic_shape_params.optim_input_shape,
self.dynamic_shape_params.disable_trt_plugin_fp16)
elif use_mkldnn: elif use_mkldnn:
config.enable_mkldnn() config.enable_mkldnn()
if self.enable_mkldnn_bfloat16: if self.enable_mkldnn_bfloat16:
...@@ -229,6 +239,12 @@ class InferencePassTest(unittest.TestCase): ...@@ -229,6 +239,12 @@ class InferencePassTest(unittest.TestCase):
self._get_analysis_config( self._get_analysis_config(
use_gpu=use_gpu, use_trt=self.enable_trt)) use_gpu=use_gpu, use_trt=self.enable_trt))
if self.trt_parameters.use_static:
# deserialize: run inference again so the engine serialized by the first run is loaded
tensorrt_outputs = self._get_analysis_outputs(
self._get_analysis_config(
use_gpu=use_gpu, use_trt=self.enable_trt))
self.assertTrue( self.assertTrue(
len(tensorrt_outputs) == len(outs), len(tensorrt_outputs) == len(outs),
"The number of outputs is different between GPU and TensorRT. ") "The number of outputs is different between GPU and TensorRT. ")
...@@ -276,6 +292,18 @@ class InferencePassTest(unittest.TestCase): ...@@ -276,6 +292,18 @@ class InferencePassTest(unittest.TestCase):
self.use_static = use_static self.use_static = use_static
self.use_calib_mode = use_calib_mode self.use_calib_mode = use_calib_mode
class DynamicShapeParam:
'''
Prepare TensorRT subgraph engine dynamic shape parameters.
'''
def __init__(self, min_input_shape, max_input_shape, optim_input_shape,
disable_trt_plugin_fp16):
self.min_input_shape = min_input_shape
self.max_input_shape = max_input_shape
self.optim_input_shape = optim_input_shape
self.disable_trt_plugin_fp16 = disable_trt_plugin_fp16
class LiteParam: class LiteParam:
''' '''
Prepare Lite subgraph engine parameters. Prepare Lite subgraph engine parameters.
......
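The two hooks added above — `trt_parameters` with `use_static=True` and the new `dynamic_shape_params` — are both consumed by `check_output_with_option`. The sketch below (not part of this commit) shows how a derived test could wire them together; the class name, network, and input shapes are illustrative only.

```python
# Hypothetical example (not part of this commit): combine engine serialization
# (use_static=True) with the new DynamicShapeParam hook in one derived test.
import numpy as np
import paddle.fluid as fluid
from paddle.fluid import core
from paddle.fluid.core import AnalysisConfig
from inference_pass_test import InferencePassTest


class ExampleDynamicFp16SerializeTest(InferencePassTest):  # illustrative name
    def setUp(self):
        with fluid.program_guard(self.main_program, self.startup_program):
            data = fluid.data(
                name="data", shape=[-1, 6, 64, 64], dtype="float32")
            out = fluid.layers.batch_norm(
                fluid.layers.swish(data), is_test=True)
        self.feeds = {
            "data": np.random.random([1, 6, 64, 64]).astype("float32"),
        }
        self.enable_trt = True
        # workspace, max_batch, min_subgraph, precision, use_static, use_calib
        self.trt_parameters = InferencePassTest.TensorRTParam(
            1 << 30, 32, 0, AnalysisConfig.Precision.Half, True, False)
        # min / max / optim input shapes, disable_trt_plugin_fp16
        self.dynamic_shape_params = InferencePassTest.DynamicShapeParam(
            {'data': [1, 6, 8, 8]}, {'data': [1, 6, 512, 512]},
            {'data': [1, 6, 256, 256]}, False)
        self.fetch_list = [out]

    def test_check_output(self):
        if core.is_compiled_with_cuda():
            self.check_output_with_option(True)
```

With `use_static=True`, `check_output_with_option` now runs the TensorRT predictor a second time, so the engine serialized by the first run is deserialized and verified as well.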
...@@ -23,44 +23,25 @@ from paddle.fluid.core import AnalysisConfig ...@@ -23,44 +23,25 @@ from paddle.fluid.core import AnalysisConfig
#normal starts && ends #normal starts && ends
class SlicePluginTRTTest1(InferencePassTest): class SlicePluginTRTTest(InferencePassTest):
def setUp(self): def setUpSliceParams(self):
with fluid.program_guard(self.main_program, self.startup_program): self.params_axes = [1, 3]
data = fluid.data(name="data", shape=[3, 3, 3, 3], dtype="float32") self.params_starts = [0, 1]
axes = [1, 3] self.params_ends = [2, 3]
starts = [0, 1]
ends = [2, 3] def setUpTensorRTParams(self):
slice_out = fluid.layers.slice( self.trt_parameters = SlicePluginTRTTest.TensorRTParam(
data, axes=axes, starts=starts, ends=ends)
out = fluid.layers.batch_norm(slice_out, is_test=True)
self.feeds = {
"data": np.random.random((3, 3, 3, 3)).astype("float32"),
}
# Diff occurred between GPU and TRT.
# In order to provide TRT CI ASAP, this test for trt part
# is disabled temporarily.
self.enable_trt = True
self.trt_parameters = SlicePluginTRTTest1.TensorRTParam(
1 << 30, 32, 1, AnalysisConfig.Precision.Float32, False, False) 1 << 30, 32, 1, AnalysisConfig.Precision.Float32, False, False)
self.fetch_list = [out] self.enable_trt = True
def test_check_output(self):
use_gpu = [False]
if core.is_compiled_with_cuda():
use_gpu.append(True)
for i in range(len(use_gpu)):
self.check_output_with_option(use_gpu[i])
#negative starts && ends
class SlicePluginTRTTest2(InferencePassTest):
def setUp(self): def setUp(self):
self.setUpSliceParams()
self.setUpTensorRTParams()
with fluid.program_guard(self.main_program, self.startup_program): with fluid.program_guard(self.main_program, self.startup_program):
data = fluid.data(name="data", shape=[3, 3, 3, 3], dtype="float32") data = fluid.data(name="data", shape=[3, 3, 3, 3], dtype="float32")
axes = [2, 3] axes = self.params_axes
starts = [-3, -2] starts = self.params_starts
ends = [-1, 3] ends = self.params_ends
slice_out = fluid.layers.slice( slice_out = fluid.layers.slice(
data, axes=axes, starts=starts, ends=ends) data, axes=axes, starts=starts, ends=ends)
out = fluid.layers.batch_norm(slice_out, is_test=True) out = fluid.layers.batch_norm(slice_out, is_test=True)
...@@ -68,12 +49,6 @@ class SlicePluginTRTTest2(InferencePassTest): ...@@ -68,12 +49,6 @@ class SlicePluginTRTTest2(InferencePassTest):
self.feeds = { self.feeds = {
"data": np.random.random((3, 3, 3, 3)).astype("float32"), "data": np.random.random((3, 3, 3, 3)).astype("float32"),
} }
# Diff occurred between GPU and TRT.
# In order to provide TRT CI ASAP, this test for trt part
# is disabled temporarily.
self.enable_trt = True
self.trt_parameters = SlicePluginTRTTest2.TensorRTParam(
1 << 30, 32, 1, AnalysisConfig.Precision.Float32, False, False)
self.fetch_list = [out] self.fetch_list = [out]
def test_check_output(self): def test_check_output(self):
...@@ -84,66 +59,28 @@ class SlicePluginTRTTest2(InferencePassTest): ...@@ -84,66 +59,28 @@ class SlicePluginTRTTest2(InferencePassTest):
self.check_output_with_option(use_gpu[i]) self.check_output_with_option(use_gpu[i])
#exceeded bound starts && ends #negative starts && ends
class SlicePluginTRTTest3(InferencePassTest): class SlicePluginTRTTestNegativeStartsAndEnds(SlicePluginTRTTest):
def setUp(self): def setUpSliceParams(self):
with fluid.program_guard(self.main_program, self.startup_program): self.params_axes = [2, 3]
data = fluid.data(name="data", shape=[3, 3, 3, 3], dtype="float32") self.params_starts = [-3, -2]
axes = [2, 3] self.params_ends = [-1, 3]
starts = [-5, -2]
ends = [-1, 8]
slice_out = fluid.layers.slice(
data, axes=axes, starts=starts, ends=ends)
out = fluid.layers.batch_norm(slice_out, is_test=True)
self.feeds = {
"data": np.random.random((3, 3, 3, 3)).astype("float32"),
}
# Diff occurred between GPU and TRT.
# In order to provide TRT CI ASAP, this test for trt part
# is disabled temporarily.
self.enable_trt = True
self.trt_parameters = SlicePluginTRTTest3.TensorRTParam(
1 << 30, 32, 1, AnalysisConfig.Precision.Float32, False, False)
self.fetch_list = [out]
def test_check_output(self): #exceeded bound starts && ends
use_gpu = [False] class SlicePluginTRTTestStartsAndEndsBoundCheck(SlicePluginTRTTest):
if core.is_compiled_with_cuda(): def setUpSliceParams(self):
use_gpu.append(True) self.params_axes = [2, 3]
for i in range(len(use_gpu)): self.params_starts = [-5, -2]
self.check_output_with_option(use_gpu[i]) self.params_ends = [-1, 8]
#fp16 #fp16
class SlicePluginTRTTest4(InferencePassTest): class SlicePluginTRTTestFp16(SlicePluginTRTTest):
def setUp(self): def setUpTensorRTParams(self):
with fluid.program_guard(self.main_program, self.startup_program): self.trt_parameters = SlicePluginTRTTest.TensorRTParam(
data = fluid.data(name="data", shape=[3, 3, 3, 3], dtype="float32")
axes = [2, 3]
starts = [-5, -2]
ends = [-1, 8]
slice_out = fluid.layers.slice(
data, axes=axes, starts=starts, ends=ends)
out = fluid.layers.batch_norm(slice_out, is_test=True)
self.feeds = {
"data": np.random.random((3, 3, 3, 3)).astype("float32"),
}
# Diff occurred between GPU and TRT.
# In order to provide TRT CI ASAP, this test for trt part
# is disabled temporarily.
self.enable_trt = True
self.trt_parameters = SlicePluginTRTTest3.TensorRTParam(
1 << 30, 32, 1, AnalysisConfig.Precision.Half, False, False) 1 << 30, 32, 1, AnalysisConfig.Precision.Half, False, False)
self.fetch_list = [out] self.enable_trt = True
def test_check_output(self):
use_gpu = [False]
if core.is_compiled_with_cuda():
use_gpu.append(True)
for i in range(len(use_gpu)):
self.check_output_with_option(use_gpu[i])
if __name__ == "__main__": if __name__ == "__main__":
......
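After this refactor, adding a slice variant only requires overriding `setUpSliceParams` and/or `setUpTensorRTParams` on `SlicePluginTRTTest`. As an illustration (not part of this commit), the negative-index case could also be run under fp16 like this:

```python
# Hypothetical example (not part of this commit): reuse the refactored base
# class by overriding only the two setup hooks.
from paddle.fluid.core import AnalysisConfig


class SlicePluginTRTTestNegativeFp16(SlicePluginTRTTest):  # illustrative name
    def setUpSliceParams(self):
        # same negative starts/ends as SlicePluginTRTTestNegativeStartsAndEnds
        self.params_axes = [2, 3]
        self.params_starts = [-3, -2]
        self.params_ends = [-1, 3]

    def setUpTensorRTParams(self):
        # workspace, max_batch, min_subgraph, precision, use_static, use_calib
        self.trt_parameters = SlicePluginTRTTest.TensorRTParam(
            1 << 30, 32, 1, AnalysisConfig.Precision.Half, False, False)
        self.enable_trt = True
```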
...@@ -12,6 +12,8 @@ ...@@ -12,6 +12,8 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import os
import shutil
import unittest import unittest
import numpy as np import numpy as np
from inference_pass_test import InferencePassTest from inference_pass_test import InferencePassTest
...@@ -281,7 +283,13 @@ class TensorRTSubgraphPassValidPaddingPoolTest(InferencePassTest): ...@@ -281,7 +283,13 @@ class TensorRTSubgraphPassValidPaddingPoolTest(InferencePassTest):
class TensorRTSubgraphPassActivationTest(InferencePassTest): class TensorRTSubgraphPassActivationTest(InferencePassTest):
def setUpTensorRTParam(self):
self.enable_trt = True
self.trt_parameters = TensorRTSubgraphPassActivationTest.TensorRTParam(
1 << 30, 32, 0, AnalysisConfig.Precision.Float32, False, False)
def setUp(self): def setUp(self):
self.setUpTensorRTParam()
with fluid.program_guard(self.main_program, self.startup_program): with fluid.program_guard(self.main_program, self.startup_program):
data = fluid.data( data = fluid.data(
name="data", shape=[-1, 6, 64, 64], dtype="float32") name="data", shape=[-1, 6, 64, 64], dtype="float32")
...@@ -290,9 +298,6 @@ class TensorRTSubgraphPassActivationTest(InferencePassTest): ...@@ -290,9 +298,6 @@ class TensorRTSubgraphPassActivationTest(InferencePassTest):
self.feeds = { self.feeds = {
"data": np.random.random([1, 6, 64, 64]).astype("float32"), "data": np.random.random([1, 6, 64, 64]).astype("float32"),
} }
self.enable_trt = True
self.trt_parameters = TensorRTSubgraphPassActivationTest.TensorRTParam(
1 << 30, 32, 0, AnalysisConfig.Precision.Float32, False, False)
self.fetch_list = [out] self.fetch_list = [out]
def append_act(self, x): def append_act(self, x):
...@@ -301,6 +306,8 @@ class TensorRTSubgraphPassActivationTest(InferencePassTest): ...@@ -301,6 +306,8 @@ class TensorRTSubgraphPassActivationTest(InferencePassTest):
def test_check_output(self): def test_check_output(self):
if core.is_compiled_with_cuda(): if core.is_compiled_with_cuda():
use_gpu = True use_gpu = True
if os.path.exists(self.path + "_opt_cache"):
shutil.rmtree(self.path + "_opt_cache")
self.check_output_with_option(use_gpu) self.check_output_with_option(use_gpu)
self.assertTrue( self.assertTrue(
PassVersionChecker.IsCompatible('tensorrt_subgraph_pass')) PassVersionChecker.IsCompatible('tensorrt_subgraph_pass'))
...@@ -342,6 +349,37 @@ class TensorRTSubgraphPassTanhTest(TensorRTSubgraphPassActivationTest): ...@@ -342,6 +349,37 @@ class TensorRTSubgraphPassTanhTest(TensorRTSubgraphPassActivationTest):
class TensorRTSubgraphPassSwishTest(TensorRTSubgraphPassActivationTest): class TensorRTSubgraphPassSwishTest(TensorRTSubgraphPassActivationTest):
def setUpTensorRTParam(self):
self.enable_trt = True
self.trt_parameters = TensorRTSubgraphPassActivationTest.TensorRTParam(
1 << 30, 32, 0, AnalysisConfig.Precision.Float32, True, False)
def append_act(self, x):
return fluid.layers.swish(x)
class TensorRTSubgraphPassSwishFp16SerializeTest(
TensorRTSubgraphPassActivationTest):
def setUpTensorRTParam(self):
self.enable_trt = True
self.trt_parameters = TensorRTSubgraphPassActivationTest.TensorRTParam(
1 << 30, 32, 0, AnalysisConfig.Precision.Half, True, False)
def append_act(self, x):
return fluid.layers.swish(x)
class TensorRTSubgraphPassDynamicSwishFp16SerializeTest(
TensorRTSubgraphPassActivationTest):
def setUpTensorRTParam(self):
self.enable_trt = True
self.trt_parameters = TensorRTSubgraphPassActivationTest.TensorRTParam(
1 << 30, 32, 0, AnalysisConfig.Precision.Half, True, False)
self.dynamic_shape_params = TensorRTSubgraphPassActivationTest.DynamicShapeParam(
{
'data': [1, 6, 8, 8]
}, {'data': [1, 6, 512, 512]}, {'data': [1, 6, 256, 256]}, False)
def append_act(self, x): def append_act(self, x):
return fluid.layers.swish(x) return fluid.layers.swish(x)
...@@ -366,6 +404,71 @@ class TensorRTSubgraphPassGeluTest(TensorRTSubgraphPassActivationTest): ...@@ -366,6 +404,71 @@ class TensorRTSubgraphPassGeluTest(TensorRTSubgraphPassActivationTest):
return fluid.layers.gelu(x) return fluid.layers.gelu(x)
class TensorRTSubgraphPassGeluDynamicTest(TensorRTSubgraphPassActivationTest):
def setUpTensorRTParam(self):
self.enable_trt = True
self.trt_parameters = TensorRTSubgraphPassActivationTest.TensorRTParam(
1 << 30, 32, 0, AnalysisConfig.Precision.Float32, False, False)
self.dynamic_shape_params = TensorRTSubgraphPassActivationTest.DynamicShapeParam(
{
'data': [1, 6, 8, 8]
}, {'data': [1, 6, 512, 512]}, {'data': [1, 6, 256, 256]}, False)
def append_act(self, x):
return fluid.layers.gelu(x)
class TensorRTSubgraphPassGeluFp16Test(TensorRTSubgraphPassActivationTest):
def setUpTensorRTParam(self):
self.enable_trt = True
self.trt_parameters = TensorRTSubgraphPassActivationTest.TensorRTParam(
1 << 30, 32, 0, AnalysisConfig.Precision.Half, False, False)
def append_act(self, x):
return fluid.layers.gelu(x)
class TensorRTSubgraphPassGeluFp16SerializeTest(
TensorRTSubgraphPassActivationTest):
def setUpTensorRTParam(self):
self.enable_trt = True
self.trt_parameters = TensorRTSubgraphPassActivationTest.TensorRTParam(
1 << 30, 32, 0, AnalysisConfig.Precision.Half, True, False)
def append_act(self, x):
return fluid.layers.gelu(x)
class TensorRTSubgraphPassGeluFp16DynamicTest(
TensorRTSubgraphPassActivationTest):
def setUpTensorRTParam(self):
self.enable_trt = True
self.trt_parameters = TensorRTSubgraphPassActivationTest.TensorRTParam(
1 << 30, 32, 0, AnalysisConfig.Precision.Half, False, False)
self.dynamic_shape_params = TensorRTSubgraphPassActivationTest.DynamicShapeParam(
{
'data': [1, 6, 8, 8]
}, {'data': [1, 6, 512, 512]}, {'data': [1, 6, 256, 256]}, False)
def append_act(self, x):
return fluid.layers.gelu(x)
class TensorRTSubgraphPassGeluFp16DynamicSerializeTest(
TensorRTSubgraphPassActivationTest):
def setUpTensorRTParam(self):
self.enable_trt = True
self.trt_parameters = TensorRTSubgraphPassActivationTest.TensorRTParam(
1 << 30, 32, 0, AnalysisConfig.Precision.Half, True, False)
self.dynamic_shape_params = TensorRTSubgraphPassActivationTest.DynamicShapeParam(
{
'data': [1, 6, 8, 8]
}, {'data': [1, 6, 512, 512]}, {'data': [1, 6, 256, 256]}, False)
def append_act(self, x):
return fluid.layers.gelu(x)
class TensorRTSubgraphPassConcatTest(InferencePassTest): class TensorRTSubgraphPassConcatTest(InferencePassTest):
def setUp(self): def setUp(self):
with fluid.program_guard(self.main_program, self.startup_program): with fluid.program_guard(self.main_program, self.startup_program):
...@@ -415,6 +518,60 @@ class TensorRTSubgraphPassSplitTest(InferencePassTest): ...@@ -415,6 +518,60 @@ class TensorRTSubgraphPassSplitTest(InferencePassTest):
PassVersionChecker.IsCompatible('tensorrt_subgraph_pass')) PassVersionChecker.IsCompatible('tensorrt_subgraph_pass'))
class TensorRTSubgraphPassSplitSerializeTest(InferencePassTest):
def setUp(self):
with fluid.program_guard(self.main_program, self.startup_program):
data = fluid.data(
name="data", shape=[-1, 3, 64, 64], dtype="float32")
split_out = fluid.layers.split(data, dim=-1, num_or_sections=2)
out = fluid.layers.batch_norm(split_out[0], is_test=True)
self.feeds = {
"data": np.random.random([1, 3, 64, 64]).astype("float32"),
}
self.enable_trt = True
self.trt_parameters = TensorRTSubgraphPassSplitTest.TensorRTParam(
1 << 30, 32, 0, AnalysisConfig.Precision.Float32, True, False)
self.fetch_list = [out]
def test_check_output(self):
if core.is_compiled_with_cuda():
use_gpu = True
if os.path.exists(self.path + "_opt_cache"):
shutil.rmtree(self.path + "_opt_cache")
self.check_output_with_option(use_gpu)
self.assertTrue(
PassVersionChecker.IsCompatible('tensorrt_subgraph_pass'))
class TensorRTSubgraphPassDynamicSplitFp16SerializeTest(InferencePassTest):
def setUp(self):
with fluid.program_guard(self.main_program, self.startup_program):
data = fluid.data(
name="data", shape=[-1, 3, 64, 64], dtype="float32")
split_out = fluid.layers.split(data, dim=-1, num_or_sections=2)
out = fluid.layers.batch_norm(split_out[0], is_test=True)
self.feeds = {
"data": np.random.random([1, 3, 64, 64]).astype("float32"),
}
self.enable_trt = True
self.trt_parameters = TensorRTSubgraphPassSplitTest.TensorRTParam(
1 << 30, 32, 0, AnalysisConfig.Precision.Half, True, False)
self.dynamic_shape_params = TensorRTSubgraphPassActivationTest.DynamicShapeParam(
{
'data': [1, 3, 8, 64]
}, {'data': [1, 3, 512, 64]}, {'data': [1, 3, 256, 64]}, False)
self.fetch_list = [out]
def test_check_output(self):
if core.is_compiled_with_cuda():
use_gpu = True
if os.path.exists(self.path + "_opt_cache"):
shutil.rmtree(self.path + "_opt_cache")
self.check_output_with_option(use_gpu)
self.assertTrue(
PassVersionChecker.IsCompatible('tensorrt_subgraph_pass'))
class TensorRTSubgraphPassInstanceNormTest(InferencePassTest): class TensorRTSubgraphPassInstanceNormTest(InferencePassTest):
def setUp(self): def setUp(self):
with fluid.program_guard(self.main_program, self.startup_program): with fluid.program_guard(self.main_program, self.startup_program):
......
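Outside the unit-test harness, the same runtime behaviour is driven through `AnalysisConfig`. Below is a minimal sketch (not part of this commit) of the inference-side configuration these tests exercise, assuming a placeholder model directory; with `disable_trt_plugin_fp16` left `False`, the TensorRT plugins detect at runtime whether the GPU architecture supports fp16.

```python
# Minimal sketch (not part of this commit) of the inference-side configuration
# exercised by the tests above. The model directory is a placeholder.
from paddle.fluid.core import AnalysisConfig, create_paddle_predictor

config = AnalysisConfig("./example_model_dir")  # placeholder path
config.enable_use_gpu(100, 0)  # 100 MB initial GPU memory pool, device 0
# workspace, max_batch, min_subgraph, precision, use_static, use_calib
config.enable_tensorrt_engine(1 << 30, 32, 1,
                              AnalysisConfig.Precision.Half, False, False)
# min / max / optim shapes per input; disable_trt_plugin_fp16=False lets the
# plugins decide fp16 support at runtime instead of disabling it up front.
config.set_trt_dynamic_shape_info(
    {'data': [1, 6, 8, 8]}, {'data': [1, 6, 512, 512]},
    {'data': [1, 6, 256, 256]}, False)
predictor = create_paddle_predictor(config)
```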