diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake index 51ab57141026c44986ecf40f888c580e15dfdce2..cb69a8067ad7ed235de589034d47826d82d59aa4 100644 --- a/cmake/cuda.cmake +++ b/cmake/cuda.cmake @@ -100,19 +100,10 @@ function(select_nvcc_arch_flags out_variable) elseif(${CUDA_ARCH_NAME} STREQUAL "Maxwell") set(cuda_arch_bin "50") elseif(${CUDA_ARCH_NAME} STREQUAL "Pascal") - if (NOT ${CMAKE_CUDA_COMPILER_VERSION} LESS 10.0) - add_definitions("-DSUPPORTS_CUDA_FP16") - endif() set(cuda_arch_bin "60 61") elseif(${CUDA_ARCH_NAME} STREQUAL "Volta") - if (NOT ${CUDA_VERSION} LESS 10.0) - add_definitions("-DSUPPORTS_CUDA_FP16") - endif() set(cuda_arch_bin "70") elseif(${CUDA_ARCH_NAME} STREQUAL "Turing") - if (NOT ${CUDA_VERSION} LESS 10.0) - add_definitions("-DSUPPORTS_CUDA_FP16") - endif() set(cuda_arch_bin "75") elseif(${CUDA_ARCH_NAME} STREQUAL "All") set(cuda_arch_bin ${paddle_known_gpu_archs}) @@ -188,6 +179,10 @@ elseif (${CUDA_VERSION} LESS 11.0) # CUDA 10.x endif() add_definitions("-DPADDLE_CUDA_BINVER=\"${CUDA_VERSION_MAJOR}${CUDA_VERSION_MINOR}\"") +if (NOT ${CUDA_VERSION} LESS 10.0) + add_definitions("-DTRT_PLUGIN_FP16_AVALIABLE") +endif() + include_directories(${CUDA_INCLUDE_DIRS}) if(NOT WITH_DSO) if(WIN32) diff --git a/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc b/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc index 05ff62e8f7e7aa4fbad8aa59ac304e93a9c06f91..ed28864234249512dc7e47402811f620930010d3 100644 --- a/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc +++ b/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc @@ -152,12 +152,13 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter { {output_name, std::string("qkv_plugin_mask")}, test_mode); } else { - bool use_fp16 = engine_->WithFp16(); + bool with_fp16 = + engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); float eps = boost::get(op_desc.GetAttr("epsilon")); plugin::DynamicPluginTensorRT* plugin = nullptr; plugin = new plugin::EmbEltwiseLayernormPluginDynamic( input_embs, bias, scale, emb_sizes, bias_size, scale_size, hidden, - eps, use_fp16); + eps, with_fp16); layer = engine_->AddPluginV2(input_ids.data(), input_num, plugin); auto output_name = op_desc.Output("Out")[0]; RreplenishLayerAndOutput(layer, "emb_eltwise_layernorm", {output_name}, diff --git a/paddle/fluid/inference/tensorrt/convert/gelu_op.cc b/paddle/fluid/inference/tensorrt/convert/gelu_op.cc index 7927b6cd1bb559b1c4ce81d2846942a0db31e5fc..cc94ae39ab911bc8f869bdbcfa3c2816369a0279 100644 --- a/paddle/fluid/inference/tensorrt/convert/gelu_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/gelu_op.cc @@ -47,7 +47,10 @@ class GeluOpConverter : public OpConverter { nvinfer1::ILayer* layer = nullptr; if (engine_->with_dynamic_shape()) { #if IS_TRT_VERSION_GE(6000) - plugin::GeluPluginDynamic* plugin = new plugin::GeluPluginDynamic(); + bool with_fp16 = + engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); + plugin::GeluPluginDynamic* plugin = + new plugin::GeluPluginDynamic(with_fp16); layer = engine_->AddPluginV2(&input, input_num, plugin); #else PADDLE_THROW(platform::errors::Fatal( @@ -55,7 +58,9 @@ class GeluOpConverter : public OpConverter { "your TRT version is no less than 6.0")); #endif } else { - plugin::GeluPlugin* plugin = new plugin::GeluPlugin(); + bool with_fp16 = + engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); + plugin::GeluPlugin* plugin = new plugin::GeluPlugin(with_fp16); layer = engine_->AddPlugin(&input, input_num, plugin); } auto output_name = 
op_desc.Output("Out")[0]; diff --git a/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc b/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc index 820be425e844901130f2ada92c3e47845fc7afbd..0beced7363c573564c05f4d3b9ae817e3a9ffd7a 100644 --- a/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc @@ -191,10 +191,11 @@ class MultiheadMatMulOpConverter : public OpConverter { std::vector plugin_inputs; plugin_inputs.push_back(fc_out); plugin_inputs.push_back(input_bias_qk); - bool ban_fp16 = engine_->disable_trt_plugin_fp16(); + bool with_fp16 = + engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); plugin::DynamicPluginTensorRT* plugin = new plugin::QkvToContextPluginDynamic(hidden, head_number, - head_size, scale, ban_fp16); + head_size, scale, with_fp16); layer = engine_->AddPluginV2(plugin_inputs.data(), 2, plugin); } } else { diff --git a/paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc b/paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc index 802e979045c884afcc2ec56067679b79992d804e..9419fafbe8267c3602c0d08f9b838882e905e8b0 100644 --- a/paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc +++ b/paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc @@ -85,10 +85,11 @@ class SkipLayerNormOpConverter : public OpConverter { layer = plugin_layer; } else { float eps = boost::get(op_desc.GetAttr("epsilon")); - bool ban_fp16 = engine_->disable_trt_plugin_fp16(); + bool with_fp16 = + engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); plugin::SkipLayerNormPluginDynamic* plugin = new plugin::SkipLayerNormPluginDynamic(bias, scale, bias_size, - scale_size, eps, ban_fp16); + scale_size, eps, with_fp16); layer = engine_->AddPluginV2(inputs.data(), 2, plugin); } } else { diff --git a/paddle/fluid/inference/tensorrt/convert/slice_op.cc b/paddle/fluid/inference/tensorrt/convert/slice_op.cc index e922f24fe3a02655076bec9bfecba988259a6801..3fb5644701b4799b84770bd2d73945ca480c581d 100644 --- a/paddle/fluid/inference/tensorrt/convert/slice_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/slice_op.cc @@ -93,9 +93,10 @@ class SliceOpConverter : public OpConverter { layer = engine_->AddPluginV2(plugin_inputs.data(), plugin_inputs.size(), plugin); } else { - bool ban_fp16 = engine_->disable_trt_plugin_fp16(); + bool with_fp16 = + engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); plugin::SlicePluginDynamic* plugin = - new plugin::SlicePluginDynamic(starts, ends, axes, ban_fp16); + new plugin::SlicePluginDynamic(starts, ends, axes, with_fp16); layer = engine_->AddPluginV2(&input, 1, plugin); } #else @@ -104,9 +105,10 @@ class SliceOpConverter : public OpConverter { "your TRT version is no less than 6.0")); #endif } else { - bool ban_fp16 = engine_->disable_trt_plugin_fp16(); + bool with_fp16 = + engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); plugin::SlicePlugin* plugin = - new plugin::SlicePlugin(starts, ends, axes, ban_fp16); + new plugin::SlicePlugin(starts, ends, axes, with_fp16); layer = engine_->AddPlugin(&input, 1, plugin); } diff --git a/paddle/fluid/inference/tensorrt/convert/softmax_op.cc b/paddle/fluid/inference/tensorrt/convert/softmax_op.cc index 9f4a048961f8d285f9280161e3ea3e3630b63556..05c9c0ec5da9a80a0afee1780daa23f78dde4e9d 100644 --- a/paddle/fluid/inference/tensorrt/convert/softmax_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/softmax_op.cc @@ -12,8 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 
implied. See the License for the specific language governing permissions and limitations under the License. */ +#include #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +namespace paddle { +namespace framework { +class Scope; +namespace proto { +class OpDesc; +} // namespace proto +} // namespace framework +} // namespace paddle + namespace paddle { namespace inference { namespace tensorrt { @@ -30,9 +40,40 @@ class SoftMaxOpConverter : public OpConverter { framework::OpDesc op_desc(op, nullptr); // Declare inputs auto* input1 = engine_->GetITensor(op_desc.Input("X")[0]); + nvinfer1::Dims input_shape = input1->getDimensions(); + int input_dims = input_shape.nbDims; + int axis = + op_desc.HasAttr("axis") ? boost::get(op_desc.GetAttr("axis")) : -1; + auto* layer = TRT_ENGINE_ADD_LAYER(engine_, SoftMax, *const_cast(input1)); - + uint32_t axes = std::max(0, input_dims - 3); + // TODO(cryoco): Poor workaround. Fix padded dims problem when TRT layers + // support Nd. + int padded_dims = 0; + int explicit_batch = 0; + if (engine_->with_dynamic_shape()) explicit_batch = 1; + for (int i = input_dims - 1; i > explicit_batch; i--) { + if (input_shape.d[i] == 1) { + padded_dims += 1; + } else { + break; + } + } + if (!engine_->with_dynamic_shape()) { + if (axis == -1) { + axes = input_dims - 1 - padded_dims; + } else { + axes = axis; + } + } else { + if (axis == -1) { + axes = input_dims - 1 - padded_dims; + } else { + axes = axis + 1; + } + } + layer->setAxes(1 << axes); auto output_name = op_desc.Output("Out")[0]; RreplenishLayerAndOutput(layer, "softmax", {output_name}, test_mode); diff --git a/paddle/fluid/inference/tensorrt/convert/split_op.cc b/paddle/fluid/inference/tensorrt/convert/split_op.cc index 90136c7d5db697b577ffa138ed7ff90b19ca9997..b08eef12aa84de11de2dad83b4387c9e5a203685 100644 --- a/paddle/fluid/inference/tensorrt/convert/split_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/split_op.cc @@ -86,8 +86,10 @@ class SplitOpConverter : public OpConverter { nvinfer1::ILayer* layer = nullptr; if (engine_->with_dynamic_shape()) { #if IS_TRT_VERSION_GE(6000) + bool with_fp16 = + engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); plugin::SplitPluginDynamic* plugin = - new plugin::SplitPluginDynamic(axis, output_lengths); + new plugin::SplitPluginDynamic(axis, output_lengths, with_fp16); layer = engine_->AddPluginV2(&input, input_num, plugin); #else PADDLE_THROW(platform::errors::Fatal( @@ -95,8 +97,10 @@ class SplitOpConverter : public OpConverter { "your TRT version is no less than 6.0")); #endif } else { + bool with_fp16 = + engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); plugin::SplitPlugin* plugin = - new plugin::SplitPlugin(axis, output_lengths); + new plugin::SplitPlugin(axis, output_lengths, with_fp16); layer = engine_->AddPlugin(&input, input_num, plugin); } diff --git a/paddle/fluid/inference/tensorrt/convert/stack_op.cc b/paddle/fluid/inference/tensorrt/convert/stack_op.cc index d8d055f0f1d6f323087dafa6baecb22f70c23ee9..e706d20067e87852591b59bb19bcf2aac89ee65e 100644 --- a/paddle/fluid/inference/tensorrt/convert/stack_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/stack_op.cc @@ -46,8 +46,10 @@ class StackOpConverter : public OpConverter { nvinfer1::ILayer* layer = nullptr; if (engine_->with_dynamic_shape()) { #if IS_TRT_VERSION_GE(6000) + bool with_fp16 = + engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); plugin::StackPluginDynamic* plugin = - new plugin::StackPluginDynamic(axis, input_num); + new 
plugin::StackPluginDynamic(axis, input_num, with_fp16); layer = engine_->AddPluginV2(inputs, input_num, plugin); assert(layer != nullptr); #else diff --git a/paddle/fluid/inference/tensorrt/convert/swish_op.cc b/paddle/fluid/inference/tensorrt/convert/swish_op.cc index e69676a7877f2daf0342114f655d207a21b069f8..4bc00cc36b6186c4666707e0d4118ed7ea88411e 100644 --- a/paddle/fluid/inference/tensorrt/convert/swish_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/swish_op.cc @@ -39,7 +39,10 @@ class SwishOpConverter : public OpConverter { nvinfer1::ILayer* layer = nullptr; if (engine_->with_dynamic_shape()) { #if IS_TRT_VERSION_GE(6000) - plugin::SwishPluginDynamic* plugin = new plugin::SwishPluginDynamic(beta); + bool with_fp16 = + engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); + plugin::SwishPluginDynamic* plugin = + new plugin::SwishPluginDynamic(beta, with_fp16); layer = engine_->AddPluginV2(&input, input_num, plugin); #else PADDLE_THROW(platform::errors::Fatal( @@ -47,7 +50,9 @@ class SwishOpConverter : public OpConverter { "your TRT version is no less than 6.0")); #endif } else { - plugin::SwishPlugin* plugin = new plugin::SwishPlugin(beta); + bool with_fp16 = + engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); + plugin::SwishPlugin* plugin = new plugin::SwishPlugin(beta, with_fp16); layer = engine_->AddPlugin(&input, input_num, plugin); } diff --git a/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.cu index 873631fea614cc18cdc2b2b2f27d2480aa71d50b..30667514ac83a466fb7c131e66286617a62a778e 100644 --- a/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.cu @@ -160,9 +160,9 @@ int EmbEltwiseLayernormPluginDynamicImpl::enqueue( } template class EmbEltwiseLayernormPluginDynamicImpl; -#ifdef SUPPORTS_CUDA_FP16 +#ifdef TRT_PLUGIN_FP16_AVALIABLE template class EmbEltwiseLayernormPluginDynamicImpl; -#endif // SUPPORTS_CUDA_FP16 +#endif int EmbEltwiseLayernormPluginDynamic::initialize() { impl_->initialize(); diff --git a/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.h b/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.h index 24ca853104e35c26a2f9add57fd2f8bc025646c2..fcba85daf9fa97f0426b847c393423c17ddc0478 100644 --- a/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.h @@ -8,7 +8,7 @@ // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // // See the License for the specific language governing permissions and // limitations under the License. @@ -105,18 +105,24 @@ class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT { scale_size_(scale_size), hidden_size_(hidden_size), eps_(eps), - with_fp16_(with_fp16), own_host_buff_(false) { - if (with_fp16) { -#ifdef SUPPORTS_CUDA_FP16 + with_fp16_ = with_fp16; + if (with_fp16_) { +#ifdef TRT_PLUGIN_FP16_AVALIABLE + VLOG(1) << "TRT Plugin DataType selected. 
EmbEltwiseLayerNorm-->fp16"; impl_ = new EmbEltwiseLayernormPluginDynamicImpl( embs_, bias_, scale_, emb_sizes_, bias_size_, scale_size_, hidden_size_, eps_); #else PADDLE_THROW(platform::errors::Fatal( - "Unsupported data type, current GPU doesn't support half.")); -#endif // SUPPORTS_CUDA_FP16 + "The Ernie(Bert) tensorRT plugin should be " + "compiled with CUDA version >= 10.0 when running with fp16. " + "Please recompile it or try to use fp32 by setting " + "config.EnableTensorRtEngine(1 << 30, 1, 5, " + "AnalysisConfig::Precision::kFloat32, false, false) ")); +#endif } else { + VLOG(1) << "TRT Plugin DataType selected. EmbEltwiseLayerNorm-->fp32"; impl_ = new EmbEltwiseLayernormPluginDynamicImpl( embs_, bias_, scale_, emb_sizes_, bias_size_, scale_size_, hidden_size_, eps_); @@ -160,14 +166,18 @@ class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT { DeserializeValue(&serial_data, &serial_length, &with_fp16_); if (with_fp16_) { -#ifdef SUPPORTS_CUDA_FP16 +#ifdef TRT_PLUGIN_FP16_AVALIABLE impl_ = new EmbEltwiseLayernormPluginDynamicImpl( embs_, bias_, scale_, emb_sizes_, bias_size_, scale_size_, hidden_size_, eps_); #else PADDLE_THROW(platform::errors::Fatal( - "Unsupported data type, current GPU doesn't support half.")); -#endif // SUPPORTS_CUDA_FP16 + "The Ernie(Bert) tensorRT plugin should be " + "compiled with CUDA version >= 10.0 when running with fp16. " + "Please recompile it or try to use fp32 by setting " + "config.EnableTensorRtEngine(1 << 30, 1, 5, " + "AnalysisConfig::Precision::kFloat32, false, false) ")); +#endif } else { impl_ = new EmbEltwiseLayernormPluginDynamicImpl( embs_, bias_, scale_, emb_sizes_, bias_size_, scale_size_, @@ -283,7 +293,6 @@ class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT { int hidden_size_; float eps_; - bool with_fp16_; bool own_host_buff_{false}; EmbEltwiseLayernormPluginDynamicImplBase* impl_{nullptr}; }; diff --git a/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.cu index 854ca515db5f5904abb47378fcbec1ea3b391fb9..abe54dd1e57489ed6ac033056d4284608c7da6f4 100644 --- a/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.cu @@ -17,6 +17,7 @@ #include #include "paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.h" #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h" +#include "paddle/fluid/platform/float16.h" namespace paddle { namespace inference { @@ -38,14 +39,14 @@ REGISTER_TRT_PLUGIN("gelu_plugin", CreateGeluPluginDeserialize); bool GeluPlugin::supportsFormat(nvinfer1::DataType type, nvinfer1::PluginFormat format) const { -#ifdef SUPPORTS_CUDA_FP16 - return ((type == nvinfer1::DataType::kFLOAT || - type == nvinfer1::DataType::kHALF) && - (format == nvinfer1::PluginFormat::kNCHW)); -#else - return ((type == nvinfer1::DataType::kFLOAT) && - (format == nvinfer1::PluginFormat::kNCHW)); -#endif + if (with_fp16_) { + return ((type == nvinfer1::DataType::kFLOAT || + type == nvinfer1::DataType::kHALF) && + (format == nvinfer1::PluginFormat::kNCHW)); + } else { + return ((type == nvinfer1::DataType::kFLOAT) && + (format == nvinfer1::PluginFormat::kNCHW)); + } } nvinfer1::Dims GeluPlugin::getOutputDimensions(int index, @@ -87,6 +88,7 @@ __device__ half do_tanh(half a) { template __global__ void no_exact_gelu_kernel(const T a, const T b, const T c, int n, const T* input, T* output) { +#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__) const int idx = blockIdx.x * TPB + threadIdx.x;
if (idx < n) { const T in = input[idx]; @@ -94,6 +96,7 @@ __global__ void no_exact_gelu_kernel(const T a, const T b, const T c, int n, const T cdf = a + a * do_tanh(tmp); output[idx] = in * cdf; } +#endif } int GeluPlugin::enqueue(int batch_size, const void* const* inputs, @@ -108,21 +111,18 @@ int GeluPlugin::enqueue(int batch_size, const void* const* inputs, auto type = getDataType(); if (type == nvinfer1::DataType::kFLOAT) { + VLOG(1) << "TRT Plugin DataType selected. Gelu-->fp32"; const float* input = static_cast(inputs[0]); float* output = static_cast(outputs[0]); gelu_kernel<<>>( kA, num, input, output); } else if (type == nvinfer1::DataType::kHALF) { -#ifdef SUPPORTS_CUDA_FP16 + VLOG(1) << "TRT Plugin DataType selected. Gelu-->fp16"; const half* input = static_cast(inputs[0]); half* output = static_cast(outputs[0]); no_exact_gelu_kernel<<>>( kAT, kBT, kCT, num, input, output); -#else - PADDLE_THROW(platform::errors::Fatal( - "The cuda archs you specific should greater than 600.")); -#endif } else { PADDLE_THROW(platform::errors::InvalidArgument( "The Gelu TRT Plugin's input type should be float or half.")); @@ -155,14 +155,14 @@ bool GeluPluginDynamic::supportsFormatCombination( const nvinfer1::PluginTensorDesc& in = in_out[pos]; if (pos == 0) { -#ifdef SUPPORTS_CUDA_FP16 - return (in.type == nvinfer1::DataType::kFLOAT || - in.type == nvinfer1::DataType::kHALF) && - (in.format == nvinfer1::TensorFormat::kLINEAR); -#else - return (in.type == nvinfer1::DataType::kFLOAT) && - (in.format == nvinfer1::TensorFormat::kLINEAR); -#endif + if (with_fp16_) { + return (in.type == nvinfer1::DataType::kFLOAT || + in.type == nvinfer1::DataType::kHALF) && + (in.format == nvinfer1::TensorFormat::kLINEAR); + } else { + return (in.type == nvinfer1::DataType::kFLOAT) && + (in.format == nvinfer1::TensorFormat::kLINEAR); + } } const nvinfer1::PluginTensorDesc& prev = in_out[pos - 1]; // output @@ -189,21 +189,18 @@ int GeluPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* input_desc, auto input_type = input_desc[0].type; if (input_type == nvinfer1::DataType::kFLOAT) { + VLOG(1) << "TRT Plugin DataType selected. Gelu-->fp32"; const float* input = static_cast(inputs[0]); float* output = static_cast(outputs[0]); gelu_kernel<<>>( kA, num, input, output); } else if (input_type == nvinfer1::DataType::kHALF) { -#ifdef SUPPORTS_CUDA_FP16 + VLOG(1) << "TRT Plugin DataType selected. Gelu-->fp16"; const half* input = static_cast(inputs[0]); half* output = static_cast(outputs[0]); no_exact_gelu_kernel<<>>( kAT, kBT, kCT, num, input, output); -#else - PADDLE_THROW(platform::errors::Fatal( - "The cuda archs you specific should greater than 600.")); -#endif } else { PADDLE_THROW(platform::errors::InvalidArgument( "The Gelu TRT Plugin's input type should be float or half.")); diff --git a/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.h index 02219bc27a763569484db0ba06259abd001e514d..979f600a3a9cea0ab5bc35fc0c2882cf34c82c98 100644 --- a/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.h @@ -26,7 +26,7 @@ namespace plugin { class GeluPlugin : public PluginTensorRT { public: - GeluPlugin() {} + explicit GeluPlugin(const bool with_fp16) { with_fp16_ = with_fp16; } // It was used for tensorrt deserialization. // It should not be called by users. 
@@ -35,7 +35,7 @@ class GeluPlugin : public PluginTensorRT { } ~GeluPlugin() {} - GeluPlugin* clone() const override { return new GeluPlugin(); } + GeluPlugin* clone() const override { return new GeluPlugin(with_fp16_); } const char* getPluginType() const override { return "gelu_plugin"; } int getNbOutputs() const override { return 1; } @@ -63,20 +63,26 @@ class GeluPlugin : public PluginTensorRT { #if IS_TRT_VERSION_GE(6000) class GeluPluginDynamic : public DynamicPluginTensorRT { public: - GeluPluginDynamic() {} - GeluPluginDynamic(void const* serial_data, size_t serial_length) {} + explicit GeluPluginDynamic(const bool with_fp16) { with_fp16_ = with_fp16; } + GeluPluginDynamic(void const* serial_data, size_t serial_length) { + DeserializeValue(&serial_data, &serial_length, &with_fp16_); + } ~GeluPluginDynamic() {} nvinfer1::IPluginV2DynamicExt* clone() const override { - return new GeluPluginDynamic(); + return new GeluPluginDynamic(with_fp16_); } const char* getPluginType() const override { return "gelu_plugin"; } int getNbOutputs() const override { return 1; } int initialize() override { return 0; } - size_t getSerializationSize() const override { return 0; } - void serialize(void* buffer) const override {} + size_t getSerializationSize() const override { + return SerializedSize(with_fp16_); + } + void serialize(void* buffer) const override { + SerializeValue(&buffer, with_fp16_); + } nvinfer1::DimsExprs getOutputDimensions( int output_index, const nvinfer1::DimsExprs* inputs, int nb_inputs, diff --git a/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.cu index 7c905a245a58c0f40c477f86201774e158455947..8af036a0e86709336b0ef8b3310442cb7374bfbc 100644 --- a/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.cu @@ -50,10 +50,18 @@ int LayerNormPlugin::enqueue(int batch_size, const void *const *inputs, float *output = reinterpret_cast(outputs)[0]; int begin_norm_axis = begin_norm_axis_; float eps = eps_; - int c = input_dims.d[begin_norm_axis - 1]; - scale_t.Resize(framework::make_ddim({c})); - bias_t.Resize(framework::make_ddim({c})); + std::vector input_shape; + input_shape.push_back(batch_size); + for (int i = 0; i < input_dims.nbDims; i++) { + input_shape.push_back(input_dims.d[i]); + } + const auto input_ddim = framework::make_ddim(input_shape); + auto matrix_dim = framework::flatten_to_2d(input_ddim, begin_norm_axis - 1); + int feature_size = static_cast(matrix_dim[1]); + + scale_t.Resize(framework::make_ddim({feature_size})); + bias_t.Resize(framework::make_ddim({feature_size})); mean_t.Resize(framework::make_ddim(mean_shape_)); variance_t.Resize(framework::make_ddim(variance_shape_)); int device_id; @@ -63,15 +71,11 @@ int LayerNormPlugin::enqueue(int batch_size, const void *const *inputs, float *mean_d = mean_t.mutable_data(platform::CUDAPlace(device_id)); float *variance_d = variance_t.mutable_data(platform::CUDAPlace(device_id)); - cudaMemcpyAsync(scale_d, scale_.data(), sizeof(float) * c, + cudaMemcpyAsync(scale_d, scale_.data(), sizeof(float) * feature_size, cudaMemcpyHostToDevice, stream); - cudaMemcpyAsync(bias_d, bias_.data(), sizeof(float) * c, + cudaMemcpyAsync(bias_d, bias_.data(), sizeof(float) * feature_size, cudaMemcpyHostToDevice, stream); - std::vector input_shape; - input_shape.push_back(batch_size); - for (int i = 0; i < input_dims.nbDims; i++) { - input_shape.push_back(input_dims.d[i]); - } + 
paddle::operators::LayerNormDirectCUDAFunctor layer_norm; layer_norm(stream, input, input_shape, bias_d, scale_d, output, mean_d, variance_d, begin_norm_axis, eps); diff --git a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu index 5014a7db981e932b0d606ea15761fdc460550a11..f13d2fedac166b576f96a5a0ae0add4851b7739a 100644 --- a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu @@ -109,7 +109,6 @@ inline void TransposeQKV(const int batch, const int seq_len, } } -#ifdef SUPPORTS_CUDA_FP16 inline void TransposeQKV(const int batch, const int seq_len, const int head_size, const int head_num, const half *input, half *output, cudaStream_t stream) { @@ -137,7 +136,6 @@ inline void TransposeQKV(const int batch, const int seq_len, output); } } -#endif int QkvToContextPluginDynamic::initialize() { return 0; } @@ -184,19 +182,19 @@ bool QkvToContextPluginDynamic::supportsFormatCombination( const nvinfer1::PluginTensorDesc &in = in_out[pos]; if (pos == 0) { -#ifdef SUPPORTS_CUDA_FP16 - if (ban_fp16_) { - return (in.type == nvinfer1::DataType::kFLOAT) && - (in.format == nvinfer1::TensorFormat::kLINEAR); - } else { + if (with_fp16_) { +#ifdef TRT_PLUGIN_FP16_AVALIABLE return (in.type == nvinfer1::DataType::kFLOAT || in.type == nvinfer1::DataType::kHALF) && (in.format == nvinfer1::TensorFormat::kLINEAR); - } #else - return (in.type == nvinfer1::DataType::kFLOAT) && - (in.format == nvinfer1::TensorFormat::kLINEAR); + return (in.type == nvinfer1::DataType::kFLOAT) && + (in.format == nvinfer1::TensorFormat::kLINEAR); #endif + } else { + return (in.type == nvinfer1::DataType::kFLOAT) && + (in.format == nvinfer1::TensorFormat::kLINEAR); + } } const nvinfer1::PluginTensorDesc &prev = in_out[pos - 1]; @@ -236,6 +234,7 @@ int QkvToContextPluginDynamic::enqueue( auto input_type = input_desc[0].type; if (input_type == nvinfer1::DataType::kFLOAT) { + VLOG(1) << "TRT Plugin DataType selected. QkvToContext-->fp32"; auto *multihead_temp_data = multihead_temp_tensor.mutable_data( platform::CUDAPlace(device_id)); auto *qkptr = multihead_temp_data; @@ -264,7 +263,8 @@ int QkvToContextPluginDynamic::enqueue( head_number_, head_size_); } else if (input_type == nvinfer1::DataType::kHALF) { -#ifdef SUPPORTS_CUDA_FP16 +#ifdef TRT_PLUGIN_FP16_AVALIABLE + VLOG(1) << "TRT Plugin DataType selected. QkvToContext-->fp16"; auto *multihead_temp_data = multihead_temp_tensor.mutable_data( // NOLINT platform::CUDAPlace(device_id)); @@ -294,7 +294,11 @@ int QkvToContextPluginDynamic::enqueue( head_number_, head_size_); #else PADDLE_THROW(platform::errors::Fatal( - "The cuda archs you specific should greater than 600.")); + "The Ernie(Bert) TensorRT Plugin should be " + "compiled with CUDA version >= 10.0 when running with fp16. 
" + "Please recomplie it or try to use fp32 by set " + "config.SetTRTDynamicShapeInfo(min_input_shape, " + "max_input_shape, opt_input_shape, true")); #endif } else { PADDLE_THROW(platform::errors::Fatal( diff --git a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.h b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.h index 72a2732ae2021a19b24dd9bfe5bf3a965c937712..b852f5a454c07ca9684f7bb12aa62275c3121de3 100644 --- a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.h @@ -44,23 +44,24 @@ namespace plugin { class QkvToContextPluginDynamic : public DynamicPluginTensorRT { public: explicit QkvToContextPluginDynamic(int hidden, int head_number, int head_size, - float scale, bool ban_fp16) + float scale, bool with_fp16) : hidden_(hidden), head_number_(head_number), head_size_(head_size), - scale_(scale), - ban_fp16_(ban_fp16) {} + scale_(scale) { + with_fp16_ = with_fp16; + } QkvToContextPluginDynamic(void const* serial_data, size_t serial_length) { DeserializeValue(&serial_data, &serial_length, &hidden_); DeserializeValue(&serial_data, &serial_length, &head_number_); DeserializeValue(&serial_data, &serial_length, &head_size_); DeserializeValue(&serial_data, &serial_length, &scale_); - DeserializeValue(&serial_data, &serial_length, &ban_fp16_); + DeserializeValue(&serial_data, &serial_length, &with_fp16_); } nvinfer1::IPluginV2DynamicExt* clone() const override { return new QkvToContextPluginDynamic(hidden_, head_number_, head_size_, - scale_, ban_fp16_); + scale_, with_fp16_); } const char* getPluginType() const override { return "qkv_to_context_plugin"; } @@ -70,14 +71,14 @@ class QkvToContextPluginDynamic : public DynamicPluginTensorRT { size_t getSerializationSize() const override { return SerializedSize(hidden_) + SerializedSize(head_number_) + SerializedSize(head_size_) + SerializedSize(scale_) + - SerializedSize(ban_fp16_); + SerializedSize(with_fp16_); } void serialize(void* buffer) const override { SerializeValue(&buffer, hidden_); SerializeValue(&buffer, head_number_); SerializeValue(&buffer, head_size_); SerializeValue(&buffer, scale_); - SerializeValue(&buffer, ban_fp16_); + SerializeValue(&buffer, with_fp16_); } nvinfer1::DimsExprs getOutputDimensions( @@ -115,7 +116,6 @@ class QkvToContextPluginDynamic : public DynamicPluginTensorRT { int head_number_; int head_size_; float scale_; - bool ban_fp16_; }; class QkvToContextPluginV2Creator : public nvinfer1::IPluginCreator { diff --git a/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.cu index c51dae5e00c12ee632f29c59c6acd9a36e58b945..6b2b93ba2230faa3355075252a8e94db65f8df28 100644 --- a/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.cu @@ -66,19 +66,19 @@ bool SkipLayerNormPluginDynamic::supportsFormatCombination( const nvinfer1::PluginTensorDesc &in = in_out[pos]; if (pos == 0) { -#ifdef SUPPORTS_CUDA_FP16 - if (ban_fp16_) { - return (in.type == nvinfer1::DataType::kFLOAT) && - (in.format == nvinfer1::TensorFormat::kLINEAR); - } else { + if (with_fp16_) { +#ifdef TRT_PLUGIN_FP16_AVALIABLE return (in.type == nvinfer1::DataType::kFLOAT || in.type == nvinfer1::DataType::kHALF) && (in.format == nvinfer1::TensorFormat::kLINEAR); - } #else - return (in.type == nvinfer1::DataType::kFLOAT) && - (in.format == nvinfer1::TensorFormat::kLINEAR); + return (in.type == 
nvinfer1::DataType::kFLOAT) && + (in.format == nvinfer1::TensorFormat::kLINEAR); #endif + } else { + return (in.type == nvinfer1::DataType::kFLOAT) && + (in.format == nvinfer1::TensorFormat::kLINEAR); + } } const nvinfer1::PluginTensorDesc &prev = in_out[pos - 1]; @@ -114,6 +114,7 @@ int SkipLayerNormPluginDynamic::enqueue( auto input_type = input_desc[0].type; if (input_type == nvinfer1::DataType::kFLOAT) { + VLOG(1) << "TRT Plugin DataType selected. SkipLayerNorm-->fp32"; const float *input1 = static_cast(inputs[0]); const float *input2 = static_cast(inputs[1]); float *output = static_cast(outputs[0]); @@ -121,7 +122,8 @@ int SkipLayerNormPluginDynamic::enqueue( skip_layer_norm_func(num, hidden, input1, input2, scale_gpu_, bias_gpu_, output, eps_, stream); } else if (input_type == nvinfer1::DataType::kHALF) { -#ifdef SUPPORTS_CUDA_FP16 +#ifdef TRT_PLUGIN_FP16_AVALIABLE + VLOG(1) << "TRT Plugin DataType selected. SkipLayerNorm-->fp16"; const half *input1 = static_cast(inputs[0]); const half *input2 = static_cast(inputs[1]); half *output = static_cast(outputs[0]); @@ -130,7 +132,11 @@ int SkipLayerNormPluginDynamic::enqueue( output, static_cast(eps_), stream); #else PADDLE_THROW(platform::errors::Fatal( - "The cuda archs you specific should greater than 600.")); + "The Ernie(Bert) tensorRT plugin should be " + "compiled with CUDA version >= 10.0 when running with fp16. " + "Please recompile it or try to use fp32 by setting " + "config.SetTRTDynamicShapeInfo(min_input_shape, " + "max_input_shape, opt_input_shape, true")); #endif } else { PADDLE_THROW(platform::errors::Fatal( diff --git a/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.h index 24cd8e0368182ae597e48765bc0167ca1eca6bd3..f1f50304db750967889e731b7e0f579bda04be2f 100644 --- a/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.h @@ -31,11 +31,9 @@ class SkipLayerNormPluginDynamic : public DynamicPluginTensorRT { public: explicit SkipLayerNormPluginDynamic(const float* bias, const float* scale, int bias_size, int scale_size, - const float eps, bool ban_fp16) - : bias_size_(bias_size), - scale_size_(scale_size), - eps_(eps), - ban_fp16_(ban_fp16) { + const float eps, bool with_fp16) + : bias_size_(bias_size), scale_size_(scale_size), eps_(eps) { + with_fp16_ = with_fp16; bias_.resize(bias_size); scale_.resize(scale_size); std::copy(bias, bias + bias_size, bias_.data()); @@ -47,12 +45,12 @@ class SkipLayerNormPluginDynamic : public DynamicPluginTensorRT { DeserializeValue(&serial_data, &serial_length, &bias_size_); DeserializeValue(&serial_data, &serial_length, &scale_size_); DeserializeValue(&serial_data, &serial_length, &eps_); - DeserializeValue(&serial_data, &serial_length, &ban_fp16_); + DeserializeValue(&serial_data, &serial_length, &with_fp16_); } nvinfer1::IPluginV2DynamicExt* clone() const override { auto ptr = new SkipLayerNormPluginDynamic( - bias_.data(), scale_.data(), bias_size_, scale_size_, eps_, ban_fp16_); + bias_.data(), scale_.data(), bias_size_, scale_size_, eps_, with_fp16_); ptr->bias_gpu_ = bias_gpu_; ptr->scale_gpu_ = bias_gpu_; return ptr; @@ -65,7 +63,7 @@ class SkipLayerNormPluginDynamic : public DynamicPluginTensorRT { size_t getSerializationSize() const override { size_t ser_size = SerializedSize(bias_) + SerializedSize(scale_) + SerializedSize(bias_size_) + SerializedSize(scale_size_) + - SerializedSize(eps_) + SerializedSize(eps_); + 
SerializedSize(eps_) + SerializedSize(with_fp16_); return ser_size; } void serialize(void* buffer) const override { @@ -74,7 +72,7 @@ class SkipLayerNormPluginDynamic : public DynamicPluginTensorRT { SerializeValue(&buffer, bias_size_); SerializeValue(&buffer, scale_size_); SerializeValue(&buffer, eps_); - SerializeValue(&buffer, ban_fp16_); + SerializeValue(&buffer, with_fp16_); } nvinfer1::DimsExprs getOutputDimensions( @@ -118,7 +116,6 @@ class SkipLayerNormPluginDynamic : public DynamicPluginTensorRT { int scale_size_; float eps_; - bool ban_fp16_; }; class SkipLayerNormPluginV2Creator : public nvinfer1::IPluginCreator { diff --git a/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.cu index e923887eef6025c26fd7b3e9924aa9ea4926d056..e07fee48d179b84235289b9504b07a3f8a5ee46f 100644 --- a/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.cu @@ -61,8 +61,9 @@ __global__ void SliceKernel(int num, int dims, const T *input, } SlicePlugin::SlicePlugin(std::vector starts, std::vector ends, - std::vector axes, bool ban_fp16) - : starts_(starts), ends_(ends), axes_(axes), ban_fp16_(ban_fp16) { + std::vector axes, bool with_fp16) + : starts_(starts), ends_(ends), axes_(axes) { + with_fp16_ = with_fp16; cudaEventCreate(©_event_); cudaStreamCreate(©_stream_); } @@ -72,7 +73,6 @@ SlicePlugin::SlicePlugin(void const *serial_data, size_t serial_length) { DeserializeValue(&serial_data, &serial_length, &starts_); DeserializeValue(&serial_data, &serial_length, &ends_); DeserializeValue(&serial_data, &serial_length, &axes_); - DeserializeValue(&serial_data, &serial_length, &ban_fp16_); cudaEventCreate(©_event_); cudaStreamCreate(©_stream_); } @@ -84,19 +84,19 @@ SlicePlugin::~SlicePlugin() { } SlicePlugin *SlicePlugin::clone() const { - return new SlicePlugin(starts_, ends_, axes_, ban_fp16_); + return new SlicePlugin(starts_, ends_, axes_, with_fp16_); } bool SlicePlugin::supportsFormat(nvinfer1::DataType type, nvinfer1::PluginFormat format) const { -#ifdef SUPPORTS_CUDA_FP16 - return ((type == nvinfer1::DataType::kFLOAT || - type == nvinfer1::DataType::kHALF) && - (format == nvinfer1::PluginFormat::kNCHW)); -#else - return ((type == nvinfer1::DataType::kFLOAT) && - (format == nvinfer1::PluginFormat::kNCHW)); -#endif + if (with_fp16_) { + return ((type == nvinfer1::DataType::kFLOAT || + type == nvinfer1::DataType::kHALF) && + (format == nvinfer1::PluginFormat::kNCHW)); + } else { + return ((type == nvinfer1::DataType::kFLOAT) && + (format == nvinfer1::PluginFormat::kNCHW)); + } } nvinfer1::Dims SlicePlugin::getOutputDimensions(int index, @@ -172,20 +172,17 @@ int SlicePlugin::enqueue(int batch_size, const void *const *inputs, int blocks = (out_num + threads - 1) / threads; auto input_type = getDataType(); if (input_type == nvinfer1::DataType::kFLOAT) { + VLOG(1) << "TRT Plugin DataType selected. Slice-->fp32"; const float *input1 = static_cast(inputs[0]); float *output = static_cast(outputs[0]); SliceKernel<<>>( out_num, num_dims, input1, offset_temp_data_, output); } else if (input_type == nvinfer1::DataType::kHALF) { -#ifdef SUPPORTS_CUDA_FP16 + VLOG(1) << "TRT Plugin DataType selected. 
Slice-->fp16"; const half *input1 = static_cast(inputs[0]); half *output = static_cast(outputs[0]); SliceKernel<<>>( out_num, num_dims, input1, offset_temp_data_, output); -#else - PADDLE_THROW(platform::errors::Fatal( - "The cuda archs you specific should greater than 600.")); -#endif } else { PADDLE_THROW(platform::errors::Fatal( "The Slice TRT Plugin's input type should be float or half.")); @@ -196,7 +193,7 @@ int SlicePlugin::enqueue(int batch_size, const void *const *inputs, size_t SlicePlugin::getSerializationSize() { return getBaseSerializationSize() + SerializedSize(getPluginType()) + SerializedSize(starts_) + SerializedSize(ends_) + - SerializedSize(axes_) + SerializedSize(ban_fp16_); + SerializedSize(axes_); } void SlicePlugin::serialize(void *buffer) { @@ -205,15 +202,15 @@ void SlicePlugin::serialize(void *buffer) { SerializeValue(&buffer, starts_); SerializeValue(&buffer, ends_); SerializeValue(&buffer, axes_); - SerializeValue(&buffer, ban_fp16_); } // Dynamic Plugin below. #if IS_TRT_VERSION_GE(6000) SlicePluginDynamic::SlicePluginDynamic(std::vector starts, std::vector ends, - std::vector axes, bool ban_fp16) - : starts_(starts), ends_(ends), axes_(axes), ban_fp16_(ban_fp16) { + std::vector axes, bool with_fp16) + : starts_(starts), ends_(ends), axes_(axes) { + with_fp16_ = with_fp16; cudaEventCreate(©_event_); cudaStreamCreate(©_stream_); } @@ -223,7 +220,7 @@ SlicePluginDynamic::SlicePluginDynamic(void const *serialData, DeserializeValue(&serialData, &serialLength, &starts_); DeserializeValue(&serialData, &serialLength, &ends_); DeserializeValue(&serialData, &serialLength, &axes_); - DeserializeValue(&serialData, &serialLength, &ban_fp16_); + DeserializeValue(&serialData, &serialLength, &with_fp16_); cudaEventCreate(©_event_); cudaStreamCreate(©_stream_); } @@ -239,7 +236,7 @@ int SlicePluginDynamic::initialize() { return 0; } size_t SlicePluginDynamic::getSerializationSize() const { size_t size = SerializedSize(starts_) + SerializedSize(ends_) + - SerializedSize(axes_) + SerializedSize(ban_fp16_); + SerializedSize(axes_) + SerializedSize(with_fp16_); return size; } @@ -248,7 +245,7 @@ void SlicePluginDynamic::serialize(void *buffer) const { SerializeValue(&buffer, starts_); SerializeValue(&buffer, ends_); SerializeValue(&buffer, axes_); - SerializeValue(&buffer, ban_fp16_); + SerializeValue(&buffer, with_fp16_); } nvinfer1::DimsExprs SlicePluginDynamic::getOutputDimensions( @@ -280,19 +277,14 @@ bool SlicePluginDynamic::supportsFormatCombination( const nvinfer1::PluginTensorDesc &in = in_out[pos]; if (pos == 0) { -#ifdef SUPPORTS_CUDA_FP16 - if (ban_fp16_) { - return (in.type == nvinfer1::DataType::kFLOAT) && - (in.format == nvinfer1::TensorFormat::kLINEAR); - } else { + if (with_fp16_) { return (in.type == nvinfer1::DataType::kFLOAT || in.type == nvinfer1::DataType::kHALF) && (in.format == nvinfer1::TensorFormat::kLINEAR); + } else { + return (in.type == nvinfer1::DataType::kFLOAT) && + (in.format == nvinfer1::TensorFormat::kLINEAR); } -#else - return (in.type == nvinfer1::DataType::kFLOAT) && - (in.format == nvinfer1::TensorFormat::kLINEAR); -#endif } const nvinfer1::PluginTensorDesc &prev = in_out[pos - 1]; // output @@ -364,20 +356,17 @@ int SlicePluginDynamic::enqueue(const nvinfer1::PluginTensorDesc *input_desc, int blocks = (out_num + threads - 1) / threads; auto input_type = input_desc[0].type; if (input_type == nvinfer1::DataType::kFLOAT) { + VLOG(1) << "TRT Plugin DataType selected. 
Slice-->fp32"; const float *input1 = static_cast(inputs[0]); float *output = static_cast(outputs[0]); SliceKernel<<>>( out_num, num_dims, input1, offset_temp_data_, output); } else if (input_type == nvinfer1::DataType::kHALF) { -#ifdef SUPPORTS_CUDA_FP16 + VLOG(1) << "TRT Plugin DataType selected. Slice-->fp16"; const half *input1 = static_cast(inputs[0]); half *output = static_cast(outputs[0]); SliceKernel<<>>( out_num, num_dims, input1, offset_temp_data_, output); -#else - PADDLE_THROW(platform::errors::Fatal( - "The cuda archs you specific should greater than 600.")); -#endif } else { PADDLE_THROW(platform::errors::Fatal( "The Slice TRT Plugin's input type should be float or half.")); diff --git a/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.h index e36a270f05d9fee497fa1a033ed16faf08c08225..340406c5e7fae8bf3f298228259e9fa33fc76887 100644 --- a/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.h @@ -29,7 +29,7 @@ namespace plugin { class SlicePlugin : public PluginTensorRT { public: explicit SlicePlugin(std::vector starts, std::vector ends, - std::vector axes, bool ban_fp16); + std::vector axes, bool with_fp16); // It was used for tensorrt deserialization. // It should not be called by users. @@ -58,7 +58,6 @@ class SlicePlugin : public PluginTensorRT { std::vector starts_; std::vector ends_; std::vector axes_; - bool ban_fp16_{false}; int* offset_temp_data_{nullptr}; cudaEvent_t copy_event_; cudaStream_t copy_stream_; @@ -68,10 +67,10 @@ class SlicePlugin : public PluginTensorRT { class SlicePluginDynamic : public DynamicPluginTensorRT { public: explicit SlicePluginDynamic(std::vector starts, std::vector ends, - std::vector axes, bool ban_fp16); + std::vector axes, bool with_fp16); nvinfer1::IPluginV2DynamicExt* clone() const override { - return new SlicePluginDynamic(starts_, ends_, axes_, ban_fp16_); + return new SlicePluginDynamic(starts_, ends_, axes_, with_fp16_); } SlicePluginDynamic(void const* serialData, size_t serialLength); @@ -117,7 +116,6 @@ class SlicePluginDynamic : public DynamicPluginTensorRT { std::vector starts_; std::vector ends_; std::vector axes_; - bool ban_fp16_{false}; int* offset_temp_data_{nullptr}; cudaEvent_t copy_event_; cudaStream_t copy_stream_; diff --git a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu index 9eefb925d2061f398db53bc7d4c99ac0e8636678..2f4f731d887b72430d05a6f235b3512d357efb94 100644 --- a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu @@ -145,9 +145,16 @@ int SplitPlugin::enqueue(int batchSize, const void* const* inputs, #if IS_TRT_VERSION_GE(6000) int SplitPluginDynamic::initialize() { return 0; } -size_t SplitPluginDynamic::getSerializationSize() const { return 0; } +size_t SplitPluginDynamic::getSerializationSize() const { + return SerializedSize(axis_) + SerializedSize(output_length_) + + SerializedSize(with_fp16_); +} -void SplitPluginDynamic::serialize(void* buffer) const {} +void SplitPluginDynamic::serialize(void* buffer) const { + SerializeValue(&buffer, axis_); + SerializeValue(&buffer, output_length_); + SerializeValue(&buffer, with_fp16_); +} nvinfer1::DimsExprs SplitPluginDynamic::getOutputDimensions( int output_index, const nvinfer1::DimsExprs* inputs, int nb_inputs, @@ -183,14 +190,14 @@ bool SplitPluginDynamic::supportsFormatCombination( const 
nvinfer1::PluginTensorDesc& in = in_out[pos]; if (pos == 0) { -#ifdef SUPPORTS_CUDA_FP16 - return (in.type == nvinfer1::DataType::kFLOAT || - in.type == nvinfer1::DataType::kHALF) && - (in.format == nvinfer1::TensorFormat::kLINEAR); -#else - return (in.type == nvinfer1::DataType::kFLOAT) && - (in.format == nvinfer1::TensorFormat::kLINEAR); -#endif + if (with_fp16_) { + return (in.type == nvinfer1::DataType::kFLOAT || + in.type == nvinfer1::DataType::kHALF) && + (in.format == nvinfer1::TensorFormat::kLINEAR); + } else { + return (in.type == nvinfer1::DataType::kFLOAT) && + (in.format == nvinfer1::TensorFormat::kLINEAR); + } } const nvinfer1::PluginTensorDesc& prev = in_out[pos - 1]; // output @@ -234,6 +241,7 @@ int SplitPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* input_desc, auto input_type = input_desc[0].type; if (input_type == nvinfer1::DataType::kFLOAT) { + VLOG(1) << "TRT Plugin DataType selected. Split-->fp32"; thrust::device_vector d_output_ptrs; d_output_ptrs.resize(this->getNbOutputs(), nullptr); @@ -249,7 +257,7 @@ int SplitPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* input_desc, d_segment_offsets.size(), d_segment_offsets_ptr, input_ptr, output_ptrs, inner_cols, axis_shape, outer_rows); } else if (input_type == nvinfer1::DataType::kHALF) { -#ifdef SUPPORTS_CUDA_FP16 + VLOG(1) << "TRT Plugin DataType selected. Split-->fp16"; thrust::device_vector d_output_ptrs; d_output_ptrs.resize(this->getNbOutputs(), nullptr); @@ -264,10 +272,6 @@ int SplitPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* input_desc, split_kernel<<>>( d_segment_offsets.size(), d_segment_offsets_ptr, input_ptr, output_ptrs, inner_cols, axis_shape, outer_rows); -#else - PADDLE_THROW(platform::errors::Fatal( - "The cuda archs you specific should greater than 600.")); -#endif } return cudaGetLastError() != cudaSuccess; } diff --git a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h index bf40957c4fd9ef9186c9f0f38efd4a88fcfe0917..e3057f2bd180362388e4a4061321e633ff861c82 100644 --- a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h @@ -15,6 +15,7 @@ #pragma once #include +#include #include #include #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" @@ -27,8 +28,10 @@ namespace plugin { class SplitPlugin : public PluginTensorRT { public: SplitPlugin() {} - SplitPlugin(int axis, std::vector const& output_lengths) - : axis_(axis), same_shape_(true), output_length_(output_lengths) {} + SplitPlugin(int axis, std::vector const& output_lengths, bool with_fp16) + : axis_(axis), same_shape_(true), output_length_(output_lengths) { + with_fp16_ = with_fp16; + } SplitPlugin(void const* serial_data, size_t serial_length) { deserializeBase(serial_data, serial_length); @@ -37,7 +40,7 @@ class SplitPlugin : public PluginTensorRT { } SplitPlugin* clone() const override { - return new SplitPlugin(axis_, output_length_); + return new SplitPlugin(axis_, output_length_, with_fp16_); } const char* getPluginType() const override { return "split_plugin"; } @@ -77,13 +80,20 @@ class SplitPlugin : public PluginTensorRT { #if IS_TRT_VERSION_GE(6000) class SplitPluginDynamic : public DynamicPluginTensorRT { public: - SplitPluginDynamic(int axis, std::vector const& output_lengths) - : axis_(axis), output_length_(output_lengths) {} + SplitPluginDynamic(int axis, std::vector const& output_lengths, + bool with_fp16) + : axis_(axis), output_length_(output_lengths) { + 
with_fp16_ = with_fp16; + } - SplitPluginDynamic(void const* serial_data, size_t serial_length) {} + SplitPluginDynamic(void const* serial_data, size_t serial_length) { + DeserializeValue(&serial_data, &serial_length, &axis_); + DeserializeValue(&serial_data, &serial_length, &output_length_); + DeserializeValue(&serial_data, &serial_length, &with_fp16_); + } nvinfer1::IPluginV2DynamicExt* clone() const override { - return new SplitPluginDynamic(axis_, output_length_); + return new SplitPluginDynamic(axis_, output_length_, with_fp16_); } const char* getPluginType() const override { return "split_plugin"; } @@ -127,6 +137,46 @@ class SplitPluginDynamic : public DynamicPluginTensorRT { int axis_; std::vector output_length_; }; + +class SplitPluginV2Creator : public nvinfer1::IPluginCreator { + public: + SplitPluginV2Creator() {} + const char* getPluginName() const override { return "split_plugin"; } + + const char* getPluginVersion() const override { return "1"; } + + const nvinfer1::PluginFieldCollection* getFieldNames() override { + return &field_collection_; + } + + nvinfer1::IPluginV2* createPlugin( + const char* name, const nvinfer1::PluginFieldCollection* fc) override { + return nullptr; + } + + nvinfer1::IPluginV2* deserializePlugin(const char* name, + const void* serial_data, + size_t serial_length) override { + auto plugin = new SplitPluginDynamic(serial_data, serial_length); + return plugin; + } + + void setPluginNamespace(const char* lib_namespace) override { + plugin_namespace_ = lib_namespace; + } + + const char* getPluginNamespace() const override { + return plugin_namespace_.c_str(); + } + + private: + std::string plugin_namespace_; + std::string plugin_name_; + nvinfer1::PluginFieldCollection field_collection_{0, nullptr}; + std::vector plugin_attributes_; +}; + +REGISTER_TRT_PLUGIN_V2(SplitPluginV2Creator); #endif } // namespace plugin diff --git a/paddle/fluid/inference/tensorrt/plugin/stack_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/stack_op_plugin.cu index 1ecbf4be154f01059ef33e2d510d8329d6726314..79ec2066faa130e191ab34f58a030b607172c218 100644 --- a/paddle/fluid/inference/tensorrt/plugin/stack_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/stack_op_plugin.cu @@ -24,19 +24,22 @@ namespace tensorrt { namespace plugin { #if IS_TRT_VERSION_GE(6000) -StackPluginDynamic::StackPluginDynamic(int axis, int num_stack) - : axis_(axis), num_stack_(num_stack) {} +StackPluginDynamic::StackPluginDynamic(int axis, int num_stack, bool with_fp16) + : axis_(axis), num_stack_(num_stack) { + with_fp16_ = with_fp16; +} StackPluginDynamic::StackPluginDynamic(void const* serial_data, size_t serial_length) { DeserializeValue(&serial_data, &serial_length, &axis_); DeserializeValue(&serial_data, &serial_length, &num_stack_); + DeserializeValue(&serial_data, &serial_length, &with_fp16_); } StackPluginDynamic::~StackPluginDynamic() {} nvinfer1::IPluginV2DynamicExt* StackPluginDynamic::clone() const { - return new StackPluginDynamic(axis_, num_stack_); + return new StackPluginDynamic(axis_, num_stack_, with_fp16_); } const char* StackPluginDynamic::getPluginType() const { return "stack_plugin"; } @@ -49,12 +52,14 @@ size_t StackPluginDynamic::getSerializationSize() const { size_t serialize_size = 0; serialize_size += SerializedSize(axis_); serialize_size += SerializedSize(num_stack_); + serialize_size += SerializedSize(with_fp16_); return serialize_size; } void StackPluginDynamic::serialize(void* buffer) const { SerializeValue(&buffer, axis_); SerializeValue(&buffer, num_stack_); 
+ SerializeValue(&buffer, with_fp16_); } nvinfer1::DimsExprs StackPluginDynamic::getOutputDimensions( @@ -99,14 +104,14 @@ bool StackPluginDynamic::supportsFormatCombination( const nvinfer1::PluginTensorDesc& in = in_out[pos]; if (pos == 0) { -#ifdef SUPPORTS_CUDA_FP16 - return (in.type == nvinfer1::DataType::kFLOAT || - in.type == nvinfer1::DataType::kHALF) && - (in.format == nvinfer1::TensorFormat::kLINEAR); -#else - return (in.type == nvinfer1::DataType::kFLOAT) && - (in.format == nvinfer1::TensorFormat::kLINEAR); -#endif + if (with_fp16_) { + return (in.type == nvinfer1::DataType::kFLOAT || + in.type == nvinfer1::DataType::kHALF) && + (in.format == nvinfer1::TensorFormat::kLINEAR); + } else { + return (in.type == nvinfer1::DataType::kFLOAT) && + (in.format == nvinfer1::TensorFormat::kLINEAR); + } } const nvinfer1::PluginTensorDesc& prev = in_out[pos - 1]; // output @@ -170,20 +175,17 @@ int StackPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* input_desc, auto infer_type = input_desc[0].type; if (infer_type == nvinfer1::DataType::kFLOAT) { + VLOG(1) << "TRT Plugin DataType selected. Stack-->fp32"; float* output = static_cast(outputs[0]); StackKernel<<>>( reinterpret_cast(workspace), output, num_stacks, base_unit); } else if (infer_type == nvinfer1::DataType::kHALF) { -#ifdef SUPPORTS_CUDA_FP16 + VLOG(1) << "TRT Plugin DataType selected. Stack-->fp16"; __half* output = static_cast<__half*>(outputs[0]); StackKernel<__half><<>>( reinterpret_cast(workspace), output, num_stacks, base_unit); -#else - PADDLE_THROW(platform::errors::Fatal( - "The cuda archs you specific should greater than 600.")); -#endif } else { PADDLE_THROW( platform::errors::Fatal("The Stack TRT Plugin's input type only " @@ -209,6 +211,7 @@ nvinfer1::IPluginV2* StackPluginDynamicCreator::createPlugin( const char* name, const nvinfer1::PluginFieldCollection* fc) { int axis = -1; int num_stack = -1; + bool with_fp16 = false; for (int i = 0; i < fc->nbFields; ++i) { const std::string name(fc->fields[i].name); @@ -216,13 +219,15 @@ nvinfer1::IPluginV2* StackPluginDynamicCreator::createPlugin( axis = static_cast(fc->fields[i].data)[0]; } else if (name == "num_stack") { num_stack = static_cast(fc->fields[i].data)[0]; + } else if (name == "with_fp16") { + with_fp16 = static_cast(fc->fields[i].data)[0]; } else { PADDLE_THROW(platform::errors::Fatal("Meet an unknown plugin field '" + name + "' when creating stack op plugin.")); } } - return new StackPluginDynamic(axis, num_stack); + return new StackPluginDynamic(axis, num_stack, with_fp16); } nvinfer1::IPluginV2* StackPluginDynamicCreator::deserializePlugin( diff --git a/paddle/fluid/inference/tensorrt/plugin/stack_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/stack_op_plugin.h index f4f6cde6f87ea97c514e68bc2862bb163b0aa448..cd8adaf754957239be7030bd988b9a941f084e09 100644 --- a/paddle/fluid/inference/tensorrt/plugin/stack_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/stack_op_plugin.h @@ -28,7 +28,7 @@ namespace plugin { #if IS_TRT_VERSION_GE(6000) class StackPluginDynamic : public DynamicPluginTensorRT { public: - explicit StackPluginDynamic(int axis, int num_stack); + explicit StackPluginDynamic(int axis, int num_stack, bool with_fp16); StackPluginDynamic(void const* serial_data, size_t serial_length); ~StackPluginDynamic(); nvinfer1::IPluginV2DynamicExt* clone() const override; diff --git a/paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.cu index 
58e5afd6019f0ac68ad2ada92c9c11f180aa6056..3847d999446e99dfe0bcdc7abfa06ac6c57e64e2 100644 --- a/paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.cu @@ -44,12 +44,12 @@ nvinfer1::Dims SwishPlugin::getOutputDimensions(int index, template __device__ T math_exp(T a); -#ifdef SUPPORTS_CUDA_FP16 template <> __device__ half math_exp(half a) { +#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__) return hexp(a); -} #endif +} template <> __device__ float math_exp(float a) { @@ -71,6 +71,19 @@ __global__ void swish_kernel(int num, const T *input, T *output, T beta) { } } +template <> +__global__ void swish_kernel(int num, const half *input, half *output, + half beta) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < num) { +#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__) + output[index] = + __ldg(input + index) / + (static_cast(1.0) + math_exp(-beta * __ldg(input + index))); +#endif + } +} + int SwishPlugin::enqueue(int batch_size, const void *const *inputs, void **outputs, void *workspace, cudaStream_t stream) { // input dims is CHW. @@ -92,14 +105,18 @@ int SwishPlugin::enqueue(int batch_size, const void *const *inputs, #if IS_TRT_VERSION_GE(6000) int SwishPluginDynamic::initialize() { - setPluginNamespace("swish"); getPluginNamespace(); return 0; } -size_t SwishPluginDynamic::getSerializationSize() const { return 0; } +size_t SwishPluginDynamic::getSerializationSize() const { + return SerializedSize(beta_) + SerializedSize(with_fp16_); +} -void SwishPluginDynamic::serialize(void *buffer) const {} +void SwishPluginDynamic::serialize(void *buffer) const { + SerializeValue(&buffer, beta_); + SerializeValue(&buffer, with_fp16_); +} nvinfer1::DimsExprs SwishPluginDynamic::getOutputDimensions( int output_index, const nvinfer1::DimsExprs *inputs, int nb_inputs, @@ -123,14 +140,14 @@ bool SwishPluginDynamic::supportsFormatCombination( const nvinfer1::PluginTensorDesc &in = in_out[pos]; if (pos == 0) { -#ifdef SUPPORTS_CUDA_FP16 - return (in.type == nvinfer1::DataType::kFLOAT || - in.type == nvinfer1::DataType::kHALF) && - (in.format == nvinfer1::TensorFormat::kLINEAR); -#else - return (in.type == nvinfer1::DataType::kFLOAT) && - (in.format == nvinfer1::TensorFormat::kLINEAR); -#endif + if (with_fp16_) { + return (in.type == nvinfer1::DataType::kFLOAT || + in.type == nvinfer1::DataType::kHALF) && + (in.format == nvinfer1::TensorFormat::kLINEAR); + } else { + return (in.type == nvinfer1::DataType::kFLOAT) && + (in.format == nvinfer1::TensorFormat::kLINEAR); + } } const nvinfer1::PluginTensorDesc &prev = in_out[pos - 1]; // output @@ -157,20 +174,17 @@ int SwishPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc *input_desc, auto input_type = input_desc[0].type; if (input_type == nvinfer1::DataType::kFLOAT) { + VLOG(1) << "TRT Plugin DataType selected. Swish-->fp32"; const float *input = static_cast(inputs[0]); float *output = static_cast(outputs[0]); swish_kernel<<>>(num, input, output, beta_); } else if (input_type == nvinfer1::DataType::kHALF) { -#ifdef SUPPORTS_CUDA_FP16 + VLOG(1) << "TRT Plugin DataType selected. 
Swish-->fp16"; const half *input = static_cast(inputs[0]); half *output = static_cast(outputs[0]); swish_kernel<<>>( num, input, output, static_cast(beta_)); -#else - PADDLE_THROW(platform::errors::Fatal( - "The cuda archs you specific should greater than 600.")); -#endif } else { PADDLE_THROW(platform::errors::InvalidArgument( "The Swish TRT Plugin's input type should be float or half.")); diff --git a/paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.h index 6defdae0eef0820c7b9a050df299c699f6d6566c..85cc6916238fefa028310b07e02301f10e07aefd 100644 --- a/paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.h @@ -32,7 +32,8 @@ class SwishPlugin : public PluginTensorRT { protected: size_t getSerializationSize() override { - return getBaseSerializationSize() + SerializedSize(beta_); + return SerializedSize(getPluginType()) + getBaseSerializationSize() + + SerializedSize(beta_); } // TRT will call this func when we need to serialize the configuration of @@ -45,7 +46,9 @@ class SwishPlugin : public PluginTensorRT { } public: - explicit SwishPlugin(const float beta) : beta_(beta) {} + explicit SwishPlugin(const float beta, const bool with_fp16) : beta_(beta) { + with_fp16_ = with_fp16; + } // It was used for tensorrt deserialization. // It should not be called by users. @@ -56,7 +59,9 @@ class SwishPlugin : public PluginTensorRT { ~SwishPlugin() {} int initialize() override; - SwishPlugin* clone() const override { return new SwishPlugin(beta_); } + SwishPlugin* clone() const override { + return new SwishPlugin(beta_, with_fp16_); + } const char* getPluginType() const override { return "swish_plugin"; } int getNbOutputs() const override { return 1; } @@ -69,10 +74,16 @@ class SwishPlugin : public PluginTensorRT { #if IS_TRT_VERSION_GE(6000) class SwishPluginDynamic : public DynamicPluginTensorRT { public: - explicit SwishPluginDynamic(const float beta) : beta_(beta) {} - SwishPluginDynamic(void const* serialData, size_t serialLength) {} + explicit SwishPluginDynamic(const float beta, const bool with_fp16) + : beta_(beta) { + with_fp16_ = with_fp16; + } + SwishPluginDynamic(void const* serialData, size_t serialLength) { + DeserializeValue(&serialData, &serialLength, &beta_); + DeserializeValue(&serialData, &serialLength, &with_fp16_); + } nvinfer1::IPluginV2DynamicExt* clone() const override { - return new SwishPluginDynamic(beta_); + return new SwishPluginDynamic(beta_, with_fp16_); } const char* getPluginType() const override { return "swish_plugin"; } @@ -115,6 +126,46 @@ class SwishPluginDynamic : public DynamicPluginTensorRT { private: float beta_; }; + +class SwishPluginV2Creator : public nvinfer1::IPluginCreator { + public: + SwishPluginV2Creator() {} + const char* getPluginName() const override { return "swish_plugin"; } + + const char* getPluginVersion() const override { return "1"; } + + const nvinfer1::PluginFieldCollection* getFieldNames() override { + return &field_collection_; + } + + nvinfer1::IPluginV2* createPlugin( + const char* name, const nvinfer1::PluginFieldCollection* fc) override { + return nullptr; + } + + nvinfer1::IPluginV2* deserializePlugin(const char* name, + const void* serial_data, + size_t serial_length) override { + auto plugin = new SwishPluginDynamic(serial_data, serial_length); + return plugin; + } + + void setPluginNamespace(const char* lib_namespace) override { + plugin_namespace_ = lib_namespace; + } + + const char* 
getPluginNamespace() const override { + return plugin_namespace_.c_str(); + } + + private: + std::string plugin_namespace_; + std::string plugin_name_; + nvinfer1::PluginFieldCollection field_collection_{0, nullptr}; + std::vector plugin_attributes_; +}; + +REGISTER_TRT_PLUGIN_V2(SwishPluginV2Creator); #endif } // namespace plugin diff --git a/paddle/fluid/inference/tensorrt/plugin/trt_plugin.cc b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.cc index b0f4cff3ac184beeed2ebd3a4b7531d570c87075..fd721b161450d7a8d4660ca09ea3a1093d754664 100644 --- a/paddle/fluid/inference/tensorrt/plugin/trt_plugin.cc +++ b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.cc @@ -24,6 +24,7 @@ void PluginTensorRT::serializeBase(void*& buffer) { SerializeValue(&buffer, max_batch_size_); SerializeValue(&buffer, data_type_); SerializeValue(&buffer, data_format_); + SerializeValue(&buffer, with_fp16_); } void PluginTensorRT::deserializeBase(void const*& serial_data, @@ -32,11 +33,13 @@ void PluginTensorRT::deserializeBase(void const*& serial_data, DeserializeValue(&serial_data, &serial_length, &max_batch_size_); DeserializeValue(&serial_data, &serial_length, &data_type_); DeserializeValue(&serial_data, &serial_length, &data_format_); + DeserializeValue(&serial_data, &serial_length, &with_fp16_); } size_t PluginTensorRT::getBaseSerializationSize() { return (SerializedSize(input_dims_) + SerializedSize(max_batch_size_) + - SerializedSize(data_type_) + SerializedSize(data_format_)); + SerializedSize(data_type_) + SerializedSize(data_format_) + + SerializedSize(with_fp16_)); } bool PluginTensorRT::supportsFormat(nvinfer1::DataType type, diff --git a/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h index 528adacb27c9897420a5115a93c88c246c0d78d8..871bd89ce6bde7a5b81050ddc94ddd1701b02f8e 100644 --- a/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h @@ -42,7 +42,7 @@ typedef std::function PluginConstructFunc; class PluginTensorRT : public nvinfer1::IPluginExt { public: - PluginTensorRT() {} + PluginTensorRT() : with_fp16_(false) {} // It was used for TensorRT deserialization. // It should not be called by users. 
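The SwishPluginDynamic hunks above pair a real serialize()/getSerializationSize() with a deserializing constructor, and the new SwishPluginV2Creator only has to call that constructor from deserializePlugin() once it is registered. The sketch below is a minimal standalone illustration of the invariant this relies on; WriteValue, ReadValue and SwishStateSketch are made-up stand-ins, not Paddle's SerializeValue/DeserializeValue helpers: the fields must be written and read back in the same order, and the reported size must match the bytes actually written, otherwise beta_ and with_fp16_ come back wrong after an engine reload.

// Standalone sketch (hypothetical helpers, not Paddle's real serialization API).
#include <cassert>
#include <cstring>
#include <vector>

template <typename T>
void WriteValue(char** buf, const T& v) {  // append one POD field
  std::memcpy(*buf, &v, sizeof(T));
  *buf += sizeof(T);
}

template <typename T>
void ReadValue(const char** buf, size_t* left, T* v) {  // read it back
  assert(*left >= sizeof(T));
  std::memcpy(v, *buf, sizeof(T));
  *buf += sizeof(T);
  *left -= sizeof(T);
}

struct SwishStateSketch {  // the two fields SwishPluginDynamic keeps
  float beta = 1.0f;
  bool with_fp16 = false;

  size_t SerializedBytes() const { return sizeof(beta) + sizeof(with_fp16); }

  void Serialize(char* buf) const {  // must mirror Deserialize's field order
    WriteValue(&buf, beta);
    WriteValue(&buf, with_fp16);
  }

  static SwishStateSketch Deserialize(const char* buf, size_t len) {
    SwishStateSketch s;
    ReadValue(&buf, &len, &s.beta);
    ReadValue(&buf, &len, &s.with_fp16);
    return s;
  }
};

int main() {
  SwishStateSketch src{2.0f, true};
  std::vector<char> storage(src.SerializedBytes());
  src.Serialize(storage.data());
  SwishStateSketch dst =
      SwishStateSketch::Deserialize(storage.data(), storage.size());
  assert(dst.beta == src.beta && dst.with_fp16 == src.with_fp16);
  return 0;
}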
PluginTensorRT(const void* serialized_data, size_t length) {} @@ -112,12 +112,13 @@ class PluginTensorRT : public nvinfer1::IPluginExt { nvinfer1::PluginFormat data_format_; std::vector inputs_; + bool with_fp16_; }; #if IS_TRT_VERSION_GE(6000) class DynamicPluginTensorRT : public nvinfer1::IPluginV2DynamicExt { public: - DynamicPluginTensorRT() {} + DynamicPluginTensorRT() : with_fp16_(false) {} DynamicPluginTensorRT(const void* serialized_data, size_t length) {} // The Func in IPluginExt or IpluginExtV2 @@ -173,6 +174,7 @@ class DynamicPluginTensorRT : public nvinfer1::IPluginV2DynamicExt { size_t& serial_length); // NOLINT size_t getBaseSerializationSize() const; void serializeBase(void*& buffer) const; // NOLINT + bool with_fp16_; private: std::string name_space_; diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index 27bd28f6b30123048cc4d7aa7654fa4138ae57d4..5670984bc3a1fcafac5221a98f0d7a8d9771c426 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -439,11 +439,18 @@ if(WITH_GPU AND TENSORRT_FOUND) inference_download_and_uncompress(${TEST_TRT_ERNIE_MODEL} ${INFERENCE_URL}/tensorrt_test "ernie_model_4_unserialized.tgz") endif() - inference_analysis_test(test_trt_dynamic_shape_ernie_ser_deser SRCS trt_dynamic_shape_ernie_deserialize_test.cc + inference_analysis_test(test_trt_dynamic_shape_ernie_ser_deser SRCS trt_dynamic_shape_ernie_serialize_deserialize_test.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} ARGS --infer_model=${TEST_TRT_ERNIE_MODEL}/ernie_model_4_unserialized) + set(TEST_TRT_ERNIE_UNSER_FP16_MODEL "${TRT_MODEL_INSTALL_DIR}/ernie_test/ernie_model_4_fp16_unserialized/") + if (NOT EXISTS ${TEST_TRT_ERNIE_UNSER_FP16_MODEL}/ernie_model_4_unserialized.tgz) + inference_download_and_uncompress(${TEST_TRT_ERNIE_MODEL} ${INFERENCE_URL}/tensorrt_test "ernie_model_4_fp16_unserialized.tgz") + endif() + inference_analysis_test(test_trt_dynamic_shape_ernie_fp16_ser_deser SRCS trt_dynamic_shape_ernie_fp16_serialize_deserialize_test.cc + EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} + ARGS --infer_model=${TEST_TRT_ERNIE_MODEL}/ernie_model_4_fp16_unserialized) endif() set(LITE_MODEL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/lite") diff --git a/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_fp16_serialize_deserialize_test.cc b/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_fp16_serialize_deserialize_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..5585980c53fcb0cd4ccb96010fccb4076163829c --- /dev/null +++ b/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_fp16_serialize_deserialize_test.cc @@ -0,0 +1,32 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include +#include +#include + +#include "paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_serialize_deserialize_test.h" + +namespace paddle { +namespace inference { + +TEST(AnalysisPredictor, fp16) { + std::vector result = {0.59923654, 0.21923761, 0.18152587}; + trt_ernie(true, result); +} + +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_serialize_deserialize_test.cc b/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_serialize_deserialize_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..1c8776477658e744dc6fbf3214b3f686fa90759e --- /dev/null +++ b/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_serialize_deserialize_test.cc @@ -0,0 +1,32 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include +#include + +#include "paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_serialize_deserialize_test.h" + +namespace paddle { +namespace inference { + +TEST(AnalysisPredictor, no_fp16) { + std::vector result = {0.597841, 0.219972, 0.182187}; + trt_ernie(false, result); +} + +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_deserialize_test.cc b/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_serialize_deserialize_test.h similarity index 92% rename from paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_deserialize_test.cc rename to paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_serialize_deserialize_test.h index 3d84264319a6fa8ba4363cf31425603489207e06..30d89943df28770295bd86ecd18cebefb820205a 100644 --- a/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_deserialize_test.cc +++ b/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_serialize_deserialize_test.h @@ -8,19 +8,23 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - +#pragma once #include #include #include #include #include +#include +#include +#include +#include #include "paddle/fluid/inference/tests/api/trt_test_helper.h" namespace paddle { namespace inference { -int DeleteCache(std::string path) { +static int DeleteCache(std::string path) { DIR* dir = opendir(path.c_str()); if (dir == NULL) return 0; struct dirent* ptr; @@ -36,7 +40,7 @@ int DeleteCache(std::string path) { return 0; } -void run(const AnalysisConfig& config, std::vector* out_data) { +static void run(const AnalysisConfig& config, std::vector* out_data) { auto predictor = CreatePaddlePredictor(config); auto input_names = predictor->GetInputNames(); @@ -98,7 +102,7 @@ void run(const AnalysisConfig& config, std::vector* out_data) { output_t->copy_to_cpu(out_data->data()); } -void trt_ernie(bool with_fp16, std::vector result) { +static void trt_ernie(bool with_fp16, std::vector result) { AnalysisConfig config; std::string model_dir = FLAGS_infer_model; // Delete serialization cache to perform serialization first rather than @@ -152,15 +156,5 @@ void trt_ernie(bool with_fp16, std::vector result) { } } -TEST(AnalysisPredictor, no_fp16) { - std::vector result = {0.597841, 0.219972, 0.182187}; - trt_ernie(false, result); -} -#ifdef SUPPORTS_CUDA_FP16 -TEST(AnalysisPredictor, fp16) { - std::vector result = {0.59923654, 0.21923761, 0.18152587}; - trt_ernie(true, result); -} -#endif // SUPPORTS_CUDA_FP16 } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_test.cc b/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_test.cc index 25ad6e6105aae7eff4c0af707439c6b586f81315..6bf34484e5dffddc189fedaff20dbd6557dfcf70 100644 --- a/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_test.cc +++ b/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_test.cc @@ -83,7 +83,8 @@ void run(const AnalysisConfig& config, std::vector* out_data) { output_t->copy_to_cpu(out_data->data()); } -void trt_ernie(bool with_fp16, std::vector result) { +void trt_ernie(bool with_fp16, std::vector result, + float near_tolerance) { AnalysisConfig config; std::string model_dir = FLAGS_infer_model; SetConfig(&config, model_dir, true); @@ -126,19 +127,19 @@ void trt_ernie(bool with_fp16, std::vector result) { run(config, &out_data); for (size_t i = 0; i < out_data.size(); i++) { - EXPECT_NEAR(result[i], out_data[i], 1e-6); + EXPECT_NEAR(result[i], out_data[i], near_tolerance); } } TEST(AnalysisPredictor, no_fp16) { std::vector result = {0.597841, 0.219972, 0.182187}; - trt_ernie(false, result); + trt_ernie(false, result, 1e-5); } TEST(AnalysisPredictor, fp16) { -#ifdef SUPPORTS_CUDA_FP16 - std::vector result = {0.598336, 0.219558, 0.182106}; - trt_ernie(true, result); +#ifdef TRT_PLUGIN_FP16_AVALIABLE + std::vector result = {0.598, 0.219, 0.182}; + trt_ernie(true, result, 3e-3); #endif } diff --git a/paddle/fluid/operators/math/bert_encoder_functor.cu b/paddle/fluid/operators/math/bert_encoder_functor.cu index 4004ad401ee7f509edb00fe1424bcf69665e17a3..2373042815cd01a9eb9d729ba88b342bada20a97 100644 --- a/paddle/fluid/operators/math/bert_encoder_functor.cu +++ b/paddle/fluid/operators/math/bert_encoder_functor.cu @@ -76,6 +76,34 @@ __device__ inline void LayerNorm(const kvp &thread_data, const int ld, } } +template +__device__ inline void LayerNorm2(const kvp &thread_data, const int ld, + const int offset, const float2 *bias, + const float2 *scale, T2 *output, T eps) { + using BlockReduce = cub::BlockReduce, TPB>; + __shared__ typename 
BlockReduce::TempStorage temp_storage; + __shared__ T mu; // mean + __shared__ T rsigma; // 1 / std.dev. + + const auto sum_kv = BlockReduce(temp_storage).Reduce(thread_data, cub::Sum()); + + if (threadIdx.x == 0) { + mu = sum_kv.key; + rsigma = rsqrt(sum_kv.value - mu * mu + eps); + } + __syncthreads(); + + for (int i = threadIdx.x; i < ld; i += TPB) { + const int idx = offset + i; + T2 val = output[idx]; + const float2 g = scale[i]; + const float2 b = bias[i]; + val.x = T(g.x) * (val.x - mu) * rsigma + T(b.x); + val.y = T(g.y) * (val.y - mu) * rsigma + T(b.y); + output[idx] = val; + } +} + template __global__ void EmbEltwiseLayernormKernel(int hidden, const int64_t *ids, const float *scale, const float *bias, @@ -117,6 +145,50 @@ __global__ void EmbEltwiseLayernormKernel(int hidden, const int64_t *ids, LayerNorm(thread_data, hidden, out_offset, bias, scale, output, eps); } +template <> +__global__ void EmbEltwiseLayernormKernel( + int hidden, const int64_t *ids, const float *scale, const float *bias, + const int64_t *embs, half *output, float eps, int input_num) { +#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__) + cub::Sum pair_sum; + // blockIdx.x: position in the sequence + // blockIdx.y: batch + // gridDim.x: Seq + // gridDim.y: Batch + + extern __shared__ int64_t array_id[]; + + const half rhidden = half(1.f) / half(hidden); + const int64_t seq_pos = blockIdx.y + blockIdx.x * gridDim.y; + if (threadIdx.x == 0) { + for (int i = 0; i < input_num; ++i) { + const int64_t *ids_p = reinterpret_cast(ids[i]); + array_id[i] = ids_p[seq_pos]; + } + } + __syncthreads(); + + const int64_t out_offset = seq_pos * hidden; + + kvp thread_data(0, 0); + +#pragma unroll + for (int it = threadIdx.x; it < hidden; it += 256) { + half val = 0; + for (int i = 0; i < input_num; ++i) { + val += reinterpret_cast(embs[i])[array_id[i] * hidden + it]; + } + + output[out_offset + it] = val; + const half rhiddenval = rhidden * val; + thread_data = + pair_sum(thread_data, kvp(rhiddenval, rhiddenval * val)); + } + LayerNorm(thread_data, hidden, out_offset, bias, scale, output, + eps); +#endif +} + template void EmbEltwiseLayerNormFunctor::operator()( int batch, int seq_len, int hidden, const int64_t *ids, const float *scale, @@ -132,7 +204,8 @@ void EmbEltwiseLayerNormFunctor::operator()( template class EmbEltwiseLayerNormFunctor; -#ifdef SUPPORTS_CUDA_FP16 +// device function 'operator()' is not supportted until cuda 10.0 +#if CUDA_VERSION >= 10000 template class EmbEltwiseLayerNormFunctor; #endif @@ -144,30 +217,90 @@ __global__ void SoftmaxKernelWithEltadd(T *qk_buf_, const T *bias_qk_, int qk_offset = blockIdx.x * seq_len; assert(blockDim.x % 32 == 0); - __shared__ float s_sum, s_max; + float tmp = threadIdx.x < seq_len + ? static_cast(qk_buf_[threadIdx.x + qk_offset] + + bias_qk_[threadIdx.x + qk_offset]) + : -1e20f; + float max_val = blockReduceMax(tmp, mask); - float qk = threadIdx.x < seq_len - ? static_cast((qk_buf_[threadIdx.x + qk_offset] + - bias_qk_[threadIdx.x + qk_offset])) - : 0.0f; - float tmp = threadIdx.x < seq_len ? static_cast(qk) : -1e20f; + float qk_tmp = threadIdx.x < seq_len ? 
__expf(tmp - max_val) : 0.0f; + float sum_val = blockReduceSum(qk_tmp, mask); - float max_val = blockReduceMax(tmp, mask); + if (threadIdx.x < seq_len) + qk_buf_[threadIdx.x + qk_offset] = (T)(qk_tmp / sum_val); +} - if (threadIdx.x == 0) s_max = max_val; - __syncthreads(); +template <> +__global__ void SoftmaxKernelWithEltadd( + half *qk_buf_, const half *bias_qk_, const int batch_size, + const int head_num, const int seq_len, const unsigned mask) { +#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__) + int qk_offset = blockIdx.x * seq_len; + assert(blockDim.x % 32 == 0); + + float tmp = threadIdx.x < seq_len + ? static_cast(qk_buf_[threadIdx.x + qk_offset] + + bias_qk_[threadIdx.x + qk_offset]) + : -1e20f; + float max_val = blockReduceMax(tmp, mask); - float qk_tmp = - threadIdx.x < seq_len ? __expf(static_cast(tmp - s_max)) : 0.0f; + float qk_tmp = threadIdx.x < seq_len ? __expf(tmp - max_val) : 0.0f; float sum_val = blockReduceSum(qk_tmp, mask); - if (threadIdx.x == 0) { - s_sum = sum_val + 1e-6f; + if (threadIdx.x < seq_len) + qk_buf_[threadIdx.x + qk_offset] = (half)(qk_tmp / sum_val); +#endif +} + +template +__global__ void SoftmaxKernelWithEltadd2(T *qk_buf_, const T *bias_qk_, + const int batch_size, + const int head_num, const int seq_len, + const unsigned mask) { + int qk_offset = blockIdx.x * seq_len; + int idx = threadIdx.x; + assert(blockDim.x % 32 == 0); + + float2 tmp = + idx < seq_len + ? ToFloat2(qk_buf_[idx + qk_offset] + bias_qk_[idx + qk_offset]) + : make_float2(-1e20f, -1e20f); + float max_val = blockReduceMax(max(tmp.x, tmp.y), mask); + float2 qk_tmp = idx < seq_len ? make_float2(__expf(tmp.x - max_val), + __expf(tmp.y - max_val)) + : make_float2(0.f, 0.f); + float sum_val = blockReduceSum(qk_tmp.x + qk_tmp.y, mask) + 1e-6f; + + if (idx < seq_len) { + qk_buf_[idx + qk_offset] = + FloatsToPair(qk_tmp.x / sum_val, qk_tmp.y / sum_val); } - __syncthreads(); +} - if (threadIdx.x < seq_len) - qk_buf_[threadIdx.x + qk_offset] = (T)(qk_tmp / s_sum); +template <> +__global__ void SoftmaxKernelWithEltadd2( + half2 *qk_buf_, const half2 *bias_qk_, const int batch_size, + const int head_num, const int seq_len, const unsigned mask) { +// operator "+" of half only suppotted after cuda version 10.0 +#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__) && CUDA_VERSION >= 10000 + int qk_offset = blockIdx.x * seq_len; + int idx = threadIdx.x; + assert(blockDim.x % 32 == 0); + + float2 tmp = idx < seq_len ? ToFloat2(qk_buf_[idx + qk_offset] + + bias_qk_[idx + qk_offset]) + : make_float2(-1e20f, -1e20f); + float max_val = blockReduceMax(max(tmp.x, tmp.y), mask); + float2 qk_tmp = idx < seq_len ? make_float2(__expf(tmp.x - max_val), + __expf(tmp.y - max_val)) + : make_float2(0.f, 0.f); + float sum_val = blockReduceSum(qk_tmp.x + qk_tmp.y, mask) + 1e-6f; + + if (idx < seq_len) { + qk_buf_[idx + qk_offset] = + FloatsToPair(qk_tmp.x / sum_val, qk_tmp.y / sum_val); + } +#endif } template @@ -199,21 +332,24 @@ inline void MatMulWithHeadQK(const platform::CUDADeviceContext &context, "seq_len should <= 1024, " "but received seq_len is:%d", seq_len)); - if (seq_len <= 32) - block = 32; - else if (seq_len > 32 && seq_len <= 64) - block = 64; - else if (seq_len > 64 && seq_len <= 128) - block = 128; - else if (seq_len > 128 && seq_len <= 256) - block = 256; - else if (seq_len > 256 && seq_len <= 512) - block = 512; - else - block = 1024; - - SoftmaxKernelWithEltadd<<>>( - qk_buf_, bias_qk, batch_size, head_num, seq_len, FINAL_MASK); + if (seq_len % 2 == 0) { + block = (seq_len <= 64) ? 
32 : ((seq_len + 63) / 64) * 32; + if (std::is_same::value) { + SoftmaxKernelWithEltadd2<<>>( + reinterpret_cast(qk_buf_), + reinterpret_cast(bias_qk), batch_size, head_num, + seq_len / 2, FINAL_MASK); + } else { + SoftmaxKernelWithEltadd2<__half2><<>>( + reinterpret_cast<__half2 *>(qk_buf_), + reinterpret_cast(bias_qk), batch_size, head_num, + seq_len / 2, FINAL_MASK); + } + } else { + block = (seq_len <= 32) ? 32 : ((seq_len + 31) / 32) * 32; + SoftmaxKernelWithEltadd<<>>( + qk_buf_, bias_qk, batch_size, head_num, seq_len, FINAL_MASK); + } } template @@ -261,7 +397,8 @@ void MultiHeadGPUComputeFunctor::operator()( template class MultiHeadGPUComputeFunctor; -#ifdef SUPPORTS_CUDA_FP16 +// device function 'operator()' is not supportted until cuda 10.0 +#if CUDA_VERSION >= 10000 template class MultiHeadGPUComputeFunctor; #endif @@ -285,6 +422,69 @@ __global__ void SkipLayerNormSmallKernel(int num, int hidden, const T *input1, eps); } +template <> +__global__ void SkipLayerNormSmallKernel( + int num, int hidden, const half *input1, const half *input2, half *output, + const float *scale, const float *bias, float eps) { +#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__) + const half rld = half(1) / half(hidden); + const int offset = blockIdx.x * hidden; + cub::Sum pair_sum; + kvp thread_data(0, 0); + const int idx = offset + threadIdx.x; + half val = 0; + if (threadIdx.x < hidden) { + val = input1[idx] + input2[idx]; + const half rldval = rld * val; + thread_data = pair_sum(thread_data, kvp(rldval, rldval * val)); + } + LayerNormSmall(val, thread_data, hidden, idx, bias, scale, output, + eps); +#endif +} + +template <> +__global__ void SkipLayerNormSmallKernel( + int num, int hidden, const half *input1, const half *input2, half *output, + const float *scale, const float *bias, float eps) { +#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__) + const half rld = half(1) / half(hidden); + const int offset = blockIdx.x * hidden; + cub::Sum pair_sum; + kvp thread_data(0, 0); + const int idx = offset + threadIdx.x; + half val = 0; + if (threadIdx.x < hidden) { + val = input1[idx] + input2[idx]; + const half rldval = rld * val; + thread_data = pair_sum(thread_data, kvp(rldval, rldval * val)); + } + LayerNormSmall(val, thread_data, hidden, idx, bias, scale, output, + eps); +#endif +} + +template <> +__global__ void SkipLayerNormSmallKernel( + int num, int hidden, const half *input1, const half *input2, half *output, + const float *scale, const float *bias, float eps) { +#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__) + const half rld = half(1) / half(hidden); + const int offset = blockIdx.x * hidden; + cub::Sum pair_sum; + kvp thread_data(0, 0); + const int idx = offset + threadIdx.x; + half val = 0; + if (threadIdx.x < hidden) { + val = input1[idx] + input2[idx]; + const half rldval = rld * val; + thread_data = pair_sum(thread_data, kvp(rldval, rldval * val)); + } + LayerNormSmall(val, thread_data, hidden, idx, bias, scale, output, + eps); +#endif +} + template __global__ void SkipLayerNormKernel(int num, int hidden, const T *input1, const T *input2, T *output, @@ -305,6 +505,74 @@ __global__ void SkipLayerNormKernel(int num, int hidden, const T *input1, LayerNorm(thread_data, hidden, offset, bias, scale, output, eps); } +template <> +__global__ void SkipLayerNormKernel(int num, int hidden, + const half *input1, + const half *input2, half *output, + const float *scale, + const float *bias, float eps) { +#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__) + const half rld = half(1) / half(hidden); + const int offset = 
blockIdx.x * hidden; + cub::Sum pair_sum; + kvp thread_data(0, 0); + + for (int it = threadIdx.x; it < hidden; it += 256) { + const int idx = offset + it; + const half val = input1[idx] + input2[idx]; + const half rldval = rld * val; + thread_data = pair_sum(thread_data, kvp(rldval, rldval * val)); + output[idx] = val; + } + LayerNorm(thread_data, hidden, offset, bias, scale, output, eps); +#endif +} + +template +__global__ void SkipLayerNormKernel2(int num, int hidden, const T2 *input1, + const T2 *input2, T2 *output, + const float2 *scale, const float2 *bias, + float eps) { + const T rld = T(0.5f / hidden); // because hidden is hidden/2 + const int offset = blockIdx.x * hidden; + cub::Sum pair_sum; + kvp thread_data(0, 0); + + for (int it = threadIdx.x; it < hidden; it += TPB) { + const int idx = offset + it; + const T2 val2 = input1[idx] + input2[idx]; + thread_data = pair_sum( + thread_data, kvp(rld * (val2.x + val2.y), + rld * val2.x * val2.x + rld * val2.y * val2.y)); + output[idx] = val2; + } + LayerNorm2(thread_data, hidden, offset, bias, scale, output, eps); +} + +template <> +__global__ void SkipLayerNormKernel2( + int num, int hidden, const half2 *input1, const half2 *input2, + half2 *output, const float2 *scale, const float2 *bias, float eps) { +// operator "+" of half only suppotted after cuda version 10.0 +#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__) && CUDA_VERSION >= 10000 + const half rld = half(0.5f / hidden); // because hidden is hidden/2 + const int offset = blockIdx.x * hidden; + cub::Sum pair_sum; + kvp thread_data(0, 0); + + for (int it = threadIdx.x; it < hidden; it += 256) { + const int idx = offset + it; + const half2 val2 = input1[idx] + input2[idx]; + thread_data = pair_sum( + thread_data, kvp(rld * (val2.x + val2.y), + rld * val2.x * val2.x + rld * val2.y * val2.y)); + output[idx] = val2; + } + LayerNorm2(thread_data, hidden, offset, bias, scale, output, + eps); +#endif +} + template void SkipLayerNormFunctor::operator()(const int num, const int hidden, const T *input1, const T *input2, @@ -326,14 +594,38 @@ void SkipLayerNormFunctor::operator()(const int num, const int hidden, num, hidden, input1, input2, output, scale, bias, eps); } else { const int threads = 256; - SkipLayerNormKernel<<>>( - num, hidden, input1, input2, output, scale, bias, eps); + if (hidden % 2 == 0) { + if (std::is_same::value) { + SkipLayerNormKernel2<<>>( + num, hidden / 2, reinterpret_cast(input1), + reinterpret_cast(input2), + reinterpret_cast(output), + reinterpret_cast(scale), + reinterpret_cast(bias), eps); + } else if (std::is_same::value) { + SkipLayerNormKernel2<__half, __half2, + threads><<>>( + num, hidden / 2, reinterpret_cast(input1), + reinterpret_cast(input2), + reinterpret_cast<__half2 *>(output), + reinterpret_cast(scale), + reinterpret_cast(bias), eps); + } else { + assert(false); + // should not be here + } + } else { + SkipLayerNormKernel<<>>( + num, hidden, input1, input2, output, scale, bias, eps); + } } } template class SkipLayerNormFunctor; -#ifdef SUPPORTS_CUDA_FP16 +// device function 'operator()' is not supportted until cuda 10.0 +#if CUDA_VERSION >= 10000 template class SkipLayerNormFunctor; #endif diff --git a/paddle/fluid/operators/math/bert_encoder_functor.h b/paddle/fluid/operators/math/bert_encoder_functor.h index dd8d171208566818f52520a0f0d38315e9bb66e4..fdbddd96a57d2ca47fd74af9fbb062a44c99512a 100644 --- a/paddle/fluid/operators/math/bert_encoder_functor.h +++ b/paddle/fluid/operators/math/bert_encoder_functor.h @@ -26,12 +26,10 @@ namespace math { 
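The SkipLayerNormKernel2/LayerNorm2 path above walks the hidden dimension two values at a time (float2 or __half2), so the kernel's hidden argument is already hidden/2 and the per-element weight becomes rld = 0.5f / hidden. The plain C++ sketch below (CPU only, hypothetical variable names, not the kernel itself) checks that this paired accumulation reproduces the same mean and second moment as the scalar per-element path, which is why the vectorized and scalar code can share the same epsilon and scale/bias handling.

// CPU-side sketch: paired accumulation of layer-norm statistics matches the
// scalar path. `half_hidden` plays the role of the kernel's `hidden`
// argument, which is already hidden / 2.
#include <cassert>
#include <cmath>
#include <vector>

int main() {
  const int hidden = 8;                // original hidden size (must be even)
  const int half_hidden = hidden / 2;  // what the paired kernel sees
  std::vector<float> x = {0.5f, -1.f, 2.f, 3.f, -0.25f, 1.5f, 0.f, 4.f};

  // Scalar path: mean and E[x^2] accumulated with weight 1 / hidden.
  float mean = 0.f, sq = 0.f;
  for (float v : x) {
    mean += v / hidden;
    sq += v * v / hidden;
  }

  // Paired path: weight 0.5 / half_hidden per element of each pair, which is
  // the same value the kernel computes as rld = T(0.5f / hidden).
  const float rld = 0.5f / half_hidden;
  float mean2 = 0.f, sq2 = 0.f;
  for (int i = 0; i < half_hidden; ++i) {
    const float a = x[2 * i], b = x[2 * i + 1];
    mean2 += rld * (a + b);
    sq2 += rld * a * a + rld * b * b;
  }

  assert(std::fabs(mean - mean2) < 1e-6f);
  assert(std::fabs(sq - sq2) < 1e-6f);
  // Both paths then form rsigma = 1 / sqrt(E[x^2] - mean^2 + eps) as in
  // LayerNorm / LayerNorm2 (eps value here is only illustrative).
  const float rsigma = 1.f / std::sqrt(sq - mean * mean + 1e-5f);
  (void)rsigma;
  return 0;
}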
template struct CUDATypeTraits; -#ifdef SUPPORTS_CUDA_FP16 template <> struct CUDATypeTraits { typedef platform::float16 TYPE; }; -#endif template <> struct CUDATypeTraits { diff --git a/paddle/fluid/operators/math/math_cuda_utils.h b/paddle/fluid/operators/math/math_cuda_utils.h index 17175fa7299d40938509b9243b5759278991e821..65961f33aa4f9d274363a8ca075092059ad6dd8c 100644 --- a/paddle/fluid/operators/math/math_cuda_utils.h +++ b/paddle/fluid/operators/math/math_cuda_utils.h @@ -26,9 +26,15 @@ __device__ __forceinline__ T FromFloat(float a); template __device__ __forceinline__ float ToFloat(T a); +template +__device__ __forceinline__ float2 ToFloat2(T a); + template __device__ __forceinline__ T exp_func(T a); +template +__device__ __forceinline__ T FloatsToPair(const float a, const float b); + template struct KeyValuePair; @@ -41,12 +47,10 @@ __device__ __forceinline__ float FromFloat(float a) { return a; } -#ifdef SUPPORTS_CUDA_FP16 template <> __device__ __forceinline__ half FromFloat(float a) { return __float2half(a); } -#endif // to_float template <> @@ -54,28 +58,50 @@ __device__ __forceinline__ float ToFloat(float a) { return a; } -#ifdef SUPPORTS_CUDA_FP16 +template <> +__device__ __forceinline__ float2 ToFloat2(float2 a) { + return a; +} + +template <> +__device__ __forceinline__ float2 FloatsToPair(const float a, + const float b) { + return make_float2(a, b); +} + +__inline__ __device__ float2 operator+(const float2 &a, const float2 &b) { + return make_float2(a.x + b.x, a.y + b.y); +} + template <> __device__ __forceinline__ float ToFloat(half a) { return __half2float(a); } -#endif + +template <> +__device__ __forceinline__ float2 ToFloat2<__half2>(__half2 a) { + return __half22float2(a); +} + +template <> +__device__ __forceinline__ __half2 FloatsToPair<__half2>(const float a, + const float b) { + return __floats2half2_rn(a, b); +} template <> __device__ __forceinline__ float exp_func(float a) { return expf(a); } -#ifdef SUPPORTS_CUDA_FP16 template <> __device__ __forceinline__ half exp_func(half a) { -#if __CUDA_ARCH__ >= 600 +#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__) return hexp(a); #else return FromFloat(expf(ToFloat(a))); #endif } -#endif template <> struct KeyValuePair { @@ -97,7 +123,6 @@ struct KeyValuePair { } }; -#ifdef SUPPORTS_CUDA_FP16 template <> struct KeyValuePair { __device__ __forceinline__ KeyValuePair() {} @@ -112,11 +137,20 @@ struct KeyValuePair { operator+(const KeyValuePair &a) const { const half2 a2 = __halves2half2(key, value); const half2 b2 = __halves2half2(a.key, a.value); +#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__) const half2 res = __hadd2(a2, b2); +#else + float a2_1 = __low2float(a2); + float a2_2 = __high2float(a2); + float b2_1 = __low2float(b2); + float b2_2 = __high2float(b2); + float r1 = a2_1 + b2_1; + float r2 = a2_2 + b2_2; + const half2 res = __floats2half2_rn(r1, r2); +#endif return KeyValuePair(res.x, res.y); } }; -#endif #define FINAL_MASK 0xffffffff #define HALF_WARP 16 @@ -148,7 +182,7 @@ __inline__ __device__ T blockReduceSum(T val, unsigned mask) { // align block_span to warpSize int block_span = (blockDim.x + warpSize - 1) >> 5; - val = (threadIdx.x < block_span) ? shared[lane] : static_cast(0.0f); + val = (lane < block_span) ? shared[lane] : static_cast(0.0f); val = warpReduceSum(val, mask); return val; @@ -180,7 +214,7 @@ __inline__ __device__ T blockReduceMax(T val, unsigned mask) { // align block_span to warpSize int block_span = (blockDim.x + warpSize - 1) >> 5; - val = (threadIdx.x < block_span) ? 
shared[lane] : -1e10f; + val = (lane < block_span) ? shared[lane] : -1e10f; val = warpReduceMax(val, mask); return val; diff --git a/paddle/fluid/platform/float16.h b/paddle/fluid/platform/float16.h index 496eb78f20ef7bd25db07f68bb15202b2f7f2972..b70a206b7dee62b6c1f37e166eb5e475c118430a 100644 --- a/paddle/fluid/platform/float16.h +++ b/paddle/fluid/platform/float16.h @@ -44,6 +44,8 @@ limitations under the License. */ #define PADDLE_ALIGN(x) __declspec(align(x)) #endif +#define CUDA_ARCH_FP16_SUPPORTED(CUDA_ARCH) (CUDA_ARCH >= 600) + namespace paddle { namespace platform { diff --git a/python/paddle/fluid/tests/unittests/ir/inference/inference_pass_test.py b/python/paddle/fluid/tests/unittests/ir/inference/inference_pass_test.py index c7fd7995118dbfdadc0a11934c7f171b1880a732..733ab358de05ce5a85756a303bb33fa15c700566 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/inference_pass_test.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/inference_pass_test.py @@ -40,6 +40,7 @@ class InferencePassTest(unittest.TestCase): self.enable_mkldnn = False self.enable_trt = False self.trt_parameters = None + self.dynamic_shape_params = None self.enable_lite = False self.lite_parameters = None self.path = "./inference_pass/" + self.__class__.__name__ + "/" @@ -114,6 +115,14 @@ class InferencePassTest(unittest.TestCase): self.trt_parameters.precision, self.trt_parameters.use_static, self.trt_parameters.use_calib_mode) + + if self.dynamic_shape_params: + config.set_trt_dynamic_shape_info( + self.dynamic_shape_params.min_input_shape, + self.dynamic_shape_params.max_input_shape, + self.dynamic_shape_params.optim_input_shape, + self.dynamic_shape_params.disable_trt_plugin_fp16) + elif use_mkldnn: config.enable_mkldnn() @@ -131,7 +140,7 @@ class InferencePassTest(unittest.TestCase): for place_ in use_gpu: self.check_output_with_option(place_, atol) - def check_output_with_option(self, use_gpu, atol=1e-5): + def check_output_with_option(self, use_gpu, atol=1e-5, flatten=False): ''' Check whether calculating on CPU and GPU, enable TensorRT or disable TensorRT, enable MKLDNN or disable MKLDNN @@ -154,6 +163,9 @@ class InferencePassTest(unittest.TestCase): for out, analysis_output in zip(outs, analysis_outputs): out = np.array(out) + if flatten: + out = out.flatten() + analysis_output = analysis_output.flatten() self.assertTrue( np.allclose( @@ -167,12 +179,21 @@ class InferencePassTest(unittest.TestCase): self._get_analysis_config( use_gpu=use_gpu, use_trt=self.enable_trt)) + if self.trt_parameters.use_static: + #deserialize + tensorrt_outputs = self._get_analysis_outputs( + self._get_analysis_config( + use_gpu=use_gpu, use_trt=self.enable_trt)) + self.assertTrue( len(tensorrt_outputs) == len(outs), "The number of outputs is different between GPU and TensorRT. ") for out, tensorrt_output in zip(outs, tensorrt_outputs): out = np.array(out) + if flatten: + out = out.flatten() + tensorrt_output = tensorrt_output.flatten() self.assertTrue( np.allclose( @@ -209,6 +230,18 @@ class InferencePassTest(unittest.TestCase): self.use_static = use_static self.use_calib_mode = use_calib_mode + class DynamicShapeParam: + ''' + Prepare TensorRT subgraph engine dynamic shape parameters. 
+ ''' + + def __init__(self, min_input_shape, max_input_shape, optim_input_shape, + disable_trt_plugin_fp16): + self.min_input_shape = min_input_shape + self.max_input_shape = max_input_shape + self.optim_input_shape = optim_input_shape + self.disable_trt_plugin_fp16 = disable_trt_plugin_fp16 + class LiteParam: ''' Prepare Lite subgraph engine parameters. diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_slice_plugin.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_slice_plugin.py index 660a9c93e66715f41e4a972ff571c0c00f31316f..d9817c6fe1825d19bbbf8ab58df180dd382c81cf 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_slice_plugin.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_slice_plugin.py @@ -23,44 +23,25 @@ from paddle.fluid.core import AnalysisConfig #normal starts && ends -class SlicePluginTRTTest1(InferencePassTest): - def setUp(self): - with fluid.program_guard(self.main_program, self.startup_program): - data = fluid.data(name="data", shape=[3, 3, 3, 3], dtype="float32") - axes = [1, 3] - starts = [0, 1] - ends = [2, 3] - slice_out = fluid.layers.slice( - data, axes=axes, starts=starts, ends=ends) - out = fluid.layers.batch_norm(slice_out, is_test=True) - - self.feeds = { - "data": np.random.random((3, 3, 3, 3)).astype("float32"), - } - # Diff occurred between GPU and TRT. - # In order to provide TRT CI ASAP, this test for trt part - # is disabled temporarily. - self.enable_trt = True - self.trt_parameters = SlicePluginTRTTest1.TensorRTParam( +class SlicePluginTRTTest(InferencePassTest): + def setUpSliceParams(self): + self.params_axes = [1, 3] + self.params_starts = [0, 1] + self.params_ends = [2, 3] + + def setUpTensorRTParams(self): + self.trt_parameters = SlicePluginTRTTest.TensorRTParam( 1 << 30, 32, 1, AnalysisConfig.Precision.Float32, False, False) - self.fetch_list = [out] - - def test_check_output(self): - use_gpu = [False] - if core.is_compiled_with_cuda(): - use_gpu.append(True) - for i in range(len(use_gpu)): - self.check_output_with_option(use_gpu[i]) - + self.enable_trt = True -#negative starts && ends -class SlicePluginTRTTest2(InferencePassTest): def setUp(self): + self.setUpSliceParams() + self.setUpTensorRTParams() with fluid.program_guard(self.main_program, self.startup_program): data = fluid.data(name="data", shape=[3, 3, 3, 3], dtype="float32") - axes = [2, 3] - starts = [-3, -2] - ends = [-1, 3] + axes = self.params_axes + starts = self.params_starts + ends = self.params_ends slice_out = fluid.layers.slice( data, axes=axes, starts=starts, ends=ends) out = fluid.layers.batch_norm(slice_out, is_test=True) @@ -68,12 +49,6 @@ class SlicePluginTRTTest2(InferencePassTest): self.feeds = { "data": np.random.random((3, 3, 3, 3)).astype("float32"), } - # Diff occurred between GPU and TRT. - # In order to provide TRT CI ASAP, this test for trt part - # is disabled temporarily. 
- self.enable_trt = True - self.trt_parameters = SlicePluginTRTTest2.TensorRTParam( - 1 << 30, 32, 1, AnalysisConfig.Precision.Float32, False, False) self.fetch_list = [out] def test_check_output(self): @@ -84,66 +59,28 @@ class SlicePluginTRTTest2(InferencePassTest): self.check_output_with_option(use_gpu[i]) -#exceeded bound starts && ends -class SlicePluginTRTTest3(InferencePassTest): - def setUp(self): - with fluid.program_guard(self.main_program, self.startup_program): - data = fluid.data(name="data", shape=[3, 3, 3, 3], dtype="float32") - axes = [2, 3] - starts = [-5, -2] - ends = [-1, 8] - slice_out = fluid.layers.slice( - data, axes=axes, starts=starts, ends=ends) - out = fluid.layers.batch_norm(slice_out, is_test=True) +#negative starts && ends +class SlicePluginTRTTestNegativeStartsAndEnds(SlicePluginTRTTest): + def setUpSliceParams(self): + self.params_axes = [2, 3] + self.params_starts = [-3, -2] + self.params_ends = [-1, 3] - self.feeds = { - "data": np.random.random((3, 3, 3, 3)).astype("float32"), - } - # Diff occurred between GPU and TRT. - # In order to provide TRT CI ASAP, this test for trt part - # is disabled temporarily. - self.enable_trt = True - self.trt_parameters = SlicePluginTRTTest3.TensorRTParam( - 1 << 30, 32, 1, AnalysisConfig.Precision.Float32, False, False) - self.fetch_list = [out] - def test_check_output(self): - use_gpu = [False] - if core.is_compiled_with_cuda(): - use_gpu.append(True) - for i in range(len(use_gpu)): - self.check_output_with_option(use_gpu[i]) +#exceeded bound starts && ends +class SlicePluginTRTTestStartsAndEndsBoundCheck(SlicePluginTRTTest): + def setUpSliceParams(self): + self.params_axes = [2, 3] + self.params_starts = [-5, -2] + self.params_ends = [-1, 8] #fp16 -class SlicePluginTRTTest4(InferencePassTest): - def setUp(self): - with fluid.program_guard(self.main_program, self.startup_program): - data = fluid.data(name="data", shape=[3, 3, 3, 3], dtype="float32") - axes = [2, 3] - starts = [-5, -2] - ends = [-1, 8] - slice_out = fluid.layers.slice( - data, axes=axes, starts=starts, ends=ends) - out = fluid.layers.batch_norm(slice_out, is_test=True) - - self.feeds = { - "data": np.random.random((3, 3, 3, 3)).astype("float32"), - } - # Diff occurred between GPU and TRT. - # In order to provide TRT CI ASAP, this test for trt part - # is disabled temporarily. - self.enable_trt = True - self.trt_parameters = SlicePluginTRTTest3.TensorRTParam( +class SlicePluginTRTTestFp16(SlicePluginTRTTest): + def setUpTensorRTParams(self): + self.trt_parameters = SlicePluginTRTTest.TensorRTParam( 1 << 30, 32, 1, AnalysisConfig.Precision.Half, False, False) - self.fetch_list = [out] - - def test_check_output(self): - use_gpu = [False] - if core.is_compiled_with_cuda(): - use_gpu.append(True) - for i in range(len(use_gpu)): - self.check_output_with_option(use_gpu[i]) + self.enable_trt = True if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_subgraph_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_subgraph_pass.py new file mode 100644 index 0000000000000000000000000000000000000000..17c92a24eb066af13d51da659dcd8c44478e8f56 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_subgraph_pass.py @@ -0,0 +1,679 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import shutil +import unittest +import numpy as np +from inference_pass_test import InferencePassTest +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid.core import AnalysisConfig + + +class TensorRTSubgraphPassConvTest(InferencePassTest): + def setUp(self): + self.set_params() + with fluid.program_guard(self.main_program, self.startup_program): + data = fluid.data( + name="data", shape=[-1, 6, 64, 64], dtype="float32") + conv_out = fluid.layers.conv2d( + input=data, + num_filters=self.conv_num_filters, + filter_size=self.conv_filter_size, + groups=self.conv_groups, + padding=self.conv_padding, + bias_attr=False, + act=None) + self.feeds = { + "data": np.random.random([1, 6, 64, 64]).astype("float32"), + } + self.enable_trt = True + self.trt_parameters = TensorRTSubgraphPassConvTest.TensorRTParam( + 1 << 30, 32, 0, AnalysisConfig.Precision.Float32, False, False) + self.fetch_list = [conv_out] + + def set_params(self): + self.conv_num_filters = 6 + self.conv_filter_size = 6 + self.conv_groups = 3 + self.conv_padding = [1, 1] + + def test_check_output(self): + if core.is_compiled_with_cuda(): + use_gpu = True + self.check_output_with_option(use_gpu) + + +class TensorRTSubgraphPassConvValidPaddingTest(TensorRTSubgraphPassConvTest): + def set_params(self): + self.conv_num_filters = 6 + self.conv_filter_size = 6 + self.conv_groups = 3 + self.conv_padding = 'VALID' + + +''' +# conv2d padded in 'SAME' mode is not yet supported in TRT, reopen this when support is complete. 
+class TensorRTSubgraphPassConvSamePaddingTest(InferencePassTest): + def set_params(self): + self.conv_num_filters = 6 + self.conv_filter_size = 6 + self.conv_groups = 3 + self.conv_padding = 'SAME' +''' + + +class TensorRTSubgraphPassDepthwiseConvTest(TensorRTSubgraphPassConvTest): + def set_params(self): + self.conv_num_filters = 6 + self.conv_filter_size = 6 + self.conv_groups = 6 + self.conv_padding = [1, 1] + + +class TensorRTSubgraphPassConvTransposeTest(InferencePassTest): + def setUp(self): + self.set_params() + with fluid.program_guard(self.main_program, self.startup_program): + data = fluid.data( + name="data", shape=[-1, 6, 64, 64], dtype="float32") + conv_out = fluid.layers.conv2d_transpose( + input=data, + num_filters=self.conv_num_filters, + filter_size=self.conv_filter_size, + groups=self.conv_groups, + padding=self.conv_padding, + bias_attr=False, + act=None) + self.feeds = { + "data": np.random.random([1, 6, 64, 64]).astype("float32"), + } + self.enable_trt = True + self.trt_parameters = TensorRTSubgraphPassConvTransposeTest.TensorRTParam( + 1 << 30, 32, 0, AnalysisConfig.Precision.Float32, False, False) + self.fetch_list = [conv_out] + + def set_params(self): + self.conv_num_filters = 6 + self.conv_filter_size = 6 + self.conv_groups = 1 + self.conv_padding = [1, 1] + + def test_check_output(self): + if core.is_compiled_with_cuda(): + use_gpu = True + self.check_output_with_option(use_gpu) + + +class TensorRTSubgraphPassConvTransposeValidPaddingTest( + TensorRTSubgraphPassConvTransposeTest): + def set_params(self): + self.conv_num_filters = 6 + self.conv_filter_size = 6 + self.conv_groups = 1 + self.conv_padding = 'VALID' + + +''' +# conv2d_transpose padded in 'SAME' mode is not yet supported in TRT, reopen this when support is complete. +class TensorRTSubgraphPassConvTransposeSamePaddingTest(TensorRTSubgraphPassConvTransposeTest): + def set_params(self): + self.conv_num_filters = 6 + self.conv_filter_size = 6 + self.conv_groups = 1 + self.conv_padding = 'SAME' +''' + + +class TensorRTSubgraphPassDepthwiseConvTransposeTest( + TensorRTSubgraphPassConvTransposeTest): + def set_params(self): + self.conv_num_filters = 6 + self.conv_filter_size = 6 + self.conv_groups = 1 + self.conv_padding = [1, 1] + + +class TensorRTSubgraphPassFcTest(InferencePassTest): + def setUp(self): + with fluid.program_guard(self.main_program, self.startup_program): + data = fluid.data( + name="data", shape=[-1, 6, 64, 64], dtype="float32") + fc_out = fluid.layers.fc(input=[data], act=None, size=1000) + reshape_out = fluid.layers.reshape(x=fc_out, shape=[1, 1000]) + self.feeds = { + "data": np.random.random([1, 6, 64, 64]).astype("float32"), + } + self.enable_trt = True + self.trt_parameters = TensorRTSubgraphPassFcTest.TensorRTParam( + 1 << 30, 32, 0, AnalysisConfig.Precision.Float32, False, False) + self.fetch_list = [reshape_out] + + def test_check_output(self): + if core.is_compiled_with_cuda(): + use_gpu = True + # TRT output shape of fc is (1, 1000, 1, 1). To compare the output value only, flatten the results. 
+ self.check_output_with_option(use_gpu, flatten=True) + + +class TensorRTSubgraphPassPoolTest(InferencePassTest): + def setUp(self): + self.set_params() + with fluid.program_guard(self.main_program, self.startup_program): + data = fluid.data( + name="data", shape=[-1, 6, 64, 64], dtype="float32") + pool_out = fluid.layers.pool2d( + input=data, + pool_size=self.pool_size, + pool_type=self.pool_type, + pool_stride=self.pool_stride, + pool_padding=self.pool_padding, + global_pooling=self.global_pooling, + ceil_mode=self.ceil_mode, + exclusive=self.exclusive) + out = fluid.layers.batch_norm(pool_out, is_test=True) + self.feeds = { + "data": np.random.random([1, 6, 64, 64]).astype("float32"), + } + self.enable_trt = True + self.trt_parameters = TensorRTSubgraphPassPoolTest.TensorRTParam( + 1 << 30, 32, 0, AnalysisConfig.Precision.Float32, False, False) + self.fetch_list = [out] + + def set_params(self): + self.pool_size = 2 + self.pool_type = 'max' + self.pool_stride = 1 + self.pool_padding = 0 + self.global_pooling = False + self.ceil_mode = False + self.exclusive = False + + def test_check_output(self): + if core.is_compiled_with_cuda(): + use_gpu = True + self.check_output_with_option(use_gpu) + + +class TensorRTSubgraphPassAvgPoolTest(TensorRTSubgraphPassPoolTest): + def set_params(self): + self.pool_size = 2 + self.pool_type = 'avg' + self.pool_stride = 1 + self.pool_padding = 0 + self.global_pooling = False + self.ceil_mode = False + self.exclusive = False + + +class TensorRTSubgraphPassGlobalPoolTest(TensorRTSubgraphPassPoolTest): + def set_params(self): + self.pool_size = 2 + self.pool_type = 'max' + self.pool_stride = 1 + self.pool_padding = 0 + self.global_pooling = True + self.ceil_mode = False + self.exclusive = False + + +class TensorRTSubgraphPassCeilPoolTest(TensorRTSubgraphPassPoolTest): + def set_params(self): + self.pool_size = 2 + self.pool_type = 'max' + self.pool_stride = 1 + self.pool_padding = 0 + self.global_pooling = False + self.ceil_mode = True + self.exclusive = False + + +class TensorRTSubgraphPassExclusivePoolTest(TensorRTSubgraphPassPoolTest): + def set_params(self): + self.pool_size = 2 + self.pool_type = 'max' + self.pool_stride = 1 + self.pool_padding = 0 + self.global_pooling = False + self.ceil_mode = False + self.exclusive = True + + +class TensorRTSubgraphPassSamePaddingPoolTest(InferencePassTest): + def set_params(self): + self.pool_size = 2 + self.pool_type = 'max' + self.pool_stride = 1 + self.pool_padding = 'SAME' + self.global_pooling = False + self.ceil_mode = False + self.exclusive = False + + +class TensorRTSubgraphPassValidPaddingPoolTest(InferencePassTest): + def set_params(self): + self.pool_size = 2 + self.pool_type = 'max' + self.pool_stride = 1 + self.pool_padding = 'VALID' + self.global_pooling = False + self.ceil_mode = False + self.exclusive = False + + +class TensorRTSubgraphPassActivationTest(InferencePassTest): + def setUpTensorRTParam(self): + self.enable_trt = True + self.trt_parameters = TensorRTSubgraphPassActivationTest.TensorRTParam( + 1 << 30, 32, 0, AnalysisConfig.Precision.Float32, False, False) + + def setUp(self): + self.setUpTensorRTParam() + with fluid.program_guard(self.main_program, self.startup_program): + data = fluid.data( + name="data", shape=[-1, 6, 64, 64], dtype="float32") + act_out = self.append_act(data) + out = fluid.layers.batch_norm(act_out, is_test=True) + self.feeds = { + "data": np.random.random([1, 6, 64, 64]).astype("float32"), + } + self.fetch_list = [out] + + def append_act(self, x): + return 
fluid.layers.relu(x)
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            use_gpu = True
+            if os.path.exists(self.path + "_opt_cache"):
+                shutil.rmtree(self.path + "_opt_cache")
+            if self.trt_parameters.precision == AnalysisConfig.Precision.Float32:
+                self.check_output_with_option(use_gpu)
+            else:
+                self.check_output_with_option(use_gpu, 1e-3)
+
+
+class TensorRTSubgraphPassLeakyReluTest(TensorRTSubgraphPassActivationTest):
+    def append_act(self, x):
+        return fluid.layers.leaky_relu(x)
+
+
+class TensorRTSubgraphPassRelu6Test(TensorRTSubgraphPassActivationTest):
+    def append_act(self, x):
+        return fluid.layers.relu6(x)
+
+
+class TensorRTSubgraphPassSoftMaxTest(TensorRTSubgraphPassActivationTest):
+    def append_act(self, x):
+        return fluid.layers.softmax(x)
+
+
+class TensorRTSubgraphPassSigmoidTest(TensorRTSubgraphPassActivationTest):
+    def append_act(self, x):
+        return fluid.layers.sigmoid(x)
+
+
+class TensorRTSubgraphPassHardSwishTest(TensorRTSubgraphPassActivationTest):
+    def append_act(self, x):
+        return fluid.layers.hard_swish(x)
+
+
+class TensorRTSubgraphPassHardSigmoidTest(TensorRTSubgraphPassActivationTest):
+    def append_act(self, x):
+        return fluid.layers.hard_sigmoid(x)
+
+
+class TensorRTSubgraphPassTanhTest(TensorRTSubgraphPassActivationTest):
+    def append_act(self, x):
+        return fluid.layers.tanh(x)
+
+
+class TensorRTSubgraphPassSwishTest(TensorRTSubgraphPassActivationTest):
+    def setUpTensorRTParam(self):
+        self.enable_trt = True
+        self.trt_parameters = TensorRTSubgraphPassActivationTest.TensorRTParam(
+            1 << 30, 32, 0, AnalysisConfig.Precision.Float32, True, False)
+
+    def append_act(self, x):
+        return fluid.layers.swish(x)
+
+
+class TensorRTSubgraphPassSwishFp16SerializeTest(
+        TensorRTSubgraphPassActivationTest):
+    def setUpTensorRTParam(self):
+        self.enable_trt = True
+        self.trt_parameters = TensorRTSubgraphPassActivationTest.TensorRTParam(
+            1 << 30, 32, 0, AnalysisConfig.Precision.Half, True, False)
+
+    def append_act(self, x):
+        return fluid.layers.swish(x)
+
+
+class TensorRTSubgraphPassDynamicSwishFp16SerializeTest(
+        TensorRTSubgraphPassActivationTest):
+    def setUpTensorRTParam(self):
+        self.enable_trt = True
+        self.trt_parameters = TensorRTSubgraphPassActivationTest.TensorRTParam(
+            1 << 30, 32, 0, AnalysisConfig.Precision.Half, True, False)
+        self.dynamic_shape_params = TensorRTSubgraphPassActivationTest.DynamicShapeParam(
+            {
+                'data': [1, 6, 8, 8]
+            }, {'data': [1, 6, 512, 512]}, {'data': [1, 6, 256, 256]}, False)
+
+    def append_act(self, x):
+        return fluid.layers.swish(x)
+
+
+class TensorRTSubgraphPassPreluAllTest(TensorRTSubgraphPassActivationTest):
+    def append_act(self, x):
+        return fluid.layers.prelu(x, mode='all')
+
+
+class TensorRTSubgraphPassPreluChannelTest(TensorRTSubgraphPassActivationTest):
+    def append_act(self, x):
+        return fluid.layers.prelu(x, mode='channel')
+
+
+class TensorRTSubgraphPassPreluElementTest(TensorRTSubgraphPassActivationTest):
+    def append_act(self, x):
+        return fluid.layers.prelu(x, mode='element')
+
+
+class TensorRTSubgraphPassGeluTest(TensorRTSubgraphPassActivationTest):
+    def append_act(self, x):
+        return fluid.layers.gelu(x)
+
+
+class TensorRTSubgraphPassGeluDynamicTest(TensorRTSubgraphPassActivationTest):
+    def setUpTensorRTParam(self):
+        self.enable_trt = True
+        self.trt_parameters = TensorRTSubgraphPassActivationTest.TensorRTParam(
+            1 << 30, 32, 0, AnalysisConfig.Precision.Float32, False, False)
+        self.dynamic_shape_params = TensorRTSubgraphPassActivationTest.DynamicShapeParam(
+            {
+                'data': [1, 6, 8, 8]
+            }, {'data': [1, 6, 512, 512]}, {'data': [1, 6, 256, 256]}, False)
+
+    def append_act(self, x):
+        return fluid.layers.gelu(x)
+
+
+class TensorRTSubgraphPassGeluFp16Test(TensorRTSubgraphPassActivationTest):
+    def setUpTensorRTParam(self):
+        self.enable_trt = True
+        self.trt_parameters = TensorRTSubgraphPassActivationTest.TensorRTParam(
+            1 << 30, 32, 0, AnalysisConfig.Precision.Half, False, False)
+
+    def append_act(self, x):
+        return fluid.layers.gelu(x)
+
+
+class TensorRTSubgraphPassGeluFp16SerializeTest(
+        TensorRTSubgraphPassActivationTest):
+    def setUpTensorRTParam(self):
+        self.enable_trt = True
+        self.trt_parameters = TensorRTSubgraphPassActivationTest.TensorRTParam(
+            1 << 30, 32, 0, AnalysisConfig.Precision.Half, True, False)
+
+    def append_act(self, x):
+        return fluid.layers.gelu(x)
+
+
+class TensorRTSubgraphPassGeluFp16DynamicTest(
+        TensorRTSubgraphPassActivationTest):
+    def setUpTensorRTParam(self):
+        self.enable_trt = True
+        self.trt_parameters = TensorRTSubgraphPassActivationTest.TensorRTParam(
+            1 << 30, 32, 0, AnalysisConfig.Precision.Half, False, False)
+        self.dynamic_shape_params = TensorRTSubgraphPassActivationTest.DynamicShapeParam(
+            {
+                'data': [1, 6, 8, 8]
+            }, {'data': [1, 6, 512, 512]}, {'data': [1, 6, 256, 256]}, False)
+
+    def append_act(self, x):
+        return fluid.layers.gelu(x)
+
+
+class TensorRTSubgraphPassGeluFp16DynamicSerializeTest(
+        TensorRTSubgraphPassActivationTest):
+    def setUpTensorRTParam(self):
+        self.enable_trt = True
+        self.trt_parameters = TensorRTSubgraphPassActivationTest.TensorRTParam(
+            1 << 30, 32, 0, AnalysisConfig.Precision.Half, True, False)
+        self.dynamic_shape_params = TensorRTSubgraphPassActivationTest.DynamicShapeParam(
+            {
+                'data': [1, 6, 8, 8]
+            }, {'data': [1, 6, 512, 512]}, {'data': [1, 6, 256, 256]}, False)
+
+    def append_act(self, x):
+        return fluid.layers.gelu(x)
+
+
+class TensorRTSubgraphPassConcatTest(InferencePassTest):
+    def setUp(self):
+        with fluid.program_guard(self.main_program, self.startup_program):
+            data1 = fluid.data(
+                name="data1", shape=[-1, 3, 64, 64], dtype="float32")
+            data2 = fluid.data(
+                name="data2", shape=[-1, 3, 64, 64], dtype="float32")
+            concat_out = fluid.layers.concat([data1, data2], axis=2)
+            out = fluid.layers.batch_norm(concat_out, is_test=True)
+        self.feeds = {
+            "data1": np.random.random([1, 3, 64, 64]).astype("float32"),
+            "data2": np.random.random([1, 3, 64, 64]).astype("float32"),
+        }
+        self.enable_trt = True
+        self.trt_parameters = TensorRTSubgraphPassConcatTest.TensorRTParam(
+            1 << 30, 32, 0, AnalysisConfig.Precision.Float32, False, False)
+        self.fetch_list = [out]
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            use_gpu = True
+            self.check_output_with_option(use_gpu)
+
+
+class TensorRTSubgraphPassSplitTest(InferencePassTest):
+    def setUp(self):
+        with fluid.program_guard(self.main_program, self.startup_program):
+            data = fluid.data(
+                name="data", shape=[-1, 3, 64, 64], dtype="float32")
+            split_out = fluid.layers.split(data, dim=-1, num_or_sections=2)
+            out = fluid.layers.batch_norm(split_out[0], is_test=True)
+        self.feeds = {
+            "data": np.random.random([1, 3, 64, 64]).astype("float32"),
+        }
+        self.enable_trt = True
+        self.trt_parameters = TensorRTSubgraphPassSplitTest.TensorRTParam(
+            1 << 30, 32, 0, AnalysisConfig.Precision.Float32, False, False)
+        self.fetch_list = [out]
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            use_gpu = True
+            self.check_output_with_option(use_gpu)
+
+
+class TensorRTSubgraphPassSplitSerializeTest(InferencePassTest):
+    def setUp(self):
+        with fluid.program_guard(self.main_program, self.startup_program):
+            data = fluid.data(
+                name="data", shape=[-1, 3, 64, 64], dtype="float32")
+            split_out = fluid.layers.split(data, dim=-1, num_or_sections=2)
+            out = fluid.layers.batch_norm(split_out[0], is_test=True)
+        self.feeds = {
+            "data": np.random.random([1, 3, 64, 64]).astype("float32"),
+        }
+        self.enable_trt = True
+        self.trt_parameters = TensorRTSubgraphPassSplitTest.TensorRTParam(
+            1 << 30, 32, 0, AnalysisConfig.Precision.Float32, True, False)
+        self.fetch_list = [out]
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            use_gpu = True
+            if os.path.exists(self.path + "_opt_cache"):
+                shutil.rmtree(self.path + "_opt_cache")
+            self.check_output_with_option(use_gpu)
+
+
+class TensorRTSubgraphPassDynamicSplitFp16SerializeTest(InferencePassTest):
+    def setUp(self):
+        with fluid.program_guard(self.main_program, self.startup_program):
+            data = fluid.data(
+                name="data", shape=[-1, 3, 64, 64], dtype="float32")
+            split_out = fluid.layers.split(data, dim=-1, num_or_sections=2)
+            out = fluid.layers.batch_norm(split_out[0], is_test=True)
+        self.feeds = {
+            "data": np.random.random([1, 3, 64, 64]).astype("float32"),
+        }
+        self.enable_trt = True
+        self.trt_parameters = TensorRTSubgraphPassSplitTest.TensorRTParam(
+            1 << 30, 32, 0, AnalysisConfig.Precision.Half, True, False)
+        self.dynamic_shape_params = TensorRTSubgraphPassActivationTest.DynamicShapeParam(
+            {
+                'data': [1, 3, 8, 64]
+            }, {'data': [1, 3, 512, 64]}, {'data': [1, 3, 256, 64]}, False)
+        self.fetch_list = [out]
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            use_gpu = True
+            if os.path.exists(self.path + "_opt_cache"):
+                shutil.rmtree(self.path + "_opt_cache")
+            self.check_output_with_option(use_gpu, 1e-3)
+
+
+class TensorRTSubgraphPassInstanceNormTest(InferencePassTest):
+    def setUp(self):
+        with fluid.program_guard(self.main_program, self.startup_program):
+            data = fluid.data(
+                name="data", shape=[-1, 3, 64, 64], dtype="float32")
+            fc_out = fluid.layers.fc(input=data, size=200)
+            param_attr = fluid.ParamAttr(
+                name='instance_norm_w',
+                initializer=fluid.initializer.Constant(value=1.0))
+            bias_attr = fluid.ParamAttr(
+                name='instance_norm_b',
+                initializer=fluid.initializer.Constant(value=0.0))
+            out = fluid.layers.instance_norm(
+                input=fc_out, param_attr=param_attr, bias_attr=bias_attr)
+        self.feeds = {
+            "data": np.random.random([1, 3, 64, 64]).astype("float32"),
+        }
+        self.enable_trt = True
+        self.trt_parameters = TensorRTSubgraphPassInstanceNormTest.TensorRTParam(
+            1 << 30, 32, 0, AnalysisConfig.Precision.Float32, False, False)
+        self.fetch_list = [out]
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            use_gpu = True
+            self.check_output_with_option(use_gpu, atol=1e-4, flatten=True)
+
+
+class TensorRTSubgraphPassLayerNormTest(InferencePassTest):
+    def setUp(self):
+        self.set_params()
+        with fluid.program_guard(self.main_program, self.startup_program):
+            data = fluid.data(
+                name="data", shape=[-1, 3, 64, 64], dtype="float32")
+            out = fluid.layers.layer_norm(
+                data, begin_norm_axis=self.begin_norm_axis)
+        self.feeds = {
+            "data": np.random.random([1, 3, 64, 64]).astype("float32"),
+        }
+        self.enable_trt = True
+        self.trt_parameters = TensorRTSubgraphPassLayerNormTest.TensorRTParam(
+            1 << 30, 32, 0, AnalysisConfig.Precision.Float32, False, False)
+        self.fetch_list = [out]
+
+    def set_params(self):
+        self.begin_norm_axis = 1
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            use_gpu = True
+            self.check_output_with_option(use_gpu, atol=1e-3)
+
+
+class TensorRTSubgraphPassLayerNormBeginNormAxis2Test(
+        TensorRTSubgraphPassLayerNormTest):
+    def set_params(self):
+        self.begin_norm_axis = 2
+
+
+class TensorRTSubgraphPassLayerNormBeginNormAxis3Test(
+        TensorRTSubgraphPassLayerNormTest):
+    def set_params(self):
+        self.begin_norm_axis = 3
+
+
+class TensorRTSubgraphPassElementwiseTest(InferencePassTest):
+    def setUp(self):
+        with fluid.program_guard(self.main_program, self.startup_program):
+            data1 = fluid.data(
+                name="data1", shape=[-1, 3, 64, 64], dtype="float32")
+            data2 = fluid.data(
+                name="data2", shape=[-1, 3, 64, 64], dtype="float32")
+            eltwise_out = self.append_eltwise(data1, data2)
+            out = fluid.layers.batch_norm(eltwise_out, is_test=True)
+        self.feeds = {
+            "data1": np.random.random([1, 3, 64, 64]).astype("float32"),
+            "data2": np.random.random([1, 3, 64, 64]).astype("float32"),
+        }
+        self.enable_trt = True
+        self.trt_parameters = TensorRTSubgraphPassElementwiseTest.TensorRTParam(
+            1 << 30, 32, 0, AnalysisConfig.Precision.Float32, False, False)
+        self.fetch_list = [out]
+
+    def append_eltwise(self, data1, data2):
+        return fluid.layers.elementwise_add(x=data1, y=data2)
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            use_gpu = True
+            self.check_output_with_option(use_gpu)
+
+
+class TensorRTSubgraphPassElementwiseMulTest(
+        TensorRTSubgraphPassElementwiseTest):
+    def append_eltwise(self, data1, data2):
+        return fluid.layers.elementwise_mul(x=data1, y=data2)
+
+
+class TensorRTSubgraphPassShuffleChannelTest(InferencePassTest):
+    def setUp(self):
+        with fluid.program_guard(self.main_program, self.startup_program):
+            data = fluid.data(
+                name="data", shape=[-1, 6, 64, 64], dtype="float32")
+            sc_out = fluid.layers.shuffle_channel(data, group=3)
+            out = fluid.layers.batch_norm(sc_out, is_test=True)
+        self.feeds = {
+            "data": np.random.random([1, 6, 64, 64]).astype("float32"),
+        }
+        self.enable_trt = True
+        self.trt_parameters = TensorRTSubgraphPassShuffleChannelTest.TensorRTParam(
+            1 << 30, 32, 0, AnalysisConfig.Precision.Float32, False, False)
+        self.fetch_list = [out]
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            use_gpu = True
+            self.check_output_with_option(use_gpu)
+
+
+if __name__ == "__main__":
+    unittest.main()