diff --git a/paddle/fluid/inference/tensorrt/convert/matmul_op.cc b/paddle/fluid/inference/tensorrt/convert/matmul_op.cc
index 03ec1133111750e02ba0fe577c77011332b15080..3a848019fea9233702332fdc31f124cb2bf8b24d 100644
--- a/paddle/fluid/inference/tensorrt/convert/matmul_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/matmul_op.cc
@@ -67,7 +67,8 @@ class MatMulOpConverter : public OpConverter {
     if (op_desc.HasAttr("support_int8") &&
         PADDLE_GET_CONST(bool, op_desc.GetAttr("support_int8")) &&
         engine_->precision() == AnalysisConfig::Precision::kInt8 &&
-        platform::GetGPUComputeCapability(0) >= 75) {
+        platform::GetGPUComputeCapability(platform::GetCurrentDeviceId()) >=
+            75) {
       if (engine_->with_dynamic_shape()) {
         VLOG(3) << "Convert a fluid matmul_op_int8_dynamic to TensorRT "
                    "MatmulPluginLayer";
diff --git a/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc b/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc
index 2ae972729f5e166c3a698fdca241fdbc6dac3c39..b3447bb23c38acabab3f3c4ecb6403e0a9ac122a 100644
--- a/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc
@@ -88,11 +88,10 @@ class MultiheadMatMulOpConverter : public OpConverter {
         engine_->tensorrt_transformer_posid() != "" &&
         engine_->tensorrt_transformer_maskid() != "";
     if (engine_->with_dynamic_shape()) {
-      if (engine_->tensorrt_transformer_maskid() != "") {
-        if (engine_->precision() == AnalysisConfig::Precision::kFloat32) {
-          PADDLE_THROW(platform::errors::Fatal(
-              "use use_varseqlen must be int8 or half, not float32."));
-        }
+      if (engine_->tensorrt_transformer_maskid() != "" &&
+          engine_->precision() != AnalysisConfig::Precision::kFloat32 &&
+          platform::GetGPUComputeCapability(platform::GetCurrentDeviceId()) >=
+              75) {
         nvinfer1::Weights weight{nvinfer1::DataType::kFLOAT,
                                  static_cast<void*>(weight_data),
                                  static_cast<int32_t>(weight_t->numel())};
@@ -401,7 +400,8 @@ class MultiheadMatMulOpConverter : public OpConverter {
       } else {
         if (input_dims.d[1] <= 384 && !bias_qk_attr &&
             engine_->precision() != AnalysisConfig::Precision::kFloat32 &&
-            platform::GetGPUComputeCapability(0) >= 75) {
+            platform::GetGPUComputeCapability(platform::GetCurrentDeviceId()) >=
+                75) {
           /*
            * input_dims.d[0]: batch(-1)
            * input_dims.d[1]: length:256
diff --git a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt
index b87afafb7ab53026beb4a7e88b52e966064bfb03..cb4bcdd9e43936e3a5533263e30f8f44227d7b08 100644
--- a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt
+++ b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt
@@ -39,12 +39,12 @@ list(
   generic_plugin.cu
   lookup_table.cu
   many_emb_layernorm_plugin.cu
-  many_emb_Layernorm_kernel.cu)
+  many_emb_layernorm_kernel.cu)
 
 if(${TENSORRT_MAJOR_VERSION} GREATER_EQUAL 7)
   list(APPEND TRT_FILES many_emb_layernorm_varseqlen_plugin.cu
-       many_emb_Layernorm_varseqlen_kernel_mtron.cu
-       many_emb_Layernorm_varseqlen_kernel_hface.cu)
+       many_emb_layernorm_varseqlen_kernel_mtron.cu
+       many_emb_layernorm_varseqlen_kernel_hface.cu)
 endif()
 
 if(CUSPARSELT_FOUND AND ${TENSORRT_MAJOR_VERSION} GREATER_EQUAL 8)
diff --git a/paddle/fluid/inference/tensorrt/plugin/many_emb_Layernorm_kernel.cu b/paddle/fluid/inference/tensorrt/plugin/many_emb_layernorm_kernel.cu
similarity index 100%
rename from paddle/fluid/inference/tensorrt/plugin/many_emb_Layernorm_kernel.cu
rename to paddle/fluid/inference/tensorrt/plugin/many_emb_layernorm_kernel.cu
diff --git a/paddle/fluid/inference/tensorrt/plugin/many_emb_Layernorm_varseqlen_kernel_hface.cu b/paddle/fluid/inference/tensorrt/plugin/many_emb_layernorm_varseqlen_kernel_hface.cu
similarity index 100%
rename from paddle/fluid/inference/tensorrt/plugin/many_emb_Layernorm_varseqlen_kernel_hface.cu
rename to paddle/fluid/inference/tensorrt/plugin/many_emb_layernorm_varseqlen_kernel_hface.cu
diff --git a/paddle/fluid/inference/tensorrt/plugin/many_emb_Layernorm_varseqlen_kernel_mtron.cu b/paddle/fluid/inference/tensorrt/plugin/many_emb_layernorm_varseqlen_kernel_mtron.cu
similarity index 100%
rename from paddle/fluid/inference/tensorrt/plugin/many_emb_Layernorm_varseqlen_kernel_mtron.cu
rename to paddle/fluid/inference/tensorrt/plugin/many_emb_layernorm_varseqlen_kernel_mtron.cu
diff --git a/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_serialize_deserialize_test.h b/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_serialize_deserialize_test.h
index b50826953386c80ecec09f48dcb7cf1c8650e4f9..049d2be3f9e936b3df867a3bb852a7a4384f1b1a 100644
--- a/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_serialize_deserialize_test.h
+++ b/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_serialize_deserialize_test.h
@@ -139,6 +139,8 @@ static void trt_ernie(bool with_fp16, std::vector<float> result) {
   config.EnableTensorRtEngine(1 << 30, 1, 5, precision, true, false);
   config.SetTRTDynamicShapeInfo(
       min_input_shape, max_input_shape, opt_input_shape);
+  paddle_infer::experimental::InternalUtils::SetTransformerMaskid(
+      &config, "read_file_0.tmp_4");
   AnalysisConfig* config_deser = new AnalysisConfig(config);
 
   std::vector<float> out_data;
diff --git a/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_test.cc b/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_test.cc
index a32d2161d951913a14b5dce62f6c6925d68b945e..aa252fd190784fb737bf437bf3ca710c3c4bbe4e 100644
--- a/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_test.cc
+++ b/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_test.cc
@@ -133,6 +133,8 @@ void trt_ernie(bool with_fp16,
   config.EnableTensorRtEngine(1 << 30, 1, 5, precision, false, false);
   config.SetTRTDynamicShapeInfo(
       min_input_shape, max_input_shape, opt_input_shape);
+  paddle_infer::experimental::InternalUtils::SetTransformerMaskid(
+      &config, "read_file_0.tmp_4");
 
   std::vector<float> out_data;
   run(config, &out_data, batch_size);
@@ -423,7 +425,7 @@ void run(paddle_infer::Predictor* predictor, std::vector<float>* out_data) {
 
 TEST(AnalysisPredictor, ernie_varlen) {
 #if IS_TRT_VERSION_GE(7234)
-  if (platform::GetGPUComputeCapability(0) >= 75) {
+  if (platform::GetGPUComputeCapability(platform::GetCurrentDeviceId()) >= 75) {
     auto predictor = InitPredictor();
     std::vector<float> out_data;
     run(predictor.get(), &out_data);
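A note for reviewers, outside the patch itself: the recurring pattern above is (1) gating the Turing-or-newer (SM >= 75) fast paths on the compute capability of the device that is actually current rather than hard-coded device 0, and (2) pointing the varseqlen path at the model's mask tensor via InternalUtils::SetTransformerMaskid. A minimal usage sketch follows; it reuses only calls that appear in this diff, the mask tensor name "read_file_0.tmp_4" comes from the ernie test models above, and the helper function name is hypothetical.

// Hypothetical helper sketched from the calls used in this patch; it is not
// part of the patch itself.
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle_inference_api.h"

bool ConfigureVarseqlen(paddle_infer::Config* config) {
  // Gate on the *current* device's compute capability, not device 0: in a
  // multi-GPU process the predictor may be bound to a device whose SM
  // version differs from GPU 0's.
  if (paddle::platform::GetGPUComputeCapability(
          paddle::platform::GetCurrentDeviceId()) < 75) {
    return false;  // the varseqlen fast path needs Turing (SM 75) or newer
  }
  // Tell the TensorRT varseqlen path which tensor carries the transformer
  // mask; substitute your own model's mask tensor name.
  paddle_infer::experimental::InternalUtils::SetTransformerMaskid(
      config, "read_file_0.tmp_4");
  return true;
}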