diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
index 59ed09b96cc0ea2e4619a99200847bc818c08749..60de4234b41a8ba0a41f508f3f5e441950ff9fb6 100644
--- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
+++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
@@ -168,11 +168,11 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
   std::set<std::string> output_names;
   std::set<std::string> output_names_with_id;
-  std::vector<int> origin_output_dims;
+  std::map<std::string, int> origin_name_output_dims;
   for (auto *x : node->outputs) {
     output_names.insert(x->Name());
     output_names_with_id.insert(x->Name() + std::to_string(x->id()));
-    origin_output_dims.push_back(x->Var()->GetShape().size());
+    origin_name_output_dims[x->Name()] = x->Var()->GetShape().size();
   }
   std::unordered_map<std::string, std::string> output_name_map;
@@ -216,11 +216,13 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
   // output_mapping help us copy the data from the renamed ITensor
   // to Tensor.
   std::vector<std::string> output_mapping;
+  std::vector<int> renamed_output_dims;
   for (auto name : output_names) {
     PADDLE_ENFORCE_NE(output_name_map.count(name), 0,
                       platform::errors::PreconditionNotMet(
                           "The output_name_map should have %s", name));
     output_mapping.push_back(output_name_map[name]);
+    renamed_output_dims.push_back(origin_name_output_dims[name]);
   }
   PADDLE_ENFORCE_EQ(output_mapping.empty(), false,
                     platform::errors::PreconditionNotMet(
@@ -243,7 +245,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
   op_desc->SetAttr("workspace_size", Get<int>("workspace_size"));
   op_desc->SetAttr("gpu_id", Get<int>("gpu_device_id"));
   op_desc->SetAttr("output_name_mapping", output_mapping);
-  op_desc->SetAttr("origin_output_dims", origin_output_dims);
+  op_desc->SetAttr("origin_output_dims", renamed_output_dims);
   op_desc->SetAttr("parameters", params);
 
   // we record all inputs' shapes in attr to check if they are consistent
diff --git a/paddle/fluid/inference/tensorrt/convert/fc_op.cc b/paddle/fluid/inference/tensorrt/convert/fc_op.cc
index 41fbbb557d6470228b9315f8d0e0ed1e5ad905ac..527d0ee208578a38a0150d212275f8290902f067 100644
--- a/paddle/fluid/inference/tensorrt/convert/fc_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/fc_op.cc
@@ -144,7 +144,69 @@ class FcOpConverter : public OpConverter {
                                 static_cast<size_t>(bias_num)};
 
     if (engine_->with_dynamic_shape()) {
-      regist_fc(X, n_output, weight, bias);
+      // not NCHW layout, but NLP layout with added 'x 1 x 1'
+      auto x_dim = X->getDimensions();
+      if (x_dim.nbDims == 3 || x_dim.nbDims == 2) {
+        auto output_name = op_desc.Output("Out").front();
+        // add shuffle before fc
+        nvinfer1::Dims reshape_before_fc_dim;
+        reshape_before_fc_dim.nbDims = x_dim.nbDims + 2;
+        for (int i = 0; i < x_dim.nbDims; i++) {
+          reshape_before_fc_dim.d[i] = 0;
+        }
+        reshape_before_fc_dim.d[x_dim.nbDims] = 1;
+        reshape_before_fc_dim.d[x_dim.nbDims + 1] = 1;
+        auto* reshape_before_fc_layer =
+            TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *X);
+        reshape_before_fc_layer->setReshapeDimensions(reshape_before_fc_dim);
+        reshape_before_fc_layer->setName(
+            ("shuffle_before_fc(Output: " + output_name + ")").c_str());
+
+        // add fc layer
+        auto* fc_layer = TRT_ENGINE_ADD_LAYER(
+            engine_, FullyConnected, *reshape_before_fc_layer->getOutput(0),
+            n_output, weight.get(), bias.get());
+        fc_layer->setName(("fc_layer(Output: " + output_name + ")").c_str());
+
+        // add shuffle after fc
+        nvinfer1::Dims reshape_after_fc_dim;
+        if (x_dim.nbDims == 3) {
+          if (x_num_col_dims == 2) {
+            reshape_after_fc_dim.nbDims = 3;
+            reshape_after_fc_dim.d[0] = 0;
+            reshape_after_fc_dim.d[1] = 0;
+            reshape_after_fc_dim.d[2] = 0;
+          } else {
+            reshape_after_fc_dim.nbDims = 2;
+            reshape_after_fc_dim.d[0] = 0;
+            auto dim = fc_layer->getOutput(0)->getDimensions();
+            reshape_after_fc_dim.d[1] = dim.d[1] * dim.d[2];
+          }
+          // x_dim.nbDims == 2
+        } else {
+          reshape_after_fc_dim.nbDims = 2;
+          reshape_after_fc_dim.d[0] = 0;
+          reshape_after_fc_dim.d[1] = 0;
+        }
+        auto* reshape_after_fc_layer =
+            TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *fc_layer->getOutput(0));
+        reshape_after_fc_layer->setReshapeDimensions(reshape_after_fc_dim);
+
+        if (activation_type == "relu") {
+          reshape_after_fc_layer->setName(
+              ("shuffle_after_fc(Output: " + output_name + ")").c_str());
+          nvinfer1::IActivationLayer* relu_layer = TRT_ENGINE_ADD_LAYER(
+              engine_, Activation, *(reshape_after_fc_layer->getOutput(0)),
+              nvinfer1::ActivationType::kRELU);
+          RreplenishLayerAndOutput(relu_layer, "relu_after_fc_shuffle",
+                                   {output_name}, test_mode);
+        } else {
+          RreplenishLayerAndOutput(reshape_after_fc_layer, "shuffle_after_fc",
+                                   {output_name}, test_mode);
+        }
+      } else {
+        regist_fc(X, n_output, weight, bias);
+      }
       return;
     }
     // in order to handle situations in NLP models(input dims < 3,
@@ -154,12 +216,6 @@ class FcOpConverter : public OpConverter {
     auto input_d = X->getDimensions().d;
     int reshape_dim3[3] = {0};
     int reshape_dim4[4] = {0};
-    PADDLE_ENFORCE_EQ(
-        x_num_col_dims == 1 || x_num_col_dims == 2, true,
-        platform::errors::InvalidArgument(
-            "Wrong x_num_col_dims param of op mul. Paddle-TRT FC converter "
-            "expects x_num_col_dims is either 1 or 2, but got %d",
-            x_num_col_dims));
     PADDLE_ENFORCE_LE(x_num_col_dims, input_dims,
                       platform::errors::InvalidArgument(
                           "Params and input dims mismatch. Paddle-TRT FC "
diff --git a/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc b/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc
index ee04fd372c4588b492948a5acd23493d2ba423c5..8ce46a19d4b06e8a6c3e9da280329382ff52f1ec 100644
--- a/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc
@@ -8,8 +8,8 @@ http://www.apache.org/licenses/LICENSE-2.0
 
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See
+the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
@@ -28,7 +28,6 @@ class MultiheadMatMulOpConverter : public OpConverter {
                "network structure";
     framework::OpDesc op_desc(op, nullptr);
     // Declare inputs
-    // Shouble be a 5 dims tensor.
     auto* input = engine_->GetITensor(op_desc.Input("Input").front());
 
     // fc weights and fc bias
@@ -69,6 +68,7 @@ class MultiheadMatMulOpConverter : public OpConverter {
     int head_number = BOOST_GET_CONST(int, op_desc.GetAttr("head_number"));
 
     nvinfer1::ILayer* layer = nullptr;
+    auto output_name = op_desc.Output("Out")[0];
 
     if (engine_->with_dynamic_shape()) {
       if (engine_->use_oss()) {
@@ -171,6 +171,12 @@ class MultiheadMatMulOpConverter : public OpConverter {
             plugin_inputs.data(), plugin_inputs.size(), *plugin);
         layer = plugin_layer;
       } else {
+        PADDLE_ENFORCE_EQ(
+            input->getDimensions().nbDims, 3,
+            platform::errors::InvalidArgument(
+                "The Input dim of the MultiheadMatMul should be 3, "
+                "but it's (%d) now.",
+                input->getDimensions().nbDims));
         // transpose weight_data from m * n to n * m
         auto* input_bias_qk =
             engine_->GetITensor(op_desc.Input("BiasQK").front());
@@ -184,15 +190,37 @@ class MultiheadMatMulOpConverter : public OpConverter {
                                     static_cast<void*>(bias_data),
                                     static_cast<size_t>(bias_t->numel())};
 
-        auto* fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *input,
-                                              n, weight.get(), bias.get());
-        auto* fc_out = fc_layer->getOutput(0);
+        // add shuffle before fc
+        nvinfer1::Dims reshape_before_fc_dim;
+        reshape_before_fc_dim.nbDims = 5;
+        reshape_before_fc_dim.d[0] = 0;
+        reshape_before_fc_dim.d[1] = 0;
+        reshape_before_fc_dim.d[2] = 0;
+        reshape_before_fc_dim.d[3] = 1;
+        reshape_before_fc_dim.d[4] = 1;
+        auto* reshape_before_fc_layer =
+            TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input);
+        reshape_before_fc_layer->setReshapeDimensions(reshape_before_fc_dim);
+        reshape_before_fc_layer->setName(
+            ("shuffle_before_multihead_mamul(Output: " + output_name + ")")
+                .c_str());
+
+        // add layer fc
+        auto* fc_layer = TRT_ENGINE_ADD_LAYER(
+            engine_, FullyConnected, *reshape_before_fc_layer->getOutput(0), n,
+            weight.get(), bias.get());
+        fc_layer->setName(
+            ("multihead_mamul_fc(Output: " + output_name + ")").c_str());
+
+        // no need to add shuffle after fc, just change it in
+        // QkvToContextPluginDynamic
+
         // add qkv to context
         int head_size = hidden_out / head_number;
         float scale = BOOST_GET_CONST(float, op_desc.GetAttr("alpha"));
 
         std::vector<nvinfer1::ITensor*> plugin_inputs;
-        plugin_inputs.push_back(fc_out);
+        plugin_inputs.push_back(fc_layer->getOutput(0));
         plugin_inputs.push_back(input_bias_qk);
         bool with_fp16 =
             engine_->WithFp16() && !engine_->disable_trt_plugin_fp16();
@@ -208,7 +236,6 @@ class MultiheadMatMulOpConverter : public OpConverter {
         "You can use the config.SetTRTDynamicShapeInfo(...) interface to set "
         "the shape information to run the dynamic shape mode."));
   }
-  auto output_name = op_desc.Output("Out")[0];
   RreplenishLayerAndOutput(layer, "multihead_matmul", {output_name},
                            test_mode);
 #else
diff --git a/paddle/fluid/inference/tensorrt/convert/softmax_op.cc b/paddle/fluid/inference/tensorrt/convert/softmax_op.cc
index 79992065a22407ce09d7b64ab622eae993d36e9f..9cefb24751e18dfbb3b8283152cbcd58c81adc58 100644
--- a/paddle/fluid/inference/tensorrt/convert/softmax_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/softmax_op.cc
@@ -51,6 +51,7 @@ class SoftMaxOpConverter : public OpConverter {
     uint32_t axes = std::max(0, input_dims - 3);
     // TODO(cryoco): Poor workaround. Fix padded dims problem when TRT layers
     // support Nd.
+    // Tips: Dynamic shape already fixes this.
     int padded_dims = 0;
     int explicit_batch = 0;
     if (engine_->with_dynamic_shape()) explicit_batch = 1;
@@ -62,16 +63,16 @@ class SoftMaxOpConverter : public OpConverter {
       }
     }
     if (!engine_->with_dynamic_shape()) {
-      if (axis == -1) {
-        axes = input_dims - 1 - padded_dims;
+      if (axis < 0) {
+        axes = input_dims + axis - padded_dims;
       } else {
-        axes = axis;
+        axes = axis - 1;
       }
     } else {
-      if (axis == -1) {
-        axes = input_dims - 1 - padded_dims;
+      if (axis < 0) {
+        axes = input_dims + axis;
       } else {
-        axes = axis + 1;
+        axes = axis;
       }
     }
     layer->setAxes(1 << axes);
diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc
index 2ec94f5f98c8d8e4ee9f6facd140d8869d5dedda..11752d71a45e1b8545b727eb63d48fdca6d157a1 100644
--- a/paddle/fluid/inference/tensorrt/op_teller.cc
+++ b/paddle/fluid/inference/tensorrt/op_teller.cc
@@ -195,7 +195,17 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8,
       // current not support axis from input, use default 0
       if (!with_dynamic_shape || desc.Input("Axis").size() > 0) return false;
     }
-
+    if (op_type == "fc" || op_type == "mul") {
+      const int x_num_col_dims =
+          desc.HasAttr("x_num_col_dims")
+              ? BOOST_GET_CONST(int, desc.GetAttr("x_num_col_dims"))
+              : (desc.HasAttr("in_num_col_dims")
+                     ? BOOST_GET_CONST(int, desc.GetAttr("in_num_col_dims"))
+                     : 1);
+      if (x_num_col_dims != 1 && x_num_col_dims != 2) {
+        return false;
+      }
+    }
     if (op_type == "nearest_interp") {
       std::vector<std::string> attrs{"data_layout", "interp_method",
                                      "align_corners", "scale",
diff --git a/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.cu
index 238daa4a886a48036cdcd29a1173509b791254fd..6d3872aaeb8a77acf1455e4d5e555ee01d36478a 100644
--- a/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.cu
@@ -200,12 +200,10 @@ nvinfer1::DimsExprs EmbEltwiseLayernormPluginDynamic::getOutputDimensions(
                         "but it's (%d)",
                         output_index));
   nvinfer1::DimsExprs ret;
-  ret.nbDims = 5;
+  ret.nbDims = 3;
   ret.d[0] = inputs[0].d[0];
   ret.d[1] = inputs[0].d[1];
   ret.d[2] = expr_builder.constant(hidden_size_);
-  ret.d[3] = expr_builder.constant(1);
-  ret.d[4] = expr_builder.constant(1);
   return ret;
 }
diff --git a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu
index 1e7c83f4c60fb99964bf583087e7fd1f8c32d704..a5fc9e73c5f27f1280171966df853675e2f0d73b 100644
--- a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu
@@ -169,12 +169,10 @@ nvinfer1::DimsExprs QkvToContextPluginDynamic::getOutputDimensions(
                         "it has (%d) inputs",
                         nb_inputs));
   nvinfer1::DimsExprs ret;
-  ret.nbDims = 5;
+  ret.nbDims = 3;
   ret.d[0] = inputs[0].d[0];
   ret.d[1] = inputs[0].d[1];
   ret.d[2] = expr_builder.constant(head_size_ * head_number_);
-  ret.d[3] = expr_builder.constant(1);
-  ret.d[4] = expr_builder.constant(1);
   return ret;
 }
diff --git a/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.cu
index 3b9eea22199d7b1669802fb506fb4218529b4468..7be9e3a740ab1c3532f5a67f06048c6c745eb214 100644
--- a/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.cu
@@ -54,11 +54,6 @@ void SkipLayerNormPluginDynamic::terminate() {
 
 nvinfer1::DimsExprs SkipLayerNormPluginDynamic::getOutputDimensions(
     int output_index, const nvinfer1::DimsExprs *inputs, int nb_inputs,
     nvinfer1::IExprBuilder &expr_builder) {
-  PADDLE_ENFORCE_EQ(
-      inputs[0].nbDims, 5,
-      platform::errors::InvalidArgument(
-          "The Input dim of the SkipLayernorm should be 5, but it's (%d) now.",
-          inputs[0].nbDims));
   return inputs[0];
 }
diff --git a/paddle/fluid/inference/tensorrt/plugin/special_slice_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/special_slice_plugin.cu
index 250b944652b93c54ae9587271256b42c6e1bc6b7..fdb14f9ceaf29fe90cd756b77e7c5afff2296f44 100644
--- a/paddle/fluid/inference/tensorrt/plugin/special_slice_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/special_slice_plugin.cu
@@ -62,6 +62,8 @@ nvinfer1::DimsExprs SpecialSlicePluginDynamic::getOutputDimensions(
   output.d[1] = one;
   output.d[0] = expr_builder.operation(nvinfer1::DimensionOperation::kSUB,
                                        *inputs[1].d[0], *one);
+  // remove padding 1
+  output.nbDims -= 2;
 
   return output;
 }
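The common thread of these changes: under dynamic shape, the Ernie/BERT path now keeps activations 3-D ([batch, seq_len, hidden]) instead of the old padded 5-D layout, and the fc / multihead_matmul converters add an IShuffleLayer that appends a trailing 'x 1 x 1' only around IFullyConnectedLayer, with the downstream plugins reporting 3-D outputs again. Below is a minimal standalone sketch of that reshape bookkeeping (plain C++, no TensorRT dependency; ReshapeBeforeFc is an illustrative name, not a function from this patch). In IShuffleLayer reshape dimensions, a value of 0 means "copy this dimension from the input", which is why the existing dims are encoded as zeros.

// Standalone illustration only, not part of the patch.
#include <cstdio>
#include <vector>

// Hypothetical helper: keep the input's `input_rank` dimensions (encoded as
// 0 = "copy from input") and append the two trailing 1s expected by
// IFullyConnectedLayer, e.g. [B, S, H] -> [B, S, H, 1, 1].
std::vector<int> ReshapeBeforeFc(int input_rank) {
  std::vector<int> dims(input_rank, 0);
  dims.push_back(1);
  dims.push_back(1);
  return dims;
}

int main() {
  for (int rank : {2, 3}) {
    const std::vector<int> dims = ReshapeBeforeFc(rank);
    std::printf("input rank %d -> reshape dims:", rank);
    for (int d : dims) std::printf(" %d", d);
    std::printf("\n");
  }
  return 0;
}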