Unverified · Commit ef734e84 · Authored by Wangzheee, committed by GitHub

[Paddle-Trt] Replace fc mul matmul matmul_v2 with matrix_multiply (#52222)

* Paddle-Trt: Replace fc mul matmul matmul_v2 with matrix_multiply
Parent acf55016
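In short, this change funnels the fc, mul, matmul, and matmul_v2 paths of the TensorRT workflow through a single matrix_multiply op: a new trt_map_ops_to_matrix_multiply_pass rewrites these ops early in the TRT subgraph pass list, downstream fuse and quant/dequant passes now match matrix_multiply directly, and the separate matmul/matmul_v2/fc TensorRT converters are consolidated into one matrix_multiply converter.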
......@@ -132,7 +132,7 @@ pass_library(generate_pass DEPS pass_desc_proto)
target_link_libraries(generate_pass pass_desc_proto)
if(WITH_TENSORRT)
pass_library(trt_map_matmul_to_mul_pass inference)
pass_library(trt_map_ops_to_matrix_multiply_pass inference)
pass_library(trt_multihead_matmul_fuse_pass inference)
pass_library(trt_flash_multihead_matmul_fuse_pass inference)
pass_library(trt_cross_multihead_matmul_fuse_pass inference)
......
......@@ -64,7 +64,7 @@ void ConstantFoldingPass::ApplyImpl(ir::Graph *graph) const {
platform::errors::Fatal(
"scope must not be null when applying constant floding."));
std::vector<std::string> blacklist{"feed"};
std::vector<std::string> blacklist{"feed", "matrix_multiply"};
auto op_node_sorted = framework::ir::TopologyVarientSort(
*graph, static_cast<framework::ir::SortKind>(0));
......
......@@ -24,10 +24,10 @@ namespace ir {
class Graph;
void DeleteWeightDequantLinearOpPass::ApplyImpl(ir::Graph* graph) const {
std::unordered_set<std::string> op_list = {"matmul_v2",
std::unordered_set<std::string> op_list = {"matrix_multiply",
"matmul_v2",
"matmul",
"mul",
"fc",
"depthwise_conv2d",
"conv2d",
"conv2d_transpose"};
......
......@@ -2465,7 +2465,7 @@ PDNode *patterns::ConvElementwiseaddAct::operator()(
PDNode *patterns::VitAttention::operator()(PDNode *in) {
in->AsInput();
std::unordered_set<std::string> matmul_ops{"matmul", "matmul_v2"};
std::unordered_set<std::string> matmul_ops{"matrix_multiply"};
auto matmul0_op =
pattern->NewNode(matmul0_op_repr())->assert_is_ops(matmul_ops);
......@@ -2504,13 +2504,13 @@ PDNode *patterns::VitAttention::operator()(PDNode *in) {
auto slice1_op = pattern->NewNode(slice1_op_repr())->assert_is_op("slice");
auto slice1_out = pattern->NewNode(slice1_out_repr())
->assert_is_op_output("slice", "Out")
->assert_is_op_input("matmul_v2", "Y")
->assert_is_op_input("matrix_multiply", "Y")
->AsIntermediate();
auto slice2_op = pattern->NewNode(slice2_op_repr())->assert_is_op("slice");
auto slice2_out = pattern->NewNode(slice2_out_repr())
->assert_is_op_output("slice", "Out")
->assert_is_op_input("matmul_v2", "X")
->assert_is_op_input("matrix_multiply", "X")
->AsIntermediate();
auto slice3_op = pattern->NewNode(slice3_op_repr())->assert_is_op("slice");
......@@ -2523,13 +2523,13 @@ PDNode *patterns::VitAttention::operator()(PDNode *in) {
pattern->NewNode(transpose2_op_repr())->assert_is_op("transpose2");
auto transpose2_out = pattern->NewNode(transpose2_out_repr())
->assert_is_op_output("transpose2", "Out")
->assert_is_op_input("matmul_v2", "Y")
->assert_is_op_input("matrix_multiply", "Y")
->AsIntermediate();
auto matmul1_op =
pattern->NewNode(matmul1_op_repr())->assert_is_op("matmul_v2");
pattern->NewNode(matmul1_op_repr())->assert_is_op("matrix_multiply");
auto matmul1_out = pattern->NewNode(matmul1_out_repr())
->assert_is_op_output("matmul_v2", "Out")
->assert_is_op_output("matrix_multiply", "Out")
->assert_is_op_input("scale", "X")
->AsIntermediate();
......@@ -2543,13 +2543,13 @@ PDNode *patterns::VitAttention::operator()(PDNode *in) {
pattern->NewNode(softmax1_op_repr())->assert_is_op("softmax");
auto softmax1_out = pattern->NewNode(softmax1_out_repr())
->assert_is_op_output("softmax", "Out")
->assert_is_op_input("matmul_v2", "X")
->assert_is_op_input("matrix_multiply", "X")
->AsIntermediate();
auto matmul2_op =
pattern->NewNode(matmul2_op_repr())->assert_is_op("matmul_v2");
pattern->NewNode(matmul2_op_repr())->assert_is_op("matrix_multiply");
auto matmul2_out = pattern->NewNode(matmul2_out_repr())
->assert_is_op_output("matmul_v2", "Out")
->assert_is_op_output("matrix_multiply", "Out")
->assert_is_op_input("transpose2", "X")
->AsIntermediate();
......@@ -4452,6 +4452,16 @@ PDNode *patterns::FusedFeedForwardBwd::operator()(
return out_grad;
}
void patterns::MulMatmulMatmulV2::operator()(
const std::unordered_set<std::string> &ops_type) {
auto ops = pattern->NewNode(ops_repr())->assert_is_ops(ops_type);
auto ops_out = pattern->NewNode(ops_out_repr())
->AsOutput()
->assert_is_ops_output(ops_type, "Out");
ops->LinksTo({ops_out});
}
} // namespace ir
} // namespace framework
} // namespace paddle
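For orientation, a minimal sketch of how the MulMatmulMatmulV2 pattern above is driven; it mirrors the handler in trt_map_ops_to_matrix_multiply_pass later in this diff, and a populated ir::Graph* named graph is assumed:

// Sketch: match any mul/matmul/matmul_v2 op together with its Out variable.
GraphPatternDetector gpd;
patterns::MulMatmulMatmulV2 mm_pattern(gpd.mutable_pattern(), "demo_scope");
mm_pattern({"mul", "matmul", "matmul_v2"});
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
                   Graph* g) {
  GET_IR_NODE_FROM_SUBGRAPH(ops, ops, mm_pattern);          // matched op node
  GET_IR_NODE_FROM_SUBGRAPH(ops_out, ops_out, mm_pattern);  // its Out var
  VLOG(4) << "matched op type: " << ops->Op()->Type();
};
gpd(graph, handler);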
......@@ -2146,6 +2146,17 @@ struct MergeLayernormPattern : public PatternBase {
PATTERN_DECL_NODE(layernorm_40_out);
};
// MulMatmulMatmulV2: ops(mul, matmul, matmul_v2)
// Forward pattern: ops (mul, matmul, matmul_v2) are converted to matrix_multiply.
struct MulMatmulMatmulV2 : public PatternBase {
MulMatmulMatmulV2(PDPattern* pattern, const std::string& name_scope)
: PatternBase(pattern, name_scope, "mul_matmul_matmul_v2") {}
void operator()(const std::unordered_set<std::string>& ops_type);
PATTERN_DECL_NODE(ops);
PATTERN_DECL_NODE(ops_out);
};
// Add support int8 flag
struct AddSupportInt8 : public PatternBase {
AddSupportInt8(PDPattern* pattern, const std::string& name_scope)
......
......@@ -37,7 +37,7 @@ static void ReplaceOutputVar(Node* op, Node* old_var, Node* new_var) {
}
PDNode* MultiHeadMatmulRoformerPattern::operator()() {
std::unordered_set<std::string> matmul_ops{"matmul", "matmul_v2"};
std::unordered_set<std::string> matmul_ops{"matrix_multiply"};
auto* input0 = pattern->NewNode(input0_repr());
input0->assert_is_ops_input(matmul_ops);
......@@ -313,23 +313,6 @@ PDNode* MultiHeadMatmulRoformerPattern::operator()() {
} // namespace patterns
MultiHeadMatmulRoformerFusePass::MultiHeadMatmulRoformerFusePass() {
AddOpCompat(OpCompat("mul"))
.AddInput("X") // the shape shoule be (B, S, N*H)
.IsTensor()
.End()
.AddInput("Y") // the shape shoule be (N*H, N*H)
.IsTensor()
.End()
.AddOutput("Out") // the shape shoule be (B, S, N*H)
.IsTensor()
.End()
.AddAttr("x_num_col_dims")
.IsNumEQ(2)
.End()
.AddAttr("y_num_col_dims")
.IsNumEQ(1)
.End();
AddOpCompat(OpCompat("elementwise_add"))
.AddInput("X")
// in bias, shape is (B, S, N*H),
......@@ -394,43 +377,6 @@ MultiHeadMatmulRoformerFusePass::MultiHeadMatmulRoformerFusePass() {
// QK (B, H, S, N)*(B, H, S, N) -> (B, H, S, S)
// QKV (B, H, S, S)*(B, H, S, N) -> (B, H, S, N)
AddOpCompat(OpCompat("matmul"))
.AddInput("X")
.IsTensor()
.End()
.AddInput("Y")
.IsTensor()
.End()
.AddOutput("Out")
.IsTensor()
.End()
.AddAttr("alpha")
.IsType<float>() // QK(anyvalue, will copy to new op) QKV(1.0)
.End()
.AddAttr("transpose_X")
.IsBoolEQ(false)
.End()
.AddAttr("transpose_Y") // QK(true) QKV(false)
.IsType<bool>()
.End();
AddOpCompat(OpCompat("matmul_v2"))
.AddInput("X")
.IsTensor()
.End()
.AddInput("Y")
.IsTensor()
.End()
.AddOutput("Out")
.IsTensor()
.End()
.AddAttr("trans_x")
.IsBoolEQ(false)
.End()
.AddAttr("trans_y") // QK(true) QKV(false)
.IsType<bool>()
.End();
AddOpCompat(OpCompat("softmax"))
.AddInput("X")
.IsTensor()
......@@ -825,6 +771,4 @@ REGISTER_PASS_CAPABILITY(multihead_matmul_roformer_fuse_pass)
.EQ("reshape2", 0)
.EQ("transpose2", 0)
.EQ("scale", 0)
.LE("matmul", 1)
.EQ("matmul_v2", 0)
.EQ("softmax", 0));
......@@ -202,77 +202,6 @@ QuantDequantFusePass::QuantDequantFusePass() {
.AddAttr("data_format")
.IsStringIn({"NCHW", "NHWC", "AnyLayout"})
.End();
AddOpCompat(OpCompat("mul"))
.AddInput("X")
.IsTensor()
.End()
.AddInput("Y")
.IsTensor()
.End()
.AddOutput("Out")
.IsTensor()
.End()
.AddAttr("x_num_col_dims")
.IsNumGE(1)
.End()
.AddAttr("y_num_col_dims")
.IsNumEQ(1)
.End();
AddOpCompat(OpCompat("matmul_v2"))
.AddInput("X")
.IsTensor()
.End()
.AddInput("Y")
.IsTensor()
.End()
.AddOutput("Out")
.IsTensor()
.End()
.AddAttr("trans_x")
.IsBoolEQ(false)
.End()
.AddAttr("trans_y")
.IsBoolEQ(false)
.End();
AddOpCompat(OpCompat("matmul"))
.AddInput("X")
.IsTensor()
.End()
.AddInput("Y")
.IsTensor()
.End()
.AddOutput("Out")
.IsTensor()
.End()
.AddAttr("alpha")
.IsNumGE(0.99f)
.IsNumLE(1.01f)
.End()
.AddAttr("transpose_X")
.IsBoolEQ(false)
.End()
.AddAttr("transpose_Y")
.IsBoolEQ(false)
.End();
AddOpCompat(OpCompat("fc"))
.AddInput("Input")
.IsTensor()
.End()
.AddInput("W")
.IsTensor()
.End()
.AddInput("Bias")
.IsTensor()
.End()
.AddOutput("Out")
.IsTensor()
.End()
.AddAttr("in_num_col_dims")
.IsNumGE(1)
.End()
.AddAttr("activation_type")
.IsStringIn({"relu", ""})
.End();
AddOpCompat(OpCompat("conv2d_transpose"))
.AddInput("Input")
.IsTensor()
......@@ -379,10 +308,8 @@ void QuantDequantFusePass::DeleteQuant(ir::Graph* graph,
if (quantized_op_type == "conv2d" ||
quantized_op_type == "conv2d_fusion" ||
quantized_op_type == "depthwise_conv2d" ||
quantized_op_type == "fc" ||
quantized_op_type == "conv2d_transpose" ||
quantized_op_type == "mul" || quantized_op_type == "matmul" ||
quantized_op_type == "matmul_v2") {
quantized_op_type == "matrix_multiply") {
op_desc->SetAttr("Input_scale", scale_value);
} else {
PADDLE_THROW(platform::errors::Unimplemented(
......@@ -416,17 +343,14 @@ void QuantDequantFusePass::FuseDequant(ir::Graph* graph,
quantized_op_type == "conv2d_transpose") {
weight_name = "Filter";
input_name = "Input";
} else if (quantized_op_type == "mul" || quantized_op_type == "matmul" ||
quantized_op_type == "matmul_v2") {
} else if (quantized_op_type == "matrix_multiply") {
weight_name = "Y";
input_name = "X";
} else if (quantized_op_type == "fc") {
weight_name = "W";
input_name = "Input";
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"QuantDequantFuse: We only support conv2d, conv2d_fusion, fused_conv2d,"
"conv2d_transpose, fc, mul, matmul, matmul_v2 for now, but received: "
"conv2d_transpose, matrix_multiply(mul/matmul/matmul_v2) for now, but "
"received: "
"%s.",
quantized_op_type));
}
......@@ -514,16 +438,16 @@ void QuantDequantFusePass::FuseDequant(ir::Graph* graph,
// re-write it again when this weight tensor is shared among many ops.
if (!quantized_op_weight_node_set.count(quantized_op_weight_node)) {
quantized_op_weight_node_set.insert(quantized_op_weight_node);
// If quantized op is fc, weight scale size = 1;
// If quantized op is matrix_multiply, weight scale size = 1;
// If quantized op is conv2d, weight scale size = weight dims[0]
// If quantized op is conv2d_transpose, weight scale size = weight dims[1]
if (quantized_op_type == "mul" || quantized_op_type == "matmul" ||
quantized_op_type == "matmul_v2" || quantized_op_type == "fc") {
if (quantized_op_type == "matrix_multiply") {
if (dequant_type == "fake_dequantize_max_abs") {
PADDLE_ENFORCE_EQ(weight_scale.size(),
1,
platform::errors::InvalidArgument(
"mul/matmul/matmul_v2 op weight dequantized by "
"matrix_multiply(mul/matmul/matmul_v2) op "
"weight dequantized by "
"[fake_dequantize_max_abs] "
"requires weight scale size = 1, but got %d.",
weight_scale.size()));
......@@ -538,24 +462,27 @@ void QuantDequantFusePass::FuseDequant(ir::Graph* graph,
quant_axis == 1,
true,
platform::errors::InvalidArgument(
"'quant_axis' of mul/matmul/fc/matmul_v2 op weight "
"'quant_axis' of matrix_multiply(mul/matmul/matmul_v2) op "
"weight "
"dequantized by "
"[fake_channel_wise_dequantize_max_abs]should be 1, but "
"the received is %d",
quant_axis));
}
PADDLE_ENFORCE_EQ(weight_scale.size(),
static_cast<size_t>(w_dims[1]),
platform::errors::InvalidArgument(
"mul/matmul/matmul_v2 op weight dequantized by "
"[fake_channel_wise_dequantize_max_abs] "
"requires weight scale "
"size = 2nd dim of mul/matmul/matmul_v2's "
"weight, which is %d, "
"but got "
"%d.",
static_cast<size_t>(w_dims[1]),
weight_scale.size()));
PADDLE_ENFORCE_EQ(
weight_scale.size(),
static_cast<size_t>(w_dims[1]),
platform::errors::InvalidArgument(
"matrix_multiply(mul/matmul/matmul_v2) op weight dequantized "
"by "
"[fake_channel_wise_dequantize_max_abs] "
"requires weight scale "
"size = 2nd dim of matrix_multiply(mul/matmul/matmul_v2)'s "
"weight, which is %d, "
"but got "
"%d.",
static_cast<size_t>(w_dims[1]),
weight_scale.size()));
for (int j = 0; j < weight_tensor->numel(); j++) {
quantized_weight_data[j] *= weight_scale[j % w_dims[1]];
}
......@@ -650,11 +577,7 @@ void QuantDequantFusePass::FuseDequant(ir::Graph* graph,
quantized_op_type == "conv2d_transpose") {
new_op_desc.SetInput("Input", {new_input});
new_op_desc.SetOutput("Output", {new_output});
} else if (quantized_op_type == "fc") {
new_op_desc.SetInput("Input", {new_input});
new_op_desc.SetOutput("Out", {new_output});
} else if (quantized_op_type == "mul" || quantized_op_type == "matmul" ||
quantized_op_type == "matmul_v2") {
} else if (quantized_op_type == "matrix_multiply") {
new_op_desc.SetInput("X", {new_input});
new_op_desc.SetOutput("Out", {new_output});
}
......@@ -682,12 +605,9 @@ void QuantDequantFusePass::ApplyImpl(ir::Graph* graph) const {
std::unordered_set<std::string> quantized_op_types = {
"conv2d",
"fused_conv2d",
"mul",
"matmul",
"matrix_multiply",
"depthwise_conv2d",
"conv2d_transpose",
"fc",
"matmul_v2",
};
auto* scope = param_scope();
......@@ -712,7 +632,6 @@ REGISTER_PASS_CAPABILITY(quant_conv2d_dequant_fuse_pass)
.AddCombination(
paddle::framework::compatible::OpVersionComparatorCombination()
.LE("conv2d", 1)
.EQ("fc", 0)
.LE("conv2d_transpose", 2)
.EQ("fake_quantize_abs_max", 0)
.EQ("fake_quantize_range_abs_max", 0)
......
......@@ -22,7 +22,7 @@ namespace framework {
namespace ir {
///
/// Fuse quant + conv2d/depthwise_conv2d/mul/fc + dequant
/// Fuse quant + conv2d/depthwise_conv2d/matrix_multiply + dequant
///
class QuantDequantFusePass : public FusePassBase {
public:
......
......@@ -110,12 +110,14 @@ void MultiheadMatmul::operator()() {
.LinksTo({multihead_matmul_out});
}
void Fc::operator()() {
// Create nodes for fc.
auto* fc_input =
pattern->NewNode(fc_input_repr())->assert_is_op_input("fc", "Input");
auto* fc_op = pattern->NewNode(fc_op_repr())->assert_is_op("fc");
fc_op->LinksFrom({fc_input});
void MatrixMultiply::operator()() {
// Create nodes for matrix_multiply.
auto* matrix_multiply_input =
pattern->NewNode(matrix_multiply_input_repr())
->assert_is_op_input("matrix_multiply", "X");
auto* matrix_multiply_op = pattern->NewNode(matrix_multiply_op_repr())
->assert_is_op("matrix_multiply");
matrix_multiply_op->LinksFrom({matrix_multiply_input});
}
void Activation::operator()() {
......@@ -146,6 +148,19 @@ void FusedTokenPrune::operator()() {
fused_token_prune_op->LinksFrom({fused_token_prune_input})
.LinksTo({fused_token_prune_output});
}
void ElementWise::operator()() {
// Create nodes for elementwise.
auto* elementwise_input = pattern->NewNode(elementwise_input_repr())
->assert_is_op_input("elementwise_add", "X");
auto* elementwise_op =
pattern->NewNode(elementwise_op_repr())->assert_is_op("elementwise_add");
auto* elementwise_out = pattern->NewNode(elementwise_out_repr())
->assert_is_op_output("elementwise_add");
// Add links for elementwise op.
elementwise_op->LinksFrom({elementwise_input}).LinksTo({elementwise_out});
}
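(The ElementWise pattern above is consumed by the new handler8 further below, which wraps matching elementwise_add ops with remove_padding/recover_padding and forces their axis attribute to 1.)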
} // namespace patterns
void RemovePaddingRecoverPaddingPass::ApplyImpl(ir::Graph* graph) const {
......@@ -400,38 +415,45 @@ void RemovePaddingRecoverPaddingPass::ApplyImpl(ir::Graph* graph) const {
gpd2(graph, handler2);
GraphPatternDetector gpd3;
patterns::Fc fc(gpd3.mutable_pattern(),
"remove_padding_recover_padding_pass");
fc();
patterns::MatrixMultiply matrix_multiply(
gpd3.mutable_pattern(), "remove_padding_recover_padding_pass");
matrix_multiply();
auto handler3 = [&](const GraphPatternDetector::subgraph_t& subgraph,
Graph* graph) {
VLOG(3) << "remove_padding_recover_padding_pass for transformer: fc";
VLOG(3) << "remove_padding_recover_padding_pass for transformer: "
"matrix_multiply";
GET_IR_NODE_FROM_SUBGRAPH(fc_input, fc_input, fc);
GET_IR_NODE_FROM_SUBGRAPH(fc_op, fc_op, fc);
GET_IR_NODE_FROM_SUBGRAPH(
matrix_multiply_input, matrix_multiply_input, matrix_multiply);
GET_IR_NODE_FROM_SUBGRAPH(
matrix_multiply_op, matrix_multiply_op, matrix_multiply);
std::vector<int64_t> fc_input_shape = fc_input->Var()->GetShape();
std::vector<int64_t> matrix_multiply_input_shape =
matrix_multiply_input->Var()->GetShape();
check_flag = true;
if ((fc_input_shape.size() != multihead_matmul_input_shape.size()) ||
(fc_input_shape.size() != 3)) {
if ((matrix_multiply_input_shape.size() !=
multihead_matmul_input_shape.size()) ||
(matrix_multiply_input_shape.size() != 3)) {
check_flag = false;
VLOG(3) << "Transformer model remove_padding shape check failed, return "
"remove_padding pass.";
return;
}
if (fc_input_shape[0] != multihead_matmul_input_shape[0]) {
if (matrix_multiply_input_shape[0] != multihead_matmul_input_shape[0]) {
check_flag = false;
}
if (fc_input_shape[1] != multihead_matmul_input_shape[1]) {
if (matrix_multiply_input_shape[1] != multihead_matmul_input_shape[1]) {
check_flag = false;
}
if ((fc_input_shape[2] != multihead_matmul_input_shape[2]) &&
(fc_input_shape[2] != 4 * multihead_matmul_input_shape[2])) {
if ((matrix_multiply_input_shape[2] != multihead_matmul_input_shape[2]) &&
(matrix_multiply_input_shape[2] !=
4 * multihead_matmul_input_shape[2])) {
check_flag = false;
}
if (PADDLE_GET_CONST(int, fc_op->Op()->GetAttr("in_num_col_dims")) != 2) {
if (PADDLE_GET_CONST(
int, matrix_multiply_op->Op()->GetAttr("x_num_col_dims")) != 2) {
check_flag = false;
}
if (!check_flag) {
......@@ -439,8 +461,13 @@ void RemovePaddingRecoverPaddingPass::ApplyImpl(ir::Graph* graph) const {
"remove_padding pass.";
return;
}
insert_remove_padding_op(fc_input, fc_op);
insert_recover_padding_op(fc_op, fc_op->outputs[0]);
matrix_multiply_op->Op()->RemoveAttr("x_num_col_dims");
matrix_multiply_op->Op()->SetAttr("x_num_col_dims", 1);
insert_remove_padding_op(matrix_multiply_input, matrix_multiply_op);
insert_recover_padding_op(matrix_multiply_op,
matrix_multiply_op->outputs[0]);
found_subgraph_count++;
};
gpd3(graph, handler3);
......@@ -617,6 +644,57 @@ void RemovePaddingRecoverPaddingPass::ApplyImpl(ir::Graph* graph) const {
};
gpd7(graph, handler7);
// The fc_add fuse has been removed, so elementwise_add can appear in the optimized model
GraphPatternDetector gpd8;
patterns::ElementWise elementwise(gpd8.mutable_pattern(),
"remove_padding_recover_padding_pass");
elementwise();
auto handler8 = [&](const GraphPatternDetector::subgraph_t& subgraph,
Graph* graph) {
VLOG(3) << "remove_padding_recover_padding_pass for transformer: "
"elementwise";
GET_IR_NODE_FROM_SUBGRAPH(
elementwise_input, elementwise_input, elementwise);
GET_IR_NODE_FROM_SUBGRAPH(elementwise_op, elementwise_op, elementwise);
GET_IR_NODE_FROM_SUBGRAPH(elementwise_out, elementwise_out, elementwise);
std::vector<int64_t> elementwise_input_shape =
elementwise_input->Var()->GetShape();
check_flag = true;
if (elementwise_input_shape.size() != multihead_matmul_input_shape.size()) {
check_flag = false;
VLOG(3) << "Transformer model remove_padding shape check failed, return "
"remove_padding pass.";
return;
}
if (elementwise_input_shape[0] != multihead_matmul_input_shape[0]) {
check_flag = false;
}
if (elementwise_input_shape[1] != multihead_matmul_input_shape[1]) {
check_flag = false;
}
if ((elementwise_input_shape[2] != multihead_matmul_input_shape[2]) &&
(elementwise_input_shape[2] != 4 * multihead_matmul_input_shape[2])) {
check_flag = false;
}
if (!check_flag) {
VLOG(3) << "Transformer model remove_padding shape check failed, return "
"remove_padding pass.";
return;
}
elementwise_op->Op()->RemoveAttr("axis");
elementwise_op->Op()->SetAttr("axis", 1);
insert_remove_padding_op(elementwise_input, elementwise_op);
insert_recover_padding_op(elementwise_op, elementwise_out);
found_subgraph_count++;
};
gpd8(graph, handler8);
AddStatis(found_subgraph_count);
}
......
......@@ -87,14 +87,14 @@ struct MultiheadMatmul : public PatternBase {
PATTERN_DECL_NODE(multihead_matmul_out);
};
struct Fc : public PatternBase {
Fc(PDPattern *pattern, const std::string &name_scope)
: PatternBase(pattern, name_scope, "fc") {}
struct MatrixMultiply : public PatternBase {
MatrixMultiply(PDPattern *pattern, const std::string &name_scope)
: PatternBase(pattern, name_scope, "matrix_multiply") {}
void operator()();
PATTERN_DECL_NODE(fc_input);
PATTERN_DECL_NODE(fc_op);
PATTERN_DECL_NODE(matrix_multiply_input);
PATTERN_DECL_NODE(matrix_multiply_op);
};
struct Activation : public PatternBase {
......@@ -118,6 +118,17 @@ struct FusedTokenPrune : public PatternBase {
PATTERN_DECL_NODE(fused_token_prune_op);
PATTERN_DECL_NODE(fused_token_prune_output);
};
struct ElementWise : public PatternBase {
ElementWise(PDPattern *pattern, const std::string &name_scope)
: PatternBase(pattern, name_scope, "elementwise") {}
void operator()();
PATTERN_DECL_NODE(elementwise_input);
PATTERN_DECL_NODE(elementwise_op);
PATTERN_DECL_NODE(elementwise_out);
};
} // namespace patterns
class RemovePaddingRecoverPaddingPass : public FusePassBase {
......
......@@ -64,8 +64,8 @@ namespace patterns {
// output
PDNode* TrtCrossMultiHeadMatmulPattern::operator()() {
std::unordered_set<std::string> mul_ops{"mul", "matmul_v2"};
std::unordered_set<std::string> matmul_ops{"matmul", "matmul_v2"};
std::unordered_set<std::string> mul_ops{"matrix_multiply"};
std::unordered_set<std::string> matmul_ops{"matrix_multiply"};
auto* input0 = pattern->NewNode(input0_repr());
auto* input1 = pattern->NewNode(input1_repr());
......@@ -210,23 +210,6 @@ PDNode* TrtCrossMultiHeadMatmulPattern::operator()() {
} // namespace patterns
TrtCrossMultiHeadMatmulFusePass::TrtCrossMultiHeadMatmulFusePass() {
AddOpCompat(OpCompat("mul"))
.AddInput("X") // the shape shoule be (B, S, N*H)
.IsTensor()
.End()
.AddInput("Y") // the shape shoule be (N*H, N*H)
.IsTensor()
.End()
.AddOutput("Out") // the shape shoule be (B, S, N*H)
.IsTensor()
.End()
.AddAttr("x_num_col_dims")
.IsNumEQ(2)
.End()
.AddAttr("y_num_col_dims")
.IsNumEQ(1)
.End();
AddOpCompat(OpCompat("reshape2"))
.AddInput("X")
.IsTensor()
......@@ -269,43 +252,6 @@ TrtCrossMultiHeadMatmulFusePass::TrtCrossMultiHeadMatmulFusePass() {
// QK (B, H, S, N)*(B, H, S, N) -> (B, H, S, S)
// QKV (B, H, S, S)*(B, H, S, N) -> (B, H, S, N)
AddOpCompat(OpCompat("matmul"))
.AddInput("X")
.IsTensor()
.End()
.AddInput("Y")
.IsTensor()
.End()
.AddOutput("Out")
.IsTensor()
.End()
.AddAttr("alpha")
.IsType<float>() // QK(anyvalue, will copy to new op) QKV(1.0)
.End()
.AddAttr("transpose_X")
.IsBoolEQ(false)
.End()
.AddAttr("transpose_Y") // QK(true) QKV(false)
.IsType<bool>()
.End();
AddOpCompat(OpCompat("matmul_v2"))
.AddInput("X")
.IsTensor()
.End()
.AddInput("Y")
.IsTensor()
.End()
.AddOutput("Out")
.IsTensor()
.End()
.AddAttr("trans_x")
.IsBoolEQ(false)
.End()
.AddAttr("trans_y") // QK(true) QKV(false)
.IsType<bool>()
.End();
AddOpCompat(OpCompat("softmax"))
.AddInput("X")
.IsTensor()
......@@ -584,11 +530,8 @@ REGISTER_PASS(trt_cross_multihead_matmul_fuse_pass,
REGISTER_PASS_CAPABILITY(trt_cross_multihead_matmul_fuse_pass)
.AddCombination(
paddle::framework::compatible::OpVersionComparatorCombination()
.EQ("mul", 0)
.LE("elementwise_add", 1)
.EQ("reshape2", 0)
.EQ("transpose2", 0)
.EQ("scale", 0)
.LE("matmul", 1)
.EQ("matmul_v2", 0)
.EQ("softmax", 0));
......@@ -156,77 +156,6 @@ TrtDeleteWeightQuantDequantLinearOpPass::
.AddAttr("data_format")
.IsStringIn({"NCHW", "NHWC", "AnyLayout"})
.End();
AddOpCompat(OpCompat("mul"))
.AddInput("X")
.IsTensor()
.End()
.AddInput("Y")
.IsTensor()
.End()
.AddOutput("Out")
.IsTensor()
.End()
.AddAttr("x_num_col_dims")
.IsNumGE(1)
.End()
.AddAttr("y_num_col_dims")
.IsNumEQ(1)
.End();
AddOpCompat(OpCompat("matmul_v2"))
.AddInput("X")
.IsTensor()
.End()
.AddInput("Y")
.IsTensor()
.End()
.AddOutput("Out")
.IsTensor()
.End()
.AddAttr("trans_x")
.IsBoolEQ(false)
.End()
.AddAttr("trans_y")
.IsBoolEQ(false)
.End();
AddOpCompat(OpCompat("matmul"))
.AddInput("X")
.IsTensor()
.End()
.AddInput("Y")
.IsTensor()
.End()
.AddOutput("Out")
.IsTensor()
.End()
.AddAttr("alpha")
.IsNumGE(0.99f)
.IsNumLE(1.01f)
.End()
.AddAttr("transpose_X")
.IsBoolEQ(false)
.End()
.AddAttr("transpose_Y")
.IsBoolEQ(false)
.End();
AddOpCompat(OpCompat("fc"))
.AddInput("Input")
.IsTensor()
.End()
.AddInput("W")
.IsTensor()
.End()
.AddInput("Bias")
.IsTensor()
.End()
.AddOutput("Out")
.IsTensor()
.End()
.AddAttr("in_num_col_dims")
.IsNumGE(1)
.End()
.AddAttr("activation_type")
.IsStringIn({"relu", ""})
.End();
AddOpCompat(OpCompat("conv2d_transpose"))
.AddInput("Input")
.IsTensor()
......
......@@ -65,8 +65,8 @@ namespace patterns {
// output
PDNode* TrtFlashMultiHeadMatmulPattern::operator()() {
std::unordered_set<std::string> mul_ops{"mul", "matmul_v2"};
std::unordered_set<std::string> matmul_ops{"matmul", "matmul_v2"};
std::unordered_set<std::string> mul_ops{"matrix_multiply"};
std::unordered_set<std::string> matmul_ops{"matrix_multiply"};
auto* input0 = pattern->NewNode(input0_repr());
input0->assert_is_ops_input(mul_ops);
VLOG(5) << "Start match TrtFlashMultiHeadMatmulPattern";
......@@ -209,23 +209,6 @@ PDNode* TrtFlashMultiHeadMatmulPattern::operator()() {
} // namespace patterns
TrtFlashMultiHeadMatmulFusePass::TrtFlashMultiHeadMatmulFusePass() {
AddOpCompat(OpCompat("mul"))
.AddInput("X") // the shape shoule be (B, S, N*H)
.IsTensor()
.End()
.AddInput("Y") // the shape shoule be (N*H, N*H)
.IsTensor()
.End()
.AddOutput("Out") // the shape shoule be (B, S, N*H)
.IsTensor()
.End()
.AddAttr("x_num_col_dims")
.IsNumEQ(2)
.End()
.AddAttr("y_num_col_dims")
.IsNumEQ(1)
.End();
AddOpCompat(OpCompat("reshape2"))
.AddInput("X")
.IsTensor()
......@@ -268,43 +251,6 @@ TrtFlashMultiHeadMatmulFusePass::TrtFlashMultiHeadMatmulFusePass() {
// QK (B, H, S, N)*(B, H, S, N) -> (B, H, S, S)
// QKV (B, H, S, S)*(B, H, S, N) -> (B, H, S, N)
AddOpCompat(OpCompat("matmul"))
.AddInput("X")
.IsTensor()
.End()
.AddInput("Y")
.IsTensor()
.End()
.AddOutput("Out")
.IsTensor()
.End()
.AddAttr("alpha")
.IsType<float>() // QK(anyvalue, will copy to new op) QKV(1.0)
.End()
.AddAttr("transpose_X")
.IsBoolEQ(false)
.End()
.AddAttr("transpose_Y") // QK(true) QKV(false)
.IsType<bool>()
.End();
AddOpCompat(OpCompat("matmul_v2"))
.AddInput("X")
.IsTensor()
.End()
.AddInput("Y")
.IsTensor()
.End()
.AddOutput("Out")
.IsTensor()
.End()
.AddAttr("trans_x")
.IsBoolEQ(false)
.End()
.AddAttr("trans_y") // QK(true) QKV(false)
.IsType<bool>()
.End();
AddOpCompat(OpCompat("softmax"))
.AddInput("X")
.IsTensor()
......@@ -578,11 +524,8 @@ REGISTER_PASS(trt_flash_multihead_matmul_fuse_pass,
REGISTER_PASS_CAPABILITY(trt_flash_multihead_matmul_fuse_pass)
.AddCombination(
paddle::framework::compatible::OpVersionComparatorCombination()
.EQ("mul", 0)
.LE("elementwise_add", 1)
.EQ("reshape2", 0)
.EQ("transpose2", 0)
.EQ("scale", 0)
.LE("matmul", 1)
.EQ("matmul_v2", 0)
.EQ("softmax", 0));
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
#include "paddle/fluid/framework/ir/pass.h"
namespace paddle {
namespace framework {
namespace ir {
class Graph;
class TrtMapMatmul2MulPass : public FusePassBase {
public:
TrtMapMatmul2MulPass();
virtual ~TrtMapMatmul2MulPass() {}
protected:
void ApplyImpl(Graph* graph) const override;
};
/*
* Map matmul_v2 to mul, the same as TrtMapMatmul2MulPass.
*/
class TrtMapMatmulV2ToMulPass : public FusePassBase {
public:
TrtMapMatmulV2ToMulPass();
virtual ~TrtMapMatmulV2ToMulPass() {}
protected:
void ApplyImpl(Graph* graph) const override;
};
/*
* Map matmul_v2 to matmul; broadcast is not supported.
*/
class TrtMapMatmulV2ToMatmulPass : public FusePassBase {
public:
TrtMapMatmulV2ToMatmulPass();
virtual ~TrtMapMatmulV2ToMatmulPass() {}
protected:
void ApplyImpl(Graph* graph) const override;
};
/*
* Fuse squeeze2+matmul to mul, so the optimization can use fc_fuse_pass.
* The squeeze2 op must satisfy the following conditions:
* 1. the rank of input X is 4
* 2. the axis attr is [2, 3]
* 3. the next op is only matmul
*
* The matmul op must satisfy the following conditions:
* 1. the transpose_X and transpose_Y attrs are false
* 2. the alpha attr is 1.0
* 3. the rank of input X and Y is 2
* 4. the next op of matmul is only elementwise_add
*
* Notice:
* the rank of input activation is obtained from var_desc,
* and it may change at runtime. Therefore, the pass applies
* the above conditions to reduce the impact on other models.
*/
class TrtSqueeze2MatmulFusePass : public FusePassBase {
public:
TrtSqueeze2MatmulFusePass();
virtual ~TrtSqueeze2MatmulFusePass() {}
protected:
void ApplyImpl(Graph* graph) const override;
};
/*
* Fuse reshape2+matmul to mul, so the optimization can use fc_fuse_pass.
* The reshape2 op must satisfy the following conditions:
* 1. reshape2 has one input node, which means it doesn't
* have a Shape or ShapeTensor input
* 2. the rank of input X is 4 and the last two dims of input X is 1
* 3. the rank of shape attr is 2
* 4. the next op is only matmul
*
* The matmul op must satisfy the following conditions:
* 1. the transpose_X and transpose_Y attrs are false
* 2. the alpha attr is 1.0
* 3. the rank of input X and Y is 2
* 4. the next op of matmul is only elementwise_add
*
* Notice:
* the shape and rank of input activation are obtained from var_desc,
* and they may change at runtime. Therefore, the pass applies
* the above conditions to reduce the impact on other models.
*/
class TrtReshape2MatmulFusePass : public FusePassBase {
public:
TrtReshape2MatmulFusePass();
virtual ~TrtReshape2MatmulFusePass() {}
protected:
void ApplyImpl(Graph* graph) const override;
};
class TrtFlatten2MatmulFusePass : public FusePassBase {
public:
TrtFlatten2MatmulFusePass();
virtual ~TrtFlatten2MatmulFusePass() {}
protected:
void ApplyImpl(Graph* graph) const override;
};
} // namespace ir
} // namespace framework
} // namespace paddle
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/trt_map_ops_to_matrix_multiply_pass.h"
#include <cmath>
#include <string>
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
#include "paddle/fluid/framework/op_proto_maker.h"
#include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace framework {
namespace ir {
class Node;
TrtMapOpsToMatrixMultiplyPass::TrtMapOpsToMatrixMultiplyPass() {}
void TrtMapOpsToMatrixMultiplyPass::ApplyImpl(ir::Graph* graph) const {
PADDLE_ENFORCE_NOT_NULL(
graph, platform::errors::InvalidArgument("Graph cannot be nullptr."));
std::string name_scope = "trt_map_ops_to_matrix_multiply_pass";
FusePassBase::Init(name_scope, graph);
std::unordered_set<std::string> ops_type = {"mul", "matmul", "matmul_v2"};
GraphPatternDetector gpd;
patterns::MulMatmulMatmulV2 mul_matmul_matmul_v2(gpd.mutable_pattern(),
name_scope);
mul_matmul_matmul_v2(ops_type);
int found_count = 0;
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
Graph* g) {
bool with_dynamic_shape = Get<bool>("with_dynamic_shape");
if (!with_dynamic_shape) {
VLOG(3)
<< "TrtMapOpsToMatrixMultiplyPass need with_dynamic_shape, stop this "
"pass."
"Please reconfig 'SetTRTDynamicShapeInfo'. You can refer to the "
"https://github.com/PaddlePaddle/Paddle-Inference-Demo/blob/"
"master/c%2B%2B/gpu/resnet50/resnet50_test.cc";
return;
}
VLOG(4) << "trt map some ops to matrix_multiply";
GET_IR_NODE_FROM_SUBGRAPH(ops, ops, mul_matmul_matmul_v2);
GET_IR_NODE_FROM_SUBGRAPH(ops_out, ops_out, mul_matmul_matmul_v2);
OpDesc desc(ops->Op()->Block());
desc.SetType("matrix_multiply");
desc.SetInput("X", {ops->Op()->Input("X").front()});
desc.SetInput("Y", {ops->Op()->Input("Y").front()});
desc.SetOutput("Out", {ops_out->Name()});
if (ops->Op()->HasAttr("transpose_X") || ops->Op()->HasAttr("trans_x")) {
if (ops->Op()->HasAttr("transpose_X")) {
desc.SetAttr("transpose_x", ops->Op()->GetAttr("transpose_X"));
} else {
desc.SetAttr("transpose_x", ops->Op()->GetAttr("trans_x"));
}
} else {
desc.SetAttr("transpose_x", false);
}
if (ops->Op()->HasAttr("transpose_Y") || ops->Op()->HasAttr("trans_y")) {
if (ops->Op()->HasAttr("transpose_Y")) {
desc.SetAttr("transpose_y", ops->Op()->GetAttr("transpose_Y"));
} else {
desc.SetAttr("transpose_y", ops->Op()->GetAttr("trans_y"));
}
} else {
desc.SetAttr("transpose_y", false);
}
if (ops->Op()->HasAttr("out_threshold")) {
desc.SetAttr("out_threshold", ops->Op()->GetAttr("out_threshold"));
}
// TODO: remove attrs (x_num_col_dims, y_num_col_dims, alpha)
if (ops->Op()->HasAttr("x_num_col_dims")) {
desc.SetAttr("x_num_col_dims", ops->Op()->GetAttr("x_num_col_dims"));
} else {
int32_t x_num_col_dims = -1;
desc.SetAttr("x_num_col_dims", x_num_col_dims);
}
// op_teller: Only support y_num_col_dims == y.rank - 1;
int32_t y_num_col_dims = -1;
desc.SetAttr("y_num_col_dims", y_num_col_dims);
float alpha = 1;
if (ops->Op()->HasAttr("alpha")) {
alpha = PADDLE_GET_CONST(float, ops->Op()->GetAttr("alpha"));
}
desc.SetAttr("alpha", alpha);
auto matrix_multiply_node = g->CreateOpNode(&desc);
for (auto node : ops->inputs) {
IR_NODE_LINK_TO(node, matrix_multiply_node);
}
IR_NODE_LINK_TO(matrix_multiply_node, ops_out);
GraphSafeRemoveNodes(graph, {ops});
++found_count;
};
gpd(graph, handler);
AddStatis(found_count);
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(trt_map_ops_to_matrix_multiply_pass,
paddle::framework::ir::TrtMapOpsToMatrixMultiplyPass);
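To make the attribute normalization above concrete, an illustrative before/after mapping (the attribute values are assumed examples; per the handler, transpose_x/transpose_y default to false, y_num_col_dims is always forced to -1, and alpha defaults to 1.0):

//   matmul(X, Y, transpose_X=false, transpose_Y=true, alpha=0.125)
//     -> matrix_multiply(X, Y, transpose_x=false, transpose_y=true,
//                        x_num_col_dims=-1, y_num_col_dims=-1, alpha=0.125)
//   mul(X, Y, x_num_col_dims=2, y_num_col_dims=1)
//     -> matrix_multiply(X, Y, transpose_x=false, transpose_y=false,
//                        x_num_col_dims=2, y_num_col_dims=-1, alpha=1.0)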
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
#include "paddle/fluid/framework/ir/pass.h"
namespace paddle {
namespace framework {
namespace ir {
class Graph;
class TrtMapOpsToMatrixMultiplyPass : public FusePassBase {
public:
TrtMapOpsToMatrixMultiplyPass();
virtual ~TrtMapOpsToMatrixMultiplyPass() {}
protected:
void ApplyImpl(Graph* graph) const override;
};
} // namespace ir
} // namespace framework
} // namespace paddle
......@@ -257,18 +257,16 @@ static int BuildFusion(Graph* graph, const std::string& name_scope) {
}
PDNode* TrtMultiHeadMatmulPattern::operator()() {
std::unordered_set<std::string> mul_ops{"mul", "matmul_v2"};
std::unordered_set<std::string> matmul_ops{"matmul", "matmul_v2"};
auto* input0 = pattern->NewNode(input0_repr());
input0->assert_is_ops_input(mul_ops);
input0->assert_is_op_input("matrix_multiply");
// First path with scale
auto* mul0 = pattern->NewNode(mul0_repr())->assert_is_ops(mul_ops);
auto* mul0 = pattern->NewNode(mul0_repr())->assert_is_op("matrix_multiply");
auto* mul0_w_var = pattern->NewNode(mul0_w_repr())
->AsInput()
->assert_is_ops_input(mul_ops, "Y");
->assert_is_op_input("matrix_multiply", "Y");
auto* mul0_out_var =
pattern->NewNode(mul0_out_repr())->assert_is_ops_output(mul_ops);
pattern->NewNode(mul0_out_repr())->assert_is_op_output("matrix_multiply");
decltype(mul0) eltadd0;
decltype(mul0) eltadd0_b_var;
......@@ -301,12 +299,12 @@ PDNode* TrtMultiHeadMatmulPattern::operator()() {
auto* scale = pattern->NewNode(scale_repr())->assert_is_op("scale");
auto* scale_out_var =
pattern->NewNode(scale_out_repr())->assert_is_op_output("scale");
scale_out_var->AsIntermediate()->assert_is_ops_input(matmul_ops);
scale_out_var->AsIntermediate()->assert_is_op_input("matrix_multiply");
auto* matmul_qk =
pattern->NewNode(matmul_qk_repr())->assert_is_ops(matmul_ops);
auto* matmul_qk_out_var =
pattern->NewNode(matmul_qk_out_repr())->assert_is_ops_output(matmul_ops);
pattern->NewNode(matmul_qk_repr())->assert_is_op("matrix_multiply");
auto* matmul_qk_out_var = pattern->NewNode(matmul_qk_out_repr())
->assert_is_op_output("matrix_multiply");
matmul_qk_out_var->AsIntermediate()->assert_is_op_input("elementwise_add");
auto* eltadd_qk =
......@@ -322,12 +320,12 @@ PDNode* TrtMultiHeadMatmulPattern::operator()() {
pattern->NewNode(softmax_qk_repr())->assert_is_op("softmax");
auto* softmax_qk_out_var =
pattern->NewNode(softmax_qk_out_repr())->assert_is_op_output("softmax");
softmax_qk_out_var->AsIntermediate()->assert_is_ops_input(matmul_ops);
softmax_qk_out_var->AsIntermediate()->assert_is_op_input("matrix_multiply");
auto* matmul_qkv =
pattern->NewNode(matmul_qkv_repr())->assert_is_ops(matmul_ops);
auto* matmul_qkv_out_var =
pattern->NewNode(matmul_qkv_out_repr())->assert_is_ops_output(matmul_ops);
pattern->NewNode(matmul_qkv_repr())->assert_is_op("matrix_multiply");
auto* matmul_qkv_out_var = pattern->NewNode(matmul_qkv_out_repr())
->assert_is_op_output("matrix_multiply");
matmul_qkv_out_var->AsIntermediate()->assert_is_op_input("transpose2");
auto* transpose2_qkv =
......@@ -340,15 +338,14 @@ PDNode* TrtMultiHeadMatmulPattern::operator()() {
pattern->NewNode(reshape2_qkv_repr())->assert_is_op("reshape2");
auto* reshape2_qkv_out_var = pattern->NewNode(reshape2_qkv_out_repr())
->assert_is_op_output("reshape2");
reshape2_qkv_out_var->assert_is_ops_input(mul_ops);
// Second path to matmul
auto* mul1 = pattern->NewNode(mul1_repr())->assert_is_ops(mul_ops);
auto* mul1 = pattern->NewNode(mul1_repr())->assert_is_op("matrix_multiply");
auto* mul1_w_var = pattern->NewNode(mul1_w_repr())
->AsInput()
->assert_is_ops_input(mul_ops, "Y");
->assert_is_op_input("matrix_multiply", "Y");
auto* mul1_out_var =
pattern->NewNode(mul1_out_repr())->assert_is_ops_output(mul_ops);
pattern->NewNode(mul1_out_repr())->assert_is_op_output("matrix_multiply");
decltype(mul1) eltadd1;
decltype(mul1) eltadd1_b_var;
......@@ -375,16 +372,16 @@ PDNode* TrtMultiHeadMatmulPattern::operator()() {
pattern->NewNode(transpose2_1_repr())->assert_is_op("transpose2");
auto* transpose2_1_out_var = pattern->NewNode(transpose2_1_out_repr())
->assert_is_op_output("transpose2");
transpose2_1_out_var->AsIntermediate()->assert_is_ops_input(
matmul_ops); // link to matmul qk
transpose2_1_out_var->AsIntermediate()->assert_is_op_input(
"matrix_multiply"); // link to matmul qk
// Third path to matmul
auto* mul2 = pattern->NewNode(mul2_repr())->assert_is_ops(mul_ops);
auto* mul2 = pattern->NewNode(mul2_repr())->assert_is_op("matrix_multiply");
auto* mul2_w_var = pattern->NewNode(mul2_w_repr())
->AsInput()
->assert_is_ops_input(mul_ops, "Y");
->assert_is_op_input("matrix_multiply", "Y");
auto* mul2_out_var =
pattern->NewNode(mul2_out_repr())->assert_is_ops_output(mul_ops);
pattern->NewNode(mul2_out_repr())->assert_is_op_output("matrix_multiply");
decltype(mul2) eltadd2;
decltype(mul2) eltadd2_b_var;
......@@ -411,8 +408,8 @@ PDNode* TrtMultiHeadMatmulPattern::operator()() {
pattern->NewNode(transpose2_2_repr())->assert_is_op("transpose2");
auto* transpose2_2_out_var = pattern->NewNode(transpose2_2_out_repr())
->assert_is_op_output("transpose2");
transpose2_2_out_var->AsIntermediate()->assert_is_ops_input(
matmul_ops); // link to matmul qkv
transpose2_2_out_var->AsIntermediate()->assert_is_op_input(
"matrix_multiply"); // link to matmul qkv
// Q path
mul0->LinksFrom({input0, mul0_w_var}).LinksTo({mul0_out_var});
......@@ -449,17 +446,16 @@ PDNode* TrtMultiHeadMatmulPattern::operator()() {
}
PDNode* TrtMultiHeadMatmulV3Pattern::operator()() {
std::unordered_set<std::string> matmul_ops{"matmul", "matmul_v2"};
auto* input0 = pattern->NewNode(input0_repr());
input0->assert_is_ops_input(matmul_ops);
input0->assert_is_op_input("matrix_multiply");
// First path with scale
auto* mul0 = pattern->NewNode(mul0_repr())->assert_is_ops(matmul_ops);
auto* mul0 = pattern->NewNode(mul0_repr())->assert_is_op("matrix_multiply");
auto* mul0_w_var = pattern->NewNode(mul0_w_repr())
->AsInput()
->assert_is_ops_input(matmul_ops, "Y");
->assert_is_op_input("matrix_multiply", "Y");
auto* mul0_out_var =
pattern->NewNode(mul0_out_repr())->assert_is_ops_output(matmul_ops);
pattern->NewNode(mul0_out_repr())->assert_is_op_output("matrix_multiply");
decltype(mul0) eltadd0;
decltype(mul0) eltadd0_b_var;
......@@ -487,12 +483,13 @@ PDNode* TrtMultiHeadMatmulV3Pattern::operator()() {
pattern->NewNode(transpose2_0_repr())->assert_is_op("transpose2");
auto* transpose2_0_out_var = pattern->NewNode(transpose2_0_out_repr())
->assert_is_op_output("transpose2");
transpose2_0_out_var->AsIntermediate()->assert_is_ops_input(matmul_ops, "X");
transpose2_0_out_var->AsIntermediate()->assert_is_op_input("matrix_multiply",
"X");
auto* matmul_qk =
pattern->NewNode(matmul_qk_repr())->assert_is_ops(matmul_ops);
auto* matmul_qk_out_var =
pattern->NewNode(matmul_qk_out_repr())->assert_is_ops_output(matmul_ops);
pattern->NewNode(matmul_qk_repr())->assert_is_op("matrix_multiply");
auto* matmul_qk_out_var = pattern->NewNode(matmul_qk_out_repr())
->assert_is_op_output("matrix_multiply");
matmul_qk_out_var->AsIntermediate()->assert_is_op_input("elementwise_add");
auto* eltadd_qk =
......@@ -508,12 +505,12 @@ PDNode* TrtMultiHeadMatmulV3Pattern::operator()() {
pattern->NewNode(softmax_qk_repr())->assert_is_op("softmax");
auto* softmax_qk_out_var =
pattern->NewNode(softmax_qk_out_repr())->assert_is_op_output("softmax");
softmax_qk_out_var->AsIntermediate()->assert_is_ops_input(matmul_ops);
softmax_qk_out_var->AsIntermediate()->assert_is_op_input("matrix_multiply");
auto* matmul_qkv =
pattern->NewNode(matmul_qkv_repr())->assert_is_ops(matmul_ops);
auto* matmul_qkv_out_var =
pattern->NewNode(matmul_qkv_out_repr())->assert_is_ops_output(matmul_ops);
pattern->NewNode(matmul_qkv_repr())->assert_is_op("matrix_multiply");
auto* matmul_qkv_out_var = pattern->NewNode(matmul_qkv_out_repr())
->assert_is_op_output("matrix_multiply");
matmul_qkv_out_var->AsIntermediate()->assert_is_op_input("transpose2");
auto* transpose2_qkv =
......@@ -526,14 +523,13 @@ PDNode* TrtMultiHeadMatmulV3Pattern::operator()() {
pattern->NewNode(reshape2_qkv_repr())->assert_is_op("reshape2");
auto* reshape2_qkv_out_var = pattern->NewNode(reshape2_qkv_out_repr())
->assert_is_op_output("reshape2");
reshape2_qkv_out_var->assert_is_ops_input(matmul_ops);
// Second path to matmul
auto* mul1 = pattern->NewNode(mul1_repr())->assert_is_ops(matmul_ops);
auto* mul1 = pattern->NewNode(mul1_repr())->assert_is_op("matrix_multiply");
auto* mul1_w_var = pattern->NewNode(mul1_w_repr())
->AsInput()
->assert_is_ops_input(matmul_ops, "Y");
->assert_is_op_input("matrix_multiply", "Y");
auto* mul1_out_var =
pattern->NewNode(mul1_out_repr())->assert_is_ops_output(matmul_ops);
pattern->NewNode(mul1_out_repr())->assert_is_op_output("matrix_multiply");
decltype(mul1) eltadd1;
decltype(mul1) eltadd1_b_var;
......@@ -560,16 +556,16 @@ PDNode* TrtMultiHeadMatmulV3Pattern::operator()() {
pattern->NewNode(transpose2_1_repr())->assert_is_op("transpose2");
auto* transpose2_1_out_var = pattern->NewNode(transpose2_1_out_repr())
->assert_is_op_output("transpose2");
transpose2_1_out_var->AsIntermediate()->assert_is_ops_input(
matmul_ops, "Y"); // link to matmul qk
transpose2_1_out_var->AsIntermediate()->assert_is_op_input(
"matrix_multiply", "Y"); // link to matmul qk
// Third path to matmul
auto* mul2 = pattern->NewNode(mul2_repr())->assert_is_ops(matmul_ops);
auto* mul2 = pattern->NewNode(mul2_repr())->assert_is_op("matrix_multiply");
auto* mul2_w_var = pattern->NewNode(mul2_w_repr())
->AsInput()
->assert_is_ops_input(matmul_ops, "Y");
->assert_is_op_input("matrix_multiply", "Y");
auto* mul2_out_var =
pattern->NewNode(mul2_out_repr())->assert_is_ops_output(matmul_ops);
pattern->NewNode(mul2_out_repr())->assert_is_op_output("matrix_multiply");
decltype(mul2) eltadd2;
decltype(mul2) eltadd2_b_var;
......@@ -596,8 +592,8 @@ PDNode* TrtMultiHeadMatmulV3Pattern::operator()() {
pattern->NewNode(transpose2_2_repr())->assert_is_op("transpose2");
auto* transpose2_2_out_var = pattern->NewNode(transpose2_2_out_repr())
->assert_is_op_output("transpose2");
transpose2_2_out_var->AsIntermediate()->assert_is_ops_input(
matmul_ops); // link to matmul qkv
transpose2_2_out_var->AsIntermediate()->assert_is_op_input(
"matrix_multiply"); // link to matmul qkv
// Q path
mul0->LinksFrom({input0, mul0_w_var}).LinksTo({mul0_out_var});
......@@ -642,23 +638,6 @@ void TrtMultiHeadMatmulFusePass::ApplyImpl(Graph* graph) const {
}
TrtMultiHeadMatmulV2FusePass::TrtMultiHeadMatmulV2FusePass() {
AddOpCompat(OpCompat("mul"))
.AddInput("X") // the shape shoule be (B, S, N*H)
.IsTensor()
.End()
.AddInput("Y") // the shape shoule be (N*H, N*H)
.IsTensor()
.End()
.AddOutput("Out") // the shape shoule be (B, S, N*H)
.IsTensor()
.End()
.AddAttr("x_num_col_dims")
.IsNumEQ(2)
.End()
.AddAttr("y_num_col_dims")
.IsNumEQ(1)
.End();
AddOpCompat(OpCompat("elementwise_add"))
.AddInput("X")
// in bias, shape is (B, S, N*H),
......@@ -738,45 +717,6 @@ TrtMultiHeadMatmulV2FusePass::TrtMultiHeadMatmulV2FusePass() {
.IsType<bool>()
.End();
// QK (B, H, S, N)*(B, H, S, N) -> (B, H, S, S)
// QKV (B, H, S, S)*(B, H, S, N) -> (B, H, S, N)
AddOpCompat(OpCompat("matmul"))
.AddInput("X")
.IsTensor()
.End()
.AddInput("Y")
.IsTensor()
.End()
.AddOutput("Out")
.IsTensor()
.End()
.AddAttr("alpha")
.IsNumEQ(1.0f)
.End()
.AddAttr("transpose_X")
.IsBoolEQ(false)
.End()
.AddAttr("transpose_Y") // QK(true) QKV(false)
.IsType<bool>()
.End();
AddOpCompat(OpCompat("matmul_v2"))
.AddInput("X")
.IsTensor()
.End()
.AddInput("Y")
.IsTensor()
.End()
.AddOutput("Out")
.IsTensor()
.End()
.AddAttr("trans_x")
.IsType<bool>()
.End()
.AddAttr("trans_y")
.IsType<bool>()
.End();
AddOpCompat(OpCompat("softmax"))
.AddInput("X")
.IsTensor()
......@@ -1187,23 +1127,6 @@ void TrtMultiHeadMatmulV2FusePass::ApplyImpl(Graph* graph) const {
}
TrtMultiHeadMatmulV3FusePass::TrtMultiHeadMatmulV3FusePass() {
AddOpCompat(OpCompat("mul"))
.AddInput("X") // the shape shoule be (B, S, N*H)
.IsTensor()
.End()
.AddInput("Y") // the shape shoule be (N*H, N*H)
.IsTensor()
.End()
.AddOutput("Out") // the shape shoule be (B, S, N*H)
.IsTensor()
.End()
.AddAttr("x_num_col_dims")
.IsNumEQ(2)
.End()
.AddAttr("y_num_col_dims")
.IsNumEQ(1)
.End();
AddOpCompat(OpCompat("elementwise_add"))
.AddInput("X")
// in bias, shape is (B, S, N*H),
......@@ -1266,45 +1189,6 @@ TrtMultiHeadMatmulV3FusePass::TrtMultiHeadMatmulV3FusePass() {
.IsType<std::vector<int>>()
.End();
// QK (B, H, S, N)*(B, H, S, N) -> (B, H, S, S)
// QKV (B, H, S, S)*(B, H, S, N) -> (B, H, S, N)
AddOpCompat(OpCompat("matmul"))
.AddInput("X")
.IsTensor()
.End()
.AddInput("Y")
.IsTensor()
.End()
.AddOutput("Out")
.IsTensor()
.End()
.AddAttr("alpha")
.IsType<float>() // QK(anyvalue, will copy to new op) QKV(1.0)
.End()
.AddAttr("transpose_X")
.IsBoolEQ(false)
.End()
.AddAttr("transpose_Y") // QK(true) QKV(false)
.IsType<bool>()
.End();
AddOpCompat(OpCompat("matmul_v2"))
.AddInput("X")
.IsTensor()
.End()
.AddInput("Y")
.IsTensor()
.End()
.AddOutput("Out")
.IsTensor()
.End()
.AddAttr("trans_x")
.IsBoolEQ(false)
.End()
.AddAttr("trans_y") // QK(true) QKV(false)
.IsType<bool>()
.End();
AddOpCompat(OpCompat("softmax"))
.AddInput("X")
.IsTensor()
......@@ -1672,12 +1556,10 @@ REGISTER_PASS(trt_multihead_matmul_fuse_pass_v3,
REGISTER_PASS_CAPABILITY(trt_multihead_matmul_fuse_pass_v2)
.AddCombination(
paddle::framework::compatible::OpVersionComparatorCombination()
.EQ("mul", 0)
.LE("elementwise_add", 1)
.EQ("reshape2", 0)
.EQ("transpose2", 0)
.EQ("scale", 0)
.LE("matmul", 1)
.EQ("softmax", 0));
REGISTER_PASS_CAPABILITY(trt_multihead_matmul_fuse_pass_v3)
......@@ -1687,6 +1569,4 @@ REGISTER_PASS_CAPABILITY(trt_multihead_matmul_fuse_pass_v3)
.EQ("reshape2", 0)
.EQ("transpose2", 0)
.EQ("scale", 0)
.LE("matmul", 1)
.EQ("matmul_v2", 0)
.EQ("softmax", 0));
......@@ -176,10 +176,17 @@ void TrtSkipLayerNormFusePass::ApplyImpl(ir::Graph *graph) const {
new_desc.SetInput("Bias", {layer_norm_bias->Name()});
if (layer_norm->Op()->HasAttr("out_threshold")) {
new_desc.SetAttr("enable_int8", true);
new_desc.SetAttr("out_threshold",
layer_norm->Op()->GetAttr("out_threshold"));
}
if (subgraph.at(x)->inputs[0]->Op()->HasAttr("out_threshold")) {
new_desc.SetAttr(
"X", subgraph.at(x)->inputs[0]->Op()->GetAttr("out_threshold"));
}
if (subgraph.at(y)->inputs[0]->Op()->HasAttr("out_threshold")) {
new_desc.SetAttr(
"Y", subgraph.at(y)->inputs[0]->Op()->GetAttr("out_threshold"));
}
if (layer_norm->Op()->HasAttr("smooth_scale")) {
new_desc.SetAttr("smooth_scale",
......
......@@ -79,7 +79,7 @@ void VitAttentionFusePass::ApplyImpl(ir::Graph* graph) const {
auto* scope = param_scope();
// pattern
std::unordered_set<std::string> matmul_ops{"matmul", "matmul_v2"};
std::unordered_set<std::string> matmul_ops{"matrix_multiply"};
PDNode* x = gpd.mutable_pattern()
->NewNode("x")
->assert_is_ops_input(matmul_ops, "X")
......@@ -173,5 +173,4 @@ REGISTER_PASS_CAPABILITY(vit_attention_fuse_pass)
.EQ("transpose2", 0)
.EQ("slice", 0)
.EQ("scale", 0)
.EQ("softmax", 0)
.EQ("matmul_v2", 0));
.EQ("softmax", 0));
......@@ -2552,13 +2552,11 @@ USE_TRT_CONVERTER(transpose);
USE_TRT_CONVERTER(transpose2);
USE_TRT_CONVERTER(flatten);
USE_TRT_CONVERTER(flatten_contiguous_range);
USE_TRT_CONVERTER(matmul);
USE_TRT_CONVERTER(matmul_v2);
USE_TRT_CONVERTER(matrix_multiply);
USE_TRT_CONVERTER(bmm);
USE_TRT_CONVERTER(conv2d);
USE_TRT_CONVERTER(relu);
USE_TRT_CONVERTER(sigmoid);
USE_TRT_CONVERTER(fc);
USE_TRT_CONVERTER(pool2d);
USE_TRT_CONVERTER(softmax);
USE_TRT_CONVERTER(batch_norm);
......
......@@ -86,17 +86,17 @@ void PaddlePassBuilder::ClearPasses() { passes_.clear(); }
const std::vector<std::string> kTRTSubgraphPasses({
"trt_support_nhwc_pass",
"adaptive_pool2d_convert_global_pass", //
"shuffle_channel_detect_pass", //
"quant_conv2d_dequant_fuse_pass", //
"delete_fill_constant_op_pass", //
"delete_quant_dequant_op_pass", //
"delete_quant_dequant_filter_op_pass", //
"trt_delete_weight_dequant_linear_op_pass", //
"delete_quant_dequant_linear_op_pass", //
"identity_scale_op_clean_pass", //
"add_support_int8_pass", //
// "fc_fuse_pass", //
"adaptive_pool2d_convert_global_pass", //
"trt_map_ops_to_matrix_multiply_pass", //
"shuffle_channel_detect_pass", //
"quant_conv2d_dequant_fuse_pass", //
"delete_fill_constant_op_pass", //
"delete_quant_dequant_op_pass", //
"delete_quant_dequant_filter_op_pass", //
"trt_delete_weight_dequant_linear_op_pass", //
"delete_quant_dequant_linear_op_pass", //
"identity_scale_op_clean_pass", //
"add_support_int8_pass", //
"simplify_with_basic_ops_pass", //
"trt_embedding_eltwise_layernorm_fuse_pass", //
"preln_embedding_eltwise_layernorm_fuse_pass", //
......@@ -119,18 +119,12 @@ const std::vector<std::string> kTRTSubgraphPasses({
"trt_skip_layernorm_fuse_pass", //
"preln_skip_layernorm_fuse_pass", //
#endif
"preln_residual_bias_fuse_pass", //
"preln_layernorm_x_fuse_pass", //
"reverse_roll_fuse_pass", //
"conv_bn_fuse_pass", //
"unsqueeze2_eltwise_fuse_pass", //
"trt_squeeze2_matmul_fuse_pass", //
"trt_flatten2_matmul_fuse_pass", //
"trt_map_matmul_v2_to_mul_pass", //
"trt_map_matmul_v2_to_matmul_pass", //
"trt_map_matmul_to_mul_pass", //
"fc_fuse_pass", //
"conv_elementwise_add_fuse_pass", //
"preln_residual_bias_fuse_pass", //
"preln_layernorm_x_fuse_pass", //
"reverse_roll_fuse_pass", //
"conv_bn_fuse_pass", //
"unsqueeze2_eltwise_fuse_pass", //
"conv_elementwise_add_fuse_pass", //
#if defined _WIN32 // Windows CI is TensorRT7.0. Remove this after upgrading.
#else
"trans_layernorm_fuse_pass", //
......@@ -216,10 +210,6 @@ const std::vector<std::string> kTrtLowerPrecisionPasses{
// "conv_eltwiseadd_bn_fuse_pass",
"trt_embedding_eltwise_layernorm_fuse_pass",
"trt_skip_layernorm_fuse_pass",
"trt_map_matmul_v2_to_mul_pass",
"trt_map_matmul_v2_to_matmul_pass",
"trt_map_matmul_to_mul_pass",
"fc_fuse_pass",
"tensorrt_subgraph_pass",
};
......
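Net effect on the pass lists: trt_map_ops_to_matrix_multiply_pass now runs near the front of kTRTSubgraphPasses, while the matmul-mapping passes (trt_squeeze2_matmul, trt_flatten2_matmul, trt_map_matmul_v2_to_mul, trt_map_matmul_v2_to_matmul, trt_map_matmul_to_mul) and fc_fuse_pass are dropped from both kTRTSubgraphPasses and kTrtLowerPrecisionPasses.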
......@@ -2,11 +2,9 @@
list(
APPEND
CONVERT_FILES
matmul_op.cc
matmul_v2_op.cc
matrix_multiply_op.cc
bmm_op.cc
conv2d_op.cc
fc_op.cc
pool2d_op.cc
elementwise_op.cc
batch_norm_op.cc
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
namespace paddle {
namespace inference {
namespace tensorrt {
namespace {
template <typename T>
void tranpose_weight(const T* src, T* dst, int m, int n) {
for (int i = 0; i < m; i++) {
for (int j = 0; j < n; j++) {
dst[j * m + i] = src[i * n + j];
}
}
}
} // namespace
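A quick illustration of the tranpose_weight helper above (hypothetical values, for exposition only); it writes the row-major transpose of an m x n source matrix into dst:

//   float src[6] = {1, 2, 3, 4, 5, 6};   // 2 x 3, row-major
//   float dst[6];
//   tranpose_weight(src, dst, /*m=*/2, /*n=*/3);
//   // dst now holds {1, 4, 2, 5, 3, 6}  // 3 x 2, row-major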
/*
* The FC converter converts a MUL op in Fluid to an FC layer in TRT.
*/
class FcOpConverter : public OpConverter {
public:
nvinfer1::ILayer* reshape_before_fc(nvinfer1::ITensor* before_fc,
nvinfer1::Dims x_dim,
int x_num_col_dims,
std::string output_name) {
// add shuffle before fc
nvinfer1::Dims reshape_before_fc_dim;
reshape_before_fc_dim.nbDims = x_num_col_dims + 3;
// padding shape "* x q x 1 x 1"
nvinfer1::ITensor* filal_reshape_before_fc_shape_tensor = nullptr;
if (!engine_->with_dynamic_shape()) {
for (int i = 0; i < reshape_before_fc_dim.nbDims; i++) {
reshape_before_fc_dim.d[i] = 1;
}
for (int i = 0; i < x_dim.nbDims; i++) {
if (i < x_num_col_dims) {
reshape_before_fc_dim.d[i] = 0;
} else {
reshape_before_fc_dim.d[x_num_col_dims] *= x_dim.d[i];
}
}
} else {
std::vector<nvinfer1::ITensor*> reshape_before_fc_shape_tensor;
nvinfer1::ITensor* input_shape_tensor = Shape(before_fc);
for (int i = 0; i < reshape_before_fc_dim.nbDims; i++) {
reshape_before_fc_shape_tensor.push_back(Add1DConstantLayer(1));
}
for (int i = 0; i < x_dim.nbDims; i++) {
if (i < x_num_col_dims) {
reshape_before_fc_shape_tensor[i] =
GetEleTensorOfShape(input_shape_tensor, i);
} else {
reshape_before_fc_shape_tensor[x_num_col_dims] =
Prod(GetEleTensorOfShape(input_shape_tensor, i),
reshape_before_fc_shape_tensor[x_num_col_dims]);
}
}
filal_reshape_before_fc_shape_tensor =
Concat(reshape_before_fc_shape_tensor);
}
auto* reshape_before_fc_layer =
TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *before_fc);
if (!engine_->with_dynamic_shape()) {
reshape_before_fc_layer->setReshapeDimensions(reshape_before_fc_dim);
} else {
reshape_before_fc_layer->setInput(1,
*filal_reshape_before_fc_shape_tensor);
}
reshape_before_fc_layer->setName(
("fc_op_reshape_before_fc: Shuffle (Output: " + output_name + ")")
.c_str());
return reshape_before_fc_layer;
}
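// Worked example for reshape_before_fc (assumed shapes, for exposition
// only): with x_dim = (B, S, C) and x_num_col_dims = 2, nbDims becomes 5 and
// the static-shape branch builds the reshape [0, 0, C, 1, 1], where 0 means
// "copy that input dimension". The activation is thus padded to
// (B, S, C, 1, 1) so the weight can be applied as a FullyConnected / 1x1
// convolution layer below.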
nvinfer1::ILayer* reshape_after_fc(nvinfer1::ITensor* after_fc,
nvinfer1::Dims x_dim,
int x_num_col_dims) {
// add shuffle after fc
nvinfer1::Dims reshape_after_fc_dim;
reshape_after_fc_dim.nbDims = x_num_col_dims + 1;
nvinfer1::ITensor* filal_reshape_after_fc_shape_tensor = nullptr;
if (!engine_->with_dynamic_shape()) {
for (int i = 0; i < reshape_after_fc_dim.nbDims; i++) {
reshape_after_fc_dim.d[i] = 0;
}
} else {
std::vector<int> gather_indices(x_num_col_dims + 1);
std::iota(gather_indices.begin(), gather_indices.end(), 0);
filal_reshape_after_fc_shape_tensor =
Gather(Shape(after_fc), gather_indices);
}
auto* reshape_after_fc_layer =
TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *after_fc);
if (!engine_->with_dynamic_shape()) {
reshape_after_fc_layer->setReshapeDimensions(reshape_after_fc_dim);
} else {
reshape_after_fc_layer->setInput(1, *filal_reshape_after_fc_shape_tensor);
}
return reshape_after_fc_layer;
}
void operator()(const framework::proto::OpDesc& op,
const framework::Scope& scope,
bool test_mode) override {
VLOG(3) << "convert a fc op to tensorrt fc layer without bias";
framework::OpDesc op_desc(op, nullptr);
auto output_name = op_desc.Output("Out").front();
auto input_names = op_desc.InputNames();
bool with_bias = input_names.size() >= 3;
std::string w_name = "Y";
std::string i_name = "X";
if (with_bias) {
w_name = "W";
i_name = "Input";
}
// Declare inputs
auto* X = engine_->GetITensor(op_desc.Input(i_name).front());
auto x_dim = X->getDimensions();
// Declare weights
auto* Y_v = scope.FindVar(op_desc.Input(w_name).front());
PADDLE_ENFORCE_NOT_NULL(
Y_v,
platform::errors::NotFound(
"Can not find %s presistale var of fc in scope.", w_name));
auto* Y_t = Y_v->GetMutable<phi::DenseTensor>();
int x_num_col_dims =
op_desc.HasAttr("x_num_col_dims")
? PADDLE_GET_CONST(int, op_desc.GetAttr("x_num_col_dims"))
: (op_desc.HasAttr("in_num_col_dims")
? PADDLE_GET_CONST(int, op_desc.GetAttr("in_num_col_dims"))
: 1);
const std::string activation_type =
op_desc.HasAttr("activation_type")
? PADDLE_GET_CONST(std::string, op_desc.GetAttr("activation_type"))
: "";
bool enable_int8 = op_desc.HasAttr("enable_int8");
bool support_int8 = false;
if (op_desc.HasAttr("support_int8")) {
support_int8 = PADDLE_GET_CONST(bool, op_desc.GetAttr("support_int8"));
}
float in_scale = 0;
if (enable_int8 || support_int8) {
if (enable_int8) {
in_scale = PADDLE_GET_CONST(float, op_desc.GetAttr("Input_scale"));
} else {
in_scale = PADDLE_GET_CONST(float, op_desc.GetAttr("X"));
}
engine_->SetTensorDynamicRange(X, in_scale);
}
PADDLE_ENFORCE_EQ(Y_t->dims().size(),
2UL,
platform::errors::InvalidArgument(
"The fc's weight should be a matrix with 2 dims, but "
"it's %d-dimensional.",
Y_t->dims().size())); // a matrix
int m = Y_t->dims()[0];
int n = Y_t->dims()[1];
auto regist_fc = [&](nvinfer1::ITensor* inputs,
int n_output,
TensorRTEngine::Weight& weight,
TensorRTEngine::Weight& bias) {
if (enable_int8 || support_int8) {
// add conv layer
float out_scale = 0;
if (enable_int8) {
PADDLE_ENFORCE_EQ(
op_desc.HasAttr("out_threshold"),
true,
platform::errors::InvalidArgument(
"must have out threshold in fc layers in int8 mode"));
out_scale = PADDLE_GET_CONST(float, op_desc.GetAttr("out_threshold"));
} else {
out_scale = PADDLE_GET_CONST(float, op_desc.GetAttr("Out"));
}
nvinfer1::DimsHW nv_ksize(1, 1);
auto* fc_layer_int8 = TRT_ENGINE_ADD_LAYER(engine_,
Convolution,
*inputs,
n_output,
nv_ksize,
weight.get(),
bias.get());
fc_layer_int8->setName(
("fc_op_int8_conv1x1: Convolution (Output: " + output_name + ")")
.c_str());
engine_->SetTensorDynamicRange(fc_layer_int8->getOutput(0), out_scale);
auto* fc_after_reshape_int8 = reshape_after_fc(
fc_layer_int8->getOutput(0), x_dim, x_num_col_dims);
if (activation_type == "relu") {
fc_after_reshape_int8->setName(
("int8_reshape_after_fc: Shuffle (Output: " + output_name + ")")
.c_str());
engine_->SetTensorDynamicRange(fc_after_reshape_int8->getOutput(0),
out_scale);
nvinfer1::IActivationLayer* relu_layer_int8 =
TRT_ENGINE_ADD_LAYER(engine_,
Activation,
*(fc_after_reshape_int8->getOutput(0)),
nvinfer1::ActivationType::kRELU);
RreplenishLayerAndOutput(relu_layer_int8,
"relu_after_fc_shuffle",
{output_name},
test_mode);
} else {
RreplenishLayerAndOutput(fc_after_reshape_int8,
"fc_op_int8_reshape_after_fc: Shuffle",
{output_name},
test_mode);
}
} else {
// add fc layer
auto* fc_layer_float = TRT_ENGINE_ADD_LAYER(engine_,
FullyConnected,
*inputs,
n_output,
weight.get(),
bias.get());
fc_layer_float->setName(
("fc_op_float: FullyConnected (Output: " + output_name + ")")
.c_str());
auto* fc_after_reshape_float = reshape_after_fc(
fc_layer_float->getOutput(0), x_dim, x_num_col_dims);
if (activation_type == "relu") {
fc_after_reshape_float->setName(
("float_reshape_after_fc: Shuffle (Output: " + output_name + ")")
.c_str());
nvinfer1::IActivationLayer* relu_layer_float =
TRT_ENGINE_ADD_LAYER(engine_,
Activation,
*(fc_after_reshape_float->getOutput(0)),
nvinfer1::ActivationType::kRELU);
RreplenishLayerAndOutput(relu_layer_float,
"relu_after_fc_shuffle",
{output_name},
test_mode);
} else {
RreplenishLayerAndOutput(fc_after_reshape_float,
"shuffle_after_fc",
{output_name},
test_mode);
}
}
};
bool transpose_y = false;
if (op_desc.HasAttr("transpose_Y")) {
transpose_y = PADDLE_GET_CONST(bool, op_desc.GetAttr("transpose_Y"));
}
int weight_w, weight_h;
auto weight = engine_->GetTrtWeight(op_desc.Input(w_name).front(), *Y_t);
if (!transpose_y) {
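// TRT's FullyConnected kernel expects weights laid out as
// [n_output, n_input] in row-major order, while Paddle stores the fc
// weight as [K, N] = [m, n]; when transpose_Y is false the data is
// transposed in place to [n, m] before being handed to TRT.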
if (weight.get().type == nvinfer1::DataType::kFLOAT) {
std::vector<float> weight_data_tmp;
weight_data_tmp.reserve(Y_t->numel());
memcpy(weight_data_tmp.data(),
weight.get().values,
Y_t->numel() * sizeof(float));
transpose_weight(
weight_data_tmp.data(),
const_cast<float*>(static_cast<const float*>(weight.get().values)),
m,
n);
} else if (weight.get().type == nvinfer1::DataType::kHALF) {
std::vector<float16> weight_data_tmp;
weight_data_tmp.reserve(Y_t->numel());
memcpy(weight_data_tmp.data(),
weight.get().values,
Y_t->numel() * sizeof(float16));
transpose_weight(weight_data_tmp.data(),
const_cast<float16*>(
static_cast<const float16*>(weight.get().values)),
m,
n);
} else {
PADDLE_THROW(paddle::platform::errors::InvalidArgument(
"Paddle-TRT fc convert not supporte dtype, now only support fp32 "
"and fp16."));
}
weight_w = n;
weight_h = m;
} else {
weight_w = m;
weight_h = n;
}
size_t n_output = weight_w;
weight.dims.assign({weight_w, weight_h});
TensorRTEngine::Weight bias{weight.get().type, nullptr, 0};
if (with_bias) {
auto* b_v = scope.GetVar(op_desc.Input("Bias").front());
auto* b_t = b_v->GetMutable<phi::DenseTensor>();
bias = engine_->GetTrtWeight(op_desc.Input("Bias").front(), *b_t);
}
// In TRT static shape mode the batch dim is implicit, so x_num_col_dims
// is decremented by one.
if (!engine_->with_dynamic_shape()) {
x_num_col_dims--;
}
// If TensorRT OSS is used, x_dim and x_num_col_dims need to change, and a
// Shuffle layer must not be added in ERNIE's multihead.
if (x_dim.nbDims == 4 && x_dim.d[2] == 1 && x_dim.d[3] == 1) {
if (enable_int8 || support_int8) {
// add conv1x1 layer
nvinfer1::DimsHW nv_ksize(1, 1);
auto* fc_layer_int8 = TRT_ENGINE_ADD_LAYER(engine_,
Convolution,
*X,
n_output,
nv_ksize,
weight.get(),
bias.get());
if (activation_type == "relu") {
fc_layer_int8->setName(
("ernie_fc_op_int8: Convolution (Output: " + output_name + ")")
.c_str());
PADDLE_ENFORCE_EQ(
op_desc.HasAttr("out_threshold"),
true,
platform::errors::InvalidArgument(
"must have out threshold in fc layers in int8 mode"));
float out_scale = 0;
if (enable_int8) {
out_scale =
PADDLE_GET_CONST(float, op_desc.GetAttr("out_threshold"));
} else {
out_scale = PADDLE_GET_CONST(float, op_desc.GetAttr("Out"));
}
engine_->SetTensorDynamicRange(fc_layer_int8->getOutput(0),
out_scale);
nvinfer1::IActivationLayer* relu_layer_int8 =
TRT_ENGINE_ADD_LAYER(engine_,
Activation,
*(fc_layer_int8->getOutput(0)),
nvinfer1::ActivationType::kRELU);
RreplenishLayerAndOutput(relu_layer_int8,
"relu_after_ernie_fc_int8",
{output_name},
test_mode);
} else {
RreplenishLayerAndOutput(fc_layer_int8,
"ernie_fc_op_int8: Convolution",
{output_name},
test_mode);
}
} else {
// add fc layer
auto* fc_layer_float = TRT_ENGINE_ADD_LAYER(
engine_, FullyConnected, *X, n_output, weight.get(), bias.get());
if (activation_type == "relu") {
fc_layer_float->setName(
("ernie_fc_op_float: (Output: " + output_name + ")").c_str());
nvinfer1::IActivationLayer* relu_layer_float =
TRT_ENGINE_ADD_LAYER(engine_,
Activation,
*(fc_layer_float->getOutput(0)),
nvinfer1::ActivationType::kRELU);
RreplenishLayerAndOutput(relu_layer_float,
"relu_after_ernie_fc_float",
{output_name},
test_mode);
} else {
RreplenishLayerAndOutput(
fc_layer_float, "ernie_fc_op_float", {output_name}, test_mode);
}
}
} else { // need reshape input before and after fc
PADDLE_ENFORCE_GT(
x_dim.nbDims,
x_num_col_dims,
platform::errors::InvalidArgument(
"Params and input dims mismatch. Paddle-TRT FC "
"converter expects x_dim.nbDims > x_num_col_dims, but "
"x_dim.nbDims : %d, x_num_col_dims : %d.",
x_dim.nbDims,
x_num_col_dims));
auto* reshape_before_fc_layer =
reshape_before_fc(X, x_dim, x_num_col_dims, output_name);
auto* reshape_itensor = reshape_before_fc_layer->getOutput(0);
if (enable_int8 || support_int8) {
engine_->SetTensorDynamicRange(reshape_itensor, in_scale);
}
regist_fc(reshape_itensor, n_output, weight, bias);
}
}
};
} // namespace tensorrt
} // namespace inference
} // namespace paddle
REGISTER_TRT_OP_CONVERTER(fc, FcOpConverter);
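For reference, here is a minimal standalone sketch of the static-shape reshape performed by reshape_before_fc above, using plain std::vector<int> in place of nvinfer1::Dims; the function name and shapes are illustrative only, not part of the converter:

#include <cassert>
#include <vector>

// Keep the first x_num_col_dims dims, collapse the rest into one "q" dim,
// then pad with two trailing 1s so the FC/conv1x1 kernel sees 4-D data
// (the "* x q x 1 x 1" padding mentioned above). The converter writes 0
// for kept dims (TRT's copy-from-input placeholder); real sizes are used
// here for clarity.
std::vector<int> ReshapeBeforeFc(const std::vector<int>& x_dim,
                                 int x_num_col_dims) {
  std::vector<int> out(x_num_col_dims + 3, 1);
  for (int i = 0; i < static_cast<int>(x_dim.size()); ++i) {
    if (i < x_num_col_dims) {
      out[i] = x_dim[i];                // batch-like dims are kept
    } else {
      out[x_num_col_dims] *= x_dim[i];  // remaining dims are collapsed
    }
  }
  return out;
}

int main() {
  // {2, 8, 16} with x_num_col_dims = 1 -> {2, 128, 1, 1}
  auto r = ReshapeBeforeFc({2, 8, 16}, 1);
  assert((r == std::vector<int>{2, 128, 1, 1}));
  return 0;
}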
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/inference/tensorrt/plugin/matmul_op_int8_plugin.h"
namespace paddle {
namespace inference {
namespace tensorrt {
/*
* MatMulOp maps to IMatrixMultiplyLayer in TRT. This layer has no weights.
*/
class MatMulOpConverter : public OpConverter {
public:
void operator()(const framework::proto::OpDesc& op,
const framework::Scope& scope,
bool test_mode) override {
VLOG(3) << "convert a matmul op to tensorrt matmul layer ";
framework::OpDesc op_desc(op, nullptr);
nvinfer1::ILayer* layer = nullptr;
// Declare inputs
auto* input1 = engine_->GetITensor(op_desc.Input("X")[0]);
auto* input2 = engine_->GetITensor(op_desc.Input("Y")[0]);
nvinfer1::Dims dims_x = input1->getDimensions();
nvinfer1::Dims dims_y = input2->getDimensions();
bool transpose_X = PADDLE_GET_CONST(bool, op_desc.GetAttr("transpose_X"));
bool transpose_Y = PADDLE_GET_CONST(bool, op_desc.GetAttr("transpose_Y"));
auto output_name = op_desc.Output("Out")[0];
float alpha = 1;
if (op_desc.HasAttr("alpha")) {
float alpha_tem = PADDLE_GET_CONST(float, op_desc.GetAttr("alpha"));
alpha = alpha_tem;
}
nvinfer1::MatrixOperation matrix_operation_X =
transpose_X ? nvinfer1::MatrixOperation::kTRANSPOSE
: nvinfer1::MatrixOperation::kNONE;
nvinfer1::MatrixOperation matrix_operation_Y =
transpose_Y ? nvinfer1::MatrixOperation::kTRANSPOSE
: nvinfer1::MatrixOperation::kNONE;
if (op_desc.HasAttr("support_int8") &&
PADDLE_GET_CONST(bool, op_desc.GetAttr("support_int8")) &&
engine_->precision() == AnalysisConfig::Precision::kInt8 &&
platform::GetGPUComputeCapability(platform::GetCurrentDeviceId()) >=
75) {
if (engine_->with_dynamic_shape()) {
VLOG(3) << "Convert a fluid matmul_op_int8_dynamic to TensorRT "
"MatmulPluginLayer";
plugin::MatmulPluginDynamic* plugin =
new plugin::MatmulPluginDynamic(transpose_X, transpose_Y, alpha);
std::vector<nvinfer1::ITensor*> inputs{input1, input2};
layer = engine_->AddDynamicPlugin(inputs.data(), inputs.size(), plugin);
RreplenishLayerAndOutput(
layer, "matmul_op_int8_dynamic", {output_name}, test_mode);
} else {
VLOG(3) << "Convert a fluid matmul_op_int8_static to TensorRT "
"MatmulPluginLayer";
plugin::MatmulPlugin* plugin = new plugin::MatmulPlugin(
dims_x, dims_y, transpose_X, transpose_Y, alpha);
std::vector<nvinfer1::ITensor*> inputs{input1, input2};
layer = engine_->AddPluginV2IOExt(inputs.data(), inputs.size(), plugin);
RreplenishLayerAndOutput(
layer, "matmul_op_int8_static", {output_name}, test_mode);
}
} else {
VLOG(3) << "Convert a fluid matmul_op_float to TensorRT ";
layer = TRT_ENGINE_ADD_LAYER(engine_,
MatrixMultiply,
*input1,
matrix_operation_X,
*input2,
matrix_operation_Y);
if (alpha == 1) {
RreplenishLayerAndOutput(
layer, "matmul_op_float_no_alpha", {output_name}, test_mode);
} else {
layer->setName(
("matmul_op_float_has_alpha: MatrixMultiplyLayer (Output: " +
output_name + ")")
.c_str());
// IScaleLayer requires that its input have at least three dimensions
// in static shape mode and at least four in dynamic shape mode.
auto* matmul_out = layer->getOutput(0);
nvinfer1::Dims out_shape = matmul_out->getDimensions();
const int out_dims = out_shape.nbDims;
bool need_change_dim = false;
if (engine_->with_dynamic_shape()) {
if (out_dims == 3) {
need_change_dim = true;
}
} else {
if (out_dims == 2) {
need_change_dim = true;
}
}
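// e.g. (illustrative) a static-shape {M, N} output is viewed as
// {M, N, 1} just for the scale layer, then reshaped back afterwards.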
if (need_change_dim) {
nvinfer1::Dims reshape_dim;
reshape_dim.nbDims = out_dims + 1;
reshape_dim.d[out_dims] = 1;
for (int i = 0; i < out_dims; i++) {
reshape_dim.d[i] = out_shape.d[i];
}
auto* reshape_layer =
TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *matmul_out);
reshape_layer->setReshapeDimensions(reshape_dim);
matmul_out = reshape_layer->getOutput(0);
reshape_layer->setName(("matmul_op_float_has_alpha_reshape_before: "
"ShuffleLayer (Output: " +
output_name + ")")
.c_str());
}
auto create_weights = [&](float data,
const std::string& type) -> float* {
std::unique_ptr<phi::DenseTensor> tmp_tensor(new phi::DenseTensor());
tmp_tensor->Resize({1});
auto* tmp_data =
tmp_tensor->mutable_data<float>(platform::CPUPlace());
tmp_data[0] = data;
engine_->SetWeights(output_name + "_add_scale_op_" + type,
std::move(tmp_tensor));
return tmp_data;
};
float* alpha_data = create_weights(alpha, "alpha");
float* shift_data = create_weights(0.0, "shift");
float* power_data = create_weights(1.0, "power");
TensorRTEngine::Weight nv_alpha{
nvinfer1::DataType::kFLOAT, static_cast<void*>(alpha_data), 1};
TensorRTEngine::Weight nv_shift{
nvinfer1::DataType::kFLOAT, static_cast<void*>(shift_data), 1};
TensorRTEngine::Weight nv_power{
nvinfer1::DataType::kFLOAT, static_cast<void*>(power_data), 1};
auto* scale_layer = TRT_ENGINE_ADD_LAYER(engine_,
Scale,
*matmul_out,
nvinfer1::ScaleMode::kUNIFORM,
nv_shift.get(),
nv_alpha.get(),
nv_power.get());
auto* scale_out = scale_layer->getOutput(0);
scale_layer->setName(
("matmul_op_float_has_alpha: ScaleLayer (Output: " + output_name +
")")
.c_str());
if (need_change_dim) {
auto* reshape_layer =
TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *scale_out);
reshape_layer->setReshapeDimensions(out_shape);
scale_out = reshape_layer->getOutput(0);
reshape_layer->setName(("matmul_op_float_has_alpha_reshape_after: "
"ShuffleLayer (Output: " +
output_name + ")")
.c_str());
}
engine_->SetITensor(output_name, scale_out);
if (test_mode) { // the test framework can not determine which is the
// output, so place the declaration inside.
engine_->DeclareOutput(output_name);
}
}
}
}
};
} // namespace tensorrt
} // namespace inference
} // namespace paddle
REGISTER_TRT_OP_CONVERTER(matmul, MatMulOpConverter);
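The alpha handling above relies on IScaleLayer's kUNIFORM mode, which computes out = (in * scale + shift) ^ power element-wise, with scale = alpha, shift = 0, power = 1. A tiny self-contained sketch of the arithmetic being folded in (names are illustrative, not converter code):

#include <cassert>
#include <cmath>
#include <vector>

// Mirrors IScaleLayer kUNIFORM: (in * scale + shift) ^ power per element.
std::vector<float> ApplyAlpha(std::vector<float> x, float alpha) {
  for (float& v : x) v = std::pow(v * alpha + 0.0f, 1.0f);
  return x;
}

int main() {
  auto y = ApplyAlpha({2.0f, 4.0f}, 0.5f);
  assert(std::fabs(y[0] - 1.0f) < 1e-6f && std::fabs(y[1] - 2.0f) < 1e-6f);
  return 0;
}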
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/inference/tensorrt/plugin/matmul_op_int8_plugin.h"
namespace paddle {
namespace inference {
namespace tensorrt {
/*
* MatMulV2Op maps to IMatrixMultiplyLayer in TRT. This layer has no weights.
*/
class MatMulV2OpConverter : public OpConverter {
public:
void operator()(const framework::proto::OpDesc& op,
const framework::Scope& scope,
bool test_mode) override {
VLOG(3) << "convert a matmul_v2 op to tensorrt IMatrixMultiplyLayer layer ";
framework::OpDesc op_desc(op, nullptr);
nvinfer1::IMatrixMultiplyLayer* layer = nullptr;
// Declare inputs
auto* input1 = engine_->GetITensor(op_desc.Input("X")[0]);
auto* input2 = engine_->GetITensor(op_desc.Input("Y")[0]);
nvinfer1::Dims dims_x = input1->getDimensions();
nvinfer1::Dims dims_y = input2->getDimensions();
bool transpose_X = PADDLE_GET_CONST(bool, op_desc.GetAttr("trans_x"));
bool transpose_Y = PADDLE_GET_CONST(bool, op_desc.GetAttr("trans_y"));
auto output_name = op_desc.Output("Out")[0];
nvinfer1::MatrixOperation matrix_operation_X =
transpose_X ? nvinfer1::MatrixOperation::kTRANSPOSE
: nvinfer1::MatrixOperation::kNONE;
nvinfer1::MatrixOperation matrix_operation_Y =
transpose_Y ? nvinfer1::MatrixOperation::kTRANSPOSE
: nvinfer1::MatrixOperation::kNONE;
int one_num = 0;
bool all_matrix = dims_x.nbDims >= 2 && dims_y.nbDims >= 2;
nvinfer1::ITensor* new_shape_tensor = nullptr;
if (dims_x.nbDims < dims_y.nbDims && all_matrix) {
one_num = dims_y.nbDims - dims_x.nbDims;
new_shape_tensor = Shape(input1);
std::vector<int32_t> one_vec(one_num, 1);
auto* one_tensor = Add1DConstantLayer(one_vec);
new_shape_tensor =
Concat(std::vector<nvinfer1::ITensor*>{one_tensor, new_shape_tensor});
auto* reshape_layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input1);
reshape_layer->setInput(1, *new_shape_tensor);
layer = TRT_ENGINE_ADD_LAYER(engine_,
MatrixMultiply,
*reshape_layer->getOutput(0),
matrix_operation_X,
*input2,
matrix_operation_Y);
} else if (dims_x.nbDims > dims_y.nbDims && all_matrix) {
one_num = dims_x.nbDims - dims_y.nbDims;
new_shape_tensor = Shape(input2);
std::vector<int32_t> one_vec(one_num, 1);
auto* one_tensor = Add1DConstantLayer(one_vec);
new_shape_tensor =
Concat(std::vector<nvinfer1::ITensor*>{one_tensor, new_shape_tensor});
auto* reshape_layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input2);
reshape_layer->setInput(1, *new_shape_tensor);
layer = TRT_ENGINE_ADD_LAYER(engine_,
MatrixMultiply,
*input1,
matrix_operation_X,
*reshape_layer->getOutput(0),
matrix_operation_Y);
} else {
layer = TRT_ENGINE_ADD_LAYER(engine_,
MatrixMultiply,
*input1,
matrix_operation_X,
*input2,
matrix_operation_Y);
}
if (dims_x.nbDims == 1)
layer->setOperation(0, nvinfer1::MatrixOperation::kVECTOR);
if (dims_y.nbDims == 1)
layer->setOperation(1, nvinfer1::MatrixOperation::kVECTOR);
nvinfer1::ILayer* final_layer = static_cast<nvinfer1::ILayer*>(layer);
// When vec * vec, TRT produces a scalar, so to be consistent with Paddle
// we need to add a reshape.
if (dims_x.nbDims == 1 && dims_y.nbDims == 1) {
auto reshape_layer =
TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *layer->getOutput(0));
nvinfer1::Dims reshape_dim;
reshape_dim.nbDims = 1;
reshape_dim.d[0] = 1;
reshape_layer->setReshapeDimensions(reshape_dim);
final_layer = static_cast<nvinfer1::ILayer*>(reshape_layer);
}
VLOG(3) << "Convert a matmul_v2_op to TensorRT ";
RreplenishLayerAndOutput(
final_layer, "matmul_v2_op", {output_name}, test_mode);
}
};
} // namespace tensorrt
} // namespace inference
} // namespace paddle
REGISTER_TRT_OP_CONVERTER(matmul_v2, MatMulV2OpConverter);
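The rank-alignment step above is numpy-style batch broadcasting: the lower-rank operand gets leading 1s until both ranks match. A standalone sketch with illustrative names and plain int vectors:

#include <cassert>
#include <vector>

// Prepend 1s to `shape` until it has `target_rank` dims, mirroring the
// Shuffle + Concat(one_tensor, Shape(input)) trick in the converter.
std::vector<int> AlignRank(const std::vector<int>& shape, int target_rank) {
  std::vector<int> out(target_rank - static_cast<int>(shape.size()), 1);
  out.insert(out.end(), shape.begin(), shape.end());
  return out;
}

int main() {
  // A {16, 32} Y multiplied against a {4, 8, 16} X is first viewed
  // as {1, 16, 32}.
  assert((AlignRank({16, 32}, 3) == std::vector<int>{1, 16, 32}));
  return 0;
}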
/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/inference/tensorrt/plugin/matmul_op_int8_plugin.h"
namespace paddle {
namespace inference {
namespace tensorrt {
/*
* After trt_map_ops_to_matrix_multiply_pass (mul, matmul, matmul_v2 ->
* matrix_multiply), lower matrix_multiply with a MatrixMultiply layer,
* plus an ElementWiseOperation::kPROD layer when alpha != 1.
*/
class MatrixMultiplyOpConverter : public OpConverter {
public:
void operator()(const framework::proto::OpDesc& op,
const framework::Scope& scope,
bool test_mode) override {
VLOG(3)
<< "convert a matrix_multiply op to TensorRT MatrixMultiply layer + "
"ElementWiseOperation::kPROD layer(if alpha != 1).";
// Input: X, Y
// Output: Out
// Attributes: transpose_x, transpose_y, x_num_col_dims, y_num_col_dims,
// alpha. Extra attributes (for quant/dequant): X, Y, Out, Input_scale,
// out_threshold.
framework::OpDesc op_desc(op, nullptr);
// Declare inputs
auto* input1 = engine_->GetITensor(op_desc.Input("X")[0]);
auto* input2 = engine_->GetITensor(op_desc.Input("Y")[0]);
bool enable_int8 =
(engine_->precision() == AnalysisConfig::Precision::kInt8);
float x_scale = 0;
float y_scale = 0;
float out_scale = 0;
if (enable_int8) {
if (op_desc.HasAttr("Input_scale")) {
x_scale = PADDLE_GET_CONST(float, op_desc.GetAttr("Input_scale"));
engine_->SetTensorDynamicRange(input1, x_scale);
}
if (op_desc.HasAttr("X")) {
x_scale = PADDLE_GET_CONST(float, op_desc.GetAttr("X"));
engine_->SetTensorDynamicRange(input1, x_scale);
}
if (op_desc.HasAttr("Y")) {
y_scale = PADDLE_GET_CONST(float, op_desc.GetAttr("Y"));
engine_->SetTensorDynamicRange(input2, y_scale);
}
if (op_desc.HasAttr("out_threshold")) {
out_scale = PADDLE_GET_CONST(float, op_desc.GetAttr("out_threshold"));
}
if (op_desc.HasAttr("Out")) {
out_scale = PADDLE_GET_CONST(float, op_desc.GetAttr("Out"));
}
}
auto output_name = op_desc.Output("Out")[0];
nvinfer1::Dims dims_x = input1->getDimensions();
int32_t x_rank = dims_x.nbDims;
nvinfer1::Dims dims_y = input2->getDimensions();
int32_t y_rank = dims_y.nbDims;
int32_t x_num_col_dims =
PADDLE_GET_CONST(int32_t, op_desc.GetAttr("x_num_col_dims"));
if (x_num_col_dims < 0) {
x_num_col_dims += x_rank;
}
// Temporary workaround for the reformat problem of matrix multiplication:
// make input.rank == 4. A proper fix may come in TRT 8.7.
if (x_rank == 2 && x_num_col_dims == 1 && engine_->use_varseqlen()) {
VLOG(3) << "Temporarily solve the reformat problem of matrix "
"multiplication, make input.rank == 4. ";
auto* reshape_before_matrix =
TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input1);
std::vector<nvinfer1::ITensor*> reshape_before_tensor;
reshape_before_tensor.push_back(GetEleTensorOfShape(Shape(input1), 0));
reshape_before_tensor.push_back(GetEleTensorOfShape(Shape(input1), 1));
reshape_before_tensor.push_back(Add1DConstantLayer(1));
reshape_before_tensor.push_back(Add1DConstantLayer(1));
reshape_before_matrix->setInput(1, *Concat(reshape_before_tensor));
reshape_before_matrix->setName(
("reshape_before_matrix(Output: " + output_name + ")").c_str());
input1 = reshape_before_matrix->getOutput(0);
dims_x = input1->getDimensions();
x_rank = dims_x.nbDims;
if (enable_int8) {
if (op_desc.HasAttr("Input_scale") || op_desc.HasAttr("X")) {
engine_->SetTensorDynamicRange(input1, x_scale);
}
}
}
if (x_num_col_dims != x_rank - 1) {
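// Collapse every dim at or beyond x_num_col_dims into one trailing dim,
// e.g. (illustrative) X = {B, S, A, C} with x_num_col_dims = 2
// becomes {B, S, A * C}.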
std::vector<nvinfer1::ITensor*> before_shape_tensors;
nvinfer1::ITensor* input_shape_tensor = Shape(input1);
for (int i = 0; i < x_num_col_dims; ++i) {
before_shape_tensors.push_back(
GetEleTensorOfShape(input_shape_tensor, i));
}
nvinfer1::ITensor* producted = Add1DConstantLayer(1);
for (int i = x_num_col_dims; i < x_rank; ++i) {
producted = Prod(producted, GetEleTensorOfShape(input_shape_tensor, i));
}
before_shape_tensors.push_back(producted);
nvinfer1::ITensor* before_shape_tensor = Concat(before_shape_tensors);
auto* reshape_before_layer =
TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input1);
reshape_before_layer->setInput(1, *before_shape_tensor);
reshape_before_layer->setName(
("reshape_x_before_matrix_multiply: Shuffle (Output: " + output_name +
")")
.c_str());
input1 = reshape_before_layer->getOutput(0);
if (enable_int8) {
if (op_desc.HasAttr("Input_scale") || op_desc.HasAttr("X")) {
engine_->SetTensorDynamicRange(input1, x_scale);
}
}
x_rank = x_num_col_dims + 1;
}
int32_t y_num_col_dims =
PADDLE_GET_CONST(int32_t, op_desc.GetAttr("y_num_col_dims"));
if (y_num_col_dims < 0) {
y_num_col_dims += y_rank;
}
PADDLE_ENFORCE_EQ(
y_num_col_dims,
y_rank - 1,
platform::errors::InvalidArgument(
"The matrix_multiply op'y_num_col_dims should be equal "
"to y'rank - 1, but got y_num_col_dims = %d, and y_rank = %d",
y_num_col_dims,
y_rank - 1));
if (x_rank != 1 && y_rank != 1 && x_rank != y_rank) {
if (x_rank < y_rank) {
std::vector<nvinfer1::ITensor*> before_shape_tensors;
nvinfer1::ITensor* input_shape_tensor = Shape(input1);
for (int i = 0; i < y_rank - x_rank; ++i) {
before_shape_tensors.push_back(Add1DConstantLayer(1));
}
for (int i = 0; i < x_rank; ++i) {
before_shape_tensors.push_back(
GetEleTensorOfShape(input_shape_tensor, i));
}
nvinfer1::ITensor* before_shape_tensor = Concat(before_shape_tensors);
auto* reshape_before_layer =
TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input1);
reshape_before_layer->setInput(1, *before_shape_tensor);
reshape_before_layer->setName(
("full_x_before_matrix_multiply: Shuffle (Output: " + output_name +
")")
.c_str());
input1 = reshape_before_layer->getOutput(0);
if (enable_int8) {
if (op_desc.HasAttr("Input_scale") || op_desc.HasAttr("X")) {
engine_->SetTensorDynamicRange(input1, x_scale);
}
}
x_rank = y_rank;
} else {
std::vector<nvinfer1::ITensor*> before_shape_tensors;
nvinfer1::ITensor* input_shape_tensor = Shape(input2);
for (int i = 0; i < x_rank - y_rank; ++i) {
before_shape_tensors.push_back(Add1DConstantLayer(1));
}
for (int i = 0; i < y_rank; ++i) {
before_shape_tensors.push_back(
GetEleTensorOfShape(input_shape_tensor, i));
}
nvinfer1::ITensor* before_shape_tensor = Concat(before_shape_tensors);
auto* reshape_before_layer =
TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input2);
reshape_before_layer->setInput(1, *before_shape_tensor);
reshape_before_layer->setName(
("full_y_before_matrix_multiply: Shuffle (Output: " + output_name +
")")
.c_str());
input2 = reshape_before_layer->getOutput(0);
if (enable_int8) {
if (op_desc.HasAttr("Y")) {
engine_->SetTensorDynamicRange(input2, y_scale);
}
}
}
y_rank = x_rank;
}
nvinfer1::MatrixOperation matrix_operation_x;
nvinfer1::MatrixOperation matrix_operation_y;
if (x_rank == 1) {
matrix_operation_x = nvinfer1::MatrixOperation::kVECTOR;
} else {
bool transpose_x = PADDLE_GET_CONST(bool, op_desc.GetAttr("transpose_x"));
matrix_operation_x = transpose_x ? nvinfer1::MatrixOperation::kTRANSPOSE
: nvinfer1::MatrixOperation::kNONE;
}
if (y_rank == 1) {
matrix_operation_y = nvinfer1::MatrixOperation::kVECTOR;
} else {
bool transpose_y = PADDLE_GET_CONST(bool, op_desc.GetAttr("transpose_y"));
matrix_operation_y = transpose_y ? nvinfer1::MatrixOperation::kTRANSPOSE
: nvinfer1::MatrixOperation::kNONE;
}
nvinfer1::ILayer* layer = nullptr;
layer = TRT_ENGINE_ADD_LAYER(engine_,
MatrixMultiply,
*input1,
matrix_operation_x,
*input2,
matrix_operation_y);
if (enable_int8) {
if (op_desc.HasAttr("out_threshold") || op_desc.HasAttr("Out")) {
engine_->SetTensorDynamicRange(layer->getOutput(0), out_scale);
}
}
float alpha = PADDLE_GET_CONST(float, op_desc.GetAttr("alpha"));
if (alpha < 0.999 || alpha > 1.001) {
auto* alpha_tensor = Add1DConstantLayer(alpha);
std::vector<nvinfer1::ITensor*> alpha_shape_tensors;
for (int i = 0; i < layer->getOutput(0)->getDimensions().nbDims; i++) {
alpha_shape_tensors.push_back(Add1DConstantLayer(1));
}
auto* reshape_alpha =
TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *alpha_tensor);
reshape_alpha->setInput(1, *Concat(alpha_shape_tensors));
layer = TRT_ENGINE_ADD_LAYER(engine_,
ElementWise,
*layer->getOutput(0),
*reshape_alpha->getOutput(0),
nvinfer1::ElementWiseOperation::kPROD);
}
RreplenishLayerAndOutput(
layer, "matrix_multiply_op", {output_name}, test_mode);
}
};
} // namespace tensorrt
} // namespace inference
} // namespace paddle
REGISTER_TRT_OP_CONVERTER(matrix_multiply, MatrixMultiplyOpConverter);
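A compact sketch of the x_num_col_dims flattening used above (illustrative helper, not converter code):

#include <cassert>
#include <cstdint>
#include <functional>
#include <numeric>
#include <vector>

// Dims [0, x_num_col_dims) are kept; everything from x_num_col_dims on is
// folded into a single trailing dim, matching the Shape/Prod/Concat logic
// in the converter.
std::vector<int64_t> FlattenFrom(const std::vector<int64_t>& shape,
                                 int x_num_col_dims) {
  std::vector<int64_t> out(shape.begin(), shape.begin() + x_num_col_dims);
  out.push_back(std::accumulate(shape.begin() + x_num_col_dims, shape.end(),
                                int64_t{1}, std::multiplies<int64_t>()));
  return out;
}

int main() {
  assert((FlattenFrom({8, 16, 4, 4}, 2) == std::vector<int64_t>{8, 16, 16}));
  return 0;
}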
......@@ -71,14 +71,6 @@ class MultiheadMatMulOpConverter : public OpConverter {
int hidden_out = weight_dims[2]; // channels_out
int m = hidden_in;
int n = three * hidden_out;
auto tranpose_weight = [](const float* src, float* dst, int m, int n) {
for (int i = 0; i < m; i++) {
for (int j = 0; j < n; j++) {
dst[j * m + i] = src[i * n + j];
}
}
};
tranpose_weight(weight_data_tmp.data(), weight_data, m, n);
int head_number = PADDLE_GET_CONST(int, op_desc.GetAttr("head_number"));
......@@ -102,7 +94,6 @@ class MultiheadMatMulOpConverter : public OpConverter {
nvinfer1::ITensor* mask_tensor;
nvinfer1::ITensor* pos_id_tensor;
nvinfer1::ITensor* max_seqlen_tensor;
auto* new_input = input;
if (flag_varseqlen) {
mask_tensor = engine_->GetITensor("qkv_plugin_mask");
pos_id_tensor = engine_->GetITensor("pos_id");
......@@ -188,7 +179,11 @@ class MultiheadMatMulOpConverter : public OpConverter {
nvinfer1::ILayer* transformer_input_layer = engine_->AddDynamicPlugin(
inputs_transformer.data(), inputs_transformer.size(), plugin);
new_input = transformer_input_layer->getOutput(0);
input = transformer_input_layer->getOutput(0);
if (op_desc.HasAttr("Input_scale")) {
in_scale = PADDLE_GET_CONST(float, op_desc.GetAttr("Input_scale"));
engine_->SetTensorDynamicRange(input, in_scale);
}
mask_tensor = transformer_input_layer->getOutput(1);
pos_id_tensor = transformer_input_layer->getOutput(2);
max_seqlen_tensor = transformer_input_layer->getOutput(3);
......@@ -204,7 +199,7 @@ class MultiheadMatMulOpConverter : public OpConverter {
float dp_probs = 1.0 / 127.0;
nvinfer1::DimsHW nv_ksize(1, 1);
fc_layer = TRT_ENGINE_ADD_LAYER(
engine_, Convolution, *new_input, n, nv_ksize, weight, bias);
engine_, Convolution, *input, n, nv_ksize, weight, bias);
fc_layer->setName(
("Multihead: Convolution/FullyConnected: (Output: " +
output_name + ")")
......@@ -261,22 +256,42 @@ class MultiheadMatMulOpConverter : public OpConverter {
RreplenishLayerAndOutput(
plugin_layer, "multihead_matmul", {output_name}, test_mode);
} else {
auto* reshape_before_matrix =
TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input);
std::vector<nvinfer1::ITensor*> reshape_before_tensor_matrix;
reshape_before_tensor_matrix.push_back(
GetEleTensorOfShape(Shape(input), 0));
reshape_before_tensor_matrix.push_back(
GetEleTensorOfShape(Shape(input), 1));
reshape_before_matrix->setInput(
1, *Concat(reshape_before_tensor_matrix));
reshape_before_matrix->setName(
("reshape_before_matrix(Output: " + output_name + ")").c_str());
auto* input = reshape_before_matrix->getOutput(0);
if (op_desc.HasAttr("Input_scale")) {
in_scale = PADDLE_GET_CONST(float, op_desc.GetAttr("Input_scale"));
engine_->SetTensorDynamicRange(input, in_scale);
}
int head_size = hidden_out / head_number;
// [3, head_number, head_size, hidden_in] -> [head_number, 3,
// head_size,
// hidden_in]
// [hidden_in, 3, head_number, head_size] -> [hidden_in, head_number,
// 3, head_size]
auto transpose_weight_v2 = [](const float* src,
float* dst,
int three,
int head_number,
int head_size,
int hidden_in) {
const int HH = head_size * hidden_in;
for (int i = 0; i < three; ++i) {
for (int n = 0; n < head_number; ++n) {
for (int hh = 0; hh < HH; ++hh) {
dst[n * three * HH + i * HH + hh] =
src[i * head_number * HH + n * HH + hh];
for (int i = 0; i < hidden_in; ++i) {
for (int j = 0; j < three; ++j) {
for (int n = 0; n < head_number; ++n) {
for (int m = 0; m < head_size; ++m) {
dst[i * head_number * three * head_size +
n * three * head_size + j * head_size + m] =
src[i * three * head_number * head_size +
j * head_number * head_size + n * head_size + m];
}
}
}
}
......@@ -309,16 +324,61 @@ class MultiheadMatMulOpConverter : public OpConverter {
transpose_bias_v2(
bias_data_tmp.data(), bias_data, head_number, head_size);
nvinfer1::ILayer* fc_layer = nullptr;
float dp_probs = 1.0 / 127.0;
if (op_desc.HasAttr("Input_scale")) {
nvinfer1::DimsHW nv_ksize(1, 1);
fc_layer = TRT_ENGINE_ADD_LAYER(
engine_, Convolution, *new_input, n, nv_ksize, weight, bias);
} else {
fc_layer = TRT_ENGINE_ADD_LAYER(
engine_, FullyConnected, *new_input, n, weight, bias);
}
nvinfer1::Dims trt_dims_weight;
trt_dims_weight.nbDims = 2;
trt_dims_weight.d[0] = m;
trt_dims_weight.d[1] = n;
auto* weight_tensor =
TRT_ENGINE_ADD_LAYER(engine_, Constant, trt_dims_weight, weight)
->getOutput(0);
bool transpose_x = false;
bool transpose_y = false;
nvinfer1::MatrixOperation matrix_operation_x =
transpose_x ? nvinfer1::MatrixOperation::kTRANSPOSE
: nvinfer1::MatrixOperation::kNONE;
nvinfer1::MatrixOperation matrix_operation_y =
transpose_y ? nvinfer1::MatrixOperation::kTRANSPOSE
: nvinfer1::MatrixOperation::kNONE;
auto* matrix_layer = TRT_ENGINE_ADD_LAYER(engine_,
MatrixMultiply,
*input,
matrix_operation_x,
*weight_tensor,
matrix_operation_y);
nvinfer1::Dims trt_dims_bias;
trt_dims_bias.nbDims = 2;
trt_dims_bias.d[0] = 1;
trt_dims_bias.d[1] = n;
auto* bias_tensor =
TRT_ENGINE_ADD_LAYER(engine_, Constant, trt_dims_bias, bias)
->getOutput(0);
auto* add_layer =
TRT_ENGINE_ADD_LAYER(engine_,
ElementWise,
*matrix_layer->getOutput(0),
*bias_tensor,
nvinfer1::ElementWiseOperation::kSUM);
auto* reshape_before_multihead_layer =
TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *add_layer->getOutput(0));
std::vector<nvinfer1::ITensor*> reshape_tensor;
reshape_tensor.push_back(
GetEleTensorOfShape(Shape(matrix_layer->getOutput(0)), 0));
reshape_tensor.push_back(
GetEleTensorOfShape(Shape(matrix_layer->getOutput(0)), 1));
reshape_tensor.push_back(Add1DConstantLayer(1));
reshape_tensor.push_back(Add1DConstantLayer(1));
reshape_before_multihead_layer->setInput(1, *Concat(reshape_tensor));
reshape_before_multihead_layer->setName(
("reshape_before_multihead_mamul(Output: " + output_name + ")")
.c_str());
if (op_desc.HasAttr("fc_out_threshold")) {
PADDLE_ENFORCE_EQ(op_desc.HasAttr("fc_out_threshold"),
......@@ -328,12 +388,19 @@ class MultiheadMatMulOpConverter : public OpConverter {
"in int8 mode"));
float out_scale =
PADDLE_GET_CONST(float, op_desc.GetAttr("fc_out_threshold"));
engine_->SetTensorDynamicRange(fc_layer->getOutput(0), out_scale);
engine_->SetTensorDynamicRange(matrix_layer->getOutput(0),
out_scale);
engine_->SetTensorDynamicRange(add_layer->getOutput(0), out_scale);
engine_->SetTensorDynamicRange(
reshape_before_multihead_layer->getOutput(0), out_scale);
if (qkv2context_plugin_int8) {
dp_probs =
PADDLE_GET_CONST(float, op_desc.GetAttr("dp_probs")) / 127.0;
}
}
auto creator = GetPluginRegistry()->getPluginCreator(
"CustomQKVToContextPluginDynamic", "2");
assert(creator != nullptr);
......@@ -375,7 +442,8 @@ class MultiheadMatMulOpConverter : public OpConverter {
free(plugin_collection);
std::vector<nvinfer1::ITensor*> plugin_inputs;
plugin_inputs.emplace_back(fc_layer->getOutput(0));
plugin_inputs.emplace_back(
reshape_before_multihead_layer->getOutput(0));
plugin_inputs.emplace_back(mask_tensor);
plugin_inputs.emplace_back(pos_id_tensor);
plugin_inputs.emplace_back(
......@@ -389,7 +457,8 @@ class MultiheadMatMulOpConverter : public OpConverter {
if (!flag_varseqlen) {
std::vector<nvinfer1::ITensor*> output_transformer;
output_transformer.emplace_back(plugin_layer->getOutput(0));
output_transformer.emplace_back(input);
output_transformer.emplace_back(
engine_->GetITensor(op_desc.Input("Input").front()));
output_transformer.emplace_back(pos_id_tensor);
plugin::TransformerOutputConvertPlugin* plugin =
new plugin::TransformerOutputConvertPlugin();
......@@ -401,9 +470,23 @@ class MultiheadMatMulOpConverter : public OpConverter {
transformer_output_layer->getOutput(0));
} else {
engine_->SetITensor(output_name, plugin_layer->getOutput(0));
if (op_desc.HasAttr("out_threshold")) {
float out_scale =
PADDLE_GET_CONST(float, op_desc.GetAttr("out_threshold"));
engine_->SetTensorDynamicRange(plugin_layer->getOutput(0),
out_scale);
}
}
}
} else {
auto tranpose_weight = [](const float* src, float* dst, int m, int n) {
for (int i = 0; i < m; i++) {
for (int j = 0; j < n; j++) {
dst[j * m + i] = src[i * n + j];
}
}
};
tranpose_weight(weight_data_tmp.data(), weight_data, m, n);
if (input_dims.d[1] <= 384 && !bias_qk_attr &&
engine_->precision() != AnalysisConfig::Precision::kFloat32 &&
platform::GetGPUComputeCapability(platform::GetCurrentDeviceId()) >=
......
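The new transpose_weight_v2 layout change above ([hidden_in, 3, head_number, head_size] -> [hidden_in, head_number, 3, head_size]) can be checked with a small standalone permutation; the function name and the tiny shapes are illustrative only:

#include <cassert>
#include <vector>

// Viewing the fc weight as [hidden_in, 3, head_number, head_size], swap
// the `3` and `head_number` axes to [hidden_in, head_number, 3, head_size].
std::vector<float> PermuteQkv(const std::vector<float>& src, int hidden_in,
                              int three, int head_number, int head_size) {
  std::vector<float> dst(src.size());
  for (int i = 0; i < hidden_in; ++i)
    for (int j = 0; j < three; ++j)
      for (int n = 0; n < head_number; ++n)
        for (int m = 0; m < head_size; ++m)
          dst[((i * head_number + n) * three + j) * head_size + m] =
              src[((i * three + j) * head_number + n) * head_size + m];
  return dst;
}

int main() {
  // hidden_in = 1, three = 2, head_number = 2, head_size = 1:
  // [q0 q1 k0 k1] -> [q0 k0 q1 k1]
  std::vector<float> src{0, 1, 2, 3};
  assert((PermuteQkv(src, 1, 2, 2, 1) == std::vector<float>{0, 2, 1, 3}));
  return 0;
}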
......@@ -56,6 +56,8 @@ class OneHotOpConverter : public OpConverter {
if (dtype == 6) { // fp64
VLOG(3) << "TRT does not support float64, so it is converted to float32.";
}
} else {
PADDLE_THROW(platform::errors::Fatal("one_hot is not supported"));
}
auto depth_name = op_desc.Input("depth_tensor");
......
......@@ -59,19 +59,6 @@ class OpConverter {
auto op_converter_type_map = OpTeller::Global().GetOpConverterTypeMap();
switch (op_converter_type_map.at(op_desc.Type())) {
case OpConverterType::Default:
if (op_desc.Type() == "mul") {
PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(),
1UL,
platform::errors::InvalidArgument(
"The input op mul's Input(\"Y\")."
"size() should equal to 1, but reveceid "
"Input(\"Y\").size() = %u.",
op_desc.Input("Y").size()));
std::string Y = op_desc.Input("Y")[0];
if (parameters.count(Y)) {
it = Registry<OpConverter>::Global().Lookup("fc");
}
}
if (op_desc.Type().find("elementwise") != std::string::npos) {
static std::unordered_set<std::string> add_tensor_op_set{
"add", "mul", "sub", "div", "max", "min", "pow", "mod"};
......
......@@ -31,6 +31,7 @@ class SkipLayerNormOpConverter : public OpConverter {
platform::errors::InvalidArgument(
"Skip_layernorm must run the dynamic shape mode."));
framework::OpDesc op_desc(op, nullptr);
auto output_name = op_desc.Output("Out")[0];
auto GetWeight =
[&](const std::string& arg_name) -> TensorRTEngine::Weight {
std::string var_name = op_desc.Input(arg_name).front();
......@@ -42,15 +43,72 @@ class SkipLayerNormOpConverter : public OpConverter {
// Declare inputs
auto* input1 = engine_->GetITensor(op_desc.Input("X")[0]);
auto* input2 = engine_->GetITensor(op_desc.Input("Y")[0]);
bool enable_int8 =
(engine_->precision() == AnalysisConfig::Precision::kInt8);
float x_scale = 0;
float y_scale = 0;
if (enable_int8) {
if (op_desc.HasAttr("X")) {
x_scale = PADDLE_GET_CONST(float, op_desc.GetAttr("X"));
engine_->SetTensorDynamicRange(input1, x_scale);
}
if (op_desc.HasAttr("Y")) {
y_scale = PADDLE_GET_CONST(float, op_desc.GetAttr("Y"));
engine_->SetTensorDynamicRange(input2, y_scale);
}
}
nvinfer1::Dims dims_x = input1->getDimensions();
int32_t x_rank = dims_x.nbDims;
nvinfer1::Dims dims_y = input2->getDimensions();
int32_t y_rank = dims_y.nbDims;
if ((x_rank == 2 && y_rank == 4) || (y_rank == 2 && x_rank == 4)) {
if (x_rank == 2 && y_rank == 4) {
auto* reshape_before_skiplayn =
TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input1);
std::vector<nvinfer1::ITensor*> reshape_before_tensor;
reshape_before_tensor.push_back(GetEleTensorOfShape(Shape(input1), 0));
reshape_before_tensor.push_back(GetEleTensorOfShape(Shape(input1), 1));
reshape_before_tensor.push_back(Add1DConstantLayer(1));
reshape_before_tensor.push_back(Add1DConstantLayer(1));
reshape_before_skiplayn->setInput(1, *Concat(reshape_before_tensor));
reshape_before_skiplayn->setName(
("reshape_before_skiplayn(Output: " + output_name + ")").c_str());
input1 = reshape_before_skiplayn->getOutput(0);
if (enable_int8) {
if (op_desc.HasAttr("X")) {
engine_->SetTensorDynamicRange(input1, x_scale);
}
}
} else {
auto* reshape_before_skiplayn =
TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input2);
std::vector<nvinfer1::ITensor*> reshape_before_tensor;
reshape_before_tensor.push_back(GetEleTensorOfShape(Shape(input2), 0));
reshape_before_tensor.push_back(GetEleTensorOfShape(Shape(input2), 1));
reshape_before_tensor.push_back(Add1DConstantLayer(1));
reshape_before_tensor.push_back(Add1DConstantLayer(1));
reshape_before_skiplayn->setInput(1, *Concat(reshape_before_tensor));
reshape_before_skiplayn->setName(
("reshape_before_skiplayn(Output: " + output_name + ")").c_str());
input2 = reshape_before_skiplayn->getOutput(0);
if (enable_int8) {
if (op_desc.HasAttr("Y")) {
engine_->SetTensorDynamicRange(input2, y_scale);
}
}
}
}
std::vector<nvinfer1::ITensor*> inputs;
inputs.push_back(input1);
inputs.push_back(input2);
bool enable_int8 = false;
if (op_desc.HasAttr("enable_int8")) {
enable_int8 = PADDLE_GET_CONST(bool, op_desc.GetAttr("enable_int8"));
}
std::vector<float> smooth_scale;
bool use_smooth = false;
if (op_desc.HasAttr("smooth_scale")) {
......@@ -199,7 +257,6 @@ class SkipLayerNormOpConverter : public OpConverter {
layer = plugin_layer;
}
}
auto output_name = op_desc.Output("Out")[0];
RreplenishLayerAndOutput(layer, "skip_layernorm", {output_name}, test_mode);
}
};
......
......@@ -157,6 +157,13 @@ void TensorRTEngine::FreezeNetwork() {
#else
infer_builder_config_->setMaxWorkspaceSize(max_workspace_);
#endif
#if IS_TRT_VERSION_GE(8500)
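// kFASTER_DYNAMIC_SHAPES_0805 is a TRT 8.5 preview feature that can speed
// up engines with dynamic shapes; it is guarded because the enum only
// exists from TRT 8.5 onwards.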
infer_builder_config_->setPreviewFeature(
nvinfer1::PreviewFeature::kFASTER_DYNAMIC_SHAPES_0805, true);
#else
#endif
bool enable_fp16 = (precision_ == AnalysisConfig::Precision::kHalf);
if (enable_fp16) {
bool support_fp16 = infer_builder_->platformHasFastFp16();
......
......@@ -393,62 +393,6 @@ struct SimpleOpTypeSetTeller : public Teller {
return false;
#endif
}
if (op_type == "matmul_v2") {
if (!with_dynamic_shape) {
return false;
}
auto* block = desc.Block();
if (block == nullptr) {
VLOG(3) << "The block desc is nullptr, we can't continue to analyze. "
"Developers need to check whether block_desc is passed in "
"the pass.";
return false;
}
return true;
}
if (op_type == "matmul") {
auto* block = desc.Block();
if (block == nullptr) {
VLOG(3) << "The block desc is nullptr, we can't continue to analyze. "
"Developers need to check whether block_desc is passed in "
"the pass.";
return false;
}
// not support broadcast
auto* x_var_desc = block->FindVar(desc.Input("X")[0]);
auto* y_var_desc = block->FindVar(desc.Input("Y")[0]);
const auto x_shape = x_var_desc->GetShape();
const auto y_shape = y_var_desc->GetShape();
if (x_shape.size() != y_shape.size()) {
VLOG(3)
<< "matmul op not support broadcast, please check inputs'shape. ";
return false;
}
uint64_t dims = 2;
for (size_t i = 0; i < x_shape.size() - dims; ++i) {
if (x_shape[i] != y_shape[i] && (x_shape[i] == 1 || y_shape[i] == 1)) {
VLOG(3) << "matmul op not support broadcast, please check "
"inputs'shape[i]. ";
return false;
}
}
for (auto& param_name : desc.Inputs()) {
for (auto& var_name : param_name.second) {
auto* var_desc = block->FindVar(var_name);
const auto shape = var_desc->GetShape();
if (shape.size() < 3) {
VLOG(3)
<< "matmul op dims < 3 not supported in tensorrt, but got dims "
<< shape.size() << ", so jump it.";
return false;
}
}
}
}
if (op_type == "softmax") {
auto* block = desc.Block();
if (block == nullptr) {
......@@ -2158,63 +2102,6 @@ struct SimpleOpTypeSetTeller : public Teller {
}
}
if (op_type == "fc") {
auto* block = desc.Block();
if (block == nullptr) {
VLOG(3) << "The block desc is nullptr, we can't continue to analyze. "
"Developers need to check whether block_desc is passed in "
"the pass.";
return false;
}
// y'shapes == 2
auto fc_inputs = desc.Inputs();
std::string fc_y = "";
if (fc_inputs.find("Y") != fc_inputs.end()) {
fc_y = "Y";
} else if (fc_inputs.find("W") != fc_inputs.end()) {
fc_y = "W";
} else {
VLOG(3) << " input_y(fc_op) must be Y or W ";
return false;
}
// There is currently no input: Y(weight) more than two dimensions
/*
auto* y_var_desc = block->FindVar(desc.Input(fc_y)[0]);
const auto y_shape = y_var_desc->GetShape();
if (y_shape.size() != 2) {
VLOG(3)
<< " input_y(fc_op)'shapes must be 2, but input_y(fc_op)'shapes =
"
<< y_shape.size();
return false;
}
// y_num_col_dims ==1
if (desc.HasAttr("y_num_col_dims")) {
int y_num_col_dims =
PADDLE_GET_CONST(int, desc.GetAttr("y_num_col_dims"));
if (y_num_col_dims != 1) {
VLOG(3) << " fc_op'y_num_col_dims must be 1, but y_num_col_dims = "
<< y_num_col_dims;
return false;
}
}
*/
int x_num_col_dims =
desc.HasAttr("x_num_col_dims")
? PADDLE_GET_CONST(int, desc.GetAttr("x_num_col_dims"))
: (desc.HasAttr("in_num_col_dims")
? PADDLE_GET_CONST(int, desc.GetAttr("in_num_col_dims"))
: 1);
if (x_num_col_dims < 1) {
VLOG(3) << "fc_op expects x_num_col_dims >= 1, "
"but x_num_col_dims = "
<< x_num_col_dims;
return false;
}
}
if (op_type == "reshape" || op_type == "reshape2") {
if (!desc.HasAttr("shape")) {
return false;
......@@ -2798,9 +2685,7 @@ struct SimpleOpTypeSetTeller : public Teller {
private:
// use this set for no calib int8.
std::unordered_set<std::string> int8_teller_set{
"mul",
"matmul",
"matmul_v2",
"matrix_multiply",
"bmm",
"range",
"conv2d",
......@@ -2869,7 +2754,6 @@ struct SimpleOpTypeSetTeller : public Teller {
"conv2d_transpose",
"depthwise_conv2d_transpose",
"leaky_relu",
"fc",
"shuffle_channel",
"where",
"bitwise_not",
......@@ -2958,9 +2842,7 @@ struct SimpleOpTypeSetTeller : public Teller {
"cumsum"};
std::unordered_set<std::string> teller_set{
"mul",
"matmul",
"matmul_v2",
"matrix_multiply",
"bmm",
"range",
"conv2d",
......@@ -3029,7 +2911,6 @@ struct SimpleOpTypeSetTeller : public Teller {
"conv2d_transpose",
"depthwise_conv2d_transpose",
"leaky_relu",
"fc",
"shuffle_channel",
"where",
"bitwise_not",
......
......@@ -72,31 +72,39 @@ void DynamicShapeTest(bool allow_build_at_runtime) {
LOG(INFO) << "create block desc";
framework::BlockDesc block_desc(&program, block_);
LOG(INFO) << "create fc op";
auto* fc0 = block_desc.AppendOp();
fc0->SetType("fc");
fc0->SetInput("X", std::vector<std::string>({"x"})); // 4 x 1 x 1
fc0->SetInput("Y", std::vector<std::string>({"y"})); // 4 x 6
fc0->SetOutput("Out", std::vector<std::string>({"z"})); // 6 x 1 x 1
LOG(INFO) << "create fc op";
auto* fc1 = block_desc.AppendOp();
fc1->SetType("fc");
fc1->SetInput("X", std::vector<std::string>({"z"}));
fc1->SetInput("Y", std::vector<std::string>({"y0"})); // 6 x 8
fc1->SetOutput("Out", std::vector<std::string>({"z0"})); // 8 x 1 x 1
LOG(INFO) << "create elementwise_add op";
auto* elementwise_add0 = block_desc.AppendOp();
elementwise_add0->SetType("elementwise_add");
elementwise_add0->SetInput("X",
std::vector<std::string>({"x"})); // 2 x 4 x 4 x 4
elementwise_add0->SetInput("Y",
std::vector<std::string>({"y"})); // 1 x 4 x 1 x 1
elementwise_add0->SetOutput(
"Out", std::vector<std::string>({"z"})); // 2 x 4 x 4 x 4
elementwise_add0->SetAttr("axis", static_cast<int32_t>(0));
LOG(INFO) << "create elementwise_add op";
auto* elementwise_add1 = block_desc.AppendOp();
elementwise_add1->SetType("elementwise_add");
elementwise_add1->SetInput("X",
std::vector<std::string>({"z"})); // 2 x 4 x 4 x 4
elementwise_add1->SetInput(
"Y", std::vector<std::string>({"y0"})); // 1 x 4 x 4 x 4
elementwise_add1->SetOutput(
"Out", std::vector<std::string>({"z0"})); // 2 x 4 x 4 x 4
elementwise_add1->SetAttr("axis", static_cast<int32_t>(0));
// Set inputs' variable shape in BlockDesc
// the batch size is 2, so the dims of 'x' is {2, 4, 1, 1}
AddTensorToBlockDesc(block_, "x", std::vector<int64_t>({2, 4, 1, 1}));
AddTensorToBlockDesc(block_, "y", std::vector<int64_t>({4, 6}));
AddTensorToBlockDesc(block_, "y0", std::vector<int64_t>({6, 8}));
AddTensorToBlockDesc(block_, "z", std::vector<int64_t>({2, 6}));
AddTensorToBlockDesc(block_, "z0", std::vector<int64_t>({8, 1, 1}));
// the batch size is 2, so the dims of 'x' are {2, 4, 4, 4}
AddTensorToBlockDesc(block_, "x", std::vector<int64_t>({2, 4, 4, 4}));
AddTensorToBlockDesc(block_, "y", std::vector<int64_t>({1, 4, 1, 1}));
AddTensorToBlockDesc(block_, "y0", std::vector<int64_t>({1, 4, 4, 4}));
AddTensorToBlockDesc(block_, "z", std::vector<int64_t>({2, 4, 4, 4}));
AddTensorToBlockDesc(block_, "z0", std::vector<int64_t>({2, 4, 4, 4}));
// It is weird, need to copy manually.
*block_->add_ops() = *fc0->Proto();
*block_->add_ops() = *fc1->Proto();
*block_->add_ops() = *elementwise_add0->Proto();
*block_->add_ops() = *elementwise_add1->Proto();
ASSERT_EQ(block_->ops_size(), 2);
......@@ -132,9 +140,9 @@ void DynamicShapeTest(bool allow_build_at_runtime) {
engine_op_desc.SetAttr("use_static_engine", true);
engine_op_desc.SetAttr("dynamic_shape_names", std::vector<std::string>{"x"});
engine_op_desc.SetAttr("dynamic_shape_lens", std::vector<int>{4});
engine_op_desc.SetAttr("min_input_shape", std::vector<int>{1, 4, 1, 1});
engine_op_desc.SetAttr("max_input_shape", std::vector<int>{2, 4, 1, 1});
engine_op_desc.SetAttr("opt_input_shape", std::vector<int>{2, 4, 1, 1});
engine_op_desc.SetAttr("min_input_shape", std::vector<int>{1, 1, 1, 1});
engine_op_desc.SetAttr("max_input_shape", std::vector<int>{16, 16, 16, 16});
engine_op_desc.SetAttr("opt_input_shape", std::vector<int>{2, 4, 4, 4});
engine_op_desc.SetAttr("model_precision",
static_cast<int>(phi::DataType::FLOAT32));
......@@ -151,26 +159,22 @@ void DynamicShapeTest(bool allow_build_at_runtime) {
ctx.PartialInitWithAllocator();
// Prepare variables.
if (allow_build_at_runtime)
CreateCUDATensor(&scope, "x", std::vector<int64_t>({3, 4, 1, 1}));
CreateCUDATensor(&scope, "x", std::vector<int64_t>({32, 4, 4, 4}));
else
CreateCUDATensor(&scope, "x", std::vector<int64_t>({2, 4, 1, 1}));
CreateCUDATensor(&scope, "y", std::vector<int64_t>({4, 6}));
CreateCUDATensor(&scope, "x", std::vector<int64_t>({2, 4, 4, 4}));
CreateCUDATensor(&scope, "y", std::vector<int64_t>({1, 4, 1, 1}));
CreateCUDATensor(&scope, "y0", std::vector<int64_t>({6, 8}));
CreateCUDATensor(&scope, "z0", std::vector<int64_t>({2, 8}));
CreateCUDATensor(&scope, "y0", std::vector<int64_t>({1, 4, 4, 4}));
CreateCUDATensor(&scope, "z0", std::vector<int64_t>({2, 4, 4, 4}));
// Execute them.
LOG(INFO) << "engine_op run";
inference::tensorrt::OpTeller::Global().SetOpConverterType(
"fc", inference::tensorrt::OpConverterType::Default);
"elementwise_add", inference::tensorrt::OpConverterType::Default);
engine_op->Run(scope, place);
}
TEST(TensorRTEngineOp, manual) {
DynamicShapeTest(false);
DynamicShapeTest(true);
}
TEST(TensorRTEngineOp, manual) { DynamicShapeTest(false); }
void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) {
framework::ProgramDesc program;
framework::Scope scope;
......@@ -197,12 +201,12 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) {
const shape_t& x_shape,
const shape_t& y_shape,
const shape_t& z_shape) {
LOG(INFO) << "create fc op";
auto* fc = block_desc.AppendOp();
fc->SetType("mul");
fc->SetInput("X", std::vector<std::string>({x_name}));
fc->SetInput("Y", std::vector<std::string>({y_name}));
fc->SetOutput("Out", std::vector<std::string>({z_name}));
LOG(INFO) << "create matrix_multiply op";
auto* matrix_multiply = block_desc.AppendOp();
matrix_multiply->SetType("matrix_multiply");
matrix_multiply->SetInput("X", std::vector<std::string>({x_name}));
matrix_multiply->SetInput("Y", std::vector<std::string>({y_name}));
matrix_multiply->SetOutput("Out", std::vector<std::string>({z_name}));
// Set inputs' variable shape in BlockDesc
if (!x_created) {
......@@ -222,7 +226,7 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) {
CreateCUDATensor(&scope, z_name, std::vector<int64_t>(z_shape));
// It is weird, need to copy manually.
*block_->add_ops() = *fc->Proto();
*block_->add_ops() = *matrix_multiply->Proto();
};
// Test with 4 layer FC
......@@ -293,9 +297,9 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) {
}
// Test with a larger FC layer.
// TEST(TensorRTEngineOp, fc) { Execute(40, 28, 28); }
// TEST(TensorRTEngineOp, matrix_multiply) { Execute(40, 28, 28); }
} // namespace operators
} // namespace paddle
USE_TRT_CONVERTER(fc)
USE_TRT_CONVERTER(elementwise_add_weight)
......@@ -236,8 +236,6 @@ if(WITH_GPU AND TENSORRT_FOUND)
set_tests_properties(test_reshape2_matmul_fuse_pass PROPERTIES TIMEOUT 240)
set_tests_properties(test_preln_layernorm_x_fuse_pass PROPERTIES TIMEOUT
240)
set_tests_properties(test_trt_flatten2_matmul_fuse_pass PROPERTIES TIMEOUT
240)
set_tests_properties(test_shuffle_channel_detect_pass PROPERTIES TIMEOUT
120)
if(WIN32)
......
......@@ -19,8 +19,6 @@ import numpy as np
from auto_scan_test import IgnoreReasons, PassAutoScanTest
from program_config import OpConfig, ProgramConfig, TensorConfig
import paddle.inference as paddle_infer
class TestFcFusePass(PassAutoScanTest):
r"""
......@@ -45,14 +43,6 @@ class TestFcFusePass(PassAutoScanTest):
# trt static_shape
config = self.create_trt_inference_config()
config.enable_tensorrt_engine(
max_batch_size=8,
workspace_size=102400,
min_subgraph_size=0,
precision_mode=paddle_infer.PrecisionType.Float32,
use_static=False,
use_calib_mode=False,
)
yield config, ['fc'], (1e-5, 1e-5)
def add_ignore_pass_case(self):
......
......@@ -54,7 +54,10 @@ class TestMultiheadMatmulRoformerFusePass(PassAutoScanTest):
"sin_input": [1, 12, 128, 64],
},
)
yield config, ["multihead_matmul_roformer", "matmul"], (1e-2, 1e-3)
yield config, ["multihead_matmul_roformer", "matrix_multiply"], (
1e-2,
1e-3,
)
def sample_program_config(self, draw):
def generate_mul_input():
......
......@@ -19,7 +19,7 @@ from typing import List
import numpy as np
from program_config import ProgramConfig, TensorConfig
from trt_layer_auto_scan_test import TrtLayerAutoScanTest
from trt_layer_auto_scan_test import SkipReasons, TrtLayerAutoScanTest
import paddle.inference as paddle_infer
......@@ -91,17 +91,14 @@ class TrtConvertMatmulTest_dynamic(TrtLayerAutoScanTest):
]
# The output has little diff between gpu and trt in CI-Windows-Inference
tol_fp32 = 1e-5
tol_half = 1e-5
if os.name == 'nt':
tol_fp32 = 1e-3
tol_half = 1e-3
tol_fp32 = 1e-3
tol_half = 1e-3
# for dynamic_shape
generate_dynamic_shape(attrs)
self.trt_param.precision = paddle_infer.PrecisionType.Float32
yield self.create_inference_config(), (1, 3), tol_fp32
yield self.create_inference_config(), (1, 3), (tol_fp32, tol_fp32)
self.trt_param.precision = paddle_infer.PrecisionType.Half
yield self.create_inference_config(), (1, 3), tol_half
yield self.create_inference_config(), (1, 3), (tol_half, tol_half)
def add_skip_trt_case(self):
pass
......@@ -185,9 +182,9 @@ class TrtConvertMatmulTest_dynamic2(TrtLayerAutoScanTest):
# for dynamic_shape
generate_dynamic_shape(attrs)
self.trt_param.precision = paddle_infer.PrecisionType.Float32
yield self.create_inference_config(), (1, 3), tol_fp32
yield self.create_inference_config(), (1, 3), (tol_fp32, tol_fp32)
self.trt_param.precision = paddle_infer.PrecisionType.Half
yield self.create_inference_config(), (1, 3), tol_half
yield self.create_inference_config(), (1, 3), (tol_half, tol_half)
def add_skip_trt_case(self):
pass
......@@ -319,7 +316,20 @@ class TrtConvertMatmulTest_dynamic3(TrtLayerAutoScanTest):
yield self.create_inference_config(), (1, 3), 1e-3
def add_skip_trt_case(self):
pass
def teller1(program_config, predictor_config):
inputs = program_config.inputs
if (
len(inputs['input1_data'].shape) == 1
and len(inputs['input2_data'].shape) == 1
):
return True
return False
self.add_skip_case(
teller1,
SkipReasons.TRT_NOT_IMPLEMENTED,
"If both tensors are one-dimensional, the dot product result is obtained(Out.rank = 0)",
)
def test(self):
self.add_skip_trt_case()
......
......@@ -18,7 +18,7 @@ from typing import List
import numpy as np
from program_config import ProgramConfig, TensorConfig
from trt_layer_auto_scan_test import SkipReasons, TrtLayerAutoScanTest
from trt_layer_auto_scan_test import TrtLayerAutoScanTest
import paddle.inference as paddle_infer
......@@ -29,18 +29,18 @@ class TrtConvertMultiHeadMatmulTest(TrtLayerAutoScanTest):
def sample_program_configs(self):
def generate_input1(batch, dim1):
return np.random.random((batch, dim1, 768)).astype(np.float32)
return np.full((batch, dim1, 768), 1).astype(np.float32)
def generate_input2(shape):
return np.random.random(shape).astype(np.float32)
return np.full(shape, 1).astype(np.float32)
def generate_weight1():
return np.random.random((768, 768)).astype(np.float32)
return np.full((768, 768), 0.1).astype(np.float32)
def generate_weight2():
return np.random.random(768).astype(np.float32)
return np.full((768), 0.1).astype(np.float32)
for batch in [1, 2, 4]:
for batch in [1, 4]:
self.batch = batch
for reshape_shape in [[0, 0, 12, 64]]:
for dim1 in [128]:
......@@ -371,80 +371,33 @@ class TrtConvertMultiHeadMatmulTest(TrtLayerAutoScanTest):
program_config.ops[i].attrs for i in range(len(program_config.ops))
]
# for static_shape
clear_dynamic_shape()
self.trt_param.precision = paddle_infer.PrecisionType.Float32
self.trt_param.workspace_size = 2013265920
yield self.create_inference_config(), (1, 4), (1e-5, 1e-5)
self.trt_param.precision = paddle_infer.PrecisionType.Half
yield self.create_inference_config(), (1, 4), (1e-3, 1e-3)
# for dynamic_shape
generate_dynamic_shape(attrs)
self.trt_param.precision = paddle_infer.PrecisionType.Float32
self.trt_param.workspace_size = 2013265920
yield self.create_inference_config(), (1, 3), (1e-5, 1e-4)
self.trt_param.precision = paddle_infer.PrecisionType.Half
yield self.create_inference_config(), (1, 3), (1e-3, 1e-3)
def add_skip_trt_case(self):
def teller1(program_config, predictor_config):
if self.trt_param.precision == paddle_infer.PrecisionType.Half:
return True
return False
self.add_skip_case(
teller1,
SkipReasons.TRT_NOT_IMPLEMENTED,
"The output has diff between gpu and trt in fp16 mode.",
)
def teller2(program_config, predictor_config):
if (
self.trt_param.precision == paddle_infer.PrecisionType.Float32
and len(self.dynamic_shape.min_input_shape) != 0
and self.batch > 2
):
return True
return False
self.add_skip_case(
teller2,
SkipReasons.TRT_NOT_IMPLEMENTED,
"The output has diff between gpu and trt when dynamic fp32 mode and batch size > 2.",
)
def teller3(program_config, predictor_config):
if self.trt_param.precision == paddle_infer.PrecisionType.Int8:
return True
return False
self.add_skip_case(
teller3,
SkipReasons.TRT_NOT_IMPLEMENTED,
"The output has diff between gpu and trt in int8 mode.",
)
yield self.create_inference_config(), (1, 3), (1e-3, 1e-2)
def test(self):
self.add_skip_trt_case()
self.run_test()
class TrtConvertMultiHeadMatmulTestInt8(TrtConvertMultiHeadMatmulTest):
def sample_program_configs(self):
def generate_input1(batch, dim1):
return np.random.random((batch, dim1, 768)).astype(np.float32)
return np.full((batch, dim1, 768), 1).astype(np.float32)
def generate_input2(shape):
return np.random.random(shape).astype(np.float32)
return np.full(shape, 1).astype(np.float32)
def generate_weight1():
return np.random.random((768, 768)).astype(np.float32)
return np.full((768, 768), 0.1).astype(np.float32)
def generate_weight2():
return np.random.random(768).astype(np.float32)
return np.full((768), 0.1).astype(np.float32)
for batch in [1, 2, 4]:
for batch in [4]:
self.batch = batch
for reshape_shape in [[0, 0, 12, 64]]:
for dim1 in [128]:
......@@ -776,15 +729,15 @@ class TrtConvertVitToMultiHeadMatmulTest(TrtLayerAutoScanTest):
def sample_program_configs(self):
def generate_input1(batch, length):
return np.zeros((batch, length, 768), dtype=np.float32)
return np.full((batch, length, 768), 0.1).astype(np.float32)
def generate_weight1():
return np.random.rand(768, 2304).astype(np.float32)
return np.full((768, 2304), 0.1).astype(np.float32)
def generate_weight2():
return np.random.rand(2304).astype(np.float32)
return np.full((2304), 0.1).astype(np.float32)
for batch in [2, 4]:
for batch in [4]:
self.batch = batch
for length in [197]:
self.length = length
......@@ -989,17 +942,6 @@ class TrtConvertVitToMultiHeadMatmulTest(TrtLayerAutoScanTest):
"input_data1": [1, 197, 768],
}
def generate_static_shape(attrs):
self.dynamic_shape.min_input_shape = {
"input_data1": [1, 197, 768],
}
self.dynamic_shape.max_input_shape = {
"input_data1": [16, 197, 768],
}
self.dynamic_shape.opt_input_shape = {
"input_data1": [1, 197, 768],
}
def clear_dynamic_shape():
self.dynamic_shape.max_input_shape = {}
self.dynamic_shape.min_input_shape = {}
......@@ -1026,7 +968,7 @@ class TrtConvertVitToMultiHeadMatmulTest(TrtLayerAutoScanTest):
self.trt_param.precision = paddle_infer.PrecisionType.Half
yield self.create_inference_config(), generate_trt_nodes_num(), (
1e-3,
1e-3,
2e-2,
)
self.trt_param.precision = paddle_infer.PrecisionType.Float32
yield self.create_inference_config(), generate_trt_nodes_num(), (
......@@ -1034,35 +976,7 @@ class TrtConvertVitToMultiHeadMatmulTest(TrtLayerAutoScanTest):
1e-5,
)
# for static_shape
clear_dynamic_shape()
generate_static_shape(attrs)
self.trt_param.workspace_size = 2013265920
self.trt_param.precision = paddle_infer.PrecisionType.Half
yield self.create_inference_config(), generate_trt_nodes_num(), (
1e-3,
1e-3,
)
self.trt_param.precision = paddle_infer.PrecisionType.Float32
yield self.create_inference_config(), generate_trt_nodes_num(), (
1e-5,
1e-5,
)
def add_skip_trt_case(self):
def teller1(program_config, predictor_config):
if self.trt_param.precision == paddle_infer.PrecisionType.Half:
return True
return False
self.add_skip_case(
teller1,
SkipReasons.TRT_NOT_IMPLEMENTED,
"The output has diff between gpu and trt in fp16 mode.",
)
def test(self):
self.add_skip_trt_case()
self.run_test()
......@@ -1072,19 +986,19 @@ class TrtConvertMultiHeadMatmulTest_biasqk_seqseq(TrtLayerAutoScanTest):
def sample_program_configs(self):
def generate_input1(batch, dim1):
return np.random.random((batch, dim1, 768)).astype(np.float32)
return np.full((batch, dim1, 768), 1).astype(np.float32)
def generate_input2(shape):
return np.random.random(shape).astype(np.float32)
return np.full(shape, 1).astype(np.float32)
def generate_weight1():
return np.random.random((768, 768)).astype(np.float32)
return np.full((768, 768), 0.1).astype(np.float32)
def generate_weight2():
return np.random.random(768).astype(np.float32)
return np.full((768), 0.1).astype(np.float32)
def generate_weight3():
return np.random.random((768, 768)).astype(np.float32)
return np.full((768, 768), 0.1).astype(np.float32)
for batch in [2]:
self.batch = batch
......@@ -1423,48 +1337,9 @@ class TrtConvertMultiHeadMatmulTest_biasqk_seqseq(TrtLayerAutoScanTest):
self.trt_param.workspace_size = 2013265920
yield self.create_inference_config(), (1, 3), (1e-5, 1e-4)
self.trt_param.precision = paddle_infer.PrecisionType.Half
yield self.create_inference_config(), (1, 3), (1e-3, 1e-3)
def add_skip_trt_case(self):
def teller1(program_config, predictor_config):
if self.trt_param.precision == paddle_infer.PrecisionType.Half:
return True
return False
self.add_skip_case(
teller1,
SkipReasons.TRT_NOT_IMPLEMENTED,
"The output has diff between gpu and trt in fp16 mode.",
)
def teller2(program_config, predictor_config):
if (
self.trt_param.precision == paddle_infer.PrecisionType.Float32
and len(self.dynamic_shape.min_input_shape) != 0
and self.batch > 2
):
return True
return False
self.add_skip_case(
teller2,
SkipReasons.TRT_NOT_IMPLEMENTED,
"The output has diff between gpu and trt when dynamic fp32 mode and batch size > 2.",
)
def teller3(program_config, predictor_config):
if self.trt_param.precision == paddle_infer.PrecisionType.Int8:
return True
return False
self.add_skip_case(
teller3,
SkipReasons.TRT_NOT_IMPLEMENTED,
"The output has diff between gpu and trt in int8 mode.",
)
yield self.create_inference_config(), (1, 3), (1e-3, 1e-2)
def test(self):
self.add_skip_trt_case()
self.run_test()
......
......@@ -50,7 +50,7 @@ class FCFusePassTRTTest(InferencePassTest):
if core.is_compiled_with_cuda():
use_gpu.append(True)
for i in range(len(use_gpu)):
self.check_output_with_option(use_gpu[i])
self.check_output_with_option(use_gpu[i], atol=1e-4, rtol=1e-3)
class FCFusePassTRTStaticDims4Cols1Test(InferencePassTest):
......@@ -78,7 +78,7 @@ class FCFusePassTRTStaticDims4Cols1Test(InferencePassTest):
if core.is_compiled_with_cuda():
use_gpu.append(True)
for i in range(len(use_gpu)):
self.check_output_with_option(use_gpu[i])
self.check_output_with_option(use_gpu[i], atol=1e-4, rtol=1e-3)
class FCFusePassTRTStaticDims4Cols2Test(InferencePassTest):
......@@ -106,7 +106,7 @@ class FCFusePassTRTStaticDims4Cols2Test(InferencePassTest):
if core.is_compiled_with_cuda():
use_gpu.append(True)
for i in range(len(use_gpu)):
self.check_output_with_option(use_gpu[i])
self.check_output_with_option(use_gpu[i], atol=1e-4, rtol=1e-3)
class FCFusePassTRTDynamicDims2Test(InferencePassTest):
......@@ -140,7 +140,7 @@ class FCFusePassTRTDynamicDims2Test(InferencePassTest):
if core.is_compiled_with_cuda():
use_gpu.append(True)
for i in range(len(use_gpu)):
self.check_output_with_option(use_gpu[i])
self.check_output_with_option(use_gpu[i], atol=1e-4, rtol=1e-3)
class FCFusePassTRTDynamicDims3Cols1Test(InferencePassTest):
......@@ -174,7 +174,7 @@ class FCFusePassTRTDynamicDims3Cols1Test(InferencePassTest):
if core.is_compiled_with_cuda():
use_gpu.append(True)
for i in range(len(use_gpu)):
self.check_output_with_option(use_gpu[i])
self.check_output_with_option(use_gpu[i], atol=1e-4, rtol=1e-3)
class FCFusePassTRTDynamicDims3Cols2Test(InferencePassTest):
......@@ -208,7 +208,7 @@ class FCFusePassTRTDynamicDims3Cols2Test(InferencePassTest):
if core.is_compiled_with_cuda():
use_gpu.append(True)
for i in range(len(use_gpu)):
self.check_output_with_option(use_gpu[i])
self.check_output_with_option(use_gpu[i], atol=1e-4, rtol=1e-3)
class FCFusePassTRTDynamicDims4Cols1Test(InferencePassTest):
......@@ -244,7 +244,7 @@ class FCFusePassTRTDynamicDims4Cols1Test(InferencePassTest):
if core.is_compiled_with_cuda():
use_gpu.append(True)
for i in range(len(use_gpu)):
self.check_output_with_option(use_gpu[i])
self.check_output_with_option(use_gpu[i], atol=1e-4, rtol=1e-3)
class FCFusePassTRTDynamicDims4Cols2Test(InferencePassTest):
......@@ -280,7 +280,7 @@ class FCFusePassTRTDynamicDims4Cols2Test(InferencePassTest):
if core.is_compiled_with_cuda():
use_gpu.append(True)
for i in range(len(use_gpu)):
self.check_output_with_option(use_gpu[i])
self.check_output_with_option(use_gpu[i], atol=1e-4, rtol=1e-3)
class FCFusePassTRTDynamicDims4Cols3Test(InferencePassTest):
......@@ -316,7 +316,7 @@ class FCFusePassTRTDynamicDims4Cols3Test(InferencePassTest):
if core.is_compiled_with_cuda():
use_gpu.append(True)
for i in range(len(use_gpu)):
self.check_output_with_option(use_gpu[i])
self.check_output_with_option(use_gpu[i], atol=1e-4, rtol=1e-3)
if __name__ == "__main__":
......
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import hypothesis.strategies as st
from auto_scan_test import IgnoreReasons, PassAutoScanTest
from program_config import OpConfig, ProgramConfig, TensorConfig
import paddle.inference as paddle_infer
class TestFlatten2MatmulFusePass(PassAutoScanTest):
r"""
    x_var
      |
   flatten2
       \
   flatten2_out_var    y_var
            \           /
              matmul       bias_var
                  \         /
               elementwise_add
"""
def sample_predictor_configs(self, program_config):
# TRT
config = self.create_trt_inference_config()
config.enable_tensorrt_engine(
max_batch_size=10,
workspace_size=102400,
min_subgraph_size=0,
precision_mode=paddle_infer.PrecisionType.Float32,
use_static=False,
use_calib_mode=False,
)
yield config, ['mul', 'elementwise_add'], (1e-4, 1e-1)
def add_ignore_pass_case(self):
# Here we put some skip rules to avoid known bugs
def teller1(program_config, predictor_config):
y_shape = list(program_config.weights["matmul_y"].shape)
bias_shape = program_config.weights["bias"].shape
axis = program_config.ops[2].attrs["axis"]
# bias should be [mul_y_shape[-1]]
if axis == 0 or bias_shape[0] != y_shape[1] or len(bias_shape) != 1:
return True
return False
self.add_ignore_check_case(
teller1,
IgnoreReasons.PASS_ACCURACY_ERROR,
"The pass error on TRT while shape of bias is not [out_size].",
)
def sample_program_config(self, draw):
# 1. Generate shape and attr of flatten2
x_shape = draw(
st.lists(
st.integers(min_value=1, max_value=10), min_size=4, max_size=4
)
)
# [a, b, c, d] => [a, b*c*d]
flatten_axis = 1
flatten_shape = [x_shape[0], x_shape[1] * x_shape[2] * x_shape[3]]
# 2. Generate attr:transpose_X/transpose_Y/alpha of matmul
alpha = 1.0
transpose_X = False
transpose_Y = False
# 3. Generate legal shape of input:Y of matmul
y_shape = draw(
st.lists(
st.integers(min_value=1, max_value=8), min_size=2, max_size=2
)
)
y_shape[0] = flatten_shape[1]
# 4. Generate legal attr:axis of elementwise_add
axis = draw(st.integers(min_value=-1, max_value=1))
if axis == 0:
axis = -1
bias_shape = [
y_shape[1],
]
flatten2_op = OpConfig(
"flatten2",
inputs={
"X": ["flatten2_x"],
},
axis=flatten_axis,
outputs={"Out": ["flatten2_out"], "XShape": ["xshape"]},
)
matmul_op = OpConfig(
"matmul",
inputs={"X": ["flatten2_out"], "Y": ["matmul_y"]},
outputs={"Out": ["matmul_out"]},
alpha=alpha,
transpose_X=transpose_X,
transpose_Y=transpose_Y,
)
add_op = OpConfig(
"elementwise_add",
inputs={"X": ["matmul_out"], "Y": ["bias"]},
outputs={"Out": ["add_out"]},
axis=axis,
)
ops = [flatten2_op, matmul_op, add_op]
program_config = ProgramConfig(
ops=ops,
weights={
"matmul_y": TensorConfig(shape=y_shape),
"bias": TensorConfig(shape=bias_shape),
},
inputs={
"flatten2_x": TensorConfig(shape=x_shape),
},
outputs=ops[-1].outputs["Out"],
)
return program_config
def test(self):
self.run_and_statis(
quant=False,
max_examples=25,
passes=["trt_flatten2_matmul_fuse_pass"],
)
if __name__ == "__main__":
unittest.main()
......@@ -79,6 +79,14 @@ class TensorRTMatMulQuantDequantDims3Test(QuantDequantTest):
self.trt_parameters = TensorRTMatMulQuantDequantDims3Test.TensorRTParam(
1 << 30, 32, 0, AnalysisConfig.Precision.Int8, False, False
)
self.dynamic_shape_params = (
TensorRTMatMulQuantDequantDims3Test.DynamicShapeParam(
{'data': [1, 28, 28]},
{'data': [4, 28, 28]},
{'data': [3, 28, 28]},
False,
)
)
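The three dicts form the usual TRT optimization profile (min/max/opt shapes for the 'data' input; the trailing False leaves the trt-plugin-fp16 switch off). Outside this test harness, the same profile would be handed to the predictor roughly as follows (a sketch against the public paddle.inference API, not code from this PR; the model paths are placeholders):

import paddle.inference as paddle_infer

config = paddle_infer.Config("model.pdmodel", "model.pdiparams")
config.enable_use_gpu(1000, 0)
config.enable_tensorrt_engine(precision_mode=paddle_infer.PrecisionType.Int8)
config.set_trt_dynamic_shape_info(
    {"data": [1, 28, 28]},  # min input shape
    {"data": [4, 28, 28]},  # max input shape
    {"data": [3, 28, 28]},  # optimal input shape
)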
self.activation_quantize_type = 'moving_average_abs_max'
self.weight_quantize_type = 'channel_wise_abs_max'
......@@ -137,7 +145,7 @@ class TensorRTMatMulQuantDequantDims4Test(QuantDequantTest):
self.label = paddle.static.data(
name='label', shape=[1, 1], dtype='int64'
)
reshape_out = paddle.reshape(self.data, shape=[1, 4, 14, 14])
reshape_out = paddle.reshape(self.data, shape=[0, 4, 14, 14])
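# (Editorial note, not part of the diff: in paddle.reshape a 0 in the target
# shape keeps the corresponding input dimension, so [0, 4, 14, 14] preserves
# the actual batch size instead of hard-coding it to 1, which is what the
# dynamic-shape profile added above requires.)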
matmul_out = paddle.matmul(
x=reshape_out,
y=reshape_out,
......@@ -183,6 +191,14 @@ class TensorRTMatMulQuantDequantDims4Test(QuantDequantTest):
self.trt_parameters = TensorRTMatMulQuantDequantDims4Test.TensorRTParam(
1 << 30, 32, 0, AnalysisConfig.Precision.Int8, False, False
)
self.dynamic_shape_params = (
TensorRTMatMulQuantDequantDims4Test.DynamicShapeParam(
{'data': [1, 28, 28]},
{'data': [4, 28, 28]},
{'data': [3, 28, 28]},
False,
)
)
self.activation_quantize_type = 'moving_average_abs_max'
self.weight_quantize_type = 'channel_wise_abs_max'
......