diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt
index af57f3e2bbbe5d0e1e663725e5424a8a4475ac2f..bcd016a43e9b4139e4b5c46f1c0c798c5d100e79 100755
--- a/paddle/fluid/framework/ir/CMakeLists.txt
+++ b/paddle/fluid/framework/ir/CMakeLists.txt
@@ -267,6 +267,8 @@ if(WITH_XPU)
   pass_library(fused_multi_transformer_xpu_pass inference DIR xpu DEPS
                ${XPU_PASS_DEPS})
   pass_library(stack_fuse_pass inference DIR xpu DEPS ${XPU_PASS_DEPS})
+  pass_library(duplicated_transpose_fuse_pass inference DIR xpu DEPS
+               ${XPU_PASS_DEPS})
   pass_library(fused_multi_transformer_cachekv_layout_trans_pass inference
                DIR xpu DEPS ${XPU_PASS_DEPS})
   pass_library(add_activation_xpu_fuse_pass inference DIR xpu DEPS
diff --git a/paddle/fluid/framework/ir/xpu/duplicated_transpose_fuse_pass.cc b/paddle/fluid/framework/ir/xpu/duplicated_transpose_fuse_pass.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a729c386906ac11a57166526a961d1fe153d9964
--- /dev/null
+++ b/paddle/fluid/framework/ir/xpu/duplicated_transpose_fuse_pass.cc
@@ -0,0 +1,151 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/ir/xpu/duplicated_transpose_fuse_pass.h"
+#include <string>
+
+#include "glog/logging.h"
+
+#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
+#include "paddle/fluid/framework/ir/xpu/pass_utils.h"
+#include "paddle/fluid/framework/op_version_registry.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+namespace patterns {
+
+struct DuplicatedTransposeFusePattern : public PatternBase {
+  DuplicatedTransposeFusePattern(PDPattern* pattern,
+                                 const std::string& name_scope);
+
+  // declare operator node's name
+  PATTERN_DECL_NODE(transpose_1);
+  PATTERN_DECL_NODE(transpose_2);
+  // declare variable node's name
+  PATTERN_DECL_NODE(x);
+  PATTERN_DECL_NODE(transpose_1_out);
+  PATTERN_DECL_NODE(transpose_2_out);
+};
+
+DuplicatedTransposeFusePattern::DuplicatedTransposeFusePattern(
+    PDPattern* pattern, const std::string& name_scope)
+    : PatternBase(pattern, name_scope, name_scope) {
+  auto* x = pattern->NewNode(x_repr())->assert_is_op_input("transpose2", "X");
+  auto* transpose_1 =
+      pattern->NewNode(transpose_1_repr())->assert_is_op("transpose2");
+  auto* transpose_1_out = pattern->NewNode(transpose_1_out_repr())
+                              ->assert_is_op_output("transpose2", "Out")
+                              ->assert_has_n_outputs(1)
+                              ->assert_is_op_input("transpose2", "X");
+  transpose_1->LinksFrom({x}).LinksTo({transpose_1_out});
+  auto* transpose_2 =
+      pattern->NewNode(transpose_2_repr())->assert_is_op("transpose2");
+  auto* transpose_2_out = pattern->NewNode(transpose_2_out_repr())
+                              ->assert_is_op_output("transpose2", "Out")
+                              ->assert_has_n_outputs(1);
+  transpose_2->LinksFrom({transpose_1_out}).LinksTo({transpose_2_out});
+}
+
+}  // namespace patterns
+
+void DuplicatedTransposeFusePass::DuplicatedTranspose(ir::Graph* graph) const {
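+  // Match a transpose2 -> transpose2 chain whose intermediate tensor feeds
+  // only the second transpose, then rewrite it as a single transpose2 whose
+  // axis attribute is the composition of the two permutations:
+  // out[i] = in[axis1[axis2[i]]]. For example, axis1 = [0, 2, 1, 3] followed
+  // by axis2 = [0, 3, 2, 1] equals one transpose2 with axis = [0, 3, 1, 2].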
+  GraphPatternDetector gpd;
+
+  patterns::DuplicatedTransposeFusePattern pattern(gpd.mutable_pattern(),
+                                                   name_scope_);
+
+  int delete_counts = 0;
+  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
+                     Graph* graph) {
+    VLOG(4) << "handle DuplicatedTransposeFusePass";
+    // declare operator node's name
+    GET_IR_NODE(transpose_1);
+    GET_IR_NODE(transpose_2);
+    // declare variable node's name
+    GET_IR_NODE(x);
+    GET_IR_NODE(transpose_1_out);
+    GET_IR_NODE(transpose_2_out);
+
+    auto* block = transpose_1->Op()->Block();
+    // Generate a replacement transpose2 op.
+    framework::OpDesc transpose_op_desc(block);
+    transpose_op_desc.SetType("transpose2");
+    transpose_op_desc.SetInput("X", {x->Name()});
+    auto axis1 = transpose_1->Op()->GetAttrIfExists<std::vector<int>>("axis");
+    if (axis1.empty()) return;
+    auto axis2 = transpose_2->Op()->GetAttrIfExists<std::vector<int>>("axis");
+    if (axis2.empty()) return;
+    // Compose the two permutations into one.
+    for (size_t i = 0; i < axis2.size(); i++) {
+      axis2[i] = axis1[axis2[i]];
+    }
+    transpose_op_desc.SetAttr("axis", axis2);
+    transpose_op_desc.SetOutput("Out", {transpose_2_out->Name()});
+
+    auto* transpose = graph->CreateOpNode(&transpose_op_desc);
+
+    IR_NODE_LINK_TO(x, transpose);
+    IR_NODE_LINK_TO(transpose, transpose_2_out);
+    // delete useless nodes
+    std::unordered_set<const Node*> delete_nodes = {
+        transpose_1, transpose_2, transpose_1_out};
+    GraphSafeRemoveNodes(graph, delete_nodes);
+    delete_counts++;
+  };
+  gpd(graph, handler);
+  delete_op_count += delete_counts;
+
+  if (delete_counts > 0) {
+    LOG(INFO) << "--- delete " << delete_counts << " repeated transpose2 ops";
+  }
+}
+
+void DuplicatedTransposeFusePass::ApplyImpl(ir::Graph* graph) const {
+  PADDLE_ENFORCE_NOT_NULL(
+      graph,
+      platform::errors::PreconditionNotMet("graph should not be null."));
+  Init(name_scope_, graph);
+  int repeat_time = 0;
+  int total_delete_op_count = 0;
+  // This pass needs to run repeatedly until no fusible transpose pair is
+  // left in the graph, so that chains longer than two ops also collapse.
+  while (true) {
+    delete_op_count = 0;
+    DuplicatedTranspose(graph);
+    LOG(INFO) << "Round " << repeat_time++
+              << ": delete op counts: " << delete_op_count;
+    total_delete_op_count += delete_op_count;
+    if (delete_op_count == 0) {
+      LOG(INFO) << "--- no nodes need to be deleted --- break";
+      break;  // No more nodes to delete.
+    }
+  }
+  LOG(INFO) << "Total delete op counts: " << total_delete_op_count;
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(duplicated_transpose_fuse_pass,
+              paddle::framework::ir::DuplicatedTransposeFusePass);
+
+REGISTER_PASS_CAPABILITY(duplicated_transpose_fuse_pass)
+    .AddCombination(
+        paddle::framework::compatible::OpVersionComparatorCombination().EQ(
+            "transpose2", 0));
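The rewrite above relies on the identity transpose(transpose(x, axis1), axis2) == transpose(x, composed) with composed[i] = axis1[axis2[i]]. A minimal NumPy sketch of that identity (illustrative only, not part of the patch; np.transpose stands in for transpose2):

    import numpy as np

    x = np.random.rand(2, 3, 4, 5)
    axis1, axis2 = [0, 2, 1, 3], [0, 3, 2, 1]
    # Same composition rule as the pass: composed[i] = axis1[axis2[i]]
    composed = [axis1[i] for i in axis2]
    assert np.array_equal(x.transpose(axis1).transpose(axis2),
                          x.transpose(composed))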
diff --git a/paddle/fluid/framework/ir/xpu/duplicated_transpose_fuse_pass.h b/paddle/fluid/framework/ir/xpu/duplicated_transpose_fuse_pass.h
new file mode 100644
index 0000000000000000000000000000000000000000..4a26620d4c58700118cb4c214141271c63b30380
--- /dev/null
+++ b/paddle/fluid/framework/ir/xpu/duplicated_transpose_fuse_pass.h
@@ -0,0 +1,64 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <string>
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
+#include "paddle/fluid/framework/ir/pass.h"
+
+namespace phi {
+class DenseTensor;
+}  // namespace phi
+
+namespace paddle {
+namespace framework {
+class Scope;
+}  // namespace framework
+}  // namespace paddle
+
+namespace paddle {
+namespace framework {
+namespace ir {
+/*
+Origin subgraph:
+          x
+          |
+      transpose
+          |
+      transpose
+          |
+          y
+
+Fused subgraph:
+          x
+          |
+      transpose
+          |
+          y
+*/
+class DuplicatedTransposeFusePass : public FusePassBase {
+ protected:
+  void ApplyImpl(ir::Graph* graph) const override;
+
+ private:
+  void DuplicatedTranspose(ir::Graph* graph) const;
+
+  const std::string name_scope_{"duplicated_transpose_fuse_pass"};
+
+  mutable int delete_op_count{0};
+};
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc
index b76435c416f61082d1a7580bf042f3747c5d3fc1..052db0a9e1af6b1f3551c1f0aaa57902ce121727 100644
--- a/paddle/fluid/inference/api/paddle_pass_builder.cc
+++ b/paddle/fluid/inference/api/paddle_pass_builder.cc
@@ -527,6 +527,7 @@ XpuPassStrategy::XpuPassStrategy() : PassStrategy({}) {
       "fold_interp_outsize_fuse_pass",
       "fold_two_squeeze2_fuse_pass",
       "conv1d_xpu_fuse_pass",
+      "duplicated_transpose_fuse_pass",
       "redundant_unsqueeze_squeeze_elimination_pass",
       "reduce_ops_fuse_pass",
       "delete_cast_op_pass",
diff --git a/test/ir/inference/test_xpu_duplicated_transpose_fuse_pass.py b/test/ir/inference/test_xpu_duplicated_transpose_fuse_pass.py
new file mode 100644
index 0000000000000000000000000000000000000000..d25550898a65503985c7136723af39093348add7
--- /dev/null
+++ b/test/ir/inference/test_xpu_duplicated_transpose_fuse_pass.py
@@ -0,0 +1,87 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
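+
+# This test chains three transpose2 ops; the pass should fold them into a
+# single transpose2 (the permutations [0, 2, 1, 3], [0, 3, 2, 1] and
+# [0, 1, 3, 2] compose to [0, 3, 2, 1]), which sample_predictor_configs
+# checks via the expected op list ["transpose2"].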
+
+import unittest
+
+import hypothesis.strategies as st
+import numpy as np
+from auto_scan_test import PassAutoScanTest
+from program_config import OpConfig, ProgramConfig, TensorConfig
+
+
+class TestTransposeFusePass(PassAutoScanTest):
+    def sample_predictor_configs(self, program_config):
+        config = self.create_inference_config(use_xpu=True)
+        yield config, ["transpose2"], (1e-3, 1e-3)
+
+    def sample_program_config(self, draw):
+        batch_size = draw(st.integers(min_value=1, max_value=4))
+        C = draw(st.integers(min_value=1, max_value=64))
+        H = draw(st.integers(min_value=1, max_value=64))
+        W = draw(st.integers(min_value=1, max_value=64))
+        in_shape = [batch_size, C, H, W]
+        transpose_op1 = OpConfig(
+            type='transpose2',
+            inputs={
+                "X": ["transpose_in"],
+            },
+            outputs={"Out": ["transpose_out1"]},
+            attrs={"axis": [0, 2, 1, 3]},
+        )
+
+        transpose_op2 = OpConfig(
+            type='transpose2',
+            inputs={
+                "X": ["transpose_out1"],
+            },
+            outputs={"Out": ["transpose_out2"]},
+            attrs={"axis": [0, 3, 2, 1]},
+        )
+
+        transpose_op3 = OpConfig(
+            type='transpose2',
+            inputs={
+                "X": ["transpose_out2"],
+            },
+            outputs={"Out": ["transpose_out3"]},
+            attrs={"axis": [0, 1, 3, 2]},
+        )
+
+        ops = [
+            transpose_op1,
+            transpose_op2,
+            transpose_op3,
+        ]
+
+        program_config = ProgramConfig(
+            ops=ops,
+            weights={},
+            inputs={
+                "transpose_in": TensorConfig(shape=in_shape),
+            },
+            outputs=["transpose_out3"],
+        )
+        return program_config
+
+    def test(self):
+        self.run_and_statis(
+            quant=False,
+            max_examples=25,
+            passes=["duplicated_transpose_fuse_pass"],
+        )
+
+
+if __name__ == "__main__":
+    np.random.seed(200)
+    unittest.main()
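As a sanity check on the expected fusion in this test, the three chained permutations do collapse to a single transpose (a standalone sketch, independent of the patch):

    import numpy as np

    x = np.random.rand(2, 3, 4, 5)
    chained = (
        x.transpose([0, 2, 1, 3]).transpose([0, 3, 2, 1]).transpose([0, 1, 3, 2])
    )
    assert np.array_equal(chained, x.transpose([0, 3, 2, 1]))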