Unverified commit 664a2753 authored by zhupengyang, committed by GitHub

multi_encoder support adaptive seqlen (#53982)

Parent e135069d
......@@ -233,6 +233,8 @@ if(WITH_XPU)
pass_library(fc_xpu_fuse_pass inference DIR xpu DEPS ${XPU_PASS_DEPS})
pass_library(multi_encoder_xpu_fuse_pass inference DIR xpu DEPS
${XPU_PASS_DEPS})
pass_library(multi_encoder_xpu_adaptive_seqlen_fuse_pass inference DIR xpu
DEPS ${XPU_PASS_DEPS})
pass_library(multi_encoder_xpu_slice_fuse_pass inference DIR xpu DEPS
${XPU_PASS_DEPS})
pass_library(generate_sequence_xpu_fuse_pass inference DIR xpu DEPS
......@@ -529,4 +531,8 @@ if(WITH_XPU)
test_fused_multi_transformer_cachekv_layout_trans_pass
SRCS xpu/fused_multi_transformer_cachekv_layout_trans_pass_test.cc
DEPS fused_multi_transformer_cachekv_layout_trans_pass)
cc_test(
test_multi_encoder_xpu_adaptive_seqlen_fuse_pass
SRCS xpu/multi_encoder_xpu_adaptive_seqlen_fuse_pass_test.cc
DEPS multi_encoder_xpu_adaptive_seqlen_fuse_pass)
endif()
......@@ -61,6 +61,7 @@ static const std::vector<std::string> xpu_support_subgraph_passes = {
"generate_sequence_xpu_fuse_pass",
"embedding_with_eltwise_add_xpu_fuse_pass",
"multi_encoder_xpu_fuse_pass",
"multi_encoder_xpu_adaptive_seqlen_fuse_pass",
"multi_encoder_xpu_slice_fuse_pass",
"fused_multi_transformer_cachekv_layout_trans_pass",
"one_beam_size_fuse_pass",
......
......@@ -824,6 +824,17 @@ struct Layers {
return unary_op("logical_not", input);
}
VarDesc* not_equal(VarDesc* x, VarDesc* y, int axis = -1) {
VarDesc* out = lod_tensor(unique_name());
OpDesc* op = program_.MutableBlock(0)->AppendOp();
op->SetType("not_equal");
op->SetInput("X", {x->Name()});
op->SetInput("Y", {y->Name()});
op->SetAttr("axis", axis);
op->SetOutput("Out", {out->Name()});
return out;
}
VarDesc* stack(std::vector<VarDesc*> inputs, int axis = -1) {
VarDesc* out = lod_tensor(unique_name());
OpDesc* op = program_.MutableBlock(0)->AppendOp();
......@@ -838,6 +849,16 @@ struct Layers {
return out;
}
VarDesc* tile(VarDesc* x, const std::vector<int>& repeat_times = {2}) {
VarDesc* out = lod_tensor(unique_name());
OpDesc* op = program_.MutableBlock(0)->AppendOp();
op->SetType("tile");
op->SetInput("X", {x->Name()});
op->SetAttr("repeat_times", repeat_times);
op->SetOutput("Out", {out->Name()});
return out;
}
private:
VarDesc* lod_tensor(std::string name,
std::vector<int64_t> shape = {},
......
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/xpu/multi_encoder_xpu_adaptive_seqlen_fuse_pass.h"
#include <string>
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
#include "paddle/fluid/framework/ir/xpu/pass_utils.h"
#include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace framework {
namespace ir {
namespace patterns {
struct AdaptiveSeqlenPatternV1 : public PatternBase {
AdaptiveSeqlenPatternV1(PDPattern* pattern, const std::string& name_scope);
// declare operator node's name
PATTERN_DECL_NODE(embedding_xpu);
PATTERN_DECL_NODE(layer_norm);
PATTERN_DECL_NODE(matmul);
PATTERN_DECL_NODE(scale);
PATTERN_DECL_NODE(stack);
PATTERN_DECL_NODE(multi_encoder_xpu);
// declare variable node's name
PATTERN_DECL_NODE(mask);
PATTERN_DECL_NODE(embedding_xpu_out);
PATTERN_DECL_NODE(layer_norm_out);
PATTERN_DECL_NODE(matmul_out);
PATTERN_DECL_NODE(scale_out);
PATTERN_DECL_NODE(stack_out);
};
AdaptiveSeqlenPatternV1::AdaptiveSeqlenPatternV1(PDPattern* pattern,
const std::string& name_scope)
: PatternBase(pattern, name_scope, name_scope) {
auto* embedding_xpu = pattern->NewNode(embedding_xpu_repr())
->assert_is_op("embedding_with_eltwise_add_xpu");
auto* embedding_xpu_out =
pattern->NewNode(embedding_xpu_out_repr())
->assert_is_op_output("embedding_with_eltwise_add_xpu", "out")
->assert_is_op_input("layer_norm", "X");
auto* layer_norm =
pattern->NewNode(layer_norm_repr())->assert_is_op("layer_norm");
auto* layer_norm_out = pattern->NewNode(layer_norm_out_repr())
->assert_is_op_output("layer_norm", "Y")
->assert_is_op_input("multi_encoder_xpu", "x");
auto* mask = pattern->NewNode(mask_repr())
->assert_is_op_input("matmul", "X")
->assert_is_op_input("matmul", "Y");
auto* matmul = pattern->NewNode(matmul_repr())->assert_is_op("matmul");
auto* matmul_out = pattern->NewNode(matmul_out_repr())
->assert_is_op_output("matmul", "Out")
->assert_is_op_input("scale", "X");
auto* scale = pattern->NewNode(scale_repr())->assert_is_op("scale");
auto* scale_out = pattern->NewNode(scale_out_repr())
->assert_is_op_output("scale", "Out")
->assert_is_op_input("stack", "X");
auto* stack = pattern->NewNode(stack_repr())->assert_is_op("stack");
auto* stack_out = pattern->NewNode(stack_out_repr())
->assert_is_op_output("stack", "Y")
->assert_is_op_input("multi_encoder_xpu", "mask");
auto* multi_encoder_xpu = pattern->NewNode(multi_encoder_xpu_repr())
->assert_is_op("multi_encoder_xpu");
embedding_xpu->LinksTo({embedding_xpu_out});
layer_norm->LinksFrom({embedding_xpu_out}).LinksTo({layer_norm_out});
matmul->LinksFrom({mask}).LinksTo({matmul_out});
scale->LinksFrom({matmul_out}).LinksTo({scale_out});
stack->LinksFrom({scale_out}).LinksTo({stack_out});
multi_encoder_xpu->LinksFrom({layer_norm_out, stack_out});
}
} // namespace patterns
int MultiEncoderXPUAdaptiveSeqlenFusePass::ApplyAdaptiveSeqlenPassV1(
ir::Graph* graph) const {
GraphPatternDetector gpd;
patterns::AdaptiveSeqlenPatternV1 pattern(gpd.mutable_pattern(), name_scope_);
int found_subgraph_count = 0;
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
Graph* graph) {
VLOG(4) << "handle ApplyAdaptiveSeqlenPassV1 fuse";
GET_IR_NODE(embedding_xpu);
GET_IR_NODE(layer_norm);
GET_IR_NODE(matmul);
GET_IR_NODE(scale);
GET_IR_NODE(stack);
GET_IR_NODE(multi_encoder_xpu);
GET_IR_NODE(mask);
GET_IR_NODE(embedding_xpu_out);
GET_IR_NODE(layer_norm_out);
GET_IR_NODE(matmul_out);
GET_IR_NODE(scale_out);
GET_IR_NODE(stack_out);
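    // Rewire the graph: feed the raw mask into embedding_with_eltwise_add_xpu,
    // which now emits seq_lod / max_seq_len, and let multi_encoder_xpu consume
    // those instead of the dense attention mask built by matmul+scale+stack.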
std::string mask_name = mask->Name();
std::string seq_lod_name = mask_name + "_seq_lod";
VarDesc seq_lod_desc(seq_lod_name);
auto* seq_lod = graph->CreateVarNode(&seq_lod_desc);
std::string max_seq_len_name = mask_name + "_max_seq_len";
VarDesc max_seq_len_desc(max_seq_len_name);
auto* max_seq_len = graph->CreateVarNode(&max_seq_len_desc);
embedding_xpu->Op()->SetInput("mask", {mask_name});
embedding_xpu->Op()->SetOutput("seq_lod", {seq_lod_name});
embedding_xpu->Op()->SetOutput("max_seq_len", {max_seq_len_name});
multi_encoder_xpu->Op()->SetInput("seq_lod", {seq_lod_name});
multi_encoder_xpu->Op()->SetInput("max_seq_len", {max_seq_len_name});
multi_encoder_xpu->Op()->RemoveInput("mask");
IR_NODE_LINK_TO(mask, embedding_xpu);
IR_NODE_LINK_TO(embedding_xpu, seq_lod);
IR_NODE_LINK_TO(embedding_xpu, max_seq_len);
IR_NODE_LINK_TO(seq_lod, multi_encoder_xpu);
IR_NODE_LINK_TO(max_seq_len, multi_encoder_xpu);
// delete useless node
std::unordered_set<const Node*> delete_nodes{
matmul, scale, stack, matmul_out, scale_out, stack_out};
GraphSafeRemoveNodes(graph, delete_nodes);
found_subgraph_count++;
};
gpd(graph, handler);
return found_subgraph_count;
}
namespace patterns {
struct AdaptiveSeqlenPatternV2 : public PatternBase {
AdaptiveSeqlenPatternV2(PDPattern* pattern, const std::string& name_scope);
// declare operator node's name
PATTERN_DECL_NODE(embedding_xpu);
PATTERN_DECL_NODE(layer_norm);
PATTERN_DECL_NODE(not_equal);
PATTERN_DECL_NODE(cast);
PATTERN_DECL_NODE(unsqueeze_0);
PATTERN_DECL_NODE(matmul);
PATTERN_DECL_NODE(scale_0);
PATTERN_DECL_NODE(scale_1);
PATTERN_DECL_NODE(unsqueeze_1);
PATTERN_DECL_NODE(tile);
PATTERN_DECL_NODE(multi_encoder_xpu);
// declare variable node's name
PATTERN_DECL_NODE(mask);
PATTERN_DECL_NODE(embedding_xpu_out);
PATTERN_DECL_NODE(layer_norm_out);
PATTERN_DECL_NODE(not_equal_out);
PATTERN_DECL_NODE(cast_out);
PATTERN_DECL_NODE(unsqueeze_0_out);
PATTERN_DECL_NODE(matmul_out);
PATTERN_DECL_NODE(scale_0_out);
PATTERN_DECL_NODE(scale_1_out);
PATTERN_DECL_NODE(unsqueeze_1_out);
PATTERN_DECL_NODE(tile_out);
};
AdaptiveSeqlenPatternV2::AdaptiveSeqlenPatternV2(PDPattern* pattern,
const std::string& name_scope)
: PatternBase(pattern, name_scope, name_scope) {
auto* embedding_xpu = pattern->NewNode(embedding_xpu_repr())
->assert_is_op("embedding_with_eltwise_add_xpu");
auto* embedding_xpu_out =
pattern->NewNode(embedding_xpu_out_repr())
->assert_is_op_output("embedding_with_eltwise_add_xpu", "out")
->assert_is_op_input("layer_norm", "X");
auto* layer_norm =
pattern->NewNode(layer_norm_repr())->assert_is_op("layer_norm");
auto* layer_norm_out = pattern->NewNode(layer_norm_out_repr())
->assert_is_op_output("layer_norm", "Y")
->assert_is_op_input("multi_encoder_xpu", "x");
auto* mask =
pattern->NewNode(mask_repr())->assert_is_op_input("not_equal", "X");
auto* not_equal =
pattern->NewNode(not_equal_repr())->assert_is_op("not_equal");
auto* not_equal_out = pattern->NewNode(not_equal_out_repr())
->assert_is_op_output("not_equal", "Out")
->assert_is_op_input("cast", "X");
auto* cast = pattern->NewNode(cast_repr())->assert_is_op("cast");
auto* cast_out = pattern->NewNode(cast_out_repr())
->assert_is_op_output("cast", "Out")
->assert_is_op_input("unsqueeze2", "X");
auto* unsqueeze_0 =
pattern->NewNode(unsqueeze_0_repr())->assert_is_op("unsqueeze2");
auto* unsqueeze_0_out = pattern->NewNode(unsqueeze_0_out_repr())
->assert_is_op_output("unsqueeze2", "Out")
->assert_is_op_input("matmul_v2", "X")
->assert_is_op_input("matmul_v2", "Y");
auto* matmul = pattern->NewNode(matmul_repr())->assert_is_op("matmul_v2");
auto* matmul_out = pattern->NewNode(matmul_out_repr())
->assert_is_op_output("matmul_v2", "Out")
->assert_is_op_input("scale", "X");
auto* scale_0 = pattern->NewNode(scale_0_repr())->assert_is_op("scale");
auto* scale_0_out = pattern->NewNode(scale_0_out_repr())
->assert_is_op_output("scale", "Out")
->assert_is_op_input("scale", "X");
auto* scale_1 = pattern->NewNode(scale_1_repr())->assert_is_op("scale");
auto* scale_1_out = pattern->NewNode(scale_1_out_repr())
->assert_is_op_output("scale", "Out")
->assert_is_op_input("unsqueeze2", "X");
auto* unsqueeze_1 =
pattern->NewNode(unsqueeze_1_repr())->assert_is_op("unsqueeze2");
auto* unsqueeze_1_out = pattern->NewNode(unsqueeze_1_out_repr())
->assert_is_op_output("unsqueeze2", "Out")
->assert_is_op_input("tile", "X");
auto* tile = pattern->NewNode(tile_repr())->assert_is_op("tile");
auto* tile_out = pattern->NewNode(tile_out_repr())
->assert_is_op_output("tile", "Out")
->assert_is_op_input("multi_encoder_xpu", "mask");
auto* multi_encoder_xpu = pattern->NewNode(multi_encoder_xpu_repr())
->assert_is_op("multi_encoder_xpu");
embedding_xpu->LinksTo({embedding_xpu_out});
layer_norm->LinksFrom({embedding_xpu_out}).LinksTo({layer_norm_out});
not_equal->LinksFrom({mask}).LinksTo({not_equal_out});
cast->LinksFrom({not_equal_out}).LinksTo({cast_out});
unsqueeze_0->LinksFrom({cast_out}).LinksTo({unsqueeze_0_out});
matmul->LinksFrom({unsqueeze_0_out}).LinksTo({matmul_out});
scale_0->LinksFrom({matmul_out}).LinksTo({scale_0_out});
scale_1->LinksFrom({scale_0_out}).LinksTo({scale_1_out});
unsqueeze_1->LinksFrom({scale_1_out}).LinksTo({unsqueeze_1_out});
tile->LinksFrom({unsqueeze_1_out}).LinksTo({tile_out});
multi_encoder_xpu->LinksFrom({layer_norm_out, tile_out});
}
} // namespace patterns
int MultiEncoderXPUAdaptiveSeqlenFusePass::ApplyAdaptiveSeqlenPassV2(
ir::Graph* graph) const {
GraphPatternDetector gpd;
patterns::AdaptiveSeqlenPatternV2 pattern(gpd.mutable_pattern(), name_scope_);
int found_subgraph_count = 0;
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
Graph* graph) {
VLOG(4) << "handle ApplyAdaptiveSeqlenPassV2 fuse";
GET_IR_NODE(embedding_xpu);
GET_IR_NODE(layer_norm);
GET_IR_NODE(not_equal);
GET_IR_NODE(cast);
GET_IR_NODE(unsqueeze_0);
GET_IR_NODE(matmul);
GET_IR_NODE(scale_0);
GET_IR_NODE(scale_1);
GET_IR_NODE(unsqueeze_1);
GET_IR_NODE(tile);
GET_IR_NODE(multi_encoder_xpu);
GET_IR_NODE(mask);
GET_IR_NODE(embedding_xpu_out);
GET_IR_NODE(layer_norm_out);
GET_IR_NODE(not_equal_out);
GET_IR_NODE(cast_out);
GET_IR_NODE(unsqueeze_0_out);
GET_IR_NODE(matmul_out);
GET_IR_NODE(scale_0_out);
GET_IR_NODE(scale_1_out);
GET_IR_NODE(unsqueeze_1_out);
GET_IR_NODE(tile_out);
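    // Same rewiring as V1; only the matched mask-preprocessing subgraph
    // (not_equal/cast/unsqueeze2/matmul_v2/scale/scale/unsqueeze2/tile) differs.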
std::string mask_name = mask->Name();
std::string seq_lod_name = mask_name + "_seq_lod";
VarDesc seq_lod_desc(seq_lod_name);
auto* seq_lod = graph->CreateVarNode(&seq_lod_desc);
std::string max_seq_len_name = mask_name + "_max_seq_len";
VarDesc max_seq_len_desc(max_seq_len_name);
auto* max_seq_len = graph->CreateVarNode(&max_seq_len_desc);
embedding_xpu->Op()->SetInput("mask", {mask_name});
embedding_xpu->Op()->SetOutput("seq_lod", {seq_lod_name});
embedding_xpu->Op()->SetOutput("max_seq_len", {max_seq_len_name});
multi_encoder_xpu->Op()->SetInput("seq_lod", {seq_lod_name});
multi_encoder_xpu->Op()->SetInput("max_seq_len", {max_seq_len_name});
multi_encoder_xpu->Op()->RemoveInput("mask");
IR_NODE_LINK_TO(mask, embedding_xpu);
IR_NODE_LINK_TO(embedding_xpu, seq_lod);
IR_NODE_LINK_TO(embedding_xpu, max_seq_len);
IR_NODE_LINK_TO(seq_lod, multi_encoder_xpu);
IR_NODE_LINK_TO(max_seq_len, multi_encoder_xpu);
// delete useless node
std::unordered_set<const Node*> delete_nodes{not_equal,
cast,
unsqueeze_0,
matmul,
scale_0,
scale_1,
unsqueeze_1,
tile,
not_equal_out,
cast_out,
unsqueeze_0_out,
matmul_out,
scale_0_out,
scale_1_out,
unsqueeze_1_out,
tile_out};
GraphSafeRemoveNodes(graph, delete_nodes);
found_subgraph_count++;
};
gpd(graph, handler);
return found_subgraph_count;
}
void MultiEncoderXPUAdaptiveSeqlenFusePass::ApplyImpl(ir::Graph* graph) const {
PADDLE_ENFORCE_NOT_NULL(
graph, platform::errors::PreconditionNotMet("graph should not be null."));
Init(name_scope_, graph);
int found_subgraph_count = ApplyAdaptiveSeqlenPassV1(graph);
found_subgraph_count += ApplyAdaptiveSeqlenPassV2(graph);
AddStatis(found_subgraph_count);
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(multi_encoder_xpu_adaptive_seqlen_fuse_pass,
paddle::framework::ir::MultiEncoderXPUAdaptiveSeqlenFusePass);
REGISTER_PASS_CAPABILITY(multi_encoder_xpu_adaptive_seqlen_fuse_pass)
.AddCombination(
paddle::framework::compatible::OpVersionComparatorCombination().EQ(
"multi_encoder_xpu", 0));
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/pass.h"
namespace phi {
class DenseTensor;
} // namespace phi
namespace paddle {
namespace framework {
class Scope;
} // namespace framework
} // namespace paddle
namespace paddle {
namespace framework {
namespace ir {
// Support adaptive sequence length (seqlen) for BERT/ERNIE-like models.
class MultiEncoderXPUAdaptiveSeqlenFusePass : public FusePassBase {
protected:
void ApplyImpl(ir::Graph* graph) const override;
private:
/*
adaptive seqlen V1, before:
      input_var*  mask_var*
| |
| |
embedding_xpu matmul
| |
| |
layer_norm scale
| |
| |
| stack
\ /
\ /
multi_encoder_xpu
|
|
out_var*
after:
      input_var*  mask_var*
\ /
\ /
embedding_xpu
/ \
/ \
embedding_out_var* seq_lod_var*
| |
| |
layer_norm |
\ /
\ /
multi_encoder_xpu
|
|
out_var*
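      (embedding_xpu also emits max_seq_len_var*, which is passed to
       multi_encoder_xpu together with seq_lod_var*; it is omitted from the
       diagram above.)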
*/
int ApplyAdaptiveSeqlenPassV1(ir::Graph* graph) const;
/*
adaptive seqlen V2, before:
      input_var*  mask_var*
| |
| |
embedding_xpu not_equal
| |
| |
layer_norm cast
| |
| |
| unsqueeze2
| |
| |
| matmul_v2
| |
| |
| scale
| |
| |
| scale
| |
| |
| unsqueeze2
| |
| |
| tile
\ /
\ /
multi_encoder_xpu
|
|
out_var*
after:
      input_var*  mask_var*
\ /
\ /
embedding_xpu
/ \
/ \
embedding_out_var* seq_lod_var*
| |
| |
layer_norm |
\ /
\ /
multi_encoder_xpu
|
|
out_var*
*/
int ApplyAdaptiveSeqlenPassV2(ir::Graph* graph) const;
private:
const std::string name_scope_{"multi_encoder_xpu_adaptive_seqlen_fuse_pass"};
};
} // namespace ir
} // namespace framework
} // namespace paddle
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gtest/gtest.h>
#include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/framework/ir/pass_tester_helper.h"
namespace paddle {
namespace framework {
namespace ir {
TEST(MultiEncoderXPUAdaptiveSeqlenFusePass, V1) {
Layers layers;
auto* block = layers.Block();
auto* embedding_xpu_out = layers.data("embedding_xpu_out");
OpDesc* embedding_xpu = block->AppendOp();
embedding_xpu->SetType("embedding_with_eltwise_add_xpu");
embedding_xpu->SetOutput("out", {embedding_xpu_out->Name()});
auto* layer_norm_out = layers.layer_norm(embedding_xpu_out)[0];
auto* mask = layers.data("mask");
auto* matmul_out = layers.matmul(mask, mask);
auto* scale_out = layers.scale(matmul_out);
auto* stack_out = layers.stack({scale_out, scale_out});
OpDesc* multi_encoder_xpu = block->AppendOp();
multi_encoder_xpu->SetType("multi_encoder_xpu");
multi_encoder_xpu->SetInput("x", {layer_norm_out->Name()});
multi_encoder_xpu->SetInput("mask", {stack_out->Name()});
std::unique_ptr<ir::Graph> graph(new ir::Graph(layers.main_program()));
auto pass = PassRegistry::Instance().Get(
"multi_encoder_xpu_adaptive_seqlen_fuse_pass");
pass->Apply(graph.get());
auto num = GetNumOpNodes(graph, "matmul") + GetNumOpNodes(graph, "scale") +
GetNumOpNodes(graph, "stack");
PADDLE_ENFORCE_EQ(
num,
0,
platform::errors::PreconditionNotMet(
"matmul/scale/stack ops should be removed from graph, but graph "
"still has %d ops.",
num));
}
TEST(MultiEncoderXPUAdaptiveSeqlenFusePass, V2) {
Layers layers;
auto* block = layers.Block();
auto* embedding_xpu_out = layers.data("embedding_xpu_out");
OpDesc* embedding_xpu = block->AppendOp();
embedding_xpu->SetType("embedding_with_eltwise_add_xpu");
embedding_xpu->SetOutput("out", {embedding_xpu_out->Name()});
auto* layer_norm_out = layers.layer_norm(embedding_xpu_out)[0];
auto* mask = layers.data("mask");
auto* not_equal_y = layers.data("not_equal_y");
auto* not_equal_out = layers.not_equal(mask, not_equal_y);
auto* cast_out = layers.cast(not_equal_out);
auto* unsqueeze_0_out = layers.unsqueeze2(cast_out);
auto* matmul_out = layers.matmul_v2(unsqueeze_0_out, unsqueeze_0_out);
auto* scale_0_out = layers.scale(matmul_out);
auto* scale_1_out = layers.scale(scale_0_out);
auto* unsqueeze_1_out = layers.unsqueeze2(scale_1_out);
auto* tile_out = layers.tile(unsqueeze_1_out);
OpDesc* multi_encoder_xpu = block->AppendOp();
multi_encoder_xpu->SetType("multi_encoder_xpu");
multi_encoder_xpu->SetInput("x", {layer_norm_out->Name()});
multi_encoder_xpu->SetInput("mask", {tile_out->Name()});
std::unique_ptr<ir::Graph> graph(new ir::Graph(layers.main_program()));
auto pass = PassRegistry::Instance().Get(
"multi_encoder_xpu_adaptive_seqlen_fuse_pass");
pass->Apply(graph.get());
auto num = GetNumOpNodes(graph, "not_equal") + GetNumOpNodes(graph, "cast") +
GetNumOpNodes(graph, "unsqueeze2") +
GetNumOpNodes(graph, "matmul_v2") + GetNumOpNodes(graph, "scale") +
GetNumOpNodes(graph, "tile");
PADDLE_ENFORCE_EQ(num,
0,
platform::errors::PreconditionNotMet(
"not_equal/cast/unsqueeze2/matmul_v2/scale ops should "
"be removed from graph, but graph "
"still has %d ops.",
num));
}
} // namespace ir
} // namespace framework
} // namespace paddle
USE_PASS(multi_encoder_xpu_adaptive_seqlen_fuse_pass);
......@@ -1205,7 +1205,10 @@ std::vector<PatternParam> MultiEncoderXPUFusePass::GeneratePatternParams()
const {
return std::vector<PatternParam>{
// Params are arranged in alphabetic order
{"gelu", "matmul_v2", "matmul", "matmul_v2", false, false, true}};
{"gelu", "matmul_v2", "matmul", "matmul_v2", false, false, true},
{"gelu", "matmul_v2", "matmul_v2", "matmul_v2", false, true, true},
{"gelu", "mul", "matmul", "matmul", false, true, true},
};
}
} // namespace ir
......
......@@ -51,6 +51,7 @@ Origin subgraph:
| | | |
| v_transpose q_transpose k_transpose
| | | |
| | | (scale)
| | \ /
| | qk_matmul
| | |
......
......@@ -74,15 +74,12 @@ MultiEncoderXPUSlicePattern::MultiEncoderXPUSlicePattern(
->assert_more([](Node* node) {
std::vector<int> axes =
PADDLE_GET_CONST(std::vector<int>, node->Op()->GetAttr("axes"));
std::vector<int> decrease_axis = PADDLE_GET_CONST(
std::vector<int>, node->Op()->GetAttr("decrease_axis"));
std::vector<int> starts = PADDLE_GET_CONST(
std::vector<int>, node->Op()->GetAttr("starts"));
std::vector<int> ends =
PADDLE_GET_CONST(std::vector<int>, node->Op()->GetAttr("ends"));
return axes.size() == 1 && axes[0] == 1 &&
decrease_axis.size() == 1 && decrease_axis[0] == 1 &&
starts.size() == 1 && starts[0] == 0 && //
return axes.size() == 1 && axes[0] == 1 && starts.size() == 1 &&
starts[0] == 0 && //
ends.size() == 1 && ends[0] == 1;
});
auto* slice_out = pattern->NewNode(slice_out_repr())
......
......@@ -517,6 +517,7 @@ XpuPassStrategy::XpuPassStrategy() : PassStrategy({}) {
"generate_sequence_xpu_fuse_pass",
"embedding_with_eltwise_add_xpu_fuse_pass",
"multi_encoder_xpu_fuse_pass",
"multi_encoder_xpu_adaptive_seqlen_fuse_pass",
"multi_encoder_xpu_slice_fuse_pass",
"fused_multi_transformer_cachekv_layout_trans_pass",
"one_beam_size_fuse_pass",
......
......@@ -15,13 +15,14 @@
optional : bias, branch, branch_max, x_max
- op : embedding_with_eltwise_add_xpu
args : (Tensor[] ids, Tensor[] tables, int64_t padding_idx)
output: Tensor
args : (Tensor[] ids, Tensor[] tables, Tensor mask, int64_t padding_idx)
output: Tensor(out), Tensor(seq_lod), Tensor(max_seq_len)
infer_meta :
func: EmbeddingWithEltwiseAddXPUInferMeta
kernel:
func: embedding_with_eltwise_add_xpu
data_type: tables
optional : mask, seq_lod, max_seq_len
- op : fc_xpu
args : (Tensor x, Tensor x_max, Tensor w, Tensor w_max, Tensor bias, int in_num_col_dims, bool transpose_x, float alpha, float beta, int act_type, float act_alpha)
......@@ -77,11 +78,11 @@
data_type : dtype
- op : multi_encoder_xpu
args : (Tensor x, Tensor[] fc_weight, Tensor[] fc_weight_max, Tensor[] fc_bias, Tensor[] ln_scale, Tensor[] ln_bias, Tensor mask, int layer_num, bool norm_before, int hidden_dim, int head_num, int size_per_head, int ffn_hidden_dim_scale, int act_type, int relative_type, int slice_idx)
args : (Tensor x, Tensor[] fc_weight, Tensor[] fc_weight_max, Tensor[] fc_bias, Tensor[] ln_scale, Tensor[] ln_bias, Tensor mask, Tensor seq_lod, Tensor max_seq_len, int layer_num, bool norm_before, int hidden_dim, int head_num, int size_per_head, int ffn_hidden_dim_scale, int act_type, int relative_type, int slice_idx)
output : Tensor(out), Tensor(x_fp16), Tensor(out_fp16)
infer_meta :
func : MultiEncoderXPUInferMeta
kernel :
func : multi_encoder_xpu
data_type : x
optional : mask, x_fp16, out_fp16
optional : mask, seq_lod, max_seq_len, x_fp16, out_fp16
......@@ -165,7 +165,10 @@ void Conv2dXPUInferMeta(const MetaTensor& x,
void EmbeddingWithEltwiseAddXPUInferMeta(
const std::vector<const MetaTensor*>& ids,
const std::vector<const MetaTensor*>& tables,
MetaTensor* out) {
const MetaTensor& mask,
MetaTensor* out,
MetaTensor* seq_lod,
MetaTensor* max_seq_len) {
PADDLE_ENFORCE_GT(ids.size(),
0UL,
phi::errors::InvalidArgument(
......@@ -226,6 +229,8 @@ void MultiEncoderXPUInferMeta(
const std::vector<const MetaTensor*>& ln_scale,
const std::vector<const MetaTensor*>& ln_bias,
const MetaTensor& mask,
const MetaTensor& seq_lod,
const MetaTensor& max_seq_len,
int layer_num,
bool norm_before,
int hidden_dim,
......
......@@ -44,7 +44,10 @@ void Conv2dXPUInferMeta(const MetaTensor& x,
void EmbeddingWithEltwiseAddXPUInferMeta(
const std::vector<const MetaTensor*>& ids,
const std::vector<const MetaTensor*>& tables,
MetaTensor* out);
const MetaTensor& mask,
MetaTensor* out,
MetaTensor* seq_lod,
MetaTensor* max_seq_len);
void FcXPUInferMeta(const MetaTensor& x,
const MetaTensor& x_max,
......@@ -72,6 +75,8 @@ void MultiEncoderXPUInferMeta(
const std::vector<const MetaTensor*>& ln_scale,
const std::vector<const MetaTensor*>& ln_bias,
const MetaTensor& mask,
const MetaTensor& seq_lod,
const MetaTensor& max_seq_len,
int layer_num,
bool norm_before,
int hidden_dim,
......
......@@ -24,14 +24,14 @@ void EmbeddingWithEltwiseAddXpuKernel(
const Context& ctx,
const std::vector<const DenseTensor*>& ids,
const std::vector<const DenseTensor*>& tables,
const paddle::optional<DenseTensor>& mask,
int64_t padding_idx,
DenseTensor* out) {
DenseTensor* out,
DenseTensor* seq_lod,
DenseTensor* max_seq_len) {
using XPUType = typename XPUTypeTrait<T>::Type;
auto& id_dims = ids[0]->dims();
int idx_len = id_dims[0] * id_dims[1];
int emb_layer_num = ids.size();
int embed_dim = tables[0]->dims()[1];
std::vector<int> table_lens_cpu;
int emb_dim = tables[0]->dims()[1];
std::vector<int> table_lens;
std::vector<const float*> arg_tables;
for (auto* table : tables) {
auto& table_dims = table->dims();
......@@ -39,16 +39,16 @@ void EmbeddingWithEltwiseAddXpuKernel(
table_dims.size(),
2,
errors::InvalidArgument(
"The table_dims size [%d] should be equal 2.",
table_dims.size())); /* shape like [table_len, embed_dim] */
"The table_dims size [%d] should be equal to 2.",
table_dims.size())); /* shape like [table_len, emb_dim] */
PADDLE_ENFORCE_EQ(
table_dims[1],
embed_dim,
emb_dim,
errors::InvalidArgument(
"Every embed_dim [%d] should be equal the first one [%d].",
"Every emb_dim [%d] should be equal to the first one [%d].",
table_dims[1],
embed_dim));
table_lens_cpu.push_back(table_dims[0]);
emb_dim));
table_lens.push_back(table_dims[0]);
if (std::is_same<T, phi::dtype::float16>::value) {
DenseTensor table_data_fp32_t;
ctx.template Alloc<float>(&table_data_fp32_t,
......@@ -64,26 +64,99 @@ void EmbeddingWithEltwiseAddXpuKernel(
arg_tables.push_back(table->data<float>());
}
}
std::vector<std::vector<int>> int_idx(emb_layer_num,
std::vector<int>(idx_len, 0));
std::vector<xpu::VectorParam<int>> arg_ids;
int emb_layer_num = ids.size();
for (int i = 0; i < emb_layer_num; i++) {
PADDLE_ENFORCE_EQ(
ids[i]->dtype() == phi::DataType::INT64 ||
ids[i]->dtype() == phi::DataType::INT32,
true,
auto id_dtype = ids[i]->dtype();
PADDLE_ENFORCE(
id_dtype == phi::DataType::INT64 || id_dtype == phi::DataType::INT32,
errors::InvalidArgument(
"The data type of ids should be int64 or int32, but got %s.",
ids[i]->dtype()));
for (int j = 0; j < idx_len; j++) {
if (ids[i]->dtype() == phi::DataType::INT64) {
int_idx[i][j] = static_cast<int>(ids[i]->data<int64_t>()[j]);
} else if (ids[i]->dtype() == phi::DataType::INT32) {
int_idx[i][j] = ids[i]->data<int>()[j];
DataTypeToString(id_dtype)));
}
auto& id_dims = ids[0]->dims();
int batch_size = id_dims[0];
int max_seq_len_value = id_dims[1];
int ids_len = id_dims[0] * id_dims[1];
std::vector<std::vector<int>> int_ids(emb_layer_num,
std::vector<int>(ids_len, 0));
std::vector<xpu::VectorParam<int>> arg_ids;
auto* mask_tensor = mask.get_ptr();
if (mask_tensor != nullptr) {
auto mask_dtype = mask_tensor->dtype();
PADDLE_ENFORCE(
mask_dtype == phi::DataType::INT64 ||
mask_dtype == phi::DataType::FLOAT32,
errors::InvalidArgument(
"The data type of mask should be int64 or float32, but got %s.",
DataTypeToString(mask_dtype)));
max_seq_len->Resize({1});
ctx.template HostAlloc<int>(max_seq_len)[0] = max_seq_len_value;
seq_lod->Resize({batch_size + 1});
int* seq_lod_data = ctx.template HostAlloc<int>(seq_lod);
seq_lod_data[0] = 0;
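    // seq_lod is the prefix sum of each sample's valid (right-padded) length
    // taken from the mask, e.g. mask [[1,1,1,0],[1,1,0,0]] with batch_size=2,
    // max_seq_len=4 yields seq_lod = [0, 3, 5].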
for (int batch_idx = 0; batch_idx < batch_size; batch_idx++) {
int cur_batch_seq_len = 0;
for (int seq_idx = 0; seq_idx < max_seq_len_value; seq_idx++) {
int mask_idx = batch_idx * max_seq_len_value + seq_idx;
if ((mask_dtype == phi::DataType::INT64 &&
mask->data<int64_t>()[mask_idx] > 0) ||
(mask_dtype == phi::DataType::FLOAT32 &&
fabs(mask->data<float>()[mask_idx]) > 1e-5)) {
cur_batch_seq_len++;
} else {
break;
}
}
PADDLE_ENFORCE_GT(
cur_batch_seq_len,
0,
errors::PreconditionNotMet(
"cur_batch_seq_len should be greater than 0, but got %d.",
cur_batch_seq_len));
seq_lod_data[batch_idx + 1] = seq_lod_data[batch_idx] + cur_batch_seq_len;
}
out->Resize({batch_size, seq_lod_data[batch_size], emb_dim});
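    // Compact the padded ids: for each sample copy only its first
    // cur_batch_seq_len tokens so that tokens of all samples are stored
    // contiguously, following the offsets in seq_lod.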
for (int i = 0; i < emb_layer_num; i++) {
if (ids[i]->dtype() == DataType::INT64) {
auto* ids_data = ids[i]->data<int64_t>();
for (int batch_idx = 0; batch_idx < batch_size; batch_idx++) {
for (int j = 0;
j < seq_lod_data[batch_idx + 1] - seq_lod_data[batch_idx];
j++) {
int_ids[i][seq_lod_data[batch_idx] + j] =
ids_data[batch_idx * max_seq_len_value + j];
}
}
} else {
auto* ids_data = ids[i]->data<int>();
for (int batch_idx = 0; batch_idx < batch_size; batch_idx++) {
for (int j = 0;
j < seq_lod_data[batch_idx + 1] - seq_lod_data[batch_idx];
j++) {
int_ids[i][seq_lod_data[batch_idx] + j] =
ids_data[batch_idx * max_seq_len_value + j];
}
}
}
arg_ids.push_back(
xpu::VectorParam<int>{int_ids[i].data(), ids_len, nullptr});
}
} else {
for (int i = 0; i < emb_layer_num; i++) {
for (int j = 0; j < ids_len; j++) {
if (ids[i]->dtype() == phi::DataType::INT64) {
int_ids[i][j] = static_cast<int>(ids[i]->data<int64_t>()[j]);
} else if (ids[i]->dtype() == phi::DataType::INT32) {
int_ids[i][j] = ids[i]->data<int>()[j];
}
}
arg_ids.push_back(
xpu::VectorParam<int>{int_ids[i].data(), ids_len, nullptr});
}
arg_ids.push_back(
xpu::VectorParam<int>{int_idx[i].data(), idx_len, nullptr});
}
ctx.template Alloc<T>(out);
......@@ -95,10 +168,10 @@ void EmbeddingWithEltwiseAddXpuKernel(
arg_tables, /* tables */
out_fp32_t.data<float>(),
arg_ids,
table_lens_cpu,
embed_dim,
std::vector<float>(table_lens_cpu.size(), 1.0f),
std::vector<int>(table_lens_cpu.size(), padding_idx));
table_lens,
emb_dim,
std::vector<float>(table_lens.size(), 1.0f),
std::vector<int>(table_lens.size(), padding_idx));
PADDLE_ENFORCE_XDNN_SUCCESS(r, "embedding_with_eltwise_add_xpu");
r = xpu::cast(ctx.x_context(),
......@@ -112,10 +185,10 @@ void EmbeddingWithEltwiseAddXpuKernel(
arg_tables, /* tables */
out->data<float>(),
arg_ids,
table_lens_cpu,
embed_dim,
std::vector<float>(table_lens_cpu.size(), 1.0f),
std::vector<int>(table_lens_cpu.size(), padding_idx));
table_lens,
emb_dim,
std::vector<float>(table_lens.size(), 1.0f),
std::vector<int>(table_lens.size(), padding_idx));
PADDLE_ENFORCE_XDNN_SUCCESS(r, "embedding_with_eltwise_add_xpu");
}
}
......@@ -130,4 +203,7 @@ PD_REGISTER_KERNEL(embedding_with_eltwise_add_xpu,
float,
phi::dtype::float16) {
kernel->InputAt(0).SetBackend(phi::Backend::CPU);
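  // mask (input 2) is read on the host to build seq_lod, and the
  // seq_lod / max_seq_len outputs (1, 2) are host tensors as well.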
kernel->InputAt(2).SetBackend(phi::Backend::CPU);
kernel->OutputAt(1).SetBackend(phi::Backend::CPU);
kernel->OutputAt(2).SetBackend(phi::Backend::CPU);
}
......@@ -29,6 +29,8 @@ void MultiEncoderXPUKernel(const Context& ctx,
const std::vector<const DenseTensor*>& ln_scale,
const std::vector<const DenseTensor*>& ln_bias,
const paddle::optional<DenseTensor>& mask,
const paddle::optional<DenseTensor>& seq_lod,
const paddle::optional<DenseTensor>& max_seq_len,
int layer_num,
bool norm_before,
int hidden_dim,
......@@ -89,19 +91,52 @@ void MultiEncoderXPUKernel(const Context& ctx,
}
const float* mask_data =
mask.get_ptr() == nullptr ? nullptr : mask.get_ptr()->data<float>();
const int* seq_lod_data =
seq_lod.get_ptr() == nullptr ? nullptr : seq_lod.get_ptr()->data<int>();
const int* max_seq_len_data = max_seq_len.get_ptr() == nullptr
? nullptr
: max_seq_len.get_ptr()->data<int>();
xpu::Activation_t qkv_act(static_cast<xpu::Activation_t::act_enum>(act_type));
int batch = x.dims()[0];
int max_seqlen = x.dims()[1];
// matmul_size * layer_num
std::vector<xpu::QuantType> quant_types(8 * layer_num,
xpu::QuantType::NOT_QUANT);
if (mask_data) {
if (seq_lod_data) {
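    // Adaptive-seqlen (VSL) path: pass the CPU-side seq_lod to the XPU encoder
    // as query_lod so it can work on the compacted, variable-length sequences
    // instead of a padded batch plus mask.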
xpu::VectorParam<int> query_lod = {
seq_lod_data, seq_lod.get_ptr()->numel(), nullptr};
int max_seq_len_value = slice_idx == -1 ? max_seq_len_data[0] : -1;
xpu::QKVAttnParam qkv_attn_param(query_lod,
head_num,
size_per_head,
qkv_act,
slice_idx,
true,
max_seq_len_value,
hidden_dim,
norm_before,
false);
qkv_attn_param.quant_type_.assign(quant_types.begin(), quant_types.end());
qkv_attn_param.scale_of_hidden_units = ffn_hidden_dim_scale;
int r = xpu::transformer_encoder<XPUTypeFP16, int16_t, int16_t>(
ctx.x_context(),
x_fp16_data,
fc_weight_data,
out_fp16_data,
fc_input_max_data,
fc_weight_max_data,
fc_bias_data,
ln_scale_data,
ln_bias_data,
qkv_attn_param);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "multi_encoder_xpu");
} else if (mask_data) {
auto mask_dims = mask.get_ptr()->dims();
std::vector<int> mask_shape(mask_dims.Get(),
mask_dims.Get() + mask_dims.size());
int max_seq_len_value = x.dims()[1];
xpu::QKVAttnParam qkv_attn_param(batch,
max_seqlen,
max_seq_len_value,
head_num,
size_per_head,
mask_shape,
......@@ -128,9 +163,10 @@ void MultiEncoderXPUKernel(const Context& ctx,
PADDLE_ENFORCE_XDNN_SUCCESS(r, "multi_encoder_xpu");
} else {
// When no mask input, like VIT, create LOD to act as vsl.
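    // e.g. batch = 2, max_seq_len = 4 -> lod = {0, 4, 8}, i.e. every position
    // is treated as valid.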
int max_seq_len_value = x.dims()[1];
std::vector<int> lod;
for (int i = 0; i < batch + 1; i++) {
lod.push_back(i * max_seqlen);
lod.push_back(i * max_seq_len_value);
}
xpu::VectorParam<int> query_lod = {
lod.data(), static_cast<int>(lod.size()), nullptr};
......@@ -180,4 +216,7 @@ PD_REGISTER_KERNEL(multi_encoder_xpu,
ALL_LAYOUT,
phi::fusion::MultiEncoderXPUKernel,
float,
phi::dtype::float16) {}
phi::dtype::float16) {
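  // seq_lod (input 7) and max_seq_len (input 8) stay on the host; they are
  // read directly when building the VSL attention parameters.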
kernel->InputAt(7).SetBackend(phi::Backend::CPU);
kernel->InputAt(8).SetBackend(phi::Backend::CPU);
}