Unverified commit c8aa6405, authored by zhupengyang, committed by GitHub

[XPU] fix dropout pass; add multi_encoder_xpu_fuse_pass & multi_encoder_xpu kernel (#50499)

Parent df207283
......@@ -213,10 +213,17 @@ endif()
if(WITH_XPU)
cc_library(
quant_utils
xpu_quant_utils
SRCS xpu/quant_utils.cc
DEPS pass)
pass_library(fc_xpu_fuse_pass inference DIR xpu DEPS quant_utils)
cc_library(
xpu_pass_utils
SRCS xpu/pass_utils.cc
DEPS pass)
set(XPU_PASS_DEPS xpu_quant_utils xpu_pass_utils)
pass_library(fc_xpu_fuse_pass inference DIR xpu DEPS ${XPU_PASS_DEPS})
pass_library(multi_encoder_xpu_fuse_pass inference DIR xpu DEPS
${XPU_PASS_DEPS})
endif()
cc_library(
......
......@@ -25,71 +25,52 @@ namespace paddle {
namespace framework {
namespace ir {
#define GET_IR_NODE(node__) GET_IR_NODE_FROM_SUBGRAPH(node__, node__, pattern);
#define GET_NODES \
GET_IR_NODE(any_op_out); \
GET_IR_NODE(dropout_op); \
GET_IR_NODE(dropout_op_out); \
GET_IR_NODE(dropout_op_outmask); \
GET_IR_NODE(any_op2);
#define GET_IR_NODE(node_) GET_IR_NODE_FROM_SUBGRAPH(node_, node_, pattern)
void DeleteDropoutOpPass::ApplyImpl(ir::Graph* graph) const {
const std::string pattern_name = "delete_dropout_op_pattern";
FusePassBase::Init(pattern_name, graph);
GraphPatternDetector gpd;
patterns::DeleteDropoutOpPattern pattern(gpd.mutable_pattern(), pattern_name);
pattern();
int found_subgraph_count = 0;
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
Graph* g) {
GET_NODES;
IR_NODE_LINK_TO(any_op_out, any_op2);
std::string any_op_out_name = any_op_out->Var()->Name();
std::string dropout_op_out_name = dropout_op_out->Var()->Name();
// any_op2
auto* any_op2_desc = any_op2->Op();
auto var_map = any_op2_desc->Inputs();
std::string arg_name = "";
for (auto& name_m : var_map) {
if (std::find(name_m.second.begin(),
name_m.second.end(),
dropout_op_out_name) != name_m.second.end()) {
arg_name = name_m.first;
}
}
if (arg_name.size() == 0) {
LOG(INFO) << "Delete dropout op pass: can not find the input "
<< dropout_op_out_name;
return;
}
// modify the any_op2's inputs
for (auto& name_m : var_map) {
if (std::find(name_m.second.begin(),
name_m.second.end(),
dropout_op_out_name) != name_m.second.end()) {
std::vector<std::string> new_inputs;
for (auto& i_n : name_m.second) {
if (i_n != dropout_op_out_name) {
new_inputs.push_back(i_n);
}
GET_IR_NODE(dropout_op_x);
GET_IR_NODE(dropout_op);
GET_IR_NODE(dropout_op_out);
GET_IR_NODE(dropout_op_mask);
// link dropout_op_out to pre_op
auto dropout_op_x_name = dropout_op_x->Var()->Name();
auto dropout_op_out_name = dropout_op_out->Var()->Name();
auto pre_ops = dropout_op_x->inputs;
if (pre_ops.empty()) return;
auto pre_op_desc = pre_ops[0]->Op();
auto pre_op_outs = pre_op_desc->Outputs();
for (auto& out_var : pre_op_outs) {
auto names = out_var.second;
for (size_t i = 0; i < names.size(); i++) {
if (names[i] == dropout_op_x_name) {
names[i] = dropout_op_out_name;
pre_op_desc->SetOutput(out_var.first, names);
break;
}
new_inputs.push_back(any_op_out_name);
any_op2_desc->SetInput(name_m.first, new_inputs);
any_op2_desc->Flush();
}
}
any_op2_desc->Flush();
IR_NODE_LINK_TO(pre_ops[0], dropout_op_out);
// Delete the unneeded nodes.
GraphSafeRemoveNodes(graph,
{dropout_op, dropout_op_out, dropout_op_outmask});
// Delete useless nodes
std::unordered_set<const Node*> delete_nodes{
dropout_op_x, dropout_op, dropout_op_mask};
GraphSafeRemoveNodes(graph, delete_nodes);
found_subgraph_count++;
};
gpd(graph, handler);
AddStatis(found_subgraph_count);
}
DeleteDropoutOpXPass::DeleteDropoutOpXPass() {
......@@ -279,6 +260,10 @@ void DeleteDropoutOpXPass::ReplaceOutputVar(Node* op,
REGISTER_PASS(delete_dropout_op_pass,
paddle::framework::ir::DeleteDropoutOpPass);
REGISTER_PASS_CAPABILITY(delete_dropout_op_pass)
.AddCombination(
paddle::framework::compatible::OpVersionComparatorCombination().EQ(
"dropout", 0));
REGISTER_PASS(delete_dropout_op_x_pass,
paddle::framework::ir::DeleteDropoutOpXPass);
......
......@@ -3034,26 +3034,19 @@ PDNode *patterns::TransposeFlattenConcat::operator()(
}
void patterns::DeleteDropoutOpPattern::operator()() {
auto any_op_out = pattern->NewNode(any_op_out_repr())
auto dropout_op_x = pattern->NewNode(dropout_op_x_repr())
->assert_is_op_input("dropout", "X")
->AsInput();
auto dropout_op =
pattern->NewNode(dropout_op_repr())->assert_is_op("dropout");
auto dropout_op = pattern->NewNode(dropout_op_repr())
->assert_is_op("dropout")
->assert_op_attr("dropout_implementation",
std::string("upscale_in_train"));
auto dropout_op_out = pattern->NewNode(dropout_op_out_repr())
->assert_is_op_output("dropout", "Out")
->AsIntermediate();
auto dropout_op_outmask = pattern->NewNode(dropout_op_outmask_repr())
->assert_is_op_output("dropout", "Mask")
->AsOutput();
auto any_op2 = pattern->NewNode(any_op2_repr())->assert_is_op()->AsOutput();
dropout_op->LinksFrom({any_op_out});
dropout_op_out->LinksFrom({dropout_op});
dropout_op_outmask->LinksFrom({dropout_op});
any_op2->LinksFrom({dropout_op_out});
->assert_is_op_output("dropout", "Out");
auto dropout_op_mask = pattern->NewNode(dropout_op_mask_repr())
->assert_is_op_output("dropout", "Mask");
dropout_op->LinksFrom({dropout_op_x})
.LinksTo({dropout_op_out, dropout_op_mask});
}
void patterns::DeleteQuantOpFuse::operator()(PDNode *input_act_node,
......
......@@ -1763,11 +1763,10 @@ struct DeleteDropoutOpPattern : public PatternBase {
void operator()();
PATTERN_DECL_NODE(any_op_out);
PATTERN_DECL_NODE(dropout_op_x);
PATTERN_DECL_NODE(dropout_op);
PATTERN_DECL_NODE(dropout_op_out);
PATTERN_DECL_NODE(dropout_op_outmask);
PATTERN_DECL_NODE(any_op2);
PATTERN_DECL_NODE(dropout_op_mask);
};
struct DeleteQuantDequantOpPattern : public PatternBase {
......
......@@ -176,15 +176,6 @@ class FcXPUFusePass : public FusePassBase {
const std::string& act_type) const;
const std::string name_scope_{"fc_xpu_fuse_pass"};
const std::map<std::string, int> act_map_{{"", 0},
{"relu", 1},
{"sigmoid", 2},
{"tanh", 3},
{"gelu", 4},
{"leaky_relu", 5},
{"hard_swish", 14},
{"hard_sigmoid", 15},
{"relu6", 17}};
};
void FcXPUFusePass::ApplyImpl(ir::Graph* graph) const {
......@@ -246,17 +237,13 @@ void FcXPUFusePass::ApplyImpl(ir::Graph* graph,
mul_w_max_var->SetPersistable(true);
auto mul_w_max_tensor =
scope->Var(mul_w_max_name)->GetMutable<phi::DenseTensor>();
auto* xpu_ctx = static_cast<phi::XPUContext*>(
platform::DeviceContextPool::Instance().Get(phi::XPUPlace()));
int max_ptr_size = xpu_ctx->x_context()->max_ptr_size();
bool transpose_w = false;
if (mul_type == "matmul") {
transpose_w = PADDLE_GET_CONST(bool, mul->Op()->GetAttr("transpose_Y"));
} else if (mul_type == "matmul_v2") {
transpose_w = PADDLE_GET_CONST(bool, mul->Op()->GetAttr("trans_y"));
}
QuantWeight<int16_t>(
mul_w_tensor, mul_w_max_tensor, !transpose_w, max_ptr_size);
QuantWeight<int16_t>(mul_w_tensor, mul_w_max_tensor, !transpose_w);
}
// Generate fc_xpu op
......@@ -288,7 +275,7 @@ void FcXPUFusePass::ApplyImpl(ir::Graph* graph,
fc_xpu_op_desc.SetAttr("act_type", 0);
fc_xpu_op_desc.SetAttr("act_alpha", 0.f);
if (act) {
fc_xpu_op_desc.SetAttr("act_type", act_map_.at(act_type));
fc_xpu_op_desc.SetAttr("act_type", ConvertActivationType(act_type));
if (act_type == "leaky_relu") {
fc_xpu_op_desc.SetAttr(
"act_alpha", PADDLE_GET_CONST(float, act->Op()->GetAttr("alpha")));
......
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/xpu/pass_utils.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/phi/kernels/xpu/xpu_api_wrapper.h"
namespace paddle {
namespace framework {
namespace ir {
int ConvertActivationType(std::string act_type) {
if (act_type == "") {
return static_cast<int>(xpu::Activation_t::LINEAR);
} else if (act_type == "relu") {
return static_cast<int>(xpu::Activation_t::RELU);
} else if (act_type == "sigmoid") {
return static_cast<int>(xpu::Activation_t::SIGMOID);
} else if (act_type == "tanh") {
return static_cast<int>(xpu::Activation_t::TANH);
} else if (act_type == "gelu") {
return static_cast<int>(xpu::Activation_t::GELU);
} else if (act_type == "leaky_relu") {
return static_cast<int>(xpu::Activation_t::LEAKY_RELU);
} else if (act_type == "exp") {
return static_cast<int>(xpu::Activation_t::EXP);
} else if (act_type == "hard_swish") {
return static_cast<int>(xpu::Activation_t::HARD_SWISH);
} else if (act_type == "hard_sigmoid") {
return static_cast<int>(xpu::Activation_t::HARD_SIGMOID);
} else if (act_type == "swish") {
return static_cast<int>(xpu::Activation_t::SWISH);
} else if (act_type == "relu6") {
return static_cast<int>(xpu::Activation_t::RELU6);
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"Not support convert activation_type(%s).", act_type));
}
return -1;
}
} // namespace ir
} // namespace framework
} // namespace paddle
......@@ -13,6 +13,7 @@
// limitations under the License.
#pragma once
#include <string>
namespace paddle {
namespace framework {
......@@ -42,6 +43,8 @@ namespace ir {
IR_NODE_LINK_TO(a, b) \
}
int ConvertActivationType(std::string act_type);
} // namespace ir
} // namespace framework
} // namespace paddle
......@@ -16,20 +16,34 @@
#include <vector>
#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/core/enforce.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace paddle {
namespace framework {
namespace ir {
template <typename T>
static void Transpose(const T* in, T* out, int h, int w) {
for (int h1 = 0; h1 < w; ++h1) {
for (int w1 = 0; w1 < h; ++w1) {
out[h1 * h + w1] = in[w1 * w + h1];
}
}
void Transpose2D(const phi::DenseTensor& in, phi::DenseTensor* out) {
auto in_dims = in.dims();
PADDLE_ENFORCE_EQ(
in_dims.size(),
2,
platform::errors::InvalidArgument(
"In dims rank should be 2, but received in dims size is [%d].",
in_dims.size()));
out->Resize({in_dims[1], in_dims[0]});
out->set_type(in.type());
auto* dev_ctx = static_cast<phi::CPUContext*>(
platform::DeviceContextPool::Instance().Get(phi::CPUPlace()));
dev_ctx->Alloc<T>(out);
std::vector<int> axis{1, 0};
phi::funcs::Transpose<phi::CPUContext, T, 2> trans2d;
trans2d(*dev_ctx, in, out, axis);
}
template void Transpose2D<float>(const phi::DenseTensor& in,
phi::DenseTensor* out);
static float FindMaxAbs(const float* data, int len) {
float max_f = 0.0f;
for (int i = 0; i < len; ++i) {
......@@ -136,25 +150,20 @@ void QuantFP32ToIntX<int16_t>(const float* src_ptr,
template <typename T>
void QuantWeight(phi::DenseTensor* weight,
phi::DenseTensor* weight_max,
bool transpose,
int max_ptr_size) {
bool transpose) {
// Transpose
auto* weight_data = weight->data<float>();
auto dims = weight->dims();
auto size = weight->numel();
std::vector<float> transpose_data(weight_data, weight_data + size);
phi::DenseTensor weight_trans;
if (transpose) {
PADDLE_ENFORCE_EQ(
dims.size(),
2,
platform::errors::InvalidArgument(
"Only support 2D weight, but received weight rank is [%d].",
dims.size()));
Transpose(weight_data, transpose_data.data(), dims[0], dims[1]);
weight->Resize({dims[1], dims[0]});
Transpose2D<float>(*weight, &weight_trans);
weight_data = weight_trans.data<float>();
weight->Resize(weight_trans.dims());
}
weight_data = transpose_data.data();
// Find max
auto* xpu_ctx = static_cast<phi::XPUContext*>(
platform::DeviceContextPool::Instance().Get(phi::XPUPlace()));
int max_ptr_size = xpu_ctx->x_context()->max_ptr_size();
int size = weight->numel();
float max_val = FindMaxAbs(weight_data, size);
std::vector<float> max_vec(max_ptr_size, max_val);
weight_max->set_type(paddle::experimental::CppTypeToDataType<float>::Type());
......@@ -173,8 +182,7 @@ void QuantWeight(phi::DenseTensor* weight,
template void QuantWeight<int16_t>(phi::DenseTensor* weight,
phi::DenseTensor* weight_max,
bool transpose,
int max_ptr_size);
bool transpose);
} // namespace ir
} // namespace framework
......
......@@ -19,14 +19,16 @@ namespace paddle {
namespace framework {
namespace ir {
template <typename T>
void Transpose2D(const phi::DenseTensor& in, phi::DenseTensor* out);
// 1. Quantize weight from fp32 to int16/int31.
// 2. Weight data is updated in place.
// 3. Generate the weight max tensor.
template <typename T>
void QuantWeight(phi::DenseTensor* weight,
phi::DenseTensor* weight_max,
bool transpose,
int max_ptr_size);
bool transpose);
} // namespace ir
} // namespace framework
......
......@@ -2745,8 +2745,6 @@ void OperatorWithKernel::ParseMultiInputDataType(
const phi::DenseTensor* t = nullptr;
if (var->IsType<phi::DenseTensor>()) {
t = &var->Get<phi::DenseTensor>();
} else if (var->IsType<phi::DenseTensor>()) {
t = &var->Get<phi::DenseTensor>();
} else if (var->IsType<phi::SelectedRows>()) {
t = &(var->Get<phi::SelectedRows>().value());
} else if (var->IsType<phi::SparseCooTensor>()) {
......@@ -2866,8 +2864,6 @@ phi::DenseTensor* OperatorWithKernel::GetTensorFormInputSafely(
phi::DenseTensor* t = nullptr;
if (var->IsType<phi::DenseTensor>()) {
t = var->GetMutable<phi::DenseTensor>();
} else if (var->IsType<phi::DenseTensor>()) {
t = var->GetMutable<phi::DenseTensor>();
} else if (var->IsType<phi::SelectedRows>()) {
t = var->GetMutable<phi::SelectedRows>()->mutable_value();
} else {
......
......@@ -517,7 +517,7 @@ void CpuPassStrategy::EraseFcMkldnnPasses() {
XpuPassStrategy::XpuPassStrategy() : PassStrategy({}) {
passes_.assign({
"delete_dropout_op_pass",
// "multi_encoder_xpu_fuse_pass",
"multi_encoder_xpu_fuse_pass",
// "embedding_with_eltwise_add_xpu_fuse_pass",
"fc_xpu_fuse_pass",
// "multi_encoder_slice_link_xpu_fuse_pass",
......
......@@ -5,8 +5,19 @@
func : FcXPUInferMeta
kernel :
func : fc_xpu
data_type : x
optional : bias
- op : multi_encoder_xpu
args : (Tensor x, Tensor[] fc_weight, Tensor[] fc_weight_max, Tensor[] fc_bias, Tensor[] ln_scale, Tensor[] ln_bias, Tensor mask, int layer_num, bool norm_before, int hidden_dim, int head_num, int size_per_head, int ffn_hidden_dim_scale, int act_type, int relative_type, int slice_idx)
output : Tensor(out), Tensor(x_fp16), Tensor(out_fp16)
infer_meta :
func : MultiEncoderXPUInferMeta
kernel :
func : multi_encoder_xpu
data_type : x
optional : mask, x_fp16, out_fp16
- op : share_buffer
args : (Tensor[] x, bool[] share_dims_and_dtype={})
output : Tensor[](out){x.size()}, Tensor[](xout){x.size()}
......
......@@ -421,6 +421,7 @@ XPUOpMap& get_kl2_ops() {
phi::DataType::FLOAT16,
phi::DataType::INT32,
phi::DataType::INT64})},
{"multi_encoder_xpu", XPUKernelSet({phi::DataType::FLOAT32})},
{"nearest_interp_v2", XPUKernelSet({phi::DataType::FLOAT32})},
{"nearest_interp_v2_grad", XPUKernelSet({phi::DataType::FLOAT32})},
{"not_equal",
......
......@@ -42,4 +42,41 @@ void FcXPUInferMeta(const MetaTensor& x,
out->set_layout(x.layout());
}
void MultiEncoderXPUInferMeta(
const MetaTensor& x,
const std::vector<const MetaTensor*>& fc_weight,
const std::vector<const MetaTensor*>& fc_weight_max,
const std::vector<const MetaTensor*>& fc_bias,
const std::vector<const MetaTensor*>& ln_scale,
const std::vector<const MetaTensor*>& ln_bias,
const MetaTensor& mask,
int layer_num,
bool norm_before,
int hidden_dim,
int head_num,
int size_per_head,
int ffn_hidden_dim_scale,
int act_type,
int relative_type,
int slice_idx,
MetaTensor* out,
MetaTensor* x_fp16,
MetaTensor* out_fp16) {
auto x_dims = x.dims();
x_fp16->set_dims(x_dims);
x_fp16->set_dtype(DataType::FLOAT16);
x_fp16->set_layout(x.layout());
out->set_dtype(x.dtype());
out->set_layout(x.layout());
out_fp16->set_dtype(DataType::FLOAT16);
out_fp16->set_layout(x.layout());
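// slice_idx == -1 keeps the full [batch, seq_len, hidden] output shape;
// otherwise the sequence dim is sliced away, leaving [batch, hidden].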
if (slice_idx == -1) {
out->set_dims(x_dims);
out_fp16->set_dims(x_dims);
} else {
out->set_dims({x_dims[0], x_dims[2]});
out_fp16->set_dims({x_dims[0], x_dims[2]});
}
}
} // namespace phi
......@@ -34,4 +34,25 @@ void FcXPUInferMeta(const MetaTensor& x,
float act_alpha,
MetaTensor* out);
void MultiEncoderXPUInferMeta(
const MetaTensor& x,
const std::vector<const MetaTensor*>& fc_weight,
const std::vector<const MetaTensor*>& fc_weight_max,
const std::vector<const MetaTensor*>& fc_bias,
const std::vector<const MetaTensor*>& ln_scale,
const std::vector<const MetaTensor*>& ln_bias,
const MetaTensor& mask,
int layer_num,
bool norm_before,
int hidden_dim,
int head_num,
int size_per_head,
int ffn_hidden_dim_scale,
int act_type,
int relative_type,
int slice_idx,
MetaTensor* out,
MetaTensor* x_fp16,
MetaTensor* out_fp16);
} // namespace phi
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "paddle/phi/common/float16.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
namespace fusion {
template <typename T, typename Context>
void MultiEncoderXPUKernel(const Context& ctx,
const DenseTensor& x,
const std::vector<const DenseTensor*>& fc_weight,
const std::vector<const DenseTensor*>& fc_weight_max,
const std::vector<const DenseTensor*>& fc_bias,
const std::vector<const DenseTensor*>& ln_scale,
const std::vector<const DenseTensor*>& ln_bias,
const paddle::optional<DenseTensor>& mask,
int layer_num,
bool norm_before,
int hidden_dim,
int head_num,
int size_per_head,
int ffn_hidden_dim_scale,
int act_type,
int relative_type,
int slice_idx,
DenseTensor* out,
DenseTensor* x_fp16,
DenseTensor* out_fp16) {
using float16 = typename XPUTypeTrait<phi::dtype::float16>::Type;
// XPU2 only supports fp16 input/output.
float16* x_fp16_data = reinterpret_cast<float16*>(
ctx.template Alloc<phi::dtype::float16>(x_fp16));
int r_cast_x = xpu::cast_v2<float, float16>(
ctx.x_context(), x.data<T>(), x_fp16_data, x.numel());
PADDLE_ENFORCE_XDNN_SUCCESS(r_cast_x,
"multi_encoder_xpu(cast x from fp32 to fp16)");
float16* out_fp16_data = reinterpret_cast<float16*>(
ctx.template Alloc<phi::dtype::float16>(out_fp16));
// q, k, v weights are fused.
// Each encoder's weights should be: w0, null, null, w3, w4, w5.
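// The fused qkv weight occupies slot 0 of each 6-slot group, so slots 1 and 2
// are filled with nullptr in the loop below.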
std::vector<const float*> fc_input_max_data;
std::vector<const int16_t*> fc_weight_data;
std::vector<const float*> fc_weight_max_data;
std::vector<const float*> fc_bias_data;
for (size_t i = 0; i < fc_weight.size(); i++) {
fc_weight_data.push_back(fc_weight[i]->data<int16_t>());
fc_weight_max_data.push_back(fc_weight_max[i]->data<float>());
fc_bias_data.push_back(fc_bias[i]->data<float>());
if (i % 4 == 0) {
fc_weight_data.push_back(nullptr);
fc_weight_data.push_back(nullptr);
fc_weight_max_data.push_back(nullptr);
fc_weight_max_data.push_back(nullptr);
fc_bias_data.push_back(nullptr);
fc_bias_data.push_back(nullptr);
}
}
std::vector<const float*> ln_scale_data;
std::vector<const float*> ln_bias_data;
for (size_t i = 0; i < ln_scale.size(); i++) {
ln_scale_data.push_back(ln_scale[i]->data<float>());
ln_bias_data.push_back(ln_bias[i]->data<float>());
}
const T* mask_data =
mask.get_ptr() == nullptr ? nullptr : mask.get_ptr()->data<T>();
xpu::Activation_t qkv_act(static_cast<xpu::Activation_t::act_enum>(act_type));
int batch = x.dims()[0];
int max_seqlen = x.dims()[1];
// matmul_size (8 matmuls per encoder layer) * layer_num
std::vector<xpu::QuantType> quant_types(8 * layer_num,
xpu::QuantType::NOT_QUANT);
if (mask_data) {
auto mask_dims = mask.get_ptr()->dims();
std::vector<int> mask_shape(mask_dims.Get(),
mask_dims.Get() + mask_dims.size());
xpu::QKVAttnParam qkv_attn_param(batch,
max_seqlen,
head_num,
size_per_head,
mask_shape,
qkv_act,
slice_idx,
true,
hidden_dim,
norm_before,
false);
qkv_attn_param.quant_type_.assign(quant_types.begin(), quant_types.end());
qkv_attn_param.scale_of_hidden_units = ffn_hidden_dim_scale;
int r =
xpu::transformer_encoder<float16, int16_t, int16_t>(ctx.x_context(),
x_fp16_data,
fc_weight_data,
out_fp16_data,
fc_input_max_data,
fc_weight_max_data,
fc_bias_data,
ln_scale_data,
ln_bias_data,
qkv_attn_param,
mask_data);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "multi_encoder_xpu");
} else {
// When there is no mask input (e.g. ViT), create a LOD to act as vsl.
std::vector<int> lod;
for (int i = 0; i < batch + 1; i++) {
lod.push_back(i * max_seqlen);
}
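// The resulting lod is {0, max_seqlen, 2 * max_seqlen, ...}, so every
// sequence in the batch is treated as full length.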
xpu::VectorParam<int> query_lod = {
lod.data(), static_cast<int>(lod.size()), nullptr};
// No padding is needed, whether slicing or not.
xpu::QKVAttnParam qkv_attn_param(query_lod,
head_num,
size_per_head,
qkv_act,
slice_idx,
true,
-1,
hidden_dim,
norm_before,
false);
qkv_attn_param.quant_type_.assign(quant_types.begin(), quant_types.end());
qkv_attn_param.scale_of_hidden_units = ffn_hidden_dim_scale;
int r =
xpu::transformer_encoder<float16, int16_t, int16_t>(ctx.x_context(),
x_fp16_data,
fc_weight_data,
out_fp16_data,
fc_input_max_data,
fc_weight_max_data,
fc_bias_data,
ln_scale_data,
ln_bias_data,
qkv_attn_param);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "multi_encoder_xpu");
}
int r_cast_out = xpu::cast_v2<float16, float>(
ctx.x_context(), out_fp16_data, ctx.template Alloc<T>(out), out->numel());
PADDLE_ENFORCE_XDNN_SUCCESS(r_cast_out,
"multi_encoder_xpu(cast out from fp16 to fp32)");
}
} // namespace fusion
} // namespace phi
PD_REGISTER_KERNEL(multi_encoder_xpu,
XPU,
ALL_LAYOUT,
phi::fusion::MultiEncoderXPUKernel,
float) {}
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import hypothesis.strategies as st
from auto_scan_test import PassAutoScanTest
from program_config import OpConfig, ProgramConfig, TensorConfig
class TestDeleteDropoutOpPass(PassAutoScanTest):
def sample_predictor_configs(self, program_config):
config = self.create_inference_config(use_xpu=True)
yield config, ["elementwise_add", "relu", "relu6"], (1e-5, 1e-5)
def sample_program_config(self, draw):
add_op = OpConfig(
"elementwise_add",
inputs={"X": ["add_x"], "Y": ["add_y"]},
outputs={"Out": ["add_out"]},
axis=-1,
)
dropout_op = OpConfig(
"dropout",
inputs={"X": ["add_out"]},
outputs={"Out": ["dropout_out"], "Mask": ["dropout_mask"]},
dropout_implementation="upscale_in_train",
dropout_prob=0.1,
fix_seed=False,
is_test=True,
seed=0,
)
relu_op = OpConfig(
"relu",
inputs={"X": ["dropout_out"]},
outputs={"Out": ["relu_out"]},
)
relu6_op = OpConfig(
"relu6",
inputs={"X": ["dropout_out"]},
outputs={"Out": ["relu6_out"]},
)
ops = [add_op, dropout_op, relu_op, relu6_op]
add_x_shape = draw(
st.lists(
st.integers(min_value=1, max_value=4), min_size=2, max_size=4
)
)
program_config = ProgramConfig(
ops=ops,
weights={},
inputs={
"add_x": TensorConfig(shape=add_x_shape),
"add_y": TensorConfig(shape=add_x_shape),
},
outputs=["relu_out", "relu6_out"],
)
return program_config
def test(self):
self.run_and_statis(
quant=False,
max_examples=1,
min_success_num=1,
passes=["delete_dropout_op_pass"],
)
if __name__ == "__main__":
unittest.main()
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import hypothesis.strategies as st
import numpy as np
from auto_scan_test import PassAutoScanTest
from program_config import OpConfig, ProgramConfig, TensorConfig
class TestMultiEncoderXPUFusePass(PassAutoScanTest):
def sample_predictor_configs(self, program_config):
config = self.create_inference_config(use_xpu=True)
yield config, ["multi_encoder_xpu"], (1e-1, 1e-1)
def sample_program_config(self, draw):
# q: matmul+add+reshape+transpose
q_matmul_op = OpConfig(
"matmul_v2",
inputs={"X": ["q_matmul_x"], "Y": ["q_matmul_w"]},
outputs={"Out": ["q_matmul_out"]},
trans_x=False,
trans_y=False,
)
q_add_op = OpConfig(
"elementwise_add",
inputs={"X": ["q_matmul_out"], "Y": ["q_add_bias"]},
outputs={"Out": ["q_add_out"]},
axis=2,
)
q_reshape_op = OpConfig(
"reshape2",
inputs={"X": ["q_add_out"]},
outputs={"Out": ["q_reshape_out"], "XShape": ["q_reshape_xshape"]},
shape=[0, 0, 12, 64],
)
q_transpose_op = OpConfig(
"transpose2",
inputs={"X": ["q_reshape_out"]},
outputs={
"Out": ["q_transpose_out"],
"XShape": ["q_transpose_xshape"],
},
axis=[0, 2, 1, 3],
)
# k: matmul+add+reshape+transpose
k_matmul_op = OpConfig(
"matmul_v2",
inputs={"X": ["q_matmul_x"], "Y": ["k_matmul_w"]},
outputs={"Out": ["k_matmul_out"]},
trans_x=False,
trans_y=False,
)
k_add_op = OpConfig(
"elementwise_add",
inputs={"X": ["k_matmul_out"], "Y": ["k_add_bias"]},
outputs={"Out": ["k_add_out"]},
axis=2,
)
k_reshape_op = OpConfig(
"reshape2",
inputs={"X": ["k_add_out"]},
outputs={"Out": ["k_reshape_out"], "XShape": ["k_reshape_xshape"]},
shape=[0, 0, 12, 64],
)
k_transpose_op = OpConfig(
"transpose2",
inputs={"X": ["k_reshape_out"]},
outputs={
"Out": ["k_transpose_out"],
"XShape": ["k_transpose_xshape"],
},
axis=[0, 2, 1, 3],
)
# v: matmul+add+reshape+transpose
v_matmul_op = OpConfig(
"matmul_v2",
inputs={"X": ["q_matmul_x"], "Y": ["v_matmul_w"]},
outputs={"Out": ["v_matmul_out"]},
trans_x=False,
trans_y=False,
)
v_add_op = OpConfig(
"elementwise_add",
inputs={"X": ["v_matmul_out"], "Y": ["v_add_bias"]},
outputs={"Out": ["v_add_out"]},
axis=2,
)
v_reshape_op = OpConfig(
"reshape2",
inputs={"X": ["v_add_out"]},
outputs={"Out": ["v_reshape_out"], "XShape": ["v_reshape_xshape"]},
shape=[0, 0, 12, 64],
)
v_transpose_op = OpConfig(
"transpose2",
inputs={"X": ["v_reshape_out"]},
outputs={
"Out": ["v_transpose_out"],
"XShape": ["v_transpose_xshape"],
},
axis=[0, 2, 1, 3],
)
# qk: matmul+add+softmax
qk_matmul_op = OpConfig(
"matmul",
inputs={"X": ["q_transpose_out"], "Y": ["k_transpose_out"]},
outputs={"Out": ["qk_matmul_out"]},
alpha=0.125,
transpose_X=False,
transpose_Y=True,
)
qk_add_op = OpConfig(
"elementwise_add",
inputs={"X": ["qk_matmul_out"], "Y": ["qk_add_mask"]},
outputs={"Out": ["qk_add_out"]},
axis=-1,
)
qk_softmax_op = OpConfig(
"softmax",
inputs={"X": ["qk_add_out"]},
outputs={"Out": ["qk_softmax_out"]},
axis=-1,
)
# qkv
qkv_matmul_0_op = OpConfig(
"matmul_v2",
inputs={"X": ["qk_softmax_out"], "Y": ["v_transpose_out"]},
outputs={"Out": ["qkv_matmul_0_out"]},
trans_x=False,
trans_y=False,
)
qkv_transpose_op = OpConfig(
"transpose2",
inputs={"X": ["qkv_matmul_0_out"]},
outputs={
"Out": ["qkv_transpose_out"],
"XShape": ["qkv_transpose_xshape"],
},
axis=[0, 2, 1, 3],
)
qkv_reshape_op = OpConfig(
"reshape2",
inputs={"X": ["qkv_transpose_out"]},
outputs={
"Out": ["qkv_reshape_out"],
"XShape": ["qkv_reshape_xshape"],
},
shape=[0, 0, 768],
)
qkv_matmul_1_op = OpConfig(
"matmul_v2",
inputs={"X": ["qkv_reshape_out"], "Y": ["qkv_matmul_1_w"]},
outputs={"Out": ["qkv_matmul_1_out"]},
trans_x=False,
trans_y=False,
)
qkv_add_0_op = OpConfig(
"elementwise_add",
inputs={"X": ["qkv_matmul_1_out"], "Y": ["qkv_add_0_bias"]},
outputs={"Out": ["qkv_add_0_out"]},
axis=2,
)
qkv_add_1_op = OpConfig(
"elementwise_add",
inputs={"X": ["qkv_add_0_out"], "Y": ["q_matmul_x"]},
outputs={"Out": ["qkv_add_1_out"]},
axis=-1,
)
ln_1_op = OpConfig(
"layer_norm",
inputs={
"X": ["qkv_add_1_out"],
"Bias": ["ln_1_bias"],
"Scale": ["ln_1_scale"],
},
outputs={
"Y": ["ln_1_out"],
"Mean": ["ln_1_mean"],
"Variance": ["ln_1_variance"],
},
begin_norm_axis=2,
epsilon=1e-14,
)
qkv_matmul_2_op = OpConfig(
"matmul_v2",
inputs={"X": ["ln_1_out"], "Y": ["qkv_matmul_2_w"]},
outputs={"Out": ["qkv_matmul_2_out"]},
trans_x=False,
trans_y=False,
)
qkv_add_2_op = OpConfig(
"elementwise_add",
inputs={"X": ["qkv_matmul_2_out"], "Y": ["qkv_add_2_bias"]},
outputs={"Out": ["qkv_add_2_out"]},
axis=2,
)
qkv_act_op = OpConfig(
"gelu",
inputs={"X": ["qkv_add_2_out"]},
outputs={"Out": ["qkv_act_out"]},
approximate=False,
)
qkv_matmul_3_op = OpConfig(
"matmul_v2",
inputs={"X": ["qkv_act_out"], "Y": ["qkv_matmul_3_w"]},
outputs={"Out": ["qkv_matmul_3_out"]},
trans_x=False,
trans_y=False,
)
qkv_add_3_op = OpConfig(
"elementwise_add",
inputs={"X": ["qkv_matmul_3_out"], "Y": ["qkv_add_3_bias"]},
outputs={"Out": ["qkv_add_3_out"]},
axis=2,
)
qkv_add_4_op = OpConfig(
"elementwise_add",
inputs={"X": ["ln_1_out"], "Y": ["qkv_add_3_out"]},
outputs={"Out": ["qkv_add_4_out"]},
axis=-1,
)
ln_2_op = OpConfig(
"layer_norm",
inputs={
"X": ["qkv_add_4_out"],
"Bias": ["ln_2_bias"],
"Scale": ["ln_2_scale"],
},
outputs={
"Y": ["ln_2_out"],
"Mean": ["ln_2_mean"],
"Variance": ["ln_2_variance"],
},
begin_norm_axis=2,
epsilon=1e-14,
)
ops = [
q_matmul_op,
q_add_op,
q_reshape_op,
q_transpose_op,
k_matmul_op,
k_add_op,
k_reshape_op,
k_transpose_op,
v_matmul_op,
v_add_op,
v_reshape_op,
v_transpose_op,
qk_matmul_op,
qk_add_op,
qk_softmax_op,
qkv_matmul_0_op,
qkv_transpose_op,
qkv_reshape_op,
qkv_matmul_1_op,
qkv_add_0_op,
qkv_add_1_op,
ln_1_op,
qkv_matmul_2_op,
qkv_add_2_op,
qkv_act_op,
qkv_matmul_3_op,
qkv_add_3_op,
qkv_add_4_op,
ln_2_op,
]
q_matmul_x_shape = draw(
st.lists(
st.integers(min_value=3, max_value=10), min_size=3, max_size=3
)
)
q_matmul_x_shape[2] = 768
q_matmul_w_shape = [q_matmul_x_shape[2], q_matmul_x_shape[2]]
q_add_bias_shape = [q_matmul_x_shape[2]]
qk_add_mask_shape = [q_matmul_x_shape[0], 1, 1, q_matmul_x_shape[1]]
qkv_matmul_2_w_shape = [q_matmul_x_shape[2], 3072]
qkv_add_2_bias_shape = [qkv_matmul_2_w_shape[1]]
qkv_matmul_3_w_shape = [3072, q_matmul_x_shape[2]]
qkv_add_3_bias_shape = [qkv_matmul_3_w_shape[1]]
ln_1_bias_shape = [q_matmul_x_shape[2]]
# def generate_q_matmul_w():
# return np.random.random(x_shape).astype(np.float32)
program_config = ProgramConfig(
ops=ops,
weights={
"q_matmul_w": TensorConfig(shape=q_matmul_w_shape),
"q_add_bias": TensorConfig(shape=q_add_bias_shape),
"k_matmul_w": TensorConfig(shape=q_matmul_w_shape),
"k_add_bias": TensorConfig(shape=q_add_bias_shape),
"v_matmul_w": TensorConfig(shape=q_matmul_w_shape),
"v_add_bias": TensorConfig(shape=q_add_bias_shape),
"qkv_matmul_1_w": TensorConfig(shape=q_matmul_w_shape),
"qkv_add_0_bias": TensorConfig(shape=q_add_bias_shape),
"qkv_matmul_2_w": TensorConfig(shape=qkv_matmul_2_w_shape),
"qkv_add_2_bias": TensorConfig(shape=qkv_add_2_bias_shape),
"qkv_matmul_3_w": TensorConfig(shape=qkv_matmul_3_w_shape),
"qkv_add_3_bias": TensorConfig(shape=qkv_add_3_bias_shape),
"ln_1_bias": TensorConfig(shape=ln_1_bias_shape),
"ln_1_scale": TensorConfig(shape=ln_1_bias_shape),
"ln_2_bias": TensorConfig(shape=ln_1_bias_shape),
"ln_2_scale": TensorConfig(shape=ln_1_bias_shape),
},
inputs={
"q_matmul_x": TensorConfig(shape=q_matmul_x_shape),
"qk_add_mask": TensorConfig(shape=qk_add_mask_shape),
},
outputs=["ln_2_out"],
)
return program_config
def test(self):
self.run_and_statis(
quant=False,
max_examples=2,
min_success_num=2,
passes=["multi_encoder_xpu_fuse_pass"],
)
if __name__ == "__main__":
np.random.seed(200)
unittest.main()