未验证 提交 2afa9b76 编写于 作者: W wanghuancoder 提交者: GitHub

[Eager] Manual fused attention in eager (#43974)

* fused_gate_attention manual code in eager
上级 9aaae254
......@@ -67,3 +67,37 @@ fused_feedforward_dygraph_function(
const paddle::experimental::Tensor& Ln2Scale,
const paddle::experimental::Tensor& Ln2Bias,
const paddle::framework::AttributeMap& attr_map);
// Forward entry point for the fused_attention op in eager (dygraph) mode.
//
// Takes the eleven input tensors listed below plus the op's attribute map and
// returns a tuple of twenty tensors.
// NOTE(review): the meaning/order of the twenty tuple elements is not visible
// from this declaration. The grad node reads the gradient of Y from incoming
// grad slot 19, which suggests Y is the last element — confirm against the
// definition in fused_attention_fwd_func.cc.
std::tuple<paddle::experimental::Tensor,
paddle::experimental::Tensor,
paddle::experimental::Tensor,
paddle::experimental::Tensor,
paddle::experimental::Tensor,
paddle::experimental::Tensor,
paddle::experimental::Tensor,
paddle::experimental::Tensor,
paddle::experimental::Tensor,
paddle::experimental::Tensor,
paddle::experimental::Tensor,
paddle::experimental::Tensor,
paddle::experimental::Tensor,
paddle::experimental::Tensor,
paddle::experimental::Tensor,
paddle::experimental::Tensor,
paddle::experimental::Tensor,
paddle::experimental::Tensor,
paddle::experimental::Tensor,
paddle::experimental::Tensor>
fused_attention_dygraph_function(
const paddle::experimental::Tensor& X,
const paddle::experimental::Tensor& LnScale,
const paddle::experimental::Tensor& LnBias,
const paddle::experimental::Tensor& QKVW,
const paddle::experimental::Tensor& QKVBias,
const paddle::experimental::Tensor& CacheKV,
const paddle::experimental::Tensor& SrcMask,
const paddle::experimental::Tensor& OutLinearW,
const paddle::experimental::Tensor& OutLinearBias,
const paddle::experimental::Tensor& Ln2Scale,
const paddle::experimental::Tensor& Ln2Bias,
const paddle::framework::AttributeMap& attr_map);
......@@ -12,6 +12,14 @@ cc_library(
add_dependencies(fused_feedforward_fwd_func eager_codegen)
# Library target for the manually written eager forward function of the
# fused_attention op.
cc_library(
fused_attention_fwd_func
SRCS fused_attention_fwd_func.cc
DEPS ${eager_deps} ${fluid_deps} ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS})
# Make sure the eager_codegen target is built before this library.
add_dependencies(fused_attention_fwd_func eager_codegen)
# Export the list of manual forward-function targets to the parent scope.
set(fluid_manual_functions
fused_gate_attention_fwd_func fused_feedforward_fwd_func
fused_attention_fwd_func
PARENT_SCOPE)
......@@ -8,6 +8,11 @@ cc_library(
SRCS fused_feedforward_node.cc
DEPS ${eager_deps} ${fluid_deps})
# Library target for the manually written eager grad node of the
# fused_attention op.
cc_library(
fused_attention_node
SRCS fused_attention_node.cc
DEPS ${eager_deps} ${fluid_deps})
# Export the list of manual grad-node targets to the parent scope.
# Each target is listed exactly once.
set(fluid_manual_nodes
    fused_gate_attention_node fused_feedforward_node fused_attention_node
    PARENT_SCOPE)
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "glog/logging.h"
#include "paddle/fluid/eager/api/manual/fluid_manual/nodes/nodes.h"
#include "paddle/fluid/eager/api/utils/global_utils.h"
#include "paddle/fluid/eager/utils.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/imperative/tracer.h"
#include "paddle/phi/api/all.h"
// Runs the backward computation of the fused_attention op for this grad node.
//
// The saved forward tensors and the (hooked) incoming gradient of Y are passed
// to the "fused_attention_grad" op through the current tracer. The gradients
// it produces are returned indexed by backward output slot (23 slots).
//
// A gradient variable is only requested for a slot whose OutputMeta entry is
// non-empty and not marked stop-gradient; all other slots stay empty.
// Layer-norm related tensors are selected by the "pre_layer_norm" attribute:
// the Ln* tensors when it is true, the Ln2* tensors and
// BiasDropoutResidualOut otherwise.
//
// Params:
//   grads        - incoming gradients; slot 19 is read as the gradient of Y.
//   create_graph - not consulted by this node.
//   is_new_grad  - not consulted by this node.
// Returns: per-slot gradient tensors, converted from complex to real when
//   NeedComplexToRealConversion() reports it is required.
paddle::small_vector<std::vector<paddle::experimental::Tensor>,
                     egr::kSlotSmallVectorSize>
fused_attentionGradNodeCompat::operator()(
    paddle::small_vector<std::vector<paddle::experimental::Tensor>,
                         egr::kSlotSmallVectorSize>& grads,
    bool create_graph,
    bool is_new_grad) {
  VLOG(3) << "Running Eager Backward Node: fused_attentionGradNodeCompat";

  const auto& out_metas = OutputMeta();
  paddle::small_vector<std::vector<paddle::experimental::Tensor>,
                       egr::kSlotSmallVectorSize>
      outputs(23);
  paddle::small_vector<std::vector<paddle::experimental::Tensor>,
                       egr::kSlotSmallVectorSize>
      hooked_grads0 = fused_attentionGradNodeCompat::ApplyGradientHooks(grads);

  bool pre_layer_norm = false;
  if (attr_map_.count("pre_layer_norm")) {
    pre_layer_norm = BOOST_GET_CONST(bool, attr_map_.at("pre_layer_norm"));
  }

  // Small local shorthands for the two EagerUtils calls used throughout.
  const auto recover = [](egr::TensorWrapper* wrapper) {
    return egr::EagerUtils::RecoverTensorWrapper(wrapper);
  };
  const auto to_vars = [](const paddle::experimental::Tensor& tensor) {
    return egr::EagerUtils::TrySyncToVars(tensor);
  };

  // Each wrapper is recovered exactly once; the resulting tensor is used both
  // as a grad-op input and for the defined() check that gates its gradient.
  auto QKVOut = recover(&this->QKVOut_);
  auto QKTVOut = recover(&this->QKTVOut_);
  auto TransposeOut2 = recover(&this->TransposeOut2_);
  auto QKOut = recover(&this->QKOut_);
  auto SoftmaxOut = recover(&this->SoftmaxOut_);
  auto AttnDropoutOut = recover(&this->AttnDropoutOut_);
  auto FMHAOut = recover(&this->FMHAOut_);
  auto OutLinearOut = recover(&this->OutLinearOut_);

  // Inputs that are always handed to the grad op.
  std::map<std::string, std::vector<std::shared_ptr<egr::EagerVariable>>> ins0 =
      {{"AttnDropoutMaskOut", to_vars(recover(&this->AttnDropoutMaskOut_))},
       {"AttnDropoutOut", to_vars(AttnDropoutOut)},
       {"DropoutMaskOut", to_vars(recover(&this->DropoutMaskOut_))},
       {"FMHAOut", to_vars(FMHAOut)},
       {"OutLinearOut", to_vars(OutLinearOut)},
       {"OutLinearW", to_vars(recover(&this->OutLinearW_))},
       {"QKOut", to_vars(QKOut)},
       {"QKTVOut", to_vars(QKTVOut)},
       {"QKVOut", to_vars(QKVOut)},
       {"QKVW", to_vars(recover(&this->QKVW_))},
       {"SoftmaxOut", to_vars(SoftmaxOut)},
       {"TransposeOut2", to_vars(TransposeOut2)},
       {"X", to_vars(recover(&this->X_))},
       {"Y@GRAD", egr::EagerUtils::TrySyncToVars(hooked_grads0[19])}};

  std::map<std::string, std::vector<std::shared_ptr<egr::EagerVariable>>> outs0;
  // Remembers which output slot every requested gradient belongs to, so the
  // name -> slot mapping is written down only once.
  std::map<std::string, size_t> grad_slots;

  // Requests `grad_name` from the grad op if slot `slot` needs a gradient.
  const auto request_grad = [&out_metas, &outs0, &grad_slots](
                                const std::string& grad_name, size_t slot) {
    if (out_metas[slot].empty() || out_metas[slot][0].IsStopGradient()) {
      return;
    }
    outs0[grad_name] = {std::make_shared<egr::EagerVariable>(
        egr::Controller::Instance().GenerateUniqueName())};
    grad_slots[grad_name] = slot;
  };

  // Gradients gated only by the output meta.
  request_grad("OutLinearW@GRAD", 7);
  request_grad("QKVW@GRAD", 3);
  request_grad("X@GRAD", 0);

  // Gradients of intermediate tensors, requested only when the saved tensor
  // is defined.
  if (QKVOut.defined()) request_grad("QKVOut@GRAD", 15);
  if (QKTVOut.defined()) request_grad("QKTVOut@GRAD", 16);
  if (TransposeOut2.defined()) request_grad("TransposeOut2@GRAD", 17);
  if (QKOut.defined()) request_grad("QKOut@GRAD", 18);
  if (SoftmaxOut.defined()) request_grad("SoftmaxOut@GRAD", 19);
  if (AttnDropoutOut.defined()) request_grad("AttnDropoutOut@GRAD", 20);
  if (FMHAOut.defined()) request_grad("FMHAOut@GRAD", 21);
  if (OutLinearOut.defined()) request_grad("OutLinearOut@GRAD", 22);

  // Optional QKV bias: QKVBiasOut is forwarded whenever QKVBias is defined.
  auto QKVBias = recover(&this->QKVBias_);
  if (QKVBias.defined()) {
    ins0["QKVBias"] = to_vars(QKVBias);
    auto QKVBiasOut = recover(&this->QKVBiasOut_);
    ins0["QKVBiasOut"] = to_vars(QKVBiasOut);
    request_grad("QKVBias@GRAD", 4);
    if (QKVBiasOut.defined()) request_grad("QKVBiasOut@GRAD", 11);
  }

  // Optional attention mask: only SrcMaskOut can receive a gradient.
  auto SrcMask = recover(&this->SrcMask_);
  if (SrcMask.defined()) {
    ins0["SrcMask"] = to_vars(SrcMask);
    auto SrcMaskOut = recover(&this->SrcMaskOut_);
    ins0["SrcMaskOut"] = to_vars(SrcMaskOut);
    if (SrcMaskOut.defined()) request_grad("SrcMaskOut@GRAD", 12);
  }

  // Optional output-projection bias.
  auto OutLinearBias = recover(&this->OutLinearBias_);
  if (OutLinearBias.defined()) {
    ins0["OutLinearBias"] = to_vars(OutLinearBias);
    request_grad("OutLinearBias@GRAD", 8);
  }

  if (pre_layer_norm) {
    auto LnScale = recover(&this->LnScale_);
    if (LnScale.defined()) {
      ins0["LnScale"] = to_vars(LnScale);
      request_grad("LnScale@GRAD", 1);
    }
    auto LnBias = recover(&this->LnBias_);
    if (LnBias.defined()) {
      ins0["LnBias"] = to_vars(LnBias);
      request_grad("LnBias@GRAD", 2);
    }
    auto LnOut = recover(&this->LnOut_);
    if (LnOut.defined()) {
      ins0["LnOut"] = to_vars(LnOut);
      request_grad("LnOut@GRAD", 13);
    }
    // Mean and variance are inputs only; no gradient is requested for them.
    auto LnMean = recover(&this->LnMean_);
    if (LnMean.defined()) {
      ins0["LnMean"] = to_vars(LnMean);
    }
    auto LnVariance = recover(&this->LnVariance_);
    if (LnVariance.defined()) {
      ins0["LnVariance"] = to_vars(LnVariance);
    }
  } else {
    auto Ln2Scale = recover(&this->Ln2Scale_);
    if (Ln2Scale.defined()) {
      ins0["Ln2Scale"] = to_vars(Ln2Scale);
      request_grad("Ln2Scale@GRAD", 9);
    }
    auto Ln2Bias = recover(&this->Ln2Bias_);
    if (Ln2Bias.defined()) {
      ins0["Ln2Bias"] = to_vars(Ln2Bias);
      request_grad("Ln2Bias@GRAD", 10);
    }
    // These three are forwarded unconditionally on the post-layer-norm path.
    auto BiasDropoutResidualOut = recover(&this->BiasDropoutResidualOut_);
    auto Ln2Mean = recover(&this->Ln2Mean_);
    auto Ln2Variance = recover(&this->Ln2Variance_);
    ins0["BiasDropoutResidualOut"] = to_vars(BiasDropoutResidualOut);
    ins0["Ln2Mean"] = to_vars(Ln2Mean);
    ins0["Ln2Variance"] = to_vars(Ln2Variance);
    if (BiasDropoutResidualOut.defined()) {
      request_grad("BiasDropoutResidualOut@GRAD", 14);
    }
  }

  auto& attrs_map0 = this->attr_map_;
  // Pass the entire attribute map to TraceOp
  // The underlying kernel will pickup whatever attribute they need at runtime
  egr::Controller::Instance().GetCurrentTracer()->TraceOp(
      "fused_attention_grad",
      ins0,
      outs0,
      attrs_map0,
      egr::Controller::Instance().GetExpectedPlace(),
      &this->default_attr_map_,
      false,
      {});

  // Move every requested gradient into its backward output slot.
  for (const auto& grad_slot : grad_slots) {
    const auto produced = outs0.find(grad_slot.first);
    if (produced != outs0.end()) {
      outputs[grad_slot.second] =
          egr::EagerUtils::GetOutputs(produced->second);
    }
  }

  if (NeedComplexToRealConversion()) HandleComplexGradToRealGrad(&outputs);
  return outputs;
}
......@@ -329,3 +329,205 @@ class fused_feedforwardGradNodeCompat : public egr::GradNodeBase {
paddle::framework::AttributeMap attr_map_;
paddle::framework::AttributeMap default_attr_map_;
};
class fused_attentionGradNodeCompat : public egr::GradNodeBase {
public:
fused_attentionGradNodeCompat() : egr::GradNodeBase() {
VLOG(7) << " Construct fused_attentionGradNodeCompat ";
}
fused_attentionGradNodeCompat(size_t bwd_in_slot_num, size_t bwd_out_slot_num)
: egr::GradNodeBase(bwd_in_slot_num, bwd_out_slot_num) {
VLOG(7) << " Construct fused_attentionGradNodeCompat ";
}
~fused_attentionGradNodeCompat() override {
VLOG(6) << " Destruct fused_attentionGradNodeCompat ";
}
virtual paddle::small_vector<std::vector<paddle::experimental::Tensor>,
egr::kSlotSmallVectorSize>
operator()(
paddle::small_vector<std::vector<paddle::experimental::Tensor>, // NOLINT
egr::kSlotSmallVectorSize>& grads, // NOLINT
bool create_graph = false,
bool is_new_grad = false) override;
void ClearTensorWrappers() override {
AttnDropoutMaskOut_.clear();
AttnDropoutOut_.clear();
BiasDropoutResidualOut_.clear();
DropoutMaskOut_.clear();
FMHAOut_.clear();
Ln2Bias_.clear();
Ln2Mean_.clear();
Ln2Scale_.clear();
Ln2Variance_.clear();
OutLinearBias_.clear();
OutLinearOut_.clear();
OutLinearW_.clear();
QKOut_.clear();
QKTVOut_.clear();
QKVBias_.clear();
QKVBiasOut_.clear();
QKVOut_.clear();
QKVW_.clear();
SoftmaxOut_.clear();
SrcMask_.clear();
SrcMaskOut_.clear();
TransposeOut2_.clear();
X_.clear();
SetIsTensorWrappersCleared(true);
}
std::string name() override { return "fused_attentionGradNodeCompat"; }
std::shared_ptr<GradNodeBase> Copy() const override {
{
auto copied_node = std::shared_ptr<fused_attentionGradNodeCompat>(
new fused_attentionGradNodeCompat(*this));
return copied_node;
}
}
// SetX, SetY, ...
void SetTensorWrapperAttnDropoutMaskOut(
const paddle::experimental::Tensor& AttnDropoutMaskOut) {
AttnDropoutMaskOut_ = egr::TensorWrapper(AttnDropoutMaskOut, false);
}
void SetTensorWrapperAttnDropoutOut(
const paddle::experimental::Tensor& AttnDropoutOut) {
AttnDropoutOut_ = egr::TensorWrapper(AttnDropoutOut, false);
}
void SetTensorWrapperBiasDropoutResidualOut(
const paddle::experimental::Tensor& BiasDropoutResidualOut) {
BiasDropoutResidualOut_ = egr::TensorWrapper(BiasDropoutResidualOut, false);
}
void SetTensorWrapperDropoutMaskOut(
const paddle::experimental::Tensor& DropoutMaskOut) {
DropoutMaskOut_ = egr::TensorWrapper(DropoutMaskOut, false);
}
void SetTensorWrapperFMHAOut(const paddle::experimental::Tensor& FMHAOut) {
FMHAOut_ = egr::TensorWrapper(FMHAOut, false);
}
void SetTensorWrapperLn2Bias(const paddle::experimental::Tensor& Ln2Bias) {
Ln2Bias_ = egr::TensorWrapper(Ln2Bias, false);
}
void SetTensorWrapperLn2Mean(const paddle::experimental::Tensor& Ln2Mean) {
Ln2Mean_ = egr::TensorWrapper(Ln2Mean, false);
}
void SetTensorWrapperLn2Scale(const paddle::experimental::Tensor& Ln2Scale) {
Ln2Scale_ = egr::TensorWrapper(Ln2Scale, false);
}
void SetTensorWrapperLn2Variance(
const paddle::experimental::Tensor& Ln2Variance) {
Ln2Variance_ = egr::TensorWrapper(Ln2Variance, false);
}
void SetTensorWrapperOutLinearBias(
const paddle::experimental::Tensor& OutLinearBias) {
OutLinearBias_ = egr::TensorWrapper(OutLinearBias, false);
}
void SetTensorWrapperOutLinearOut(
const paddle::experimental::Tensor& OutLinearOut) {
OutLinearOut_ = egr::TensorWrapper(OutLinearOut, false);
}
void SetTensorWrapperOutLinearW(
const paddle::experimental::Tensor& OutLinearW) {
OutLinearW_ = egr::TensorWrapper(OutLinearW, false);
}
void SetTensorWrapperQKOut(const paddle::experimental::Tensor& QKOut) {
QKOut_ = egr::TensorWrapper(QKOut, false);
}
void SetTensorWrapperQKTVOut(const paddle::experimental::Tensor& QKTVOut) {
QKTVOut_ = egr::TensorWrapper(QKTVOut, false);
}
void SetTensorWrapperQKVBias(const paddle::experimental::Tensor& QKVBias) {
QKVBias_ = egr::TensorWrapper(QKVBias, false);
}
void SetTensorWrapperQKVBiasOut(
const paddle::experimental::Tensor& QKVBiasOut) {
QKVBiasOut_ = egr::TensorWrapper(QKVBiasOut, false);
}
void SetTensorWrapperQKVOut(const paddle::experimental::Tensor& QKVOut) {
QKVOut_ = egr::TensorWrapper(QKVOut, false);
}
void SetTensorWrapperQKVW(const paddle::experimental::Tensor& QKVW) {
QKVW_ = egr::TensorWrapper(QKVW, false);
}
void SetTensorWrapperSoftmaxOut(
const paddle::experimental::Tensor& SoftmaxOut) {
SoftmaxOut_ = egr::TensorWrapper(SoftmaxOut, false);
}
void SetTensorWrapperSrcMask(const paddle::experimental::Tensor& SrcMask) {
SrcMask_ = egr::TensorWrapper(SrcMask, false);
}
void SetTensorWrapperSrcMaskOut(
const paddle::experimental::Tensor& SrcMaskOut) {
SrcMaskOut_ = egr::TensorWrapper(SrcMaskOut, false);
}
void SetTensorWrapperTransposeOut2(
const paddle::experimental::Tensor& TransposeOut2) {
TransposeOut2_ = egr::TensorWrapper(TransposeOut2, false);
}
void SetTensorWrapperX(const paddle::experimental::Tensor& X) {
X_ = egr::TensorWrapper(X, false);
}
void SetTensorWrapperLnScale(const paddle::experimental::Tensor& LnScale) {
LnScale_ = egr::TensorWrapper(LnScale, false);
}
void SetTensorWrapperLnBias(const paddle::experimental::Tensor& LnBias) {
LnBias_ = egr::TensorWrapper(LnBias, false);
}
void SetTensorWrapperLnOut(const paddle::experimental::Tensor& LnOut) {
LnOut_ = egr::TensorWrapper(LnOut, false);
}
void SetTensorWrapperLnMean(const paddle::experimental::Tensor& LnMean) {
LnMean_ = egr::TensorWrapper(LnMean, false);
}
void SetTensorWrapperLnVariance(
const paddle::experimental::Tensor& LnVariance) {
LnVariance_ = egr::TensorWrapper(LnVariance, false);
}
// SetAttrMap
void SetAttrMap(paddle::framework::AttributeMap&& attr_map) {
attr_map_ = std::move(attr_map);
}
void SetDefaultAttrMap(paddle::framework::AttributeMap&& default_attr_map) {
default_attr_map_ = std::move(default_attr_map);
}
private:
// TensorWrappers
egr::TensorWrapper AttnDropoutMaskOut_;
egr::TensorWrapper AttnDropoutOut_;
egr::TensorWrapper BiasDropoutResidualOut_;
egr::TensorWrapper DropoutMaskOut_;
egr::TensorWrapper FMHAOut_;
egr::TensorWrapper Ln2Bias_;
egr::TensorWrapper Ln2Mean_;
egr::TensorWrapper Ln2Scale_;
egr::TensorWrapper Ln2Variance_;
egr::TensorWrapper OutLinearBias_;
egr::TensorWrapper OutLinearOut_;
egr::TensorWrapper OutLinearW_;
egr::TensorWrapper QKOut_;
egr::TensorWrapper QKTVOut_;
egr::TensorWrapper QKVBias_;
egr::TensorWrapper QKVBiasOut_;
egr::TensorWrapper QKVOut_;
egr::TensorWrapper QKVW_;
egr::TensorWrapper SoftmaxOut_;
egr::TensorWrapper SrcMask_;
egr::TensorWrapper SrcMaskOut_;
egr::TensorWrapper TransposeOut2_;
egr::TensorWrapper X_;
egr::TensorWrapper LnScale_;
egr::TensorWrapper LnBias_;
egr::TensorWrapper LnOut_;
egr::TensorWrapper LnMean_;
egr::TensorWrapper LnVariance_;
// Attribute Map
paddle::framework::AttributeMap attr_map_;
paddle::framework::AttributeMap default_attr_map_;
};
......@@ -51,8 +51,10 @@ static std::unordered_set<std::string> ops_to_fill_zero_for_empty_grads = {
"split", "rnn"};
/* --- Black ops list: ops that need NO code generation --- */
// Defined exactly once; a second definition of the same variable in this
// translation unit would be a redefinition error.
static std::unordered_set<std::string> black_ops_list = {"run_program",
                                                         "fused_gate_attention",
                                                         "fused_feedforward",
                                                         "fused_attention"};
static std::string LegalizeVariableName(const std::string& var_name) {
std::string ret = var_name;
......
......@@ -26,9 +26,7 @@ from paddle import tensor
from paddle.fluid import layers
import unittest
from op_test import OpTest
from paddle.fluid.framework import default_main_program, _enable_legacy_dygraph
_enable_legacy_dygraph()
from paddle.fluid.framework import default_main_program
# Pin the default main program's random seed to a fixed value (42).
default_main_program().random_seed = 42
......
......@@ -26,11 +26,9 @@ import unittest
from op_test import OpTest, convert_float_to_uint16, convert_uint16_to_float
from test_sparse_attention_op import get_cuda_version
from paddle import _C_ops
from paddle.fluid.framework import default_main_program, _enable_legacy_dygraph
from paddle.fluid.framework import default_main_program
from paddle.fluid import core
_enable_legacy_dygraph()
@unittest.skipIf(not core.is_compiled_with_cuda(),
"Paddle is not compiled with CUDA")
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册