Unverified commit 54a9daf2, authored by wanghuancoder, committed by GitHub

[Eager] Manual fused feed forward (#43994)

* fused_gate_attention manual code in eager

* Manual fused_feedforward in eager

* fix test case
Parent commit: 502062da
@@ -42,3 +42,28 @@ fused_gate_attention_dygraph_function(
const paddle::experimental::Tensor& OutLinearWeight,
const paddle::experimental::Tensor& OutLinearBias,
const paddle::framework::AttributeMap& attr_map);
std::tuple<paddle::experimental::Tensor,
paddle::experimental::Tensor,
paddle::experimental::Tensor,
paddle::experimental::Tensor,
paddle::experimental::Tensor,
paddle::experimental::Tensor,
paddle::experimental::Tensor,
paddle::experimental::Tensor,
paddle::experimental::Tensor,
paddle::experimental::Tensor,
paddle::experimental::Tensor>
fused_feedforward_dygraph_function(
const paddle::experimental::Tensor& X,
const paddle::experimental::Tensor& Dropout1Seed,
const paddle::experimental::Tensor& Dropout2Seed,
const paddle::experimental::Tensor& Linear1Weight,
const paddle::experimental::Tensor& Linear1Bias,
const paddle::experimental::Tensor& Linear2Weight,
const paddle::experimental::Tensor& Linear2Bias,
const paddle::experimental::Tensor& Ln1Scale,
const paddle::experimental::Tensor& Ln1Bias,
const paddle::experimental::Tensor& Ln2Scale,
const paddle::experimental::Tensor& Ln2Bias,
const paddle::framework::AttributeMap& attr_map);
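
For orientation, here is a minimal call-site sketch for the new forward entry point. It is illustrative only: the tensor variables (x, linear1_w, and so on) are assumed to be already-initialized paddle::experimental::Tensor objects, uninitialized Tensors stand in for the optional dropout seeds, and only the pre_layer_norm attribute, the one this forward/backward pair reads explicitly, is set.

// Illustrative sketch, not part of the commit. Assumes x, linear1_w, linear1_b,
// linear2_w, linear2_b, ln1_scale, ln1_bias, ln2_scale and ln2_bias are
// already-constructed paddle::experimental::Tensor objects.
paddle::framework::AttributeMap attrs;
attrs["pre_layer_norm"] = true;  // checked when wiring the grad node

paddle::experimental::Tensor dropout1_seed;  // left uninitialized: optional input is skipped
paddle::experimental::Tensor dropout2_seed;

auto result = fused_feedforward_dygraph_function(x,
                                                 dropout1_seed,
                                                 dropout2_seed,
                                                 linear1_w,
                                                 linear1_b,
                                                 linear2_w,
                                                 linear2_b,
                                                 ln1_scale,
                                                 ln1_bias,
                                                 ln2_scale,
                                                 ln2_bias,
                                                 attrs);
// Tuple slot 0 is the layer output; the remaining slots are intermediates
// (dropout masks, layer-norm statistics, Linear1Out, ...) saved for backward.
paddle::experimental::Tensor out = std::get<0>(result);
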
@@ -5,6 +5,13 @@ cc_library(
add_dependencies(fused_gate_attention_fwd_func eager_codegen)
cc_library(
fused_feedforward_fwd_func
SRCS fused_feedforward_fwd_func.cc
DEPS ${eager_deps} ${fluid_deps} ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS})
add_dependencies(fused_feedforward_fwd_func eager_codegen)
set(fluid_manual_functions
-  fused_gate_attention_fwd_func
+  fused_gate_attention_fwd_func fused_feedforward_fwd_func
  PARENT_SCOPE)
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/eager/amp_auto_cast.h"
#include "paddle/fluid/eager/amp_utils.h"
#include "paddle/fluid/eager/api/manual/fluid_manual/dygraph_forward_api.h"
#include "paddle/fluid/eager/api/manual/fluid_manual/nodes/nodes.h"
#include "paddle/fluid/eager/api/utils/global_utils.h"
#include "paddle/fluid/platform/profiler/event_tracing.h"
#pragma GCC diagnostic ignored "-Wunused-variable"
std::tuple<paddle::experimental::Tensor,
paddle::experimental::Tensor,
paddle::experimental::Tensor,
paddle::experimental::Tensor,
paddle::experimental::Tensor,
paddle::experimental::Tensor,
paddle::experimental::Tensor,
paddle::experimental::Tensor,
paddle::experimental::Tensor,
paddle::experimental::Tensor,
paddle::experimental::Tensor>
fused_feedforward_dygraph_function(
const paddle::experimental::Tensor& X,
const paddle::experimental::Tensor& Dropout1Seed,
const paddle::experimental::Tensor& Dropout2Seed,
const paddle::experimental::Tensor& Linear1Weight,
const paddle::experimental::Tensor& Linear1Bias,
const paddle::experimental::Tensor& Linear2Weight,
const paddle::experimental::Tensor& Linear2Bias,
const paddle::experimental::Tensor& Ln1Scale,
const paddle::experimental::Tensor& Ln1Bias,
const paddle::experimental::Tensor& Ln2Scale,
const paddle::experimental::Tensor& Ln2Bias,
const paddle::framework::AttributeMap& attr_map) {
paddle::platform::RecordEvent dygraph_entrance_record_event(
"fused_feedforward dygraph",
paddle::platform::TracerEventType::Operator,
1);
VLOG(3) << "Running Eager Forward Op: fused_feedforward";
// Dygraph Forward Pass
if (egr::Controller::Instance().GetAMPLevel() !=
paddle::imperative::AmpLevel::O0) {
VLOG(5) << "Check and Prepare For AMP";
paddle::small_vector<std::vector<paddle::experimental::Tensor>,
egr::kSlotSmallVectorSize>
amp_tensors_vector = {{X}, {Linear1Weight}, {Linear2Weight}};
if (Dropout1Seed.initialized())
amp_tensors_vector.push_back({Dropout1Seed});
if (Dropout2Seed.initialized())
amp_tensors_vector.push_back({Dropout2Seed});
if (Linear1Bias.initialized()) amp_tensors_vector.push_back({Linear1Bias});
if (Linear2Bias.initialized()) amp_tensors_vector.push_back({Linear2Bias});
if (Ln1Scale.initialized()) amp_tensors_vector.push_back({Ln1Scale});
if (Ln1Bias.initialized()) amp_tensors_vector.push_back({Ln1Bias});
if (Ln2Scale.initialized()) amp_tensors_vector.push_back({Ln2Scale});
if (Ln2Bias.initialized()) amp_tensors_vector.push_back({Ln2Bias});
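// Only optional inputs that are actually initialized were collected above, so
// the AMP destination dtype below is chosen from the tensors that are present.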
auto amp_dst_dtype =
egr::GetAmpDestDtype("fused_feedforward", amp_tensors_vector);
auto NEW_X = egr::AmpAutoCast("X", X, amp_dst_dtype, "fused_feedforward");
auto NEW_Linear1Weight = egr::AmpAutoCast(
"Linear1Weight", Linear1Weight, amp_dst_dtype, "fused_feedforward");
auto NEW_Linear2Weight = egr::AmpAutoCast(
"Linear2Weight", Linear2Weight, amp_dst_dtype, "fused_feedforward");
auto NEW_Dropout1Seed =
((Dropout1Seed.initialized()) ? egr::AmpAutoCast("Dropout1Seed",
Dropout1Seed,
amp_dst_dtype,
"fused_feedforward")
: Dropout1Seed);
auto NEW_Dropout2Seed =
((Dropout2Seed.initialized()) ? egr::AmpAutoCast("Dropout2Seed",
Dropout2Seed,
amp_dst_dtype,
"fused_feedforward")
: Dropout2Seed);
auto NEW_Linear1Bias =
((Linear1Bias.initialized()) ? egr::AmpAutoCast("Linear1Bias",
Linear1Bias,
amp_dst_dtype,
"fused_feedforward")
: Linear1Bias);
auto NEW_Linear2Bias =
((Linear2Bias.initialized()) ? egr::AmpAutoCast("Linear2Bias",
Linear2Bias,
amp_dst_dtype,
"fused_feedforward")
: Linear2Bias);
auto NEW_Ln1Scale =
((Ln1Scale.initialized())
? egr::AmpAutoCast(
"Ln1Scale", Ln1Scale, amp_dst_dtype, "fused_feedforward")
: Ln1Scale);
auto NEW_Ln1Bias =
((Ln1Bias.initialized())
? egr::AmpAutoCast(
"Ln1Bias", Ln1Bias, amp_dst_dtype, "fused_feedforward")
: Ln1Bias);
auto NEW_Ln2Scale =
((Ln2Scale.initialized())
? egr::AmpAutoCast(
"Ln2Scale", Ln2Scale, amp_dst_dtype, "fused_feedforward")
: Ln2Scale);
auto NEW_Ln2Bias =
((Ln2Bias.initialized())
? egr::AmpAutoCast(
"Ln2Bias", Ln2Bias, amp_dst_dtype, "fused_feedforward")
: Ln2Bias);
{
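// Temporarily drop to AMP level O0 so the recursive call below does not
// re-cast the inputs that were just converted.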
paddle::imperative::AutoCastGuard guard(
egr::Controller::Instance().GetCurrentTracer(),
paddle::imperative::AmpLevel::O0);
return fused_feedforward_dygraph_function(NEW_X,
NEW_Dropout1Seed,
NEW_Dropout2Seed,
NEW_Linear1Weight,
NEW_Linear1Bias,
NEW_Linear2Weight,
NEW_Linear2Bias,
NEW_Ln1Scale,
NEW_Ln1Bias,
NEW_Ln2Scale,
NEW_Ln2Bias,
attr_map);
}
}
std::map<std::string, std::vector<std::shared_ptr<egr::EagerVariable>>> ins =
{{"X", egr::EagerUtils::TrySyncToVars(X)},
{"Linear1Weight", egr::EagerUtils::TrySyncToVars(Linear1Weight)},
{"Linear2Weight", egr::EagerUtils::TrySyncToVars(Linear2Weight)}};
if (Dropout1Seed.initialized())
ins["Dropout1Seed"] = egr::EagerUtils::TrySyncToVars(Dropout1Seed);
if (Dropout2Seed.initialized())
ins["Dropout2Seed"] = egr::EagerUtils::TrySyncToVars(Dropout2Seed);
if (Linear1Bias.initialized())
ins["Linear1Bias"] = egr::EagerUtils::TrySyncToVars(Linear1Bias);
if (Linear2Bias.initialized())
ins["Linear2Bias"] = egr::EagerUtils::TrySyncToVars(Linear2Bias);
if (Ln1Scale.initialized())
ins["Ln1Scale"] = egr::EagerUtils::TrySyncToVars(Ln1Scale);
if (Ln1Bias.initialized())
ins["Ln1Bias"] = egr::EagerUtils::TrySyncToVars(Ln1Bias);
if (Ln2Scale.initialized())
ins["Ln2Scale"] = egr::EagerUtils::TrySyncToVars(Ln2Scale);
if (Ln2Bias.initialized())
ins["Ln2Bias"] = egr::EagerUtils::TrySyncToVars(Ln2Bias);
std::map<std::string, std::vector<std::shared_ptr<egr::EagerVariable>>> outs =
{{"Out",
{std::make_shared<egr::EagerVariable>(
egr::Controller::Instance().GenerateUniqueName())}},
{"Dropout1Mask",
{std::make_shared<egr::EagerVariable>(
egr::Controller::Instance().GenerateUniqueName())}},
{"Dropout2Mask",
{std::make_shared<egr::EagerVariable>(
egr::Controller::Instance().GenerateUniqueName())}},
{"Ln1Mean",
{std::make_shared<egr::EagerVariable>(
egr::Controller::Instance().GenerateUniqueName())}},
{"Ln1Variance",
{std::make_shared<egr::EagerVariable>(
egr::Controller::Instance().GenerateUniqueName())}},
{"Ln2Mean",
{std::make_shared<egr::EagerVariable>(
egr::Controller::Instance().GenerateUniqueName())}},
{"Ln2Variance",
{std::make_shared<egr::EagerVariable>(
egr::Controller::Instance().GenerateUniqueName())}},
{"Linear1Out",
{std::make_shared<egr::EagerVariable>(
egr::Controller::Instance().GenerateUniqueName())}},
{"Ln1Out",
{std::make_shared<egr::EagerVariable>(
egr::Controller::Instance().GenerateUniqueName())}},
{"Dropout1Out",
{std::make_shared<egr::EagerVariable>(
egr::Controller::Instance().GenerateUniqueName())}},
{"Dropout2Out",
{std::make_shared<egr::EagerVariable>(
egr::Controller::Instance().GenerateUniqueName())}}};
// Prepare Autograd Meta
egr::AutogradMeta* p_autograd_X = egr::EagerUtils::nullable_autograd_meta(X);
egr::AutogradMeta* p_autograd_Dropout1Seed =
egr::EagerUtils::nullable_autograd_meta(Dropout1Seed);
egr::AutogradMeta* p_autograd_Dropout2Seed =
egr::EagerUtils::nullable_autograd_meta(Dropout2Seed);
egr::AutogradMeta* p_autograd_Linear1Weight =
egr::EagerUtils::nullable_autograd_meta(Linear1Weight);
egr::AutogradMeta* p_autograd_Linear1Bias =
egr::EagerUtils::nullable_autograd_meta(Linear1Bias);
egr::AutogradMeta* p_autograd_Linear2Weight =
egr::EagerUtils::nullable_autograd_meta(Linear2Weight);
egr::AutogradMeta* p_autograd_Linear2Bias =
egr::EagerUtils::nullable_autograd_meta(Linear2Bias);
egr::AutogradMeta* p_autograd_Ln1Scale =
egr::EagerUtils::nullable_autograd_meta(Ln1Scale);
egr::AutogradMeta* p_autograd_Ln1Bias =
egr::EagerUtils::nullable_autograd_meta(Ln1Bias);
egr::AutogradMeta* p_autograd_Ln2Scale =
egr::EagerUtils::nullable_autograd_meta(Ln2Scale);
egr::AutogradMeta* p_autograd_Ln2Bias =
egr::EagerUtils::nullable_autograd_meta(Ln2Bias);
bool trace_backward = egr::Controller::Instance().HasGrad();
bool require_any_grad =
egr::EagerUtils::ComputeRequireGrad(trace_backward,
p_autograd_X,
p_autograd_Dropout1Seed,
p_autograd_Dropout2Seed,
p_autograd_Linear1Weight,
p_autograd_Linear1Bias,
p_autograd_Linear2Weight,
p_autograd_Linear2Bias,
p_autograd_Ln1Scale,
p_autograd_Ln1Bias,
p_autograd_Ln2Scale,
p_autograd_Ln2Bias);
paddle::framework::AttributeMap attrs = attr_map;
paddle::framework::AttributeMap default_attrs;
egr::Controller::Instance().GetCurrentTracer()->TraceOp(
"fused_feedforward",
ins,
outs,
attrs,
egr::Controller::Instance().GetExpectedPlace(),
&default_attrs,
true,
{});
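// TraceOp has executed the fused_feedforward kernel; unpack every output slot
// from `outs` back into paddle::experimental::Tensor objects for the caller.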
paddle::experimental::Tensor Out;
egr::EagerUtils::GetOutput(outs["Out"][0], &Out);
paddle::experimental::Tensor Dropout1Mask;
egr::EagerUtils::GetOutput(outs["Dropout1Mask"][0], &Dropout1Mask);
paddle::experimental::Tensor Dropout2Mask;
egr::EagerUtils::GetOutput(outs["Dropout2Mask"][0], &Dropout2Mask);
paddle::experimental::Tensor Ln1Mean;
egr::EagerUtils::GetOutput(outs["Ln1Mean"][0], &Ln1Mean);
paddle::experimental::Tensor Ln1Variance;
egr::EagerUtils::GetOutput(outs["Ln1Variance"][0], &Ln1Variance);
paddle::experimental::Tensor Ln2Mean;
egr::EagerUtils::GetOutput(outs["Ln2Mean"][0], &Ln2Mean);
paddle::experimental::Tensor Ln2Variance;
egr::EagerUtils::GetOutput(outs["Ln2Variance"][0], &Ln2Variance);
paddle::experimental::Tensor Linear1Out;
egr::EagerUtils::GetOutput(outs["Linear1Out"][0], &Linear1Out);
paddle::experimental::Tensor Ln1Out;
egr::EagerUtils::GetOutput(outs["Ln1Out"][0], &Ln1Out);
paddle::experimental::Tensor Dropout1Out;
egr::EagerUtils::GetOutput(outs["Dropout1Out"][0], &Dropout1Out);
paddle::experimental::Tensor Dropout2Out;
egr::EagerUtils::GetOutput(outs["Dropout2Out"][0], &Dropout2Out);
{
paddle::platform::RecordEvent node_creation_record_event(
"fused_feedforward node_creation",
paddle::platform::TracerEventType::Operator,
1);
egr::AutogradMeta* p_autograd_Out = egr::EagerUtils::autograd_meta(&Out);
egr::AutogradMeta* p_autograd_Dropout1Mask =
egr::EagerUtils::autograd_meta(&Dropout1Mask);
egr::AutogradMeta* p_autograd_Dropout2Mask =
egr::EagerUtils::autograd_meta(&Dropout2Mask);
egr::AutogradMeta* p_autograd_Ln1Mean =
egr::EagerUtils::autograd_meta(&Ln1Mean);
egr::AutogradMeta* p_autograd_Ln1Variance =
egr::EagerUtils::autograd_meta(&Ln1Variance);
egr::AutogradMeta* p_autograd_Ln2Mean =
egr::EagerUtils::autograd_meta(&Ln2Mean);
egr::AutogradMeta* p_autograd_Ln2Variance =
egr::EagerUtils::autograd_meta(&Ln2Variance);
egr::AutogradMeta* p_autograd_Linear1Out =
egr::EagerUtils::autograd_meta(&Linear1Out);
egr::AutogradMeta* p_autograd_Ln1Out =
egr::EagerUtils::autograd_meta(&Ln1Out);
egr::AutogradMeta* p_autograd_Dropout1Out =
egr::EagerUtils::autograd_meta(&Dropout1Out);
egr::AutogradMeta* p_autograd_Dropout2Out =
egr::EagerUtils::autograd_meta(&Dropout2Out);
if (require_any_grad) {
VLOG(6) << " Construct Grad for fused_feedforward ";
egr::EagerUtils::PassStopGradient(false,
p_autograd_Out,
p_autograd_Dropout1Mask,
p_autograd_Dropout2Mask,
p_autograd_Ln1Mean,
p_autograd_Ln1Variance,
p_autograd_Ln2Mean,
p_autograd_Ln2Variance,
p_autograd_Linear1Out,
p_autograd_Ln1Out,
p_autograd_Dropout1Out,
p_autograd_Dropout2Out);
// Create GradOpNode
auto grad_node = std::shared_ptr<fused_feedforwardGradNodeCompat>(
new fused_feedforwardGradNodeCompat(11, 11));
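// (11, 11): eleven backward input slots (one per forward output) and eleven
// backward output slots (one per forward input).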
bool pre_layer_norm = false;
if (attrs.count("pre_layer_norm")) {
pre_layer_norm = BOOST_GET_CONST(bool, attrs.at("pre_layer_norm"));
}
// Set Attributes
grad_node->SetAttrMap(std::move(attrs));
grad_node->SetDefaultAttrMap(std::move(default_attrs));
grad_node->SetTensorWrapperX(X);
grad_node->SetTensorWrapperLinear1Weight(Linear1Weight);
grad_node->SetTensorWrapperLinear1Bias(Linear1Bias);
grad_node->SetTensorWrapperLinear2Weight(Linear2Weight);
grad_node->SetTensorWrapperDropout1Mask(Dropout1Mask);
grad_node->SetTensorWrapperDropout2Mask(Dropout2Mask);
grad_node->SetTensorWrapperLinear1Out(Linear1Out);
grad_node->SetTensorWrapperDropout1Out(Dropout1Out);
grad_node->SetTensorWrapperDropout2Out(Dropout2Out);
grad_node->SetGradOutMeta(X, 0);
grad_node->SetGradOutMeta(Linear1Weight, 3);
grad_node->SetGradOutMeta(Linear1Bias, 4);
grad_node->SetGradOutMeta(Linear2Weight, 5);
if (pre_layer_norm) {
grad_node->SetTensorWrapperLn1Scale(Ln1Scale);
grad_node->SetTensorWrapperLn1Bias(Ln1Bias);
grad_node->SetTensorWrapperLn1Out(Ln1Out);
grad_node->SetTensorWrapperLn1Mean(Ln1Mean);
grad_node->SetTensorWrapperLn1Variance(Ln1Variance);
grad_node->SetGradOutMeta(Ln1Scale, 7);
grad_node->SetGradOutMeta(Ln1Bias, 8);
} else {
grad_node->SetTensorWrapperLn2Scale(Ln2Scale);
grad_node->SetGradOutMeta(Ln2Scale, 9);
grad_node->SetTensorWrapperLn2Bias(Ln2Bias);
grad_node->SetGradOutMeta(Ln2Bias, 10);
grad_node->SetTensorWrapperLn2Mean(Ln2Mean);
grad_node->SetTensorWrapperLn2Variance(Ln2Variance);
}
if (Linear2Bias.initialized()) {
grad_node->SetTensorWrapperLinear2Bias(Linear2Bias);
grad_node->SetGradOutMeta(Linear2Bias, 6);
}
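// Bind each forward output to its backward input slot (0..10) and record its
// gradient meta so incoming gradients are routed to the matching slot.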
egr::EagerUtils::SetOutRankWithSlot(p_autograd_Out, 0);
egr::EagerUtils::SetHistory(p_autograd_Out, grad_node);
grad_node->SetGradInMeta(Out, 0);
egr::EagerUtils::CheckAndRetainGrad(Out);
egr::EagerUtils::SetOutRankWithSlot(p_autograd_Dropout1Mask, 1);
grad_node->SetGradInMeta(Dropout1Mask, 1);
egr::EagerUtils::SetOutRankWithSlot(p_autograd_Dropout2Mask, 2);
grad_node->SetGradInMeta(Dropout2Mask, 2);
egr::EagerUtils::SetOutRankWithSlot(p_autograd_Ln1Mean, 3);
grad_node->SetGradInMeta(Ln1Mean, 3);
egr::EagerUtils::SetOutRankWithSlot(p_autograd_Ln1Variance, 4);
grad_node->SetGradInMeta(Ln1Variance, 4);
egr::EagerUtils::SetOutRankWithSlot(p_autograd_Ln2Mean, 5);
grad_node->SetGradInMeta(Ln2Mean, 5);
egr::EagerUtils::SetOutRankWithSlot(p_autograd_Ln2Variance, 6);
grad_node->SetGradInMeta(Ln2Variance, 6);
egr::EagerUtils::SetOutRankWithSlot(p_autograd_Linear1Out, 7);
grad_node->SetGradInMeta(Linear1Out, 7);
egr::EagerUtils::SetOutRankWithSlot(p_autograd_Ln1Out, 8);
grad_node->SetGradInMeta(Ln1Out, 8);
egr::EagerUtils::SetOutRankWithSlot(p_autograd_Dropout1Out, 9);
grad_node->SetGradInMeta(Dropout1Out, 9);
egr::EagerUtils::SetOutRankWithSlot(p_autograd_Dropout2Out, 10);
grad_node->SetGradInMeta(Dropout2Out, 10);
}
}
return std::make_tuple(Out,
Dropout1Mask,
Dropout2Mask,
Ln1Mean,
Ln1Variance,
Ln2Mean,
Ln2Variance,
Linear1Out,
Ln1Out,
Dropout1Out,
Dropout2Out);
}
@@ -3,6 +3,11 @@ cc_library(
SRCS fused_gate_attention_node.cc
DEPS ${eager_deps} ${fluid_deps})
cc_library(
fused_feedforward_node
SRCS fused_feedforward_node.cc
DEPS ${eager_deps} ${fluid_deps})
set(fluid_manual_nodes
-  fused_gate_attention_node
+  fused_gate_attention_node fused_feedforward_node
  PARENT_SCOPE)
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "glog/logging.h"
#include "paddle/fluid/eager/api/manual/fluid_manual/nodes/nodes.h"
#include "paddle/fluid/eager/api/utils/global_utils.h"
#include "paddle/fluid/eager/utils.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/imperative/tracer.h"
#include "paddle/phi/api/all.h"
paddle::small_vector<std::vector<paddle::experimental::Tensor>,
egr::kSlotSmallVectorSize>
fused_feedforwardGradNodeCompat::operator()(
paddle::small_vector<std::vector<paddle::experimental::Tensor>,
egr::kSlotSmallVectorSize>& grads,
bool create_graph,
bool is_new_grad) {
VLOG(3) << "Running Eager Backward Node: fused_feedforwardGradNodeCompat";
const auto& out_metas = OutputMeta();
paddle::small_vector<std::vector<paddle::experimental::Tensor>,
egr::kSlotSmallVectorSize>
outputs(11);
paddle::small_vector<std::vector<paddle::experimental::Tensor>,
egr::kSlotSmallVectorSize>
hooked_grads0 =
fused_feedforwardGradNodeCompat::ApplyGradientHooks(grads);
bool pre_layer_norm = false;
if (attr_map_.count("pre_layer_norm")) {
pre_layer_norm = BOOST_GET_CONST(bool, attr_map_.at("pre_layer_norm"));
}
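// Rebuild the grad-op inputs from the tensor wrappers saved at forward time;
// Out@GRAD is taken from the (possibly hooked) incoming gradient.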
std::map<std::string, std::vector<std::shared_ptr<egr::EagerVariable>>> ins0 =
{{"Dropout1Mask",
egr::EagerUtils::TrySyncToVars(
egr::EagerUtils::RecoverTensorWrapper(&this->Dropout1Mask_))},
{"Dropout1Out",
egr::EagerUtils::TrySyncToVars(
egr::EagerUtils::RecoverTensorWrapper(&this->Dropout1Out_))},
{"Dropout2Mask",
egr::EagerUtils::TrySyncToVars(
egr::EagerUtils::RecoverTensorWrapper(&this->Dropout2Mask_))},
{"Dropout2Out",
egr::EagerUtils::TrySyncToVars(
egr::EagerUtils::RecoverTensorWrapper(&this->Dropout2Out_))},
{"Linear1Out",
egr::EagerUtils::TrySyncToVars(
egr::EagerUtils::RecoverTensorWrapper(&this->Linear1Out_))},
{"Linear1Weight",
egr::EagerUtils::TrySyncToVars(
egr::EagerUtils::RecoverTensorWrapper(&this->Linear1Weight_))},
{"Linear2Weight",
egr::EagerUtils::TrySyncToVars(
egr::EagerUtils::RecoverTensorWrapper(&this->Linear2Weight_))},
{"Out@GRAD", egr::EagerUtils::TrySyncToVars(hooked_grads0[0])},
{"X",
egr::EagerUtils::TrySyncToVars(
egr::EagerUtils::RecoverTensorWrapper(&this->X_))}};
std::map<std::string, std::vector<std::shared_ptr<egr::EagerVariable>>> outs0;
auto Linear1Bias = egr::EagerUtils::RecoverTensorWrapper(&this->Linear1Bias_);
if (Linear1Bias.defined())
ins0["Linear1Bias"] = egr::EagerUtils::TrySyncToVars(Linear1Bias);
if ((!out_metas[3].empty()) && (!(out_metas[3][0].IsStopGradient()))) {
outs0.insert({"Linear1Weight@GRAD",
{std::make_shared<egr::EagerVariable>(
egr::Controller::Instance().GenerateUniqueName())}});
}
if ((!out_metas[5].empty()) && (!(out_metas[5][0].IsStopGradient()))) {
outs0.insert({"Linear2Weight@GRAD",
{std::make_shared<egr::EagerVariable>(
egr::Controller::Instance().GenerateUniqueName())}});
}
if ((!out_metas[0].empty()) && (!(out_metas[0][0].IsStopGradient()))) {
outs0.insert({"X@GRAD",
{std::make_shared<egr::EagerVariable>(
egr::Controller::Instance().GenerateUniqueName())}});
}
if (Linear1Bias.defined() && (!out_metas[4].empty()) &&
(!out_metas[4][0].IsStopGradient()))
outs0["Linear1Bias@GRAD"] = {std::make_shared<egr::EagerVariable>(
egr::Controller::Instance().GenerateUniqueName())};
if (pre_layer_norm) {
auto Ln1Scale = egr::EagerUtils::RecoverTensorWrapper(&this->Ln1Scale_);
if (Ln1Scale.defined())
ins0["Ln1Scale"] = egr::EagerUtils::TrySyncToVars(Ln1Scale);
auto Ln1Bias = egr::EagerUtils::RecoverTensorWrapper(&this->Ln1Bias_);
if (Ln1Bias.defined())
ins0["Ln1Bias"] = egr::EagerUtils::TrySyncToVars(Ln1Bias);
auto Ln1Out = egr::EagerUtils::RecoverTensorWrapper(&this->Ln1Out_);
if (Ln1Out.defined())
ins0["Ln1Out"] = egr::EagerUtils::TrySyncToVars(Ln1Out);
auto Ln1Mean = egr::EagerUtils::RecoverTensorWrapper(&this->Ln1Mean_);
if (Ln1Mean.defined())
ins0["Ln1Mean"] = egr::EagerUtils::TrySyncToVars(Ln1Mean);
auto Ln1Variance =
egr::EagerUtils::RecoverTensorWrapper(&this->Ln1Variance_);
if (Ln1Variance.defined())
ins0["Ln1Variance"] = egr::EagerUtils::TrySyncToVars(Ln1Variance);
if (Ln1Scale.defined() && (!out_metas[7].empty()) &&
(!out_metas[7][0].IsStopGradient()))
outs0["Ln1Scale@GRAD"] = {std::make_shared<egr::EagerVariable>(
egr::Controller::Instance().GenerateUniqueName())};
if (Ln1Bias.defined() && (!out_metas[8].empty()) &&
(!out_metas[8][0].IsStopGradient()))
outs0["Ln1Bias@GRAD"] = {std::make_shared<egr::EagerVariable>(
egr::Controller::Instance().GenerateUniqueName())};
} else {
auto Ln2Scale = egr::EagerUtils::RecoverTensorWrapper(&this->Ln2Scale_);
if (Ln2Scale.defined())
ins0["Ln2Scale"] = egr::EagerUtils::TrySyncToVars(Ln2Scale);
auto Ln2Bias = egr::EagerUtils::RecoverTensorWrapper(&this->Ln2Bias_);
if (Ln2Bias.defined())
ins0["Ln2Bias"] = egr::EagerUtils::TrySyncToVars(Ln2Bias);
auto Ln2Mean = egr::EagerUtils::RecoverTensorWrapper(&this->Ln2Mean_);
if (Ln2Mean.defined())
ins0["Ln2Mean"] = egr::EagerUtils::TrySyncToVars(Ln2Mean);
auto Ln2Variance =
egr::EagerUtils::RecoverTensorWrapper(&this->Ln2Variance_);
if (Ln2Variance.defined())
ins0["Ln2Variance"] = egr::EagerUtils::TrySyncToVars(Ln2Variance);
if (Ln2Scale.defined() && (!out_metas[9].empty()) &&
(!out_metas[9][0].IsStopGradient()))
outs0["Ln2Scale@GRAD"] = {std::make_shared<egr::EagerVariable>(
egr::Controller::Instance().GenerateUniqueName())};
if (Ln2Bias.defined() && (!out_metas[10].empty()) &&
(!out_metas[10][0].IsStopGradient()))
outs0["Ln2Bias@GRAD"] = {std::make_shared<egr::EagerVariable>(
egr::Controller::Instance().GenerateUniqueName())};
}
auto Linear2Bias = egr::EagerUtils::RecoverTensorWrapper(&this->Linear2Bias_);
if (Linear2Bias.defined()) {
ins0["Linear2Bias"] = egr::EagerUtils::TrySyncToVars(Linear2Bias);
if ((!out_metas[6].empty()) && (!out_metas[6][0].IsStopGradient()))
outs0["Linear2Bias@GRAD"] = {std::make_shared<egr::EagerVariable>(
egr::Controller::Instance().GenerateUniqueName())};
}
auto& attrs_map0 = this->attr_map_;
// Pass the entire attribute map to TraceOp
// The underlying kernel will pick up whatever attributes it needs at runtime
egr::Controller::Instance().GetCurrentTracer()->TraceOp(
"fused_feedforward_grad",
ins0,
outs0,
attrs_map0,
egr::Controller::Instance().GetExpectedPlace(),
&this->default_attr_map_,
false,
{});
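// Scatter the produced gradients into the output slots that match the
// SetGradOutMeta numbering used in the forward pass (X=0, Linear1Weight=3, ...).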
if (outs0.find("Linear1Weight@GRAD") != outs0.end()) {
outputs[3] = egr::EagerUtils::GetOutputs(outs0["Linear1Weight@GRAD"]);
}
if (outs0.find("Linear2Weight@GRAD") != outs0.end()) {
outputs[5] = egr::EagerUtils::GetOutputs(outs0["Linear2Weight@GRAD"]);
}
if (outs0.find("X@GRAD") != outs0.end()) {
outputs[0] = egr::EagerUtils::GetOutputs(outs0["X@GRAD"]);
}
if (outs0.find("Linear1Bias@GRAD") != outs0.end()) {
outputs[4] = egr::EagerUtils::GetOutputs(outs0["Linear1Bias@GRAD"]);
}
if (pre_layer_norm) {
if (outs0.find("Ln1Scale@GRAD") != outs0.end()) {
outputs[7] = egr::EagerUtils::GetOutputs(outs0["Ln1Scale@GRAD"]);
}
if (outs0.find("Ln1Bias@GRAD") != outs0.end()) {
outputs[8] = egr::EagerUtils::GetOutputs(outs0["Ln1Bias@GRAD"]);
}
} else {
if (outs0.find("Ln2Bias@GRAD") != outs0.end()) {
outputs[10] = egr::EagerUtils::GetOutputs(outs0["Ln2Bias@GRAD"]);
}
if (outs0.find("Ln2Scale@GRAD") != outs0.end()) {
outputs[9] = egr::EagerUtils::GetOutputs(outs0["Ln2Scale@GRAD"]);
}
}
if (Linear2Bias.defined()) {
if (outs0.find("Linear2Bias@GRAD") != outs0.end()) {
outputs[6] = egr::EagerUtils::GetOutputs(outs0["Linear2Bias@GRAD"]);
}
}
if (NeedComplexToRealConversion()) HandleComplexGradToRealGrad(&outputs);
return outputs;
}
@@ -174,3 +174,158 @@ class fused_gate_attentionGradNodeCompat : public egr::GradNodeBase {
paddle::framework::AttributeMap attr_map_;
paddle::framework::AttributeMap default_attr_map_;
};
class fused_feedforwardGradNodeCompat : public egr::GradNodeBase {
public:
fused_feedforwardGradNodeCompat() : egr::GradNodeBase() {
VLOG(7) << " Construct fused_feedforwardGradNodeCompat ";
}
fused_feedforwardGradNodeCompat(size_t bwd_in_slot_num,
size_t bwd_out_slot_num)
: egr::GradNodeBase(bwd_in_slot_num, bwd_out_slot_num) {
VLOG(7) << " Construct fused_feedforwardGradNodeCompat ";
}
~fused_feedforwardGradNodeCompat() override {
VLOG(6) << " Destruct fused_feedforwardGradNodeCompat ";
}
virtual paddle::small_vector<std::vector<paddle::experimental::Tensor>,
egr::kSlotSmallVectorSize>
operator()(
paddle::small_vector<std::vector<paddle::experimental::Tensor>, // NOLINT
egr::kSlotSmallVectorSize>& grads, // NOLINT
bool create_graph = false,
bool is_new_grad = false) override;
void ClearTensorWrappers() override {
Dropout1Mask_.clear();
Dropout1Out_.clear();
Dropout2Mask_.clear();
Dropout2Out_.clear();
Linear1Bias_.clear();
Linear1Out_.clear();
Linear1Weight_.clear();
Linear2Bias_.clear();
Linear2Weight_.clear();
Ln2Bias_.clear();
Ln2Mean_.clear();
Ln2Scale_.clear();
Ln2Variance_.clear();
X_.clear();
SetIsTensorWrappersCleared(true);
}
std::string name() override { return "fused_feedforwardGradNodeCompat"; }
std::shared_ptr<GradNodeBase> Copy() const override {
{
auto copied_node = std::shared_ptr<fused_feedforwardGradNodeCompat>(
new fused_feedforwardGradNodeCompat(*this));
return copied_node;
}
}
// SetX, SetY, ...
void SetTensorWrapperDropout1Mask(
const paddle::experimental::Tensor& Dropout1Mask) {
Dropout1Mask_ = egr::TensorWrapper(Dropout1Mask, false);
}
void SetTensorWrapperDropout1Out(
const paddle::experimental::Tensor& Dropout1Out) {
Dropout1Out_ = egr::TensorWrapper(Dropout1Out, false);
}
void SetTensorWrapperDropout2Mask(
const paddle::experimental::Tensor& Dropout2Mask) {
Dropout2Mask_ = egr::TensorWrapper(Dropout2Mask, false);
}
void SetTensorWrapperDropout2Out(
const paddle::experimental::Tensor& Dropout2Out) {
Dropout2Out_ = egr::TensorWrapper(Dropout2Out, false);
}
void SetTensorWrapperLinear1Bias(
const paddle::experimental::Tensor& Linear1Bias) {
Linear1Bias_ = egr::TensorWrapper(Linear1Bias, false);
}
void SetTensorWrapperLinear1Out(
const paddle::experimental::Tensor& Linear1Out) {
Linear1Out_ = egr::TensorWrapper(Linear1Out, false);
}
void SetTensorWrapperLinear1Weight(
const paddle::experimental::Tensor& Linear1Weight) {
Linear1Weight_ = egr::TensorWrapper(Linear1Weight, false);
}
void SetTensorWrapperLinear2Bias(
const paddle::experimental::Tensor& Linear2Bias) {
Linear2Bias_ = egr::TensorWrapper(Linear2Bias, false);
}
void SetTensorWrapperLinear2Weight(
const paddle::experimental::Tensor& Linear2Weight) {
Linear2Weight_ = egr::TensorWrapper(Linear2Weight, false);
}
void SetTensorWrapperLn2Bias(const paddle::experimental::Tensor& Ln2Bias) {
Ln2Bias_ = egr::TensorWrapper(Ln2Bias, false);
}
void SetTensorWrapperLn2Mean(const paddle::experimental::Tensor& Ln2Mean) {
Ln2Mean_ = egr::TensorWrapper(Ln2Mean, false);
}
void SetTensorWrapperLn2Scale(const paddle::experimental::Tensor& Ln2Scale) {
Ln2Scale_ = egr::TensorWrapper(Ln2Scale, false);
}
void SetTensorWrapperLn2Variance(
const paddle::experimental::Tensor& Ln2Variance) {
Ln2Variance_ = egr::TensorWrapper(Ln2Variance, false);
}
void SetTensorWrapperX(const paddle::experimental::Tensor& X) {
X_ = egr::TensorWrapper(X, false);
}
void SetTensorWrapperLn1Scale(const paddle::experimental::Tensor& Ln1Scale) {
Ln1Scale_ = egr::TensorWrapper(Ln1Scale, false);
}
void SetTensorWrapperLn1Bias(const paddle::experimental::Tensor& Ln1Bias) {
Ln1Bias_ = egr::TensorWrapper(Ln1Bias, false);
}
void SetTensorWrapperLn1Out(const paddle::experimental::Tensor& Ln1Out) {
Ln1Out_ = egr::TensorWrapper(Ln1Out, false);
}
void SetTensorWrapperLn1Mean(const paddle::experimental::Tensor& Ln1Mean) {
Ln1Mean_ = egr::TensorWrapper(Ln1Mean, false);
}
void SetTensorWrapperLn1Variance(
const paddle::experimental::Tensor& Ln1Variance) {
Ln1Variance_ = egr::TensorWrapper(Ln1Variance, false);
}
// SetAttrMap
void SetAttrMap(paddle::framework::AttributeMap&& attr_map) {
attr_map_ = std::move(attr_map);
}
void SetDefaultAttrMap(paddle::framework::AttributeMap&& default_attr_map) {
default_attr_map_ = std::move(default_attr_map);
}
private:
// TensorWrappers
egr::TensorWrapper Dropout1Mask_;
egr::TensorWrapper Dropout1Out_;
egr::TensorWrapper Dropout2Mask_;
egr::TensorWrapper Dropout2Out_;
egr::TensorWrapper Linear1Bias_;
egr::TensorWrapper Linear1Out_;
egr::TensorWrapper Linear1Weight_;
egr::TensorWrapper Linear2Bias_;
egr::TensorWrapper Linear2Weight_;
egr::TensorWrapper Ln2Bias_;
egr::TensorWrapper Ln2Mean_;
egr::TensorWrapper Ln2Scale_;
egr::TensorWrapper Ln2Variance_;
egr::TensorWrapper X_;
egr::TensorWrapper Ln1Scale_;
egr::TensorWrapper Ln1Bias_;
egr::TensorWrapper Ln1Out_;
egr::TensorWrapper Ln1Mean_;
egr::TensorWrapper Ln1Variance_;
// Attribute Map
paddle::framework::AttributeMap attr_map_;
paddle::framework::AttributeMap default_attr_map_;
};
@@ -52,7 +52,7 @@ static std::unordered_set<std::string> ops_to_fill_zero_for_empty_grads = {
/* --- Black Ops list that's NO NEED to apply code generation --- */
static std::unordered_set<std::string> black_ops_list = {
-    "run_program", "fused_gate_attention"};
+    "run_program", "fused_gate_attention", "fused_feedforward"};
static std::string LegalizeVariableName(const std::string& var_name) {
  std::string ret = var_name;
...
@@ -23,9 +23,7 @@ from paddle.nn.layer.norm import LayerNorm
from paddle.nn.layer.common import Linear, Dropout
import unittest
from op_test import OpTest
-from paddle.fluid.framework import default_main_program, _enable_legacy_dygraph
-_enable_legacy_dygraph()
+from paddle.fluid.framework import default_main_program
class TestFusedFFNOp(OpTest):
...