diff --git a/paddle/fluid/eager/api/manual/fluid_manual/dygraph_forward_api.h b/paddle/fluid/eager/api/manual/fluid_manual/dygraph_forward_api.h
index 3715544b923aa834a22b41692e64e25e798d868e..397e549e6147346334c95367a132f287f2a55ec9 100644
--- a/paddle/fluid/eager/api/manual/fluid_manual/dygraph_forward_api.h
+++ b/paddle/fluid/eager/api/manual/fluid_manual/dygraph_forward_api.h
@@ -42,3 +42,28 @@ fused_gate_attention_dygraph_function(
     const paddle::experimental::Tensor& OutLinearWeight,
     const paddle::experimental::Tensor& OutLinearBias,
     const paddle::framework::AttributeMap& attr_map);
+
+std::tuple<paddle::experimental::Tensor,
+           paddle::experimental::Tensor,
+           paddle::experimental::Tensor,
+           paddle::experimental::Tensor,
+           paddle::experimental::Tensor,
+           paddle::experimental::Tensor,
+           paddle::experimental::Tensor,
+           paddle::experimental::Tensor,
+           paddle::experimental::Tensor,
+           paddle::experimental::Tensor,
+           paddle::experimental::Tensor>
+fused_feedforward_dygraph_function(
+    const paddle::experimental::Tensor& X,
+    const paddle::experimental::Tensor& Dropout1Seed,
+    const paddle::experimental::Tensor& Dropout2Seed,
+    const paddle::experimental::Tensor& Linear1Weight,
+    const paddle::experimental::Tensor& Linear1Bias,
+    const paddle::experimental::Tensor& Linear2Weight,
+    const paddle::experimental::Tensor& Linear2Bias,
+    const paddle::experimental::Tensor& Ln1Scale,
+    const paddle::experimental::Tensor& Ln1Bias,
+    const paddle::experimental::Tensor& Ln2Scale,
+    const paddle::experimental::Tensor& Ln2Bias,
+    const paddle::framework::AttributeMap& attr_map);
diff --git a/paddle/fluid/eager/api/manual/fluid_manual/forwards/CMakeLists.txt b/paddle/fluid/eager/api/manual/fluid_manual/forwards/CMakeLists.txt
index 2a7d72eb7cabd1720635e2aec7186a4cc3fb99ac..305df1c92c6e18df4bad1cb5c666241fec54db4c 100644
--- a/paddle/fluid/eager/api/manual/fluid_manual/forwards/CMakeLists.txt
+++ b/paddle/fluid/eager/api/manual/fluid_manual/forwards/CMakeLists.txt
@@ -5,6 +5,13 @@ cc_library(
 
 add_dependencies(fused_gate_attention_fwd_func eager_codegen)
 
+cc_library(
+  fused_feedforward_fwd_func
+  SRCS fused_feedforward_fwd_func.cc
+  DEPS ${eager_deps} ${fluid_deps} ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS})
+
+add_dependencies(fused_feedforward_fwd_func eager_codegen)
+
 set(fluid_manual_functions
-    fused_gate_attention_fwd_func
+    fused_gate_attention_fwd_func fused_feedforward_fwd_func
     PARENT_SCOPE)
diff --git a/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_feedforward_fwd_func.cc b/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_feedforward_fwd_func.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e246649314b523898c7e602ebaddfecccf6334a2
--- /dev/null
+++ b/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_feedforward_fwd_func.cc
@@ -0,0 +1,403 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
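+//
+// Manually maintained eager-mode (dygraph) forward for the fused_feedforward
+// op. It traces "fused_feedforward" through the imperative Tracer, returns
+// the op's 11 outputs (Out, Dropout1Mask, Dropout2Mask, Ln1Mean, Ln1Variance,
+// Ln2Mean, Ln2Variance, Linear1Out, Ln1Out, Dropout1Out, Dropout2Out), and
+// wires up fused_feedforwardGradNodeCompat for the backward pass.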
+ +#include "paddle/fluid/eager/amp_auto_cast.h" +#include "paddle/fluid/eager/amp_utils.h" +#include "paddle/fluid/eager/api/manual/fluid_manual/dygraph_forward_api.h" +#include "paddle/fluid/eager/api/manual/fluid_manual/nodes/nodes.h" +#include "paddle/fluid/eager/api/utils/global_utils.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" + +#pragma GCC diagnostic ignored "-Wunused-variable" + +std::tuple +fused_feedforward_dygraph_function( + const paddle::experimental::Tensor& X, + const paddle::experimental::Tensor& Dropout1Seed, + const paddle::experimental::Tensor& Dropout2Seed, + const paddle::experimental::Tensor& Linear1Weight, + const paddle::experimental::Tensor& Linear1Bias, + const paddle::experimental::Tensor& Linear2Weight, + const paddle::experimental::Tensor& Linear2Bias, + const paddle::experimental::Tensor& Ln1Scale, + const paddle::experimental::Tensor& Ln1Bias, + const paddle::experimental::Tensor& Ln2Scale, + const paddle::experimental::Tensor& Ln2Bias, + const paddle::framework::AttributeMap& attr_map) { + paddle::platform::RecordEvent dygraph_entrance_record_event( + "fused_feedforward dygraph", + paddle::platform::TracerEventType::Operator, + 1); + VLOG(3) << "Running Eager Forward Op: fused_feedforward"; + // Dygraph Forward Pass + + if (egr::Controller::Instance().GetAMPLevel() != + paddle::imperative::AmpLevel::O0) { + VLOG(5) << "Check and Prepare For AMP"; + + paddle::small_vector, + egr::kSlotSmallVectorSize> + amp_tensors_vector = {{X}, {Linear1Weight}, {Linear2Weight}}; + if (Dropout1Seed.initialized()) + amp_tensors_vector.push_back({Dropout1Seed}); + if (Dropout2Seed.initialized()) + amp_tensors_vector.push_back({Dropout2Seed}); + if (Linear1Bias.initialized()) amp_tensors_vector.push_back({Linear1Bias}); + if (Linear2Bias.initialized()) amp_tensors_vector.push_back({Linear2Bias}); + if (Ln1Scale.initialized()) amp_tensors_vector.push_back({Ln1Scale}); + if (Ln1Bias.initialized()) amp_tensors_vector.push_back({Ln1Bias}); + if (Ln2Scale.initialized()) amp_tensors_vector.push_back({Ln2Scale}); + if (Ln2Bias.initialized()) amp_tensors_vector.push_back({Ln2Bias}); + + auto amp_dst_dtype = + egr::GetAmpDestDtype("fused_feedforward", amp_tensors_vector); + + auto NEW_X = egr::AmpAutoCast("X", X, amp_dst_dtype, "fused_feedforward"); + auto NEW_Linear1Weight = egr::AmpAutoCast( + "Linear1Weight", Linear1Weight, amp_dst_dtype, "fused_feedforward"); + auto NEW_Linear2Weight = egr::AmpAutoCast( + "Linear2Weight", Linear2Weight, amp_dst_dtype, "fused_feedforward"); + auto NEW_Dropout1Seed = + ((Dropout1Seed.initialized()) ? egr::AmpAutoCast("Dropout1Seed", + Dropout1Seed, + amp_dst_dtype, + "fused_feedforward") + : Dropout1Seed); + auto NEW_Dropout2Seed = + ((Dropout2Seed.initialized()) ? egr::AmpAutoCast("Dropout2Seed", + Dropout2Seed, + amp_dst_dtype, + "fused_feedforward") + : Dropout2Seed); + auto NEW_Linear1Bias = + ((Linear1Bias.initialized()) ? egr::AmpAutoCast("Linear1Bias", + Linear1Bias, + amp_dst_dtype, + "fused_feedforward") + : Linear1Bias); + auto NEW_Linear2Bias = + ((Linear2Bias.initialized()) ? egr::AmpAutoCast("Linear2Bias", + Linear2Bias, + amp_dst_dtype, + "fused_feedforward") + : Linear2Bias); + auto NEW_Ln1Scale = + ((Ln1Scale.initialized()) + ? egr::AmpAutoCast( + "Ln1Scale", Ln1Scale, amp_dst_dtype, "fused_feedforward") + : Ln1Scale); + auto NEW_Ln1Bias = + ((Ln1Bias.initialized()) + ? egr::AmpAutoCast( + "Ln1Bias", Ln1Bias, amp_dst_dtype, "fused_feedforward") + : Ln1Bias); + auto NEW_Ln2Scale = + ((Ln2Scale.initialized()) + ? 
+    {
+      paddle::imperative::AutoCastGuard guard(
+          egr::Controller::Instance().GetCurrentTracer(),
+          paddle::imperative::AmpLevel::O0);
+      return fused_feedforward_dygraph_function(NEW_X,
+                                                NEW_Dropout1Seed,
+                                                NEW_Dropout2Seed,
+                                                NEW_Linear1Weight,
+                                                NEW_Linear1Bias,
+                                                NEW_Linear2Weight,
+                                                NEW_Linear2Bias,
+                                                NEW_Ln1Scale,
+                                                NEW_Ln1Bias,
+                                                NEW_Ln2Scale,
+                                                NEW_Ln2Bias,
+                                                attr_map);
+    }
+  }
+
+  std::map<std::string, std::vector<std::shared_ptr<egr::EagerVariable>>> ins =
+      {{"X", egr::EagerUtils::TrySyncToVars(X)},
+       {"Linear1Weight", egr::EagerUtils::TrySyncToVars(Linear1Weight)},
+       {"Linear2Weight", egr::EagerUtils::TrySyncToVars(Linear2Weight)}};
+  if (Dropout1Seed.initialized())
+    ins["Dropout1Seed"] = egr::EagerUtils::TrySyncToVars(Dropout1Seed);
+  if (Dropout2Seed.initialized())
+    ins["Dropout2Seed"] = egr::EagerUtils::TrySyncToVars(Dropout2Seed);
+  if (Linear1Bias.initialized())
+    ins["Linear1Bias"] = egr::EagerUtils::TrySyncToVars(Linear1Bias);
+  if (Linear2Bias.initialized())
+    ins["Linear2Bias"] = egr::EagerUtils::TrySyncToVars(Linear2Bias);
+  if (Ln1Scale.initialized())
+    ins["Ln1Scale"] = egr::EagerUtils::TrySyncToVars(Ln1Scale);
+  if (Ln1Bias.initialized())
+    ins["Ln1Bias"] = egr::EagerUtils::TrySyncToVars(Ln1Bias);
+  if (Ln2Scale.initialized())
+    ins["Ln2Scale"] = egr::EagerUtils::TrySyncToVars(Ln2Scale);
+  if (Ln2Bias.initialized())
+    ins["Ln2Bias"] = egr::EagerUtils::TrySyncToVars(Ln2Bias);
+
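+  // Every op output, including intermediates that only the backward kernel
+  // consumes, gets a fresh EagerVariable so TraceOp has a slot to fill.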
+  std::map<std::string, std::vector<std::shared_ptr<egr::EagerVariable>>> outs =
+      {{"Out",
+        {std::make_shared<egr::EagerVariable>(
+            egr::Controller::Instance().GenerateUniqueName())}},
+       {"Dropout1Mask",
+        {std::make_shared<egr::EagerVariable>(
+            egr::Controller::Instance().GenerateUniqueName())}},
+       {"Dropout2Mask",
+        {std::make_shared<egr::EagerVariable>(
+            egr::Controller::Instance().GenerateUniqueName())}},
+       {"Ln1Mean",
+        {std::make_shared<egr::EagerVariable>(
+            egr::Controller::Instance().GenerateUniqueName())}},
+       {"Ln1Variance",
+        {std::make_shared<egr::EagerVariable>(
+            egr::Controller::Instance().GenerateUniqueName())}},
+       {"Ln2Mean",
+        {std::make_shared<egr::EagerVariable>(
+            egr::Controller::Instance().GenerateUniqueName())}},
+       {"Ln2Variance",
+        {std::make_shared<egr::EagerVariable>(
+            egr::Controller::Instance().GenerateUniqueName())}},
+       {"Linear1Out",
+        {std::make_shared<egr::EagerVariable>(
+            egr::Controller::Instance().GenerateUniqueName())}},
+       {"Ln1Out",
+        {std::make_shared<egr::EagerVariable>(
+            egr::Controller::Instance().GenerateUniqueName())}},
+       {"Dropout1Out",
+        {std::make_shared<egr::EagerVariable>(
+            egr::Controller::Instance().GenerateUniqueName())}},
+       {"Dropout2Out",
+        {std::make_shared<egr::EagerVariable>(
+            egr::Controller::Instance().GenerateUniqueName())}}};
+
+  // Prepare Autograd Meta
+  egr::AutogradMeta* p_autograd_X = egr::EagerUtils::nullable_autograd_meta(X);
+  egr::AutogradMeta* p_autograd_Dropout1Seed =
+      egr::EagerUtils::nullable_autograd_meta(Dropout1Seed);
+  egr::AutogradMeta* p_autograd_Dropout2Seed =
+      egr::EagerUtils::nullable_autograd_meta(Dropout2Seed);
+  egr::AutogradMeta* p_autograd_Linear1Weight =
+      egr::EagerUtils::nullable_autograd_meta(Linear1Weight);
+  egr::AutogradMeta* p_autograd_Linear1Bias =
+      egr::EagerUtils::nullable_autograd_meta(Linear1Bias);
+  egr::AutogradMeta* p_autograd_Linear2Weight =
+      egr::EagerUtils::nullable_autograd_meta(Linear2Weight);
+  egr::AutogradMeta* p_autograd_Linear2Bias =
+      egr::EagerUtils::nullable_autograd_meta(Linear2Bias);
+  egr::AutogradMeta* p_autograd_Ln1Scale =
+      egr::EagerUtils::nullable_autograd_meta(Ln1Scale);
+  egr::AutogradMeta* p_autograd_Ln1Bias =
+      egr::EagerUtils::nullable_autograd_meta(Ln1Bias);
+  egr::AutogradMeta* p_autograd_Ln2Scale =
+      egr::EagerUtils::nullable_autograd_meta(Ln2Scale);
+  egr::AutogradMeta* p_autograd_Ln2Bias =
+      egr::EagerUtils::nullable_autograd_meta(Ln2Bias);
+
+  bool trace_backward = egr::Controller::Instance().HasGrad();
+
+  bool require_any_grad =
+      egr::EagerUtils::ComputeRequireGrad(trace_backward,
+                                          p_autograd_X,
+                                          p_autograd_Dropout1Seed,
+                                          p_autograd_Dropout2Seed,
+                                          p_autograd_Linear1Weight,
+                                          p_autograd_Linear1Bias,
+                                          p_autograd_Linear2Weight,
+                                          p_autograd_Linear2Bias,
+                                          p_autograd_Ln1Scale,
+                                          p_autograd_Ln1Bias,
+                                          p_autograd_Ln2Scale,
+                                          p_autograd_Ln2Bias);
+
+  paddle::framework::AttributeMap attrs = attr_map;
+  paddle::framework::AttributeMap default_attrs;
+  egr::Controller::Instance().GetCurrentTracer()->TraceOp(
+      "fused_feedforward",
+      ins,
+      outs,
+      attrs,
+      egr::Controller::Instance().GetExpectedPlace(),
+      &default_attrs,
+      true,
+      {});
+
+  paddle::experimental::Tensor Out;
+  egr::EagerUtils::GetOutput(outs["Out"][0], &Out);
+  paddle::experimental::Tensor Dropout1Mask;
+  egr::EagerUtils::GetOutput(outs["Dropout1Mask"][0], &Dropout1Mask);
+  paddle::experimental::Tensor Dropout2Mask;
+  egr::EagerUtils::GetOutput(outs["Dropout2Mask"][0], &Dropout2Mask);
+  paddle::experimental::Tensor Ln1Mean;
+  egr::EagerUtils::GetOutput(outs["Ln1Mean"][0], &Ln1Mean);
+  paddle::experimental::Tensor Ln1Variance;
+  egr::EagerUtils::GetOutput(outs["Ln1Variance"][0], &Ln1Variance);
+  paddle::experimental::Tensor Ln2Mean;
+  egr::EagerUtils::GetOutput(outs["Ln2Mean"][0], &Ln2Mean);
+  paddle::experimental::Tensor Ln2Variance;
+  egr::EagerUtils::GetOutput(outs["Ln2Variance"][0], &Ln2Variance);
+  paddle::experimental::Tensor Linear1Out;
+  egr::EagerUtils::GetOutput(outs["Linear1Out"][0], &Linear1Out);
+  paddle::experimental::Tensor Ln1Out;
+  egr::EagerUtils::GetOutput(outs["Ln1Out"][0], &Ln1Out);
+  paddle::experimental::Tensor Dropout1Out;
+  egr::EagerUtils::GetOutput(outs["Dropout1Out"][0], &Dropout1Out);
+  paddle::experimental::Tensor Dropout2Out;
+  egr::EagerUtils::GetOutput(outs["Dropout2Out"][0], &Dropout2Out);
+
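+  // Node creation: only when some input requires grad do we build the
+  // backward node, stash the tensors its grad kernel will need, and hook the
+  // outputs into the autograd graph.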
+  {
+    paddle::platform::RecordEvent node_creation_record_event(
+        "fused_feedforward node_creation",
+        paddle::platform::TracerEventType::Operator,
+        1);
+    egr::AutogradMeta* p_autograd_Out = egr::EagerUtils::autograd_meta(&Out);
+    egr::AutogradMeta* p_autograd_Dropout1Mask =
+        egr::EagerUtils::autograd_meta(&Dropout1Mask);
+    egr::AutogradMeta* p_autograd_Dropout2Mask =
+        egr::EagerUtils::autograd_meta(&Dropout2Mask);
+    egr::AutogradMeta* p_autograd_Ln1Mean =
+        egr::EagerUtils::autograd_meta(&Ln1Mean);
+    egr::AutogradMeta* p_autograd_Ln1Variance =
+        egr::EagerUtils::autograd_meta(&Ln1Variance);
+    egr::AutogradMeta* p_autograd_Ln2Mean =
+        egr::EagerUtils::autograd_meta(&Ln2Mean);
+    egr::AutogradMeta* p_autograd_Ln2Variance =
+        egr::EagerUtils::autograd_meta(&Ln2Variance);
+    egr::AutogradMeta* p_autograd_Linear1Out =
+        egr::EagerUtils::autograd_meta(&Linear1Out);
+    egr::AutogradMeta* p_autograd_Ln1Out =
+        egr::EagerUtils::autograd_meta(&Ln1Out);
+    egr::AutogradMeta* p_autograd_Dropout1Out =
+        egr::EagerUtils::autograd_meta(&Dropout1Out);
+    egr::AutogradMeta* p_autograd_Dropout2Out =
+        egr::EagerUtils::autograd_meta(&Dropout2Out);
+    if (require_any_grad) {
+      VLOG(6) << " Construct Grad for fused_feedforward ";
+      egr::EagerUtils::PassStopGradient(false,
+                                        p_autograd_Out,
+                                        p_autograd_Dropout1Mask,
+                                        p_autograd_Dropout2Mask,
+                                        p_autograd_Ln1Mean,
+                                        p_autograd_Ln1Variance,
+                                        p_autograd_Ln2Mean,
+                                        p_autograd_Ln2Variance,
+                                        p_autograd_Linear1Out,
+                                        p_autograd_Ln1Out,
+                                        p_autograd_Dropout1Out,
+                                        p_autograd_Dropout2Out);
+      // Create GradOpNode
+      auto grad_node = std::shared_ptr<fused_feedforwardGradNodeCompat>(
+          new fused_feedforwardGradNodeCompat(11, 11));
+
+      bool pre_layer_norm = false;
+      if (attrs.count("pre_layer_norm")) {
+        pre_layer_norm = BOOST_GET_CONST(bool, attrs.at("pre_layer_norm"));
+      }
+
+      // Set Attributes
+      grad_node->SetAttrMap(std::move(attrs));
+      grad_node->SetDefaultAttrMap(std::move(default_attrs));
+
+      grad_node->SetTensorWrapperX(X);
+      grad_node->SetTensorWrapperLinear1Weight(Linear1Weight);
+      grad_node->SetTensorWrapperLinear1Bias(Linear1Bias);
+      grad_node->SetTensorWrapperLinear2Weight(Linear2Weight);
+      grad_node->SetTensorWrapperDropout1Mask(Dropout1Mask);
+      grad_node->SetTensorWrapperDropout2Mask(Dropout2Mask);
+      grad_node->SetTensorWrapperLinear1Out(Linear1Out);
+      grad_node->SetTensorWrapperDropout1Out(Dropout1Out);
+      grad_node->SetTensorWrapperDropout2Out(Dropout2Out);
+
+      grad_node->SetGradOutMeta(X, 0);
+      grad_node->SetGradOutMeta(Linear1Weight, 3);
+      grad_node->SetGradOutMeta(Linear1Bias, 4);
+      grad_node->SetGradOutMeta(Linear2Weight, 5);
+
+      // Only one of the two LayerNorms is active, depending on pre_layer_norm,
+      // so only that one's tensors are saved for backward.
+      if (pre_layer_norm) {
+        grad_node->SetTensorWrapperLn1Scale(Ln1Scale);
+        grad_node->SetTensorWrapperLn1Bias(Ln1Bias);
+        grad_node->SetTensorWrapperLn1Out(Ln1Out);
+        grad_node->SetTensorWrapperLn1Mean(Ln1Mean);
+        grad_node->SetTensorWrapperLn1Variance(Ln1Variance);
+        grad_node->SetGradOutMeta(Ln1Scale, 7);
+        grad_node->SetGradOutMeta(Ln1Bias, 8);
+      } else {
+        grad_node->SetTensorWrapperLn2Scale(Ln2Scale);
+        grad_node->SetGradOutMeta(Ln2Scale, 9);
+        grad_node->SetTensorWrapperLn2Bias(Ln2Bias);
+        grad_node->SetGradOutMeta(Ln2Bias, 10);
+        grad_node->SetTensorWrapperLn2Mean(Ln2Mean);
+        grad_node->SetTensorWrapperLn2Variance(Ln2Variance);
+      }
+
+      if (Linear2Bias.initialized()) {
+        grad_node->SetTensorWrapperLinear2Bias(Linear2Bias);
+        grad_node->SetGradOutMeta(Linear2Bias, 6);
+      }
+
+      egr::EagerUtils::SetOutRankWithSlot(p_autograd_Out, 0);
+      egr::EagerUtils::SetHistory(p_autograd_Out, grad_node);
+      grad_node->SetGradInMeta(Out, 0);
+      egr::EagerUtils::CheckAndRetainGrad(Out);
+      egr::EagerUtils::SetOutRankWithSlot(p_autograd_Dropout1Mask, 1);
+      grad_node->SetGradInMeta(Dropout1Mask, 1);
+      egr::EagerUtils::SetOutRankWithSlot(p_autograd_Dropout2Mask, 2);
+      grad_node->SetGradInMeta(Dropout2Mask, 2);
+      egr::EagerUtils::SetOutRankWithSlot(p_autograd_Ln1Mean, 3);
+      grad_node->SetGradInMeta(Ln1Mean, 3);
+      egr::EagerUtils::SetOutRankWithSlot(p_autograd_Ln1Variance, 4);
+      grad_node->SetGradInMeta(Ln1Variance, 4);
+      egr::EagerUtils::SetOutRankWithSlot(p_autograd_Ln2Mean, 5);
+      grad_node->SetGradInMeta(Ln2Mean, 5);
+      egr::EagerUtils::SetOutRankWithSlot(p_autograd_Ln2Variance, 6);
+      grad_node->SetGradInMeta(Ln2Variance, 6);
+      egr::EagerUtils::SetOutRankWithSlot(p_autograd_Linear1Out, 7);
+      grad_node->SetGradInMeta(Linear1Out, 7);
+      egr::EagerUtils::SetOutRankWithSlot(p_autograd_Ln1Out, 8);
+      grad_node->SetGradInMeta(Ln1Out, 8);
+      egr::EagerUtils::SetOutRankWithSlot(p_autograd_Dropout1Out, 9);
+      grad_node->SetGradInMeta(Dropout1Out, 9);
+      egr::EagerUtils::SetOutRankWithSlot(p_autograd_Dropout2Out, 10);
+      grad_node->SetGradInMeta(Dropout2Out, 10);
+    }
+  }
+
+  return std::make_tuple(Out,
+                         Dropout1Mask,
+                         Dropout2Mask,
+                         Ln1Mean,
+                         Ln1Variance,
+                         Ln2Mean,
+                         Ln2Variance,
+                         Linear1Out,
+                         Ln1Out,
+                         Dropout1Out,
+                         Dropout2Out);
+}
diff --git a/paddle/fluid/eager/api/manual/fluid_manual/nodes/CMakeLists.txt b/paddle/fluid/eager/api/manual/fluid_manual/nodes/CMakeLists.txt
index fb5e129223544187207e1a79889ced0c7c4bf63b..4eaa43a4b51c6f30c08868a305d0dfb38f1b9fa4 100644
--- a/paddle/fluid/eager/api/manual/fluid_manual/nodes/CMakeLists.txt
+++ b/paddle/fluid/eager/api/manual/fluid_manual/nodes/CMakeLists.txt
@@ -3,6 +3,11 @@ cc_library(
   SRCS fused_gate_attention_node.cc
   DEPS ${eager_deps} ${fluid_deps})
 
+cc_library(
+  fused_feedforward_node
+  SRCS fused_feedforward_node.cc
+  DEPS ${eager_deps} ${fluid_deps})
+
 set(fluid_manual_nodes
-    fused_gate_attention_node
+    fused_gate_attention_node fused_feedforward_node
     PARENT_SCOPE)
diff --git a/paddle/fluid/eager/api/manual/fluid_manual/nodes/fused_feedforward_node.cc b/paddle/fluid/eager/api/manual/fluid_manual/nodes/fused_feedforward_node.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5228cb3657825e49fc28c15bed7b3ab2bec997ab
--- /dev/null
+++ b/paddle/fluid/eager/api/manual/fluid_manual/nodes/fused_feedforward_node.cc
@@ -0,0 +1,208 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "glog/logging.h"
+#include "paddle/fluid/eager/api/manual/fluid_manual/nodes/nodes.h"
+#include "paddle/fluid/eager/api/utils/global_utils.h"
+#include "paddle/fluid/eager/utils.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/imperative/tracer.h"
+#include "paddle/phi/api/all.h"
+
+paddle::small_vector<std::vector<paddle::experimental::Tensor>,
+                     egr::kSlotSmallVectorSize>
+fused_feedforwardGradNodeCompat::operator()(
+    paddle::small_vector<std::vector<paddle::experimental::Tensor>,
+                         egr::kSlotSmallVectorSize>& grads,
+    bool create_graph,
+    bool is_new_grad) {
+  VLOG(3) << "Running Eager Backward Node: fused_feedforwardGradNodeCompat";
+  const auto& out_metas = OutputMeta();
+  paddle::small_vector<std::vector<paddle::experimental::Tensor>,
+                       egr::kSlotSmallVectorSize>
+      outputs(11);
+
+  paddle::small_vector<std::vector<paddle::experimental::Tensor>,
+                       egr::kSlotSmallVectorSize>
+      hooked_grads0 =
+          fused_feedforwardGradNodeCompat::ApplyGradientHooks(grads);
+
+  bool pre_layer_norm = false;
+  if (attr_map_.count("pre_layer_norm")) {
+    pre_layer_norm = BOOST_GET_CONST(bool, attr_map_.at("pre_layer_norm"));
+  }
+
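+  // Rebuild the grad op's inputs from the tensors saved at forward time;
+  // the Ln1*/Ln2* inputs are added further below, gated on pre_layer_norm.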
+  std::map<std::string, std::vector<std::shared_ptr<egr::EagerVariable>>> ins0 =
+      {{"Dropout1Mask",
+        egr::EagerUtils::TrySyncToVars(
+            egr::EagerUtils::RecoverTensorWrapper(&this->Dropout1Mask_))},
+       {"Dropout1Out",
+        egr::EagerUtils::TrySyncToVars(
+            egr::EagerUtils::RecoverTensorWrapper(&this->Dropout1Out_))},
+       {"Dropout2Mask",
+        egr::EagerUtils::TrySyncToVars(
+            egr::EagerUtils::RecoverTensorWrapper(&this->Dropout2Mask_))},
+       {"Dropout2Out",
+        egr::EagerUtils::TrySyncToVars(
+            egr::EagerUtils::RecoverTensorWrapper(&this->Dropout2Out_))},
+       {"Linear1Out",
+        egr::EagerUtils::TrySyncToVars(
+            egr::EagerUtils::RecoverTensorWrapper(&this->Linear1Out_))},
+       {"Linear1Weight",
+        egr::EagerUtils::TrySyncToVars(
+            egr::EagerUtils::RecoverTensorWrapper(&this->Linear1Weight_))},
+       {"Linear2Weight",
+        egr::EagerUtils::TrySyncToVars(
+            egr::EagerUtils::RecoverTensorWrapper(&this->Linear2Weight_))},
+       {"Out@GRAD", egr::EagerUtils::TrySyncToVars(hooked_grads0[0])},
+       {"X",
+        egr::EagerUtils::TrySyncToVars(
+            egr::EagerUtils::RecoverTensorWrapper(&this->X_))}};
+
+  std::map<std::string, std::vector<std::shared_ptr<egr::EagerVariable>>> outs0;
+
+  auto Linear1Bias = egr::EagerUtils::RecoverTensorWrapper(&this->Linear1Bias_);
+  if (Linear1Bias.defined())
+    ins0["Linear1Bias"] = egr::EagerUtils::TrySyncToVars(Linear1Bias);
+
+  if ((!out_metas[3].empty()) && (!(out_metas[3][0].IsStopGradient()))) {
+    outs0.insert({"Linear1Weight@GRAD",
+                  {std::make_shared<egr::EagerVariable>(
+                      egr::Controller::Instance().GenerateUniqueName())}});
+  }
+  if ((!out_metas[5].empty()) && (!(out_metas[5][0].IsStopGradient()))) {
+    outs0.insert({"Linear2Weight@GRAD",
+                  {std::make_shared<egr::EagerVariable>(
+                      egr::Controller::Instance().GenerateUniqueName())}});
+  }
+  if ((!out_metas[0].empty()) && (!(out_metas[0][0].IsStopGradient()))) {
+    outs0.insert({"X@GRAD",
+                  {std::make_shared<egr::EagerVariable>(
+                      egr::Controller::Instance().GenerateUniqueName())}});
+  }
+  if (Linear1Bias.defined() && (!out_metas[4].empty()) &&
+      (!out_metas[4][0].IsStopGradient()))
+    outs0["Linear1Bias@GRAD"] = {std::make_shared<egr::EagerVariable>(
+        egr::Controller::Instance().GenerateUniqueName())};
+
+  if (pre_layer_norm) {
+    auto Ln1Scale = egr::EagerUtils::RecoverTensorWrapper(&this->Ln1Scale_);
+    if (Ln1Scale.defined())
+      ins0["Ln1Scale"] = egr::EagerUtils::TrySyncToVars(Ln1Scale);
+    auto Ln1Bias = egr::EagerUtils::RecoverTensorWrapper(&this->Ln1Bias_);
+    if (Ln1Bias.defined())
+      ins0["Ln1Bias"] = egr::EagerUtils::TrySyncToVars(Ln1Bias);
+    auto Ln1Out = egr::EagerUtils::RecoverTensorWrapper(&this->Ln1Out_);
+    if (Ln1Out.defined())
+      ins0["Ln1Out"] = egr::EagerUtils::TrySyncToVars(Ln1Out);
+    auto Ln1Mean = egr::EagerUtils::RecoverTensorWrapper(&this->Ln1Mean_);
+    if (Ln1Mean.defined())
+      ins0["Ln1Mean"] = egr::EagerUtils::TrySyncToVars(Ln1Mean);
+    auto Ln1Variance =
+        egr::EagerUtils::RecoverTensorWrapper(&this->Ln1Variance_);
+    if (Ln1Variance.defined())
+      ins0["Ln1Variance"] = egr::EagerUtils::TrySyncToVars(Ln1Variance);
+    if (Ln1Scale.defined() && (!out_metas[7].empty()) &&
+        (!out_metas[7][0].IsStopGradient()))
+      outs0["Ln1Scale@GRAD"] = {std::make_shared<egr::EagerVariable>(
+          egr::Controller::Instance().GenerateUniqueName())};
+    if (Ln1Bias.defined() && (!out_metas[8].empty()) &&
+        (!out_metas[8][0].IsStopGradient()))
+      outs0["Ln1Bias@GRAD"] = {std::make_shared<egr::EagerVariable>(
+          egr::Controller::Instance().GenerateUniqueName())};
+
+  } else {
+    auto Ln2Scale = egr::EagerUtils::RecoverTensorWrapper(&this->Ln2Scale_);
+    if (Ln2Scale.defined())
+      ins0["Ln2Scale"] = egr::EagerUtils::TrySyncToVars(Ln2Scale);
+    auto Ln2Bias = egr::EagerUtils::RecoverTensorWrapper(&this->Ln2Bias_);
+    if (Ln2Bias.defined())
+      ins0["Ln2Bias"] = egr::EagerUtils::TrySyncToVars(Ln2Bias);
+    auto Ln2Mean = egr::EagerUtils::RecoverTensorWrapper(&this->Ln2Mean_);
+    if (Ln2Mean.defined())
+      ins0["Ln2Mean"] = egr::EagerUtils::TrySyncToVars(Ln2Mean);
+    auto Ln2Variance =
+        egr::EagerUtils::RecoverTensorWrapper(&this->Ln2Variance_);
+    if (Ln2Variance.defined())
+      ins0["Ln2Variance"] = egr::EagerUtils::TrySyncToVars(Ln2Variance);
+    if (Ln2Scale.defined() && (!out_metas[9].empty()) &&
+        (!out_metas[9][0].IsStopGradient()))
+      outs0["Ln2Scale@GRAD"] = {std::make_shared<egr::EagerVariable>(
+          egr::Controller::Instance().GenerateUniqueName())};
+    if (Ln2Bias.defined() && (!out_metas[10].empty()) &&
+        (!out_metas[10][0].IsStopGradient()))
+      outs0["Ln2Bias@GRAD"] = {std::make_shared<egr::EagerVariable>(
+          egr::Controller::Instance().GenerateUniqueName())};
+  }
+
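+  // Grad outputs are indexed by forward-input slot: 0=X, 3=Linear1Weight,
+  // 4=Linear1Bias, 5=Linear2Weight, 6=Linear2Bias, 7/8=Ln1Scale/Ln1Bias,
+  // 9/10=Ln2Scale/Ln2Bias (slots 1/2, the dropout seeds, never receive
+  // grads); a slot is only materialized when it actually needs a gradient.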
+  auto Linear2Bias = egr::EagerUtils::RecoverTensorWrapper(&this->Linear2Bias_);
+  if (Linear2Bias.defined()) {
+    ins0["Linear2Bias"] = egr::EagerUtils::TrySyncToVars(Linear2Bias);
+    if ((!out_metas[6].empty()) && (!out_metas[6][0].IsStopGradient()))
+      outs0["Linear2Bias@GRAD"] = {std::make_shared<egr::EagerVariable>(
+          egr::Controller::Instance().GenerateUniqueName())};
+  }
+
+  auto& attrs_map0 = this->attr_map_;
+  // Pass the entire attribute map to TraceOp;
+  // the underlying kernel will pick up whatever attributes it needs at runtime.
+  egr::Controller::Instance().GetCurrentTracer()->TraceOp(
+      "fused_feedforward_grad",
+      ins0,
+      outs0,
+      attrs_map0,
+      egr::Controller::Instance().GetExpectedPlace(),
+      &this->default_attr_map_,
+      false,
+      {});
+
+  if (outs0.find("Linear1Weight@GRAD") != outs0.end()) {
+    outputs[3] = egr::EagerUtils::GetOutputs(outs0["Linear1Weight@GRAD"]);
+  }
+  if (outs0.find("Linear2Weight@GRAD") != outs0.end()) {
+    outputs[5] = egr::EagerUtils::GetOutputs(outs0["Linear2Weight@GRAD"]);
+  }
+  if (outs0.find("X@GRAD") != outs0.end()) {
+    outputs[0] = egr::EagerUtils::GetOutputs(outs0["X@GRAD"]);
+  }
+  if (outs0.find("Linear1Bias@GRAD") != outs0.end()) {
+    outputs[4] = egr::EagerUtils::GetOutputs(outs0["Linear1Bias@GRAD"]);
+  }
+
+  if (pre_layer_norm) {
+    if (outs0.find("Ln1Scale@GRAD") != outs0.end()) {
+      outputs[7] = egr::EagerUtils::GetOutputs(outs0["Ln1Scale@GRAD"]);
+    }
+    if (outs0.find("Ln1Bias@GRAD") != outs0.end()) {
+      outputs[8] = egr::EagerUtils::GetOutputs(outs0["Ln1Bias@GRAD"]);
+    }
+
+  } else {
+    if (outs0.find("Ln2Bias@GRAD") != outs0.end()) {
+      outputs[10] = egr::EagerUtils::GetOutputs(outs0["Ln2Bias@GRAD"]);
+    }
+    if (outs0.find("Ln2Scale@GRAD") != outs0.end()) {
+      outputs[9] = egr::EagerUtils::GetOutputs(outs0["Ln2Scale@GRAD"]);
+    }
+  }
+
+  if (Linear2Bias.defined()) {
+    if (outs0.find("Linear2Bias@GRAD") != outs0.end()) {
+      outputs[6] = egr::EagerUtils::GetOutputs(outs0["Linear2Bias@GRAD"]);
+    }
+  }
+
+  if (NeedComplexToRealConversion()) HandleComplexGradToRealGrad(&outputs);
+  return outputs;
+}
diff --git a/paddle/fluid/eager/api/manual/fluid_manual/nodes/nodes.h b/paddle/fluid/eager/api/manual/fluid_manual/nodes/nodes.h
index 0f0fac4b725e0e80277efa8c9b6f79c765d6180a..52d3b44d7ba2ae607b4a9858a4aa484e12fabc76 100644
--- a/paddle/fluid/eager/api/manual/fluid_manual/nodes/nodes.h
+++ b/paddle/fluid/eager/api/manual/fluid_manual/nodes/nodes.h
@@ -174,3 +174,158 @@ class fused_gate_attentionGradNodeCompat : public egr::GradNodeBase {
   paddle::framework::AttributeMap attr_map_;
   paddle::framework::AttributeMap default_attr_map_;
 };
+
+class fused_feedforwardGradNodeCompat : public egr::GradNodeBase {
+ public:
+  fused_feedforwardGradNodeCompat() : egr::GradNodeBase() {
+    VLOG(7) << " Construct fused_feedforwardGradNodeCompat ";
+  }
+  fused_feedforwardGradNodeCompat(size_t bwd_in_slot_num,
+                                  size_t bwd_out_slot_num)
+      : egr::GradNodeBase(bwd_in_slot_num, bwd_out_slot_num) {
+    VLOG(7) << " Construct fused_feedforwardGradNodeCompat ";
+  }
+  ~fused_feedforwardGradNodeCompat() override {
+    VLOG(6) << " Destruct fused_feedforwardGradNodeCompat ";
+  }
+
+  virtual paddle::small_vector<std::vector<paddle::experimental::Tensor>,
+                               egr::kSlotSmallVectorSize>
+  operator()(
+      paddle::small_vector<std::vector<paddle::experimental::Tensor>,  // NOLINT
+                           egr::kSlotSmallVectorSize>& grads,  // NOLINT
+      bool create_graph = false,
+      bool is_new_grad = false) override;
+
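+  // Releases the saved forward tensors so their memory can be reclaimed,
+  // e.g. once the backward run no longer needs them (retain_graph=False).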
{ return "fused_feedforwardGradNodeCompat"; } + + std::shared_ptr Copy() const override { + { + auto copied_node = std::shared_ptr( + new fused_feedforwardGradNodeCompat(*this)); + return copied_node; + } + } + + // SetX, SetY, ... + void SetTensorWrapperDropout1Mask( + const paddle::experimental::Tensor& Dropout1Mask) { + Dropout1Mask_ = egr::TensorWrapper(Dropout1Mask, false); + } + void SetTensorWrapperDropout1Out( + const paddle::experimental::Tensor& Dropout1Out) { + Dropout1Out_ = egr::TensorWrapper(Dropout1Out, false); + } + void SetTensorWrapperDropout2Mask( + const paddle::experimental::Tensor& Dropout2Mask) { + Dropout2Mask_ = egr::TensorWrapper(Dropout2Mask, false); + } + void SetTensorWrapperDropout2Out( + const paddle::experimental::Tensor& Dropout2Out) { + Dropout2Out_ = egr::TensorWrapper(Dropout2Out, false); + } + void SetTensorWrapperLinear1Bias( + const paddle::experimental::Tensor& Linear1Bias) { + Linear1Bias_ = egr::TensorWrapper(Linear1Bias, false); + } + void SetTensorWrapperLinear1Out( + const paddle::experimental::Tensor& Linear1Out) { + Linear1Out_ = egr::TensorWrapper(Linear1Out, false); + } + void SetTensorWrapperLinear1Weight( + const paddle::experimental::Tensor& Linear1Weight) { + Linear1Weight_ = egr::TensorWrapper(Linear1Weight, false); + } + void SetTensorWrapperLinear2Bias( + const paddle::experimental::Tensor& Linear2Bias) { + Linear2Bias_ = egr::TensorWrapper(Linear2Bias, false); + } + void SetTensorWrapperLinear2Weight( + const paddle::experimental::Tensor& Linear2Weight) { + Linear2Weight_ = egr::TensorWrapper(Linear2Weight, false); + } + void SetTensorWrapperLn2Bias(const paddle::experimental::Tensor& Ln2Bias) { + Ln2Bias_ = egr::TensorWrapper(Ln2Bias, false); + } + void SetTensorWrapperLn2Mean(const paddle::experimental::Tensor& Ln2Mean) { + Ln2Mean_ = egr::TensorWrapper(Ln2Mean, false); + } + void SetTensorWrapperLn2Scale(const paddle::experimental::Tensor& Ln2Scale) { + Ln2Scale_ = egr::TensorWrapper(Ln2Scale, false); + } + void SetTensorWrapperLn2Variance( + const paddle::experimental::Tensor& Ln2Variance) { + Ln2Variance_ = egr::TensorWrapper(Ln2Variance, false); + } + void SetTensorWrapperX(const paddle::experimental::Tensor& X) { + X_ = egr::TensorWrapper(X, false); + } + void SetTensorWrapperLn1Scale(const paddle::experimental::Tensor& Ln1Scale) { + Ln1Scale_ = egr::TensorWrapper(Ln1Scale, false); + } + void SetTensorWrapperLn1Bias(const paddle::experimental::Tensor& Ln1Bias) { + Ln1Bias_ = egr::TensorWrapper(Ln1Bias, false); + } + void SetTensorWrapperLn1Out(const paddle::experimental::Tensor& Ln1Out) { + Ln1Out_ = egr::TensorWrapper(Ln1Out, false); + } + void SetTensorWrapperLn1Mean(const paddle::experimental::Tensor& Ln1Mean) { + Ln1Mean_ = egr::TensorWrapper(Ln1Mean, false); + } + void SetTensorWrapperLn1Variance( + const paddle::experimental::Tensor& Ln1Variance) { + Ln1Variance_ = egr::TensorWrapper(Ln1Variance, false); + } + // SetAttrMap + void SetAttrMap(paddle::framework::AttributeMap&& attr_map) { + attr_map_ = std::move(attr_map); + } + void SetDefaultAttrMap(paddle::framework::AttributeMap&& default_attr_map) { + default_attr_map_ = std::move(default_attr_map); + } + + private: + // TensorWrappers + egr::TensorWrapper Dropout1Mask_; + egr::TensorWrapper Dropout1Out_; + egr::TensorWrapper Dropout2Mask_; + egr::TensorWrapper Dropout2Out_; + egr::TensorWrapper Linear1Bias_; + egr::TensorWrapper Linear1Out_; + egr::TensorWrapper Linear1Weight_; + egr::TensorWrapper Linear2Bias_; + egr::TensorWrapper Linear2Weight_; + 
+  egr::TensorWrapper Ln2Bias_;
+  egr::TensorWrapper Ln2Mean_;
+  egr::TensorWrapper Ln2Scale_;
+  egr::TensorWrapper Ln2Variance_;
+  egr::TensorWrapper X_;
+
+  egr::TensorWrapper Ln1Scale_;
+  egr::TensorWrapper Ln1Bias_;
+  egr::TensorWrapper Ln1Out_;
+  egr::TensorWrapper Ln1Mean_;
+  egr::TensorWrapper Ln1Variance_;
+
+  // Attribute Map
+  paddle::framework::AttributeMap attr_map_;
+  paddle::framework::AttributeMap default_attr_map_;
+};
diff --git a/paddle/fluid/eager/auto_code_generator/eager_generator.cc b/paddle/fluid/eager/auto_code_generator/eager_generator.cc
index bbd6ea6494638ca1b09af4073dd603ea1f9324e8..6eb35eb13f3f7debac4744ea3c0508360b5b9a35 100644
--- a/paddle/fluid/eager/auto_code_generator/eager_generator.cc
+++ b/paddle/fluid/eager/auto_code_generator/eager_generator.cc
@@ -52,7 +52,7 @@ static std::unordered_set<std::string> ops_to_fill_zero_for_empty_grads = {
 
 /* --- Black Ops list that's NO NEED to apply code generation --- */
 static std::unordered_set<std::string> black_ops_list = {
-    "run_program", "fused_gate_attention"};
+    "run_program", "fused_gate_attention", "fused_feedforward"};
 
 static std::string LegalizeVariableName(const std::string& var_name) {
   std::string ret = var_name;
diff --git a/python/paddle/fluid/tests/unittests/test_fused_feedforward_op.py b/python/paddle/fluid/tests/unittests/test_fused_feedforward_op.py
index 43d39224287e643dde2b45b445679aa880548fe8..8d2873276033a746e8e40bf49a44b01096aecfcc 100644
--- a/python/paddle/fluid/tests/unittests/test_fused_feedforward_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fused_feedforward_op.py
@@ -23,9 +23,7 @@ from paddle.nn.layer.norm import LayerNorm
 from paddle.nn.layer.common import Linear, Dropout
 import unittest
 from op_test import OpTest
-from paddle.fluid.framework import default_main_program, _enable_legacy_dygraph
-
-_enable_legacy_dygraph()
+from paddle.fluid.framework import default_main_program
 
 
 class TestFusedFFNOp(OpTest):
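
Note: with the `_enable_legacy_dygraph()` call removed, test_fused_feedforward_op.py now exercises the manual eager function added above. A minimal eager-mode smoke test of the traced path might look like the sketch below; it assumes the public `paddle.incubate.nn.functional.fused_feedforward` wrapper (which dispatches to this op) and a CUDA build of Paddle, since the fused kernels are GPU-only:

    import paddle
    import paddle.incubate.nn.functional as F

    # Hypothetical shapes: (batch, seq_len, d_model) input, d_ffn = 256.
    x = paddle.randn([2, 16, 64], dtype="float32")
    w1 = paddle.randn([64, 256], dtype="float32")   # Linear1Weight
    w2 = paddle.randn([256, 64], dtype="float32")   # Linear2Weight
    x.stop_gradient = False

    out = F.fused_feedforward(x, w1, w2, pre_layer_norm=True)
    out.backward()  # runs fused_feedforwardGradNodeCompat
    print(x.grad.shape)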