diff --git a/paddle/fluid/eager/api/manual/fluid_manual/dygraph_forward_api.h b/paddle/fluid/eager/api/manual/fluid_manual/dygraph_forward_api.h
index 91d556f9557dc6bbf6bd1943b9b30fbf53d27ec2..a3e812ce2967ff089b62b82689160b9b160bc33e 100644
--- a/paddle/fluid/eager/api/manual/fluid_manual/dygraph_forward_api.h
+++ b/paddle/fluid/eager/api/manual/fluid_manual/dygraph_forward_api.h
@@ -101,3 +101,9 @@ fused_attention_dygraph_function(
     const paddle::experimental::Tensor& Ln2Scale,
     const paddle::experimental::Tensor& Ln2Bias,
     const paddle::framework::AttributeMap& attr_map);
+
+paddle::experimental::Tensor fused_gemm_epilogue_dygraph_function(
+    const paddle::experimental::Tensor& X,
+    const paddle::experimental::Tensor& Y,
+    const paddle::experimental::Tensor& Bias,
+    const paddle::framework::AttributeMap& attr_map);
diff --git a/paddle/fluid/eager/api/manual/fluid_manual/forwards/CMakeLists.txt b/paddle/fluid/eager/api/manual/fluid_manual/forwards/CMakeLists.txt
index 5c47b0870a2035852b3a0d2945899f1bf2dc3cef..310da3bbaaa419493cafee514abd6dc33ab911c0 100644
--- a/paddle/fluid/eager/api/manual/fluid_manual/forwards/CMakeLists.txt
+++ b/paddle/fluid/eager/api/manual/fluid_manual/forwards/CMakeLists.txt
@@ -2,4 +2,5 @@ set(fluid_manual_functions
     ${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_gate_attention_fwd_func.cc
     ${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_feedforward_fwd_func.cc
     ${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_attention_fwd_func.cc
+    ${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_gemm_epilogue_fwd_func.cc
     PARENT_SCOPE)
diff --git a/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_gemm_epilogue_fwd_func.cc b/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_gemm_epilogue_fwd_func.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c2cf790d3fa9ae92ac1dcc86d0f7b7a0d1674b8e
--- /dev/null
+++ b/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_gemm_epilogue_fwd_func.cc
@@ -0,0 +1,130 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
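+//
+// Hand-written eager-mode (dygraph) forward for the fused_gemm_epilogue op,
+// which fuses matmul + bias-add with an optional epilogue activation
+// (none / relu / gelu).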
+ +#include "paddle/fluid/eager/accumulation/accumulation_node.h" +#include "paddle/fluid/eager/amp_auto_cast.h" +#include "paddle/fluid/eager/amp_utils.h" +#include "paddle/fluid/eager/api/manual/fluid_manual/dygraph_forward_api.h" +#include "paddle/fluid/eager/api/manual/fluid_manual/nodes/nodes.h" +#include "paddle/fluid/eager/api/utils/global_utils.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" + +#pragma GCC diagnostic ignored "-Wunused-variable" + +paddle::experimental::Tensor fused_gemm_epilogue_dygraph_function( + const paddle::experimental::Tensor& X, + const paddle::experimental::Tensor& Y, + const paddle::experimental::Tensor& Bias, + const paddle::framework::AttributeMap& attr_map) { + paddle::platform::RecordEvent dygraph_entrance_record_event( + "fused_gemm_epilogue dygraph", + paddle::platform::TracerEventType::Operator, + 1); + VLOG(3) << "Running Eager Forward Op: fused_gemm_epilogue"; + // Dygraph Forward Pass + + if (egr::Controller::Instance().GetAMPLevel() != + paddle::imperative::AmpLevel::O0) { + VLOG(5) << "Check and Prepare For AMP"; + + paddle::small_vector, + egr::kSlotSmallVectorSize> + amp_tensors_vector = {{X}, {Y}, {Bias}}; + + auto amp_dst_dtype = + egr::GetAmpDestDtype("fused_gemm_epilogue", amp_tensors_vector); + + auto NEW_X = egr::AmpAutoCast("X", X, amp_dst_dtype, "fused_gemm_epilogue"); + auto NEW_Y = egr::AmpAutoCast("Y", Y, amp_dst_dtype, "fused_gemm_epilogue"); + auto NEW_Bias = + egr::AmpAutoCast("Bias", Bias, amp_dst_dtype, "fused_gemm_epilogue"); + + { + paddle::imperative::AutoCastGuard guard( + egr::Controller::Instance().GetCurrentTracer(), + paddle::imperative::AmpLevel::O0); + return fused_gemm_epilogue_dygraph_function( + NEW_X, NEW_Y, NEW_Bias, attr_map); + } + } + + std::map>> ins = + {{"X", egr::EagerUtils::TrySyncToVars(X)}, + {"Y", egr::EagerUtils::TrySyncToVars(Y)}, + {"Bias", egr::EagerUtils::TrySyncToVars(Bias)}}; + + std::map>> outs = + {{"Out", + {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}}}; + + // Prepare Autograd Meta + egr::AutogradMeta* p_autograd_X = egr::EagerUtils::nullable_autograd_meta(X); + egr::AutogradMeta* p_autograd_Y = egr::EagerUtils::nullable_autograd_meta(Y); + egr::AutogradMeta* p_autograd_Bias = + egr::EagerUtils::nullable_autograd_meta(Bias); + + bool trace_backward = egr::Controller::Instance().HasGrad(); + + bool require_any_grad = egr::EagerUtils::ComputeRequireGrad( + trace_backward, p_autograd_X, p_autograd_Y, p_autograd_Bias); + + paddle::framework::AttributeMap attrs = attr_map; + paddle::framework::AttributeMap default_attrs; + egr::Controller::Instance().GetCurrentTracer()->TraceOp( + "fused_gemm_epilogue", + ins, + outs, + attrs, + egr::Controller::Instance().GetExpectedPlace(), + &default_attrs, + true, + {}); + + paddle::experimental::Tensor Out; + egr::EagerUtils::GetOutput(outs["Out"][0], &Out); + + { + paddle::platform::RecordEvent node_creation_record_event( + "fused_gemm_epilogue node_creation", + paddle::platform::TracerEventType::OperatorInner, + 1); + egr::AutogradMeta* p_autograd_Out = egr::EagerUtils::autograd_meta(&Out); + if (require_any_grad) { + VLOG(6) << " Construct Grad for fused_gemm_epilogue "; + egr::EagerUtils::PassStopGradient(false, p_autograd_Out); + // Create GradOpNode + auto grad_node = std::shared_ptr( + new fused_gemm_epilogueGradNodeCompat(1, 3)); + + // Set Attributes + grad_node->SetAttrMap(std::move(attrs)); + grad_node->SetDefaultAttrMap(std::move(default_attrs)); + + // Set Tensor Wrappers + 
+      grad_node->SetTensorWrapperX(X);
+      grad_node->SetTensorWrapperY(Y);
+
+      grad_node->SetGradOutMeta(X, 0);
+      grad_node->SetGradOutMeta(Y, 1);
+      grad_node->SetGradOutMeta(Bias, 2);
+      egr::EagerUtils::SetOutRankWithSlot(p_autograd_Out, 0);
+      egr::EagerUtils::SetHistory(p_autograd_Out, grad_node);
+      grad_node->SetGradInMeta(Out, 0);
+      egr::EagerUtils::CheckAndRetainGrad(Out);
+    }
+  }
+
+  return Out;
+}
diff --git a/paddle/fluid/eager/api/manual/fluid_manual/nodes/CMakeLists.txt b/paddle/fluid/eager/api/manual/fluid_manual/nodes/CMakeLists.txt
index 101ed5d5890754036d734fbd5ebf83ddfe0fa6a2..659c7b2dab0beb8f48ec2f01fe91ebffc4c93be9 100644
--- a/paddle/fluid/eager/api/manual/fluid_manual/nodes/CMakeLists.txt
+++ b/paddle/fluid/eager/api/manual/fluid_manual/nodes/CMakeLists.txt
@@ -2,4 +2,5 @@ set(fluid_manual_nodes
     ${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/manual/fluid_manual/nodes/fused_gate_attention_node.cc
     ${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/manual/fluid_manual/nodes/fused_feedforward_node.cc
     ${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/manual/fluid_manual/nodes/fused_attention_node.cc
+    ${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/manual/fluid_manual/nodes/fused_gemm_epilogue_node.cc
     PARENT_SCOPE)
diff --git a/paddle/fluid/eager/api/manual/fluid_manual/nodes/fused_gemm_epilogue_node.cc b/paddle/fluid/eager/api/manual/fluid_manual/nodes/fused_gemm_epilogue_node.cc
new file mode 100644
index 0000000000000000000000000000000000000000..68c183120d401c71b66fa3e3f09982e0b64a5f0e
--- /dev/null
+++ b/paddle/fluid/eager/api/manual/fluid_manual/nodes/fused_gemm_epilogue_node.cc
@@ -0,0 +1,88 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
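+//
+// Hand-written backward node for fused_gemm_epilogue: traces the
+// fused_gemm_epilogue_grad op over DOut and the saved X and Y to
+// produce DX, DY and DBias.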
+ +#include "glog/logging.h" +#include "paddle/fluid/eager/api/manual/fluid_manual/nodes/nodes.h" +#include "paddle/fluid/eager/api/utils/global_utils.h" +#include "paddle/fluid/eager/utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/imperative/tracer.h" +#include "paddle/phi/api/all.h" + +paddle::small_vector, + egr::kSlotSmallVectorSize> +fused_gemm_epilogueGradNodeCompat::operator()( + paddle::small_vector, + egr::kSlotSmallVectorSize>& grads, + bool create_graph, + bool is_new_grad) { + const auto& out_metas = OutputMeta(); + paddle::small_vector, + egr::kSlotSmallVectorSize> + outputs(3); + VLOG(3) << "Running Eager Backward Node: fused_gemm_epilogueGradNodeCompat"; + paddle::small_vector, + egr::kSlotSmallVectorSize> + hooked_grads0 = + fused_gemm_epilogueGradNodeCompat::ApplyGradientHooks(grads); + std::map>> ins0 = + {{"DOut", egr::EagerUtils::TrySyncToVars(hooked_grads0[0])}, + {"X", + egr::EagerUtils::TrySyncToVars( + egr::EagerUtils::RecoverTensorWrapper(&this->X_))}, + {"Y", + egr::EagerUtils::TrySyncToVars( + egr::EagerUtils::RecoverTensorWrapper(&this->Y_))}}; + std::map>> outs0; + if ((!out_metas[2].empty()) && (!(out_metas[2][0].IsStopGradient()))) { + outs0.insert({"DBias", + {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}}); + } + if ((!out_metas[0].empty()) && (!(out_metas[0][0].IsStopGradient()))) { + outs0.insert({"DX", + {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}}); + } + if ((!out_metas[1].empty()) && (!(out_metas[1][0].IsStopGradient()))) { + outs0.insert({"DY", + {std::make_shared( + egr::Controller::Instance().GenerateUniqueName())}}); + } + + auto& attrs_map0 = this->attr_map_; + // Pass the entire attribute map to TraceOp + // The underlying kernel will pickup whatever attribute they need at runtime + egr::Controller::Instance().GetCurrentTracer()->TraceOp( + "fused_gemm_epilogue_grad", + ins0, + outs0, + attrs_map0, + egr::Controller::Instance().GetExpectedPlace(), + &this->default_attr_map_, + true, + {}); + if (outs0.find("DBias") != outs0.end()) { + outputs[2] = egr::EagerUtils::GetOutputs(outs0["DBias"]); + } + if (outs0.find("DX") != outs0.end()) { + outputs[0] = egr::EagerUtils::GetOutputs(outs0["DX"]); + } + if (outs0.find("DY") != outs0.end()) { + outputs[1] = egr::EagerUtils::GetOutputs(outs0["DY"]); + } + + if (NeedComplexToRealConversion()) HandleComplexGradToRealGrad(&outputs); + return outputs; +} diff --git a/paddle/fluid/eager/api/manual/fluid_manual/nodes/nodes.h b/paddle/fluid/eager/api/manual/fluid_manual/nodes/nodes.h index 571deb4e9ca74224ff1fe00f44960280fc941b96..32389e553d03c44c6ed350bb8ec40dd2156f03e4 100644 --- a/paddle/fluid/eager/api/manual/fluid_manual/nodes/nodes.h +++ b/paddle/fluid/eager/api/manual/fluid_manual/nodes/nodes.h @@ -531,3 +531,67 @@ class fused_attentionGradNodeCompat : public egr::GradNodeBase { paddle::framework::AttributeMap attr_map_; paddle::framework::AttributeMap default_attr_map_; }; + +class fused_gemm_epilogueGradNodeCompat : public egr::GradNodeBase { + public: + fused_gemm_epilogueGradNodeCompat() : egr::GradNodeBase() { + VLOG(7) << " Construct fused_gemm_epilogueGradNodeCompat "; + } + fused_gemm_epilogueGradNodeCompat(size_t bwd_in_slot_num, + size_t bwd_out_slot_num) + : egr::GradNodeBase(bwd_in_slot_num, bwd_out_slot_num) { + VLOG(7) << " Construct fused_gemm_epilogueGradNodeCompat "; + } + ~fused_gemm_epilogueGradNodeCompat() override { + VLOG(6) << " Destruct fused_gemm_epilogueGradNodeCompat "; + } + + virtual 
+  virtual paddle::small_vector<std::vector<paddle::experimental::Tensor>,
+                               egr::kSlotSmallVectorSize>
+  operator()(
+      paddle::small_vector<std::vector<paddle::experimental::Tensor>,  // NOLINT
+                           egr::kSlotSmallVectorSize>& grads,  // NOLINT
+      bool create_graph = false,
+      bool is_new_grad = false) override;
+
+  void ClearTensorWrappers() override {
+    X_.clear();
+    Y_.clear();
+
+    SetIsTensorWrappersCleared(true);
+  }
+  std::string name() override { return "fused_gemm_epilogueGradNodeCompat"; }
+
+  std::shared_ptr<egr::GradNodeBase> Copy() const override {
+    {
+      auto copied_node = std::shared_ptr<fused_gemm_epilogueGradNodeCompat>(
+          new fused_gemm_epilogueGradNodeCompat(*this));
+      return copied_node;
+    }
+  }
+
+  // SetX, SetY, ...
+  void SetTensorWrapperX(const paddle::experimental::Tensor& X) {
+    X_ = egr::TensorWrapper(X, false);
+  }
+  void SetTensorWrapperY(const paddle::experimental::Tensor& Y) {
+    Y_ = egr::TensorWrapper(Y, false);
+  }
+
+  // SetAttrMap
+  void SetAttrMap(paddle::framework::AttributeMap&& attr_map) {
+    attr_map_ = std::move(attr_map);
+  }
+  void SetDefaultAttrMap(paddle::framework::AttributeMap&& default_attr_map) {
+    default_attr_map_ = std::move(default_attr_map);
+  }
+
+ private:
+  // TensorWrappers
+  egr::TensorWrapper X_;
+  egr::TensorWrapper Y_;
+
+  // Attribute Map
+  paddle::framework::AttributeMap attr_map_;
+  paddle::framework::AttributeMap default_attr_map_;
+};
diff --git a/paddle/fluid/eager/auto_code_generator/eager_generator.cc b/paddle/fluid/eager/auto_code_generator/eager_generator.cc
index b4461740f7843383a3740c1182402b6eaf02c50b..519e0a1ed567a310d43bf721f1d2ddf73e911d72 100644
--- a/paddle/fluid/eager/auto_code_generator/eager_generator.cc
+++ b/paddle/fluid/eager/auto_code_generator/eager_generator.cc
@@ -54,7 +54,8 @@ static std::unordered_set<std::string> ops_to_fill_zero_for_empty_grads = {
 static std::unordered_set<std::string> black_ops_list = {"run_program",
                                                          "fused_gate_attention",
                                                          "fused_feedforward",
-                                                         "fused_attention"};
+                                                         "fused_attention",
+                                                         "fused_gemm_epilogue"};
 
 static std::string LegalizeVariableName(const std::string& var_name) {
   std::string ret = var_name;
diff --git a/python/paddle/fluid/tests/unittests/test_fused_gemm_epilogue_op.py b/python/paddle/fluid/tests/unittests/test_fused_gemm_epilogue_op.py
index bd29ebbf12a35534023e793894fe6396ba81247e..6aa995efa8df0609726ed2462d08b0c21fe39e1c 100644
--- a/python/paddle/fluid/tests/unittests/test_fused_gemm_epilogue_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fused_gemm_epilogue_op.py
@@ -22,6 +22,13 @@ import paddle.fluid.core as core
 from op_test import OpTest, skip_check_grad_ci, skip_check_inplace_ci
 
 
+def is_fused_gemm_epilogue_supported():
+    if paddle.is_compiled_with_cuda() and not paddle.is_compiled_with_rocm():
+        return hasattr(core.eager.ops, 'fused_gemm_epilogue')
+    else:
+        return False
+
+
 def gelu(x):
     y_ref = 0.5 * x * (
         1.0 + np.tanh(np.sqrt(2 / np.pi) * (x + 0.044715 * np.power(x, 3))))
@@ -480,6 +487,92 @@ class TestFuseGemmEpilogueOpNoneMMFP64(TestFuseGemmEpilogueOpNoneMMFP16):
         self.atol = 1e-6
 
 
+def matmul(x, y, bias, trans_x, trans_y):
+    x = np.array(x)
+    if trans_x:
+        x = np.ascontiguousarray(np.transpose(x))
+    if trans_y:
+        y = np.ascontiguousarray(np.transpose(y))
+    z = np.matmul(x, y)
+    if bias is None:
+        return z
+    else:
+        return z + bias
+
+
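+# Reference gradients for z = op(x) @ op(y) + bias, where op() applies a
+# transpose when the matching trans_* flag is set. For the plain case
+# (trans_x=False, trans_y=False): dx = dz @ y^T, dy = x^T @ dz, and dbias
+# reduces dz over axis 0.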
+def matmul_grad(x, y, bias, dz, trans_x, trans_y):
+    if trans_x:
+        if trans_y:
+            dx = matmul(y, dz, None, True, True)
+            dy = matmul(dz, x, None, True, True)
+        else:
+            dx = matmul(y, dz, None, False, True)
+            dy = matmul(x, dz, None, False, False)
+    else:
+        if trans_y:
+            dx = matmul(dz, y, None, False, False)
+            dy = matmul(dz, x, None, True, False)
+        else:
+            dx = matmul(dz, y, None, False, True)
+            dy = matmul(x, dz, None, True, False)
+    if bias is None:
+        dbias = None
+    else:
+        dbias = np.sum(dz, axis=0, keepdims=False)
+    return dx, dy, dbias
+
+
+@unittest.skipIf(
+    not is_fused_gemm_epilogue_supported(),
+    "fused_gemm_epilogue is only supported when CUDA version >= 11.6")
+class TestEagerFusedGemmEpilogue(unittest.TestCase):
+
+    def setUp(self):
+        paddle.set_device('gpu')
+
+    def test_case_act(self):
+        paddle.disable_static()
+        x_np = np.random.random((8, 4)).astype(np.float64) - 0.5
+        y_np = np.random.random((4, 128)).astype(np.float64) - 0.5
+        bias_np = np.random.random((128, )).astype(np.float64) - 0.5
+        x = paddle.to_tensor(x_np)
+        y = paddle.to_tensor(y_np)
+        bias = paddle.to_tensor(bias_np)
+        x.stop_gradient = False
+        y.stop_gradient = False
+
+        out1 = core.eager.ops.fused_gemm_epilogue(x, y, bias, 'trans_x', False,
+                                                  'trans_y', False,
+                                                  'activation', 'none')
+        out2 = core.eager.ops.fused_gemm_epilogue(x, y, bias, 'trans_x', False,
+                                                  'trans_y', False,
+                                                  'activation', 'relu')
+        out3 = core.eager.ops.fused_gemm_epilogue(x, y, bias, 'trans_x', False,
+                                                  'trans_y', False,
+                                                  'activation', 'gelu')
+
+        out_np1 = get_output(x_np, y_np, bias_np, 'none')
+        out_np2 = get_output(x_np, y_np, bias_np, 'relu')
+        out_np3 = get_output(x_np, y_np, bias_np, 'gelu')
+
+        self.assertTrue(np.allclose(out1, out_np1))
+        self.assertTrue(np.allclose(out2, out_np2))
+        self.assertTrue(np.allclose(out3, out_np3))
+
+        out_grad_np1 = np.random.randint(low=-20, high=20,
+                                         size=out_np1.shape).astype(np.float64)
+        paddle.autograd.backward(out1,
+                                 grad_tensors=[paddle.to_tensor(out_grad_np1)])
+
+        x_grad_np, y_grad_np, bias_grad_np = matmul_grad(
+            x_np, y_np, bias_np, out_grad_np1, False, False)
+        self.assertTrue(np.allclose(x.grad.numpy(), x_grad_np))
+        self.assertEqual(y_grad_np.shape, y_np.shape)
+        self.assertTrue(np.allclose(y.grad.numpy(), y_grad_np))
+
+        paddle.enable_static()
+
+
 if __name__ == "__main__":
     paddle.enable_static()
     np.random.seed(0)