Unverified commit a2980169, authored by xiaoguoguo626807 and committed by GitHub

[Eager]Manual fused_gemm_epilogue (#44748)

* manuel_fused_gemm_epilogue
Parent 942ff89f
...@@ -101,3 +101,9 @@ fused_attention_dygraph_function(
const paddle::experimental::Tensor& Ln2Scale,
const paddle::experimental::Tensor& Ln2Bias,
const paddle::framework::AttributeMap& attr_map);
paddle::experimental::Tensor fused_gemm_epilogue_dygraph_function(
const paddle::experimental::Tensor& X,
const paddle::experimental::Tensor& Y,
const paddle::experimental::Tensor& Bias,
const paddle::framework::AttributeMap& attr_map);
...@@ -2,4 +2,5 @@ set(fluid_manual_functions
${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_gate_attention_fwd_func.cc
${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_feedforward_fwd_func.cc
${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_attention_fwd_func.cc
${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_gemm_epilogue_fwd_func.cc
PARENT_SCOPE)
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/eager/accumulation/accumulation_node.h"
#include "paddle/fluid/eager/amp_auto_cast.h"
#include "paddle/fluid/eager/amp_utils.h"
#include "paddle/fluid/eager/api/manual/fluid_manual/dygraph_forward_api.h"
#include "paddle/fluid/eager/api/manual/fluid_manual/nodes/nodes.h"
#include "paddle/fluid/eager/api/utils/global_utils.h"
#include "paddle/fluid/platform/profiler/event_tracing.h"
#pragma GCC diagnostic ignored "-Wunused-variable"
paddle::experimental::Tensor fused_gemm_epilogue_dygraph_function(
const paddle::experimental::Tensor& X,
const paddle::experimental::Tensor& Y,
const paddle::experimental::Tensor& Bias,
const paddle::framework::AttributeMap& attr_map) {
paddle::platform::RecordEvent dygraph_entrance_record_event(
"fused_gemm_epilogue dygraph",
paddle::platform::TracerEventType::Operator,
1);
VLOG(3) << "Running Eager Forward Op: fused_gemm_epilogue";
// Dygraph Forward Pass
if (egr::Controller::Instance().GetAMPLevel() !=
paddle::imperative::AmpLevel::O0) {
VLOG(5) << "Check and Prepare For AMP";
paddle::small_vector<std::vector<paddle::experimental::Tensor>,
egr::kSlotSmallVectorSize>
amp_tensors_vector = {{X}, {Y}, {Bias}};
auto amp_dst_dtype =
egr::GetAmpDestDtype("fused_gemm_epilogue", amp_tensors_vector);
auto NEW_X = egr::AmpAutoCast("X", X, amp_dst_dtype, "fused_gemm_epilogue");
auto NEW_Y = egr::AmpAutoCast("Y", Y, amp_dst_dtype, "fused_gemm_epilogue");
auto NEW_Bias =
egr::AmpAutoCast("Bias", Bias, amp_dst_dtype, "fused_gemm_epilogue");
{
paddle::imperative::AutoCastGuard guard(
egr::Controller::Instance().GetCurrentTracer(),
paddle::imperative::AmpLevel::O0);
return fused_gemm_epilogue_dygraph_function(
NEW_X, NEW_Y, NEW_Bias, attr_map);
}
}
std::map<std::string, std::vector<std::shared_ptr<egr::EagerVariable>>> ins =
{{"X", egr::EagerUtils::TrySyncToVars(X)},
{"Y", egr::EagerUtils::TrySyncToVars(Y)},
{"Bias", egr::EagerUtils::TrySyncToVars(Bias)}};
std::map<std::string, std::vector<std::shared_ptr<egr::EagerVariable>>> outs =
{{"Out",
{std::make_shared<egr::EagerVariable>(
egr::Controller::Instance().GenerateUniqueName())}}};
// Prepare Autograd Meta
egr::AutogradMeta* p_autograd_X = egr::EagerUtils::nullable_autograd_meta(X);
egr::AutogradMeta* p_autograd_Y = egr::EagerUtils::nullable_autograd_meta(Y);
egr::AutogradMeta* p_autograd_Bias =
egr::EagerUtils::nullable_autograd_meta(Bias);
bool trace_backward = egr::Controller::Instance().HasGrad();
bool require_any_grad = egr::EagerUtils::ComputeRequireGrad(
trace_backward, p_autograd_X, p_autograd_Y, p_autograd_Bias);
paddle::framework::AttributeMap attrs = attr_map;
paddle::framework::AttributeMap default_attrs;
egr::Controller::Instance().GetCurrentTracer()->TraceOp(
"fused_gemm_epilogue",
ins,
outs,
attrs,
egr::Controller::Instance().GetExpectedPlace(),
&default_attrs,
true,
{});
paddle::experimental::Tensor Out;
egr::EagerUtils::GetOutput(outs["Out"][0], &Out);
{
paddle::platform::RecordEvent node_creation_record_event(
"fused_gemm_epilogue node_creation",
paddle::platform::TracerEventType::OperatorInner,
1);
egr::AutogradMeta* p_autograd_Out = egr::EagerUtils::autograd_meta(&Out);
if (require_any_grad) {
VLOG(6) << " Construct Grad for fused_gemm_epilogue ";
egr::EagerUtils::PassStopGradient(false, p_autograd_Out);
// Create GradOpNode
auto grad_node = std::shared_ptr<fused_gemm_epilogueGradNodeCompat>(
new fused_gemm_epilogueGradNodeCompat(1, 3));
// Set Attributes
grad_node->SetAttrMap(std::move(attrs));
grad_node->SetDefaultAttrMap(std::move(default_attrs));
// Set Tensor Wrappers
grad_node->SetTensorWrapperX(X);
grad_node->SetTensorWrapperY(Y);
grad_node->SetGradOutMeta(X, 0);
grad_node->SetGradOutMeta(Y, 1);
grad_node->SetGradOutMeta(Bias, 2);
egr::EagerUtils::SetOutRankWithSlot(p_autograd_Out, 0);
egr::EagerUtils::SetHistory(p_autograd_Out, grad_node);
grad_node->SetGradInMeta(Out, 0);
egr::EagerUtils::CheckAndRetainGrad(Out);
}
}
return Out;
}
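From the Python side, the fused op that this hand-written forward function dispatches to is exposed as core.eager.ops.fused_gemm_epilogue, as exercised by the unit test added later in this commit. A minimal call sketch mirroring that test (it assumes a CUDA build of Paddle in which the fused op is compiled in):

import numpy as np
import paddle
import paddle.fluid.core as core

paddle.set_device('gpu')  # fused_gemm_epilogue is only registered for CUDA builds
x = paddle.to_tensor(np.random.random((8, 4)).astype(np.float64) - 0.5)
y = paddle.to_tensor(np.random.random((4, 128)).astype(np.float64) - 0.5)
bias = paddle.to_tensor(np.random.random((128, )).astype(np.float64) - 0.5)
# Attributes are passed as flat (name, value) pairs, following the test below.
out = core.eager.ops.fused_gemm_epilogue(x, y, bias, 'trans_x', False,
                                         'trans_y', False,
                                         'activation', 'relu')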
...@@ -2,4 +2,5 @@ set(fluid_manual_nodes
${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/manual/fluid_manual/nodes/fused_gate_attention_node.cc
${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/manual/fluid_manual/nodes/fused_feedforward_node.cc
${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/manual/fluid_manual/nodes/fused_attention_node.cc
${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/manual/fluid_manual/nodes/fused_gemm_epilogue_node.cc
PARENT_SCOPE)
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "glog/logging.h"
#include "paddle/fluid/eager/api/manual/fluid_manual/nodes/nodes.h"
#include "paddle/fluid/eager/api/utils/global_utils.h"
#include "paddle/fluid/eager/utils.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/imperative/tracer.h"
#include "paddle/phi/api/all.h"
paddle::small_vector<std::vector<paddle::experimental::Tensor>,
egr::kSlotSmallVectorSize>
fused_gemm_epilogueGradNodeCompat::operator()(
paddle::small_vector<std::vector<paddle::experimental::Tensor>,
egr::kSlotSmallVectorSize>& grads,
bool create_graph,
bool is_new_grad) {
const auto& out_metas = OutputMeta();
paddle::small_vector<std::vector<paddle::experimental::Tensor>,
egr::kSlotSmallVectorSize>
outputs(3);
VLOG(3) << "Running Eager Backward Node: fused_gemm_epilogueGradNodeCompat";
paddle::small_vector<std::vector<paddle::experimental::Tensor>,
egr::kSlotSmallVectorSize>
hooked_grads0 =
fused_gemm_epilogueGradNodeCompat::ApplyGradientHooks(grads);
std::map<std::string, std::vector<std::shared_ptr<egr::EagerVariable>>> ins0 =
{{"DOut", egr::EagerUtils::TrySyncToVars(hooked_grads0[0])},
{"X",
egr::EagerUtils::TrySyncToVars(
egr::EagerUtils::RecoverTensorWrapper(&this->X_))},
{"Y",
egr::EagerUtils::TrySyncToVars(
egr::EagerUtils::RecoverTensorWrapper(&this->Y_))}};
std::map<std::string, std::vector<std::shared_ptr<egr::EagerVariable>>> outs0;
if ((!out_metas[2].empty()) && (!(out_metas[2][0].IsStopGradient()))) {
outs0.insert({"DBias",
{std::make_shared<egr::EagerVariable>(
egr::Controller::Instance().GenerateUniqueName())}});
}
if ((!out_metas[0].empty()) && (!(out_metas[0][0].IsStopGradient()))) {
outs0.insert({"DX",
{std::make_shared<egr::EagerVariable>(
egr::Controller::Instance().GenerateUniqueName())}});
}
if ((!out_metas[1].empty()) && (!(out_metas[1][0].IsStopGradient()))) {
outs0.insert({"DY",
{std::make_shared<egr::EagerVariable>(
egr::Controller::Instance().GenerateUniqueName())}});
}
auto& attrs_map0 = this->attr_map_;
// Pass the entire attribute map to TraceOp
// The underlying kernel will pickup whatever attribute they need at runtime
egr::Controller::Instance().GetCurrentTracer()->TraceOp(
"fused_gemm_epilogue_grad",
ins0,
outs0,
attrs_map0,
egr::Controller::Instance().GetExpectedPlace(),
&this->default_attr_map_,
true,
{});
if (outs0.find("DBias") != outs0.end()) {
outputs[2] = egr::EagerUtils::GetOutputs(outs0["DBias"]);
}
if (outs0.find("DX") != outs0.end()) {
outputs[0] = egr::EagerUtils::GetOutputs(outs0["DX"]);
}
if (outs0.find("DY") != outs0.end()) {
outputs[1] = egr::EagerUtils::GetOutputs(outs0["DY"]);
}
if (NeedComplexToRealConversion()) HandleComplexGradToRealGrad(&outputs);
return outputs;
}
...@@ -531,3 +531,67 @@ class fused_attentionGradNodeCompat : public egr::GradNodeBase {
paddle::framework::AttributeMap attr_map_;
paddle::framework::AttributeMap default_attr_map_;
};
class fused_gemm_epilogueGradNodeCompat : public egr::GradNodeBase {
public:
fused_gemm_epilogueGradNodeCompat() : egr::GradNodeBase() {
VLOG(7) << " Construct fused_gemm_epilogueGradNodeCompat ";
}
fused_gemm_epilogueGradNodeCompat(size_t bwd_in_slot_num,
size_t bwd_out_slot_num)
: egr::GradNodeBase(bwd_in_slot_num, bwd_out_slot_num) {
VLOG(7) << " Construct fused_gemm_epilogueGradNodeCompat ";
}
~fused_gemm_epilogueGradNodeCompat() override {
VLOG(6) << " Destruct fused_gemm_epilogueGradNodeCompat ";
}
virtual paddle::small_vector<std::vector<paddle::experimental::Tensor>,
egr::kSlotSmallVectorSize>
operator()(
paddle::small_vector<std::vector<paddle::experimental::Tensor>, // NOLINT
egr::kSlotSmallVectorSize>& grads, // NOLINT
bool create_graph = false,
bool is_new_grad = false) override;
void ClearTensorWrappers() override {
X_.clear();
Y_.clear();
SetIsTensorWrappersCleared(true);
}
std::string name() override { return "fused_gemm_epilogueGradNodeCompat"; }
std::shared_ptr<GradNodeBase> Copy() const override {
{
auto copied_node = std::shared_ptr<fused_gemm_epilogueGradNodeCompat>(
new fused_gemm_epilogueGradNodeCompat(*this));
return copied_node;
}
}
// SetX, SetY, ...
void SetTensorWrapperX(const paddle::experimental::Tensor& X) {
X_ = egr::TensorWrapper(X, false);
}
void SetTensorWrapperY(const paddle::experimental::Tensor& Y) {
Y_ = egr::TensorWrapper(Y, false);
}
// SetAttrMap
void SetAttrMap(paddle::framework::AttributeMap&& attr_map) {
attr_map_ = std::move(attr_map);
}
void SetDefaultAttrMap(paddle::framework::AttributeMap&& default_attr_map) {
default_attr_map_ = std::move(default_attr_map);
}
private:
// TensorWrappers
egr::TensorWrapper X_;
egr::TensorWrapper Y_;
// Attribute Map
paddle::framework::AttributeMap attr_map_;
paddle::framework::AttributeMap default_attr_map_;
};
...@@ -54,7 +54,8 @@ static std::unordered_set<std::string> ops_to_fill_zero_for_empty_grads = {
static std::unordered_set<std::string> black_ops_list = {"run_program",
"fused_gate_attention",
"fused_feedforward",
"fused_attention",
"fused_gemm_epilogue"};
static std::string LegalizeVariableName(const std::string& var_name) {
std::string ret = var_name;
......
...@@ -22,6 +22,13 @@ import paddle.fluid.core as core
from op_test import OpTest, skip_check_grad_ci, skip_check_inplace_ci
def is_fused_gemm_epilogue_supported():
    if paddle.is_compiled_with_cuda() and not paddle.is_compiled_with_rocm():
        return hasattr(core.eager.ops, 'fused_gemm_epilogue')
    else:
        return False
def gelu(x):
    y_ref = 0.5 * x * (
        1.0 + np.tanh(np.sqrt(2 / np.pi) * (x + 0.044715 * np.power(x, 3))))
...@@ -480,6 +487,92 @@ class TestFuseGemmEpilogueOpNoneMMFP64(TestFuseGemmEpilogueOpNoneMMFP16):
        self.atol = 1e-6
def matmul(x, y, bias, trans_x, trans_y):
    # NumPy reference for matmul + bias, honoring the trans_x/trans_y flags.
    x = np.array(x)
    if trans_x:
        x = np.ascontiguousarray(np.transpose(x))
    if trans_y:
        y = np.ascontiguousarray(np.transpose(y))
    z = np.matmul(x, y)
    if bias is None:
        return z
    else:
        return z + bias


def matmul_grad(x, y, bias, dz, trans_x, trans_y):
    # NumPy reference gradients of Z = matmul(X, Y) + Bias w.r.t. X, Y and Bias.
    if trans_x:
        if trans_y:
            dx = matmul(y, dz, None, True, True)
            dy = matmul(dz, x, None, True, True)
        else:
            dx = matmul(y, dz, None, False, True)
            dy = matmul(x, dz, None, False, False)
    else:
        if trans_y:
            dx = matmul(dz, y, None, False, False)
            dy = matmul(dz, x, None, True, False)
        else:
            dx = matmul(dz, y, None, False, True)
            dy = matmul(x, dz, None, True, False)
    if bias is None:
        dbias = None
    else:
        dbias = np.sum(dz, axis=0, keepdims=False)
    return dx, dy, dbias
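The test below compares against get_output(...), a reference helper defined earlier in this test file and not shown in this hunk. A minimal sketch of what such a reference could look like, assuming it computes matmul plus bias and then applies the named activation (the name get_output_ref and this exact helper are illustrative, not part of the diff; it reuses the gelu helper above):

def get_output_ref(x, y, bias, activation):
    # Hypothetical reference: Z = X @ Y + Bias, then the named activation.
    out = np.matmul(x, y) + bias
    if activation == 'relu':
        return np.maximum(out, 0)
    if activation == 'gelu':
        return gelu(out)
    return out  # 'none'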
@unittest.skipIf(
    not is_fused_gemm_epilogue_supported(),
    "fused_gemm_epilogue is only supported when CUDA version >= 11.6")
class TestEagerFusedGemmEpilogue(unittest.TestCase):

    def setUp(self):
        paddle.set_device('gpu')

    def test_case_act(self):
        paddle.disable_static()
        x_np = np.random.random((8, 4)).astype(np.float64) - 0.5
        y_np = np.random.random((4, 128)).astype(np.float64) - 0.5
        bias_np = np.random.random((128, )).astype(np.float64) - 0.5
        x = paddle.to_tensor(x_np)
        y = paddle.to_tensor(y_np)
        bias = paddle.to_tensor(bias_np)
        x.stop_gradient = False
        y.stop_gradient = False

        out1 = core.eager.ops.fused_gemm_epilogue(x, y, bias, 'trans_x', False,
                                                  'trans_y', False,
                                                  'activation', 'none')
        out2 = core.eager.ops.fused_gemm_epilogue(x, y, bias, 'trans_x', False,
                                                  'trans_y', False,
                                                  'activation', 'relu')
        out3 = core.eager.ops.fused_gemm_epilogue(x, y, bias, 'trans_x', False,
                                                  'trans_y', False,
                                                  'activation', 'gelu')

        out_np1 = get_output(x_np, y_np, bias_np, 'none')
        out_np2 = get_output(x_np, y_np, bias_np, 'relu')
        out_np3 = get_output(x_np, y_np, bias_np, 'gelu')

        self.assertTrue(np.allclose(out1, out_np1))
        self.assertTrue(np.allclose(out2, out_np2))
        self.assertTrue(np.allclose(out3, out_np3))

        out_grad_np1 = np.random.randint(
            low=-20, high=20, size=out_np1.shape).astype(np.float64)
        paddle.autograd.backward(out1,
                                 grad_tensors=[paddle.to_tensor(out_grad_np1)])

        x_grad_np, y_grad_np, bias_grad_np = matmul_grad(
            x_np, y_np, bias_np, out_grad_np1, False, False)
        self.assertTrue(np.allclose(x.grad.numpy(), x_grad_np))
        self.assertEqual(y_grad_np.shape, y_np.shape)
        self.assertTrue(np.allclose(y.grad.numpy(), y_grad_np))

        paddle.enable_static()
if __name__ == "__main__":
    paddle.enable_static()
    np.random.seed(0)
......