Unverified commit 0fb06e46 authored by Zhanlue Yang, committed by GitHub

[DoubleGrad] Enabled test_imperative_triple_grad test cases under eager_mode (#41612) (#41894)

* [DoubleGrad] Enabled double grad test cases in eager_mode for test_imperative_double_grad

* Fixed elementwise issue

* Addressed CI failures

* [DoubleGrad] Enabled test_imperative_triple_grad test cases under eager_mode

* Fixed minor issues
Parent 6115b016
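
Note: the following is a minimal illustrative sketch (not part of this patch) of the pattern the re-enabled triple-grad tests exercise: nesting paddle.grad with create_graph=True to build second- and third-order gradient graphs while running under the eager guard. The op and shape choices are assumptions for illustration only; third-order support depends on the registered higher-order kernels (e.g. matmul_triple_grad and the sigmoid_triple_grad kernel touched in the yaml change below).

import paddle
from paddle.fluid.framework import _test_eager_guard

def triple_grad_sketch():
    # Differentiable input (illustrative shape, not from the patch)
    x = paddle.rand([2, 2])
    x.stop_gradient = False
    out = paddle.nn.functional.sigmoid(paddle.matmul(x, x))

    # First-order gradient; keep the graph so it can be differentiated again
    dx, = paddle.grad([out], [x], create_graph=True)
    # Second-order gradient
    ddx, = paddle.grad([dx], [x], create_graph=True)
    # Third-order gradient; no further graph is needed
    dddx, = paddle.grad([ddx], [x], create_graph=False)
    return dddx

with _test_eager_guard():  # run in eager mode, as the enabled tests do
    print(triple_grad_sketch())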
......@@ -2011,8 +2011,7 @@ static std::string GenerateSingleOpBase(
"egr::EagerUtils::TrySyncToVars(egr::EagerUtils::"
"RecoverTensorWrapper("
"&"
"this->%s, "
"nullptr)) },";
"this->%s)) },";
ins_contents_str +=
paddle::string::Sprintf(GRAD_INS_FWD_CONTENT_TEMPLATE,
grad_input_name, struct_fwd_input_name);
......@@ -2058,15 +2057,15 @@ static std::string GenerateSingleOpBase(
const char* DISPENSABLE_GRAD_INS_FWD_CONTENT_TEMPLATE =
" if(this->%s.size() > 0) %s[\"%s\"] = "
"egr::EagerUtils::TrySyncToVars(egr::EagerUtils::"
"RecoverTensorWrapper(&this->%s, nullptr));\n";
"RecoverTensorWrapper(&this->%s));\n";
generated_grad_function_body += paddle::string::Sprintf(
DISPENSABLE_GRAD_INS_FWD_CONTENT_TEMPLATE, struct_fwd_input_name,
ins_name, grad_input_name, struct_fwd_input_name);
} else {
const char* DISPENSABLE_GRAD_INS_FWD_CONTENT_TEMPLATE =
" auto %s = egr::EagerUtils::RecoverTensorWrapper(&this->%s, "
"nullptr);\n if(%s.initialized()) %s[\"%s\"] = "
"egr::EagerUtils::TrySyncToVars(%s);\n";
" auto %s = egr::EagerUtils::RecoverTensorWrapper(&this->%s);\n"
" if(%s.initialized()) %s[\"%s\"] = "
" egr::EagerUtils::TrySyncToVars(%s);\n";
generated_grad_function_body += paddle::string::Sprintf(
DISPENSABLE_GRAD_INS_FWD_CONTENT_TEMPLATE, grad_input_name,
struct_fwd_input_name, grad_input_name, ins_name, grad_input_name,
......
......@@ -23,7 +23,8 @@ import os
########################
ops_to_fill_zero_for_empty_grads = set([
"split_grad", "rnn_grad", "matmul_double_grad", "matmul_triple_grad",
"sigmoid_triple_grad, add_double_grad"
"sigmoid_double_grad", "sigmoid_triple_grad", "add_double_grad",
"add_triple_grad"
])
# For API dispatch used at python-level
......
......@@ -236,7 +236,7 @@ FORWARD_BODY_TEMPLATE = \
{}
// SetAttributes
{}
// SetTensorWrappers
// Set TensorWrappers for Forward Inputs
{}
// SetGradOutMeta & SetEdges
{}
......@@ -245,6 +245,8 @@ FORWARD_BODY_TEMPLATE = \
{}
{}
{}
{}
// Set TensorWrappers for Forward Outputs
{}
}}
"""
......@@ -720,7 +722,8 @@ class DygraphFunctionGeneratorBase(FunctionGeneratorBase):
set_attributes_str = "\n".join(set_attributes_list)
# SetTensorWrappers
set_tensor_wrappers_list = []
set_input_tensor_wrappers_list = []
set_output_tensor_wrappers_list = []
num_fwd_outputs = len(forward_outputs_position_map.keys())
for name, (atype, is_fwd_input,
pos) in backward_forward_inputs_map.items():
......@@ -732,6 +735,7 @@ class DygraphFunctionGeneratorBase(FunctionGeneratorBase):
set_tensor_wrappers = f"{indent}if({name}.get_ptr() != nullptr) grad_node->SetTensorWrapper{name}(*({name}.get_ptr()), true);"
else:
set_tensor_wrappers = f"{indent}grad_node->SetTensorWrapper{name}({name}, {need_input_data});"
set_input_tensor_wrappers_list.append(set_tensor_wrappers)
else:
if num_fwd_outputs > 1:
# Aligned with forward output position
......@@ -743,8 +747,11 @@ class DygraphFunctionGeneratorBase(FunctionGeneratorBase):
set_tensor_wrappers = f"{indent}if({name}.get_ptr() != nullptr) grad_node->SetTensorWrapper{name}(*({name}.get_ptr()), false);"
else:
set_tensor_wrappers = f"{indent}grad_node->SetTensorWrapper{name}({name}, false);"
set_tensor_wrappers_list.append(set_tensor_wrappers)
set_tensor_wrappers_str = "\n".join(set_tensor_wrappers_list)
set_output_tensor_wrappers_list.append(set_tensor_wrappers)
set_input_tensor_wrappers_str = "\n".join(
set_input_tensor_wrappers_list)
set_output_tensor_wrappers_str = "\n".join(
set_output_tensor_wrappers_list)
# SetGradOutMeta & SetEdges
set_grad_out_meta_list = []
......@@ -801,9 +808,10 @@ class DygraphFunctionGeneratorBase(FunctionGeneratorBase):
self.node_creation_str = FORWARD_BODY_TEMPLATE.format(
node_creation_event_str, pass_stop_gradient_args_str,
node_construction_str, set_attributes_str, set_tensor_wrappers_str,
set_grad_out_meta_str, set_edges_str, set_out_rank_str,
set_history_str, set_grad_in_meta_str, set_retain_grad_str)
node_construction_str, set_attributes_str,
set_input_tensor_wrappers_str, set_grad_out_meta_str, set_edges_str,
set_out_rank_str, set_history_str, set_grad_in_meta_str,
set_retain_grad_str, set_output_tensor_wrappers_str)
def run(self):
# Basic Validation Check
......@@ -1296,7 +1304,7 @@ class DygraphNodeGenerator(DygraphFunctionGeneratorBase):
transformed_tensor_name = self.TransformToNextGradName(name)
is_optional = (name in self.optional_inputs)
tensor_wrapper_recover_str = f"{indent}auto {transformed_tensor_name} = egr::EagerUtils::RecoverTensorWrapper(&this->{tensor_wrapper_name}, this->shared_from_this());"
tensor_wrapper_recover_str = f"{indent}auto {transformed_tensor_name} = egr::EagerUtils::RecoverTensorWrapper(&this->{tensor_wrapper_name});"
if is_optional:
tensor_wrapper_recover_str += "\n" + CREATE_RECOVER_OPTIONAL_TENSOR_TEMPLATE.format(
transformed_tensor_name, transformed_tensor_name,
......
......@@ -731,16 +731,6 @@ std::vector<paddle::experimental::Tensor> RunBackward(
continue;
}
auto* next_node = next_node_shared.get();
if (!node_input_buffers_dict.count(next_node)) {
const auto& input_meta = next_node->InputMeta();
auto grad_tensor_holder =
std::make_unique<GradTensorHolder>(input_meta);
VLOG(6) << "Construct GradTensorHolder for grad node: "
<< next_node->name();
node_input_buffers_dict[next_node] = std::move(grad_tensor_holder);
}
PADDLE_ENFORCE_LT(
j, grad_output_tensors[i].size(),
paddle::platform::errors::Fatal(
......@@ -760,8 +750,19 @@ std::vector<paddle::experimental::Tensor> RunBackward(
<< ", rank: " << j
<< " 's name is: " << grad_output_tensor.name();
auto* next_node = next_node_shared.get();
if (!node_input_buffers_dict.count(next_node)) {
const auto& input_meta = next_node->InputMeta();
auto grad_tensor_holder =
std::make_unique<GradTensorHolder>(input_meta);
VLOG(6) << "Construct GradTensorHolder for grad node: "
<< next_node->name();
node_input_buffers_dict[next_node] = std::move(grad_tensor_holder);
}
VLOG(6) << "Sum grad inputs for edge slot: " << edge_rank.first
<< ", rank: " << edge_rank.second;
node_input_buffers_dict[next_node]->add(
edge_rank.first, edge_rank.second, grad_output_tensor);
......
......@@ -59,7 +59,7 @@ class RunCustomOpNode : public GradNodeBase {
std::vector<egr::TensorWrapper>* fwd_var) {
std::vector<paddle::experimental::Tensor> res;
for (size_t i = 0; i < fwd_var->size(); i++) {
res.emplace_back(fwd_var->at(i).recover(nullptr));
res.emplace_back(fwd_var->at(i).recover());
}
return res;
}
......
......@@ -61,6 +61,10 @@ void GradNodeBase::AddEdges(std::vector<AutogradMeta*>* metas, size_t slot_id) {
if (!node || !node.get()) {
meta->SetGradNode(std::make_shared<egr::GradNodeAccumulation>(meta));
}
VLOG(6) << "Add Edges for slot: " << slot_id << ", the Edge is from "
<< this->name() << " (addr: " << this << ") "
<< " to " << meta->GetMutableGradNode()->name()
<< " (addr: " << meta->GetMutableGradNode().get() << ")";
adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(),
meta->OutRankInfo());
......@@ -84,7 +88,9 @@ void GradNodeBase::AddEdges(AutogradMeta* meta, size_t slot_id) {
meta->SetGradNode(std::make_shared<egr::GradNodeAccumulation>(meta));
}
VLOG(6) << "Add Edges for slot: " << slot_id << ", the Edge is from "
<< this->name() << " to " << meta->GetMutableGradNode()->name();
<< this->name() << " (addr: " << this << ") "
<< " to " << meta->GetMutableGradNode()->name()
<< " (addr: " << meta->GetMutableGradNode().get() << ")";
adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(),
meta->OutRankInfo());
......
......@@ -110,6 +110,7 @@ void GradTensorHolder::add(size_t slot_id, size_t rank,
"got tensor: %s is empty please check you network "
"and make sure it creates grads.",
t.name()));
if (t.is_dense_tensor()) {
if (buffer_tensor.is_dense_tensor()) {
buffer_tensor = add_final_state_dygraph_function(t, buffer_tensor);
......
......@@ -77,16 +77,17 @@ class TensorWrapper {
intermidiate_tensor_.set_name(tensor.name() + "@Saved");
// If an output is marked "intermedaite", we won't create
// autograd_meta for it.
// In that case, simply skip OutRankInfo Copy
if (EagerUtils::nullable_autograd_meta(tensor)) {
out_rank_info_ = EagerUtils::OutRankInfo(tensor);
auto* tensor_autograd_meta = EagerUtils::nullable_autograd_meta(tensor);
if (tensor_autograd_meta) {
auto autograd_meta = std::make_shared<AutogradMeta>(
Edge(nullptr, EagerUtils::OutRankInfo(tensor)));
autograd_meta->SetStopGradient(tensor_autograd_meta->StopGradient());
intermidiate_tensor_.set_autograd_meta(autograd_meta);
weak_grad_node_ = tensor_autograd_meta->GetMutableGradNode();
}
}
paddle::experimental::Tensor recover(
const std::shared_ptr<GradNodeBase>& grad_node) {
paddle::experimental::Tensor recover() {
VLOG(6) << "Recover tensor: " << intermidiate_tensor_.name()
<< " for wrapper";
if (!intermidiate_tensor_.defined()) {
......@@ -99,9 +100,20 @@ class TensorWrapper {
// if it's full_reserved just return the full copy of tensor
paddle::experimental::Tensor recovered_tensor = intermidiate_tensor_;
if (!full_reserved_) {
std::shared_ptr<GradNodeBase> new_grad_node = grad_node;
auto p_ab_autograd_meta =
std::make_shared<AutogradMeta>(Edge(new_grad_node, out_rank_info_));
std::shared_ptr<GradNodeBase> new_grad_node = weak_grad_node_.lock();
if (new_grad_node) {
VLOG(3) << "Recovered TensorWrapper with GradNode "
<< new_grad_node->name() << " addr: " << new_grad_node.get();
} else {
VLOG(3) << "Recovered TensorWrapper with Empth GradNode";
}
auto* intermediate_autograd_meta =
EagerUtils::unsafe_autograd_meta(intermidiate_tensor_);
auto p_ab_autograd_meta = std::make_shared<AutogradMeta>(
Edge(new_grad_node, intermediate_autograd_meta->OutRankInfo()));
p_ab_autograd_meta->SetStopGradient(
intermediate_autograd_meta->StopGradient());
recovered_tensor.set_autograd_meta(
std::static_pointer_cast<paddle::experimental::AbstractAutogradMeta>(
p_ab_autograd_meta));
......@@ -149,8 +161,8 @@ class TensorWrapper {
private:
bool full_reserved_ = false;
bool no_need_buffer_ = false;
std::pair<size_t, size_t> out_rank_info_;
paddle::experimental::Tensor intermidiate_tensor_;
std::weak_ptr<egr::GradNodeBase> weak_grad_node_;
uint32_t inplace_version_snapshot_ = 0;
};
} // namespace egr
......@@ -41,7 +41,7 @@ TEST(TensorWrapper, Basic) {
et1.set_autograd_meta(auto_grad0);
et1.set_name("et1");
auto tw0 = egr::TensorWrapper(et1, true);
auto recover_et1 = tw0.recover(std::make_shared<eager_test::GradTestNode>());
auto recover_et1 = tw0.recover();
CHECK_EQ(recover_et1.name(), std::string("et1"));
CHECK_EQ(egr::EagerUtils::OutRankInfo(recover_et1).first,
egr::EagerUtils::OutRankInfo(et1).first);
......@@ -67,7 +67,7 @@ TEST(TensorWrapper, Basic) {
auto auto_grad1 = std::make_shared<egr::AutogradMeta>(edge1);
et2.set_autograd_meta(auto_grad1);
auto tw1 = egr::TensorWrapper(et2, false);
auto recover_et2 = tw1.recover(grad_test_node1);
auto recover_et2 = tw1.recover();
CHECK_EQ(recover_et2.name(), std::string("et2@Saved"));
CHECK_EQ(egr::EagerUtils::OutRankInfo(recover_et2).first,
egr::EagerUtils::OutRankInfo(et2).first);
......@@ -76,7 +76,5 @@ TEST(TensorWrapper, Basic) {
// Test Raw recover
paddle::experimental::Tensor et3;
auto tw2 = egr::TensorWrapper(et3, true);
CHECK(
tw2.recover(std::make_shared<eager_test::GradTestNode>()).initialized() ==
false);
CHECK(tw2.recover().initialized() == false);
}
......@@ -360,16 +360,15 @@ void EagerUtils::Output2Result(
}
paddle::experimental::Tensor EagerUtils::RecoverTensorWrapper(
TensorWrapper* tw, const std::shared_ptr<GradNodeBase>& grad_node) {
return tw->recover(grad_node);
TensorWrapper* tw) {
return tw->recover();
}
std::vector<paddle::experimental::Tensor> EagerUtils::RecoverTensorWrapper(
std::vector<TensorWrapper>* tw,
const std::shared_ptr<GradNodeBase>& grad_node) {
std::vector<TensorWrapper>* tw) {
std::vector<paddle::experimental::Tensor> ret;
for (auto& t : *tw) {
ret.emplace_back(t.recover(grad_node));
ret.emplace_back(t.recover());
}
return ret;
}
......
......@@ -174,11 +174,9 @@ class EagerUtils {
const std::shared_ptr<EagerVariable>& view_output_var);
// TensorWrapper Utils
static paddle::experimental::Tensor RecoverTensorWrapper(
TensorWrapper* tw, const std::shared_ptr<GradNodeBase>& grad_node);
static paddle::experimental::Tensor RecoverTensorWrapper(TensorWrapper* tw);
static std::vector<paddle::experimental::Tensor> RecoverTensorWrapper(
std::vector<TensorWrapper>* tw,
const std::shared_ptr<GradNodeBase>& grad_node);
std::vector<TensorWrapper>* tw);
// Intermidate needed remove this once we don't need legacy
// Inner Method
......
......@@ -209,7 +209,9 @@ class TestDygraphTripleGrad(TestCase):
self.assertTrue(np.allclose(dddx_grad_actual, dddx_expected))
def test_all_cases(self):
if _in_legacy_dygraph():
self.func_exception()
self.func_example_with_gradient_and_create_graph()
with _test_eager_guard():
self.func_exception()
self.func_example_with_gradient_and_create_graph()
......@@ -296,7 +298,8 @@ class TestDygraphTripleGradBradcastCase(TestCase):
self.assertTrue(np.allclose(dddx_grad_actual, dddx_expected))
def test_all_cases(self):
if _in_legacy_dygraph():
self.func_example_with_gradient_and_create_graph()
with _test_eager_guard():
self.func_example_with_gradient_and_create_graph()
......
......@@ -1411,7 +1411,7 @@
func : GeneralTernaryGradInferMeta
param : [out, fwd_grad_out, grad_grad_x]
kernel :
func : sigmoid_double_grad
func : sigmoid_triple_grad
- backward_api : silu_grad
forward : silu (Tensor x) -> Tensor(out)
......