diff --git a/paddle/fluid/eager/auto_code_generator/eager_generator.cc b/paddle/fluid/eager/auto_code_generator/eager_generator.cc
index a2ffe5af632156529895d98f8c5561ced5a36bda..726e049e611509d5b403fa699a7633fdd90ab37b
--- a/paddle/fluid/eager/auto_code_generator/eager_generator.cc
+++ b/paddle/fluid/eager/auto_code_generator/eager_generator.cc
@@ -2011,8 +2011,7 @@ static std::string GenerateSingleOpBase(
           "egr::EagerUtils::TrySyncToVars(egr::EagerUtils::"
           "RecoverTensorWrapper("
           "&"
-          "this->%s, "
-          "nullptr)) },";
+          "this->%s)) },";
       ins_contents_str +=
           paddle::string::Sprintf(GRAD_INS_FWD_CONTENT_TEMPLATE,
                                   grad_input_name, struct_fwd_input_name);
@@ -2058,15 +2057,15 @@ static std::string GenerateSingleOpBase(
       const char* DISPENSABLE_GRAD_INS_FWD_CONTENT_TEMPLATE =
           "  if(this->%s.size() > 0) %s[\"%s\"] = "
           "egr::EagerUtils::TrySyncToVars(egr::EagerUtils::"
-          "RecoverTensorWrapper(&this->%s, nullptr));\n";
+          "RecoverTensorWrapper(&this->%s));\n";
       generated_grad_function_body += paddle::string::Sprintf(
           DISPENSABLE_GRAD_INS_FWD_CONTENT_TEMPLATE, struct_fwd_input_name,
           ins_name, grad_input_name, struct_fwd_input_name);
     } else {
       const char* DISPENSABLE_GRAD_INS_FWD_CONTENT_TEMPLATE =
-          "  auto %s = egr::EagerUtils::RecoverTensorWrapper(&this->%s, "
-          "nullptr);\n  if(%s.initialized()) %s[\"%s\"] = "
-          "egr::EagerUtils::TrySyncToVars(%s);\n";
+          "  auto %s = egr::EagerUtils::RecoverTensorWrapper(&this->%s);\n"
+          "  if(%s.initialized()) %s[\"%s\"] = "
+          "  egr::EagerUtils::TrySyncToVars(%s);\n";
       generated_grad_function_body += paddle::string::Sprintf(
           DISPENSABLE_GRAD_INS_FWD_CONTENT_TEMPLATE, grad_input_name,
           struct_fwd_input_name, grad_input_name, ins_name, grad_input_name,
diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py
index 29d727fc8cba5f023683bb7a6082c8ba6a06bf3a..4dbaf47b0c0b1d0a663327b5b2b8ee3dd9e43b70
--- a/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py
+++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py
@@ -23,7 +23,8 @@ import os
 ########################
 ops_to_fill_zero_for_empty_grads = set([
     "split_grad", "rnn_grad", "matmul_double_grad", "matmul_triple_grad",
-    "sigmoid_triple_grad, add_double_grad"
+    "sigmoid_double_grad", "sigmoid_triple_grad", "add_double_grad",
+    "add_triple_grad"
 ])
 
 # For API dispatch used at python-level
diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py
index 2154b5d6a4898308ac60f7b592ed34d580a804dd..bd31de520750d3f79c81d8d9d0e48b947955ae7f
--- a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py
+++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py
@@ -236,7 +236,7 @@ FORWARD_BODY_TEMPLATE = \
 {}
     // SetAttributes
 {}
-    // SetTensorWrappers
+    // Set TensorWrappers for Forward Inputs
 {}
     // SetGradOutMeta & SetEdges
 {}
@@ -245,6 +245,8 @@
 {}
 {}
 {}
+{}
+    // Set TensorWrappers for Forward Outputs
 {}
 }}
 """
@@ -720,7 +722,8 @@ class DygraphFunctionGeneratorBase(FunctionGeneratorBase):
         set_attributes_str = "\n".join(set_attributes_list)
 
         # SetTensorWrappers
-        set_tensor_wrappers_list = []
+        set_input_tensor_wrappers_list = []
+        set_output_tensor_wrappers_list = []
         num_fwd_outputs = len(forward_outputs_position_map.keys())
         for name, (atype, is_fwd_input,
                    pos) in backward_forward_inputs_map.items():
@@ -732,6 +735,7 @@ class DygraphFunctionGeneratorBase(FunctionGeneratorBase):
                     set_tensor_wrappers = f"{indent}if({name}.get_ptr() != nullptr) grad_node->SetTensorWrapper{name}(*({name}.get_ptr()), true);"
                 else:
                     set_tensor_wrappers = f"{indent}grad_node->SetTensorWrapper{name}({name}, {need_input_data});"
+                set_input_tensor_wrappers_list.append(set_tensor_wrappers)
             else:
                 if num_fwd_outputs > 1:
                     # Aligned with forward output position
@@ -743,8 +747,11 @@ class DygraphFunctionGeneratorBase(FunctionGeneratorBase):
                     set_tensor_wrappers = f"{indent}if({name}.get_ptr() != nullptr) grad_node->SetTensorWrapper{name}(*({name}.get_ptr()), false);"
                 else:
                     set_tensor_wrappers = f"{indent}grad_node->SetTensorWrapper{name}({name}, false);"
-            set_tensor_wrappers_list.append(set_tensor_wrappers)
-        set_tensor_wrappers_str = "\n".join(set_tensor_wrappers_list)
+            set_output_tensor_wrappers_list.append(set_tensor_wrappers)
+        set_input_tensor_wrappers_str = "\n".join(
+            set_input_tensor_wrappers_list)
+        set_output_tensor_wrappers_str = "\n".join(
+            set_output_tensor_wrappers_list)
 
         # SetGradOutMeta & SetEdges
         set_grad_out_meta_list = []
@@ -801,9 +808,10 @@ class DygraphFunctionGeneratorBase(FunctionGeneratorBase):
 
         self.node_creation_str = FORWARD_BODY_TEMPLATE.format(
             node_creation_event_str, pass_stop_gradient_args_str,
-            node_construction_str, set_attributes_str, set_tensor_wrappers_str,
-            set_grad_out_meta_str, set_edges_str, set_out_rank_str,
-            set_history_str, set_grad_in_meta_str, set_retain_grad_str)
+            node_construction_str, set_attributes_str,
+            set_input_tensor_wrappers_str, set_grad_out_meta_str, set_edges_str,
+            set_out_rank_str, set_history_str, set_grad_in_meta_str,
+            set_retain_grad_str, set_output_tensor_wrappers_str)
 
     def run(self):
         # Basic Validation Check
@@ -1296,7 +1304,7 @@ class DygraphNodeGenerator(DygraphFunctionGeneratorBase):
             transformed_tensor_name = self.TransformToNextGradName(name)
             is_optional = (name in self.optional_inputs)
 
-            tensor_wrapper_recover_str = f"{indent}auto {transformed_tensor_name} = egr::EagerUtils::RecoverTensorWrapper(&this->{tensor_wrapper_name}, this->shared_from_this());"
+            tensor_wrapper_recover_str = f"{indent}auto {transformed_tensor_name} = egr::EagerUtils::RecoverTensorWrapper(&this->{tensor_wrapper_name});"
             if is_optional:
                 tensor_wrapper_recover_str += "\n" + CREATE_RECOVER_OPTIONAL_TENSOR_TEMPLATE.format(
                     transformed_tensor_name, transformed_tensor_name,
diff --git a/paddle/fluid/eager/backward.cc b/paddle/fluid/eager/backward.cc
index 4afe8ff105e7611d9b275bee5bedf7b87edaca15..3b555eda8fff7d11ce2b4cb5270c57bbc1765319
--- a/paddle/fluid/eager/backward.cc
+++ b/paddle/fluid/eager/backward.cc
@@ -731,16 +731,6 @@ std::vector<paddle::experimental::Tensor> RunBackward(
           continue;
         }
 
-        auto* next_node = next_node_shared.get();
-        if (!node_input_buffers_dict.count(next_node)) {
-          const auto& input_meta = next_node->InputMeta();
-          auto grad_tensor_holder =
-              std::make_unique<GradTensorHolder>(input_meta);
-          VLOG(6) << "Construct GradTensorHolder for grad node: "
-                  << next_node->name();
-          node_input_buffers_dict[next_node] = std::move(grad_tensor_holder);
-        }
-
         PADDLE_ENFORCE_LT(
             j, grad_output_tensors[i].size(),
             paddle::platform::errors::Fatal(
@@ -760,8 +750,19 @@
                 << ", rank: " << j
                 << " 's name is: " << grad_output_tensor.name();
 
+        auto* next_node = next_node_shared.get();
+        if (!node_input_buffers_dict.count(next_node)) {
+          const auto& input_meta = next_node->InputMeta();
+          auto grad_tensor_holder =
+              std::make_unique<GradTensorHolder>(input_meta);
+          VLOG(6) << "Construct GradTensorHolder for grad node: "
+                  << next_node->name();
+          node_input_buffers_dict[next_node] = std::move(grad_tensor_holder);
+        }
+
         VLOG(6) << "Sum grad inputs for edge slot: " << edge_rank.first
                 << ", rank: " << edge_rank.second;
+
         node_input_buffers_dict[next_node]->add(
             edge_rank.first, edge_rank.second, grad_output_tensor);
diff --git a/paddle/fluid/eager/custom_operator/custom_operator_node.h b/paddle/fluid/eager/custom_operator/custom_operator_node.h
index c483dc0ebd17771ac7ef3a2e8b433fdc76d623b9..6db410fa0f1af6be0135b11cf389c0023818df0e
--- a/paddle/fluid/eager/custom_operator/custom_operator_node.h
+++ b/paddle/fluid/eager/custom_operator/custom_operator_node.h
@@ -59,7 +59,7 @@ class RunCustomOpNode : public GradNodeBase {
       std::vector<egr::TensorWrapper>* fwd_var) {
     std::vector<paddle::experimental::Tensor> res;
     for (size_t i = 0; i < fwd_var->size(); i++) {
-      res.emplace_back(fwd_var->at(i).recover(nullptr));
+      res.emplace_back(fwd_var->at(i).recover());
     }
     return res;
   }
diff --git a/paddle/fluid/eager/grad_node_info.cc b/paddle/fluid/eager/grad_node_info.cc
index 9b1743b4860267e20858c8191aec37826e70299f..5b4921320f6b0d8532e07f0c9809ca3e125bf969
--- a/paddle/fluid/eager/grad_node_info.cc
+++ b/paddle/fluid/eager/grad_node_info.cc
@@ -61,6 +61,10 @@ void GradNodeBase::AddEdges(std::vector<AutogradMeta*>* metas, size_t slot_id) {
       if (!node || !node.get()) {
        meta->SetGradNode(std::make_shared<egr::GradNodeAccumulation>(meta));
      }
+      VLOG(6) << "Add Edges for slot: " << slot_id << ", the Edge is from "
+              << this->name() << " (addr: " << this << ") "
+              << " to " << meta->GetMutableGradNode()->name()
+              << " (addr: " << meta->GetMutableGradNode().get() << ")";
 
       adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(),
                                        meta->OutRankInfo());
@@ -84,7 +88,9 @@ void GradNodeBase::AddEdges(AutogradMeta* meta, size_t slot_id) {
       meta->SetGradNode(std::make_shared<egr::GradNodeAccumulation>(meta));
     }
     VLOG(6) << "Add Edges for slot: " << slot_id << ", the Edge is from "
-            << this->name() << " to " << meta->GetMutableGradNode()->name();
+            << this->name() << " (addr: " << this << ") "
+            << " to " << meta->GetMutableGradNode()->name()
+            << " (addr: " << meta->GetMutableGradNode().get() << ")";
 
     adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(),
                                      meta->OutRankInfo());
diff --git a/paddle/fluid/eager/grad_tensor_holder.cc b/paddle/fluid/eager/grad_tensor_holder.cc
index 167717fb43166e300770db83b23fe24f1c74c7ad..27a8c6002e29d67f504bd1741998ec5e9350843e
--- a/paddle/fluid/eager/grad_tensor_holder.cc
+++ b/paddle/fluid/eager/grad_tensor_holder.cc
@@ -110,6 +110,7 @@ void GradTensorHolder::add(size_t slot_id, size_t rank,
                         "got tensor: %s is empty please check you network "
                         "and make sure it creates grads.",
                         t.name()));
+
   if (t.is_dense_tensor()) {
     if (buffer_tensor.is_dense_tensor()) {
       buffer_tensor = add_final_state_dygraph_function(t, buffer_tensor);
diff --git a/paddle/fluid/eager/tensor_wrapper.h b/paddle/fluid/eager/tensor_wrapper.h
index 3d5d3139de14c7548ee6320cbfea1d89c19e59dc..b5dd6b960b23a17b8c849e9f8040163df4773de9
--- a/paddle/fluid/eager/tensor_wrapper.h
+++ b/paddle/fluid/eager/tensor_wrapper.h
@@ -77,16 +77,17 @@ class TensorWrapper {
     intermidiate_tensor_.set_name(tensor.name() + "@Saved");
 
-    // If an output is marked "intermedaite", we won't create
-    // autograd_meta for it.
-    // In that case, simply skip OutRankInfo Copy
-    if (EagerUtils::nullable_autograd_meta(tensor)) {
-      out_rank_info_ = EagerUtils::OutRankInfo(tensor);
+    auto* tensor_autograd_meta = EagerUtils::nullable_autograd_meta(tensor);
+    if (tensor_autograd_meta) {
+      auto autograd_meta = std::make_shared<AutogradMeta>(
+          Edge(nullptr, EagerUtils::OutRankInfo(tensor)));
+      autograd_meta->SetStopGradient(tensor_autograd_meta->StopGradient());
+      intermidiate_tensor_.set_autograd_meta(autograd_meta);
+      weak_grad_node_ = tensor_autograd_meta->GetMutableGradNode();
     }
   }
 
-  paddle::experimental::Tensor recover(
-      const std::shared_ptr<GradNodeBase>& grad_node) {
+  paddle::experimental::Tensor recover() {
     VLOG(6) << "Recover tensor: " << intermidiate_tensor_.name()
             << " for wrapper";
     if (!intermidiate_tensor_.defined()) {
@@ -99,9 +100,20 @@
     // if it's full_reserved just return the full copy of tensor
     paddle::experimental::Tensor recovered_tensor = intermidiate_tensor_;
     if (!full_reserved_) {
-      std::shared_ptr<GradNodeBase> new_grad_node = grad_node;
-      auto p_ab_autograd_meta =
-          std::make_shared<AutogradMeta>(Edge(new_grad_node, out_rank_info_));
+      std::shared_ptr<GradNodeBase> new_grad_node = weak_grad_node_.lock();
+      if (new_grad_node) {
+        VLOG(3) << "Recovered TensorWrapper with GradNode "
+                << new_grad_node->name() << " addr: " << new_grad_node.get();
+      } else {
+        VLOG(3) << "Recovered TensorWrapper with Empth GradNode";
+      }
+      auto* intermediate_autograd_meta =
+          EagerUtils::unsafe_autograd_meta(intermidiate_tensor_);
+      auto p_ab_autograd_meta = std::make_shared<AutogradMeta>(
+          Edge(new_grad_node, intermediate_autograd_meta->OutRankInfo()));
+      p_ab_autograd_meta->SetStopGradient(
+          intermediate_autograd_meta->StopGradient());
+
       recovered_tensor.set_autograd_meta(
           std::static_pointer_cast<paddle::experimental::AbstractAutogradMeta>(
               p_ab_autograd_meta));
@@ -149,8 +161,8 @@
  private:
   bool full_reserved_ = false;
   bool no_need_buffer_ = false;
-  std::pair<size_t, size_t> out_rank_info_;
   paddle::experimental::Tensor intermidiate_tensor_;
+  std::weak_ptr<GradNodeBase> weak_grad_node_;
   uint32_t inplace_version_snapshot_ = 0;
 };
 }  // namespace egr
diff --git a/paddle/fluid/eager/tests/data_structure_tests/tensor_wrapper_test.cc b/paddle/fluid/eager/tests/data_structure_tests/tensor_wrapper_test.cc
index a0c75c02001378b0666d44b7ec6f7dada306f536..5f563edee39f17de8a2fc6cff8cb3c5b0c5f6e69
--- a/paddle/fluid/eager/tests/data_structure_tests/tensor_wrapper_test.cc
+++ b/paddle/fluid/eager/tests/data_structure_tests/tensor_wrapper_test.cc
@@ -41,7 +41,7 @@ TEST(TensorWrapper, Basic) {
   et1.set_autograd_meta(auto_grad0);
   et1.set_name("et1");
   auto tw0 = egr::TensorWrapper(et1, true);
-  auto recover_et1 = tw0.recover(std::make_shared<eager_test::GradTestNode>());
+  auto recover_et1 = tw0.recover();
   CHECK_EQ(recover_et1.name(), std::string("et1"));
   CHECK_EQ(egr::EagerUtils::OutRankInfo(recover_et1).first,
            egr::EagerUtils::OutRankInfo(et1).first);
@@ -67,7 +67,7 @@ TEST(TensorWrapper, Basic) {
   auto auto_grad1 = std::make_shared<egr::AutogradMeta>(edge1);
   et2.set_autograd_meta(auto_grad1);
   auto tw1 = egr::TensorWrapper(et2, false);
-  auto recover_et2 = tw1.recover(grad_test_node1);
+  auto recover_et2 = tw1.recover();
   CHECK_EQ(recover_et2.name(), std::string("et2@Saved"));
   CHECK_EQ(egr::EagerUtils::OutRankInfo(recover_et2).first,
            egr::EagerUtils::OutRankInfo(et2).first);
@@ -76,7 +76,5 @@ TEST(TensorWrapper, Basic) {
   // Test Raw recover
   paddle::experimental::Tensor et3;
   auto tw2 = egr::TensorWrapper(et3, true);
-  CHECK(
-      tw2.recover(std::make_shared<eager_test::GradTestNode>()).initialized() ==
-      false);
+  CHECK(tw2.recover().initialized() == false);
 }
diff --git a/paddle/fluid/eager/utils.cc b/paddle/fluid/eager/utils.cc
index 0847f691e822c0681dd503ac93f72d6f555da846..66d877f06e21d0e182aa101053dff9c03e5ff737
--- a/paddle/fluid/eager/utils.cc
+++ b/paddle/fluid/eager/utils.cc
@@ -360,16 +360,15 @@ void EagerUtils::Output2Result(
 }
 
 paddle::experimental::Tensor EagerUtils::RecoverTensorWrapper(
-    TensorWrapper* tw, const std::shared_ptr<GradNodeBase>& grad_node) {
-  return tw->recover(grad_node);
+    TensorWrapper* tw) {
+  return tw->recover();
 }
 
 std::vector<paddle::experimental::Tensor> EagerUtils::RecoverTensorWrapper(
-    std::vector<TensorWrapper>* tw,
-    const std::shared_ptr<GradNodeBase>& grad_node) {
+    std::vector<TensorWrapper>* tw) {
   std::vector<paddle::experimental::Tensor> ret;
   for (auto& t : *tw) {
-    ret.emplace_back(t.recover(grad_node));
+    ret.emplace_back(t.recover());
   }
   return ret;
 }
diff --git a/paddle/fluid/eager/utils.h b/paddle/fluid/eager/utils.h
index be534d4440561ac4dca5c3fbef7ff379ebe44fff..51a322c8524ac860b20cb43e6b266249790bd757
--- a/paddle/fluid/eager/utils.h
+++ b/paddle/fluid/eager/utils.h
@@ -174,11 +174,9 @@ class EagerUtils {
       const std::shared_ptr<EagerVariable>& view_output_var);
 
   // TensorWrapper Utils
-  static paddle::experimental::Tensor RecoverTensorWrapper(
-      TensorWrapper* tw, const std::shared_ptr<GradNodeBase>& grad_node);
+  static paddle::experimental::Tensor RecoverTensorWrapper(TensorWrapper* tw);
   static std::vector<paddle::experimental::Tensor> RecoverTensorWrapper(
-      std::vector<TensorWrapper>* tw,
-      const std::shared_ptr<GradNodeBase>& grad_node);
+      std::vector<TensorWrapper>* tw);
 
   // Intermidate needed remove this once we don't need legacy
   // Inner Method
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_triple_grad.py b/python/paddle/fluid/tests/unittests/test_imperative_triple_grad.py
index f0c5316412f1e4c24efba99c6fde20c91ae1a0ca..3a8a3a96e9a33d52e2407c5097d02d1b1eb09ea2
--- a/python/paddle/fluid/tests/unittests/test_imperative_triple_grad.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_triple_grad.py
@@ -209,7 +209,9 @@ class TestDygraphTripleGrad(TestCase):
         self.assertTrue(np.allclose(dddx_grad_actual, dddx_expected))
 
     def test_all_cases(self):
-        if _in_legacy_dygraph():
+        self.func_exception()
+        self.func_example_with_gradient_and_create_graph()
+        with _test_eager_guard():
             self.func_exception()
             self.func_example_with_gradient_and_create_graph()
 
@@ -296,7 +298,8 @@ class TestDygraphTripleGradBradcastCase(TestCase):
         self.assertTrue(np.allclose(dddx_grad_actual, dddx_expected))
 
     def test_all_cases(self):
-        if _in_legacy_dygraph():
+        self.func_example_with_gradient_and_create_graph()
+        with _test_eager_guard():
             self.func_example_with_gradient_and_create_graph()
 
diff --git a/python/paddle/utils/code_gen/backward.yaml b/python/paddle/utils/code_gen/backward.yaml
index 006b2bf698bd4637f0936e92b4bd75513e484ea4..703667ece5a1a73e93b4ca6cce89ab002dda71cd
--- a/python/paddle/utils/code_gen/backward.yaml
+++ b/python/paddle/utils/code_gen/backward.yaml
@@ -1411,7 +1411,7 @@
     func : GeneralTernaryGradInferMeta
     param : [out, fwd_grad_out, grad_grad_x]
   kernel :
-    func : sigmoid_double_grad
+    func : sigmoid_triple_grad
 
 - backward_api : silu_grad
   forward : silu (Tensor x) -> Tensor(out)