From a456dda6a3a83b410ef51e38d8b814b747c6af36 Mon Sep 17 00:00:00 2001 From: Weilong Wu Date: Sat, 26 Feb 2022 10:37:12 +0800 Subject: [PATCH] [Eager Hook] Support GradientHook and ReduceHook, expose related interface to python (#39893) * Support Eager Hook, expose interface to python * Fix CI issue --- .../eager/accumulation/accumulation_node.cc | 6 +- .../eager/accumulation/accumulation_node.h | 5 +- paddle/fluid/eager/api/utils/hook_utils.cc | 51 +++--- paddle/fluid/eager/api/utils/hook_utils.h | 8 +- .../auto_code_generator/eager_generator.cc | 7 +- paddle/fluid/eager/grad_node_info.cc | 25 ++- paddle/fluid/eager/grad_node_info.h | 31 +++- paddle/fluid/eager/hooks.h | 63 +++++++ .../accumulation_node_test.cc | 7 +- .../grad_node_info_test.cc | 21 ++- .../tests/task_tests/fwd_bwd_joint_test.cc | 20 +- .../fluid/eager/tests/task_tests/hook_test.cc | 25 +-- .../task_tests/hook_test_intermidiate.cc | 84 ++++++--- paddle/fluid/pybind/eager_method.cc | 172 ++++++++++++++++++ .../fluid/dygraph/varbase_patch_methods.py | 5 +- .../unittests/test_tensor_register_hook.py | 94 ++++++++-- 16 files changed, 488 insertions(+), 136 deletions(-) create mode 100644 paddle/fluid/eager/hooks.h diff --git a/paddle/fluid/eager/accumulation/accumulation_node.cc b/paddle/fluid/eager/accumulation/accumulation_node.cc index 2e377e43ca3..3a2ec403c0a 100644 --- a/paddle/fluid/eager/accumulation/accumulation_node.cc +++ b/paddle/fluid/eager/accumulation/accumulation_node.cc @@ -76,13 +76,13 @@ operator()( } void GradNodeAccumulation::RegisterReduceHook( - const std::function& hook) { - reduce_hooks_.emplace_back(hook); + std::shared_ptr&& hook) { + reduce_hooks_.emplace_back(std::move(hook)); } void GradNodeAccumulation::ApplyReduceHooks() { for (auto& hook : reduce_hooks_) { - hook(); + (*hook)(); } } } // namespace egr diff --git a/paddle/fluid/eager/accumulation/accumulation_node.h b/paddle/fluid/eager/accumulation/accumulation_node.h index 787149ab305..734cabdc3dc 100644 --- a/paddle/fluid/eager/accumulation/accumulation_node.h +++ b/paddle/fluid/eager/accumulation/accumulation_node.h @@ -16,6 +16,7 @@ #include "paddle/fluid/eager/autograd_meta.h" #include "paddle/fluid/eager/grad_node_info.h" +#include "paddle/fluid/eager/hooks.h" namespace egr { @@ -39,7 +40,7 @@ class GradNodeAccumulation : public GradNodeBase { /** * Register ReduceHook * **/ - void RegisterReduceHook(const std::function& hook); + void RegisterReduceHook(std::shared_ptr&& hook); /** * Apply ReduceHook here @@ -54,7 +55,7 @@ class GradNodeAccumulation : public GradNodeBase { const paddle::experimental::Tensor&)> retain_grad_hook_; - std::vector> reduce_hooks_; + std::vector> reduce_hooks_; }; } // namespace egr diff --git a/paddle/fluid/eager/api/utils/hook_utils.cc b/paddle/fluid/eager/api/utils/hook_utils.cc index 748afe6d1f3..c7927716300 100644 --- a/paddle/fluid/eager/api/utils/hook_utils.cc +++ b/paddle/fluid/eager/api/utils/hook_utils.cc @@ -22,19 +22,19 @@ namespace egr { namespace egr_utils_api { -void RegisterGradientHookForTensor( +int64_t RegisterGradientHookForTensor( const paddle::experimental::Tensor& tensor, - std::function& hook) { + std::shared_ptr&& hook) { // Find grad_node and out_rank from AutogradMeta std::shared_ptr grad_node = EagerUtils::grad_node(tensor); auto rank_info = EagerUtils::unsafe_autograd_meta(tensor)->OutRankInfo(); - grad_node->RegisterGradientHook(rank_info.first, rank_info.second, hook); + return grad_node->RegisterGradientHook(rank_info.first, rank_info.second, + std::move(hook)); } void 
RegisterReduceHookForTensor(const paddle::experimental::Tensor& tensor, - const std::function& hook) { + std::shared_ptr&& hook) { if (IsLeafTensor(tensor)) { VLOG(6) << "Register ReduceHook for leaf tensor"; std::shared_ptr grad_node = EagerUtils::grad_node(tensor); @@ -45,7 +45,7 @@ void RegisterReduceHookForTensor(const paddle::experimental::Tensor& tensor, "with type: GradNodeAccumulation")); auto accumulation_grad_node = std::dynamic_pointer_cast(grad_node); - accumulation_grad_node->RegisterReduceHook(hook); + accumulation_grad_node->RegisterReduceHook(std::move(hook)); } else { PADDLE_THROW(paddle::platform::errors::Fatal( "Only can register reduce hook for leaf Tensor.")); @@ -65,28 +65,27 @@ static void RetainGradForRegularNode( meta->WeakGrad(); // Define Hook - std::function - hook = [weak_grad_tensor](const paddle::experimental::Tensor& t) { - if (!weak_grad_tensor.expired()) { - auto grad_tensor = weak_grad_tensor.lock(); - if (t.defined()) { - VLOG(7) << "Set impl for RetainGrad Hook for tensor: " << t.name(); - // Simply Copy impl() to grad_tensor - grad_tensor->set_impl(t.impl()); - return *grad_tensor.get(); - } else { - VLOG(7) << "Retain NULL paddle::experimental::Tensor in Grad Hook"; - return paddle::experimental::Tensor(); - } - } else { - VLOG(7) << "Retain NULL paddle::experimental::Tensor in Grad Hook"; - return paddle::experimental::Tensor(); - } - }; + auto hook = [weak_grad_tensor](const paddle::experimental::Tensor& t) { + if (!weak_grad_tensor.expired()) { + auto grad_tensor = weak_grad_tensor.lock(); + if (t.defined()) { + VLOG(7) << "Set impl for RetainGrad Hook for tensor: " << t.name(); + // Simply Copy impl() to grad_tensor + grad_tensor->set_impl(t.impl()); + return *grad_tensor.get(); + } else { + VLOG(7) << "Retain NULL paddle::experimental::Tensor in Grad Hook"; + return paddle::experimental::Tensor(); + } + } else { + VLOG(7) << "Retain NULL paddle::experimental::Tensor in Grad Hook"; + return paddle::experimental::Tensor(); + } + }; // Append to GradientHooks - RegisterGradientHookForTensor(tensor, hook); + RegisterGradientHookForTensor(tensor, + std::make_shared(hook)); } void RetainGradForTensor(const paddle::experimental::Tensor& tensor) { diff --git a/paddle/fluid/eager/api/utils/hook_utils.h b/paddle/fluid/eager/api/utils/hook_utils.h index 4c4ecc9fb80..b36ef81125a 100644 --- a/paddle/fluid/eager/api/utils/hook_utils.h +++ b/paddle/fluid/eager/api/utils/hook_utils.h @@ -16,17 +16,17 @@ #include "paddle/fluid/eager/eager_tensor.h" #include "paddle/fluid/eager/grad_node_info.h" +#include "paddle/fluid/eager/hooks.h" #include "paddle/phi/api/all.h" namespace egr { namespace egr_utils_api { -void RegisterGradientHookForTensor( +int64_t RegisterGradientHookForTensor( const paddle::experimental::Tensor& tensor, - std::function& hook); + std::shared_ptr&& hook); void RegisterReduceHookForTensor(const paddle::experimental::Tensor& tensor, - const std::function& hook); + std::shared_ptr&& hook); void RetainGradForTensor(const paddle::experimental::Tensor& tensor); } // namespace egr_utils_api diff --git a/paddle/fluid/eager/auto_code_generator/eager_generator.cc b/paddle/fluid/eager/auto_code_generator/eager_generator.cc index e1f4d6ee9a1..74c5bcdb209 100644 --- a/paddle/fluid/eager/auto_code_generator/eager_generator.cc +++ b/paddle/fluid/eager/auto_code_generator/eager_generator.cc @@ -2040,12 +2040,13 @@ static std::string GenerateGradNodeCCContents( const char* BWD_RETURN_TEMPLATE = " std::vector> hooked_grads = " - 
"egr::GradNodeBase::ApplyGradientHooks(grads);\n" + "GradNode%s::ApplyGradientHooks(grads);\n" " std::vector> outputs(%d);\n" " %s\n" " return outputs;\n"; - generated_grad_function_body = paddle::string::Sprintf( - BWD_RETURN_TEMPLATE, in_vars.size(), generated_grad_function_body); + generated_grad_function_body = + paddle::string::Sprintf(BWD_RETURN_TEMPLATE, fwd_op_type, in_vars.size(), + generated_grad_function_body); // [Generation] Get Full Grad Function const char* GRAD_FUNCTION_TEMPLATE = diff --git a/paddle/fluid/eager/grad_node_info.cc b/paddle/fluid/eager/grad_node_info.cc index 27c376b4c80..35416281f18 100644 --- a/paddle/fluid/eager/grad_node_info.cc +++ b/paddle/fluid/eager/grad_node_info.cc @@ -210,22 +210,22 @@ const std::vector>& GradNodeBase::GetEdges() const { return adj_edges_; } -void GradNodeBase::RegisterGradientHook( - size_t slot_id, size_t rank, - const std::function& hook) { - gradient_hooks_.emplace_back(std::make_tuple(slot_id, rank, hook)); +int64_t GradNodeBase::RegisterGradientHook( + size_t slot_id, size_t rank, std::shared_ptr&& hook) { + gradient_hooks_.emplace(next_hook_id_, + std::make_tuple(slot_id, rank, std::move(hook))); + return next_hook_id_++; } std::vector> GradNodeBase::ApplyGradientHooks( const std::vector>& tensors) { std::vector> outs(tensors.size()); - for (auto& tuple : gradient_hooks_) { - size_t slot_id = std::get<0>(tuple); - size_t rank = std::get<1>(tuple); - std::function& hook = std::get<2>(tuple); + for (auto& hook_pair : gradient_hooks_) { + size_t slot_id = std::get<0>(hook_pair.second); + size_t rank = std::get<1>(hook_pair.second); + + auto hook = std::get<2>(hook_pair.second); PADDLE_ENFORCE(slot_id < tensors.size(), paddle::platform::errors::Fatal( @@ -242,12 +242,11 @@ GradNodeBase::ApplyGradientHooks( slot_out.resize(tensors[slot_id].size()); paddle::experimental::Tensor& out = slot_out[rank]; if (!out.defined() || !out.initialized()) { - VLOG(8) << "Run Hook for tensor: " << tensors[slot_id][rank].name(); - out = hook(tensors[slot_id][rank]); + out = (*hook)(tensors[slot_id][rank]); } else { // If more than one hook is registered, the input to the next hook func // should be the output of the previous hook - out = hook(out); + out = (*hook)(out); } } diff --git a/paddle/fluid/eager/grad_node_info.h b/paddle/fluid/eager/grad_node_info.h index f699f9ab28e..eeac1cca4ac 100644 --- a/paddle/fluid/eager/grad_node_info.h +++ b/paddle/fluid/eager/grad_node_info.h @@ -15,6 +15,7 @@ #pragma once #include "paddle/fluid/eager/eager_tensor.h" +#include "paddle/fluid/eager/hooks.h" #include "paddle/phi/api/all.h" namespace egr { @@ -135,14 +136,24 @@ class GradNodeBase { /** * Register GradientHook * **/ - void RegisterGradientHook(size_t slot_id, size_t rank, - const std::function& hook); + int64_t RegisterGradientHook(size_t slot_id, size_t rank, + std::shared_ptr&& hook); + + /** + * Remove GradientHook + * **/ + bool RemoveGradientHook(const int64_t& hook_id) { + auto remove_cnt = gradient_hooks_.erase(hook_id); + if (remove_cnt == 0) { + return false; + } + return true; + } /** * Apply GradientHook * **/ - inline bool GradientHooksRegistered() { return gradient_hooks_.size() != 0; } + inline bool GradientHooksRegistered() { return !gradient_hooks_.empty(); } std::vector> ApplyGradientHooks( const std::vector>& tensors); @@ -166,12 +177,14 @@ class GradNodeBase { // Gradient Hooks // Customer may register a list of hooks which will be called in order during // backward - // Each entry consists one pair of - std::vector>> + // Each 
entry consists one pair of + // >> + std::map>> gradient_hooks_; + + int64_t next_hook_id_{0}; }; class Edge { diff --git a/paddle/fluid/eager/hooks.h b/paddle/fluid/eager/hooks.h new file mode 100644 index 00000000000..097150cf5ed --- /dev/null +++ b/paddle/fluid/eager/hooks.h @@ -0,0 +1,63 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include "paddle/phi/api/include/tensor.h" +namespace egr { + +class TensorHook { + public: + virtual ~TensorHook() = default; + virtual paddle::experimental::Tensor operator()( + const paddle::experimental::Tensor& var) = 0; +}; + +class TensorVoidHook { + public: + virtual ~TensorVoidHook() = default; + virtual void operator()() = 0; +}; + +class CppTensorHook : public TensorHook { + public: + explicit CppTensorHook(std::function&& fn) + : fn_(std::move(fn)) {} + + paddle::experimental::Tensor operator()( + const paddle::experimental::Tensor& var) override { + return fn_(var); + } + + private: + std::function + fn_; +}; + +class CppTensorVoidHook : public TensorVoidHook { + public: + explicit CppTensorVoidHook(std::function&& fn) : fn_(std::move(fn)) {} + + void operator()() override { return fn_(); } + + private: + std::function fn_; +}; +} // namespace egr diff --git a/paddle/fluid/eager/tests/data_structure_tests/accumulation_node_test.cc b/paddle/fluid/eager/tests/data_structure_tests/accumulation_node_test.cc index 880bd268410..28682ab0fe0 100644 --- a/paddle/fluid/eager/tests/data_structure_tests/accumulation_node_test.cc +++ b/paddle/fluid/eager/tests/data_structure_tests/accumulation_node_test.cc @@ -23,6 +23,7 @@ #include "paddle/fluid/eager/grad_tensor_holder.h" #include "paddle/fluid/eager/utils.h" +#include "paddle/fluid/eager/hooks.h" #include "paddle/phi/api/lib/utils/allocator.h" #include "paddle/phi/core/kernel_registry.h" @@ -116,7 +117,8 @@ TEST(AccumulationNode, Tensor) { VLOG(6) << "Running Reduce Hook"; }; - node->RegisterReduceHook(reduce_hook_1); + node->RegisterReduceHook( + std::make_shared(reduce_hook_1)); // operator() paddle::experimental::Tensor _ret = node->operator()({{et0}})[0][0]; @@ -141,7 +143,8 @@ TEST(AccumulationNode, Tensor) { ret_et0_ptr[0] = 100.0; // set to 100.0 VLOG(6) << "Running Reduce Hook"; }; - node->RegisterReduceHook(reduce_hook_2); + node->RegisterReduceHook( + std::make_shared(reduce_hook_2)); node->ApplyReduceHooks(); // Check ApplyReduceHooks result diff --git a/paddle/fluid/eager/tests/data_structure_tests/grad_node_info_test.cc b/paddle/fluid/eager/tests/data_structure_tests/grad_node_info_test.cc index aee6ee74886..e3db309c401 100644 --- a/paddle/fluid/eager/tests/data_structure_tests/grad_node_info_test.cc +++ b/paddle/fluid/eager/tests/data_structure_tests/grad_node_info_test.cc @@ -17,6 +17,7 @@ #include "paddle/fluid/eager/autograd_meta.h" #include "paddle/fluid/eager/eager_tensor.h" #include "paddle/fluid/eager/grad_node_info.h" +#include 
"paddle/fluid/eager/hooks.h" #include "paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h" #include "paddle/phi/api/lib/utils/allocator.h" @@ -32,7 +33,7 @@ TEST(GradNodeInfo, GradSlotMeta) { CHECK_EQ(grad_slot.Size(), 2); } -TEST(GradNodeInfo, GradNodeBase) { +void TestGradNodeBase(bool is_remove_gradient_hook) { VLOG(6) << "Construct Grad Node"; auto grad_test_node0 = std::make_shared( /* val */ 5.0, /* in_num */ 2, /* out_num */ 2); @@ -112,13 +113,25 @@ TEST(GradNodeInfo, GradNodeBase) { VLOG(6) << "Running Gradient Hook"; return res; }; - grad_test_node0->RegisterGradientHook(0, 0, gradient_hook); - // 5 + 6 + int64_t hook_id = grad_test_node0->RegisterGradientHook( + 0, 0, std::make_shared(gradient_hook)); + + if (is_remove_gradient_hook) { + // Remove GradientHook + grad_test_node0->RemoveGradientHook(hook_id); + } + + // Check results auto grad_hook_res = grad_test_node0->ApplyGradientHooks(grads); CHECK_EQ( std::dynamic_pointer_cast(grad_hook_res[0][0].impl()) ->data()[0], - 11.0); + is_remove_gradient_hook ? 5.0 : 11.0); +} + +TEST(GradNodeInfo, GradNodeBase) { + TestGradNodeBase(true); + TestGradNodeBase(false); } TEST(GradNodeInfo, Edge) { diff --git a/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc b/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc index 752fd781284..5a7bafb2fe3 100644 --- a/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc +++ b/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc @@ -27,6 +27,7 @@ #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/tensor_meta.h" +#include "paddle/fluid/eager/hooks.h" #include "paddle/fluid/eager/tests/test_utils.h" namespace egr { @@ -221,10 +222,6 @@ TEST(FwdBwdJoint, GradientHook) { phi::DataLayout::NCHW, 5.0 /*value*/, true /*is_leaf*/); egr_utils_api::RetainGradForTensor(tensor); - std::function - hook = &hook_function; - // 3. Run Forward // Run Forward Node 0 float scale0 = 2.0; @@ -232,24 +229,27 @@ TEST(FwdBwdJoint, GradientHook) { paddle::experimental::Tensor out0 = egr::scale(tensor, scale0, bias0, true /*bias_after_scale*/, true /*trace_backward*/); - egr_utils_api::RetainGradForTensor(out0); // hook: +5 - egr_utils_api::RegisterGradientHookForTensor(out0, hook); // hook: +5 + egr_utils_api::RetainGradForTensor(out0); // hook: +5 + egr_utils_api::RegisterGradientHookForTensor( + out0, std::make_shared(hook_function)); // hook: +5 // Run Forward Node 1 float scale1 = 5.0; float bias1 = 10.0; paddle::experimental::Tensor out1 = egr::scale( out0, scale1, bias1, true /*bias_after_scale*/, true /*trace_backward*/); - egr_utils_api::RetainGradForTensor(out1); // hook: +5 - egr_utils_api::RegisterGradientHookForTensor(out1, hook); // hook: +5 + egr_utils_api::RetainGradForTensor(out1); // hook: +5 + egr_utils_api::RegisterGradientHookForTensor( + out1, std::make_shared(hook_function)); // hook: +5 // Run Forward Node 2 float scale2 = 10.0; float bias2 = 20.0; paddle::experimental::Tensor out2 = egr::scale( out0, scale2, bias2, true /*bias_after_scale*/, true /*trace_backward*/); - egr_utils_api::RetainGradForTensor(out2); // hook: +5 - egr_utils_api::RegisterGradientHookForTensor(out2, hook); // hook: +5 + egr_utils_api::RetainGradForTensor(out2); // hook: +5 + egr_utils_api::RegisterGradientHookForTensor( + out2, std::make_shared(hook_function)); // hook: +5 // 4. 
Run Backward std::vector outs = {out1, out2}; diff --git a/paddle/fluid/eager/tests/task_tests/hook_test.cc b/paddle/fluid/eager/tests/task_tests/hook_test.cc index fbc71168fe4..9cda961741f 100644 --- a/paddle/fluid/eager/tests/task_tests/hook_test.cc +++ b/paddle/fluid/eager/tests/task_tests/hook_test.cc @@ -28,6 +28,7 @@ #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/tensor_meta.h" +#include "paddle/fluid/eager/hooks.h" #include "paddle/fluid/eager/tests/test_utils.h" namespace egr { @@ -83,9 +84,6 @@ TEST(RetainGrad, HookBeforeRetainGrad) { // Apply RetainGrad { // ScaleNode Hook: +3 - std::function - hook = &hook_function; auto auto_grad_meta = std::make_shared(); auto_grad_meta->SetGradNode( @@ -96,7 +94,8 @@ TEST(RetainGrad, HookBeforeRetainGrad) { std::dynamic_pointer_cast( auto_grad_meta)); - egr_utils_api::RegisterGradientHookForTensor(target_tensor, hook); + egr_utils_api::RegisterGradientHookForTensor( + target_tensor, std::make_shared(hook_function)); egr_utils_api::RetainGradForTensor( target_tensor); // result: 1.0 + 3.0 = 4.0 egr_utils_api::RetainGradForTensor( @@ -107,9 +106,6 @@ TEST(RetainGrad, HookBeforeRetainGrad) { paddle::experimental::Tensor leaf_tensor = paddle::experimental::Tensor(); { // AccumulationNode Hook: +3 - std::function - hook = &hook_function; auto auto_grad_meta = std::make_shared(); @@ -126,7 +122,8 @@ TEST(RetainGrad, HookBeforeRetainGrad) { std::dynamic_pointer_cast( auto_grad_meta)); - egr_utils_api::RegisterGradientHookForTensor(leaf_tensor, hook); + egr_utils_api::RegisterGradientHookForTensor( + leaf_tensor, std::make_shared(hook_function)); egr_utils_api::RetainGradForTensor( leaf_tensor); // result: 4.0*5.0 + 3.0 = 23.0 } @@ -161,9 +158,6 @@ TEST(RetainGrad, HookAfterRetainGrad) { // Apply RetainGrad { // ScaleNode Hook: +3 - std::function - hook = &hook_function; auto auto_grad_meta = std::make_shared(); auto_grad_meta->SetGradNode( @@ -175,16 +169,14 @@ TEST(RetainGrad, HookAfterRetainGrad) { auto_grad_meta)); egr_utils_api::RetainGradForTensor(target_tensor); // result: 1.0 - egr_utils_api::RegisterGradientHookForTensor(target_tensor, hook); + egr_utils_api::RegisterGradientHookForTensor( + target_tensor, std::make_shared(hook_function)); } // Retain Grad for leaf tensor1 paddle::experimental::Tensor leaf_tensor = paddle::experimental::Tensor(); { // AccumulationNode Hook: +3 - std::function - hook = &hook_function; auto auto_grad_meta = std::make_shared(); auto acc_node_ptr = @@ -199,7 +191,8 @@ TEST(RetainGrad, HookAfterRetainGrad) { std::dynamic_pointer_cast( auto_grad_meta)); - egr_utils_api::RegisterGradientHookForTensor(leaf_tensor, hook); + egr_utils_api::RegisterGradientHookForTensor( + leaf_tensor, std::make_shared(hook_function)); } RunBackward(target_tensors, {}); diff --git a/paddle/fluid/eager/tests/task_tests/hook_test_intermidiate.cc b/paddle/fluid/eager/tests/task_tests/hook_test_intermidiate.cc index dbcfe704dbe..15b2a62dca7 100644 --- a/paddle/fluid/eager/tests/task_tests/hook_test_intermidiate.cc +++ b/paddle/fluid/eager/tests/task_tests/hook_test_intermidiate.cc @@ -24,6 +24,7 @@ #include "paddle/phi/core/dense_tensor.h" #include "paddle/fluid/eager/api/generated/fluid_generated/dygraph_forward_api.h" +#include "paddle/fluid/eager/hooks.h" #include "paddle/phi/core/kernel_registry.h" namespace egr { @@ -54,7 +55,7 @@ paddle::experimental::Tensor hook_function( return ret; } -TEST(Hook_intermidiate, Sigmoid) { +void test_sigmoid(bool is_remove_gradient_hook) { // Prepare Device Contexts VLOG(6) << 
"Init Env"; eager_test::InitEnv(paddle::platform::CPUPlace()); @@ -67,11 +68,6 @@ TEST(Hook_intermidiate, Sigmoid) { ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32, phi::DataLayout::NCHW, 0.0, true); - VLOG(6) << "Make Hook function"; - std::function - hook = &hook_function; - VLOG(6) << "Make ReduceHook function"; auto reduce_hook = [&](void) -> void { auto* t_ptr = std::dynamic_pointer_cast(tensor.impl()) @@ -85,10 +81,12 @@ TEST(Hook_intermidiate, Sigmoid) { egr_utils_api::RetainGradForTensor(tensor); VLOG(6) << "Register GradientHook for Tensor"; - egr_utils_api::RegisterGradientHookForTensor(tensor, hook); + int64_t hook_id = egr_utils_api::RegisterGradientHookForTensor( + tensor, std::make_shared(hook_function)); VLOG(6) << "Register ReduceHook for Tensor"; - egr_utils_api::RegisterReduceHookForTensor(tensor, reduce_hook); + egr_utils_api::RegisterReduceHookForTensor( + tensor, std::make_shared(reduce_hook)); VLOG(6) << "Runing Forward"; auto output_tensor = sigmoid_dygraph_function(tensor, {}); @@ -98,11 +96,17 @@ TEST(Hook_intermidiate, Sigmoid) { std::vector target_tensors = {output_tensor}; + if (is_remove_gradient_hook) { + std::shared_ptr grad_node_tmp = EagerUtils::grad_node(tensor); + grad_node_tmp->RemoveGradientHook(hook_id); + } + VLOG(6) << "Runing Backward"; RunBackward(target_tensors, {}); VLOG(6) << "Finish Backward"; - eager_test::CompareGradTensorWithValue(tensor, 0.25 + 3); + eager_test::CompareGradTensorWithValue( + tensor, is_remove_gradient_hook ? 0.25 : 0.25 + 3.0); VLOG(6) << "Checking ReduceHook results"; for (int i = 0; i < tensor.numel(); i++) { @@ -113,7 +117,7 @@ TEST(Hook_intermidiate, Sigmoid) { VLOG(6) << "After Tests"; } -TEST(Hook_intermidiate, ElementwiseAdd) { +void test_elementwiseAdd(bool is_remove_gradient_hook) { // Prepare Device Contexts eager_test::InitEnv(paddle::platform::CPUPlace()); @@ -132,11 +136,7 @@ TEST(Hook_intermidiate, ElementwiseAdd) { ddimY, paddle::platform::CPUPlace(), phi::DataType::FLOAT32, phi::DataLayout::NCHW, 2.0, true); - std::function - hook = &hook_function; - - auto reduce_hook = [&](void) -> void { + auto reduce_hook = [&]() -> void { auto* t_ptr = std::dynamic_pointer_cast(Y.impl())->data(); for (int i = 0; i < Y.numel(); i++) { @@ -145,18 +145,26 @@ TEST(Hook_intermidiate, ElementwiseAdd) { }; egr_utils_api::RetainGradForTensor(Y); - egr_utils_api::RegisterGradientHookForTensor(Y, hook); - egr_utils_api::RegisterReduceHookForTensor(Y, reduce_hook); + int64_t hook_id = egr_utils_api::RegisterGradientHookForTensor( + Y, std::make_shared(hook_function)); + egr_utils_api::RegisterReduceHookForTensor( + Y, std::make_shared(reduce_hook)); auto output_tensor = elementwise_add_dygraph_function(X, Y, {}); eager_test::CompareTensorWithValue(output_tensor, 5); - std::vector target_tensors = {output_tensor}; + + if (is_remove_gradient_hook) { + std::shared_ptr grad_node_tmp = EagerUtils::grad_node(Y); + grad_node_tmp->RemoveGradientHook(hook_id); + } + RunBackward(target_tensors, {}); eager_test::CompareGradTensorWithValue(X, 1.0); - eager_test::CompareGradTensorWithValue(Y, 4.0); + eager_test::CompareGradTensorWithValue( + Y, is_remove_gradient_hook ? 
1.0 : 1.0 + 3.0); // Checking ReduceHook results for (int i = 0; i < Y.numel(); i++) { @@ -166,7 +174,7 @@ TEST(Hook_intermidiate, ElementwiseAdd) { } } -TEST(Hook_intermidiate, Matmul_v2) { +void test_matmul(bool is_remove_gradient_hook) { // Prepare Device Contexts eager_test::InitEnv(paddle::platform::CPUPlace()); @@ -185,10 +193,6 @@ TEST(Hook_intermidiate, Matmul_v2) { ddimY, paddle::platform::CPUPlace(), phi::DataType::FLOAT32, phi::DataLayout::NCHW, 2.0, true); - std::function - hook = &hook_function; - auto reduce_hook = [&](void) -> void { auto* t_ptr = std::dynamic_pointer_cast(Y.impl())->data(); @@ -198,19 +202,27 @@ TEST(Hook_intermidiate, Matmul_v2) { }; egr_utils_api::RetainGradForTensor(Y); - egr_utils_api::RegisterGradientHookForTensor(Y, hook); - egr_utils_api::RegisterReduceHookForTensor(Y, reduce_hook); + int64_t hook_id = egr_utils_api::RegisterGradientHookForTensor( + Y, std::make_shared(hook_function)); + egr_utils_api::RegisterReduceHookForTensor( + Y, std::make_shared(reduce_hook)); auto output_tensor = matmul_v2_dygraph_function( X, Y, {{"trans_x", false}, {"trans_y", false}}); eager_test::CompareTensorWithValue(output_tensor, 96); - std::vector target_tensors = {output_tensor}; + + if (is_remove_gradient_hook) { + std::shared_ptr grad_node_tmp = EagerUtils::grad_node(Y); + grad_node_tmp->RemoveGradientHook(hook_id); + } + RunBackward(target_tensors, {}); eager_test::CompareGradTensorWithValue(X, 2.0 * 20); - eager_test::CompareGradTensorWithValue(Y, 3.0 * 4 + 3); + eager_test::CompareGradTensorWithValue( + Y, is_remove_gradient_hook ? 3.0 * 4 : 3.0 * 4 + 3); // Checking ReduceHook results for (int i = 0; i < Y.numel(); i++) { @@ -219,6 +231,22 @@ TEST(Hook_intermidiate, Matmul_v2) { static_cast(100.0f)); } } + +TEST(Hook_intermidiate, Sigmoid) { + // True or false represents whether to call RemoveGradientHook + test_sigmoid(true); + test_sigmoid(false); +} + +TEST(Hook_intermidiate, ElementwiseAdd) { + test_elementwiseAdd(true); + test_elementwiseAdd(false); +} + +TEST(Hook_intermidiate, Matmul_v2) { + test_matmul(true); + test_matmul(false); +} } // namespace egr USE_OP(sigmoid); diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc index 4e900ae2ffb..221d4d53d06 100644 --- a/paddle/fluid/pybind/eager_method.cc +++ b/paddle/fluid/pybind/eager_method.cc @@ -20,6 +20,8 @@ limitations under the License. */ #include "paddle/fluid/eager/accumulation/accumulation_node.h" #include "paddle/fluid/eager/api/all.h" #include "paddle/fluid/eager/autograd_meta.h" +#include "paddle/fluid/eager/grad_node_info.h" +#include "paddle/fluid/eager/hooks.h" #include "paddle/fluid/eager/utils.h" #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/memory/allocation/allocator.h" @@ -35,6 +37,82 @@ limitations under the License. 
*/ namespace paddle { namespace pybind { +namespace py = ::pybind11; + +class PyTensorHook : public egr::TensorHook { + public: + explicit PyTensorHook(PyObject* func) : py_func_(func) { + Py_INCREF(py_func_); + } + + ~PyTensorHook() { + py::gil_scoped_acquire gil; + Py_DECREF(py_func_); + } + + paddle::experimental::Tensor operator()( + const paddle::experimental::Tensor& var) override { + py::gil_scoped_acquire gil; + VLOG(3) << "Call PyTensorHook for var " << var.name(); + + PyObject* res = nullptr; + try { + res = PyObject_CallFunctionObjArgs(py_func_, ToPyObject(var), nullptr); + } catch (platform::EnforceNotMet& e) { + throw std::move(e); + } catch (std::exception& e) { + PADDLE_THROW(platform::errors::Unavailable( + "Hook function of Tensor raises an exception: %s.", e.what())); + } catch (...) { + PADDLE_THROW(platform::errors::Fatal( + "Hook function of Tensor raises an unknown exception.")); + } + + PADDLE_ENFORCE_NOT_NULL(res, + platform::errors::Unavailable( + "Hook function of Tensor return a nullptr.")); + if (res == Py_None) { + return var; + } + return reinterpret_cast(res)->tensor; + } + + private: + PyObject* py_func_; +}; + +class PyTensorVoidHook : public egr::TensorVoidHook { + public: + explicit PyTensorVoidHook(PyObject* func) : py_func_(func) { + Py_INCREF(py_func_); + } + + ~PyTensorVoidHook() { + py::gil_scoped_acquire gil; + Py_DECREF(py_func_); + } + + void operator()() override { + py::gil_scoped_acquire gil; + VLOG(3) << "Call PyTensorVoidHook"; + + try { + PyObject_CallFunctionObjArgs(py_func_, nullptr); + } catch (platform::EnforceNotMet& e) { + throw std::move(e); + } catch (std::exception& e) { + PADDLE_THROW(platform::errors::Unavailable( + "Hook function of Tensor raises an exception: %s.", e.what())); + } catch (...) 
{ + PADDLE_THROW(platform::errors::Fatal( + "Hook function of Tensor raises an unknown exception.")); + } + } + + private: + PyObject* py_func_; +}; + extern void InitTensorWithNumpyValue(TensorObject* self, const pybind11::object& array, bool zero_copy); @@ -403,6 +481,92 @@ static PyObject* tensor_method_set_value(TensorObject* self, PyObject* args, EAGER_CATCH_AND_THROW_RETURN_NULL } +static PyObject* tensor_register_grad_hook(TensorObject* self, PyObject* args, + PyObject* kwargs) { + EAGER_TRY + int64_t hook_id; + if (egr::egr_utils_api::IsLeafTensor(self->tensor)) { + VLOG(6) << "Register hook for leaf tensor: " << self->tensor.name(); + std::shared_ptr grad_node = + egr::EagerUtils::grad_node(self->tensor); + PADDLE_ENFORCE( + grad_node.get() != nullptr, + paddle::platform::errors::Fatal("Detected NULL grad_node," + "Leaf tensor should have had grad_node " + "with type: GradNodeAccumulation.")); + auto rank_info = + egr::EagerUtils::unsafe_autograd_meta(self->tensor)->OutRankInfo(); + + PyObject* hook_func = PyTuple_GET_ITEM(args, 0); + + auto accumulation_grad_node = + std::dynamic_pointer_cast(grad_node); + hook_id = accumulation_grad_node->RegisterGradientHook( + rank_info.first, rank_info.second, + std::make_shared(hook_func)); + + } else { + VLOG(6) << "Register hook for non leaf tensor: " << self->tensor.name(); + std::shared_ptr grad_node = + egr::EagerUtils::grad_node(self->tensor); + auto rank_info = + egr::EagerUtils::unsafe_autograd_meta(self->tensor)->OutRankInfo(); + + PyObject* hook_func = PyTuple_GET_ITEM(args, 0); + + hook_id = grad_node->RegisterGradientHook( + rank_info.first, rank_info.second, + std::make_shared(hook_func)); + } + return ToPyObject(hook_id); + EAGER_CATCH_AND_THROW_RETURN_NULL +} + +static PyObject* tensor_remove_grad_hook(TensorObject* self, PyObject* args, + PyObject* kwargs) { + EAGER_TRY + VLOG(6) << "Remove the registered hook for tensor: " << self->tensor.name(); + std::shared_ptr grad_node = + egr::EagerUtils::grad_node(self->tensor); + + int64_t hook_id = pybind::CastPyArg2AttrLong(PyTuple_GET_ITEM(args, 0), 0); + + return ToPyObject(grad_node->RemoveGradientHook(hook_id)); + EAGER_CATCH_AND_THROW_RETURN_NULL +} + +static PyObject* tensor_register_reduce_hook(TensorObject* self, PyObject* args, + PyObject* kwargs) { + EAGER_TRY + VLOG(4) << "Register reduce hook for tensor: " << self->tensor.name(); + + std::shared_ptr grad_node = + egr::EagerUtils::grad_node(self->tensor); + PADDLE_ENFORCE_EQ(egr::egr_utils_api::IsLeafTensor(self->tensor), true, + platform::errors::InvalidArgument( + "Only can register backward hook for leaf Tensor.")); + PADDLE_ENFORCE_EQ( + !egr::EagerUtils::unsafe_autograd_meta(self->tensor)->StopGradient(), + true, platform::errors::InvalidArgument( + "Cannot register backward hook on a Tensor that stop " + "gradient.")); + PADDLE_ENFORCE( + grad_node.get() != nullptr, + paddle::platform::errors::Fatal("Detected NULL grad_node," + "Leaf tensor should have had grad_node " + "with type: GradNodeAccumulation.")); + PyObject* hook_func = PyTuple_GET_ITEM(args, 0); + + auto accumulation_grad_node = + std::dynamic_pointer_cast(grad_node); + accumulation_grad_node->RegisterReduceHook( + std::make_shared(hook_func)); + + Py_INCREF(Py_None); + return Py_None; + EAGER_CATCH_AND_THROW_RETURN_NULL +} + PyMethodDef variable_methods[] = { {"numpy", (PyCFunction)(void (*)(void))tensor_method_numpy, METH_VARARGS | METH_KEYWORDS, NULL}, @@ -440,6 +604,14 @@ PyMethodDef variable_methods[] = { METH_VARARGS | METH_KEYWORDS, NULL}, 
{"_set_value", (PyCFunction)(void (*)(void))tensor_method_set_value, METH_VARARGS | METH_KEYWORDS, NULL}, + {"_register_grad_hook", + (PyCFunction)(void (*)(void))tensor_register_grad_hook, + METH_VARARGS | METH_KEYWORDS, NULL}, + {"_remove_grad_hook", (PyCFunction)(void (*)(void))tensor_remove_grad_hook, + METH_VARARGS | METH_KEYWORDS, NULL}, + {"_register_backward_hook", + (PyCFunction)(void (*)(void))tensor_register_reduce_hook, + METH_VARARGS | METH_KEYWORDS, NULL}, {NULL, NULL, 0, NULL}}; } // namespace pybind diff --git a/python/paddle/fluid/dygraph/varbase_patch_methods.py b/python/paddle/fluid/dygraph/varbase_patch_methods.py index c4ea751ed92..65bfba3f6c3 100644 --- a/python/paddle/fluid/dygraph/varbase_patch_methods.py +++ b/python/paddle/fluid/dygraph/varbase_patch_methods.py @@ -33,10 +33,11 @@ import paddle.utils.deprecated as deprecated class TensorHookRemoveHelper(object): """ A helper class that for removing Tensor gradient's hook. + NOTE(wuweilong):the operation weakref.ref(tensor) will cause some unexpected errors in eager mode. """ def __init__(self, tensor, hook_id): - self._tensor_ref = weakref.ref(tensor) + self._tensor = tensor if core._in_eager_mode() else weakref.ref(tensor) self._hook_id = hook_id def remove(self): @@ -46,7 +47,7 @@ class TensorHookRemoveHelper(object): Returns: bool: Return True if removed successfully """ - tensor = self._tensor_ref() + tensor = self._tensor if core._in_eager_mode() else self._tensor() if tensor is not None: res = tensor._remove_grad_hook(self._hook_id) if res is True: diff --git a/python/paddle/fluid/tests/unittests/test_tensor_register_hook.py b/python/paddle/fluid/tests/unittests/test_tensor_register_hook.py index 52256766fed..3238876b894 100644 --- a/python/paddle/fluid/tests/unittests/test_tensor_register_hook.py +++ b/python/paddle/fluid/tests/unittests/test_tensor_register_hook.py @@ -19,6 +19,7 @@ import numpy as np import paddle import paddle.nn as nn +from paddle.fluid.framework import _test_eager_guard, _in_eager_mode class SimpleNet(nn.Layer): @@ -64,7 +65,7 @@ class TestTensorRegisterHook(unittest.TestCase): if paddle.is_compiled_with_cuda(): self.devices.append("gpu") - def test_hook_for_interior_var(self): + def func_hook_for_interior_var(self): def run_double_hook_for_interior_var(double_hook, removed=False): for device in self.devices: paddle.set_device(device) @@ -154,7 +155,12 @@ class TestTensorRegisterHook(unittest.TestCase): # register hook and removed run_print_hook_for_interior_var(print_hook, removed=True) - def test_hook_for_leaf_var(self): + def test_hook_for_interior_var(self): + with _test_eager_guard(): + self.func_hook_for_interior_var() + self.func_hook_for_interior_var() + + def func_hook_for_leaf_var(self): def run_double_hook_for_leaf_var(double_hook, removed=False): for device in self.devices: paddle.set_device(device) @@ -193,7 +199,12 @@ class TestTensorRegisterHook(unittest.TestCase): # register hook and removed run_double_hook_for_leaf_var(lambda grad: grad * 2, removed=True) - def test_hook_for_accumulated_grad_interior_var(self): + def test_hook_for_leaf_var(self): + with _test_eager_guard(): + self.func_hook_for_leaf_var() + self.func_hook_for_leaf_var() + + def func_hook_for_accumulated_grad_interior_var(self): def run_double_hook_for_accumulated_grad_interior_var(double_hook, removed=False): for device in self.devices: @@ -248,7 +259,12 @@ class TestTensorRegisterHook(unittest.TestCase): run_double_hook_for_accumulated_grad_interior_var( lambda grad: grad * 2, removed=True) - def 
test_hook_for_accumulated_grad_leaf_var(self): + def test_hook_for_accumulated_grad_interior_var(self): + with _test_eager_guard(): + self.func_hook_for_accumulated_grad_interior_var() + self.func_hook_for_accumulated_grad_interior_var() + + def func_hook_for_accumulated_grad_leaf_var(self): def run_double_hook_for_accumulated_grad_leaf_var(double_hook, removed=False): for device in self.devices: @@ -289,7 +305,12 @@ class TestTensorRegisterHook(unittest.TestCase): run_double_hook_for_accumulated_grad_leaf_var( lambda grad: grad * 2, removed=True) - def test_hook_in_model(self): + def test_hook_for_accumulated_grad_leaf_var(self): + with _test_eager_guard(): + self.func_hook_for_accumulated_grad_leaf_var() + self.func_hook_for_accumulated_grad_leaf_var() + + def func_hook_in_model(self): def run_double_hook_in_model(data, label, hook=None, @@ -336,7 +357,12 @@ class TestTensorRegisterHook(unittest.TestCase): self.assertTrue(np.array_equal(linear1_w_grad, linear1_w_grad_rm)) self.assertTrue(np.array_equal(linear1_b_grad, linear1_b_grad_rm)) - def test_multiple_hooks_for_interior_var(self): + def test_func_hook_in_model(self): + with _test_eager_guard(): + self.func_hook_in_model() + self.func_hook_in_model() + + def func_multiple_hooks_for_interior_var(self): def run_multiple_hooks_for_interior_var(device, hooks, remove1=False, @@ -414,6 +440,12 @@ class TestTensorRegisterHook(unittest.TestCase): self.assertTrue(np.array_equal(x_grad, z)) self.assertTrue(np.array_equal(y_grad, z)) + def test_multiple_hooks_for_interior_var(self): + with _test_eager_guard(): + self.func_multiple_hooks_for_interior_var() + self.func_multiple_hooks_for_interior_var() + + # TODO(wuweilong): enable this case when DoubleGrad in eager mode is ready def test_hook_in_double_grad(self): def double_print_hook(grad): grad = grad * 2 @@ -446,7 +478,7 @@ class TestTensorRegisterHook(unittest.TestCase): z.backward() self.assertTrue(np.array_equal(x.grad.numpy(), np.array([8.]))) - def test_remove_one_hook_multiple_times(self): + def func_remove_one_hook_multiple_times(self): for device in self.devices: paddle.set_device(device) @@ -457,7 +489,12 @@ class TestTensorRegisterHook(unittest.TestCase): self.assertTrue(h.remove()) self.assertFalse(h.remove()) - def test_register_hook_for_stop_gradient_var(self): + def test_remove_one_hook_multiple_times(self): + with _test_eager_guard(): + self.func_remove_one_hook_multiple_times() + self.func_remove_one_hook_multiple_times() + + def func_register_hook_for_stop_gradient_var(self): for device in self.devices: paddle.set_device(device) @@ -466,6 +503,11 @@ class TestTensorRegisterHook(unittest.TestCase): with self.assertRaises(RuntimeError): x.register_hook(lambda grad: grad * 2) + def test_register_hook_for_stop_gradient_var(self): + with _test_eager_guard(): + self.func_register_hook_for_stop_gradient_var() + self.func_register_hook_for_stop_gradient_var() + def test_register_hook_in_static_mode(self): paddle.enable_static() @@ -482,7 +524,7 @@ class TestTensorRegisterHook(unittest.TestCase): paddle.disable_static() - def test_register_hook_in_dy2static_mode(self): + def func_register_hook_in_dy2static_mode(self): net = SimpleNetForStatic(self.in_size, self.out_size) jit_net = paddle.jit.to_static( net, input_spec=[paddle.static.InputSpec([None, self.in_size])]) @@ -491,8 +533,17 @@ class TestTensorRegisterHook(unittest.TestCase): size=[self.batch_size, self.in_size]).astype('float32') data_t = paddle.to_tensor(data) - with self.assertRaises(AssertionError): - out = 
jit_net(data_t) + if _in_eager_mode(): + with self.assertRaises(TypeError): + out = jit_net(data_t) + else: + with self.assertRaises(AssertionError): + out = jit_net(data_t) + + def test_register_hook_in_dy2static_mode(self): + with _test_eager_guard(): + self.func_register_hook_in_dy2static_mode() + self.func_register_hook_in_dy2static_mode() HOOK_INIT_VALUE = 10 @@ -512,7 +563,7 @@ class TestTensorRegisterBackwardHook(unittest.TestCase): if paddle.is_compiled_with_cuda(): self.devices.append("gpu") - def test_register_backward_hook(self): + def func_register_backward_hook(self): global HOOK_INIT_VALUE global HOOK_IS_CALLED for device in self.devices: @@ -529,20 +580,35 @@ class TestTensorRegisterBackwardHook(unittest.TestCase): HOOK_INIT_VALUE = 10 HOOK_IS_CALLED = False - def test_register_backward_hook_for_interior_var(self): + def test_register_backward_hook(self): + with _test_eager_guard(): + self.func_register_backward_hook() + self.func_register_backward_hook() + + def func_register_backward_hook_for_interior_var(self): x = paddle.to_tensor(5., stop_gradient=False) y = paddle.pow(x, 4.0) with self.assertRaises(ValueError): y._register_backward_hook(global_void_hook) - def test_register_backward_hook_for_var_without_gradient(self): + def test_register_backward_hook_for_interior_var(self): + with _test_eager_guard(): + self.func_register_backward_hook_for_interior_var() + self.func_register_backward_hook_for_interior_var() + + def func_register_backward_hook_for_var_without_gradient(self): x = paddle.to_tensor(5.) y = paddle.pow(x, 4.0) with self.assertRaises(ValueError): x._register_backward_hook(global_void_hook) + def test_register_backward_hook_for_var_without_gradient(self): + with _test_eager_guard(): + self.func_register_backward_hook_for_var_without_gradient() + self.func_register_backward_hook_for_var_without_gradient() + if __name__ == '__main__': unittest.main() -- GitLab