Unverified commit 4db8cf24 authored by Weilong Wu, committed by GitHub

[Eager Grad] Support eager grad interface (#40170)

* [Eager] Support eager grad interface, draft version

* Support eager grad interface with allow_unused and multi startup_op

* Fix code format

* Fix allow_unused case, return PyNone if tensor is not initialized

* Support output's stop_gradient related to create_graph

* Support grad exception case in eager mode, fix coverage CI

* Update ToPyObject, return PyNone if not initialized

* AccumulationNode add FLAGS_retain_grad_for_all_tensor

* Fix ci issue

* Fix CI issue

* fix, use core.eager.Tensor

* Add func SetBufferSlotRankZeros for GradTensorHolder

* Support retain_graph by using ClearTensorWrappers

* Support retain_graph by using ClearTensorWrappers

* Update retain_graph and no_grad_vars related test case

* Update code gen logic for ClearTensorWrappers

* Fix by override statement

* fix override func args

* Support retain_graph, update unit tests

* Updated ClearTensorWrappers logic

* fix grad python interface

* Use deep copy and update unit tests

* Polish code

* Polish code

* Fix CI issue, Deep copy only use when user set grad_tensors

* Fix CI, use Backward instead of RunBackward

* Fix CI, Declare kernel explicitly in test file

* Polish, remove vector of TensorWrapper

* Refactor the logic of grad/backward, polish codes

* Update code after merge upstream develop

* Polish after merge upstream develop

* Update to adapt new GradNodeBase superclass

* Fix error introduced during conflict resolution

* Update purify potential_startup_nodes logic

* Fix errors

* Polish code

* Remove useless args for ToPyObject

* Remove useless TensorWrappersSet

* Fix code-format, re-install pre-commit

* Fix pre-process logic for potential_startup_ops

* Update unit tests, use eager mode
Parent 1e045cae
@@ -24,7 +24,7 @@
#include "paddle/fluid/platform/errors.h"
#include "glog/logging.h"
DECLARE_bool(retain_grad_for_all_tensor);
namespace egr {
static void CopyOrAddTensor(paddle::experimental::Tensor* tensor,
@@ -39,8 +39,8 @@ static void CopyOrAddTensor(paddle::experimental::Tensor* tensor,
}
std::vector<std::vector<paddle::experimental::Tensor>> GradNodeAccumulation::
operator()(const std::vector<std::vector<paddle::experimental::Tensor>>& grads,
bool create_graph) {
VLOG(3) << "Running Eager Backward Node: GradNodeAccumulation";
PADDLE_ENFORCE(grads.size() == 1,
paddle::platform::errors::Fatal(
@@ -62,7 +62,7 @@ operator()(
grad_out = grads[0][0];
}
if (!weak_grad_.expired() && FLAGS_retain_grad_for_all_tensor) {
auto grad = weak_grad_.lock();
CopyOrAddTensor(grad.get(), grad_out);
}
......
@@ -35,8 +35,15 @@ class GradNodeAccumulation : public GradNodeBase {
// Functor: perform backward computations
virtual std::vector<std::vector<paddle::experimental::Tensor>> operator()(
const std::vector<std::vector<paddle::experimental::Tensor>>& grads,
bool create_graph = false) override;
void ClearTensorWrappers() override { VLOG(6) << "Do nothing here now"; }
bool IsTensorWrappersCleared() override {
VLOG(6) << "Do nothing here now";
return false;
}
std::string name() { return "GradNodeAccumulation"; }
......
@@ -145,8 +145,8 @@ void GradNodeScale::SetTensorWrappers_X(
void GradNodeScale::SetAttributes_scale(float scale) { scale_ = scale; }
std::vector<std::vector<paddle::experimental::Tensor>> GradNodeScale::
operator()(const std::vector<std::vector<paddle::experimental::Tensor>>& grads,
bool create_graph) {
// 1. Check Output Size
PADDLE_ENFORCE(
((grads.size() == 1) && (grads[0].size() == 1)),
......
@@ -39,8 +39,15 @@ class GradNodeScale : public GradNodeBase {
// Functor: perform backward computations
virtual std::vector<std::vector<paddle::experimental::Tensor>> operator()(
const std::vector<std::vector<paddle::experimental::Tensor>>& grads,
bool create_graph = false) override;
void ClearTensorWrappers() override { VLOG(6) << "Do nothing here now"; }
bool IsTensorWrappersCleared() override {
VLOG(6) << "Do nothing here now";
return false;
}
void SetTensorWrappers_X(
const std::vector<paddle::experimental::Tensor>& tensors);
......
@@ -2074,7 +2074,8 @@ static std::string GenerateGradNodeCCContents(
const char* GRAD_FUNCTION_TEMPLATE =
"std::vector<std::vector<paddle::experimental::Tensor>> "
"GradNode%s::operator()(const "
"std::vector<std::vector<paddle::experimental::Tensor>>& grads, "
"bool create_graph) {\n%s\n}";
std::string grad_function_str = paddle::string::Sprintf(
GRAD_FUNCTION_TEMPLATE, fwd_op_type, generated_grad_function_body);
@@ -2109,18 +2110,28 @@ static std::string GenerateGradNodeHeaderContents(
"\n"
" virtual std::vector<std::vector<paddle::experimental::Tensor>> "
"operator()(const "
"std::vector<std::vector<paddle::experimental::Tensor>>& grads, const "
"bool create_graph = false) "
"override;\n"
"\n"
" void ClearTensorWrappers() override { \n"
"%s\n"
" is_tensor_wrappers_cleared = true;\n"
" }\n"
" std::string name() override { return \" GradNode%s \"; } \n "
"\n"
" // SetX, SetY, ...\n"
"%s\n"
" // SetAttrMap\n"
"%s\n"
" bool IsTensorWrappersCleared() override { \n"
" return is_tensor_wrappers_cleared;\n"
" }\n"
" private:\n"
" // TensorWrappers\n"
"%s\n"
" bool is_tensor_wrappers_cleared = false;\n"
"\n"
" // Attribute Map\n"
"%s\n"
"};";
@@ -2154,6 +2165,7 @@ static std::string GenerateGradNodeHeaderContents(
std::string set_tensor_wrappers_str = "";
std::string tensor_wrapper_members_str = "";
std::string clear_tensor_wrappers_str = "";
for (const auto& iter : op_base_infos) {
const std::map<std::string, std::string>& grad_ins_fwd_slotname_map =
iter.GetGradInsFwdSlotnameMap();
@@ -2185,6 +2197,13 @@ static std::string GenerateGradNodeHeaderContents(
SET_TENSOR_WRAPPER_BODY_TEMPLATE, tensor_wrapper_name,
struct_tensor_wrapper_name);
const char* CLEAR_TENSOR_WRAPPER_TEMPLATE =
"for (auto tw: %s) {\n"
" tw.clear();\n"
" }\n";
clear_tensor_wrappers_str += paddle::string::Sprintf(
CLEAR_TENSOR_WRAPPER_TEMPLATE, struct_tensor_wrapper_name);
} else {
const char* ATTR_TENSOR_WRAPPER_ARG_TEMPLATE =
"const paddle::experimental::Tensor& %s";
@@ -2197,10 +2216,14 @@ static std::string GenerateGradNodeHeaderContents(
TENSOR_WRAPPER_MEMBER_TEMPLATE, struct_tensor_wrapper_name);
const char* SET_TENSOR_WRAPPER_BODY_TEMPLATE =
"%s = egr::TensorWrapper(%s, %s /*full_reserved*/);\n";
tensor_wrapper_body_str = paddle::string::Sprintf(
SET_TENSOR_WRAPPER_BODY_TEMPLATE, struct_tensor_wrapper_name,
tensor_wrapper_name, full_reserved_str);
const char* CLEAR_TENSOR_WRAPPER_TEMPLATE = " %s.clear();\n";
clear_tensor_wrappers_str += paddle::string::Sprintf(
CLEAR_TENSOR_WRAPPER_TEMPLATE, struct_tensor_wrapper_name);
}
std::string full_reserved_signature_str = "bool full_reserved";
const char* SET_TENSOR_WRAPPER_TEMPLATE =
@@ -2215,8 +2238,8 @@ static std::string GenerateGradNodeHeaderContents(
std::string grad_node_str = paddle::string::Sprintf(
GRAD_NODE_TEMPLATE, op_type, op_type, op_type, op_type, op_type, op_type,
op_type, clear_tensor_wrappers_str, op_type, set_tensor_wrappers_str,
set_attr_map_str, tensor_wrapper_members_str, attr_members_str);
return grad_node_str;
}
......
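For orientation, here is a rough, hand-written sketch of the kind of class the updated GRAD_NODE_TEMPLATE expands to. The op name "Foo", the member fwd_x_, and the include set are illustrative assumptions, not actual generator output:

// Hedged sketch only: approximate shape of a generated grad node after this change.
#include "paddle/fluid/eager/grad_node_info.h"
#include "paddle/fluid/eager/tensor_wrapper.h"

class GradNodeFoo : public egr::GradNodeBase {
 public:
  GradNodeFoo() : GradNodeBase() {}
  ~GradNodeFoo() override = default;

  virtual std::vector<std::vector<paddle::experimental::Tensor>> operator()(
      const std::vector<std::vector<paddle::experimental::Tensor>>& grads,
      const bool create_graph = false) override;

  void ClearTensorWrappers() override {
    fwd_x_.clear();  // one clear() call is generated per tensor wrapper member
    is_tensor_wrappers_cleared = true;
  }

  std::string name() override { return " GradNodeFoo "; }

  // SetX, SetY, ...
  void SetTensorWrapperX(const paddle::experimental::Tensor& X,
                         bool full_reserved) {
    fwd_x_ = egr::TensorWrapper(X, full_reserved /*full_reserved*/);
  }

  bool IsTensorWrappersCleared() override { return is_tensor_wrappers_cleared; }

 private:
  // TensorWrappers
  egr::TensorWrapper fwd_x_;
  bool is_tensor_wrappers_cleared = false;
};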
@@ -478,6 +478,7 @@ def GenerateNodeDeclaration(fwd_api_name, backward_fwd_input_map,
    # SetTensorWrapper Methods & TensorWrapper Members
    set_tensor_wrapper_methods_str = ""
    tensor_wrapper_members_str = ""
    clear_tensor_wrapper_str = ""
    for tname, (ttype, is_fwd_input, _) in backward_fwd_input_map.items():
        if tname in no_need_buffer_set:
            no_need_buffer = "true"
@@ -499,6 +500,13 @@ def GenerateNodeDeclaration(fwd_api_name, backward_fwd_input_map,
"""
            tensor_wrapper_members_str += PLAIN_TENSOR_MEMBER_TEMPLATE.format(
                tensor_wrapper_name)
            CLEAR_TENSOR_WRAPPERS_TEMPLATE = """
   {}.clear();
"""
            clear_tensor_wrapper_str += CLEAR_TENSOR_WRAPPERS_TEMPLATE.format(
                tensor_wrapper_name)
        else:
            assert IsVectorTensorType(ttype)
            SET_VECTOR_TENSOR_WRAPPER_TEMPLATE = """
@@ -516,6 +524,15 @@ def GenerateNodeDeclaration(fwd_api_name, backward_fwd_input_map,
"""
            tensor_wrapper_members_str += VECTOR_TENSOR_MEMBER_TEMPLATE.format(
                tensor_wrapper_name)
            CLEAR_TENSOR_WRAPPERS_TEMPLATE = """
   for (auto tw: {}) {{
     tw.clear();
   }};
"""
            clear_tensor_wrapper_str += CLEAR_TENSOR_WRAPPERS_TEMPLATE.format(
                tensor_wrapper_name)
    # End: SetTensorWrapper Methods & TensorWrapper Members

    # SetAttributes & Attribute Members
@@ -555,25 +572,37 @@ class {} : public egr::GradNodeBase {{
  ~{}() override = default;

  virtual std::vector<std::vector<paddle::experimental::Tensor>> operator()(
      const std::vector<std::vector<paddle::experimental::Tensor>>& grads, bool create_graph = false) override;

  std::string name() override {{ return \" {} \"; }}

  void ClearTensorWrappers() override {{
    {}
    is_tensor_wrappers_cleared = true;
  }}

  // SetTensorWrapperX, SetTensorWrapperY, ...
  {}
  // SetAttributes
  {}

  bool IsTensorWrappersCleared() override {{
    return is_tensor_wrappers_cleared;
  }}

 private:
  // TensorWrappers
  {}
  bool is_tensor_wrappers_cleared = false;

  // Attributes
  {}
}};
"""
    node_declaration_str = NODE_DECLARATION_TEMPLATE.format(
        grad_node_name, grad_node_name, grad_node_name, grad_node_name,
        grad_node_name, clear_tensor_wrapper_str,
        set_tensor_wrapper_methods_str, set_attribute_methods_str,
        tensor_wrapper_members_str, attribute_members_str)

    return node_declaration_str
@@ -637,7 +666,7 @@ def GenerateNodeDefinition(fwd_api_name, bwd_api_name, backward_fwd_input_map,
    grad_api_namespace = f"paddle::experimental"

    FUNCTION_TEMPLATE = """
std::vector<std::vector<paddle::experimental::Tensor>> {}::operator()(const std::vector<std::vector<paddle::experimental::Tensor>>& grads, bool create_graph) {{
  // Call grad_api function
  auto grad_api_returns = {}::{}({});
  {}
......
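Similarly, a hedged illustration of the definition that FUNCTION_TEMPLATE now emits, with the new create_graph parameter threaded through. FooGradNode and paddle::experimental::foo_grad are made-up placeholders, and the real generated body also recovers tensor wrappers and attributes before calling the grad API:

// Illustrative shape only; not actual generated output.
std::vector<std::vector<paddle::experimental::Tensor>> FooGradNode::operator()(
    const std::vector<std::vector<paddle::experimental::Tensor>>& grads,
    bool create_graph) {
  // Call grad_api function (the {} placeholders are filled in by the generator)
  auto grad_api_returns = paddle::experimental::foo_grad(grads[0][0]);
  // Pack the grad API results into the slot/rank layout the engine expects
  std::vector<std::vector<paddle::experimental::Tensor>> outputs(1);
  outputs[0].emplace_back(grad_api_returns);
  return outputs;
}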
@@ -39,12 +39,21 @@ std::unordered_map<GradNodeBase*, int> getInDegreeMap(
// Copy nodes
std::queue<GradNodeBase*> queue = init_queue;
std::unordered_set<GradNodeBase*> visited;
size_t potential_startup_ops_cnt = queue.size();
size_t cnt = 0;
// Visit each node exactly once in any order
while (!queue.empty()) {
GradNodeBase* node = queue.front();
queue.pop();
if (cnt < potential_startup_ops_cnt) {
if (!node_in_degree_map.count(node)) {
node_in_degree_map[node] = 0;
}
cnt += 1;
}
if (visited.count(node)) {
continue;
}
@@ -76,23 +85,248 @@ std::unordered_map<GradNodeBase*, int> getInDegreeMap(
return node_in_degree_map;
}
// Remove the nodes that do not need to be stored in
// potential_stop_nodes / potential_startup_nodes
void UpdateGraphInfo(
std::unordered_map<GradNodeBase*, AutogradMeta*>*
target_nodes_inputmeta_map,
std::unordered_map<GradNodeBase*, std::unordered_set<GradNodeBase*>>*
depending_nodes,
std::unordered_set<GradNodeBase*>* potential_stop_nodes,
std::unordered_set<GradNodeBase*>* potential_startup_nodes) {
// Update potential_stop_nodes using depending_nodes,
// making sure the path from the root to each target_node is kept
std::unordered_set<GradNodeBase*> _startup_ops;
VLOG(6) << "Running in UpdateGraphInfo";
std::queue<GradNodeBase*> queue;
for (auto& target_nodes_inputmeta_pair : *target_nodes_inputmeta_map) {
queue.emplace(target_nodes_inputmeta_pair.first);
}
while (!queue.empty()) {
auto* target_node = queue.front();
queue.pop();
if (!(*depending_nodes)[target_node].empty()) {
auto precedding_nodes = (*depending_nodes)[target_node];
for (auto pre_nodes : precedding_nodes) {
queue.emplace(pre_nodes);
if (potential_stop_nodes->find(pre_nodes) !=
potential_stop_nodes->end()) {
potential_stop_nodes->erase(pre_nodes);
}
}
} else {  // startup_ops have no preceding nodes
VLOG(6) << "Emplace _startup_ops";
_startup_ops.emplace(target_node);
}
}
// Purify potential_startup_nodes again, removing potential startup nodes
// that cannot reach the input target nodes
if (!_startup_ops.empty()) {
std::unordered_set<GradNodeBase*> potential_startup_nodes_to_be_erased;
for (auto node : *potential_startup_nodes) {
if (_startup_ops.count(node) == 0) {
VLOG(6) << "Set up potential_startup_nodes_to_be_erased";
potential_startup_nodes_to_be_erased.emplace(node);
}
}
if (!potential_startup_nodes_to_be_erased.empty()) {
for (auto node : potential_startup_nodes_to_be_erased) {
VLOG(6) << "Erase nodes in potential_startup_nodes_to_be_erased";
potential_startup_nodes->erase(node);
}
}
}
}
// Get graph info between the input target grad nodes and the outputs,
// and record depending_nodes, potential_stop_nodes, potential_startup_nodes
void GetGraphInfoBetweenTargets(
const std::queue<GradNodeBase*>& init_queue,
std::unordered_map<GradNodeBase*, AutogradMeta*>*
input_target_nodes_inputmeta_map,
std::unordered_map</*child node*/ GradNodeBase*,
/*father nodes*/ std::unordered_set<GradNodeBase*>>*
depending_nodes,
std::unordered_set<GradNodeBase*>* potential_stop_nodes,
std::unordered_set<GradNodeBase*>* potential_startup_nodes) {
if (input_target_nodes_inputmeta_map->empty()) return;
VLOG(6) << "Running in GetGraphInfoBetweenTargets";
// Calculate in_degree for each node
std::unordered_map<GradNodeBase*, int> node_in_degree_map;
// Copy nodes
std::queue<GradNodeBase*> queue = init_queue;
std::unordered_set<GradNodeBase*> visited;
// Visit each node exactly once in any order
while (!queue.empty()) {
GradNodeBase* node = queue.front();
queue.pop();
if (visited.count(node)) {
continue;
}
visited.insert(node);
// Check whether the node is an input target node; if it is, all of its
// next nodes will be marked as potential stop nodes
bool is_potential_stop_nodes =
input_target_nodes_inputmeta_map->count(node);
// Find and append next nodes
const std::vector<std::vector<Edge>>& edges = node->GetEdges();
for (const auto& edge_list : edges) {
for (const Edge& edge : edge_list) {
GradNodeBase* next_node = edge.GetMutableGradNode().get();
// Next node could be nullptr if it is a leaf tensor with no
// AccumulationNode attached,
// or it could originate from dispensable inputs
if (!next_node) continue;
// if the current node is an input target node, all of its next nodes
// will be inserted into potential_stop_nodes
if (is_potential_stop_nodes) {
potential_stop_nodes->emplace(next_node);
}
// Update in_degree
if (!node_in_degree_map.count(next_node))
node_in_degree_map[next_node] = 0;
node_in_degree_map[next_node]++;
// Record depending relationship
(*depending_nodes)[next_node].emplace(node);
queue.push(next_node);
}
}
}
// Update graph info, removing unnecessary stop nodes from potential_stop_nodes
UpdateGraphInfo(input_target_nodes_inputmeta_map, depending_nodes,
potential_stop_nodes, potential_startup_nodes);
}
void GetTargetNodesInfo(const std::vector<paddle::experimental::Tensor>& inputs,
std::unordered_map<GradNodeBase*, AutogradMeta*>*
target_nodes_inputmeta_map) {
VLOG(6) << "Running in GetTargetNodesInfo";
if (!inputs.empty()) {
VLOG(6) << "Inputs are not empty";
size_t num_inputs = inputs.size();
for (size_t i = 0; i < num_inputs; i++) {
AutogradMeta* auto_grad_meta =
EagerUtils::unsafe_autograd_meta(inputs[i]);
auto target_node = auto_grad_meta->GetMutableGradNode().get();
PADDLE_ENFORCE_NOT_NULL(target_node,
paddle::platform::errors::Fatal(
"There is no grad op for input:%d or its "
"stop_gradient=True",
i));
(*target_nodes_inputmeta_map)[target_node] = auto_grad_meta;
}
}
}
std::vector<paddle::experimental::Tensor> GetResults(
const std::vector<paddle::experimental::Tensor>& inputs,
std::unordered_map<GradNodeBase*, paddle::experimental::Tensor>*
results_map,
bool allow_unused, bool create_graph) {
VLOG(6) << "Running in GetResults";
if (inputs.empty()) return {};
std::vector<paddle::experimental::Tensor> results;
results.reserve(inputs.size());
for (size_t i = 0; i < inputs.size(); ++i) {
auto& input = inputs[i];
AutogradMeta* auto_grad_meta = EagerUtils::unsafe_autograd_meta(input);
auto target_node = auto_grad_meta->GetMutableGradNode().get();
auto iter = results_map->find(target_node);
if (iter != results_map->end()) {
// set StopGradient = !create_graph
AutogradMeta* tensor_auto_grad_meta =
EagerUtils::autograd_meta(&(iter->second));
tensor_auto_grad_meta->SetStopGradient(!create_graph);
results.emplace_back(iter->second);
} else {
PADDLE_ENFORCE_EQ(allow_unused, true,
paddle::platform::errors::InvalidArgument(
"The %d-th input does not appear in the backward "
"graph. Please check the input variable or set "
"allow_unused=True to get None result.",
i));
results.emplace_back();
}
}
return results;
}
// Enforce GradNode has TensorWrappers as Input
void EnforceGradNodeHasInput(GradNodeBase* node) {
VLOG(6) << "Running in EnforceGradNodeHasInput";
PADDLE_ENFORCE_NE(
node->IsTensorWrappersCleared(), true,
paddle::platform::errors::Fatal(
"The TensorWrappers of %s do not exist. This may be because:\n"
"You calculate backward twice for the same subgraph without "
"setting retain_graph=True. Please set retain_graph=True in the "
"first backward/grad call.\n",
node->name()));
}
// Purify potential_startup_nodes, removing the nodes that are the same as
// input_target_nodes
void PurifyPotentialStartUpNodes(
std::unordered_set<GradNodeBase*>* potential_startup_nodes,
std::unordered_map<GradNodeBase*, AutogradMeta* /* InputMeta */>*
input_target_nodes_inputmeta_map) {
VLOG(6) << "Running in PurifyPotentialStartUpNodes";
if (input_target_nodes_inputmeta_map->empty()) return;
std::unordered_set<GradNodeBase*> potential_startup_nodes_to_be_erased;
for (auto startup_op : *potential_startup_nodes) {
auto iter = input_target_nodes_inputmeta_map->find(startup_op);
if (iter != input_target_nodes_inputmeta_map->end()) {
potential_startup_nodes_to_be_erased.emplace(iter->first);
}
}
if (!potential_startup_nodes_to_be_erased.empty()) {
for (auto nodes : potential_startup_nodes_to_be_erased) {
potential_startup_nodes->erase(nodes);
}
}
}
std::vector<paddle::experimental::Tensor> RunBackward(
const std::vector<paddle::experimental::Tensor>& tensors, // output
const std::vector<paddle::experimental::Tensor>& grad_tensors,
bool retain_graph, bool create_graph = false,
const std::vector<paddle::experimental::Tensor>& inputs = {},
bool allow_unused = false,
const std::vector<paddle::experimental::Tensor>& no_grad_vars = {}) {
VLOG(6) << "Start Backward";
// *Gradient Hook should happen at node-level
// *Inplace version check should perform at node-level
// *Cross-batch accumulation happens at forward pass
std::unordered_map<GradNodeBase*, AutogradMeta*>
no_grad_var_nodes_inputmeta_map;
// Get no_grad_vars's GradNodes and InputMeta Info
GetTargetNodesInfo(no_grad_vars, &no_grad_var_nodes_inputmeta_map);
/* --- Initialization --- */
// 1. Init queue with starting nodes
// 2. Prepare initial input buffers
std::queue<GradNodeBase*> queue;
std::unordered_map<GradNodeBase*, std::unique_ptr<GradTensorHolder>>
node_input_buffers_dict;
std::unordered_set<GradNodeBase*> potential_startup_nodes;
for (size_t i = 0; i < tensors.size(); i++) {
const paddle::experimental::Tensor& tensor = tensors[i];
@@ -132,8 +366,17 @@ void RunBackward(const std::vector<paddle::experimental::Tensor>& tensors,
"size = 0 or same size as tensors"));
// Feed given tensor if it's provided
VLOG(6) << "Fill grad input tensor " << i << " with given grad tensor";
if (grad_tensors[i].is_initialized()) {
// Deep copy
paddle::experimental::Tensor tmp_tensor;
tmp_tensor.copy_(grad_tensors[i], true);
node_input_buffers_dict[grad_node]->add(input_info.first,
input_info.second, tmp_tensor);
} else {
node_input_buffers_dict[grad_node]->add(
input_info.first, input_info.second, grad_tensors[i]);
}
} else {
VLOG(6) << "Fill grad input tensor " << i << " with 1.0";
@@ -146,8 +389,9 @@ void RunBackward(const std::vector<paddle::experimental::Tensor>& tensors,
input_info.first, input_info.second, tensor, true /*fill_one=true*/);
}
// Prepare queue, potential startup_nodes
queue.push(grad_node);
potential_startup_nodes.emplace(grad_node);
}
VLOG(6) << "Update In degree Map for backward";
@@ -155,25 +399,74 @@ void RunBackward(const std::vector<paddle::experimental::Tensor>& tensors,
std::unordered_map<GradNodeBase*, int> node_in_degree_map =
getInDegreeMap(queue);
// Get input's GradNodes and InputMeta Info
std::unordered_map<GradNodeBase*, AutogradMeta* /* InputMeta */>
input_target_nodes_inputmeta_map;
GetTargetNodesInfo(inputs, &input_target_nodes_inputmeta_map);
// Purify potential_startup_ops, remove those nodes that are the same as
// input_target_nodes
PurifyPotentialStartUpNodes(&potential_startup_nodes,
&input_target_nodes_inputmeta_map);
// Get graph info between the input target grad nodes and the outputs
// Record the depending_nodes and potential_stop_nodes
std::unordered_map<GradNodeBase* /* child node */,
std::unordered_set<GradNodeBase*> /* father node */>
depending_nodes;
std::unordered_set<GradNodeBase*> potential_stop_nodes;
// std::unordered_set<GradNodeBase*> startup_ops;
GetGraphInfoBetweenTargets(queue, &input_target_nodes_inputmeta_map,
&depending_nodes, &potential_stop_nodes,
&potential_startup_nodes);
// ready_queue stores all startup nodes
std::queue<GradNodeBase*> ready_queue;
// startup op's indegree should be 0
for (auto node : potential_startup_nodes) {
if (node_in_degree_map[node] == 0) {
ready_queue.emplace(node);
}
}
VLOG(1) << " startup_ops' size is :" << ready_queue.size();
std::unordered_map<GradNodeBase*, paddle::experimental::Tensor> results_map;
// ready_queue is empty only when: 1. the inputs equal the outputs, or
// 2. the inputs are unreachable from the outputs.
if (ready_queue.size() == 0) {
for (auto input_target_node : input_target_nodes_inputmeta_map) {
// out rank_info of forward op
auto rank_info = input_target_node.second->OutRankInfo();
if (node_input_buffers_dict[input_target_node.first]) {
auto& target_result =
node_input_buffers_dict[input_target_node.first]
->Buffers()[rank_info.first][rank_info.second];
// save the target result
results_map[input_target_node.first] = target_result;
}
}
}
/* --- Topological Visit --- */
// 1. Pop queue
// 2. Run node
// |- Check and capture target result
// |- node(grads)
// |- Prepare for next node
// 3. Update queue
VLOG(6) << "Run Backward";
while (!ready_queue.empty()) {
GradNodeBase* node = ready_queue.front();
VLOG(6) << "Running GradNode:" << node->name();
ready_queue.pop();
paddle::platform::RecordEvent node_record_event(
std::string(typeid(*node).name()) + " grad_node",
paddle::platform::TracerEventType::Operator, 1);
// Run node: This is where Hook happens
PADDLE_ENFORCE(
node_input_buffers_dict.count(node),
@@ -184,10 +477,45 @@ void RunBackward(const std::vector<paddle::experimental::Tensor>& tensors,
std::unique_ptr<GradTensorHolder> node_input_buffer =
std::move(node_input_buffers_dict[node]);
// get target grad_var from node_input_buffer by inputmeta
if (input_target_nodes_inputmeta_map.find(node) !=
input_target_nodes_inputmeta_map.end()) {
VLOG(6) << "Get target result by inputmeta";
// out rank_info of forward op
auto rank_info = input_target_nodes_inputmeta_map[node]->OutRankInfo();
// rank_info is a pair, first means slot_id, second means rank.
auto& target_result =
node_input_buffer->Buffers()[rank_info.first][rank_info.second];
// save the target result
results_map[node] = target_result;
}
// no_grad_vars
if (no_grad_var_nodes_inputmeta_map.find(node) !=
no_grad_var_nodes_inputmeta_map.end()) {
VLOG(6) << "Change the input buffer[slot][rank] by Zeros";
auto rank_info = no_grad_var_nodes_inputmeta_map[node]->OutRankInfo();
node_input_buffer->SetBufferSlotRankZeros(rank_info.first,
rank_info.second);
}
VLOG(6) << "Running GradNode:" << node->name();
// check input
EnforceGradNodeHasInput(node);
VLOG(6) << "Run Backward Kernel with GradTensorHolder";
// Run Pre Backward Node and get outputs
std::vector<std::vector<paddle::experimental::Tensor>> grad_output_tensors =
(*node)(node_input_buffer->Buffers(), create_graph);
// retain_graph or not
if (!retain_graph) {
VLOG(6)
<< "retain_graph is false, need to clear the TensorWrapper of nodes.";
node->ClearTensorWrappers();
}
// TODO(jiabin): Should we erase it or find a more efficient way.
node_input_buffers_dict.erase(node);
@@ -252,18 +580,44 @@ void RunBackward(const std::vector<paddle::experimental::Tensor>& tensors,
// Update queue
node_in_degree_map[next_node]--;
PADDLE_ENFORCE(
node_in_degree_map[next_node] >= 0,
paddle::platform::errors::Fatal(
"Detected in-degree value smaller than zero. For Node: %s"
"Node's in-degree cannot be negative",
next_node->name()));
bool is_potential_stop_node = potential_stop_nodes.count(next_node);
if (node_in_degree_map[next_node] == 0 && !is_potential_stop_node) {
ready_queue.emplace(std::move(next_node));
}
}
}
}
return GetResults(inputs, &results_map, allow_unused, create_graph);
}
void Backward(
const std::vector<paddle::experimental::Tensor>& tensors, // output
const std::vector<paddle::experimental::Tensor>& grad_tensors,
bool retain_graph) {
VLOG(6) << "Run in Backward";
paddle::platform::RecordEvent backward_record_event(
"backward", paddle::platform::TracerEventType::Operator, 1);
RunBackward(tensors, grad_tensors, retain_graph);
}
std::vector<paddle::experimental::Tensor> Grad(
const std::vector<paddle::experimental::Tensor>& tensors, // output
const std::vector<paddle::experimental::Tensor>& inputs,
const std::vector<paddle::experimental::Tensor>& grad_tensors,
bool retain_graph, bool create_graph, bool only_inputs, bool allow_unused,
const std::vector<paddle::experimental::Tensor>& no_grad_vars) {
VLOG(6) << "Run in Grad";
return RunBackward(tensors, grad_tensors, retain_graph, create_graph, inputs,
allow_unused, no_grad_vars);
}
}  // namespace egr
@@ -19,13 +19,21 @@
namespace egr {
// Backward():
//  tensors correspond to those that live in the backward graph
//  each grad_tensors[i] keeps the value for its corresponding tensors[i]
void Backward(const std::vector<paddle::experimental::Tensor>& tensors,
const std::vector<paddle::experimental::Tensor>& grad_tensors,
bool retain_graph = false);
std::vector<paddle::experimental::Tensor> Grad(
const std::vector<paddle::experimental::Tensor>& tensors,
const std::vector<paddle::experimental::Tensor>& inputs,
const std::vector<paddle::experimental::Tensor>& grad_tensors = {},
bool retain_graph = false, bool create_graph = false,
bool only_inputs = false, bool allow_unused = false,
const std::vector<paddle::experimental::Tensor>& no_grad_vars = {});
// Reserved for gradient()
}  // namespace egr
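As a quick orientation for these two entry points, here is a hedged usage sketch; it assumes out is a forward output with a grad node attached and x is a leaf input, wired up the way the tests below do it:

// Minimal sketch only; see grad_test.cc (added below) for a fully wired example.
#include "paddle/fluid/eager/backward.h"

void BackwardAndGrad(const paddle::experimental::Tensor& out,
                     const paddle::experimental::Tensor& x) {
  // Accumulate gradients into the leaf tensors' grad buffers;
  // retain_graph=true keeps TensorWrappers alive for the Grad call below.
  egr::Backward({out}, /*grad_tensors=*/{}, /*retain_graph=*/true);

  // Return the gradients of `out` w.r.t. `x` as a vector of tensors.
  std::vector<paddle::experimental::Tensor> dx =
      egr::Grad({out}, /*inputs=*/{x}, /*grad_tensors=*/{},
                /*retain_graph=*/false, /*create_graph=*/false,
                /*only_inputs=*/false, /*allow_unused=*/false,
                /*no_grad_vars=*/{});
}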
@@ -20,8 +20,8 @@
namespace egr {
std::vector<std::vector<paddle::experimental::Tensor>> RunCustomOpNode::
operator()(const std::vector<std::vector<paddle::experimental::Tensor>>& grads,
bool create_graph) {
paddle::CustomOpKernelContext ctx;
auto grad_inputs_name = paddle::framework::OpMetaInfoHelper::GetInputs(
egr::Controller::Instance().GetOpMetaInfoMap().at(op_type_)[1]);
......
@@ -37,8 +37,8 @@ class RunCustomOpNode : public GradNodeBase {
// Functor: perform backward computations
virtual std::vector<std::vector<paddle::experimental::Tensor>> operator()(
const std::vector<std::vector<paddle::experimental::Tensor>>& grads,
bool create_graph) override;
std::string name() {
return paddle::string::Sprintf("RunCustomOpNode: %s_grad", op_type_);
@@ -62,6 +62,12 @@ class RunCustomOpNode : public GradNodeBase {
return res;
}
void ClearTensorWrappers() override { VLOG(6) << "Do nothing here now"; }
bool IsTensorWrappersCleared() override {
VLOG(6) << "Do nothing here now";
return false;
}
void SetAttrs(const std::vector<paddle::any>& attr) { attrs_ = attr; }
public:
......
@@ -95,8 +95,12 @@ class GradNodeBase {
* is better choice to fit this format.
* **/
virtual std::vector<std::vector<paddle::experimental::Tensor>> operator()(
const std::vector<std::vector<paddle::experimental::Tensor>>& grads,
bool create_graph = false) = 0;
virtual void ClearTensorWrappers() = 0;
virtual bool IsTensorWrappersCleared() = 0;
/**
* AddEdges is designed to set input tensors' backward Node as current
* node's Edges.
......
@@ -21,6 +21,11 @@
namespace egr {
void GradTensorHolder::SetBufferSlotRankZeros(size_t slot_id, size_t rank) {
buffer_[slot_id][rank] =
paddle::experimental::zeros_like(buffer_[slot_id][rank]);
}
void GradTensorHolder::add(size_t slot_id, size_t rank,
const paddle::experimental::Tensor& t,
bool fill_one) {
......
@@ -56,6 +56,8 @@ class GradTensorHolder {
return buffer_;
}
void SetBufferSlotRankZeros(size_t slot_id, size_t rank);
private:
std::vector<std::vector<paddle::experimental::Tensor>> buffer_;
};
......
@@ -98,6 +98,8 @@ class TensorWrapper {
}
}
void clear() { intermidiate_tensor_.reset(); }
private:
bool full_reserved_ = false;
std::pair<size_t, size_t> out_rank_info_;
......
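To tie this back to the engine change above: when retain_graph is false, RunBackward calls ClearTensorWrappers() on every executed node, which in turn calls TensorWrapper::clear() on the saved forward tensors, and a later backward over the same nodes is rejected via IsTensorWrappersCleared(). A minimal usage sketch, assuming out and x are set up as in the new grad_test.cc below:

// Hedged sketch: reusing a graph requires retain_graph=true on earlier calls.
#include "paddle/fluid/eager/backward.h"

std::vector<paddle::experimental::Tensor> GradTwice(
    const paddle::experimental::Tensor& out,
    const paddle::experimental::Tensor& x) {
  // First call keeps TensorWrappers alive so the graph can be replayed.
  auto dx_first = egr::Grad({out}, {x}, /*grad_tensors=*/{},
                            /*retain_graph=*/true);
  // Second call may release them; a third backward over these nodes
  // would then fail the IsTensorWrappersCleared() check.
  auto dx_second = egr::Grad({out}, {x}, /*grad_tensors=*/{},
                             /*retain_graph=*/false);
  return dx_second;
}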
@@ -32,8 +32,8 @@ class GradTestNode : public egr::GradNodeBase {
GradTestNode() : GradNodeBase() { val_ = 1.0; }
std::string name() override { return "GradTestNode"; }
std::vector<std::vector<paddle::experimental::Tensor>> operator()(
const std::vector<std::vector<paddle::experimental::Tensor>>& grads,
bool create_graph = false) override {
val_ = std::dynamic_pointer_cast<phi::DenseTensor>(grads[0][0].impl())
->data<float>()[0];
phi::DenseTensorMeta meta =
@@ -49,6 +49,11 @@ class GradTestNode : public egr::GradNodeBase {
std::vector<std::vector<paddle::experimental::Tensor>> res = {{et1}};
return res;
}
void ClearTensorWrappers() override { VLOG(6) << "Do nothing here now"; }
bool IsTensorWrappersCleared() override {
VLOG(6) << "Do nothing here now";
return false;
}
float val_;
};
}  // namespace eager_test
@@ -58,7 +58,7 @@ void benchmark_eager_scale(const paddle::experimental::Tensor& tensor,
}
std::vector<paddle::experimental::Tensor> target_tensors = {input_tensor};
Backward(target_tensors, {});
if (accuracy_check) {
// Examine Forward Grad (w.r.t max_num_runs = 10)
@@ -80,7 +80,7 @@ void benchmark_eager_matmul(const paddle::experimental::Tensor& X,
}
std::vector<paddle::experimental::Tensor> target_tensors = {input_tensor0};
Backward(target_tensors, {});
if (accuracy_check) {
// Examine Forward Grad (w.r.t max_num_runs = 2)
@@ -106,7 +106,7 @@ void benchmark_eager_intermediate_matmul(const paddle::experimental::Tensor& X,
}
std::vector<paddle::experimental::Tensor> target_tensors = {input_tensor0};
Backward(target_tensors, {});
if (accuracy_check) {
// Examine Forward Grad (w.r.t max_num_runs = 2)
@@ -137,7 +137,7 @@ void benchmark_eager_intermediate_mlp(
reduce_sum_dygraph_function(input0, {{"reduce_all", true}});
std::vector<paddle::experimental::Tensor> target_tensors = {Out};
Backward(target_tensors, {});
if (accuracy_check) {
std::unordered_map<std::string, float> result =
......
@@ -5,6 +5,7 @@ cc_test(test_egr_task_backward SRCS backward_test.cc DEPS ${eager_deps} ${fluid_
cc_test(test_egr_task_hook SRCS hook_test.cc DEPS ${eager_deps} ${fluid_deps} eager_scale scale_node)
cc_test(test_egr_task_cross_batch SRCS cross_batch_accumulation_test.cc DEPS ${eager_deps} ${fluid_deps} eager_scale scale_node)
cc_test(test_egr_task_fwd_bwd_joint SRCS fwd_bwd_joint_test.cc DEPS ${eager_deps} ${fluid_deps} eager_scale scale_node)
cc_test(test_egr_task_grad SRCS grad_test.cc DEPS ${eager_deps} ${fluid_deps} eager_scale scale_node)
if(NOT ((NOT WITH_PYTHON) AND ON_INFER))
cc_test(test_egr_task_hook_intermidiate SRCS hook_test_intermidiate.cc DEPS ${eager_deps} ${fluid_deps} ${generated_deps} dygraph_node)
......
@@ -33,6 +33,7 @@
#include "paddle/phi/core/kernel_registry.h"
PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(copy, CPU, ALL_LAYOUT);
namespace egr {
@@ -79,7 +80,7 @@ TEST(Backward, SingleNodeEmptyGrad) {
}
std::vector<paddle::experimental::Tensor> outs = {target_tensor};
// Run Backward
Backward(outs, {});
// Check Output Value
eager_test::CompareGradTensorWithValue<float>(leaf_tensor, 5.0);
@@ -138,7 +139,7 @@ TEST(Backward, SingleNodeCustomGrad) {
}
// Run Backward
Backward(target_tensors, grad_tensors);
// Check Output Value
eager_test::CompareGradTensorWithValue<float>(leaf_tensor, 50.0);
@@ -211,7 +212,7 @@ TEST(Backward, LinearNodes) {
}
// Use Empty Grad Tensor
Backward(target_tensors, {});
// Check Output Value
eager_test::CompareGradTensorWithValue<float>(leaf_tensor, 50.0);
@@ -315,7 +316,7 @@ TEST(Backward, WithAccumulation) {
node2_ptr->AddEdges(&res2, 0);
}
Backward(target_tensors, grad_tensors);
eager_test::CompareGradTensorWithValue<float>(leaf_tensor, 2500.0);
}
......
@@ -71,12 +71,12 @@ TEST(CrossBatchAccumulation, SingleScaleNode) {
std::vector<egr::AutogradMeta*> res = {meta};
scale_node_ptr->AddEdges(&res, 0);
Backward(target_tensors, {});
eager_test::CompareGradTensorWithValue<float>(target_tensor, 1.0);
eager_test::CompareGradTensorWithValue<float>(leaf_tensor, 5.0);
Backward(target_tensors, {});
eager_test::CompareGradTensorWithValue<float>(target_tensor, 1.0);
eager_test::CompareGradTensorWithValue<float>(leaf_tensor, 10.0);
......
@@ -86,7 +86,7 @@ TEST(FwdBwdJoint, SingleNode) {
std::vector<paddle::experimental::Tensor> outs = {out};
// 4. Run Backward
Backward(outs, {});
VLOG(7) << "Target Grad is: "
<< std::static_pointer_cast<phi::DenseTensor>(
@@ -137,7 +137,7 @@ TEST(FwdBwdJoint, LinearNodes) {
std::vector<paddle::experimental::Tensor> outs = {out1};
// 4. Run Backward
Backward(outs, {});
// Examine Backward Grad
eager_test::CompareGradTensorWithValue<float>(tensor, 10.0);
@@ -203,7 +203,7 @@ TEST(FwdBwdJoint, BranchedNodes) {
// 4. Run Backward
std::vector<paddle::experimental::Tensor> outs = {out1, out2};
Backward(outs, {});
// Examine Backward Grad
eager_test::CompareGradTensorWithValue<float>(tensor, 30.0);
@@ -260,7 +260,7 @@ TEST(FwdBwdJoint, GradientHook) {
// 4. Run Backward
std::vector<paddle::experimental::Tensor> outs = {out1, out2};
Backward(outs, {});
// Examine Backward Grad
// leaf grad
@@ -318,13 +318,13 @@ TEST(FwdBwdJoint, CrossBatchAccumulation) {
// 4. Run Backward
std::vector<paddle::experimental::Tensor> outs = {out1, out2};
Backward(outs, {});
// Examine Backward Grad
eager_test::CompareGradTensorWithValue<float>(tensor, 30.0);
// Cross Batch Accumulation
Backward(outs, {});
// Examine Backward Grad
eager_test::CompareGradTensorWithValue<float>(tensor, 60.0);
@@ -356,7 +356,7 @@ TEST(FwdBwdJoint, SingleNodeCUDA) {
std::vector<paddle::experimental::Tensor> outs = {out};
// 4. Run Backward
Backward(outs, {});
// Examine Backward Grad
eager_test::CompareGradTensorWithValue<float>(tensor, 2.0);
@@ -412,7 +412,7 @@ TEST(FwdBwdJoint, BranchedNodesCUDA) {
// TODO(jiabin): fix this with add functor
// 4. Run Backward
std::vector<paddle::experimental::Tensor> outs = {out1, out2};
Backward(outs, {});
// Examine Backward Grad
eager_test::CompareGradTensorWithValue<float>(tensor, 30.0);
......
@@ -57,7 +57,7 @@ TEST(Generated, Sigmoid) {
std::vector<paddle::experimental::Tensor> target_tensors = {output_tensor};
VLOG(6) << "Running Backward";
Backward(target_tensors, {});
VLOG(6) << "Finish Backward";
eager_test::CompareGradTensorWithValue<float>(tensor, 0.25);
@@ -89,7 +89,7 @@ TEST(Generated, Matmul_v2) {
eager_test::CompareTensorWithValue<float>(output_tensor, 96);
std::vector<paddle::experimental::Tensor> target_tensors = {output_tensor};
Backward(target_tensors, {});
eager_test::CompareGradTensorWithValue<float>(X, 2.0 * 20);
eager_test::CompareGradTensorWithValue<float>(Y, 3.0 * 4);
@@ -120,7 +120,7 @@ TEST(Generated, ElementwiseAdd) {
eager_test::CompareTensorWithValue<float>(output_tensor, 5);
std::vector<paddle::experimental::Tensor> target_tensors = {output_tensor};
Backward(target_tensors, {});
eager_test::CompareGradTensorWithValue<float>(X, 1.0);
eager_test::CompareGradTensorWithValue<float>(Y, 1.0);
......
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <sstream>
#include "glog/logging.h"
#include "gtest/gtest.h"
#include "paddle/fluid/eager/accumulation/accumulation_node.h"
#include "paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h"
#include "paddle/fluid/eager/api/utils/tensor_utils.h"
#include "paddle/fluid/eager/autograd_meta.h"
#include "paddle/fluid/eager/backward.h"
#include "paddle/fluid/eager/grad_node_info.h"
#include "paddle/fluid/eager/tests/test_utils.h"
#include "paddle/fluid/eager/api/all.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_meta.h"
PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT);
PD_DECLARE_KERNEL(copy, CPU, ALL_LAYOUT);
namespace egr {
TEST(Grad, SingleNodeEmptyGrad) {
// Prepare Device Contexts
eager_test::InitEnv(paddle::platform::CPUPlace());
// Prepare Inputs
paddle::framework::DDim ddim = phi::make_ddim({4, 16, 16, 32});
// Create Target Tensor (output)
paddle::experimental::Tensor output_tensor =
egr_utils_api::CreateTensorWithValue(
ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32,
phi::DataLayout::NCHW, 1.0 /*value*/, false /*is_leaf*/);
// Create input tensor
const paddle::experimental::Tensor leaf_tensor =
egr_utils_api::CreateTensorWithValue(
ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32,
phi::DataLayout::NCHW, 1.0 /*value*/, true /*is_leaf*/);
{
// Create Scale Node
auto node0_ptr = std::make_shared<GradNodeScale>(1, 1);
node0_ptr->SetAttributes_scale(5.0 /*scale*/);
// Set grad in/out meta
node0_ptr->SetDefaultGradInOutMeta();
// Set the GradNode, OutRank and StopGradient properties of output_tensor
AutogradMeta* auto_grad_meta = EagerUtils::autograd_meta(&output_tensor);
auto_grad_meta->SetGradNode(
std::dynamic_pointer_cast<GradNodeBase>(node0_ptr));
auto_grad_meta->SetSingleOutRankWithSlot(0, 0);
auto_grad_meta->SetStopGradient(false);
// Get autograd_meta from input tensor
AutogradMeta* auto_grad_meta1 =
EagerUtils::unsafe_autograd_meta(leaf_tensor);
// Connect Tensor and AccumulationNode via AutoGradMeta
auto acc_node_ptr =
std::make_shared<egr::GradNodeAccumulation>(auto_grad_meta1);
// Set the GradNode, OutRank and StopGradient properties of the input tensor
auto_grad_meta1->SetGradNode(
std::dynamic_pointer_cast<GradNodeBase>(acc_node_ptr));
auto_grad_meta1->SetSingleOutRankWithSlot(0, 0);
auto_grad_meta1->SetStopGradient(false);
// grad_node Add Edges
std::vector<egr::AutogradMeta*> res = {auto_grad_meta1};
node0_ptr->AddEdges(&res, 0);
}
std::vector<paddle::experimental::Tensor> outs = {output_tensor};
// Run Grad
auto result = Grad(outs, {leaf_tensor}, {});
// Check Output Value
eager_test::CompareTensorWithValue<float>(result[0], 5.0);
}
TEST(Grad, SingleNodeCustomGrad) {
// Prepare Device Contexts
eager_test::InitEnv(paddle::platform::CPUPlace());
// Prepare Inputs
std::vector<paddle::experimental::Tensor> target_tensors;
paddle::framework::DDim ddim = phi::make_ddim({4, 16, 16, 32});
// Create Target Tensor
paddle::experimental::Tensor tensor = egr_utils_api::CreateTensorWithValue(
ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32,
phi::DataLayout::NCHW, 1.0 /*value*/, false /*is_leaf*/);
target_tensors.emplace_back(std::move(tensor));
std::vector<paddle::experimental::Tensor> grad_tensors;
// Create Grad Tensor
paddle::experimental::Tensor grad_tensor =
egr_utils_api::CreateTensorWithValue(
ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32,
phi::DataLayout::NCHW, 10.0 /*value*/, false /*is_leaf*/);
grad_tensors.emplace_back(std::move(grad_tensor));
paddle::experimental::Tensor leaf_tensor =
egr_utils_api::CreateTensorWithValue(
ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32,
phi::DataLayout::NCHW, 1.0 /*value*/, true /*is_leaf*/);
{
// Create Scale Node
auto node0_ptr = std::make_shared<GradNodeScale>(1, 1);
node0_ptr->SetAttributes_scale(5.0 /*scale*/);
// Set grad in/out meta
node0_ptr->SetDefaultGradInOutMeta();
// Connect Tensor and Node via AutoGradMeta
AutogradMeta* auto_grad_meta =
EagerUtils::autograd_meta(&(target_tensors[0]));
auto_grad_meta->SetGradNode(
std::dynamic_pointer_cast<GradNodeBase>(node0_ptr));
auto_grad_meta->SetSingleOutRankWithSlot(0, 0);
auto_grad_meta->SetStopGradient(false);
AutogradMeta* auto_grad_meta1 = EagerUtils::autograd_meta(&leaf_tensor);
// Connect Tensor and AccumulationNode via AutoGradMeta
auto acc_node_ptr =
std::make_shared<egr::GradNodeAccumulation>(auto_grad_meta1);
auto_grad_meta1->SetGradNode(
std::dynamic_pointer_cast<GradNodeBase>(acc_node_ptr));
auto_grad_meta1->SetSingleOutRankWithSlot(0, 0);
auto_grad_meta1->SetStopGradient(false);
std::vector<egr::AutogradMeta*> res = {auto_grad_meta1};
node0_ptr->AddEdges(&res, 0);
}
auto result = Grad(target_tensors, {leaf_tensor}, grad_tensors);
// Check Output Value
eager_test::CompareTensorWithValue<float>(result[0], 50.0);
}
/*
Node1
|
Node0
|
{ } // empty grad tensor
*/
TEST(Grad, LinearNodes) {
// Prepare Device Contexts
eager_test::InitEnv(paddle::platform::CPUPlace());
// Prepare Target Tensor
std::vector<paddle::experimental::Tensor> target_tensors;
paddle::framework::DDim ddim = phi::make_ddim({4, 16, 16, 32});
// Create Target Tensor
paddle::experimental::Tensor tensor = egr_utils_api::CreateTensorWithValue(
ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32,
phi::DataLayout::NCHW, 1.0 /*value*/, false /*is_leaf*/);
target_tensors.emplace_back(std::move(tensor));
paddle::experimental::Tensor leaf_tensor =
egr_utils_api::CreateTensorWithValue(
ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32,
phi::DataLayout::NCHW, 1.0 /*value*/, true /*is_leaf*/);
{
// Create Node0
auto node0_ptr = std::make_shared<GradNodeScale>(1, 1);
node0_ptr->SetAttributes_scale(5.0 /*scale*/);
// Set grad in/out meta for node0
node0_ptr->SetDefaultGradInOutMeta();
// Create Node1
auto node1_ptr = std::make_shared<GradNodeScale>(1, 1);
node1_ptr->SetAttributes_scale(10.0 /*scale*/);
// Set grad in/out meta for node1
node1_ptr->SetDefaultGradInOutMeta();
// Connect Input Tensor and Node0 via AutoGradMeta
AutogradMeta* auto_grad_meta =
EagerUtils::autograd_meta(&(target_tensors[0]));
auto_grad_meta->SetGradNode(
std::dynamic_pointer_cast<GradNodeBase>(node0_ptr));
auto_grad_meta->SetSingleOutRankWithSlot(0, 0);
auto_grad_meta->SetStopGradient(false);
// Connect Node0 -> Node1 via Edge
auto meta0 = egr::AutogradMeta();
meta0.SetStopGradient(false);
meta0.SetSingleOutRankWithSlot(0, 0);
meta0.SetGradNode(node1_ptr);
std::vector<egr::AutogradMeta*> res0 = {&meta0};
node0_ptr->AddEdges(&res0, 0);
AutogradMeta* auto_grad_meta1 = EagerUtils::autograd_meta(&leaf_tensor);
// Connect Tensor and AccumulationNode via AutoGradMeta
auto acc_node_ptr =
std::make_shared<egr::GradNodeAccumulation>(auto_grad_meta1);
auto_grad_meta1->SetGradNode(
std::dynamic_pointer_cast<GradNodeBase>(acc_node_ptr));
auto_grad_meta1->SetSingleOutRankWithSlot(0, 0);
auto_grad_meta1->SetStopGradient(false);
std::vector<egr::AutogradMeta*> res1 = {auto_grad_meta1};
node1_ptr->AddEdges(&res1, 0);
}
// Use Empty Grad Tensor
auto result = Grad(target_tensors, {leaf_tensor}, {});
// Check Output Value
eager_test::CompareTensorWithValue<float>(result[0], 50.0);
}
/*
Node2
| |
Node0 Node1
| |
in0 in1
*/
TEST(Grad, WithAccumulation) {
// Prepare Device Contexts
eager_test::InitEnv(paddle::platform::CPUPlace());
// Prepare Inputs
paddle::framework::DDim ddim = phi::make_ddim({4, 16, 16, 32});
// Create Target Tensor
std::vector<paddle::experimental::Tensor> target_tensors;
paddle::experimental::Tensor tensor0 = egr_utils_api::CreateTensorWithValue(
ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32,
phi::DataLayout::NCHW, 1.0 /*value*/, false /*is_leaf*/);
paddle::experimental::Tensor tensor1 = egr_utils_api::CreateTensorWithValue(
ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32,
phi::DataLayout::NCHW, 1.0 /*value*/, false /*is_leaf*/);
target_tensors.emplace_back(std::move(tensor0));
target_tensors.emplace_back(std::move(tensor1));
// Create Grad Tensor
std::vector<paddle::experimental::Tensor> grad_tensors;
paddle::experimental::Tensor grad_tensor0 =
egr_utils_api::CreateTensorWithValue(
ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32,
phi::DataLayout::NCHW, 5.0 /*value*/, false /*is_leaf*/);
paddle::experimental::Tensor grad_tensor1 =
egr_utils_api::CreateTensorWithValue(
ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32,
phi::DataLayout::NCHW, 10.0 /*value*/, false /*is_leaf*/);
grad_tensors.emplace_back(std::move(grad_tensor0));
grad_tensors.emplace_back(std::move(grad_tensor1));
paddle::experimental::Tensor leaf_tensor;
{
// Create Node0
auto node0_ptr = std::make_shared<GradNodeScale>(1, 1);
node0_ptr->SetAttributes_scale(5.0 /*scale*/);
node0_ptr->SetDefaultGradInOutMeta();
// Create Node1
auto node1_ptr = std::make_shared<GradNodeScale>(1, 1);
node1_ptr->SetAttributes_scale(10.0 /*scale*/);
node1_ptr->SetDefaultGradInOutMeta();
// Create Node2
auto node2_ptr = std::make_shared<GradNodeScale>(1, 1);
node2_ptr->SetAttributes_scale(20.0 /*scale*/);
node2_ptr->SetDefaultGradInOutMeta();
// Connect Inp0 and Node0 via AutoGradMeta
AutogradMeta* auto_grad_meta0 =
EagerUtils::autograd_meta(&(target_tensors[0]));
auto_grad_meta0->SetGradNode(
std::dynamic_pointer_cast<GradNodeBase>(node0_ptr));
auto_grad_meta0->SetSingleOutRankWithSlot(0, 0);
auto_grad_meta0->SetStopGradient(false);
// Connect Inp1 and Node1 via AutoGradMeta
AutogradMeta* auto_grad_meta1 =
EagerUtils::autograd_meta(&(target_tensors[1]));
auto_grad_meta1->SetGradNode(
std::dynamic_pointer_cast<GradNodeBase>(node1_ptr));
auto_grad_meta1->SetSingleOutRankWithSlot(0, 0);
auto_grad_meta1->SetStopGradient(false);
// Connect Node0 -> Node2 via Edge
auto meta0 = egr::AutogradMeta();
meta0.SetStopGradient(false);
meta0.SetSingleOutRankWithSlot(0, 0);
meta0.SetGradNode(node2_ptr);
std::vector<egr::AutogradMeta*> res0 = {&meta0};
node0_ptr->AddEdges(&res0, 0);
// Connect Node1 -> Node2 via Edge
auto meta1 = egr::AutogradMeta();
meta1.SetStopGradient(false);
meta1.SetSingleOutRankWithSlot(0, 0);
meta1.SetGradNode(node2_ptr);
std::vector<egr::AutogradMeta*> res1 = {&meta1};
node1_ptr->AddEdges(&res1, 0);
AutogradMeta* auto_grad_meta2 = EagerUtils::autograd_meta(&leaf_tensor);
// Connect Tensor and AccumulationNode via AutoGradMeta
auto acc_node_ptr =
std::make_shared<egr::GradNodeAccumulation>(auto_grad_meta2);
auto_grad_meta2->SetGradNode(
std::dynamic_pointer_cast<GradNodeBase>(acc_node_ptr));
auto_grad_meta2->SetSingleOutRankWithSlot(0, 0);
auto_grad_meta2->SetStopGradient(false);
std::vector<egr::AutogradMeta*> res2 = {auto_grad_meta2};
node2_ptr->AddEdges(&res2, 0);
}
auto result = Grad(target_tensors, {leaf_tensor}, grad_tensors);
eager_test::CompareTensorWithValue<float>(result[0], 2500.0);
}
} // namespace egr
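The three cases above exercise egr::Grad directly at the C++ level. For orientation, a minimal sketch (not part of this diff) of the same single-scale-node scenario through the public Python API, assuming an eager-mode build and using paddle.scale as the forward counterpart of GradNodeScale:

import paddle
from paddle.fluid.framework import _test_eager_guard

with _test_eager_guard():                  # run in eager mode, as the Python tests below do
    x = paddle.ones([4, 16, 16, 32], dtype='float32')
    x.stop_gradient = False                # leaf tensor that should receive a gradient
    y = paddle.scale(x, scale=5.0)         # forward op whose backward node scales by 5.0
    dx = paddle.grad([y], [x])             # no grad_tensors given -> implicit all-ones, as in SingleNodeEmptyGrad
    # every element of dx[0] is expected to be 5.0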
...@@ -132,7 +132,7 @@ TEST(RetainGrad, HookBeforeRetainGrad) { ...@@ -132,7 +132,7 @@ TEST(RetainGrad, HookBeforeRetainGrad) {
leaf_tensor); // result: 4.0*5.0 + 3.0 = 23.0 leaf_tensor); // result: 4.0*5.0 + 3.0 = 23.0
} }
RunBackward(target_tensors, {}); Backward(target_tensors, {});
eager_test::CompareGradTensorWithValue<float>(target_tensor, 4.0); eager_test::CompareGradTensorWithValue<float>(target_tensor, 4.0);
eager_test::CompareGradTensorWithValue<float>(leaf_tensor, 23.0); eager_test::CompareGradTensorWithValue<float>(leaf_tensor, 23.0);
...@@ -199,7 +199,7 @@ TEST(RetainGrad, HookAfterRetainGrad) { ...@@ -199,7 +199,7 @@ TEST(RetainGrad, HookAfterRetainGrad) {
leaf_tensor, std::make_shared<egr::CppTensorHook>(hook_function)); leaf_tensor, std::make_shared<egr::CppTensorHook>(hook_function));
} }
RunBackward(target_tensors, {}); Backward(target_tensors, {});
eager_test::CompareGradTensorWithValue<float>(target_tensor, 1.0); eager_test::CompareGradTensorWithValue<float>(target_tensor, 1.0);
eager_test::CompareGradTensorWithValue<float>(leaf_tensor, 23.0); eager_test::CompareGradTensorWithValue<float>(leaf_tensor, 23.0);
} }
......
...@@ -108,7 +108,7 @@ void test_sigmoid(bool is_remove_gradient_hook) { ...@@ -108,7 +108,7 @@ void test_sigmoid(bool is_remove_gradient_hook) {
} }
VLOG(6) << "Runing Backward"; VLOG(6) << "Runing Backward";
RunBackward(target_tensors, {}); Backward(target_tensors, {});
VLOG(6) << "Finish Backward"; VLOG(6) << "Finish Backward";
eager_test::CompareGradTensorWithValue<float>( eager_test::CompareGradTensorWithValue<float>(
...@@ -166,7 +166,7 @@ void test_elementwiseAdd(bool is_remove_gradient_hook) { ...@@ -166,7 +166,7 @@ void test_elementwiseAdd(bool is_remove_gradient_hook) {
grad_node_tmp->RemoveGradientHook(hook_id); grad_node_tmp->RemoveGradientHook(hook_id);
} }
RunBackward(target_tensors, {}); Backward(target_tensors, {});
eager_test::CompareGradTensorWithValue<float>(X, 1.0); eager_test::CompareGradTensorWithValue<float>(X, 1.0);
eager_test::CompareGradTensorWithValue<float>( eager_test::CompareGradTensorWithValue<float>(
...@@ -224,7 +224,7 @@ void test_matmul(bool is_remove_gradient_hook) { ...@@ -224,7 +224,7 @@ void test_matmul(bool is_remove_gradient_hook) {
grad_node_tmp->RemoveGradientHook(hook_id); grad_node_tmp->RemoveGradientHook(hook_id);
} }
RunBackward(target_tensors, {}); Backward(target_tensors, {});
eager_test::CompareGradTensorWithValue<float>(X, 2.0 * 20); eager_test::CompareGradTensorWithValue<float>(X, 2.0 * 20);
eager_test::CompareGradTensorWithValue<float>( eager_test::CompareGradTensorWithValue<float>(
......
...@@ -370,8 +370,8 @@ class GradNodeRunProgram : public egr::GradNodeBase { ...@@ -370,8 +370,8 @@ class GradNodeRunProgram : public egr::GradNodeBase {
~GradNodeRunProgram() override = default; ~GradNodeRunProgram() override = default;
// Functor: perform backward computations // Functor: perform backward computations
virtual std::vector<std::vector<paddle::experimental::Tensor>> operator()( virtual std::vector<std::vector<paddle::experimental::Tensor>> operator()(
const std::vector<std::vector<paddle::experimental::Tensor>> &grads) const std::vector<std::vector<paddle::experimental::Tensor>> &grads,
override { bool create_graph) override {
VLOG(3) << "Running Eager Backward Node: GradNodeRunProgram"; VLOG(3) << "Running Eager Backward Node: GradNodeRunProgram";
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
grads.size(), 1, grads.size(), 1,
...@@ -415,6 +415,12 @@ class GradNodeRunProgram : public egr::GradNodeBase { ...@@ -415,6 +415,12 @@ class GradNodeRunProgram : public egr::GradNodeBase {
// return {x_grad, details::DereferenceTensors(params_grad_ptr)}; // return {x_grad, details::DereferenceTensors(params_grad_ptr)};
} }
void ClearTensorWrappers() override { VLOG(6) << "Do nothing here now"; }
bool IsTensorWrappersCleared() override {
VLOG(6) << "Do nothing here now";
return false;
}
// SetAttrMap // SetAttrMap
void SetAttrMap(const paddle::framework::AttributeMap &attrs) { void SetAttrMap(const paddle::framework::AttributeMap &attrs) {
attrs_ = attrs; attrs_ = attrs;
......
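The extra create_graph flag threaded through operator() above is what lets the Python-side create_graph argument decide whether returned gradients stay attached to the graph. A short sketch of the observable effect (an assumption-laden illustration, not part of this diff; it relies on the "stop_gradient = not create_graph" behaviour asserted in the Python tests further below):

import paddle
from paddle.fluid.framework import _test_eager_guard

with _test_eager_guard():
    x = paddle.to_tensor([2.0], stop_gradient=False)
    dx = paddle.grad([x * x], [x])[0]                      # create_graph defaults to False
    assert dx.stop_gradient                                # returned gradient is detached
    dx2 = paddle.grad([x * x], [x], create_graph=True)[0]  # keep the backward graph
    assert not dx2.stop_gradient                           # dx2 can itself be differentiated later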
...@@ -122,13 +122,33 @@ static PyObject* eager_api_run_backward(PyObject* self, PyObject* args, ...@@ -122,13 +122,33 @@ static PyObject* eager_api_run_backward(PyObject* self, PyObject* args,
EAGER_TRY EAGER_TRY
auto tensors = CastPyArg2VectorOfTensor(PyTuple_GET_ITEM(args, 0), 0); auto tensors = CastPyArg2VectorOfTensor(PyTuple_GET_ITEM(args, 0), 0);
auto grad_tensors = CastPyArg2VectorOfTensor(PyTuple_GET_ITEM(args, 1), 1); auto grad_tensors = CastPyArg2VectorOfTensor(PyTuple_GET_ITEM(args, 1), 1);
egr::RunBackward(tensors, grad_tensors, egr::Backward(tensors, grad_tensors,
CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 2), 2)); CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 2), 2));
Py_INCREF(Py_None); Py_INCREF(Py_None);
return Py_None; return Py_None;
EAGER_CATCH_AND_THROW_RETURN_NULL EAGER_CATCH_AND_THROW_RETURN_NULL
} }
static PyObject* eager_api_run_partial_grad(PyObject* self, PyObject* args,
PyObject* kwargs) {
EAGER_TRY
auto tensors = CastPyArg2VectorOfTensor(PyTuple_GET_ITEM(args, 0), 0);
auto inputs = CastPyArg2VectorOfTensor(PyTuple_GET_ITEM(args, 1), 1);
auto grad_tensors = CastPyArg2VectorOfTensor(PyTuple_GET_ITEM(args, 2), 2);
auto retain_graph = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 3), 3);
auto create_graph = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 4), 4);
auto only_inputs = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 5), 5);
auto allow_unused = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 6), 6);
auto no_grad_vars = CastPyArg2VectorOfTensor(PyTuple_GET_ITEM(args, 7), 7);
std::vector<paddle::experimental::Tensor> result =
egr::Grad(tensors, inputs, grad_tensors, retain_graph, create_graph,
only_inputs, allow_unused, no_grad_vars);
VLOG(1) << " in eager_api_run_partial_grad, after runing egr::Grad";
return ToPyObject(result, true /* return_py_none_if_not_initialize */);
EAGER_CATCH_AND_THROW_RETURN_NULL
}
static PyObject* eager_api_tensor_copy(PyObject* self, PyObject* args, static PyObject* eager_api_tensor_copy(PyObject* self, PyObject* args,
PyObject* kwargs) { PyObject* kwargs) {
EAGER_TRY EAGER_TRY
...@@ -452,6 +472,9 @@ PyMethodDef variable_functions[] = { ...@@ -452,6 +472,9 @@ PyMethodDef variable_functions[] = {
METH_VARARGS | METH_KEYWORDS, NULL}, METH_VARARGS | METH_KEYWORDS, NULL},
{"run_backward", (PyCFunction)(void (*)(void))eager_api_run_backward, {"run_backward", (PyCFunction)(void (*)(void))eager_api_run_backward,
METH_VARARGS | METH_KEYWORDS, NULL}, METH_VARARGS | METH_KEYWORDS, NULL},
{"run_partial_grad",
(PyCFunction)(void (*)(void))eager_api_run_partial_grad,
METH_VARARGS | METH_KEYWORDS, NULL},
{"_run_custom_op", (PyCFunction)(void (*)(void))eager_api_run_costum_op, {"_run_custom_op", (PyCFunction)(void (*)(void))eager_api_run_costum_op,
METH_VARARGS | METH_KEYWORDS, NULL}, METH_VARARGS | METH_KEYWORDS, NULL},
{"tensor_copy", (PyCFunction)(void (*)(void))eager_api_tensor_copy, {"tensor_copy", (PyCFunction)(void (*)(void))eager_api_tensor_copy,
......
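run_partial_grad is not meant to be called directly; fluid.dygraph.grad (see the python/paddle/fluid/dygraph/base.py hunk further below) forwards its arguments positionally in exactly the order unpacked above: outputs, inputs, grad_tensors, retain_graph, create_graph, only_inputs, allow_unused, no_grad_vars. A minimal sketch of the user-level entry point, assuming an eager-mode build:

import paddle
import paddle.fluid as fluid
from paddle.fluid.framework import _test_eager_guard

with _test_eager_guard():
    x = paddle.to_tensor([[1.0, 2.0]], stop_gradient=False)
    y = paddle.matmul(x, paddle.to_tensor([[1.0], [1.0]]))
    # forwarded to core.eager.run_partial_grad(outputs, inputs, grad_outputs,
    # retain_graph, create_graph, only_inputs, allow_unused, no_grad_vars)
    dx = fluid.dygraph.grad(outputs=[y], inputs=[x])
    # dx[0] holds dy/dx, i.e. [[1.0, 1.0]]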
...@@ -492,10 +492,15 @@ PyObject* ToPyObject(const std::vector<double>& value) { ...@@ -492,10 +492,15 @@ PyObject* ToPyObject(const std::vector<double>& value) {
return result; return result;
} }
PyObject* ToPyObject(const std::vector<paddle::experimental::Tensor>& value) { PyObject* ToPyObject(const std::vector<paddle::experimental::Tensor>& value,
bool return_py_none_if_not_initialize) {
PyObject* result = PyList_New((Py_ssize_t)value.size()); PyObject* result = PyList_New((Py_ssize_t)value.size());
for (size_t i = 0; i < value.size(); i++) { for (size_t i = 0; i < value.size(); i++) {
if (!value[i].initialized() && return_py_none_if_not_initialize) {
Py_INCREF(Py_None);
PyList_SET_ITEM(result, static_cast<Py_ssize_t>(i), Py_None);
} else {
PyObject* obj = p_tensor_type->tp_alloc(p_tensor_type, 0); PyObject* obj = p_tensor_type->tp_alloc(p_tensor_type, 0);
if (obj) { if (obj) {
auto v = reinterpret_cast<TensorObject*>(obj); auto v = reinterpret_cast<TensorObject*>(obj);
...@@ -507,6 +512,7 @@ PyObject* ToPyObject(const std::vector<paddle::experimental::Tensor>& value) { ...@@ -507,6 +512,7 @@ PyObject* ToPyObject(const std::vector<paddle::experimental::Tensor>& value) {
} }
PyList_SET_ITEM(result, static_cast<Py_ssize_t>(i), obj); PyList_SET_ITEM(result, static_cast<Py_ssize_t>(i), obj);
} }
}
return result; return result;
} }
......
...@@ -68,7 +68,8 @@ PyObject* ToPyObject(const std::vector<int>& value); ...@@ -68,7 +68,8 @@ PyObject* ToPyObject(const std::vector<int>& value);
PyObject* ToPyObject(const std::vector<int64_t>& value); PyObject* ToPyObject(const std::vector<int64_t>& value);
PyObject* ToPyObject(const std::vector<float>& value); PyObject* ToPyObject(const std::vector<float>& value);
PyObject* ToPyObject(const std::vector<double>& value); PyObject* ToPyObject(const std::vector<double>& value);
PyObject* ToPyObject(const std::vector<paddle::experimental::Tensor>& value); PyObject* ToPyObject(const std::vector<paddle::experimental::Tensor>& value,
bool return_py_none_if_not_initialize = false);
PyObject* ToPyObject(const platform::Place& value); PyObject* ToPyObject(const platform::Place& value);
PyObject* ToPyObject(const framework::LoDTensor* value); PyObject* ToPyObject(const framework::LoDTensor* value);
PyObject* ToPyObject(const paddle::framework::proto::VarType::Type& dtype); PyObject* ToPyObject(const paddle::framework::proto::VarType::Type& dtype);
......
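The practical effect of return_py_none_if_not_initialize: when grad() runs with allow_unused=True and an input never contributes to the outputs, its uninitialized gradient slot is returned to Python as None instead of an empty Tensor. A small sketch of that behaviour (the same check TestEagerGrad.func_simple_example_eager_grad_allow_unused performs below; assumes an eager-mode build):

import paddle
import paddle.fluid as fluid
from paddle.fluid.framework import _test_eager_guard

with _test_eager_guard():
    x = paddle.to_tensor([[1.0, 2.0]], stop_gradient=False)
    z = paddle.to_tensor([[3.0]], stop_gradient=False)      # never used to compute out
    out = paddle.matmul(x, paddle.to_tensor([[1.0], [1.0]]))
    dx, dz = fluid.dygraph.grad(out, [x, z], allow_unused=True)
    # dz is None: its gradient tensor was never initialized, so ToPyObject maps it to Py_None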
...@@ -565,16 +565,25 @@ def grad(outputs, ...@@ -565,16 +565,25 @@ def grad(outputs,
if isinstance(in_out_list, (list, tuple)): if isinstance(in_out_list, (list, tuple)):
assert len(in_out_list) > 0, "{} cannot be empty".format(name) assert len(in_out_list) > 0, "{} cannot be empty".format(name)
for each_var in in_out_list: for each_var in in_out_list:
if core._in_eager_mode():
assert isinstance(
each_var, core.eager.
Tensor), "Elements of {} must be Tensor".format(name)
else:
assert isinstance( assert isinstance(
each_var, each_var,
core.VarBase), "Elements of {} must be Variable".format( core.VarBase), "Elements of {} must be Variable".format(
name) name)
return in_out_list return in_out_list
else: else:
if core._in_eager_mode():
assert isinstance( assert isinstance(
in_out_list, in_out_list, core.eager.
core.VarBase), "{} must be Variable or list of Variable".format( Tensor), "{} must be Tensor or list of Tensor".format(name)
name) else:
assert isinstance(
in_out_list, core.VarBase
), "{} must be Variable or list of Variable".format(name)
return [in_out_list] return [in_out_list]
outputs = check_in_out(outputs, 'outputs') outputs = check_in_out(outputs, 'outputs')
...@@ -586,6 +595,11 @@ def grad(outputs, ...@@ -586,6 +595,11 @@ def grad(outputs,
for each_var in grad_outputs: for each_var in grad_outputs:
if each_var is not None: if each_var is not None:
if core._in_eager_mode():
assert isinstance(
each_var, core.eager.Tensor
), "grad_outputs must be None, a Variable or a list containing None or Variables"
else:
assert isinstance( assert isinstance(
each_var, core.VarBase each_var, core.VarBase
), "grad_outputs must be None, a Variable or a list containing None or Variables" ), "grad_outputs must be None, a Variable or a list containing None or Variables"
...@@ -600,14 +614,27 @@ def grad(outputs, ...@@ -600,14 +614,27 @@ def grad(outputs,
no_grad_vars = [] no_grad_vars = []
elif isinstance(no_grad_vars, core.VarBase): elif isinstance(no_grad_vars, core.VarBase):
no_grad_vars = [no_grad_vars] no_grad_vars = [no_grad_vars]
elif isinstance(no_grad_vars, core.eager.Tensor):
no_grad_vars = [no_grad_vars]
elif isinstance(no_grad_vars, (list, tuple, set)): elif isinstance(no_grad_vars, (list, tuple, set)):
no_grad_vars = list(no_grad_vars) no_grad_vars = list(no_grad_vars)
for var in no_grad_vars: for var in no_grad_vars:
if core._in_eager_mode():
assert isinstance(
var,
core.eager.Tensor), "no_grad_vars can only contains Tensor"
else:
assert isinstance( assert isinstance(
var, core.VarBase), "no_grad_vars can only contains Variable" var,
core.VarBase), "no_grad_vars can only contains Variable"
else: else:
if core._in_eager_mode():
raise AssertionError( raise AssertionError(
"no_grad_vars must be None, Variable or list/tuple/set of Variables") "no_grad_vars must be None, Tensor or list/tuple/set of Tensors")
else:
raise AssertionError(
"no_grad_vars must be None, Variable or list/tuple/set of Variables"
)
assert isinstance(create_graph, bool), "create_graph must be True or False" assert isinstance(create_graph, bool), "create_graph must be True or False"
...@@ -622,6 +649,11 @@ def grad(outputs, ...@@ -622,6 +649,11 @@ def grad(outputs,
assert isinstance(only_inputs, bool), "only_inputs must be True or False" assert isinstance(only_inputs, bool), "only_inputs must be True or False"
assert only_inputs, "only_inputs=False is not supported yet" assert only_inputs, "only_inputs=False is not supported yet"
if core._in_eager_mode():
return core.eager.run_partial_grad(
outputs, inputs, grad_outputs, retain_graph, create_graph,
only_inputs, allow_unused, no_grad_vars)
place = core.Place() place = core.Place()
place.set_place(framework._current_expected_place()) place.set_place(framework._current_expected_place())
return core.dygraph_partial_grad(inputs, outputs, grad_outputs, return core.dygraph_partial_grad(inputs, outputs, grad_outputs,
......
...@@ -52,7 +52,7 @@ class EagerScaleTestCase(unittest.TestCase): ...@@ -52,7 +52,7 @@ class EagerScaleTestCase(unittest.TestCase):
out_eager = core.eager.scale(data_eager, 1.0, 0.9, True, True) out_eager = core.eager.scale(data_eager, 1.0, 0.9, True, True)
self.assertIsNone(data_eager.grad) self.assertIsNone(data_eager.grad)
out_eager.backward(grad_eager, False) out_eager.backward(grad_eager, False)
self.assertTrue(data_eager.grad._is_initialized()) self.assertIsNotNone(data_eager.grad)
self.assertTrue(np.array_equal(data_eager.grad.numpy(), input_data)) self.assertTrue(np.array_equal(data_eager.grad.numpy(), input_data))
def test_retain_grad_and_run_backward_raises(self): def test_retain_grad_and_run_backward_raises(self):
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
# #
# Licensed under the Apache License, Version 2.0 (the "License"); # Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License. # you may not use this file except in compliance with the License.
...@@ -19,6 +19,9 @@ from paddle.vision.models import resnet50, resnet101 ...@@ -19,6 +19,9 @@ from paddle.vision.models import resnet50, resnet101
import unittest import unittest
from unittest import TestCase from unittest import TestCase
import numpy as np import numpy as np
import paddle.compat as cpt
from paddle.fluid.framework import _test_eager_guard
import paddle.fluid.core as core
def _dygraph_guard_(func): def _dygraph_guard_(func):
...@@ -40,6 +43,80 @@ def random_var(size, low=-1, high=1, dtype='float32'): ...@@ -40,6 +43,80 @@ def random_var(size, low=-1, high=1, dtype='float32'):
return fluid.dygraph.to_variable(x_np) return fluid.dygraph.to_variable(x_np)
class TestEagerGrad(TestCase):
def func_simple_example_eager_grad(self):
np.random.seed(2021)
paddle.set_device('cpu')
np_x = np.random.random((3, 3))
np_y = np.random.random((3, 1))
x = paddle.to_tensor(np_x, dtype="float64", stop_gradient=False)
y = paddle.to_tensor(np_y, dtype="float64", stop_gradient=False)
out = paddle.matmul(x, y)
dx = fluid.dygraph.grad(out, x)
dout = np.ones_like(np_y)
expected_dx = np.matmul(dout, np.transpose(np_y))
# stop_gradient == (not create_graph); create_graph defaults to False
self.assertEqual(dx[0].stop_gradient, True)
self.assertTrue(np.allclose(dx[0].numpy(), expected_dx[0]))
def test_simple_example_eager_grad(self):
with _test_eager_guard():
self.func_simple_example_eager_grad()
self.func_simple_example_eager_grad()
def func_simple_example_eager_grad_allow_unused(self):
np.random.seed(2021)
paddle.set_device('cpu')
np_x = np.random.random((3, 3))
np_y = np.random.random((3, 1))
np_z = np.random.random((3, 1))
x = paddle.to_tensor(np_x, dtype="float64", stop_gradient=False)
y = paddle.to_tensor(np_y, dtype="float64", stop_gradient=False)
z = paddle.to_tensor(np_z, dtype="float64", stop_gradient=False)
out_z = paddle.nn.functional.sigmoid(z)
out = paddle.matmul(x, y)
dx = fluid.dygraph.grad(out, [x, z], allow_unused=True)
dout = np.ones_like(np_y)
expected_dx = np.matmul(dout, np.transpose(np_y))
self.assertTrue(np.allclose(dx[0].numpy(), expected_dx[0]))
# stop_gradient == (not create_graph); create_graph defaults to False
self.assertEqual(dx[0].stop_gradient, True)
# x is unused input in the graph
self.assertEqual(dx[1], None)
def test_simple_example_eager_grad_allow_unused(self):
with _test_eager_guard():
self.func_simple_example_eager_grad_allow_unused()
self.func_simple_example_eager_grad_allow_unused()
def func_simple_example_eager_grad_not_allow_unused(self):
np.random.seed(2021)
paddle.set_device('cpu')
np_x = np.random.random((3, 3))
np_y = np.random.random((3, 1))
np_z = np.random.random((3, 1))
x = paddle.to_tensor(np_x, dtype="float64", stop_gradient=False)
y = paddle.to_tensor(np_y, dtype="float64", stop_gradient=False)
z = paddle.to_tensor(np_z, dtype="float64", stop_gradient=False)
out_z = paddle.nn.functional.sigmoid(z)
out = paddle.matmul(x, y)
try:
# allow_unused is False by default
dx = fluid.dygraph.grad(out, [x, z])
except ValueError as e:
error_msg = cpt.get_exception_message(e)
assert error_msg.find("allow_unused") > 0
def test_simple_example_eager_grad_not_allow_unused(self):
with _test_eager_guard():
self.func_simple_example_eager_grad_not_allow_unused()
self.func_simple_example_eager_grad_not_allow_unused()
class TestDygraphDoubleGrad(TestCase): class TestDygraphDoubleGrad(TestCase):
def setUp(self): def setUp(self):
self.sort_sum_gradient = False self.sort_sum_gradient = False
...@@ -64,7 +141,7 @@ class TestDygraphDoubleGrad(TestCase): ...@@ -64,7 +141,7 @@ class TestDygraphDoubleGrad(TestCase):
allow_unused=allow_unused) allow_unused=allow_unused)
@dygraph_guard @dygraph_guard
def test_exception(self): def func_exception(self):
with self.assertRaises(AssertionError): with self.assertRaises(AssertionError):
self.grad(None, None) self.grad(None, None)
...@@ -93,8 +170,13 @@ class TestDygraphDoubleGrad(TestCase): ...@@ -93,8 +170,13 @@ class TestDygraphDoubleGrad(TestCase):
with self.assertRaises(AssertionError): with self.assertRaises(AssertionError):
self.grad([random_var(shape)], [random_var(shape)], no_grad_vars=1) self.grad([random_var(shape)], [random_var(shape)], no_grad_vars=1)
def test_exception(self):
with _test_eager_guard():
self.func_exception()
self.func_exception()
@dygraph_guard @dygraph_guard
def test_simple_example(self): def func_simple_example(self):
x = random_var(self.shape) x = random_var(self.shape)
x.stop_gradient = False x.stop_gradient = False
y = x + 1 y = x + 1
...@@ -123,8 +205,44 @@ class TestDygraphDoubleGrad(TestCase): ...@@ -123,8 +205,44 @@ class TestDygraphDoubleGrad(TestCase):
self.assertNotEqual(grad_with_none_and_not_none.stop_gradient, self.assertNotEqual(grad_with_none_and_not_none.stop_gradient,
create_graph) create_graph)
def test_simple_example(self):
with _test_eager_guard():
self.func_simple_example()
self.func_simple_example()
@dygraph_guard @dygraph_guard
def test_none_one_initial_gradient(self): def func_example_no_grad_vars(self):
x = random_var(self.shape)
x_np = x.numpy()
numel = x_np.size
x.stop_gradient = False
y1 = fluid.layers.relu(x)
y2 = fluid.layers.relu(x)
z = y1 + y2
w = z * z
w_mean = fluid.layers.reduce_mean(w)
del y1, z, w
dx_actual, = self.grad(
[w_mean], [x], create_graph=True, no_grad_vars=[y2])
self.assertFalse(y2.stop_gradient)
self.assertFalse(dx_actual.stop_gradient)
dx_expected = (1.0 / float(numel) * (np.maximum(x_np, 0) + y2.numpy()) *
(x_np > 0) * 2).astype('float32')
self.assertTrue(np.allclose(dx_actual.numpy(), dx_expected))
def test_example_no_grad_vars(self):
with _test_eager_guard():
self.func_example_no_grad_vars()
self.func_example_no_grad_vars()
@dygraph_guard
def func_none_one_initial_gradient(self):
numel = 1 numel = 1
for s in self.shape: for s in self.shape:
numel *= s numel *= s
...@@ -190,8 +308,13 @@ class TestDygraphDoubleGrad(TestCase): ...@@ -190,8 +308,13 @@ class TestDygraphDoubleGrad(TestCase):
np.array_equal(grad_z.numpy(), np.array_equal(grad_z.numpy(),
original_random_grad_z)) original_random_grad_z))
def test_none_one_initial_gradient(self):
with _test_eager_guard():
self.func_none_one_initial_gradient()
self.func_none_one_initial_gradient()
@dygraph_guard @dygraph_guard
def test_example_with_gradient_accumulation_and_create_graph(self): def func_example_with_gradient_accumulation_and_create_graph(self):
x = random_var(self.shape) x = random_var(self.shape)
x_np = x.numpy() x_np = x.numpy()
numel = x_np.size numel = x_np.size
...@@ -214,12 +337,15 @@ class TestDygraphDoubleGrad(TestCase): ...@@ -214,12 +337,15 @@ class TestDygraphDoubleGrad(TestCase):
(x_np > 0) * 2).astype('float32') (x_np > 0) * 2).astype('float32')
self.assertTrue(np.allclose(dx_actual.numpy(), dx_expected)) self.assertTrue(np.allclose(dx_actual.numpy(), dx_expected))
if core._in_eager_mode():
pass
else:
loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x) loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x)
loss.backward(retain_graph=True) loss.backward(retain_graph=True)
x_grad_actual = x.gradient() x_grad_actual = x.gradient()
x_grad_expected = (2.0 / float(numel) * x_grad_expected = (2.0 / float(numel) * (
(x_np + dx_expected * x_np + dx_expected *
(x_np > 0) * 2 / float(numel))).astype('float32') (x_np > 0) * 2 / float(numel))).astype('float32')
self.assertTrue(np.allclose(x_grad_actual, x_grad_expected)) self.assertTrue(np.allclose(x_grad_actual, x_grad_expected))
...@@ -231,8 +357,13 @@ class TestDygraphDoubleGrad(TestCase): ...@@ -231,8 +357,13 @@ class TestDygraphDoubleGrad(TestCase):
(x_np > 0) * 2 / float(numel))).astype('float32') (x_np > 0) * 2 / float(numel))).astype('float32')
self.assertTrue(np.allclose(x_grad_actual, x_grad_expected)) self.assertTrue(np.allclose(x_grad_actual, x_grad_expected))
def test_example_with_gradient_accumulation_and_create_graph(self):
with _test_eager_guard():
self.func_example_with_gradient_accumulation_and_create_graph()
self.func_example_with_gradient_accumulation_and_create_graph()
@dygraph_guard @dygraph_guard
def test_example_with_gradient_accumulation_and_no_grad_vars(self): def func_example_with_gradient_accumulation_and_no_grad_vars(self):
x = random_var(self.shape) x = random_var(self.shape)
x_np = x.numpy() x_np = x.numpy()
numel = x_np.size numel = x_np.size
...@@ -256,17 +387,25 @@ class TestDygraphDoubleGrad(TestCase): ...@@ -256,17 +387,25 @@ class TestDygraphDoubleGrad(TestCase):
(x_np > 0) * 2).astype('float32') (x_np > 0) * 2).astype('float32')
self.assertTrue(np.allclose(dx_actual.numpy(), dx_expected)) self.assertTrue(np.allclose(dx_actual.numpy(), dx_expected))
if core._in_eager_mode():
pass
else:
loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x) loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x)
loss.backward() loss.backward()
x_grad_actual = x.gradient() x_grad_actual = x.gradient()
x_grad_expected = (2.0 / float(numel) * x_grad_expected = (2.0 / float(numel) * (
(x_np + dx_expected * x_np + dx_expected *
(x_np > 0) * 4 / float(numel))).astype('float32') (x_np > 0) * 4 / float(numel))).astype('float32')
self.assertTrue(np.allclose(x_grad_actual, x_grad_expected)) self.assertTrue(np.allclose(x_grad_actual, x_grad_expected))
def test_example_with_gradient_accumulation_and_no_grad_vars(self):
with _test_eager_guard():
self.func_example_with_gradient_accumulation_and_no_grad_vars()
self.func_example_with_gradient_accumulation_and_no_grad_vars()
@dygraph_guard @dygraph_guard
def test_example_with_gradient_accumulation_and_not_create_graph(self): def func_example_with_gradient_accumulation_and_not_create_graph(self):
x = random_var(self.shape) x = random_var(self.shape)
x_np = x.numpy() x_np = x.numpy()
numel = x_np.size numel = x_np.size
...@@ -289,6 +428,9 @@ class TestDygraphDoubleGrad(TestCase): ...@@ -289,6 +428,9 @@ class TestDygraphDoubleGrad(TestCase):
self.assertTrue(np.allclose(dx_actual.numpy(), dx_expected)) self.assertTrue(np.allclose(dx_actual.numpy(), dx_expected))
if core._in_eager_mode():
pass
else:
loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x) loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x)
loss.backward() loss.backward()
...@@ -296,6 +438,11 @@ class TestDygraphDoubleGrad(TestCase): ...@@ -296,6 +438,11 @@ class TestDygraphDoubleGrad(TestCase):
x_grad_expected = (2.0 * x_np / float(numel)).astype('float32') x_grad_expected = (2.0 * x_np / float(numel)).astype('float32')
self.assertTrue(np.allclose(x_grad_actual, x_grad_expected)) self.assertTrue(np.allclose(x_grad_actual, x_grad_expected))
def test_example_with_gradient_accumulation_and_not_create_graph(self):
with _test_eager_guard():
self.func_example_with_gradient_accumulation_and_not_create_graph()
self.func_example_with_gradient_accumulation_and_not_create_graph()
class TestDygraphDoubleGradSortGradient(TestDygraphDoubleGrad): class TestDygraphDoubleGradSortGradient(TestDygraphDoubleGrad):
def setUp(self): def setUp(self):
...@@ -304,7 +451,7 @@ class TestDygraphDoubleGradSortGradient(TestDygraphDoubleGrad): ...@@ -304,7 +451,7 @@ class TestDygraphDoubleGradSortGradient(TestDygraphDoubleGrad):
class TestDygraphDoubleGradVisitedUniq(TestCase): class TestDygraphDoubleGradVisitedUniq(TestCase):
def test_compare(self): def func_compare(self):
value = np.random.uniform(-0.5, 0.5, 100).reshape(10, 2, value = np.random.uniform(-0.5, 0.5, 100).reshape(10, 2,
5).astype("float32") 5).astype("float32")
...@@ -349,6 +496,11 @@ class TestDygraphDoubleGradVisitedUniq(TestCase): ...@@ -349,6 +496,11 @@ class TestDygraphDoubleGradVisitedUniq(TestCase):
self.assertTrue(np.array_equal(grad_1, grad_2)) self.assertTrue(np.array_equal(grad_1, grad_2))
def test_compare(self):
with _test_eager_guard():
self.func_compare()
self.func_compare()
class TestRaiseNoDoubleGradOp(TestCase): class TestRaiseNoDoubleGradOp(TestCase):
def raise_no_grad_op(self): def raise_no_grad_op(self):
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
# #
# Licensed under the Apache License, Version 2.0 (the "License"); # Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License. # you may not use this file except in compliance with the License.
...@@ -18,6 +18,8 @@ import unittest ...@@ -18,6 +18,8 @@ import unittest
from unittest import TestCase from unittest import TestCase
import numpy as np import numpy as np
import paddle import paddle
from paddle.fluid.framework import _test_eager_guard
import paddle.fluid.core as core
def _dygraph_guard_(func): def _dygraph_guard_(func):
...@@ -62,7 +64,7 @@ class TestDygraphDoubleGrad(TestCase): ...@@ -62,7 +64,7 @@ class TestDygraphDoubleGrad(TestCase):
allow_unused=allow_unused) allow_unused=allow_unused)
@dygraph_guard @dygraph_guard
def test_exception(self): def func_exception(self):
with self.assertRaises(AssertionError): with self.assertRaises(AssertionError):
self.grad(None, None) self.grad(None, None)
...@@ -91,8 +93,13 @@ class TestDygraphDoubleGrad(TestCase): ...@@ -91,8 +93,13 @@ class TestDygraphDoubleGrad(TestCase):
with self.assertRaises(AssertionError): with self.assertRaises(AssertionError):
self.grad([random_var(shape)], [random_var(shape)], no_grad_vars=1) self.grad([random_var(shape)], [random_var(shape)], no_grad_vars=1)
def test_exception(self):
with _test_eager_guard():
self.func_exception()
self.func_exception()
@dygraph_guard @dygraph_guard
def test_simple_example(self): def func_simple_example(self):
x = random_var(self.shape) x = random_var(self.shape)
x.stop_gradient = False x.stop_gradient = False
y = x + 1 y = x + 1
...@@ -121,8 +128,13 @@ class TestDygraphDoubleGrad(TestCase): ...@@ -121,8 +128,13 @@ class TestDygraphDoubleGrad(TestCase):
self.assertNotEqual(grad_with_none_and_not_none.stop_gradient, self.assertNotEqual(grad_with_none_and_not_none.stop_gradient,
create_graph) create_graph)
def test_simple_example(self):
with _test_eager_guard():
self.func_simple_example()
self.func_simple_example()
@dygraph_guard @dygraph_guard
def test_none_one_initial_gradient(self): def func_none_one_initial_gradient(self):
numel = 1 numel = 1
for s in self.shape: for s in self.shape:
numel *= s numel *= s
...@@ -188,8 +200,13 @@ class TestDygraphDoubleGrad(TestCase): ...@@ -188,8 +200,13 @@ class TestDygraphDoubleGrad(TestCase):
np.array_equal(grad_z.numpy(), np.array_equal(grad_z.numpy(),
original_random_grad_z)) original_random_grad_z))
def test_none_one_initial_gradient(self):
with _test_eager_guard():
self.func_none_one_initial_gradient()
self.func_none_one_initial_gradient()
@dygraph_guard @dygraph_guard
def test_example_with_gradient_accumulation_and_create_graph(self): def func_example_with_gradient_accumulation_and_create_graph(self):
x = random_var(self.shape) x = random_var(self.shape)
x_np = x.numpy() x_np = x.numpy()
numel = x_np.size numel = x_np.size
...@@ -212,17 +229,25 @@ class TestDygraphDoubleGrad(TestCase): ...@@ -212,17 +229,25 @@ class TestDygraphDoubleGrad(TestCase):
(x_np > 0) * 2).astype('float32') (x_np > 0) * 2).astype('float32')
self.assertTrue(np.allclose(dx_actual.numpy(), dx_expected)) self.assertTrue(np.allclose(dx_actual.numpy(), dx_expected))
if core._in_eager_mode():
pass
else:
loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x) loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x)
loss.backward() loss.backward()
x_grad_actual = x.gradient() x_grad_actual = x.gradient()
x_grad_expected = (2.0 / float(numel) * x_grad_expected = (2.0 / float(numel) * (
(x_np + dx_expected * x_np + dx_expected *
(x_np > 0) * 2 / float(numel))).astype('float32') (x_np > 0) * 2 / float(numel))).astype('float32')
self.assertTrue(np.allclose(x_grad_actual, x_grad_expected)) self.assertTrue(np.allclose(x_grad_actual, x_grad_expected))
def test_example_with_gradient_accumulation_and_create_graph(self):
with _test_eager_guard():
self.func_example_with_gradient_accumulation_and_create_graph()
self.func_example_with_gradient_accumulation_and_create_graph()
@dygraph_guard @dygraph_guard
def test_example_with_gradient_accumulation_and_no_grad_vars(self): def func_example_with_gradient_accumulation_and_no_grad_vars(self):
x = random_var(self.shape) x = random_var(self.shape)
x_np = x.numpy() x_np = x.numpy()
numel = x_np.size numel = x_np.size
...@@ -246,17 +271,25 @@ class TestDygraphDoubleGrad(TestCase): ...@@ -246,17 +271,25 @@ class TestDygraphDoubleGrad(TestCase):
(x_np > 0) * 2).astype('float32') (x_np > 0) * 2).astype('float32')
self.assertTrue(np.allclose(dx_actual.numpy(), dx_expected)) self.assertTrue(np.allclose(dx_actual.numpy(), dx_expected))
if core._in_eager_mode():
pass
else:
loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x) loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x)
loss.backward() loss.backward()
x_grad_actual = x.gradient() x_grad_actual = x.gradient()
x_grad_expected = (2.0 / float(numel) * x_grad_expected = (2.0 / float(numel) * (
(x_np + dx_expected * x_np + dx_expected *
(x_np > 0) * 4 / float(numel))).astype('float32') (x_np > 0) * 4 / float(numel))).astype('float32')
self.assertTrue(np.allclose(x_grad_actual, x_grad_expected)) self.assertTrue(np.allclose(x_grad_actual, x_grad_expected))
def test_example_with_gradient_accumulation_and_no_grad_vars(self):
with _test_eager_guard():
self.func_example_with_gradient_accumulation_and_no_grad_vars()
self.func_example_with_gradient_accumulation_and_no_grad_vars()
@dygraph_guard @dygraph_guard
def test_example_with_gradient_accumulation_and_not_create_graph(self): def func_example_with_gradient_accumulation_and_not_create_graph(self):
x = random_var(self.shape) x = random_var(self.shape)
x_np = x.numpy() x_np = x.numpy()
numel = x_np.size numel = x_np.size
...@@ -279,6 +312,9 @@ class TestDygraphDoubleGrad(TestCase): ...@@ -279,6 +312,9 @@ class TestDygraphDoubleGrad(TestCase):
self.assertTrue(np.allclose(dx_actual.numpy(), dx_expected)) self.assertTrue(np.allclose(dx_actual.numpy(), dx_expected))
if core._in_eager_mode():
pass
else:
loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x) loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x)
loss.backward() loss.backward()
...@@ -286,6 +322,11 @@ class TestDygraphDoubleGrad(TestCase): ...@@ -286,6 +322,11 @@ class TestDygraphDoubleGrad(TestCase):
x_grad_expected = (2.0 * x_np / float(numel)).astype('float32') x_grad_expected = (2.0 * x_np / float(numel)).astype('float32')
self.assertTrue(np.allclose(x_grad_actual, x_grad_expected)) self.assertTrue(np.allclose(x_grad_actual, x_grad_expected))
def test_example_with_gradient_accumulation_and_not_create_graph(self):
with _test_eager_guard():
self.func_example_with_gradient_accumulation_and_not_create_graph()
self.func_example_with_gradient_accumulation_and_not_create_graph()
class TestDygraphDoubleGradSortGradient(TestDygraphDoubleGrad): class TestDygraphDoubleGradSortGradient(TestDygraphDoubleGrad):
def setUp(self): def setUp(self):
......