未验证 提交 0b4c3c20 编写于 作者: Z Zhanlue Yang 提交者: GitHub

[DoubleGrad] Enabled double grad test cases in eager_mode for test_imperative_double_grad (#41451)

* [DoubleGrad] Enabled double grad test cases in eager_mode for test_imperative_double_grad

* Fixed elementwise issue

* Addressed CI failures
上级 c448032b
......@@ -23,7 +23,7 @@ import os
########################
# Op names in the "fill zero for empty grads" set.
# Every op must be its own string literal: "sigmoid_triple_grad, add_double_grad"
# written as one literal is a single element that matches neither op name.
ops_to_fill_zero_for_empty_grads = set([
    "split_grad", "rnn_grad", "matmul_double_grad", "matmul_triple_grad",
    "sigmoid_triple_grad", "add_double_grad"
])
# For API dispatch used at python-level
......
......@@ -205,6 +205,7 @@ FORWARD_FUNCTION_TEMPLATE = \
#endif
}}
// Forward API Call
VLOG(3) << \"Final State Running: \" << \"{}\";
{}
// Get Outputs
{}
......@@ -505,15 +506,11 @@ class DygraphFunctionGeneratorBase(FunctionGeneratorBase):
for i in range(len(forward_attrs_list)):
orig_attr_type = orig_forward_attrs_list[i][1]
orig_attr_default = orig_forward_attrs_list[i][2]
orig_attr_pos = orig_forward_attrs_list[i][3]
forward_attr_type = forward_attrs_list[i][1]
forward_attr_default = forward_attrs_list[i][2]
forward_attr_pos = forward_attrs_list[i][3]
assert orig_attr_type == forward_attr_type, AssertMessage(
orig_attr_type, forward_attr_type)
assert orig_attr_default == forward_attr_default, AssertMessage(
orig_attr_default, forward_attr_default)
assert orig_attr_pos == forward_attr_pos, AssertMessage(
orig_attr_pos, forward_attr_pos)
......@@ -753,6 +750,15 @@ class DygraphFunctionGeneratorBase(FunctionGeneratorBase):
set_grad_out_meta_list = []
set_edges_list = []
for name, (_, pos) in forward_inputs_position_map.items():
# Has corresponding grad output
has_corresponding_grad_output = False
for _, (_, corresponding_pos,
_) in backward_grad_outputs_map.items():
if pos == corresponding_pos:
has_corresponding_grad_output = True
if not has_corresponding_grad_output:
continue
input_autograd_meta_name = GetAutoGradMetaName(name)
is_optional = (name in self.optional_inputs)
if is_optional:
......@@ -1063,9 +1069,10 @@ class DygraphForwardFunctionGenerator(DygraphFunctionGeneratorBase):
self.forward_definition_str += FORWARD_FUNCTION_TEMPLATE.format(
returns_type_str, forward_function_name, inputs_args_definition_str,
dygraph_event_str, amp_logic_str, inputs_autograd_meta_str,
forward_call_str, get_outputs_str, outputs_autograd_meta_str,
compute_require_grad_args_str, check_inplace_str,
bump_inplace_version_str, node_creation_str, returns_str)
forward_function_name, forward_call_str, get_outputs_str,
outputs_autograd_meta_str, compute_require_grad_args_str,
check_inplace_str, bump_inplace_version_str, node_creation_str,
returns_str)
self.forward_declaration_str += f"{returns_type_str} {forward_function_name}({inputs_args_declaration_str});\n"
logging.info(
......@@ -1439,28 +1446,18 @@ class DygraphNodeGenerator(DygraphFunctionGeneratorBase):
compute_require_grad_str += f"{indent}bool require_any_grad = egr::EagerUtils::ComputeRequireGrad({compute_require_grad_args_str});"
# Construct grad_api returns
num_bwd_outputs = len(backward_grad_outputs_map.keys())
slot_num_bwd_outputs = len(self.forward_inputs_position_map.keys())
returns_str = f"{indent}std::vector<std::vector<paddle::experimental::Tensor>> returns({slot_num_bwd_outputs});\n"
for name, (ttype, fwd_position,
grad_api_position) in backward_grad_outputs_map.items():
transformed_tensor_name = self.TransformToNextGradName(name)
# Infer Grad API Return Type
if num_bwd_outputs == 1:
# Single tensor output, return as is
if IsPlainTensorType(ttype):
returns_str += f"{indent}returns[0] = {{ {transformed_tensor_name} }};\n"
else:
assert IsVectorTensorType(ttype)
returns_str += f"{indent}returns[0] = {transformed_tensor_name};\n"
# Rearrange output order accordingly
if IsPlainTensorType(ttype):
returns_str += f"{indent}returns[{fwd_position}] = {{ {transformed_tensor_name} }};\n"
else:
# Rearrange output order accordingly
if IsPlainTensorType(ttype):
returns_str += f"{indent}returns[{fwd_position}] = {{ {transformed_tensor_name} }};\n"
else:
assert IsVectorTensorType(ttype)
returns_str += f"{indent}returns[{fwd_position}] = {transformed_tensor_name};\n"
assert IsVectorTensorType(ttype)
returns_str += f"{indent}returns[{fwd_position}] = {transformed_tensor_name};\n"
returns_str += f"{indent}if(NeedComplexToRealConversion()) HandleComplexGradToRealGrad(&returns);\n"
returns_str += f"{indent}return returns;\n"
......
......@@ -485,6 +485,7 @@ std::unordered_map<GradNodeBase*, int> getInDegreeMap(
}
}
}
return node_in_degree_map;
}
......@@ -526,6 +527,7 @@ std::vector<paddle::experimental::Tensor> RunBackward(
bool allow_unused = false,
const std::vector<paddle::experimental::Tensor>& no_grad_vars = {}) {
VLOG(6) << "Start Backward";
// *Gradient Hook should happen at node-level
// *Inplace version check should be performed at node-level
// *Cross-batch accumulation happens at forward pass
......@@ -729,6 +731,16 @@ std::vector<paddle::experimental::Tensor> RunBackward(
continue;
}
auto* next_node = next_node_shared.get();
if (!node_input_buffers_dict.count(next_node)) {
const auto& input_meta = next_node->InputMeta();
auto grad_tensor_holder =
std::make_unique<GradTensorHolder>(input_meta);
VLOG(6) << "Construct GradTensorHolder for grad node: "
<< next_node->name();
node_input_buffers_dict[next_node] = std::move(grad_tensor_holder);
}
PADDLE_ENFORCE_LT(
j, grad_output_tensors[i].size(),
paddle::platform::errors::Fatal(
......@@ -748,15 +760,6 @@ std::vector<paddle::experimental::Tensor> RunBackward(
<< ", rank: " << j
<< " 's name is: " << grad_output_tensor.name();
auto* next_node = next_node_shared.get();
if (!node_input_buffers_dict.count(next_node)) {
const auto& input_meta = next_node->InputMeta();
auto grad_tensor_holder =
std::make_unique<GradTensorHolder>(input_meta);
VLOG(6) << "Construct GradTensorHolder for grad node: "
<< next_node->name();
node_input_buffers_dict[next_node] = std::move(grad_tensor_holder);
}
VLOG(6) << "Sum grad inputs for edge slot: " << edge_rank.first
<< ", rank: " << edge_rank.second;
node_input_buffers_dict[next_node]->add(
......
......@@ -63,9 +63,9 @@ void AddGradKernel(const Context& dev_ctx,
template <typename T, typename Context>
void AddDoubleGradKernel(const Context& dev_ctx,
const DenseTensor& y,
const DenseTensor& dout,
paddle::optional<const DenseTensor&> ddx,
paddle::optional<const DenseTensor&> ddy,
const DenseTensor& dout,
int axis,
DenseTensor* ddout) {
phi::AddDoubleGradImpl<T>(dev_ctx, y, ddx, ddy, dout, axis, ddout);
......
......@@ -31,9 +31,9 @@ void AddGradKernel(const Context& dev_ctx,
template <typename T, typename Context>
void AddDoubleGradKernel(const Context& dev_ctx,
const DenseTensor& y,
const DenseTensor& dout,
paddle::optional<const DenseTensor&> ddx,
paddle::optional<const DenseTensor&> ddy,
const DenseTensor& dout,
int axis,
DenseTensor* ddout);
......
......@@ -56,9 +56,9 @@ void AddGradKernel(const Context& dev_ctx,
template <typename T, typename Context>
void AddDoubleGradKernel(const Context& dev_ctx,
const DenseTensor& y,
const DenseTensor& dout,
paddle::optional<const DenseTensor&> ddx,
paddle::optional<const DenseTensor&> ddy,
const DenseTensor& dout,
int axis,
DenseTensor* ddout) {
phi::AddDoubleGradImpl<T>(dev_ctx, y, ddx, ddy, dout, axis, ddout);
......
......@@ -115,7 +115,7 @@ KernelSignature ElementwiseAddGradOpArgumentMapping(
KernelSignature ElementwiseAddDoubleGradOpArgumentMapping(
const ArgumentMappingContext& ctx) {
return KernelSignature(
"add_double_grad", {"Y", "DDX", "DDY", "DOut"}, {"axis"}, {"DDOut"});
"add_double_grad", {"Y", "DOut", "DDX", "DDY"}, {"axis"}, {"DDOut"});
}
KernelSignature ElementwiseAddTripleGradOpArgumentMapping(
......
......@@ -15,7 +15,7 @@
from __future__ import print_function
from .. import core
from ..framework import Variable, convert_np_dtype_to_dtype_, _varbase_creator
from ..framework import Variable, convert_np_dtype_to_dtype_, _varbase_creator, _in_legacy_dygraph, in_dygraph_mode
from ..layers.layer_function_generator import OpProtoHolder
from . import no_grad
from .. import framework
......@@ -62,6 +62,15 @@ _complex_dtypes = [
_already_patch_varbase = False
_already_patch_eager_tensor = False
# Dispatch to final state Python-C functions
_final_state_op_type_mapping = {
"elementwise_add": "final_state_add",
"elementwise_sub": "final_state_subtract",
"elementwise_div": "final_state_divide",
"elementwise_mul": "final_state_multiply",
"matmul_v2": "final_state_matmul",
}
def monkey_patch_math_varbase():
"""
......@@ -105,10 +114,15 @@ def monkey_patch_math_varbase():
"""
if not isinstance(dtype, core.VarDesc.VarType):
dtype = convert_np_dtype_to_dtype_(dtype)
return _C_ops.cast(self, 'in_dtype', self.dtype, 'out_dtype', dtype)
if _in_legacy_dygraph():
return _C_ops.cast(self, 'in_dtype', self.dtype, 'out_dtype', dtype)
return _C_ops.final_state_cast(self, dtype)
def _scalar_elementwise_op_(var, scale, bias):
return _C_ops.scale(var, 'scale', scale, 'bias', bias)
if _in_legacy_dygraph():
return _C_ops.scale(var, 'scale', scale, 'bias', bias)
return _C_ops.final_state_scale(var, float(scale), bias, True)
def _neg_(var):
return _scalar_elementwise_op_(var, -1.0, 0.0)
......@@ -164,7 +178,10 @@ def monkey_patch_math_varbase():
perm = []
for i in range(len(var.shape)):
perm.insert(0, i)
out, _ = _C_ops.transpose2(var, 'axis', perm)
if _in_legacy_dygraph():
out, _ = _C_ops.transpose2(var, 'axis', perm)
else:
out = _C_ops.final_state_transpose(var, perm)
return out
def _scalar_add_(var, value):
......@@ -270,11 +287,13 @@ def monkey_patch_math_varbase():
# 4. calculation
axis = -1
if framework._in_eager_mode_ and op_type == 'elementwise_add':
math_op = getattr(_C_ops, 'final_state_add')
if in_dygraph_mode(
) and op_type in _final_state_op_type_mapping.keys():
math_op = getattr(_C_ops, _final_state_op_type_mapping[op_type])
return math_op(self, other_var)
else:
math_op = getattr(_C_ops, op_type)
return math_op(self, other_var, 'axis', axis)
return math_op(self, other_var, 'axis', axis)
comment = OpProtoHolder.instance().get_op_proto(op_type).comment
......
......@@ -9036,7 +9036,10 @@ def relu(x, name=None):
# [[0. 0. ]
# [1. 2.6]]
"""
if _non_static_mode():
if in_dygraph_mode():
return _C_ops.final_state_relu(x)
if _in_legacy_dygraph():
return _C_ops.relu(x)
check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'relu')
......
......@@ -385,26 +385,23 @@ class TestDygraphDoubleGrad(TestCase):
(x_np > 0) * 2).astype('float32')
self.assertTrue(np.allclose(dx_actual.numpy(), dx_expected))
if not _in_legacy_dygraph():
pass
else:
loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x)
loss.backward(retain_graph=True)
loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x)
loss.backward(retain_graph=True)
x_grad_actual = x.gradient()
x_grad_expected = (2.0 / float(numel) *
(x_np + dx_expected *
(x_np > 0) * 2 / float(numel))).astype('float32')
self.assertTrue(np.allclose(x_grad_actual, x_grad_expected))
for i in range(5):
loss.backward(retain_graph=True)
x_grad_actual = x.gradient()
x_grad_expected = (2.0 / float(numel) * (
x_grad_expected = (i + 2) * (2.0 / float(numel) * (
x_np + dx_expected *
(x_np > 0) * 2 / float(numel))).astype('float32')
self.assertTrue(np.allclose(x_grad_actual, x_grad_expected))
for i in range(5):
loss.backward(retain_graph=True)
x_grad_actual = x.gradient()
x_grad_expected = (i + 2) * (2.0 / float(numel) * (
x_np + dx_expected *
(x_np > 0) * 2 / float(numel))).astype('float32')
self.assertTrue(np.allclose(x_grad_actual, x_grad_expected))
def test_example_with_gradient_accumulation_and_create_graph(self):
with _test_eager_guard():
self.func_example_with_gradient_accumulation_and_create_graph()
......@@ -426,7 +423,10 @@ class TestDygraphDoubleGrad(TestCase):
del y1, z, w
dx_actual, = self.grad(
[w_mean], [x], create_graph=True, no_grad_vars=[y2])
[w_mean], [x],
retain_graph=True,
create_graph=True,
no_grad_vars=[y2])
self.assertFalse(y2.stop_gradient)
self.assertFalse(dx_actual.stop_gradient)
......@@ -435,17 +435,14 @@ class TestDygraphDoubleGrad(TestCase):
(x_np > 0) * 2).astype('float32')
self.assertTrue(np.allclose(dx_actual.numpy(), dx_expected))
if not _in_legacy_dygraph():
pass
else:
loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x)
loss.backward()
loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x)
loss.backward()
x_grad_actual = x.gradient()
x_grad_expected = (2.0 / float(numel) * (
x_np + dx_expected *
(x_np > 0) * 4 / float(numel))).astype('float32')
self.assertTrue(np.allclose(x_grad_actual, x_grad_expected))
x_grad_actual = x.gradient()
x_grad_expected = (2.0 / float(numel) *
(x_np + dx_expected *
(x_np > 0) * 4 / float(numel))).astype('float32')
self.assertTrue(np.allclose(x_grad_actual, x_grad_expected))
def test_example_with_gradient_accumulation_and_no_grad_vars(self):
with _test_eager_guard():
......@@ -476,15 +473,12 @@ class TestDygraphDoubleGrad(TestCase):
self.assertTrue(np.allclose(dx_actual.numpy(), dx_expected))
if not _in_legacy_dygraph():
pass
else:
loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x)
loss.backward()
loss = fluid.layers.reduce_mean(dx_actual * dx_actual + x * x)
loss.backward()
x_grad_actual = x.gradient()
x_grad_expected = (2.0 * x_np / float(numel)).astype('float32')
self.assertTrue(np.allclose(x_grad_actual, x_grad_expected))
x_grad_actual = x.gradient()
x_grad_expected = (2.0 * x_np / float(numel)).astype('float32')
self.assertTrue(np.allclose(x_grad_actual, x_grad_expected))
def test_example_with_gradient_accumulation_and_not_create_graph(self):
with _test_eager_guard():
......
......@@ -30,6 +30,18 @@
kernel :
func : acosh_grad
# Backward entry for add_grad (second-order grad of add).
# Inputs: y, grad_out, plus grad_x_grad / grad_y_grad, which are declared optional.
# Produces a single output, grad_out_grad; infer_meta runs UnchangedInferMeta on grad_out.
# Chains to add_triple_grad as its own backward.
- backward_api : add_double_grad
forward : add_grad (Tensor x, Tensor y, Tensor grad_out, int axis = -1) -> Tensor(grad_x), Tensor(grad_y)
args : (Tensor y, Tensor grad_out, Tensor grad_x_grad, Tensor grad_y_grad, int axis = -1)
output : Tensor(grad_out_grad)
infer_meta :
func : UnchangedInferMeta
param : [grad_out]
kernel :
func : add_double_grad
optional : grad_x_grad, grad_y_grad
backward : add_triple_grad
- backward_api : add_grad
forward : add (Tensor x, Tensor y) -> Tensor(out)
args : (Tensor x, Tensor y, Tensor out_grad, int axis = -1)
......@@ -40,6 +52,7 @@
kernel :
func : add_grad
no_need_buffer : x, y
backward : add_double_grad
- backward_api : add_n_grad
forward : add_n (Tensor[] x) -> Tensor(out)
......@@ -48,6 +61,16 @@
invoke : add_n_grad_impl(x, out_grad)
no_need_buffer : x
# Backward entry for add_double_grad (third-order grad of add).
# Takes grad_grad_x, grad_grad_y and grad_grad_out_grad; produces grad_grad_x_grad and
# grad_grad_y_grad, with infer_meta GeneralBinaryGradInferMeta over the two grad_grad inputs.
# No `backward` key is declared, so no higher-order entry is chained from here.
# NOTE(review): this entry names the double-grad inputs grad_grad_x / grad_grad_y, while the
# add_double_grad entry's own args call them grad_x_grad / grad_y_grad (and mark them
# optional) - confirm the naming mismatch and the missing `optional` are intended.
- backward_api : add_triple_grad
forward : add_double_grad (Tensor y, Tensor grad_out, Tensor grad_grad_x, Tensor grad_grad_y, int axis = -1) -> Tensor(grad_grad_out)
args : (Tensor grad_grad_x, Tensor grad_grad_y, Tensor grad_grad_out_grad, int axis = -1)
output : Tensor(grad_grad_x_grad), Tensor(grad_grad_y_grad)
infer_meta :
func : GeneralBinaryGradInferMeta
param : [grad_grad_x, grad_grad_y]
kernel :
func : add_triple_grad
- backward_api : addmm_grad
forward : addmm (Tensor input, Tensor x, Tensor y, float alpha, float beta) -> Tensor(out)
args : (Tensor input, Tensor x, Tensor y, Tensor out_grad, float alpha, float beta)
......@@ -934,6 +957,12 @@
kernel :
func : mean_all_grad
# Backward entry for mean_grad. Defined through `invoke` rather than a kernel:
# it calls mean on grad_x_grad with dims and keep_dim to produce grad_out_grad.
# reduce_all is accepted in args but is not passed on to the invoked mean call.
- backward_api : mean_double_grad
forward: mean_grad (Tensor x, Tensor grad_out, int64_t[] dims={}, bool keep_dim=false, bool reduce_all = false) -> Tensor(grad_x)
args : (Tensor grad_x_grad, int64_t[] dims={}, bool keep_dim=false, bool reduce_all=false)
output : Tensor(grad_out_grad)
invoke : mean(grad_x_grad, dims, keep_dim)
- backward_api : mean_grad
forward: mean (Tensor x, int64_t[] dims={}, bool keep_dim=false) -> Tensor(out)
args : (Tensor x, Tensor out_grad, int64_t[] dims={}, bool keep_dim=false, bool reduce_all=false)
......@@ -943,6 +972,7 @@
param: [x]
kernel :
func : mean_grad
backward : mean_double_grad
no_need_buffer : x
- backward_api : meshgrid_grad
......@@ -1025,6 +1055,17 @@
func : multiplex_grad
param : [ids, out_grad]
# Backward entry for multiply_grad (second-order grad of multiply).
# Inputs: x, y, grad_out, plus grad_x_grad / grad_y_grad, which are declared optional.
# Produces three outputs (x_grad, y_grad, grad_out_grad); infer_meta is
# GeneralTernaryGradInferMeta over [x, y, grad_out].
# No `backward` key is declared, so no third-order entry is chained from here.
- backward_api : multiply_double_grad
forward : multiply_grad (Tensor x, Tensor y, Tensor grad_out, int axis = -1) -> Tensor(grad_x), Tensor(grad_y)
args : (Tensor x, Tensor y, Tensor grad_out, Tensor grad_x_grad, Tensor grad_y_grad, int axis = -1)
output : Tensor(x_grad), Tensor(y_grad), Tensor(grad_out_grad)
infer_meta :
func : GeneralTernaryGradInferMeta
param : [x, y, grad_out]
kernel :
func : multiply_double_grad
optional : grad_x_grad, grad_y_grad
- backward_api : multiply_grad
forward : multiply (Tensor x, Tensor y) -> Tensor(out)
args : (Tensor x, Tensor y, Tensor out_grad, int axis = -1)
......@@ -1034,6 +1075,7 @@
param : [x, y]
kernel :
func : multiply_grad
backward : multiply_double_grad
- backward_api : mv_grad
forward : mv (Tensor x, Tensor vec) -> Tensor(out)
......@@ -1184,10 +1226,10 @@
- backward_api : relu_double_grad
forward : relu_grad (Tensor out, Tensor grad_out) -> Tensor(grad_x)
args : (Tensor out, Tensor grad_x_grad)
output : Tensor(out_grad), Tensor(grad_out_grad)
output : Tensor(grad_out_grad)
infer_meta :
func : GeneralBinaryGradInferMeta
param : [out, out]
func : UnchangedInferMeta
param : [out]
kernel :
func : relu_double_grad
......@@ -1270,11 +1312,25 @@
kernel :
func : rsqrt_grad
# Backward entry for scale_grad. Defined through `invoke`: it calls scale on grad_x_grad
# with the same scale and bias_after_scale, but passes the literal 0.0 as bias instead of
# the `bias` argument. Chains to scale_triple_grad as its own backward.
- backward_api : scale_double_grad
forward : scale_grad (Tensor grad_out, Scalar scale, float bias, bool bias_after_scale) -> Tensor(grad_x)
args : (Tensor grad_x_grad, Scalar scale=1.0, float bias=0.0, bool bias_after_scale=true)
output : Tensor(grad_out_grad)
invoke : scale(grad_x_grad, scale, 0.0, bias_after_scale)
backward : scale_triple_grad
# Backward entry for scale. Defined through `invoke`: it calls scale on out_grad with the
# same scale and bias_after_scale, passing the literal 0.0 as bias instead of the `bias`
# argument. Chains to scale_double_grad as its own backward.
- backward_api : scale_grad
forward : scale (Tensor x, Scalar scale, float bias, bool bias_after_scale) -> Tensor(out)
args : (Tensor out_grad, Scalar scale=1.0, float bias=0.0, bool bias_after_scale=true)
output : Tensor(x_grad)
invoke : scale(out_grad, scale, 0.0, bias_after_scale)
backward : scale_double_grad
# Backward entry for scale_double_grad. Defined through `invoke`: it calls scale on
# grad_grad_out_grad with the same scale and bias_after_scale, passing the literal 0.0 as
# bias. No `backward` key is declared, so the chain of scale grads ends here.
- backward_api : scale_triple_grad
forward : scale_double_grad (Tensor grad_grad_x, Scalar scale, float bias, bool bias_after_scale) -> Tensor(grad_grad_out)
args : (Tensor grad_grad_out_grad, Scalar scale=1.0, float bias=0.0, bool bias_after_scale=true)
output : Tensor(grad_grad_x_grad)
invoke : scale(grad_grad_out_grad, scale, 0.0, bias_after_scale)
- backward_api : scatter_grad
forward : scatter (Tensor x, Tensor index, Tensor updates, bool overwrite) -> Tensor(out)
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册