Commit f484a61e (unverified)
Authored Jan 03, 2023 by WangZhen · Committed by GitHub on Jan 03, 2023
[Dy2St]Fix param and out grad names in dy2st for high order grad (#49461)
* Fix param and out grad names in dy2st for high order grad
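For context, the failure mode is the dygraph-to-static (dy2st) higher-order gradient path: once a second backward pass is appended to the program, gradient variables no longer follow the plain `<name>@GRAD` convention but pick up `grad/` prefixes, so the names previously hard-coded around `run_program` stopped matching. The sketch below is illustrative only (it is not part of the commit); it assumes Paddle's `paddle.jit.to_static` and `paddle.grad(..., create_graph=True)` APIs and shows the kind of program that exercises this path.

import paddle

# A to_static layer whose output we differentiate twice.
net = paddle.jit.to_static(paddle.nn.Linear(4, 4))

x = paddle.rand([2, 4])
x.stop_gradient = False
y = paddle.tanh(net(x))

# First-order grad; create_graph=True keeps the graph so a further
# backward program is appended for the higher-order pass.
(dx,) = paddle.grad(y, x, create_graph=True)

# Second-order grad: in the appended backward program the parameter and
# output gradients carry names such as `grad/linear_0.w_0@GRAD`, which is
# what this commit makes run_program resolve correctly.
dx.sum().backward()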
Parent: dc13f7c5
Showing 6 changed files with 108 additions and 30 deletions (+108 −30)
paddle/fluid/eager/to_static/run_program_op_func.h            +0  −6
paddle/fluid/eager/to_static/run_program_op_node.h            +26 −21
paddle/fluid/operators/run_program_op.cc                       +8  −0
python/paddle/fluid/tests/unittests/test_eager_run_program.py  +4  −0
python/paddle/fluid/tests/unittests/test_run_program_op.py    +18  −0
python/paddle/jit/dy2static/partial_program.py                +52  −3
paddle/fluid/eager/to_static/run_program_op_func.h
...
@@ -80,16 +80,10 @@ inline void run_program_ad_func(
       trace_backward, &p_autograd_x, &p_autograd_params);
   if (require_any_grad) {
-    std::vector<std::string> out_names;
-    for (auto& t : deref_out) {
-      out_names.emplace_back(t.name());
-    }
     egr::EagerUtils::PassStopGradient(false, &p_autograd_outs);
     // Create GradOpNode (1 means [out_grad], 2 means [x_grad, paramx_grad])
     auto grad_node = std::make_shared<GradNodeRunProgram>(1, 2);
-    grad_node->SetFwdOutNames(out_names);
     // Set Attributes
     grad_node->SetAttrMap(attrs);
     // Set TensorWrappers
...
paddle/fluid/eager/to_static/run_program_op_node.h
...
@@ -791,13 +791,15 @@ class GradNodeRunProgram : public egr::GradNodeBase {
       }
     }
+    auto out_grad_names = PADDLE_GET_CONST(std::vector<std::string>,
+                                           attrs_.at("out_grad_names"));
     PADDLE_ENFORCE_EQ(
         hooked_grads[0].size(),
-        fwd_out_names_.size(),
+        out_grad_names.size(),
         paddle::platform::errors::InvalidArgument(
             "The hooked_grads[0].size() and "
-            "fwd_out_names_.size() should be equal."));
-    for (size_t i = 0; i < fwd_out_names_.size(); ++i) {
-      hooked_grads[0][i].set_name(fwd_out_names_[i] + "@GRAD");
+            "out_grad_names.size() should be equal."));
+    for (size_t i = 0; i < out_grad_names.size(); ++i) {
+      hooked_grads[0][i].set_name(out_grad_names[i]);
     }
     RunProgramGradAPI(x_,
                       params_,
...
@@ -829,10 +831,6 @@ class GradNodeRunProgram : public egr::GradNodeBase {
     step_scope_ = scopes;
   }
-  void SetFwdOutNames(std::vector<std::string> out_names) {
-    fwd_out_names_ = out_names;
-  }
  protected:
   void ConstructXGradTensors(
       const std::vector<paddle::experimental::Tensor>& x,
...
@@ -850,21 +848,30 @@ class GradNodeRunProgram : public egr::GradNodeBase {
   }
   void ConstructParamGradTensors(
-      const std::vector<paddle::experimental::Tensor>& param,
-      std::vector<paddle::experimental::Tensor>* param_grad) {
-    for (auto& t : param) {
-      auto t_grad = egr::EagerUtils::unsafe_autograd_meta(t)->Grad();
+      const std::vector<paddle::experimental::Tensor>& params,
+      std::vector<paddle::experimental::Tensor>* param_grads) {
+    auto param_grad_names = PADDLE_GET_CONST(std::vector<std::string>,
+                                             attrs_.at("param_grad_names"));
+    PADDLE_ENFORCE_EQ(params.size(),
+                      param_grad_names.size(),
+                      paddle::platform::errors::InvalidArgument(
+                          "The param.size() and "
+                          "param_grad_names.size() should be equal."));
+    for (size_t i = 0; i < params.size(); ++i) {
+      auto& p = params[i];
+      auto& p_grad = egr::EagerUtils::unsafe_autograd_meta(p)->Grad();
       // In eager mode, the number of param_grad should be the same as
       // param, so here an empty Tensor is added for the param with
       // stop_gradient=True
-      if (!t_grad.defined()) {
-        param_grad->emplace_back();
-      } else if (t_grad.is_dense_tensor()) {
-        param_grad->emplace_back(std::make_shared<phi::DenseTensor>());
-      } else if (t_grad.is_selected_rows()) {
-        param_grad->emplace_back(std::make_shared<phi::SelectedRows>());
+      if (!p_grad.defined()) {
+        param_grads->emplace_back();
+      } else if (p_grad.is_dense_tensor()) {
+        param_grads->emplace_back(std::make_shared<phi::DenseTensor>());
+      } else if (p_grad.is_selected_rows()) {
+        param_grads->emplace_back(std::make_shared<phi::SelectedRows>());
       }
-      param_grad->back().set_name(t.name() + "@GRAD");
+      param_grads->back().set_name(param_grad_names[i]);
     }
   }
...
@@ -880,8 +887,6 @@ class GradNodeRunProgram : public egr::GradNodeBase {
   std::vector<paddle::experimental::Tensor> params_;
   std::vector<paddle::framework::Scope*> step_scope_;
-  std::vector<std::string> fwd_out_names_;
   // Attribute Map
   paddle::framework::AttributeMap attrs_;
 };
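To make the C++ side above concrete without restating the diff: `ConstructParamGradTensors` now relies on an index-alignment contract, where `param_grad_names[i]` must name the gradient of `params[i]`. That is why the Python side (see `partial_program.py` below) builds the name list in the same order as `self._params`. A toy sketch of the contract, with hypothetical values that are not taken from the commit:

# Toy illustration of the index-alignment contract (hypothetical names).
params = ['linear_0.w_0', 'linear_0.b_0']        # forward parameter names
param_grad_names = ['grad/linear_0.w_0@GRAD',    # as built by _param_grad_names
                    'linear_0.b_0@GRAD']

for p, grad_name in zip(params, param_grad_names):
    # C++ equivalent: param_grads->back().set_name(param_grad_names[i]);
    print(f'grad tensor of {p} is named {grad_name}')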
paddle/fluid/operators/run_program_op.cc
...
@@ -130,6 +130,14 @@ class RunProgramOpMaker : public framework::OpProtoAndCheckerMaker {
                  "(BlockDesc *)"
                  "The global block of executed backward program desc.")
         .SetDefault(nullptr);
+    AddAttr<std::vector<std::string>>("param_grad_names",
+                                      "std::vector<std::string>"
+                                      "The names of parameter gradients.")
+        .SetDefault({});
+    AddAttr<std::vector<std::string>>("out_grad_names",
+                                      "std::vector<std::string>"
+                                      "The names of output gradients.")
+        .SetDefault({});
     AddComment(R"DOC(
 RunProgram operator.
...
python/paddle/fluid/tests/unittests/test_eager_run_program.py
...
@@ -135,6 +135,10 @@ class TestRunProgram(unittest.TestCase):
             False,
             'program_id',
             _hash_with_id(program),
+            'param_grad_names',
+            ['Fake_var@GRAD'],
+            'out_grad_names',
+            [out.name + '@GRAD'],
         ]

         use_interpretorcore = (
...
python/paddle/fluid/tests/unittests/test_run_program_op.py
...
@@ -262,6 +262,15 @@ class RunProgramOpTest(unittest.TestCase):
                 )
             )
+            self.attrs.extend(
+                (
+                    'param_grad_names',
+                    [p.name + '@GRAD' for p in inputs['Params']],
+                    'out_grad_names',
+                    [out.name + '@GRAD' for out in outputs['Out']],
+                )
+            )

             _legacy_C_ops.run_program(
                 inputs['X'],
                 inputs['Params'],
...
@@ -305,6 +314,15 @@ class RunProgramOpTest(unittest.TestCase):
                 )
             )
+            self.attrs.extend(
+                (
+                    'param_grad_names',
+                    [p.name + '@GRAD' for p in inputs['Params']],
+                    'out_grad_names',
+                    [out.name + '@GRAD' for out in outputs['Out']],
+                )
+            )

             _legacy_C_ops.run_program(
                 inputs['X'],
                 inputs['Params'],
...
python/paddle/jit/dy2static/partial_program.py
...
@@ -253,7 +253,7 @@ class PartialProgramLayer:
     @switch_to_static_graph
     def _create_forward_backward_train_program(self):
-        whole_program = self._create_program()
+        whole_program = self._train_program
         forward_end_op_index = self._infer_program.desc.block(0).op_size()
         return self._get_forward_backward_program_form(
             whole_program, forward_end_op_index
...
@@ -261,7 +261,7 @@ class PartialProgramLayer:
     @switch_to_static_graph
     def _create_forward_backward_train_amp_program(self):
-        whole_program = self._create_amp_program()
+        whole_program = self._train_amp_program
         forward_end_op_index = self._infer_amp_program.desc.block(0).op_size()
         return self._get_forward_backward_program_form(
             whole_program, forward_end_op_index
...
@@ -269,7 +269,7 @@ class PartialProgramLayer:
     @switch_to_static_graph
     def _create_forward_backward_train_pure_fp16_program(self):
-        whole_program = self._create_pure_fp16_program()
+        whole_program = self._train_pure_fp16_program
         forward_end_op_index = self._infer_pure_fp16_program.desc.block(0).op_size()
...
@@ -404,6 +404,43 @@ class PartialProgramLayer:
     def _infer_pure_fp16_program_id(self):
         return _hash_with_id(self._infer_pure_fp16_program, self)

+    @LazyInitialized
+    def _param_grad_names(self):
+        names = []
+        # NOTE: `names` and `self._params` must be in the same order so that
+        # the param grad name can be set correctly in the run_program.
+        for param in self._params:
+            candidate = [
+                var_name
+                for var_name in self.backward_program.block(0).vars.keys()
+                if var_name.endswith(param.name + '@GRAD')
+            ]
+            if candidate:
+                names.append(
+                    max(candidate, key=lambda name: name.count('grad/'))
+                )
+            else:
+                names.append(param.name + '@GRAD')
+        return names
+
+    @LazyInitialized
+    def _out_grad_names(self):
+        names = []
+        fwd_end_op_index = self._get_end_op_index()
+        for i in range(
+            fwd_end_op_index + 1,
+            min(
+                fwd_end_op_index + 2 * len(self._outputs.var_ids),
+                len(self.program.block(0).ops),
+            ),
+            2,
+        ):
+            op = self.program.block(0).ops[i]
+            if op.type == 'fill_constant':
+                var_name = op.output('Out')[0]
+                names.append(var_name)
+        return names
+
     @property
     def whole_program_id(self):
         if self.training:
...
@@ -610,6 +647,18 @@ class PartialProgramLayer:
             'program_id',
             self.program_id,
         ]
+        if self.training:
+            # NOTE: In the case of higher-order gradient, the names of the parameter grads may be like
+            # `grad/grad/grad/linear_0.w_0@GRAD` instead of simply `linear_0.w_0@GRAD`, so we get
+            # the correct names of the parameter grads from program. And out grads are similar to above.
+            attrs.extend(
+                (
+                    'param_grad_names',
+                    self._param_grad_names,
+                    'out_grad_names',
+                    self._out_grad_names,
+                )
+            )
         if self._cuda_graph_capture_mode:
             attrs.extend(
                 (
...
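The `_param_grad_names` selection rule above can be read in isolation: among all variables in the backward program that end with `<param>@GRAD`, it keeps the candidate with the most `grad/` prefixes, i.e. the gradient produced by the deepest (highest-order) backward pass, and falls back to the plain `<param>@GRAD` name otherwise. A standalone sketch of that rule, with hypothetical variable names not taken from the commit:

# Standalone sketch of the candidate-selection rule (hypothetical names).
backward_vars = [
    'linear_0.w_0@GRAD',
    'grad/linear_0.w_0@GRAD',
    'grad/grad/linear_0.w_0@GRAD',
    'linear_0.b_0@GRAD',
]

def pick_param_grad_name(param_name, var_names):
    candidate = [v for v in var_names if v.endswith(param_name + '@GRAD')]
    if candidate:
        # Deepest nesting wins, mirroring _param_grad_names above.
        return max(candidate, key=lambda name: name.count('grad/'))
    return param_name + '@GRAD'

print(pick_param_grad_name('linear_0.w_0', backward_vars))  # grad/grad/linear_0.w_0@GRAD
print(pick_param_grad_name('linear_0.b_0', backward_vars))  # linear_0.b_0@GRAD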