Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Crayon鑫
Paddle
提交
09482dde
P
Paddle
项目概览
Crayon鑫
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
09482dde
编写于
3月 18, 2021
作者:
C
Chengmo
提交者:
GitHub
3月 18, 2021
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
【Paddle.Fleet】Fix one ps gradient clip (#31664)
* fix one ps gradient clip
上级
740359ed
变更
5
显示空白变更内容
内联
并排
Showing
5 changed files
with
62 additions
and
47 deletions
+62
-47
python/paddle/distributed/fleet/runtime/the_one_ps.py
python/paddle/distributed/fleet/runtime/the_one_ps.py
+2
-1
python/paddle/fluid/incubate/fleet/parameter_server/ir/public.py
...paddle/fluid/incubate/fleet/parameter_server/ir/public.py
+1
-1
python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py
.../fluid/incubate/fleet/parameter_server/ir/trainer_pass.py
+1
-1
python/paddle/fluid/tests/unittests/test_dist_fleet_base.py
python/paddle/fluid/tests/unittests/test_dist_fleet_base.py
+7
-8
python/paddle/fluid/tests/unittests/test_dist_fleet_grad_clip.py
...paddle/fluid/tests/unittests/test_dist_fleet_grad_clip.py
+51
-36
未找到文件。
python/paddle/distributed/fleet/runtime/the_one_ps.py
浏览文件 @
09482dde
...
@@ -150,7 +150,8 @@ class CommonAccessor:
...
@@ -150,7 +150,8 @@ class CommonAccessor:
oop
=
None
oop
=
None
for
op
in
optimizer_ops
:
for
op
in
optimizer_ops
:
if
op
.
input
(
"Param"
)[
0
]
==
param_name
:
if
(
"Param"
in
op
.
input_names
)
and
(
op
.
input
(
"Param"
)[
0
]
==
param_name
):
oop
=
op
oop
=
op
break
break
...
...
python/paddle/fluid/incubate/fleet/parameter_server/ir/public.py
浏览文件 @
09482dde
...
@@ -31,7 +31,7 @@ from paddle.fluid.incubate.fleet.parameter_server.ir.ps_dispatcher import RoundR
...
@@ -31,7 +31,7 @@ from paddle.fluid.incubate.fleet.parameter_server.ir.ps_dispatcher import RoundR
from
paddle.fluid.transpiler.details.program_utils
import
delete_ops
from
paddle.fluid.transpiler.details.program_utils
import
delete_ops
OP_NAME_SCOPE
=
"op_namescope"
OP_NAME_SCOPE
=
"op_namescope"
CLIP_OP_NAME_SCOPE
=
"
@CLIP
"
CLIP_OP_NAME_SCOPE
=
"
gradient_clip
"
STEP_COUNTER
=
"@PS_STEP_COUNTER@"
STEP_COUNTER
=
"@PS_STEP_COUNTER@"
LEARNING_RATE_DECAY_COUNTER
=
"@LR_DECAY_COUNTER@"
LEARNING_RATE_DECAY_COUNTER
=
"@LR_DECAY_COUNTER@"
...
...
python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py
浏览文件 @
09482dde
...
@@ -32,7 +32,7 @@ from paddle.fluid.incubate.fleet.parameter_server.ir.public import get_sparse_ta
...
@@ -32,7 +32,7 @@ from paddle.fluid.incubate.fleet.parameter_server.ir.public import get_sparse_ta
from
paddle.fluid.incubate.fleet.parameter_server.mode
import
DistributedMode
from
paddle.fluid.incubate.fleet.parameter_server.mode
import
DistributedMode
OP_NAME_SCOPE
=
"op_namescope"
OP_NAME_SCOPE
=
"op_namescope"
CLIP_OP_NAME_SCOPE
=
"
@CLIP
"
CLIP_OP_NAME_SCOPE
=
"
gradient_clip
"
STEP_COUNTER
=
"@PS_STEP_COUNTER@"
STEP_COUNTER
=
"@PS_STEP_COUNTER@"
OP_ROLE_VAR_ATTR_NAME
=
core
.
op_proto_and_checker_maker
.
kOpRoleVarAttrName
()
OP_ROLE_VAR_ATTR_NAME
=
core
.
op_proto_and_checker_maker
.
kOpRoleVarAttrName
()
RPC_OP_ROLE_ATTR_NAME
=
core
.
op_proto_and_checker_maker
.
kOpRoleAttrName
()
RPC_OP_ROLE_ATTR_NAME
=
core
.
op_proto_and_checker_maker
.
kOpRoleAttrName
()
...
...
python/paddle/fluid/tests/unittests/test_dist_fleet_base.py
浏览文件 @
09482dde
...
@@ -18,6 +18,7 @@ from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distribu
...
@@ -18,6 +18,7 @@ from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distribu
import
paddle.distributed.fleet
as
fleet
import
paddle.distributed.fleet
as
fleet
import
paddle.distributed.fleet.base.role_maker
as
role_maker
import
paddle.distributed.fleet.base.role_maker
as
role_maker
import
paddle.fluid
as
fluid
import
paddle.fluid
as
fluid
import
paddle
"""
"""
high level unit test for distribute fleet.
high level unit test for distribute fleet.
"""
"""
...
@@ -112,23 +113,21 @@ class FleetDistRunnerBase(object):
...
@@ -112,23 +113,21 @@ class FleetDistRunnerBase(object):
def
build_optimizer
(
self
,
avg_cost
,
strategy
):
def
build_optimizer
(
self
,
avg_cost
,
strategy
):
use_grad_clip
=
int
(
os
.
getenv
(
'GRAD_CLIP'
,
0
))
use_grad_clip
=
int
(
os
.
getenv
(
'GRAD_CLIP'
,
0
))
grad_clip
=
None
if
use_grad_clip
:
if
use_grad_clip
:
# 1: clip_by_value; 2: clip_by_norm; 3:clip_by_global_norm
# 1: clip_by_value; 2: clip_by_norm; 3:clip_by_global_norm
if
use_grad_clip
==
1
:
if
use_grad_clip
==
1
:
fluid
.
clip
.
set_gradient_clip
(
grad_clip
=
paddle
.
nn
.
ClipGradByValue
(
min
=-
5.0
,
max
=
5.0
)
clip
=
fluid
.
clip
.
GradientClipByValue
(
2.0
))
elif
use_grad_clip
==
2
:
elif
use_grad_clip
==
2
:
fluid
.
clip
.
set_gradient_clip
(
grad_clip
=
paddle
.
nn
.
ClipGradByNorm
(
2.0
)
clip
=
fluid
.
clip
.
GradientClipByNorm
(
2.0
))
elif
use_grad_clip
==
3
:
elif
use_grad_clip
==
3
:
fluid
.
clip
.
set_gradient_clip
(
grad_clip
=
paddle
.
nn
.
ClipGradByGlobalNorm
(
2.0
)
clip
=
fluid
.
clip
.
GradientClipByGlobalNorm
(
2.0
))
use_decay
=
int
(
os
.
getenv
(
"USE_DECAY"
,
"0"
))
use_decay
=
int
(
os
.
getenv
(
"USE_DECAY"
,
"0"
))
if
use_decay
:
if
use_decay
:
scheduler
=
paddle
.
optimizer
.
lr
.
ExponentialDecay
(
scheduler
=
paddle
.
optimizer
.
lr
.
ExponentialDecay
(
learning_rate
=
LEARNING_RATE
,
gamma
=
0.999
,
verbose
=
True
)
learning_rate
=
LEARNING_RATE
,
gamma
=
0.999
,
verbose
=
True
)
optimizer
=
fluid
.
optimizer
.
SGD
(
scheduler
)
optimizer
=
fluid
.
optimizer
.
SGD
(
scheduler
,
grad_clip
=
grad_clip
)
"""
"""
# learning rate decay method before 2.0
# learning rate decay method before 2.0
optimizer = fluid.optimizer.SGD(
optimizer = fluid.optimizer.SGD(
...
@@ -139,7 +138,7 @@ class FleetDistRunnerBase(object):
...
@@ -139,7 +138,7 @@ class FleetDistRunnerBase(object):
staircase=True))
staircase=True))
"""
"""
else
:
else
:
optimizer
=
fluid
.
optimizer
.
SGD
(
LEARNING_RATE
)
optimizer
=
fluid
.
optimizer
.
SGD
(
LEARNING_RATE
,
grad_clip
=
grad_clip
)
optimizer
=
fleet
.
distributed_optimizer
(
optimizer
,
strategy
=
strategy
)
optimizer
=
fleet
.
distributed_optimizer
(
optimizer
,
strategy
=
strategy
)
optimizer
.
minimize
(
avg_cost
)
optimizer
.
minimize
(
avg_cost
)
...
...
python/paddle/fluid/tests/unittests/test_dist_fleet_grad_clip.py
浏览文件 @
09482dde
...
@@ -16,53 +16,66 @@ from __future__ import print_function
...
@@ -16,53 +16,66 @@ from __future__ import print_function
import
os
import
os
import
unittest
import
unittest
import
paddle.fluid
as
fluid
import
paddle.fluid.incubate.fleet.base.role_maker
as
role_maker
from
paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler
import
fleet
from
paddle.fluid.transpiler.distribute_transpiler
import
DistributeTranspilerConfig
from
test_dist_fleet_base
import
TestFleetBase
from
test_dist_fleet_base
import
TestFleetBase
from
dist_fleet_simnet_bow
import
train_network
@
unittest
.
skip
(
reason
=
"Skip unstable ut, add it after PR 22957 merged"
)
class
TestDistGeoClipByGlobalNorm
(
TestFleetBase
):
class
TestDistGeoClipByGlobalNormTranspiler
(
unittest
.
TestCase
):
def
_setup_config
(
self
):
def
test_pserver
(
self
):
self
.
_mode
=
"geo"
role
=
role_maker
.
UserDefinedRoleMaker
(
self
.
_reader
=
"dataset"
current_id
=
0
,
self
.
_geo_sgd_need_push_nums
=
5
role
=
role_maker
.
Role
.
SERVER
,
self
.
_grad_clip_mode
=
3
worker_num
=
2
,
server_endpoints
=
[
"127.0.0.1:36011"
,
"127.0.0.1:36012"
])
fleet
.
init
(
role
)
def
check_with_place
(
self
,
model_file
,
delta
=
1e-3
,
check_error_log
=
False
,
need_envs
=
{}):
required_envs
=
{
"PATH"
:
os
.
getenv
(
"PATH"
,
""
),
"PYTHONPATH"
:
os
.
getenv
(
"PYTHONPATH"
,
""
),
"LD_LIBRARY_PATH"
:
os
.
getenv
(
"LD_LIBRARY_PATH"
,
""
),
"FLAGS_rpc_deadline"
:
"5000"
,
# 5sec to fail fast
"http_proxy"
:
""
}
required_envs
.
update
(
need_envs
)
batch_size
=
128
tr0_losses
,
tr1_losses
=
self
.
_run_cluster
(
model_file
,
required_envs
)
is_sparse
=
True
is_distribute
=
False
strategy
=
DistributeTranspilerConfig
()
def
test_dist_train
(
self
):
strategy
.
sync_mode
=
False
self
.
check_with_place
(
strategy
.
geo_sgd_mode
=
True
"dist_fleet_ctr.py"
,
delta
=
1e-5
,
check_error_log
=
True
)
strategy
.
geo_sgd_need_push_nums
=
5
avg_cost
,
_
,
_
,
_
=
train_network
(
batch_size
,
is_distribute
,
is_sparse
)
def
_setup_config
(
self
):
fluid
.
clip
.
set_gradient_clip
(
self
.
_sync_mode
=
False
clip
=
fluid
.
clip
.
GradientClipByGlobalNorm
(
2.0
))
self
.
_grad_clip_mode
=
2
optimizer
=
fluid
.
optimizer
.
SGD
(
0.1
)
def
check_with_place
(
self
,
optimizer
=
fleet
.
distributed_optimizer
(
optimizer
,
strategy
)
model_file
,
optimizer
.
minimize
(
avg_cost
)
delta
=
1e-3
,
check_error_log
=
False
,
need_envs
=
{}):
required_envs
=
{
"PATH"
:
os
.
getenv
(
"PATH"
,
""
),
"PYTHONPATH"
:
os
.
getenv
(
"PYTHONPATH"
,
""
),
"LD_LIBRARY_PATH"
:
os
.
getenv
(
"LD_LIBRARY_PATH"
,
""
),
"FLAGS_rpc_deadline"
:
"5000"
,
# 5sec to fail fast
"http_proxy"
:
""
}
required_envs
.
update
(
need_envs
)
tr0_losses
,
tr1_losses
=
self
.
_run_cluster
(
model_file
,
required_envs
)
pserver_startup_program
=
fleet
.
startup_program
def
test_dist_train
(
self
):
pserver_mian_program
=
fleet
.
main_program
self
.
check_with_place
(
"dist_fleet_ctr.py"
,
delta
=
1e-5
,
check_error_log
=
True
)
@
unittest
.
skip
(
reason
=
"Skip unstable ut, add it after PR 22957 merged"
)
class
TestDistASyncClipByValue
(
TestFleetBase
):
class
TestDistGeoClipByGlobalNorm
(
TestFleetBase
):
def
_setup_config
(
self
):
def
_setup_config
(
self
):
self
.
_mode
=
"
geo
"
self
.
_mode
=
"
async
"
self
.
_reader
=
"dataset"
self
.
_reader
=
"dataset"
self
.
_geo_sgd_need_push_nums
=
5
self
.
_grad_clip_mode
=
1
self
.
_grad_clip_mode
=
3
def
check_with_place
(
self
,
def
check_with_place
(
self
,
model_file
,
model_file
,
...
@@ -84,8 +97,11 @@ class TestDistGeoClipByGlobalNorm(TestFleetBase):
...
@@ -84,8 +97,11 @@ class TestDistGeoClipByGlobalNorm(TestFleetBase):
self
.
check_with_place
(
self
.
check_with_place
(
"dist_fleet_ctr.py"
,
delta
=
1e-5
,
check_error_log
=
True
)
"dist_fleet_ctr.py"
,
delta
=
1e-5
,
check_error_log
=
True
)
class
TestDistASyncClipByNorm
(
TestFleetBase
):
def
_setup_config
(
self
):
def
_setup_config
(
self
):
self
.
_sync_mode
=
False
self
.
_mode
=
"async"
self
.
_reader
=
"dataset"
self
.
_grad_clip_mode
=
2
self
.
_grad_clip_mode
=
2
def
check_with_place
(
self
,
def
check_with_place
(
self
,
...
@@ -109,7 +125,6 @@ class TestDistGeoClipByGlobalNorm(TestFleetBase):
...
@@ -109,7 +125,6 @@ class TestDistGeoClipByGlobalNorm(TestFleetBase):
"dist_fleet_ctr.py"
,
delta
=
1e-5
,
check_error_log
=
True
)
"dist_fleet_ctr.py"
,
delta
=
1e-5
,
check_error_log
=
True
)
@
unittest
.
skip
(
reason
=
"Skip unstable ut, add it after PR 22957 merged"
)
class
TestDistASyncClipByGlobalNorm
(
TestFleetBase
):
class
TestDistASyncClipByGlobalNorm
(
TestFleetBase
):
def
_setup_config
(
self
):
def
_setup_config
(
self
):
self
.
_mode
=
"async"
self
.
_mode
=
"async"
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录