Commit f95ee9c0 (unverified)
Authored by Wu Yi on Dec 12, 2018; committed by GitHub on Dec 12, 2018
fix nccl dist test acc (#14867)
* fix nccl dist test acc test=develop
* fix test=develop
Parent: 66b6e473
Showing 3 changed files with 15 additions and 10 deletions (+15 -10):

python/paddle/fluid/tests/unittests/dist_mnist.py  (+1 -1)
python/paddle/fluid/tests/unittests/test_dist_base.py  (+13 -8)
python/paddle/fluid/tests/unittests/test_dist_mnist.py  (+1 -1)
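Taken together, the diffs below thread a learning-rate flag through the distributed test harness: runtime_main gains a --lr argument (default 0.001), run_pserver/run_trainer record it as self.lr before building the model, the launcher commands forward self._lr (halved for the two nccl2 trainers), and dist_mnist.py builds its Momentum optimizer from self.lr instead of a hard-coded 0.001. A minimal, self-contained sketch of that flow (class and flag names follow the diff; the optimizer construction is reduced to a print for brevity):

    import argparse

    class TestDistRunnerBase(object):
        def get_model(self, batch_size=2, lr=0.1):
            raise NotImplementedError("get_model should be implemented by child classes.")

        def run_trainer(self, args):
            self.lr = args.lr  # new: keep the CLI learning rate for get_model to use
            self.get_model(batch_size=args.batch_size)

    class TestDistMnist2x2(TestDistRunnerBase):
        def get_model(self, batch_size=2, lr=0.1):
            # stands in for dist_mnist.py, which now reads self.lr instead of 0.001
            print("Momentum(learning_rate=%f, momentum=0.9)" % self.lr)

    parser = argparse.ArgumentParser()
    parser.add_argument('--lr', required=False, type=float, default=0.001)
    parser.add_argument('--batch_size', required=False, type=int, default=2)
    args = parser.parse_args(['--lr', '0.0005'])
    TestDistMnist2x2().run_trainer(args)  # -> Momentum(learning_rate=0.000500, momentum=0.9)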
python/paddle/fluid/tests/unittests/dist_mnist.py
@@ -93,7 +93,7 @@ class TestDistMnist2x2(TestDistRunnerBase):
         # TODO(typhoonzero): fix distributed adam optimizer
         # opt = fluid.optimizer.AdamOptimizer(
         #     learning_rate=0.001, beta1=0.9, beta2=0.999)
-        opt = fluid.optimizer.Momentum(learning_rate=0.001, momentum=0.9)
+        opt = fluid.optimizer.Momentum(learning_rate=self.lr, momentum=0.9)

         # Reader
         train_reader = paddle.batch(
python/paddle/fluid/tests/unittests/test_dist_base.py
@@ -32,7 +32,7 @@ DEFAULT_BATCH_SIZE = 2

 class TestDistRunnerBase(object):
-    def get_model(self, batch_size=DEFAULT_BATCH_SIZE):
+    def get_model(self, batch_size=DEFAULT_BATCH_SIZE, lr=0.1):
         raise NotImplementedError(
             "get_model should be implemented by child classes.")

@@ -56,6 +56,7 @@ class TestDistRunnerBase(object):
         return t

     def run_pserver(self, args):
+        self.lr = args.lr
         self.get_model(batch_size=args.batch_size)
         # NOTE: pserver should not call memory optimize
         t = self.get_transpiler(args.trainer_id,

@@ -71,6 +72,7 @@ class TestDistRunnerBase(object):
         exe.run(pserver_prog)

     def run_trainer(self, args):
+        self.lr = args.lr
         test_program, avg_cost, train_reader, test_reader, batch_acc, predict = \
             self.get_model(batch_size=args.batch_size)

@@ -189,6 +191,7 @@ def runtime_main(test_class):
     parser.add_argument('--use_reader_alloc', action='store_true', required=False)
     parser.add_argument('--batch_size', required=False, type=int, default=2)
+    parser.add_argument('--lr', required=False, type=float, default=0.001)
     parser.add_argument('--batch_merge_repeat', required=False, type=int, default=1)

@@ -234,6 +237,7 @@ class TestDistBase(unittest.TestCase):
         self._dc_asgd = False  # must use with async mode
         self._use_reader_alloc = True
         self._nccl2_mode = False
+        self._lr = 0.001
         self._setup_config()
         self._after_setup_config()

@@ -284,7 +288,8 @@ class TestDistBase(unittest.TestCase):
                    batch_size=DEFAULT_BATCH_SIZE,
                    batch_merge_repeat=1):

-        cmd = "%s %s --role trainer" % (self._python_interp, model)
+        cmd = "%s %s --role trainer --lr %f" % (self._python_interp, model,
+                                                self._lr)
         if batch_size != DEFAULT_BATCH_SIZE:
             cmd += " --batch_size %d" % batch_size
         if batch_merge_repeat > 1:

@@ -330,13 +335,13 @@ class TestDistBase(unittest.TestCase):
         ps0_ep, ps1_ep = self._ps_endpoints.split(",")

-        tr_cmd = "%s %s --role trainer --endpoints %s --trainer_id %d --current_endpoint %s --trainers %d --update_method pserver"
+        tr_cmd = "%s %s --role trainer --endpoints %s --trainer_id %d --current_endpoint %s --trainers %d --update_method pserver --lr %f"
         tr0_cmd = tr_cmd % \
             (self._python_interp, model, self._ps_endpoints,
-             0, ps0_ep, self._trainers)
+             0, ps0_ep, self._trainers, self._lr)
         tr1_cmd = tr_cmd % \
             (self._python_interp, model, self._ps_endpoints,
-             1, ps1_ep, self._trainers)
+             1, ps1_ep, self._trainers, self._lr)

         if self._sync_mode:
             tr0_cmd += " --sync_mode"

@@ -425,13 +430,13 @@ class TestDistBase(unittest.TestCase):
         worker_endpoints = self._ps_endpoints.split(",")
         w0_ep, w1_ep = worker_endpoints

-        tr_cmd = "%s %s --role trainer --endpoints %s --trainer_id %d --current_endpoint %s --update_method nccl2"
+        tr_cmd = "%s %s --role trainer --endpoints %s --trainer_id %d --current_endpoint %s --update_method nccl2 --lr %f"
         tr0_cmd = tr_cmd % \
             (self._python_interp, model, self._ps_endpoints,
-             0, w0_ep)
+             0, w0_ep, self._lr / 2)
         tr1_cmd = tr_cmd % \
             (self._python_interp, model, self._ps_endpoints,
-             1, w1_ep)
+             1, w1_ep, self._lr / 2)

         if self._mem_opt:
             tr0_cmd += " --mem_opt"
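In the nccl2 launch path above, each of the two trainers is started with half of self._lr on its command line. A small standalone illustration of the resulting command string (the interpreter, model script, and endpoints below are placeholder values, not read from the harness):

    # placeholder values for illustration only
    python_interp = "python"
    model = "dist_mnist.py"
    endpoints = "127.0.0.1:6170,127.0.0.1:6171"
    lr = 0.001

    tr_cmd = ("%s %s --role trainer --endpoints %s --trainer_id %d "
              "--current_endpoint %s --update_method nccl2 --lr %f")
    w0_ep, w1_ep = endpoints.split(",")
    tr0_cmd = tr_cmd % (python_interp, model, endpoints, 0, w0_ep, lr / 2)
    print(tr0_cmd)
    # python dist_mnist.py --role trainer --endpoints 127.0.0.1:6170,127.0.0.1:6171
    #   --trainer_id 0 --current_endpoint 127.0.0.1:6170 --update_method nccl2 --lr 0.000500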
python/paddle/fluid/tests/unittests/test_dist_mnist.py
@@ -36,7 +36,7 @@ class TestDistMnistNCCL2(TestDistBase):
     def test_dist_train(self):
         import paddle.fluid as fluid
         if fluid.core.is_compiled_with_cuda():
-            self.check_with_place("dist_mnist.py", delta=1)
+            self.check_with_place("dist_mnist.py", delta=1e-5)


 class TestDistMnist2x2Lars(TestDistBase):
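With the learning rate now under the harness's control, the NCCL2 MNIST test tightens its tolerance from delta=1 to delta=1e-5. As a generic reminder of what such a bound means, a standalone unittest sketch (not the harness's own comparison code):

    import unittest

    class DeltaDemo(unittest.TestCase):
        def test_delta(self):
            local_loss, dist_loss = 2.302585, 2.302590  # differ by 5e-6
            self.assertAlmostEqual(local_loss, dist_loss, delta=1)     # old, very loose bound
            self.assertAlmostEqual(local_loss, dist_loss, delta=1e-5)  # new, strict bound

    if __name__ == "__main__":
        unittest.main()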