Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
wmsofts
Paddle
提交
3346d681
P
Paddle
项目概览
wmsofts
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
未验证
提交
3346d681
编写于
3月 16, 2023
作者:
K
kangguangli
提交者:
GitHub
3月 16, 2023
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
remove parallelExecutor related unit tests about DistributedTraining (#51698)
上级
2440c980
变更
7
展开全部
显示空白变更内容
内联
并排
Showing
7 changed file
with
0 addition
and
2809 deletion
+0
-2809
python/paddle/fluid/tests/unittests/CMakeLists.txt
python/paddle/fluid/tests/unittests/CMakeLists.txt
+0
-3
python/paddle/fluid/tests/unittests/dist_save_load.py
python/paddle/fluid/tests/unittests/dist_save_load.py
+0
-222
python/paddle/fluid/tests/unittests/dist_transformer.py
python/paddle/fluid/tests/unittests/dist_transformer.py
+0
-2030
python/paddle/fluid/tests/unittests/test_dist_save_load.py
python/paddle/fluid/tests/unittests/test_dist_save_load.py
+0
-173
python/paddle/fluid/tests/unittests/test_dist_transformer.py
python/paddle/fluid/tests/unittests/test_dist_transformer.py
+0
-89
python/paddle/fluid/tests/unittests/test_fleet_api_input.py
python/paddle/fluid/tests/unittests/test_fleet_api_input.py
+0
-291
python/paddle/fluid/tests/unittests/test_fleet_base_2.py
python/paddle/fluid/tests/unittests/test_fleet_base_2.py
+0
-1
未找到文件。
python/paddle/fluid/tests/unittests/CMakeLists.txt
浏览文件 @
3346d681
...
...
@@ -43,7 +43,6 @@ list(APPEND MIXED_DIST_TEST_OPS test_fleet_launch_cloud)
list
(
APPEND MIXED_DIST_TEST_OPS test_fleet_launch_ascend
)
list
(
APPEND MIXED_DIST_TEST_OPS test_ascend_group
)
list
(
APPEND MIXED_DIST_TEST_OPS test_fleet_launch_nproc
)
list
(
APPEND MIXED_DIST_TEST_OPS test_fleet_api_input
)
list
(
APPEND MIXED_DIST_TEST_OPS test_fleet_base
)
list
(
APPEND MIXED_DIST_TEST_OPS test_fleet_base_2
)
...
...
@@ -602,7 +601,6 @@ if(WITH_DISTRIBUTE)
add_subdirectory
(
collective
)
# FIXME(typhoonzero): add these tests back
list
(
REMOVE_ITEM DIST_TEST_OPS
"test_dist_transformer"
)
list
(
REMOVE_ITEM DIST_TEST_OPS
"test_dist_transpiler"
)
# TODO(sandyhouse): fix and add the ut back
...
...
@@ -615,7 +613,6 @@ if(WITH_DISTRIBUTE)
list
(
REMOVE_ITEM DIST_TEST_OPS
"test_dist_ctr"
)
list
(
REMOVE_ITEM DIST_TEST_OPS
"test_dist_mnist_lars"
)
list
(
REMOVE_ITEM DIST_TEST_OPS
"test_dist_mnist_train"
)
list
(
REMOVE_ITEM DIST_TEST_OPS
"test_dist_save_load"
)
list
(
REMOVE_ITEM DIST_TEST_OPS
"test_dist_text_classification"
)
list
(
REMOVE_ITEM DIST_TEST_OPS
"test_dist_train"
)
list
(
REMOVE_ITEM DIST_TEST_OPS
"test_dist_word2vec"
)
...
...
python/paddle/fluid/tests/unittests/dist_save_load.py
已删除
100644 → 0
浏览文件 @
2440c980
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
os
import
pickle
import
sys
import
numpy
as
np
from
dist_simnet_bow
import
DATA_MD5
,
DATA_URL
,
TestDistSimnetBow2x2
from
test_dist_base
import
RUN_STEP
,
runtime_main
import
paddle
import
paddle.fluid
as
fluid
from
paddle.fluid
import
core
class
TestDistSaveLoad2x2
(
TestDistSimnetBow2x2
):
def
_load_persistable_vars
(
self
,
executor
,
dirname
,
program
):
def
_is_checkpoint_var
(
var
):
"""
the checkpoint will not save or load all the variables.
var type is FEED_MINIBATCH/FETCH_LIST/RAW or var name ends with @GRAD are discarded.
: param var(Variable)
"""
if
(
var
.
desc
.
type
()
==
core
.
VarDesc
.
VarType
.
FEED_MINIBATCH
or
var
.
desc
.
type
()
==
core
.
VarDesc
.
VarType
.
FETCH_LIST
or
var
.
desc
.
type
()
==
core
.
VarDesc
.
VarType
.
RAW
):
return
False
# @GRAD are named for gradient variables, checkpoint will not save it.
if
"@GRAD"
in
var
.
name
:
return
False
# .trainer_ are named for distribute train variables, checkpoint will not save it.
if
".trainer_"
in
var
.
name
:
return
False
# .block is named for distribute train variables, checkpoint will not save it.
if
".block"
in
var
.
name
:
return
False
if
"tmp_"
in
var
.
name
:
return
False
return
var
.
persistable
paddle
.
static
.
io
.
load_vars
(
executor
,
dirname
=
dirname
,
main_program
=
program
,
predicate
=
_is_checkpoint_var
,
filename
=
None
,
)
def
run_pserver
(
self
,
args
):
self
.
get_model
(
batch_size
=
2
)
# NOTE: pserver should not call memory optimize
t
=
self
.
get_transpiler
(
args
.
trainer_id
,
fluid
.
default_main_program
(),
args
.
endpoints
,
args
.
trainers
,
args
.
sync_mode
,
False
,
args
.
current_endpoint
,
)
pserver_prog
=
t
.
get_pserver_program
(
args
.
current_endpoint
)
startup_prog
=
t
.
get_startup_program
(
args
.
current_endpoint
,
pserver_prog
)
need_load
=
bool
(
int
(
os
.
getenv
(
"LOAD"
,
"0"
)))
model_dir
=
os
.
getenv
(
"MODEL_DIR"
,
""
)
place
=
fluid
.
CPUPlace
()
exe
=
fluid
.
Executor
(
place
)
exe
.
run
(
startup_prog
)
if
need_load
and
model_dir
:
paddle
.
distributed
.
io
.
load_persistables
(
exe
,
model_dir
,
pserver_prog
)
exe
.
run
(
pserver_prog
)
def
run_trainer
(
self
,
args
):
(
test_program
,
avg_cost
,
train_reader
,
test_reader
,
batch_acc
,
predict
,
)
=
self
.
get_model
(
batch_size
=
2
)
if
args
.
update_method
==
"pserver"
:
t
=
self
.
get_transpiler
(
args
.
trainer_id
,
fluid
.
default_main_program
(),
args
.
endpoints
,
args
.
trainers
,
args
.
sync_mode
,
)
trainer_prog
=
t
.
get_trainer_program
()
else
:
trainer_prog
=
fluid
.
default_main_program
()
if
args
.
use_cuda
:
place
=
fluid
.
CUDAPlace
(
0
)
else
:
place
=
fluid
.
CPUPlace
()
startup_exe
=
fluid
.
Executor
(
place
)
startup_exe
.
run
(
fluid
.
default_startup_program
())
strategy
=
fluid
.
ExecutionStrategy
()
strategy
.
num_threads
=
1
build_stra
=
fluid
.
BuildStrategy
()
if
args
.
use_reduce
:
build_stra
.
reduce_strategy
=
(
fluid
.
BuildStrategy
.
ReduceStrategy
.
Reduce
)
else
:
build_stra
.
reduce_strategy
=
(
fluid
.
BuildStrategy
.
ReduceStrategy
.
AllReduce
)
exe
=
fluid
.
ParallelExecutor
(
args
.
use_cuda
,
loss_name
=
avg_cost
.
name
,
exec_strategy
=
strategy
,
build_strategy
=
build_stra
,
)
feed_var_list
=
[
var
for
var
in
trainer_prog
.
global_block
().
vars
.
values
()
if
var
.
is_data
]
feeder
=
fluid
.
DataFeeder
(
feed_var_list
,
place
)
reader_generator
=
train_reader
()
def
get_data
():
origin_batch
=
next
(
reader_generator
)
if
args
.
update_method
==
"pserver"
and
args
.
use_reader_alloc
:
new_batch
=
[]
for
offset
,
item
in
enumerate
(
origin_batch
):
if
offset
%
2
==
args
.
trainer_id
:
new_batch
.
append
(
item
)
return
new_batch
else
:
return
origin_batch
need_save
=
bool
(
int
(
os
.
getenv
(
"SAVE"
,
"0"
)))
model_dir
=
os
.
getenv
(
"MODEL_DIR"
,
""
)
save_mode
=
os
.
getenv
(
"SAVE_MODE"
,
""
)
if
save_mode
==
"LOCAL"
:
if
need_save
:
for
_
in
range
(
RUN_STEP
):
(
loss
,)
=
exe
.
run
(
fetch_list
=
[
avg_cost
.
name
],
feed
=
feeder
.
feed
(
get_data
())
)
if
need_save
and
model_dir
:
paddle
.
distributed
.
io
.
save_persistables
(
startup_exe
,
model_dir
,
trainer_prog
)
var
=
np
.
array
(
fluid
.
global_scope
().
find_var
(
'__fc_b__'
).
get_tensor
()
)
sys
.
stdout
.
buffer
.
write
(
pickle
.
dumps
(
np
.
ravel
(
var
).
tolist
()))
elif
save_mode
==
"DIST"
:
skip_steps
=
int
(
os
.
getenv
(
"SKIP_STEPS"
))
loss
=
None
if
need_save
:
for
idx
in
range
(
8
):
(
loss
,)
=
exe
.
run
(
fetch_list
=
[
avg_cost
.
name
],
feed
=
feeder
.
feed
(
get_data
())
)
if
(
need_save
and
model_dir
and
idx
==
skip_steps
and
args
.
trainer_id
==
0
):
paddle
.
distributed
.
io
.
save_persistables
(
startup_exe
,
model_dir
,
trainer_prog
)
else
:
for
idx
in
range
(
8
):
data
=
get_data
()
if
idx
<=
skip_steps
:
continue
(
loss
,)
=
exe
.
run
(
fetch_list
=
[
avg_cost
.
name
],
feed
=
feeder
.
feed
(
data
)
)
sys
.
stdout
.
buffer
.
write
(
pickle
.
dumps
(
loss
.
tolist
()))
else
:
raise
Exception
(
"save_mode must be LOCAL or DIST"
)
if
__name__
==
"__main__"
:
paddle
.
dataset
.
common
.
download
(
DATA_URL
,
'simnet'
,
DATA_MD5
,
"train"
)
runtime_main
(
TestDistSaveLoad2x2
)
python/paddle/fluid/tests/unittests/dist_transformer.py
已删除
100644 → 0
浏览文件 @
2440c980
此差异已折叠。
点击以展开。
python/paddle/fluid/tests/unittests/test_dist_save_load.py
已删除
100644 → 0
浏览文件 @
2440c980
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
os
import
shutil
import
tempfile
import
unittest
import
numpy
as
np
from
test_dist_base
import
TestDistBase
flag_name
=
os
.
path
.
splitext
(
__file__
)[
0
]
class
TestDistSaveLoadDense2x2
(
TestDistBase
):
def
_setup_config
(
self
):
self
.
_sync_mode
=
True
self
.
_enforce_place
=
"CPU"
def
check_with_place
(
self
,
model_file
,
delta
=
1e-3
,
check_error_log
=
False
,
need_envs
=
{},
log_name
=
""
,
):
required_envs
=
{
"PATH"
:
os
.
getenv
(
"PATH"
,
""
),
"PYTHONPATH"
:
os
.
getenv
(
"PYTHONPATH"
,
""
),
"LD_LIBRARY_PATH"
:
os
.
getenv
(
"LD_LIBRARY_PATH"
,
""
),
"http_proxy"
:
""
,
}
required_envs
.
update
(
need_envs
)
if
check_error_log
:
required_envs
[
"GLOG_vmodule"
]
=
"fused_all_reduce_op_handle=10,all_reduce_op_handle=10,alloc_continuous_space_op=10,fuse_all_reduce_op_pass=10,alloc_continuous_space_for_grad_pass=10,fast_threaded_ssa_graph_executor=10"
required_envs
[
"GLOG_logtostderr"
]
=
"1"
model_dir
=
tempfile
.
mkdtemp
()
local_env
=
{}
local_env
[
"SAVE"
]
=
"1"
local_env
[
"MODEL_DIR"
]
=
model_dir
local_env
.
update
(
required_envs
)
cluster_env
=
{}
cluster_env
[
"LOAD"
]
=
"1"
cluster_env
[
"MODEL_DIR"
]
=
model_dir
cluster_env
.
update
(
required_envs
)
local_var
=
self
.
_run_local
(
model_file
,
local_env
,
check_error_log
)
tr0_var
,
tr1_var
=
self
.
_run_cluster
(
model_file
,
cluster_env
,
check_error_log
,
log_name
=
flag_name
)
shutil
.
rmtree
(
model_dir
)
local_np
=
np
.
array
(
local_var
)
train0_np
=
np
.
array
(
tr0_var
)
train1_np
=
np
.
array
(
tr1_var
)
np
.
testing
.
assert_almost_equal
(
local_np
,
train0_np
,
decimal
=
2
)
np
.
testing
.
assert_almost_equal
(
local_np
,
train1_np
,
decimal
=
2
)
np
.
testing
.
assert_almost_equal
(
train0_np
,
train1_np
,
decimal
=
2
)
def
test_dist
(
self
):
need_envs
=
{
"IS_DISTRIBUTED"
:
'0'
,
"IS_SPARSE"
:
'0'
,
'IS_SELF_CONTAINED_LR'
:
'1'
,
'SAVE_MODE'
:
'LOCAL'
,
}
self
.
check_with_place
(
"dist_save_load.py"
,
delta
=
0
,
check_error_log
=
False
,
need_envs
=
need_envs
,
)
class
TestDistSaveLoadWithPServerStateDense2x2
(
TestDistBase
):
def
_setup_config
(
self
):
self
.
_sync_mode
=
True
self
.
_enforce_place
=
"CPU"
def
check_with_place
(
self
,
model_file
,
delta
=
1e-3
,
check_error_log
=
False
,
need_envs
=
{},
log_name
=
""
,
):
required_envs
=
{
"PATH"
:
os
.
getenv
(
"PATH"
,
""
),
"PYTHONPATH"
:
os
.
getenv
(
"PYTHONPATH"
,
""
),
"LD_LIBRARY_PATH"
:
os
.
getenv
(
"LD_LIBRARY_PATH"
,
""
),
"http_proxy"
:
""
,
}
required_envs
.
update
(
need_envs
)
if
check_error_log
:
required_envs
[
"GLOG_vmodule"
]
=
"fused_all_reduce_op_handle=10,all_reduce_op_handle=10,alloc_continuous_space_op=10,fuse_all_reduce_op_pass=10,alloc_continuous_space_for_grad_pass=10,fast_threaded_ssa_graph_executor=10"
required_envs
[
"GLOG_logtostderr"
]
=
"1"
model_dir
=
tempfile
.
mkdtemp
()
save_env
=
{}
save_env
[
"SAVE_MODE"
]
=
"DIST"
save_env
[
"SAVE"
]
=
"1"
save_env
[
"MODEL_DIR"
]
=
model_dir
save_env
.
update
(
required_envs
)
tr0_var_1
,
tr1_var_1
=
self
.
_run_cluster
(
model_file
,
save_env
,
check_error_log
,
log_name
=
flag_name
)
load_env
=
{}
load_env
[
"LOAD"
]
=
"1"
load_env
[
"MODEL_DIR"
]
=
model_dir
load_env
.
update
(
required_envs
)
tr0_var_2
,
tr1_var_2
=
self
.
_run_cluster
(
model_file
,
load_env
,
check_error_log
,
log_name
=
flag_name
)
shutil
.
rmtree
(
model_dir
)
train0_1_np
=
np
.
array
(
tr0_var_1
)
train1_1_np
=
np
.
array
(
tr1_var_1
)
train0_2_np
=
np
.
array
(
tr0_var_2
)
train1_2_np
=
np
.
array
(
tr1_var_2
)
np
.
testing
.
assert_almost_equal
(
train0_1_np
,
train0_2_np
,
decimal
=
2
)
np
.
testing
.
assert_almost_equal
(
train1_1_np
,
train1_2_np
,
decimal
=
2
)
def
test_dist
(
self
):
need_envs
=
{
"IS_DISTRIBUTED"
:
'0'
,
"IS_SPARSE"
:
'0'
,
'IS_SELF_CONTAINED_LR'
:
'1'
,
'SAVE_MODE'
:
'DIST'
,
'OPTIMIZER'
:
'ADAM'
,
'SKIP_STEPS'
:
str
(
np
.
random
.
randint
(
2
,
6
)),
}
self
.
check_with_place
(
"dist_save_load.py"
,
delta
=
0
,
check_error_log
=
True
,
need_envs
=
need_envs
,
log_name
=
flag_name
,
)
if
__name__
==
"__main__"
:
unittest
.
main
()
python/paddle/fluid/tests/unittests/test_dist_transformer.py
已删除
100644 → 0
浏览文件 @
2440c980
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
os
import
unittest
from
test_dist_base
import
TestDistBase
import
paddle
def
download_files
():
url_prefix
=
'http://paddle-unittest-data.bj.bcebos.com/dist_transformer/'
vocab_url
=
url_prefix
+
'vocab.bpe.32000'
vocab_md5
=
'a86d345ca6e27f6591d0dccb1b9be853'
paddle
.
dataset
.
common
.
download
(
vocab_url
,
'test_dist_transformer'
,
vocab_md5
)
local_train_url
=
url_prefix
+
'train.tok.clean.bpe.32000.en-de'
local_train_md5
=
'033eb02b9449e6dd823f050782ac8914'
paddle
.
dataset
.
common
.
download
(
local_train_url
,
'test_dist_transformer'
,
local_train_md5
)
train0_url
=
url_prefix
+
'train.tok.clean.bpe.32000.en-de.train_0'
train0_md5
=
'ddce7f602f352a0405267285379a38b1'
paddle
.
dataset
.
common
.
download
(
train0_url
,
'test_dist_transformer'
,
train0_md5
)
train1_url
=
url_prefix
+
'train.tok.clean.bpe.32000.en-de.train_1'
train1_md5
=
'8757798200180285b1a619cd7f408747'
paddle
.
dataset
.
common
.
download
(
train1_url
,
'test_dist_transformer'
,
train1_md5
)
test_url
=
url_prefix
+
'newstest2013.tok.bpe.32000.en-de'
test_md5
=
'9dd74a266dbdb25314183899f269b4a2'
paddle
.
dataset
.
common
.
download
(
test_url
,
'test_dist_transformer'
,
test_md5
)
# cut test data for faster CI
orig_path
=
os
.
path
.
join
(
paddle
.
dataset
.
common
.
DATA_HOME
,
"test_dist_transformer"
,
"newstest2013.tok.bpe.32000.en-de"
,
)
head_path
=
os
.
path
.
join
(
paddle
.
dataset
.
common
.
DATA_HOME
,
"test_dist_transformer"
,
"newstest2013.tok.bpe.32000.en-de.cut"
,
)
os
.
system
(
"head -n10 %s > %s"
%
(
orig_path
,
head_path
))
class
TestDistTransformer2x2Sync
(
TestDistBase
):
def
_setup_config
(
self
):
self
.
_sync_mode
=
True
def
test_dist_train
(
self
):
download_files
()
self
.
check_with_place
(
"dist_transformer.py"
,
delta
=
1e-5
,
check_error_log
=
False
)
class
TestDistTransformer2x2Async
(
TestDistBase
):
def
_setup_config
(
self
):
self
.
_sync_mode
=
False
def
test_dist_train
(
self
):
download_files
()
self
.
check_with_place
(
"dist_transformer.py"
,
delta
=
1.0
,
check_error_log
=
False
)
if
__name__
==
"__main__"
:
unittest
.
main
()
python/paddle/fluid/tests/unittests/test_fleet_api_input.py
已删除
100644 → 0
浏览文件 @
2440c980
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
unittest
from
dist_fleet_simnet_bow
import
train_network
import
paddle
import
paddle.fluid
as
fluid
import
paddle.incubate.distributed.fleet.role_maker
as
role_maker
from
paddle.distributed.transpiler.distribute_transpiler
import
(
DistributeTranspilerConfig
,
)
from
paddle.incubate.distributed.fleet.collective
import
CollectiveOptimizer
# from paddle.incubate.distributed.fleet.parameter_server import TranspilerOptimizer
from
paddle.incubate.distributed.fleet.parameter_server.distribute_transpiler
import
(
fleet
,
)
from
paddle.incubate.distributed.fleet.role_maker
import
(
Role
,
UserDefinedCollectiveRoleMaker
,
UserDefinedRoleMaker
,
)
class
DistributeTranspilerConfigTest
(
unittest
.
TestCase
):
def
set_runtime_split_send_recv
(
self
,
config
,
value
):
config
.
runtime_split_send_recv
=
value
def
set_sync_mode
(
self
,
config
,
value
):
config
.
sync_mode
=
value
def
testConfig
(
self
):
config
=
DistributeTranspilerConfig
()
self
.
assertRaises
(
Exception
,
self
.
set_sync_mode
,
config
,
None
)
self
.
assertRaises
(
Exception
,
self
.
set_runtime_split_send_recv
,
config
,
None
)
self
.
assertRaises
(
Exception
,
self
.
set_runtime_split_send_recv
,
config
,
True
)
self
.
set_sync_mode
(
config
,
False
)
self
.
assertFalse
(
config
.
sync_mode
)
self
.
set_runtime_split_send_recv
(
config
,
True
)
self
.
assertRaises
(
Exception
,
self
.
set_sync_mode
,
config
,
True
)
class
FleetTest
(
unittest
.
TestCase
):
def
testInvalidInputs
(
self
):
self
.
assertRaises
(
Exception
,
fleet
.
split_files
,
"files"
)
self
.
assertRaises
(
Exception
,
fleet
.
init
,
"pserver"
)
data
=
paddle
.
static
.
data
(
name
=
'X'
,
shape
=
[
-
1
,
1
],
dtype
=
'float32'
)
hidden
=
paddle
.
static
.
nn
.
fc
(
x
=
data
,
size
=
10
)
loss
=
paddle
.
mean
(
hidden
)
adam
=
fluid
.
optimizer
.
Adam
()
adam
.
minimize
(
loss
)
place
=
fluid
.
CPUPlace
()
exe
=
fluid
.
Executor
(
place
)
pe
=
fluid
.
ParallelExecutor
(
use_cuda
=
False
,
loss_name
=
loss
.
name
)
self
.
assertRaises
(
Exception
,
fleet
.
save_inference_model
,
dirname
=
'/tmp/'
,
feeded_var_names
=
[
'X'
],
target_vars
=
[
loss
],
executor
=
pe
,
)
self
.
assertRaises
(
Exception
,
fleet
.
save_inference_model
,
dirname
=
'/tmp/'
,
feeded_var_names
=
[
'X'
],
target_vars
=
[
loss
],
executor
=
"executor"
,
)
compiled_prog
=
fluid
.
compiler
.
CompiledProgram
(
fluid
.
default_main_program
()
)
self
.
assertRaises
(
Exception
,
fleet
.
save_inference_model
,
dirname
=
'/tmp/'
,
feeded_var_names
=
[
'X'
],
target_vars
=
[
loss
],
executor
=
exe
,
main_program
=
compiled_prog
,
)
self
.
assertRaises
(
Exception
,
fleet
.
save_persistables
,
executor
=
pe
,
dirname
=
'/tmp/'
)
self
.
assertRaises
(
Exception
,
fleet
.
save_persistables
,
executor
=
"executor"
,
dirname
=
'/tmp/'
,
)
self
.
assertRaises
(
Exception
,
fleet
.
save_persistables
,
executor
=
exe
,
dirname
=
'/tmp/'
,
main_program
=
compiled_prog
,
)
# self.assertRaises(Exception, fleet._transpile, "config")
def
set_program
(
self
,
avg_cost
,
strategy
):
with
fluid
.
scope_guard
(
fluid
.
Scope
()):
optimizer
=
fluid
.
optimizer
.
SGD
(
0.1
)
optimizer
=
fleet
.
distributed_optimizer
(
optimizer
,
strategy
)
optimizer
.
minimize
(
avg_cost
)
def
test_init_role
(
self
):
role
=
role_maker
.
UserDefinedRoleMaker
(
current_id
=
0
,
role
=
role_maker
.
Role
.
SERVER
,
worker_num
=
2
,
server_endpoints
=
[
"127.0.0.1:36011"
,
"127.0.0.1:36012"
],
)
# for test optimizer without init(role)
# fleet.init(role)
batch_size
=
128
is_sparse
=
True
is_distribute
=
False
strategy
=
DistributeTranspilerConfig
()
strategy
.
sync_mode
=
False
strategy
.
geo_sgd_mode
=
True
strategy
.
geo_sgd_need_push_nums
=
5
avg_cost
,
_
,
_
,
_
=
train_network
(
batch_size
,
is_distribute
,
is_sparse
)
self
.
assertRaises
(
Exception
,
self
.
set_program
,
avg_cost
,
strategy
)
def
test_transpile
(
self
):
role
=
role_maker
.
UserDefinedRoleMaker
(
current_id
=
0
,
role
=
role_maker
.
Role
.
SERVER
,
worker_num
=
2
,
server_endpoints
=
[
"127.0.0.1:36011"
,
"127.0.0.1:36012"
],
)
# for test optimizer without init(role)
fleet
.
init
(
role
)
batch_size
=
128
is_sparse
=
True
is_distribute
=
False
strategy
=
DistributeTranspilerConfig
()
strategy
.
sync_mode
=
False
strategy
.
runtime_split_send_recv
=
True
avg_cost
,
_
,
_
,
_
=
train_network
(
batch_size
,
is_distribute
,
is_sparse
)
self
.
set_program
(
avg_cost
,
strategy
)
strategy
.
runtime_split_send_recv
=
False
self
.
set_program
(
avg_cost
,
strategy
)
"""
class TranspilerOptimizerTest(unittest.TestCase):
def testInvalidInputs(self):
self.assertRaises(Exception, TranspilerOptimizer, "Adam", None)
self.assertRaises(
Exception,
TranspilerOptimizer,
fluid.optimizer.Adam(0.001),
"strategy",
)
transpiler = TranspilerOptimizer(fluid.optimizer.Adam(0.001))
self.assertRaises(Exception, transpiler.minimize, loss=[])
data = paddle.static.data(name='X', shape=[-1, 1], dtype='float32')
hidden = paddle.static.nn.fc(x=data, size=10)
loss = paddle.mean(hidden)
self.assertRaises(
Exception, transpiler.minimize, loss=loss.name, startup_program=[]
)
"""
class
UserDefinedRoleMakerTest
(
unittest
.
TestCase
):
def
createRoleMaker
(
self
,
current_id
=
0
,
role
=
Role
.
WORKER
,
worker_num
=
1
,
server_endpoints
=
[
"127.0.0.1:8080"
],
):
role
=
UserDefinedRoleMaker
(
current_id
,
role
,
worker_num
,
server_endpoints
)
def
testRoleMaker
(
self
):
self
.
createRoleMaker
()
# test all invalid server_endpoints
self
.
assertRaises
(
Exception
,
self
.
createRoleMaker
,
server_endpoints
=
None
)
# server_endpoints must be as list
self
.
assertRaises
(
Exception
,
self
.
createRoleMaker
,
server_endpoints
=
[]
)
# server_endpoints can't be empty
self
.
assertRaises
(
Exception
,
self
.
createRoleMaker
,
server_endpoints
=
[
3
,
[]]
)
# element in server_endpoints must be as string
self
.
assertRaises
(
Exception
,
self
.
createRoleMaker
,
server_endpoints
=
[
"127.0.0.1:8080"
,
"127.0.0.1:8080"
],
)
# element in server_endpoints can't be duplicate
# test all invalid current_id
self
.
assertRaises
(
Exception
,
self
.
createRoleMaker
,
current_id
=
"0"
)
# current_id must be as int
self
.
assertRaises
(
Exception
,
self
.
createRoleMaker
,
current_id
=-
1
)
# current_id must be greater than or equal to 0
self
.
assertRaises
(
Exception
,
self
.
createRoleMaker
,
current_id
=
1
,
role
=
Role
.
SERVER
,
server_endpoints
=
[
"127.0.0.1:8080"
],
)
# if role is server, current_id must be less than len(server_endpoints)
# test all invalid worker_num
self
.
assertRaises
(
Exception
,
self
.
createRoleMaker
,
worker_num
=
"1"
)
# worker_num must be as int
self
.
assertRaises
(
Exception
,
self
.
createRoleMaker
,
worker_num
=
0
)
# worker_num must be greater than 0
# test all invalid role
self
.
assertRaises
(
Exception
,
self
.
createRoleMaker
,
role
=
3
)
# role must be as Role(Role.WORKER=1, Role.SERVER=2)
class
UserDefinedCollectiveRoleMakerTest
(
unittest
.
TestCase
):
def
createRoleMaker
(
self
,
current_id
=
0
,
worker_endpoints
=
[
"127.0.0.1:8080"
]
):
role
=
UserDefinedCollectiveRoleMaker
(
current_id
,
worker_endpoints
)
def
testRoleMaker
(
self
):
self
.
createRoleMaker
()
# test all invalid worker_endpoints
self
.
assertRaises
(
Exception
,
self
.
createRoleMaker
,
worker_endpoints
=
None
)
# worker_endpoints must be as list
self
.
assertRaises
(
Exception
,
self
.
createRoleMaker
,
worker_endpoints
=
[]
)
# worker_endpoints can't be empty
self
.
assertRaises
(
Exception
,
self
.
createRoleMaker
,
worker_endpoints
=
[
3
,
[]]
)
# element worker_endpoints must be as string
self
.
assertRaises
(
Exception
,
self
.
createRoleMaker
,
worker_endpoints
=
[
"127.0.0.1:8080"
,
"127.0.0.1:8080"
],
)
# element in worker_endpoints can't be duplicate
# test all invalid current_id
self
.
assertRaises
(
Exception
,
self
.
createRoleMaker
,
current_id
=
"0"
)
# current_id must be as int
self
.
assertRaises
(
Exception
,
self
.
createRoleMaker
,
current_id
=-
1
)
# current_id must be greater than or equal to 0
self
.
assertRaises
(
Exception
,
self
.
createRoleMaker
,
current_id
=
1
,
worker_endpoints
=
[
"127.0.0.1:8080"
],
)
# current_id must be less than len(worker_endpoints)
class
CollectiveOptimizerTest
(
unittest
.
TestCase
):
def
test_ds_as_None
(
self
):
optimizer
=
fluid
.
optimizer
.
AdamOptimizer
()
dist_optimizer
=
CollectiveOptimizer
(
optimizer
,
strategy
=
None
)
if
__name__
==
'__main__'
:
unittest
.
main
()
python/paddle/fluid/tests/unittests/test_fleet_base_2.py
浏览文件 @
3346d681
...
...
@@ -71,7 +71,6 @@ class TestFleetBase(unittest.TestCase):
place
=
fluid
.
CPUPlace
()
exe
=
fluid
.
Executor
(
place
)
exe
.
run
(
paddle
.
static
.
default_startup_program
())
pe
=
fluid
.
ParallelExecutor
(
use_cuda
=
False
,
loss_name
=
avg_cost
.
name
)
compiled_prog
=
fluid
.
compiler
.
CompiledProgram
(
fluid
.
default_main_program
()
)
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录