Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle
提交
3346d681
P
Paddle
项目概览
PaddlePaddle
/
Paddle
1 年多 前同步成功
通知
2302
Star
20931
Fork
5422
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1423
列表
看板
标记
里程碑
合并请求
543
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1,423
Issue
1,423
列表
看板
标记
里程碑
合并请求
543
合并请求
543
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
3346d681
编写于
3月 16, 2023
作者:
K
kangguangli
提交者:
GitHub
3月 16, 2023
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
remove parallelExecutor related unit tests about DistributedTraining (#51698)
上级
2440c980
变更
7
展开全部
显示空白变更内容
内联
并排
Showing
7 changed file
with
0 addition
and
2809 deletion
+0
-2809
python/paddle/fluid/tests/unittests/CMakeLists.txt
python/paddle/fluid/tests/unittests/CMakeLists.txt
+0
-3
python/paddle/fluid/tests/unittests/dist_save_load.py
python/paddle/fluid/tests/unittests/dist_save_load.py
+0
-222
python/paddle/fluid/tests/unittests/dist_transformer.py
python/paddle/fluid/tests/unittests/dist_transformer.py
+0
-2030
python/paddle/fluid/tests/unittests/test_dist_save_load.py
python/paddle/fluid/tests/unittests/test_dist_save_load.py
+0
-173
python/paddle/fluid/tests/unittests/test_dist_transformer.py
python/paddle/fluid/tests/unittests/test_dist_transformer.py
+0
-89
python/paddle/fluid/tests/unittests/test_fleet_api_input.py
python/paddle/fluid/tests/unittests/test_fleet_api_input.py
+0
-291
python/paddle/fluid/tests/unittests/test_fleet_base_2.py
python/paddle/fluid/tests/unittests/test_fleet_base_2.py
+0
-1
未找到文件。
python/paddle/fluid/tests/unittests/CMakeLists.txt
浏览文件 @
3346d681
...
...
@@ -43,7 +43,6 @@ list(APPEND MIXED_DIST_TEST_OPS test_fleet_launch_cloud)
list
(
APPEND MIXED_DIST_TEST_OPS test_fleet_launch_ascend
)
list
(
APPEND MIXED_DIST_TEST_OPS test_ascend_group
)
list
(
APPEND MIXED_DIST_TEST_OPS test_fleet_launch_nproc
)
list
(
APPEND MIXED_DIST_TEST_OPS test_fleet_api_input
)
list
(
APPEND MIXED_DIST_TEST_OPS test_fleet_base
)
list
(
APPEND MIXED_DIST_TEST_OPS test_fleet_base_2
)
...
...
@@ -602,7 +601,6 @@ if(WITH_DISTRIBUTE)
add_subdirectory
(
collective
)
# FIXME(typhoonzero): add these tests back
list
(
REMOVE_ITEM DIST_TEST_OPS
"test_dist_transformer"
)
list
(
REMOVE_ITEM DIST_TEST_OPS
"test_dist_transpiler"
)
# TODO(sandyhouse): fix and add the ut back
...
...
@@ -615,7 +613,6 @@ if(WITH_DISTRIBUTE)
list
(
REMOVE_ITEM DIST_TEST_OPS
"test_dist_ctr"
)
list
(
REMOVE_ITEM DIST_TEST_OPS
"test_dist_mnist_lars"
)
list
(
REMOVE_ITEM DIST_TEST_OPS
"test_dist_mnist_train"
)
list
(
REMOVE_ITEM DIST_TEST_OPS
"test_dist_save_load"
)
list
(
REMOVE_ITEM DIST_TEST_OPS
"test_dist_text_classification"
)
list
(
REMOVE_ITEM DIST_TEST_OPS
"test_dist_train"
)
list
(
REMOVE_ITEM DIST_TEST_OPS
"test_dist_word2vec"
)
...
...
python/paddle/fluid/tests/unittests/dist_save_load.py
已删除
100644 → 0
浏览文件 @
2440c980
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
os
import
pickle
import
sys
import
numpy
as
np
from
dist_simnet_bow
import
DATA_MD5
,
DATA_URL
,
TestDistSimnetBow2x2
from
test_dist_base
import
RUN_STEP
,
runtime_main
import
paddle
import
paddle.fluid
as
fluid
from
paddle.fluid
import
core
class TestDistSaveLoad2x2(TestDistSimnetBow2x2):
    """Save/load variant of the simnet-bow distributed model runner.

    Behaviour is selected through environment variables read at run time:

    * ``SAVE`` / ``LOAD``: integer flags (default ``"0"``).
    * ``MODEL_DIR``: directory persistables are written to / read from.
    * ``SAVE_MODE``: ``"LOCAL"`` or ``"DIST"``; anything else is an error.
    * ``SKIP_STEPS``: required in ``DIST`` mode; the step index at which
      trainer 0 saves, and up to which the non-saving run skips execution.

    The trainer writes its result to ``sys.stdout.buffer`` as a pickle.
    """

    def _load_persistable_vars(self, executor, dirname, program):
        """Load the checkpoint-eligible variables of *program* from *dirname*."""
        skipped_types = (
            core.VarDesc.VarType.FEED_MINIBATCH,
            core.VarDesc.VarType.FETCH_LIST,
            core.VarDesc.VarType.RAW,
        )
        # Name fragments that exclude a variable from the checkpoint:
        #   @GRAD     - gradient variables
        #   .trainer_ - distributed-training variables
        #   .block    - distributed-training variables
        #   tmp_      - excluded by the original code without explanation
        skipped_name_parts = ("@GRAD", ".trainer_", ".block", "tmp_")

        def _is_checkpoint_var(var):
            """Return True when *var* should be loaded from the checkpoint.

            Variables of type FEED_MINIBATCH/FETCH_LIST/RAW, and variables
            whose name contains one of ``skipped_name_parts``, are rejected;
            everything else is accepted iff it is persistable.
            """
            if var.desc.type() in skipped_types:
                return False
            if any(part in var.name for part in skipped_name_parts):
                return False
            return var.persistable

        paddle.static.io.load_vars(
            executor,
            dirname=dirname,
            main_program=program,
            predicate=_is_checkpoint_var,
            filename=None,
        )

    def run_pserver(self, args):
        """Build and run the parameter-server program for this endpoint.

        When ``LOAD`` is truthy and ``MODEL_DIR`` is non-empty, persistables
        are loaded after the startup program and before the pserver program.
        """
        self.get_model(batch_size=2)
        # NOTE: pserver should not call memory optimize
        t = self.get_transpiler(
            args.trainer_id,
            fluid.default_main_program(),
            args.endpoints,
            args.trainers,
            args.sync_mode,
            False,
            args.current_endpoint,
        )
        pserver_prog = t.get_pserver_program(args.current_endpoint)
        startup_prog = t.get_startup_program(
            args.current_endpoint, pserver_prog
        )

        need_load = bool(int(os.getenv("LOAD", "0")))
        model_dir = os.getenv("MODEL_DIR", "")

        exe = fluid.Executor(fluid.CPUPlace())
        exe.run(startup_prog)

        if need_load and model_dir:
            paddle.distributed.io.load_persistables(
                exe, model_dir, pserver_prog
            )

        exe.run(pserver_prog)

    def run_trainer(self, args):
        """Run the trainer side and emit a pickled result on stdout.

        ``LOCAL`` mode: optionally trains ``RUN_STEP`` steps and saves, then
        dumps the flattened ``__fc_b__`` tensor.
        ``DIST`` mode: runs 8 steps (saving at ``SKIP_STEPS`` on trainer 0)
        or, when not saving, only the steps after ``SKIP_STEPS``; dumps the
        last fetched loss.

        Raises:
            ValueError: unknown ``SAVE_MODE``, or ``SKIP_STEPS`` unset in
                ``DIST`` mode.
            RuntimeError: ``DIST`` mode finished without executing a step.
        """
        # Only the loss variable and the train reader are used here.
        (_, avg_cost, train_reader, _, _, _) = self.get_model(batch_size=2)

        if args.update_method == "pserver":
            t = self.get_transpiler(
                args.trainer_id,
                fluid.default_main_program(),
                args.endpoints,
                args.trainers,
                args.sync_mode,
            )
            trainer_prog = t.get_trainer_program()
        else:
            trainer_prog = fluid.default_main_program()

        place = fluid.CUDAPlace(0) if args.use_cuda else fluid.CPUPlace()

        startup_exe = fluid.Executor(place)
        startup_exe.run(fluid.default_startup_program())

        strategy = fluid.ExecutionStrategy()
        strategy.num_threads = 1

        build_stra = fluid.BuildStrategy()
        if args.use_reduce:
            build_stra.reduce_strategy = (
                fluid.BuildStrategy.ReduceStrategy.Reduce
            )
        else:
            build_stra.reduce_strategy = (
                fluid.BuildStrategy.ReduceStrategy.AllReduce
            )

        exe = fluid.ParallelExecutor(
            args.use_cuda,
            loss_name=avg_cost.name,
            exec_strategy=strategy,
            build_strategy=build_stra,
        )

        feed_var_list = [
            var
            for var in trainer_prog.global_block().vars.values()
            if var.is_data
        ]
        feeder = fluid.DataFeeder(feed_var_list, place)
        reader_generator = train_reader()

        def get_data():
            """Return the next batch; in pserver mode with reader allocation,
            keep only the items whose index parity equals the trainer id."""
            origin_batch = next(reader_generator)
            if args.update_method == "pserver" and args.use_reader_alloc:
                return [
                    item
                    for offset, item in enumerate(origin_batch)
                    if offset % 2 == args.trainer_id
                ]
            return origin_batch

        def run_step(batch):
            """Execute one training step on *batch* and return the loss."""
            (step_loss,) = exe.run(
                fetch_list=[avg_cost.name], feed=feeder.feed(batch)
            )
            return step_loss

        def save_checkpoint():
            paddle.distributed.io.save_persistables(
                startup_exe, model_dir, trainer_prog
            )

        need_save = bool(int(os.getenv("SAVE", "0")))
        model_dir = os.getenv("MODEL_DIR", "")
        save_mode = os.getenv("SAVE_MODE", "")

        if save_mode == "LOCAL":
            if need_save:
                for _ in range(RUN_STEP):
                    run_step(get_data())
                # NOTE(review): the scraped source lost its indentation; the
                # save is placed after the training loop (once), not inside
                # it - confirm against the upstream file.
                if model_dir:
                    save_checkpoint()

            var = np.array(
                fluid.global_scope().find_var('__fc_b__').get_tensor()
            )
            sys.stdout.buffer.write(pickle.dumps(np.ravel(var).tolist()))

        elif save_mode == "DIST":
            raw_skip_steps = os.getenv("SKIP_STEPS")
            if raw_skip_steps is None:
                # int(None) would otherwise fail with an opaque TypeError.
                raise ValueError(
                    "SKIP_STEPS must be set when SAVE_MODE is DIST"
                )
            skip_steps = int(raw_skip_steps)

            loss = None
            if need_save:
                for idx in range(8):
                    loss = run_step(get_data())
                    if model_dir and idx == skip_steps and args.trainer_id == 0:
                        save_checkpoint()
            else:
                for idx in range(8):
                    # The batch is always consumed so the reader stays in
                    # step with the saving run, even for skipped steps.
                    data = get_data()
                    if idx <= skip_steps:
                        continue
                    loss = run_step(data)

            if loss is None:
                # Happens when SKIP_STEPS >= 7 in the non-saving run.
                raise RuntimeError(
                    "no training step was executed (SKIP_STEPS=%d)"
                    % skip_steps
                )
            sys.stdout.buffer.write(pickle.dumps(loss.tolist()))
        else:
            raise ValueError("save_mode must be LOCAL or DIST")
# Script entry point: download the 'simnet' "train" data (verified against
# DATA_MD5), then hand the model class to the shared runtime_main driver.
if __name__ == "__main__":
    paddle.dataset.common.download(DATA_URL, 'simnet', DATA_MD5, "train")
    runtime_main(TestDistSaveLoad2x2)
python/paddle/fluid/tests/unittests/dist_transformer.py
已删除
100644 → 0
浏览文件 @
2440c980
此差异已折叠。
点击以展开。
python/paddle/fluid/tests/unittests/test_dist_save_load.py
已删除
100644 → 0
浏览文件 @
2440c980
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
os
import
shutil
import
tempfile
import
unittest
import
numpy
as
np
from
test_dist_base
import
TestDistBase
# Path of this test file with its extension stripped; the test classes in
# this file pass it as ``log_name`` to ``_run_cluster``.
flag_name = os.path.splitext(__file__)[0]
class TestDistSaveLoadDense2x2(TestDistBase):
    """Save in a local run, load in a cluster run, compare the results."""

    def _setup_config(self):
        self._sync_mode = True
        self._enforce_place = "CPU"

    def check_with_place(
        self,
        model_file,
        delta=1e-3,
        check_error_log=False,
        need_envs=None,
        log_name="",
    ):
        """Run *model_file* locally (SAVE=1) and on a cluster (LOAD=1).

        Both runs share one temporary ``MODEL_DIR``, which is removed even
        if a run raises. The three results must agree to 2 decimals.

        Args:
            model_file: script handed to ``_run_local`` / ``_run_cluster``.
            delta: not read by this override.
            check_error_log: forwarded to both runs; also enables the GLOG
                settings below.
            need_envs: extra environment variables; they override the
                required defaults. ``None`` means no extras.
            log_name: not read by this override; the cluster run always
                uses the module-level ``flag_name``.
        """
        required_envs = {
            "PATH": os.getenv("PATH", ""),
            "PYTHONPATH": os.getenv("PYTHONPATH", ""),
            "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""),
            "http_proxy": "",
        }
        required_envs.update(need_envs or {})

        if check_error_log:
            required_envs["GLOG_vmodule"] = (
                "fused_all_reduce_op_handle=10,all_reduce_op_handle=10,alloc_continuous_space_op=10,fuse_all_reduce_op_pass=10,alloc_continuous_space_for_grad_pass=10,fast_threaded_ssa_graph_executor=10"
            )
            required_envs["GLOG_logtostderr"] = "1"

        model_dir = tempfile.mkdtemp()
        try:
            # required_envs is merged last so it wins on key clashes, as in
            # the original update() order.
            local_env = {"SAVE": "1", "MODEL_DIR": model_dir, **required_envs}
            cluster_env = {
                "LOAD": "1",
                "MODEL_DIR": model_dir,
                **required_envs,
            }

            local_var = self._run_local(model_file, local_env, check_error_log)
            tr0_var, tr1_var = self._run_cluster(
                model_file, cluster_env, check_error_log, log_name=flag_name
            )
        finally:
            # Previously skipped when a run raised, leaking the directory.
            shutil.rmtree(model_dir)

        local_np = np.array(local_var)
        train0_np = np.array(tr0_var)
        train1_np = np.array(tr1_var)

        np.testing.assert_almost_equal(local_np, train0_np, decimal=2)
        np.testing.assert_almost_equal(local_np, train1_np, decimal=2)
        np.testing.assert_almost_equal(train0_np, train1_np, decimal=2)

    def test_dist(self):
        need_envs = {
            "IS_DISTRIBUTED": '0',
            "IS_SPARSE": '0',
            'IS_SELF_CONTAINED_LR': '1',
            'SAVE_MODE': 'LOCAL',
        }
        self.check_with_place(
            "dist_save_load.py",
            delta=0,
            check_error_log=False,
            need_envs=need_envs,
        )
class TestDistSaveLoadWithPServerStateDense2x2(TestDistBase):
    """Save in one cluster run, load in a second, compare per trainer."""

    def _setup_config(self):
        self._sync_mode = True
        self._enforce_place = "CPU"

    def check_with_place(
        self,
        model_file,
        delta=1e-3,
        check_error_log=False,
        need_envs=None,
        log_name="",
    ):
        """Run *model_file* on a cluster twice: first saving, then loading.

        Both runs share one temporary ``MODEL_DIR``, which is removed even
        if a run raises. Each trainer's result from the saving run must
        match its result from the loading run to 2 decimals.

        Args:
            model_file: script handed to ``_run_cluster``.
            delta: not read by this override.
            check_error_log: forwarded to both runs; also enables the GLOG
                settings below.
            need_envs: extra environment variables; they override the
                required defaults. ``None`` means no extras.
            log_name: not read by this override; both runs always use the
                module-level ``flag_name``.
        """
        required_envs = {
            "PATH": os.getenv("PATH", ""),
            "PYTHONPATH": os.getenv("PYTHONPATH", ""),
            "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""),
            "http_proxy": "",
        }
        required_envs.update(need_envs or {})

        if check_error_log:
            required_envs["GLOG_vmodule"] = (
                "fused_all_reduce_op_handle=10,all_reduce_op_handle=10,alloc_continuous_space_op=10,fuse_all_reduce_op_pass=10,alloc_continuous_space_for_grad_pass=10,fast_threaded_ssa_graph_executor=10"
            )
            required_envs["GLOG_logtostderr"] = "1"

        model_dir = tempfile.mkdtemp()
        try:
            # required_envs is merged last so it wins on key clashes, as in
            # the original update() order.
            save_env = {
                "SAVE_MODE": "DIST",
                "SAVE": "1",
                "MODEL_DIR": model_dir,
                **required_envs,
            }
            tr0_var_1, tr1_var_1 = self._run_cluster(
                model_file, save_env, check_error_log, log_name=flag_name
            )

            load_env = {"LOAD": "1", "MODEL_DIR": model_dir, **required_envs}
            tr0_var_2, tr1_var_2 = self._run_cluster(
                model_file, load_env, check_error_log, log_name=flag_name
            )
        finally:
            # Previously skipped when a run raised, leaking the directory.
            shutil.rmtree(model_dir)

        train0_1_np = np.array(tr0_var_1)
        train1_1_np = np.array(tr1_var_1)
        train0_2_np = np.array(tr0_var_2)
        train1_2_np = np.array(tr1_var_2)

        np.testing.assert_almost_equal(train0_1_np, train0_2_np, decimal=2)
        np.testing.assert_almost_equal(train1_1_np, train1_2_np, decimal=2)

    def test_dist(self):
        need_envs = {
            "IS_DISTRIBUTED": '0',
            "IS_SPARSE": '0',
            'IS_SELF_CONTAINED_LR': '1',
            'SAVE_MODE': 'DIST',
            'OPTIMIZER': 'ADAM',
            # Random save/skip step in [2, 6); differs between test runs.
            'SKIP_STEPS': str(np.random.randint(2, 6)),
        }
        self.check_with_place(
            "dist_save_load.py",
            delta=0,
            check_error_log=True,
            need_envs=need_envs,
            log_name=flag_name,
        )
# Run the test cases of this module when executed as a script.
if __name__ == "__main__":
    unittest.main()
python/paddle/fluid/tests/unittests/test_dist_transformer.py
已删除
100644 → 0
浏览文件 @
2440c980
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
os
import
unittest
from
test_dist_base
import
TestDistBase
import
paddle
def download_files():
    """Download the transformer test fixtures and cut the test set.

    Each fixture is fetched with ``paddle.dataset.common.download`` into the
    ``test_dist_transformer`` cache directory and checked against its MD5.
    The first 10 lines of the newstest2013 file are then copied to a
    sibling ``.cut`` file for faster CI.

    The cut is done in Python rather than by shelling out to ``head``, so
    it works with paths containing spaces or shell metacharacters, does not
    need a POSIX shell, and raises if the source file is missing instead of
    failing silently.
    """
    from itertools import islice

    url_prefix = 'http://paddle-unittest-data.bj.bcebos.com/dist_transformer/'
    cache_dir = 'test_dist_transformer'
    test_file = 'newstest2013.tok.bpe.32000.en-de'

    # (file name, expected MD5), downloaded in this order.
    fixtures = (
        ('vocab.bpe.32000', 'a86d345ca6e27f6591d0dccb1b9be853'),
        (
            'train.tok.clean.bpe.32000.en-de',
            '033eb02b9449e6dd823f050782ac8914',
        ),
        (
            'train.tok.clean.bpe.32000.en-de.train_0',
            'ddce7f602f352a0405267285379a38b1',
        ),
        (
            'train.tok.clean.bpe.32000.en-de.train_1',
            '8757798200180285b1a619cd7f408747',
        ),
        (test_file, '9dd74a266dbdb25314183899f269b4a2'),
    )
    for file_name, md5 in fixtures:
        paddle.dataset.common.download(url_prefix + file_name, cache_dir, md5)

    # cut test data for faster CI
    orig_path = os.path.join(
        paddle.dataset.common.DATA_HOME, cache_dir, test_file
    )
    head_path = orig_path + ".cut"
    # Binary mode copies the bytes unchanged, like ``head -n10``.
    with open(orig_path, "rb") as src, open(head_path, "wb") as dst:
        dst.writelines(islice(src, 10))
class TestDistTransformer2x2Sync(TestDistBase):
    """Runs dist_transformer.py through ``check_with_place`` with
    ``_sync_mode`` switched on."""

    def _setup_config(self):
        self._sync_mode = True

    def test_dist_train(self):
        # Fixtures must be on disk before the model script is launched.
        download_files()
        model_script = "dist_transformer.py"
        self.check_with_place(
            model_script, check_error_log=False, delta=1e-5
        )
class TestDistTransformer2x2Async(TestDistBase):
    """Runs dist_transformer.py through ``check_with_place`` with
    ``_sync_mode`` switched off and a much looser ``delta`` (1.0)."""

    def _setup_config(self):
        self._sync_mode = False

    def test_dist_train(self):
        # Fixtures must be on disk before the model script is launched.
        download_files()
        model_script = "dist_transformer.py"
        self.check_with_place(
            model_script, check_error_log=False, delta=1.0
        )
# Run the test cases of this module when executed as a script.
if __name__ == "__main__":
    unittest.main()
python/paddle/fluid/tests/unittests/test_fleet_api_input.py
已删除
100644 → 0
浏览文件 @
2440c980
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
unittest
from
dist_fleet_simnet_bow
import
train_network
import
paddle
import
paddle.fluid
as
fluid
import
paddle.incubate.distributed.fleet.role_maker
as
role_maker
from
paddle.distributed.transpiler.distribute_transpiler
import
(
DistributeTranspilerConfig
,
)
from
paddle.incubate.distributed.fleet.collective
import
CollectiveOptimizer
# from paddle.incubate.distributed.fleet.parameter_server import TranspilerOptimizer
from
paddle.incubate.distributed.fleet.parameter_server.distribute_transpiler
import
(
fleet
,
)
from
paddle.incubate.distributed.fleet.role_maker
import
(
Role
,
UserDefinedCollectiveRoleMaker
,
UserDefinedRoleMaker
,
)
class DistributeTranspilerConfigTest(unittest.TestCase):
    """Checks which assignments to ``sync_mode`` and
    ``runtime_split_send_recv`` a DistributeTranspilerConfig rejects."""

    def set_runtime_split_send_recv(self, config, value):
        """Assign *value* to ``config.runtime_split_send_recv``."""
        config.runtime_split_send_recv = value

    def set_sync_mode(self, config, value):
        """Assign *value* to ``config.sync_mode``."""
        config.sync_mode = value

    def testConfig(self):
        cfg = DistributeTranspilerConfig()

        # On a fresh config these three assignments must raise.
        with self.assertRaises(Exception):
            self.set_sync_mode(cfg, None)
        with self.assertRaises(Exception):
            self.set_runtime_split_send_recv(cfg, None)
        with self.assertRaises(Exception):
            self.set_runtime_split_send_recv(cfg, True)

        # Once sync_mode is off, runtime_split_send_recv=True is accepted...
        self.set_sync_mode(cfg, False)
        self.assertFalse(cfg.sync_mode)
        self.set_runtime_split_send_recv(cfg, True)

        # ...and turning sync_mode back on must then raise.
        with self.assertRaises(Exception):
            self.set_sync_mode(cfg, True)
class FleetTest(unittest.TestCase):
    """Input-validation tests for the transpiler-based ``fleet`` object."""

    def testInvalidInputs(self):
        # Each assertRaises below expects the given fleet call to raise.
        self.assertRaises(Exception, fleet.split_files, "files")
        self.assertRaises(Exception, fleet.init, "pserver")

        # Build a minimal network so real loss/executor objects exist.
        data = paddle.static.data(name='X', shape=[-1, 1], dtype='float32')
        hidden = paddle.static.nn.fc(x=data, size=10)
        loss = paddle.mean(hidden)
        adam = fluid.optimizer.Adam()
        adam.minimize(loss)
        place = fluid.CPUPlace()
        exe = fluid.Executor(place)
        pe = fluid.ParallelExecutor(use_cuda=False, loss_name=loss.name)

        # save_inference_model: ParallelExecutor as executor.
        self.assertRaises(
            Exception,
            fleet.save_inference_model,
            dirname='/tmp/',
            feeded_var_names=['X'],
            target_vars=[loss],
            executor=pe,
        )
        # save_inference_model: a plain string as executor.
        self.assertRaises(
            Exception,
            fleet.save_inference_model,
            dirname='/tmp/',
            feeded_var_names=['X'],
            target_vars=[loss],
            executor="executor",
        )
        compiled_prog = fluid.compiler.CompiledProgram(
            fluid.default_main_program()
        )
        # save_inference_model: a CompiledProgram as main_program.
        self.assertRaises(
            Exception,
            fleet.save_inference_model,
            dirname='/tmp/',
            feeded_var_names=['X'],
            target_vars=[loss],
            executor=exe,
            main_program=compiled_prog,
        )
        # save_persistables: the same three argument combinations.
        self.assertRaises(
            Exception, fleet.save_persistables, executor=pe, dirname='/tmp/'
        )
        self.assertRaises(
            Exception,
            fleet.save_persistables,
            executor="executor",
            dirname='/tmp/',
        )
        self.assertRaises(
            Exception,
            fleet.save_persistables,
            executor=exe,
            dirname='/tmp/',
            main_program=compiled_prog,
        )
        # self.assertRaises(Exception, fleet._transpile, "config")

    def set_program(self, avg_cost, strategy):
        # Wrap an SGD optimizer with fleet and minimize inside a fresh Scope.
        with fluid.scope_guard(fluid.Scope()):
            optimizer = fluid.optimizer.SGD(0.1)
            optimizer = fleet.distributed_optimizer(optimizer, strategy)
            optimizer.minimize(avg_cost)

    def test_init_role(self):
        # The role maker is constructed but deliberately never passed to
        # fleet.init (see the commented-out call below).
        role = role_maker.UserDefinedRoleMaker(
            current_id=0,
            role=role_maker.Role.SERVER,
            worker_num=2,
            server_endpoints=["127.0.0.1:36011", "127.0.0.1:36012"],
        )
        # for test optimizer without init(role)
        # fleet.init(role)
        batch_size = 128
        is_sparse = True
        is_distribute = False
        strategy = DistributeTranspilerConfig()
        strategy.sync_mode = False
        strategy.geo_sgd_mode = True
        strategy.geo_sgd_need_push_nums = 5

        avg_cost, _, _, _ = train_network(batch_size, is_distribute, is_sparse)

        # set_program is expected to raise here.
        self.assertRaises(Exception, self.set_program, avg_cost, strategy)

    def test_transpile(self):
        role = role_maker.UserDefinedRoleMaker(
            current_id=0,
            role=role_maker.Role.SERVER,
            worker_num=2,
            server_endpoints=["127.0.0.1:36011", "127.0.0.1:36012"],
        )
        # Unlike test_init_role, fleet is initialised with the role here, so
        # both set_program calls below are expected to succeed.
        fleet.init(role)
        batch_size = 128
        is_sparse = True
        is_distribute = False

        strategy = DistributeTranspilerConfig()
        strategy.sync_mode = False
        strategy.runtime_split_send_recv = True
        avg_cost, _, _, _ = train_network(batch_size, is_distribute, is_sparse)

        self.set_program(avg_cost, strategy)
        # Repeat with runtime_split_send_recv switched off.
        strategy.runtime_split_send_recv = False
        self.set_program(avg_cost, strategy)
# Disabled test case kept as a bare module-level string literal (it is never
# executed); its TranspilerOptimizer import is commented out as well.
"""
class TranspilerOptimizerTest(unittest.TestCase):
def testInvalidInputs(self):
self.assertRaises(Exception, TranspilerOptimizer, "Adam", None)
self.assertRaises(
Exception,
TranspilerOptimizer,
fluid.optimizer.Adam(0.001),
"strategy",
)
transpiler = TranspilerOptimizer(fluid.optimizer.Adam(0.001))
self.assertRaises(Exception, transpiler.minimize, loss=[])
data = paddle.static.data(name='X', shape=[-1, 1], dtype='float32')
hidden = paddle.static.nn.fc(x=data, size=10)
loss = paddle.mean(hidden)
self.assertRaises(
Exception, transpiler.minimize, loss=loss.name, startup_program=[]
)
"""
class UserDefinedRoleMakerTest(unittest.TestCase):
    """Argument-validation tests for UserDefinedRoleMaker."""

    # Sentinel default: ``None`` is itself one of the invalid inputs under
    # test, so it cannot stand in for "argument omitted".
    _DEFAULT_ENDPOINTS = object()

    def createRoleMaker(
        self,
        current_id=0,
        role=Role.WORKER,
        worker_num=1,
        server_endpoints=_DEFAULT_ENDPOINTS,
    ):
        """Construct a UserDefinedRoleMaker; the instance is discarded.

        When ``server_endpoints`` is omitted, a fresh
        ``["127.0.0.1:8080"]`` list is built per call, so no list object is
        shared between calls.
        """
        if server_endpoints is self._DEFAULT_ENDPOINTS:
            server_endpoints = ["127.0.0.1:8080"]
        UserDefinedRoleMaker(current_id, role, worker_num, server_endpoints)

    def testRoleMaker(self):
        # The default arguments must be accepted.
        self.createRoleMaker()

        invalid_kwargs = [
            # --- invalid server_endpoints ---
            # server_endpoints must be a list
            {"server_endpoints": None},
            # server_endpoints can't be empty
            {"server_endpoints": []},
            # elements of server_endpoints must be strings
            {"server_endpoints": [3, []]},
            # elements of server_endpoints can't be duplicated
            {"server_endpoints": ["127.0.0.1:8080", "127.0.0.1:8080"]},
            # --- invalid current_id ---
            # current_id must be an int
            {"current_id": "0"},
            # current_id must be greater than or equal to 0
            {"current_id": -1},
            # if role is server, current_id must be less than
            # len(server_endpoints)
            {
                "current_id": 1,
                "role": Role.SERVER,
                "server_endpoints": ["127.0.0.1:8080"],
            },
            # --- invalid worker_num ---
            # worker_num must be an int
            {"worker_num": "1"},
            # worker_num must be greater than 0
            {"worker_num": 0},
            # --- invalid role ---
            # role must be a Role (Role.WORKER=1, Role.SERVER=2)
            {"role": 3},
        ]
        # subTest reports every failing case instead of stopping at the
        # first one.
        for kwargs in invalid_kwargs:
            with self.subTest(**kwargs):
                with self.assertRaises(Exception):
                    self.createRoleMaker(**kwargs)
class UserDefinedCollectiveRoleMakerTest(unittest.TestCase):
    """Argument-validation tests for UserDefinedCollectiveRoleMaker."""

    # Sentinel default: ``None`` is itself one of the invalid inputs under
    # test, so it cannot stand in for "argument omitted".
    _DEFAULT_ENDPOINTS = object()

    def createRoleMaker(
        self, current_id=0, worker_endpoints=_DEFAULT_ENDPOINTS
    ):
        """Construct a UserDefinedCollectiveRoleMaker; the instance is
        discarded.

        When ``worker_endpoints`` is omitted, a fresh
        ``["127.0.0.1:8080"]`` list is built per call, so no list object is
        shared between calls.
        """
        if worker_endpoints is self._DEFAULT_ENDPOINTS:
            worker_endpoints = ["127.0.0.1:8080"]
        UserDefinedCollectiveRoleMaker(current_id, worker_endpoints)

    def testRoleMaker(self):
        # The default arguments must be accepted.
        self.createRoleMaker()

        invalid_kwargs = [
            # --- invalid worker_endpoints ---
            # worker_endpoints must be a list
            {"worker_endpoints": None},
            # worker_endpoints can't be empty
            {"worker_endpoints": []},
            # elements of worker_endpoints must be strings
            {"worker_endpoints": [3, []]},
            # elements of worker_endpoints can't be duplicated
            {"worker_endpoints": ["127.0.0.1:8080", "127.0.0.1:8080"]},
            # --- invalid current_id ---
            # current_id must be an int
            {"current_id": "0"},
            # current_id must be greater than or equal to 0
            {"current_id": -1},
            # current_id must be less than len(worker_endpoints)
            {"current_id": 1, "worker_endpoints": ["127.0.0.1:8080"]},
        ]
        # subTest reports every failing case instead of stopping at the
        # first one.
        for kwargs in invalid_kwargs:
            with self.subTest(**kwargs):
                with self.assertRaises(Exception):
                    self.createRoleMaker(**kwargs)
class CollectiveOptimizerTest(unittest.TestCase):
    """Smoke test for constructing a CollectiveOptimizer."""

    def test_ds_as_None(self):
        # There is no assertion: the test passes as long as wrapping an Adam
        # optimizer with strategy=None does not raise.
        base_optimizer = fluid.optimizer.AdamOptimizer()
        CollectiveOptimizer(base_optimizer, strategy=None)
# Run the test cases of this module when executed as a script.
if __name__ == '__main__':
    unittest.main()
python/paddle/fluid/tests/unittests/test_fleet_base_2.py
浏览文件 @
3346d681
...
...
@@ -71,7 +71,6 @@ class TestFleetBase(unittest.TestCase):
place
=
fluid
.
CPUPlace
()
exe
=
fluid
.
Executor
(
place
)
exe
.
run
(
paddle
.
static
.
default_startup_program
())
pe
=
fluid
.
ParallelExecutor
(
use_cuda
=
False
,
loss_name
=
avg_cost
.
name
)
compiled_prog
=
fluid
.
compiler
.
CompiledProgram
(
fluid
.
default_main_program
()
)
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录