Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle
提交
a5ccc713
P
Paddle
项目概览
PaddlePaddle
/
Paddle
1 年多 前同步成功
通知
2302
Star
20931
Fork
5422
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1423
列表
看板
标记
里程碑
合并请求
543
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1,423
Issue
1,423
列表
看板
标记
里程碑
合并请求
543
合并请求
543
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
a5ccc713
编写于
6月 20, 2022
作者:
Z
zhaoyingli
提交者:
GitHub
6月 20, 2022
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
place all save/load path into temporary directory (#43652)
上级
0f16ccf5
变更
13
隐藏空白更改
内联
并排
Showing
13 changed file
with
175 addition
and
92 deletion
+175
-92
python/paddle/fluid/tests/unittests/test_collective_api_base.py
.../paddle/fluid/tests/unittests/test_collective_api_base.py
+15
-5
python/paddle/fluid/tests/unittests/test_collective_base.py
python/paddle/fluid/tests/unittests/test_collective_base.py
+11
-3
python/paddle/fluid/tests/unittests/test_communicator_ps_gpu.py
.../paddle/fluid/tests/unittests/test_communicator_ps_gpu.py
+5
-2
python/paddle/fluid/tests/unittests/test_dist_base.py
python/paddle/fluid/tests/unittests/test_dist_base.py
+25
-9
python/paddle/fluid/tests/unittests/test_dist_mnist_dgc_nccl.py
.../paddle/fluid/tests/unittests/test_dist_mnist_dgc_nccl.py
+8
-3
python/paddle/fluid/tests/unittests/test_dist_tree_index.py
python/paddle/fluid/tests/unittests/test_dist_tree_index.py
+10
-1
python/paddle/fluid/tests/unittests/test_distributed_fused_lamb_op_with_clip.py
...sts/unittests/test_distributed_fused_lamb_op_with_clip.py
+7
-5
python/paddle/fluid/tests/unittests/test_fleet_base_2.py
python/paddle/fluid/tests/unittests/test_fleet_base_2.py
+9
-5
python/paddle/fluid/tests/unittests/test_fleet_elastic_collective.py
...le/fluid/tests/unittests/test_fleet_elastic_collective.py
+8
-5
python/paddle/fluid/tests/unittests/test_fleet_exe_dist_model_run.py
...le/fluid/tests/unittests/test_fleet_exe_dist_model_run.py
+11
-9
python/paddle/fluid/tests/unittests/test_fleet_rolemaker_2.py
...on/paddle/fluid/tests/unittests/test_fleet_rolemaker_2.py
+12
-4
python/paddle/fluid/tests/unittests/test_monitor.py
python/paddle/fluid/tests/unittests/test_monitor.py
+8
-8
python/paddle/fluid/tests/unittests/test_run.py
python/paddle/fluid/tests/unittests/test_run.py
+46
-33
未找到文件。
python/paddle/fluid/tests/unittests/test_collective_api_base.py
浏览文件 @
a5ccc713
...
...
@@ -23,6 +23,7 @@ import subprocess
import
traceback
import
functools
import
pickle
import
tempfile
from
contextlib
import
closing
import
paddle
import
paddle.fluid
as
fluid
...
...
@@ -97,6 +98,11 @@ class TestDistBase(unittest.TestCase):
self
.
_find_free_port
(),
self
.
_find_free_port
())
self
.
_python_interp
=
sys
.
executable
self
.
temp_dir
=
tempfile
.
TemporaryDirectory
()
def
tearDown
(
self
):
self
.
temp_dir
.
cleanup
()
def
_find_free_port
(
self
):
def
__free_port
():
with
closing
(
socket
.
socket
(
socket
.
AF_INET
,
...
...
@@ -155,9 +161,13 @@ class TestDistBase(unittest.TestCase):
tr_cmd
=
"%s %s"
tr0_cmd
=
tr_cmd
%
(
self
.
_python_interp
,
model_file
)
tr1_cmd
=
tr_cmd
%
(
self
.
_python_interp
,
model_file
)
tr0_pipe
=
open
(
"/tmp/tr0_err_%d.log"
%
os
.
getpid
(),
"w"
)
tr1_pipe
=
open
(
"/tmp/tr1_err_%d.log"
%
os
.
getpid
(),
"w"
)
#print(tr0_cmd)
path0
=
os
.
path
.
join
(
self
.
temp_dir
.
name
,
"/tmp/tr0_err_%d.log"
%
os
.
getpid
())
path1
=
os
.
path
.
join
(
self
.
temp_dir
.
name
,
"/tmp/tr1_err_%d.log"
%
os
.
getpid
())
tr0_pipe
=
open
(
path0
,
"w"
)
tr1_pipe
=
open
(
path1
,
"w"
)
#print(tr0_cmd)
tr0_proc
=
subprocess
.
Popen
(
tr0_cmd
.
strip
().
split
(),
stdout
=
subprocess
.
PIPE
,
...
...
@@ -177,9 +187,9 @@ class TestDistBase(unittest.TestCase):
# close trainer file
tr0_pipe
.
close
()
tr1_pipe
.
close
()
with
open
(
"/tmp/tr0_err_%d.log"
%
os
.
getpid
()
,
"r"
)
as
f
:
with
open
(
path0
,
"r"
)
as
f
:
sys
.
stderr
.
write
(
'trainer 0 stderr file: %s
\n
'
%
f
.
read
())
with
open
(
"/tmp/tr1_err_%d.log"
%
os
.
getpid
()
,
"r"
)
as
f
:
with
open
(
path1
,
"r"
)
as
f
:
sys
.
stderr
.
write
(
'trainer 1 stderr file: %s
\n
'
%
f
.
read
())
return
pickle
.
loads
(
tr0_out
),
pickle
.
loads
(
tr1_out
),
tr0_proc
.
pid
,
tr1_proc
.
pid
...
...
python/paddle/fluid/tests/unittests/test_collective_base.py
浏览文件 @
a5ccc713
...
...
@@ -23,6 +23,7 @@ import subprocess
import
traceback
import
functools
import
pickle
import
tempfile
from
contextlib
import
closing
import
paddle.fluid
as
fluid
import
paddle.fluid.unique_name
as
nameGen
...
...
@@ -145,6 +146,11 @@ class TestDistBase(unittest.TestCase):
self
.
_find_free_port
(),
self
.
_find_free_port
())
self
.
_python_interp
=
sys
.
executable
self
.
temp_dir
=
tempfile
.
TemporaryDirectory
()
def
tearDown
(
self
):
self
.
temp_dir
.
cleanup
()
def
_find_free_port
(
self
):
def
__free_port
():
with
closing
(
socket
.
socket
(
socket
.
AF_INET
,
...
...
@@ -183,9 +189,11 @@ class TestDistBase(unittest.TestCase):
tr_cmd
=
"%s %s"
tr0_cmd
=
tr_cmd
%
(
self
.
_python_interp
,
model_file
)
tr1_cmd
=
tr_cmd
%
(
self
.
_python_interp
,
model_file
)
tr0_pipe
=
open
(
"/tmp/tr0_err.log"
,
"wb"
)
tr1_pipe
=
open
(
"/tmp/tr1_err.log"
,
"wb"
)
#print(tr0_cmd)
path0
=
os
.
path
.
join
(
self
.
temp_dir
.
name
,
"/tmp/tr0_err.log"
)
path1
=
os
.
path
.
join
(
self
.
temp_dir
.
name
,
"/tmp/tr1_err.log"
)
tr0_pipe
=
open
(
path0
,
"wb"
)
tr1_pipe
=
open
(
path1
,
"wb"
)
#print(tr0_cmd)
tr0_proc
=
subprocess
.
Popen
(
tr0_cmd
.
strip
().
split
(),
stdout
=
subprocess
.
PIPE
,
...
...
python/paddle/fluid/tests/unittests/test_communicator_ps_gpu.py
浏览文件 @
a5ccc713
...
...
@@ -19,6 +19,7 @@ import unittest
import
time
import
threading
import
numpy
import
tempfile
import
paddle
paddle
.
enable_static
()
...
...
@@ -30,7 +31,9 @@ import paddle.distributed.fleet as fleet
class
TestCommunicator
(
unittest
.
TestCase
):
def
test_communicator_ps_gpu
(
self
):
with
open
(
"test_communicator_ps_gpu.txt"
,
"w"
)
as
f
:
temp_dir
=
tempfile
.
TemporaryDirectory
()
path
=
os
.
path
.
join
(
temp_dir
.
name
,
"test_communicator_ps_gpu.txt"
)
with
open
(
path
,
"w"
)
as
f
:
data
=
"1 0.6 1 0.7
\n
"
f
.
write
(
data
)
...
...
@@ -90,7 +93,7 @@ class TestCommunicator(unittest.TestCase):
self
.
assertTrue
(
False
)
time
.
sleep
(
10
)
fleet
.
stop_worker
()
os
.
remove
(
"./test_communicator_ps_gpu.txt"
)
temp_dir
.
cleanup
(
)
if
__name__
==
'__main__'
:
...
...
python/paddle/fluid/tests/unittests/test_dist_base.py
浏览文件 @
a5ccc713
...
...
@@ -13,7 +13,7 @@
# limitations under the License.
from
__future__
import
print_function
import
t
im
e
import
t
empfil
e
import
ast
import
unittest
...
...
@@ -867,6 +867,11 @@ class TestDistBase(unittest.TestCase):
self
.
_after_setup_config
()
self
.
temp_dir
=
tempfile
.
TemporaryDirectory
()
def
tearDown
(
self
):
self
.
temp_dir
.
cleanup
()
def
_find_free_port
(
self
):
def
__free_port
():
with
closing
(
socket
.
socket
(
socket
.
AF_INET
,
...
...
@@ -909,8 +914,10 @@ class TestDistBase(unittest.TestCase):
print
(
ps0_cmd
)
print
(
ps1_cmd
)
ps0_pipe
=
open
(
log_name
+
"_ps0_err.log"
,
"wb"
)
ps1_pipe
=
open
(
log_name
+
"_ps1_err.log"
,
"wb"
)
path0
=
os
.
path
.
join
(
self
.
temp_dir
.
name
,
log_name
+
"_ps0_err.log"
)
path1
=
os
.
path
.
join
(
self
.
temp_dir
.
name
,
log_name
+
"_ps1_err.log"
)
ps0_pipe
=
open
(
path0
,
"wb"
)
ps1_pipe
=
open
(
path1
,
"wb"
)
print_to_err
(
type
(
self
).
__name__
,
"going to start pserver process 0"
)
ps0_proc
=
subprocess
.
Popen
(
...
...
@@ -990,7 +997,8 @@ class TestDistBase(unittest.TestCase):
print
(
"local_cmd: {}, env: {}"
.
format
(
cmd
,
env_local
))
if
check_error_log
:
err_log
=
open
(
log_name
+
"_local.log"
,
"wb"
)
path
=
os
.
path
.
join
(
self
.
temp_dir
.
name
,
log_name
+
"_local.log"
)
err_log
=
open
(
path
,
"wb"
)
local_proc
=
subprocess
.
Popen
(
cmd
.
split
(
" "
),
stdout
=
subprocess
.
PIPE
,
...
...
@@ -1076,8 +1084,11 @@ class TestDistBase(unittest.TestCase):
print
(
"tr0_cmd: {}, env: {}"
.
format
(
tr0_cmd
,
env0
))
print
(
"tr1_cmd: {}, env: {}"
.
format
(
tr1_cmd
,
env1
))
tr0_pipe
=
open
(
log_name
+
"_tr0_err.log"
,
"wb"
)
tr1_pipe
=
open
(
log_name
+
"_tr1_err.log"
,
"wb"
)
path0
=
os
.
path
.
join
(
self
.
temp_dir
.
name
,
log_name
+
"_tr0_err.log"
)
path1
=
os
.
path
.
join
(
self
.
temp_dir
.
name
,
log_name
+
"_tr1_err.log"
)
tr0_pipe
=
open
(
path0
,
"wb"
)
tr1_pipe
=
open
(
path1
,
"wb"
)
print_to_err
(
type
(
self
).
__name__
,
"going to start trainer process 0"
)
tr0_proc
=
subprocess
.
Popen
(
...
...
@@ -1293,7 +1304,9 @@ class TestDistBase(unittest.TestCase):
print
(
"use_hallreduce:{} tr_cmd:{}, env: {}"
.
format
(
self
.
_use_hallreduce
,
tr_cmd
,
tr_env
))
tr_pipe
=
open
(
log_name
+
"_tr{}_err.log"
.
format
(
i
),
"wb"
)
path
=
os
.
path
.
join
(
self
.
temp_dir
.
name
,
log_name
+
"_tr{}_err.log"
.
format
(
i
))
tr_pipe
=
open
(
path
,
"wb"
)
print_to_err
(
type
(
self
).
__name__
,
...
...
@@ -1355,7 +1368,9 @@ class TestDistBase(unittest.TestCase):
print
(
"use_hallreduce:{} tr_cmd:{}, env: {}"
.
format
(
self
.
_use_hallreduce
,
tr_cmd
,
tr_env
))
tr_pipe
=
open
(
log_name
+
"_tr{}_err.log"
.
format
(
i
),
"wb"
)
path
=
os
.
path
.
join
(
self
.
temp_dir
.
name
,
log_name
+
"_tr{}_err.log"
.
format
(
i
))
tr_pipe
=
open
(
path
,
"wb"
)
print_to_err
(
type
(
self
).
__name__
,
...
...
@@ -1401,7 +1416,8 @@ class TestDistBase(unittest.TestCase):
tr_env
[
'FLAGS_cudnn_deterministic'
]
=
'0'
print
(
"tr_cmd:{}, env: {}"
.
format
(
tr_cmd
,
tr_env
))
tr_pipe
=
open
(
"/tmp/"
+
"tr{}_err.log"
.
format
(
i
),
"wb"
)
path
=
os
.
path
.
join
(
self
.
temp_dir
.
name
+
"tr{}_err.log"
.
format
(
i
))
tr_pipe
=
open
(
path
,
"wb"
)
print_to_err
(
type
(
self
).
__name__
,
...
...
python/paddle/fluid/tests/unittests/test_dist_mnist_dgc_nccl.py
浏览文件 @
a5ccc713
...
...
@@ -57,13 +57,15 @@ class TestDistMnistNCCL2DGC(TestDistBase):
def
tearDown
(
self
):
import
paddle.fluid
as
fluid
if
fluid
.
core
.
is_compiled_with_cuda
():
result
=
count_of_sparse_all_reduce_calls
(
'test_dist_mnist_dgc_nccl_tr0_err.log'
)
log_file
=
os
.
path
.
join
(
self
.
temp_dir
.
name
,
'test_dist_mnist_dgc_nccl_tr0_err.log'
)
result
=
count_of_sparse_all_reduce_calls
(
log_file
)
# only 1 layer use dgc now, run_step=5, rampup_begin_step=2, so 1 * (5 - 2) = 3
# temp close this test. In python3 CI, the log is right, but the result
# has a problem, may be in multi process mode, log is not written in time.
# self.assertEqual(result, 3)
self
.
temp_dir
.
cleanup
()
class
TestDistMnistNCCL2DGCMultiCards
(
TestDistBase
):
...
...
@@ -86,10 +88,13 @@ class TestDistMnistNCCL2DGCMultiCards(TestDistBase):
def
tearDown
(
self
):
import
paddle.fluid
as
fluid
if
fluid
.
core
.
is_compiled_with_cuda
():
result
=
count_of_sparse_all_reduce_calls
(
log_file
=
os
.
path
.
join
(
self
.
temp_dir
.
name
,
'test_dist_mnist_dgc_nccl_dgc_2cards_local.log'
)
result
=
count_of_sparse_all_reduce_calls
(
log_file
)
# same as above, but use two cards
self
.
assertEqual
(
result
,
6
)
self
.
temp_dir
.
cleanup
()
if
__name__
==
"__main__"
:
...
...
python/paddle/fluid/tests/unittests/test_dist_tree_index.py
浏览文件 @
a5ccc713
...
...
@@ -12,6 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import
os
import
tempfile
import
unittest
from
paddle.dataset.common
import
download
,
DATA_HOME
from
paddle.distributed.fleet.dataset
import
TreeIndex
...
...
@@ -102,6 +104,12 @@ class TestTreeIndex(unittest.TestCase):
class
TestIndexSampler
(
unittest
.
TestCase
):
def
setUp
(
self
):
self
.
temp_dir
=
tempfile
.
TemporaryDirectory
()
def
tearDown
(
self
):
self
.
temp_dir
.
cleanup
()
def
test_layerwise_sampler
(
self
):
path
=
download
(
"https://paddlerec.bj.bcebos.com/tree-based/data/mini_tree.pb"
,
...
...
@@ -109,7 +117,8 @@ class TestIndexSampler(unittest.TestCase):
tdm_layer_counts
=
[
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
]
#tree = TreeIndex("demo", path)
file_name
=
"test_in_memory_dataset_tdm_sample_run.txt"
file_name
=
os
.
path
.
join
(
self
.
temp_dir
.
name
,
"test_in_memory_dataset_tdm_sample_run.txt"
)
with
open
(
file_name
,
"w"
)
as
f
:
#data = "29 d 29 d 29 29 29 29 29 29 29 29 29 29 29 29\n"
data
=
"1 1 1 15 15 15
\n
"
...
...
python/paddle/fluid/tests/unittests/test_distributed_fused_lamb_op_with_clip.py
浏览文件 @
a5ccc713
...
...
@@ -18,6 +18,7 @@ import sys
import
shutil
import
unittest
import
paddle
import
tempfile
def
get_test_file
():
...
...
@@ -35,12 +36,13 @@ def remove_file_if_exists(file_name):
def
run_test
(
clip_after_allreduce
=
True
,
max_global_norm
=-
1.0
):
temp_dir
=
tempfile
.
TemporaryDirectory
()
if
not
paddle
.
is_compiled_with_cuda
():
return
if
os
.
name
==
'nt'
:
return
args
=
locals
()
log_dir
=
'log_{}'
.
format
(
os
.
getpid
(
))
log_dir
=
os
.
path
.
join
(
temp_dir
.
name
,
'log_{}'
.
format
(
os
.
getpid
()
))
cmd
=
[
sys
.
executable
,
'-u'
,
...
...
@@ -57,15 +59,15 @@ def run_test(clip_after_allreduce=True, max_global_norm=-1.0):
os
.
environ
[
'MAX_GLOBAL_NORM'
]
=
str
(
max_global_norm
)
touch_file_env
=
'SUCCESS_TOUCH_FILE'
touch_file_name
=
'distributed_fused_lamb_touch_file_{}'
.
format
(
os
.
getpid
())
touch_file_name
=
os
.
path
.
join
(
temp_dir
.
name
,
'distributed_fused_lamb_touch_file_{}'
.
format
(
os
.
getpid
()))
os
.
environ
[
touch_file_env
]
=
touch_file_name
remove_file_if_exists
(
touch_file_name
)
try
:
assert
os
.
system
(
cmd
)
==
0
and
os
.
path
.
exists
(
touch_file_name
),
'Test failed when {}'
.
format
(
args
)
finally
:
remove_file_if_exists
(
touch_file_name
)
remove_file_if_exists
(
log_dir
)
temp_dir
.
cleanup
()
class
TestDistributedFusedLambWithClip
(
unittest
.
TestCase
):
...
...
python/paddle/fluid/tests/unittests/test_fleet_base_2.py
浏览文件 @
a5ccc713
...
...
@@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import
tempfile
import
unittest
import
paddle
paddle
.
enable_static
()
...
...
@@ -69,14 +70,17 @@ class TestFleetBase(unittest.TestCase):
compiled_prog
=
fluid
.
compiler
.
CompiledProgram
(
fluid
.
default_main_program
())
temp_dir
=
tempfile
.
TemporaryDirectory
()
fleet
.
init_worker
()
fleet
.
fleet
.
save
(
dirname
=
"/tmp"
,
feed
=
[
'x'
,
'y'
],
fetch
=
[
avg_cost
])
fleet
.
fleet
.
save
(
dirname
=
"/tmp"
,
feed
=
[
input_x
,
input_y
],
fetch
=
[
avg_cost
])
fleet
.
fleet
.
save
(
dirname
=
"/tmp"
)
dirname
=
temp_dir
.
name
,
feed
=
[
'x'
,
'y'
],
fetch
=
[
avg_cost
])
fleet
.
fleet
.
save
(
dirname
=
temp_dir
.
name
,
feed
=
[
input_x
,
input_y
],
fetch
=
[
avg_cost
])
fleet
.
fleet
.
save
(
dirname
=
temp_dir
.
name
)
fleet
.
load_model
(
path
=
"/tmp"
,
mode
=
0
)
fleet
.
load_model
(
path
=
"/tmp"
,
mode
=
1
)
fleet
.
load_model
(
path
=
temp_dir
.
name
,
mode
=
0
)
fleet
.
load_model
(
path
=
temp_dir
.
name
,
mode
=
1
)
temp_dir
.
cleanup
()
if
__name__
==
"__main__"
:
...
...
python/paddle/fluid/tests/unittests/test_fleet_elastic_collective.py
浏览文件 @
a5ccc713
...
...
@@ -33,12 +33,15 @@ print("test")
class
TestCollectiveLauncher
(
unittest
.
TestCase
):
def
setUp
(
self
):
file_dir
=
os
.
path
.
dirname
(
os
.
path
.
abspath
(
__file__
)
)
self
.
code_path
=
os
.
path
.
join
(
file_dir
,
"fake_python_for_elastic.py"
)
self
.
temp_dir
=
tempfile
.
TemporaryDirectory
(
)
self
.
code_path
=
os
.
path
.
join
(
self
.
temp_dir
.
name
,
"fake_python_for_elastic.py"
)
with
open
(
self
.
code_path
,
"w"
)
as
f
:
f
.
write
(
fake_python_code
)
def
tearDown
(
self
):
self
.
temp_dir
.
cleanup
()
def
test_launch
(
self
):
class
Argument
:
elastic_server
=
"127.0.0.1:2379"
...
...
@@ -56,7 +59,7 @@ class TestCollectiveLauncher(unittest.TestCase):
run_mode
=
"cpuonly"
servers
=
None
rank_mapping_path
=
None
training_script
=
"fake_python_for_elastic.py"
training_script
=
self
.
code_path
training_script_args
=
[
"--use_amp false"
]
log_dir
=
None
...
...
@@ -94,7 +97,7 @@ class TestCollectiveLauncher(unittest.TestCase):
run_mode
=
"cpuonly"
servers
=
None
rank_mapping_path
=
None
training_script
=
"fake_python_for_elastic.py"
training_script
=
self
.
code_path
training_script_args
=
[
"--use_amp false"
]
log_dir
=
None
...
...
python/paddle/fluid/tests/unittests/test_fleet_exe_dist_model_run.py
浏览文件 @
a5ccc713
...
...
@@ -16,17 +16,25 @@ import unittest
import
paddle
import
numpy
as
np
import
os
import
tempfile
from
paddle.fluid
import
core
paddle
.
enable_static
()
class
TestDistModelRun
(
unittest
.
TestCase
):
def
setUp
(
self
):
self
.
temp_dir
=
tempfile
.
TemporaryDirectory
()
def
tearDown
(
self
):
# step 6: clean up the env, delete the saved model and params
print
(
'cleaned up the env'
)
self
.
temp_dir
.
cleanup
()
def
test_dist_model_run
(
self
):
# step 0: declare folder to save the model and params
folder
=
'./dist_model_run_test/'
file
=
'inf'
path_prefix
=
folder
+
file
path_prefix
=
os
.
path
.
join
(
self
.
temp_dir
.
name
,
"dist_model_run_test/inf"
)
# step 1: saving the inference model and params
x
=
paddle
.
static
.
data
(
name
=
'x'
,
shape
=
[
28
,
28
],
dtype
=
'float32'
)
...
...
@@ -75,12 +83,6 @@ class TestDistModelRun(unittest.TestCase):
# step 5: compare two results
self
.
assertTrue
(
np
.
allclose
(
dist_model_rst
,
load_inference_model_rst
))
# step 6: clean up the env, delete the saved model and params
os
.
remove
(
path_prefix
+
'.pdiparams'
)
os
.
remove
(
path_prefix
+
'.pdmodel'
)
os
.
rmdir
(
folder
)
print
(
'cleaned up the env'
)
if
__name__
==
'__main__'
:
unittest
.
main
()
python/paddle/fluid/tests/unittests/test_fleet_rolemaker_2.py
浏览文件 @
a5ccc713
...
...
@@ -17,6 +17,7 @@ from __future__ import print_function
import
paddle
import
os
import
unittest
import
tempfile
import
paddle.fluid.incubate.fleet.base.role_maker
as
role_maker
...
...
@@ -28,7 +29,10 @@ class TestCloudRoleMaker2(unittest.TestCase):
def
setUp
(
self
):
"""Set up, set envs."""
pass
self
.
temp_dir
=
tempfile
.
TemporaryDirectory
()
def
tearDown
(
self
):
self
.
temp_dir
.
cleanup
()
def
test_pslib_2
(
self
):
"""Test cases for pslib."""
...
...
@@ -37,6 +41,8 @@ class TestCloudRoleMaker2(unittest.TestCase):
from
paddle.fluid.incubate.fleet.base.role_maker
import
GeneralRoleMaker
from
paddle.fluid.incubate.fleet.base.role_maker
import
RoleMakerBase
paddle
.
enable_static
()
os
.
environ
[
"POD_IP"
]
=
"127.0.0.1"
os
.
environ
[
"PADDLE_PORT"
]
=
"36001"
os
.
environ
[
"TRAINING_ROLE"
]
=
"TRAINER"
...
...
@@ -155,17 +161,19 @@ class TestCloudRoleMaker2(unittest.TestCase):
role23
=
GeneralRoleMaker
(
path
=
"./test_gloo_23"
)
role23
.
_get_size
()
role23
.
_get_size
()
with
open
(
"test_fleet_gloo_role_maker_1.txt"
,
"w"
)
as
f
:
path
=
os
.
path
.
join
(
self
.
temp_dir
.
name
,
"test_fleet_gloo_role_maker_1.txt"
)
with
open
(
path
,
"w"
)
as
f
:
data
=
"1 1 1 1
\n
"
f
.
write
(
data
)
dataset
=
paddle
.
distributed
.
InMemoryDataset
()
dataset
.
set_filelist
([
"test_fleet_gloo_role_maker_1.txt"
])
dataset
.
set_filelist
([
path
])
dataset
.
_set_use_var
([
show
,
label
])
dataset
.
load_into_memory
()
dataset
.
get_memory_data_size
(
fleet
)
dataset
.
get_shuffle_data_size
(
fleet
)
os
.
remove
(
"./test_fleet_gloo_role_maker_1.txt"
)
class
TmpClass
():
"""
...
...
python/paddle/fluid/tests/unittests/test_monitor.py
浏览文件 @
a5ccc713
...
...
@@ -24,6 +24,7 @@ import paddle.fluid.core as core
import
numpy
as
np
import
os
import
unittest
import
tempfile
class
TestDatasetWithStat
(
unittest
.
TestCase
):
...
...
@@ -35,12 +36,15 @@ class TestDatasetWithStat(unittest.TestCase):
self
.
drop_last
=
False
def
test_dataset_run_with_stat
(
self
):
with
open
(
"test_in_memory_dataset_run_a.txt"
,
"w"
)
as
f
:
temp_dir
=
tempfile
.
TemporaryDirectory
()
path_a
=
os
.
path
.
join
(
temp_dir
.
name
,
"test_in_memory_dataset_run_a.txt"
)
path_b
=
os
.
path
.
join
(
temp_dir
.
name
,
"test_in_memory_dataset_run_b.txt"
)
with
open
(
path_a
,
"w"
)
as
f
:
data
=
"1 1 2 3 3 4 5 5 5 5 1 1
\n
"
data
+=
"1 2 2 3 4 4 6 6 6 6 1 2
\n
"
data
+=
"1 3 2 3 5 4 7 7 7 7 1 3
\n
"
f
.
write
(
data
)
with
open
(
"test_in_memory_dataset_run_b.txt"
,
"w"
)
as
f
:
with
open
(
path_b
,
"w"
)
as
f
:
data
=
"1 4 2 3 3 4 5 5 5 5 1 4
\n
"
data
+=
"1 5 2 3 4 4 6 6 6 6 1 5
\n
"
data
+=
"1 6 2 3 5 4 7 7 7 7 1 6
\n
"
...
...
@@ -62,10 +66,7 @@ class TestDatasetWithStat(unittest.TestCase):
dataset
=
paddle
.
distributed
.
InMemoryDataset
()
dataset
.
_set_batch_size
(
32
)
dataset
.
_set_thread
(
3
)
dataset
.
set_filelist
([
"test_in_memory_dataset_run_a.txt"
,
"test_in_memory_dataset_run_b.txt"
])
dataset
.
set_filelist
([
path_a
,
path_b
])
dataset
.
_set_pipe_command
(
"cat"
)
dataset
.
_set_use_var
(
slots_vars
)
dataset
.
load_into_memory
()
...
...
@@ -99,8 +100,7 @@ class TestDatasetWithStat(unittest.TestCase):
# total 56 keys
print
(
int_stat
[
"STAT_total_feasign_num_in_mem"
])
os
.
remove
(
"./test_in_memory_dataset_run_a.txt"
)
os
.
remove
(
"./test_in_memory_dataset_run_b.txt"
)
temp_dir
.
cleanup
()
if
__name__
==
'__main__'
:
...
...
python/paddle/fluid/tests/unittests/test_run.py
浏览文件 @
a5ccc713
...
...
@@ -17,6 +17,7 @@ import subprocess
import
sys
,
os
import
json
import
shutil
import
tempfile
import
random
...
...
@@ -57,13 +58,18 @@ def get_files(pth, prefix):
class
Collective_Test
(
unittest
.
TestCase
):
def
setUp
(
self
):
write_file
(
pyname
,
colpyfile
)
self
.
temp_dir
=
tempfile
.
TemporaryDirectory
()
self
.
path
=
os
.
path
.
join
(
self
.
temp_dir
.
name
,
pyname
)
write_file
(
self
.
path
,
colpyfile
)
def
tearDown
(
self
):
self
.
temp_dir
.
cleanup
()
def
pdrun
(
self
,
args
,
env
=
None
):
cmd
=
[
sys
.
executable
.
split
(
'/'
)[
-
1
],
"-m"
,
"paddle.distributed.launch"
]
if
args
:
cmd
.
extend
(
args
.
split
(
" "
))
cmd
.
extend
([
pyname
])
cmd
.
extend
([
self
.
path
])
env
=
os
.
environ
.
copy
()
# virtual devies for testing
env
.
update
({
'CUDA_VISIBLE_DEVICES'
:
'0,1,2,3,4,5,6,7'
})
...
...
@@ -71,30 +77,30 @@ class Collective_Test(unittest.TestCase):
return
proc
def
test_collective_1
(
self
):
args
=
"--job_id test1"
log_dir
=
tempfile
.
TemporaryDirectory
()
args
=
"--job_id test1 --log_dir {}"
.
format
(
log_dir
.
name
)
p
=
self
.
pdrun
(
args
)
p
.
wait
()
self
.
assertTrue
(
p
.
poll
()
==
0
)
log_dir
.
cleanup
()
def
test_collective_2
(
self
):
if
os
.
path
.
exists
(
'./log'
):
shutil
.
rmtree
(
'./log'
)
args
=
"--job_id test2 --devices 0,1,2"
log_dir
=
tempfile
.
TemporaryDirectory
()
args
=
"--job_id test2 --devices 0,1,2 --log_dir {}"
.
format
(
log_dir
.
name
)
p
=
self
.
pdrun
(
args
)
p
.
wait
()
self
.
assertTrue
(
p
.
poll
()
==
0
)
c
=
get_files
(
'log'
,
'test2'
)
c
=
get_files
(
log_dir
.
name
,
'test2'
)
self
.
assertTrue
(
len
(
c
)
==
4
)
log_dir
.
cleanup
()
def
test_collective_3
(
self
):
if
os
.
path
.
exists
(
'./log'
):
shutil
.
rmtree
(
'./log'
)
log_dir
=
tempfile
.
TemporaryDirectory
()
port
=
random
.
randrange
(
6000
,
8000
)
args
=
"--job_id test3 --devices 0,1 --
master 127.0.0.1:{} --np
2"
.
format
(
port
)
args
=
"--job_id test3 --devices 0,1 --
log_dir {} --master 127.0.0.1:{} --nnodes
2"
.
format
(
log_dir
.
name
,
port
)
p1
=
self
.
pdrun
(
args
)
p2
=
self
.
pdrun
(
args
)
p1
.
wait
()
...
...
@@ -102,47 +108,53 @@ class Collective_Test(unittest.TestCase):
self
.
assertTrue
(
p1
.
poll
()
==
0
)
self
.
assertTrue
(
p2
.
poll
()
==
0
)
c
=
get_files
(
'log'
,
'test3'
)
c
=
get_files
(
log_dir
.
name
,
'test3'
)
self
.
assertTrue
(
len
(
c
)
==
6
)
log_dir
.
cleanup
()
class
PS_Test
(
unittest
.
TestCase
):
def
setUp
(
self
):
write_file
(
pyname
,
pspyfile
)
self
.
temp_dir
=
tempfile
.
TemporaryDirectory
()
self
.
path
=
os
.
path
.
join
(
self
.
temp_dir
.
name
,
pyname
)
write_file
(
self
.
path
,
pspyfile
)
def
tearDown
(
self
):
self
.
temp_dir
.
cleanup
()
def
pdrun
(
self
,
args
,
env
=
None
):
cmd
=
[
sys
.
executable
.
split
(
'/'
)[
-
1
],
"-m"
,
"paddle.distributed.launch"
]
if
args
:
cmd
.
extend
(
args
.
split
(
" "
))
cmd
.
extend
([
pyname
])
cmd
.
extend
([
self
.
path
])
proc
=
subprocess
.
Popen
(
cmd
,
env
)
return
proc
def
test_ps_1
(
self
):
args
=
"--run_mode ps"
log_dir
=
tempfile
.
TemporaryDirectory
()
args
=
"--run_mode ps --log_dir {}"
.
format
(
log_dir
.
name
)
p
=
self
.
pdrun
(
args
)
p
.
wait
()
self
.
assertTrue
(
p
.
poll
()
==
0
)
log_dir
.
cleanup
()
def
test_ps_2
(
self
):
if
os
.
path
.
exists
(
'./log'
):
shutil
.
rmtree
(
'./log'
)
args
=
"--job_id ps2 --server_num=2 --trainer_num=2"
log_dir
=
tempfile
.
TemporaryDirectory
()
args
=
"--job_id ps2 --server_num=2 --trainer_num=2 --log_dir {}"
.
format
(
log_dir
.
name
)
p
=
self
.
pdrun
(
args
)
p
.
wait
()
self
.
assertTrue
(
p
.
poll
()
==
0
)
c
=
get_files
(
'log'
,
'ps2'
)
c
=
get_files
(
log_dir
.
name
,
'ps2'
)
self
.
assertTrue
(
len
(
c
)
==
5
)
log_dir
.
cleanup
()
def
test_ps_3
(
self
):
if
os
.
path
.
exists
(
'./log'
):
shutil
.
rmtree
(
'./log'
)
log_dir
=
tempfile
.
TemporaryDirectory
()
port
=
random
.
randrange
(
6000
,
8000
)
args
=
"--job_id ps3 --
master 127.0.0.1:{} --np
2 --server_num=1 --trainer_num=1"
.
format
(
port
)
args
=
"--job_id ps3 --
log_dir {} --master 127.0.0.1:{} --nnodes
2 --server_num=1 --trainer_num=1"
.
format
(
log_dir
.
name
,
port
)
p1
=
self
.
pdrun
(
args
)
p2
=
self
.
pdrun
(
args
)
p1
.
wait
()
...
...
@@ -150,20 +162,21 @@ class PS_Test(unittest.TestCase):
self
.
assertTrue
(
p1
.
poll
()
==
0
)
self
.
assertTrue
(
p2
.
poll
()
==
0
)
c
=
get_files
(
'log'
,
'ps3'
)
c
=
get_files
(
log_dir
.
name
,
'ps3'
)
self
.
assertTrue
(
len
(
c
)
==
6
)
log_dir
.
cleanup
()
def
test_ps_4
(
self
):
if
os
.
path
.
exists
(
'./log'
):
shutil
.
rmtree
(
'./log'
)
args
=
"--job_id ps4 --servers 127.0.0.1:8900,127.0.0.1:8901 --trainers 127.0.0.1:8902,127.0.0.1:8903"
log_dir
=
tempfile
.
TemporaryDirectory
()
args
=
"--job_id ps4 --log_dir {} --servers 127.0.0.1:8900,127.0.0.1:8901 --trainers 127.0.0.1:8902,127.0.0.1:8903"
.
format
(
log_dir
.
name
)
p1
=
self
.
pdrun
(
args
)
p1
.
wait
()
self
.
assertTrue
(
p1
.
poll
()
==
0
)
c
=
get_files
(
'log'
,
'ps4'
)
c
=
get_files
(
log_dir
.
name
,
'ps4'
)
self
.
assertTrue
(
len
(
c
)
==
5
)
log_dir
.
cleanup
()
if
__name__
==
'__main__'
:
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录