Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Crayon鑫
Paddle
提交
a5ccc713
P
Paddle
项目概览
Crayon鑫
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
a5ccc713
编写于
6月 20, 2022
作者:
Z
zhaoyingli
提交者:
GitHub
6月 20, 2022
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
place all save/load path into temporary directory (#43652)
上级
0f16ccf5
变更
13
隐藏空白更改
内联
并排
Showing
13 changed file
with
175 addition
and
92 deletion
+175
-92
python/paddle/fluid/tests/unittests/test_collective_api_base.py
.../paddle/fluid/tests/unittests/test_collective_api_base.py
+15
-5
python/paddle/fluid/tests/unittests/test_collective_base.py
python/paddle/fluid/tests/unittests/test_collective_base.py
+11
-3
python/paddle/fluid/tests/unittests/test_communicator_ps_gpu.py
.../paddle/fluid/tests/unittests/test_communicator_ps_gpu.py
+5
-2
python/paddle/fluid/tests/unittests/test_dist_base.py
python/paddle/fluid/tests/unittests/test_dist_base.py
+25
-9
python/paddle/fluid/tests/unittests/test_dist_mnist_dgc_nccl.py
.../paddle/fluid/tests/unittests/test_dist_mnist_dgc_nccl.py
+8
-3
python/paddle/fluid/tests/unittests/test_dist_tree_index.py
python/paddle/fluid/tests/unittests/test_dist_tree_index.py
+10
-1
python/paddle/fluid/tests/unittests/test_distributed_fused_lamb_op_with_clip.py
...sts/unittests/test_distributed_fused_lamb_op_with_clip.py
+7
-5
python/paddle/fluid/tests/unittests/test_fleet_base_2.py
python/paddle/fluid/tests/unittests/test_fleet_base_2.py
+9
-5
python/paddle/fluid/tests/unittests/test_fleet_elastic_collective.py
...le/fluid/tests/unittests/test_fleet_elastic_collective.py
+8
-5
python/paddle/fluid/tests/unittests/test_fleet_exe_dist_model_run.py
...le/fluid/tests/unittests/test_fleet_exe_dist_model_run.py
+11
-9
python/paddle/fluid/tests/unittests/test_fleet_rolemaker_2.py
...on/paddle/fluid/tests/unittests/test_fleet_rolemaker_2.py
+12
-4
python/paddle/fluid/tests/unittests/test_monitor.py
python/paddle/fluid/tests/unittests/test_monitor.py
+8
-8
python/paddle/fluid/tests/unittests/test_run.py
python/paddle/fluid/tests/unittests/test_run.py
+46
-33
未找到文件。
python/paddle/fluid/tests/unittests/test_collective_api_base.py
浏览文件 @
a5ccc713
...
...
@@ -23,6 +23,7 @@ import subprocess
import
traceback
import
functools
import
pickle
import
tempfile
from
contextlib
import
closing
import
paddle
import
paddle.fluid
as
fluid
...
...
@@ -97,6 +98,11 @@ class TestDistBase(unittest.TestCase):
self
.
_find_free_port
(),
self
.
_find_free_port
())
self
.
_python_interp
=
sys
.
executable
self
.
temp_dir
=
tempfile
.
TemporaryDirectory
()
def
tearDown
(
self
):
self
.
temp_dir
.
cleanup
()
def
_find_free_port
(
self
):
def
__free_port
():
with
closing
(
socket
.
socket
(
socket
.
AF_INET
,
...
...
@@ -155,9 +161,13 @@ class TestDistBase(unittest.TestCase):
tr_cmd
=
"%s %s"
tr0_cmd
=
tr_cmd
%
(
self
.
_python_interp
,
model_file
)
tr1_cmd
=
tr_cmd
%
(
self
.
_python_interp
,
model_file
)
tr0_pipe
=
open
(
"/tmp/tr0_err_%d.log"
%
os
.
getpid
(),
"w"
)
tr1_pipe
=
open
(
"/tmp/tr1_err_%d.log"
%
os
.
getpid
(),
"w"
)
#print(tr0_cmd)
path0
=
os
.
path
.
join
(
self
.
temp_dir
.
name
,
"/tmp/tr0_err_%d.log"
%
os
.
getpid
())
path1
=
os
.
path
.
join
(
self
.
temp_dir
.
name
,
"/tmp/tr1_err_%d.log"
%
os
.
getpid
())
tr0_pipe
=
open
(
path0
,
"w"
)
tr1_pipe
=
open
(
path1
,
"w"
)
#print(tr0_cmd)
tr0_proc
=
subprocess
.
Popen
(
tr0_cmd
.
strip
().
split
(),
stdout
=
subprocess
.
PIPE
,
...
...
@@ -177,9 +187,9 @@ class TestDistBase(unittest.TestCase):
# close trainer file
tr0_pipe
.
close
()
tr1_pipe
.
close
()
with
open
(
"/tmp/tr0_err_%d.log"
%
os
.
getpid
()
,
"r"
)
as
f
:
with
open
(
path0
,
"r"
)
as
f
:
sys
.
stderr
.
write
(
'trainer 0 stderr file: %s
\n
'
%
f
.
read
())
with
open
(
"/tmp/tr1_err_%d.log"
%
os
.
getpid
()
,
"r"
)
as
f
:
with
open
(
path1
,
"r"
)
as
f
:
sys
.
stderr
.
write
(
'trainer 1 stderr file: %s
\n
'
%
f
.
read
())
return
pickle
.
loads
(
tr0_out
),
pickle
.
loads
(
tr1_out
),
tr0_proc
.
pid
,
tr1_proc
.
pid
...
...
python/paddle/fluid/tests/unittests/test_collective_base.py
浏览文件 @
a5ccc713
...
...
@@ -23,6 +23,7 @@ import subprocess
import
traceback
import
functools
import
pickle
import
tempfile
from
contextlib
import
closing
import
paddle.fluid
as
fluid
import
paddle.fluid.unique_name
as
nameGen
...
...
@@ -145,6 +146,11 @@ class TestDistBase(unittest.TestCase):
self
.
_find_free_port
(),
self
.
_find_free_port
())
self
.
_python_interp
=
sys
.
executable
self
.
temp_dir
=
tempfile
.
TemporaryDirectory
()
def
tearDown
(
self
):
self
.
temp_dir
.
cleanup
()
def
_find_free_port
(
self
):
def
__free_port
():
with
closing
(
socket
.
socket
(
socket
.
AF_INET
,
...
...
@@ -183,9 +189,11 @@ class TestDistBase(unittest.TestCase):
tr_cmd
=
"%s %s"
tr0_cmd
=
tr_cmd
%
(
self
.
_python_interp
,
model_file
)
tr1_cmd
=
tr_cmd
%
(
self
.
_python_interp
,
model_file
)
tr0_pipe
=
open
(
"/tmp/tr0_err.log"
,
"wb"
)
tr1_pipe
=
open
(
"/tmp/tr1_err.log"
,
"wb"
)
#print(tr0_cmd)
path0
=
os
.
path
.
join
(
self
.
temp_dir
.
name
,
"/tmp/tr0_err.log"
)
path1
=
os
.
path
.
join
(
self
.
temp_dir
.
name
,
"/tmp/tr1_err.log"
)
tr0_pipe
=
open
(
path0
,
"wb"
)
tr1_pipe
=
open
(
path1
,
"wb"
)
#print(tr0_cmd)
tr0_proc
=
subprocess
.
Popen
(
tr0_cmd
.
strip
().
split
(),
stdout
=
subprocess
.
PIPE
,
...
...
python/paddle/fluid/tests/unittests/test_communicator_ps_gpu.py
浏览文件 @
a5ccc713
...
...
@@ -19,6 +19,7 @@ import unittest
import
time
import
threading
import
numpy
import
tempfile
import
paddle
paddle
.
enable_static
()
...
...
@@ -30,7 +31,9 @@ import paddle.distributed.fleet as fleet
class
TestCommunicator
(
unittest
.
TestCase
):
def
test_communicator_ps_gpu
(
self
):
with
open
(
"test_communicator_ps_gpu.txt"
,
"w"
)
as
f
:
temp_dir
=
tempfile
.
TemporaryDirectory
()
path
=
os
.
path
.
join
(
temp_dir
.
name
,
"test_communicator_ps_gpu.txt"
)
with
open
(
path
,
"w"
)
as
f
:
data
=
"1 0.6 1 0.7
\n
"
f
.
write
(
data
)
...
...
@@ -90,7 +93,7 @@ class TestCommunicator(unittest.TestCase):
self
.
assertTrue
(
False
)
time
.
sleep
(
10
)
fleet
.
stop_worker
()
os
.
remove
(
"./test_communicator_ps_gpu.txt"
)
temp_dir
.
cleanup
(
)
if
__name__
==
'__main__'
:
...
...
python/paddle/fluid/tests/unittests/test_dist_base.py
浏览文件 @
a5ccc713
...
...
@@ -13,7 +13,7 @@
# limitations under the License.
from
__future__
import
print_function
import
t
im
e
import
t
empfil
e
import
ast
import
unittest
...
...
@@ -867,6 +867,11 @@ class TestDistBase(unittest.TestCase):
self
.
_after_setup_config
()
self
.
temp_dir
=
tempfile
.
TemporaryDirectory
()
def
tearDown
(
self
):
self
.
temp_dir
.
cleanup
()
def
_find_free_port
(
self
):
def
__free_port
():
with
closing
(
socket
.
socket
(
socket
.
AF_INET
,
...
...
@@ -909,8 +914,10 @@ class TestDistBase(unittest.TestCase):
print
(
ps0_cmd
)
print
(
ps1_cmd
)
ps0_pipe
=
open
(
log_name
+
"_ps0_err.log"
,
"wb"
)
ps1_pipe
=
open
(
log_name
+
"_ps1_err.log"
,
"wb"
)
path0
=
os
.
path
.
join
(
self
.
temp_dir
.
name
,
log_name
+
"_ps0_err.log"
)
path1
=
os
.
path
.
join
(
self
.
temp_dir
.
name
,
log_name
+
"_ps1_err.log"
)
ps0_pipe
=
open
(
path0
,
"wb"
)
ps1_pipe
=
open
(
path1
,
"wb"
)
print_to_err
(
type
(
self
).
__name__
,
"going to start pserver process 0"
)
ps0_proc
=
subprocess
.
Popen
(
...
...
@@ -990,7 +997,8 @@ class TestDistBase(unittest.TestCase):
print
(
"local_cmd: {}, env: {}"
.
format
(
cmd
,
env_local
))
if
check_error_log
:
err_log
=
open
(
log_name
+
"_local.log"
,
"wb"
)
path
=
os
.
path
.
join
(
self
.
temp_dir
.
name
,
log_name
+
"_local.log"
)
err_log
=
open
(
path
,
"wb"
)
local_proc
=
subprocess
.
Popen
(
cmd
.
split
(
" "
),
stdout
=
subprocess
.
PIPE
,
...
...
@@ -1076,8 +1084,11 @@ class TestDistBase(unittest.TestCase):
print
(
"tr0_cmd: {}, env: {}"
.
format
(
tr0_cmd
,
env0
))
print
(
"tr1_cmd: {}, env: {}"
.
format
(
tr1_cmd
,
env1
))
tr0_pipe
=
open
(
log_name
+
"_tr0_err.log"
,
"wb"
)
tr1_pipe
=
open
(
log_name
+
"_tr1_err.log"
,
"wb"
)
path0
=
os
.
path
.
join
(
self
.
temp_dir
.
name
,
log_name
+
"_tr0_err.log"
)
path1
=
os
.
path
.
join
(
self
.
temp_dir
.
name
,
log_name
+
"_tr1_err.log"
)
tr0_pipe
=
open
(
path0
,
"wb"
)
tr1_pipe
=
open
(
path1
,
"wb"
)
print_to_err
(
type
(
self
).
__name__
,
"going to start trainer process 0"
)
tr0_proc
=
subprocess
.
Popen
(
...
...
@@ -1293,7 +1304,9 @@ class TestDistBase(unittest.TestCase):
print
(
"use_hallreduce:{} tr_cmd:{}, env: {}"
.
format
(
self
.
_use_hallreduce
,
tr_cmd
,
tr_env
))
tr_pipe
=
open
(
log_name
+
"_tr{}_err.log"
.
format
(
i
),
"wb"
)
path
=
os
.
path
.
join
(
self
.
temp_dir
.
name
,
log_name
+
"_tr{}_err.log"
.
format
(
i
))
tr_pipe
=
open
(
path
,
"wb"
)
print_to_err
(
type
(
self
).
__name__
,
...
...
@@ -1355,7 +1368,9 @@ class TestDistBase(unittest.TestCase):
print
(
"use_hallreduce:{} tr_cmd:{}, env: {}"
.
format
(
self
.
_use_hallreduce
,
tr_cmd
,
tr_env
))
tr_pipe
=
open
(
log_name
+
"_tr{}_err.log"
.
format
(
i
),
"wb"
)
path
=
os
.
path
.
join
(
self
.
temp_dir
.
name
,
log_name
+
"_tr{}_err.log"
.
format
(
i
))
tr_pipe
=
open
(
path
,
"wb"
)
print_to_err
(
type
(
self
).
__name__
,
...
...
@@ -1401,7 +1416,8 @@ class TestDistBase(unittest.TestCase):
tr_env
[
'FLAGS_cudnn_deterministic'
]
=
'0'
print
(
"tr_cmd:{}, env: {}"
.
format
(
tr_cmd
,
tr_env
))
tr_pipe
=
open
(
"/tmp/"
+
"tr{}_err.log"
.
format
(
i
),
"wb"
)
path
=
os
.
path
.
join
(
self
.
temp_dir
.
name
+
"tr{}_err.log"
.
format
(
i
))
tr_pipe
=
open
(
path
,
"wb"
)
print_to_err
(
type
(
self
).
__name__
,
...
...
python/paddle/fluid/tests/unittests/test_dist_mnist_dgc_nccl.py
浏览文件 @
a5ccc713
...
...
@@ -57,13 +57,15 @@ class TestDistMnistNCCL2DGC(TestDistBase):
def
tearDown
(
self
):
import
paddle.fluid
as
fluid
if
fluid
.
core
.
is_compiled_with_cuda
():
result
=
count_of_sparse_all_reduce_calls
(
'test_dist_mnist_dgc_nccl_tr0_err.log'
)
log_file
=
os
.
path
.
join
(
self
.
temp_dir
.
name
,
'test_dist_mnist_dgc_nccl_tr0_err.log'
)
result
=
count_of_sparse_all_reduce_calls
(
log_file
)
# only 1 layer use dgc now, run_step=5, rampup_begin_step=2, so 1 * (5 - 2) = 3
# temp close this test. In python3 CI, the log is right, but the result
# has a problem, may be in multi process mode, log is not written in time.
# self.assertEqual(result, 3)
self
.
temp_dir
.
cleanup
()
class
TestDistMnistNCCL2DGCMultiCards
(
TestDistBase
):
...
...
@@ -86,10 +88,13 @@ class TestDistMnistNCCL2DGCMultiCards(TestDistBase):
def
tearDown
(
self
):
import
paddle.fluid
as
fluid
if
fluid
.
core
.
is_compiled_with_cuda
():
result
=
count_of_sparse_all_reduce_calls
(
log_file
=
os
.
path
.
join
(
self
.
temp_dir
.
name
,
'test_dist_mnist_dgc_nccl_dgc_2cards_local.log'
)
result
=
count_of_sparse_all_reduce_calls
(
log_file
)
# same as above, but use two cards
self
.
assertEqual
(
result
,
6
)
self
.
temp_dir
.
cleanup
()
if
__name__
==
"__main__"
:
...
...
python/paddle/fluid/tests/unittests/test_dist_tree_index.py
浏览文件 @
a5ccc713
...
...
@@ -12,6 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import
os
import
tempfile
import
unittest
from
paddle.dataset.common
import
download
,
DATA_HOME
from
paddle.distributed.fleet.dataset
import
TreeIndex
...
...
@@ -102,6 +104,12 @@ class TestTreeIndex(unittest.TestCase):
class
TestIndexSampler
(
unittest
.
TestCase
):
def
setUp
(
self
):
self
.
temp_dir
=
tempfile
.
TemporaryDirectory
()
def
tearDown
(
self
):
self
.
temp_dir
.
cleanup
()
def
test_layerwise_sampler
(
self
):
path
=
download
(
"https://paddlerec.bj.bcebos.com/tree-based/data/mini_tree.pb"
,
...
...
@@ -109,7 +117,8 @@ class TestIndexSampler(unittest.TestCase):
tdm_layer_counts
=
[
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
,
0
]
#tree = TreeIndex("demo", path)
file_name
=
"test_in_memory_dataset_tdm_sample_run.txt"
file_name
=
os
.
path
.
join
(
self
.
temp_dir
.
name
,
"test_in_memory_dataset_tdm_sample_run.txt"
)
with
open
(
file_name
,
"w"
)
as
f
:
#data = "29 d 29 d 29 29 29 29 29 29 29 29 29 29 29 29\n"
data
=
"1 1 1 15 15 15
\n
"
...
...
python/paddle/fluid/tests/unittests/test_distributed_fused_lamb_op_with_clip.py
浏览文件 @
a5ccc713
...
...
@@ -18,6 +18,7 @@ import sys
import
shutil
import
unittest
import
paddle
import
tempfile
def
get_test_file
():
...
...
@@ -35,12 +36,13 @@ def remove_file_if_exists(file_name):
def
run_test
(
clip_after_allreduce
=
True
,
max_global_norm
=-
1.0
):
temp_dir
=
tempfile
.
TemporaryDirectory
()
if
not
paddle
.
is_compiled_with_cuda
():
return
if
os
.
name
==
'nt'
:
return
args
=
locals
()
log_dir
=
'log_{}'
.
format
(
os
.
getpid
(
))
log_dir
=
os
.
path
.
join
(
temp_dir
.
name
,
'log_{}'
.
format
(
os
.
getpid
()
))
cmd
=
[
sys
.
executable
,
'-u'
,
...
...
@@ -57,15 +59,15 @@ def run_test(clip_after_allreduce=True, max_global_norm=-1.0):
os
.
environ
[
'MAX_GLOBAL_NORM'
]
=
str
(
max_global_norm
)
touch_file_env
=
'SUCCESS_TOUCH_FILE'
touch_file_name
=
'distributed_fused_lamb_touch_file_{}'
.
format
(
os
.
getpid
())
touch_file_name
=
os
.
path
.
join
(
temp_dir
.
name
,
'distributed_fused_lamb_touch_file_{}'
.
format
(
os
.
getpid
()))
os
.
environ
[
touch_file_env
]
=
touch_file_name
remove_file_if_exists
(
touch_file_name
)
try
:
assert
os
.
system
(
cmd
)
==
0
and
os
.
path
.
exists
(
touch_file_name
),
'Test failed when {}'
.
format
(
args
)
finally
:
remove_file_if_exists
(
touch_file_name
)
remove_file_if_exists
(
log_dir
)
temp_dir
.
cleanup
()
class
TestDistributedFusedLambWithClip
(
unittest
.
TestCase
):
...
...
python/paddle/fluid/tests/unittests/test_fleet_base_2.py
浏览文件 @
a5ccc713
...
...
@@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import
tempfile
import
unittest
import
paddle
paddle
.
enable_static
()
...
...
@@ -69,14 +70,17 @@ class TestFleetBase(unittest.TestCase):
compiled_prog
=
fluid
.
compiler
.
CompiledProgram
(
fluid
.
default_main_program
())
temp_dir
=
tempfile
.
TemporaryDirectory
()
fleet
.
init_worker
()
fleet
.
fleet
.
save
(
dirname
=
"/tmp"
,
feed
=
[
'x'
,
'y'
],
fetch
=
[
avg_cost
])
fleet
.
fleet
.
save
(
dirname
=
"/tmp"
,
feed
=
[
input_x
,
input_y
],
fetch
=
[
avg_cost
])
fleet
.
fleet
.
save
(
dirname
=
"/tmp"
)
dirname
=
temp_dir
.
name
,
feed
=
[
'x'
,
'y'
],
fetch
=
[
avg_cost
])
fleet
.
fleet
.
save
(
dirname
=
temp_dir
.
name
,
feed
=
[
input_x
,
input_y
],
fetch
=
[
avg_cost
])
fleet
.
fleet
.
save
(
dirname
=
temp_dir
.
name
)
fleet
.
load_model
(
path
=
"/tmp"
,
mode
=
0
)
fleet
.
load_model
(
path
=
"/tmp"
,
mode
=
1
)
fleet
.
load_model
(
path
=
temp_dir
.
name
,
mode
=
0
)
fleet
.
load_model
(
path
=
temp_dir
.
name
,
mode
=
1
)
temp_dir
.
cleanup
()
if
__name__
==
"__main__"
:
...
...
python/paddle/fluid/tests/unittests/test_fleet_elastic_collective.py
浏览文件 @
a5ccc713
...
...
@@ -33,12 +33,15 @@ print("test")
class
TestCollectiveLauncher
(
unittest
.
TestCase
):
def
setUp
(
self
):
file_dir
=
os
.
path
.
dirname
(
os
.
path
.
abspath
(
__file__
)
)
self
.
code_path
=
os
.
path
.
join
(
file_dir
,
"fake_python_for_elastic.py"
)
self
.
temp_dir
=
tempfile
.
TemporaryDirectory
(
)
self
.
code_path
=
os
.
path
.
join
(
self
.
temp_dir
.
name
,
"fake_python_for_elastic.py"
)
with
open
(
self
.
code_path
,
"w"
)
as
f
:
f
.
write
(
fake_python_code
)
def
tearDown
(
self
):
self
.
temp_dir
.
cleanup
()
def
test_launch
(
self
):
class
Argument
:
elastic_server
=
"127.0.0.1:2379"
...
...
@@ -56,7 +59,7 @@ class TestCollectiveLauncher(unittest.TestCase):
run_mode
=
"cpuonly"
servers
=
None
rank_mapping_path
=
None
training_script
=
"fake_python_for_elastic.py"
training_script
=
self
.
code_path
training_script_args
=
[
"--use_amp false"
]
log_dir
=
None
...
...
@@ -94,7 +97,7 @@ class TestCollectiveLauncher(unittest.TestCase):
run_mode
=
"cpuonly"
servers
=
None
rank_mapping_path
=
None
training_script
=
"fake_python_for_elastic.py"
training_script
=
self
.
code_path
training_script_args
=
[
"--use_amp false"
]
log_dir
=
None
...
...
python/paddle/fluid/tests/unittests/test_fleet_exe_dist_model_run.py
浏览文件 @
a5ccc713
...
...
@@ -16,17 +16,25 @@ import unittest
import
paddle
import
numpy
as
np
import
os
import
tempfile
from
paddle.fluid
import
core
paddle
.
enable_static
()
class
TestDistModelRun
(
unittest
.
TestCase
):
def
setUp
(
self
):
self
.
temp_dir
=
tempfile
.
TemporaryDirectory
()
def
tearDown
(
self
):
# step 6: clean up the env, delete the saved model and params
print
(
'cleaned up the env'
)
self
.
temp_dir
.
cleanup
()
def
test_dist_model_run
(
self
):
# step 0: declare folder to save the model and params
folder
=
'./dist_model_run_test/'
file
=
'inf'
path_prefix
=
folder
+
file
path_prefix
=
os
.
path
.
join
(
self
.
temp_dir
.
name
,
"dist_model_run_test/inf"
)
# step 1: saving the inference model and params
x
=
paddle
.
static
.
data
(
name
=
'x'
,
shape
=
[
28
,
28
],
dtype
=
'float32'
)
...
...
@@ -75,12 +83,6 @@ class TestDistModelRun(unittest.TestCase):
# step 5: compare two results
self
.
assertTrue
(
np
.
allclose
(
dist_model_rst
,
load_inference_model_rst
))
# step 6: clean up the env, delete the saved model and params
os
.
remove
(
path_prefix
+
'.pdiparams'
)
os
.
remove
(
path_prefix
+
'.pdmodel'
)
os
.
rmdir
(
folder
)
print
(
'cleaned up the env'
)
if
__name__
==
'__main__'
:
unittest
.
main
()
python/paddle/fluid/tests/unittests/test_fleet_rolemaker_2.py
浏览文件 @
a5ccc713
...
...
@@ -17,6 +17,7 @@ from __future__ import print_function
import
paddle
import
os
import
unittest
import
tempfile
import
paddle.fluid.incubate.fleet.base.role_maker
as
role_maker
...
...
@@ -28,7 +29,10 @@ class TestCloudRoleMaker2(unittest.TestCase):
def
setUp
(
self
):
"""Set up, set envs."""
pass
self
.
temp_dir
=
tempfile
.
TemporaryDirectory
()
def
tearDown
(
self
):
self
.
temp_dir
.
cleanup
()
def
test_pslib_2
(
self
):
"""Test cases for pslib."""
...
...
@@ -37,6 +41,8 @@ class TestCloudRoleMaker2(unittest.TestCase):
from
paddle.fluid.incubate.fleet.base.role_maker
import
GeneralRoleMaker
from
paddle.fluid.incubate.fleet.base.role_maker
import
RoleMakerBase
paddle
.
enable_static
()
os
.
environ
[
"POD_IP"
]
=
"127.0.0.1"
os
.
environ
[
"PADDLE_PORT"
]
=
"36001"
os
.
environ
[
"TRAINING_ROLE"
]
=
"TRAINER"
...
...
@@ -155,17 +161,19 @@ class TestCloudRoleMaker2(unittest.TestCase):
role23
=
GeneralRoleMaker
(
path
=
"./test_gloo_23"
)
role23
.
_get_size
()
role23
.
_get_size
()
with
open
(
"test_fleet_gloo_role_maker_1.txt"
,
"w"
)
as
f
:
path
=
os
.
path
.
join
(
self
.
temp_dir
.
name
,
"test_fleet_gloo_role_maker_1.txt"
)
with
open
(
path
,
"w"
)
as
f
:
data
=
"1 1 1 1
\n
"
f
.
write
(
data
)
dataset
=
paddle
.
distributed
.
InMemoryDataset
()
dataset
.
set_filelist
([
"test_fleet_gloo_role_maker_1.txt"
])
dataset
.
set_filelist
([
path
])
dataset
.
_set_use_var
([
show
,
label
])
dataset
.
load_into_memory
()
dataset
.
get_memory_data_size
(
fleet
)
dataset
.
get_shuffle_data_size
(
fleet
)
os
.
remove
(
"./test_fleet_gloo_role_maker_1.txt"
)
class
TmpClass
():
"""
...
...
python/paddle/fluid/tests/unittests/test_monitor.py
浏览文件 @
a5ccc713
...
...
@@ -24,6 +24,7 @@ import paddle.fluid.core as core
import
numpy
as
np
import
os
import
unittest
import
tempfile
class
TestDatasetWithStat
(
unittest
.
TestCase
):
...
...
@@ -35,12 +36,15 @@ class TestDatasetWithStat(unittest.TestCase):
self
.
drop_last
=
False
def
test_dataset_run_with_stat
(
self
):
with
open
(
"test_in_memory_dataset_run_a.txt"
,
"w"
)
as
f
:
temp_dir
=
tempfile
.
TemporaryDirectory
()
path_a
=
os
.
path
.
join
(
temp_dir
.
name
,
"test_in_memory_dataset_run_a.txt"
)
path_b
=
os
.
path
.
join
(
temp_dir
.
name
,
"test_in_memory_dataset_run_b.txt"
)
with
open
(
path_a
,
"w"
)
as
f
:
data
=
"1 1 2 3 3 4 5 5 5 5 1 1
\n
"
data
+=
"1 2 2 3 4 4 6 6 6 6 1 2
\n
"
data
+=
"1 3 2 3 5 4 7 7 7 7 1 3
\n
"
f
.
write
(
data
)
with
open
(
"test_in_memory_dataset_run_b.txt"
,
"w"
)
as
f
:
with
open
(
path_b
,
"w"
)
as
f
:
data
=
"1 4 2 3 3 4 5 5 5 5 1 4
\n
"
data
+=
"1 5 2 3 4 4 6 6 6 6 1 5
\n
"
data
+=
"1 6 2 3 5 4 7 7 7 7 1 6
\n
"
...
...
@@ -62,10 +66,7 @@ class TestDatasetWithStat(unittest.TestCase):
dataset
=
paddle
.
distributed
.
InMemoryDataset
()
dataset
.
_set_batch_size
(
32
)
dataset
.
_set_thread
(
3
)
dataset
.
set_filelist
([
"test_in_memory_dataset_run_a.txt"
,
"test_in_memory_dataset_run_b.txt"
])
dataset
.
set_filelist
([
path_a
,
path_b
])
dataset
.
_set_pipe_command
(
"cat"
)
dataset
.
_set_use_var
(
slots_vars
)
dataset
.
load_into_memory
()
...
...
@@ -99,8 +100,7 @@ class TestDatasetWithStat(unittest.TestCase):
# total 56 keys
print
(
int_stat
[
"STAT_total_feasign_num_in_mem"
])
os
.
remove
(
"./test_in_memory_dataset_run_a.txt"
)
os
.
remove
(
"./test_in_memory_dataset_run_b.txt"
)
temp_dir
.
cleanup
()
if
__name__
==
'__main__'
:
...
...
python/paddle/fluid/tests/unittests/test_run.py
浏览文件 @
a5ccc713
...
...
@@ -17,6 +17,7 @@ import subprocess
import
sys
,
os
import
json
import
shutil
import
tempfile
import
random
...
...
@@ -57,13 +58,18 @@ def get_files(pth, prefix):
class
Collective_Test
(
unittest
.
TestCase
):
def
setUp
(
self
):
write_file
(
pyname
,
colpyfile
)
self
.
temp_dir
=
tempfile
.
TemporaryDirectory
()
self
.
path
=
os
.
path
.
join
(
self
.
temp_dir
.
name
,
pyname
)
write_file
(
self
.
path
,
colpyfile
)
def
tearDown
(
self
):
self
.
temp_dir
.
cleanup
()
def
pdrun
(
self
,
args
,
env
=
None
):
cmd
=
[
sys
.
executable
.
split
(
'/'
)[
-
1
],
"-m"
,
"paddle.distributed.launch"
]
if
args
:
cmd
.
extend
(
args
.
split
(
" "
))
cmd
.
extend
([
pyname
])
cmd
.
extend
([
self
.
path
])
env
=
os
.
environ
.
copy
()
# virtual devies for testing
env
.
update
({
'CUDA_VISIBLE_DEVICES'
:
'0,1,2,3,4,5,6,7'
})
...
...
@@ -71,30 +77,30 @@ class Collective_Test(unittest.TestCase):
return
proc
def
test_collective_1
(
self
):
args
=
"--job_id test1"
log_dir
=
tempfile
.
TemporaryDirectory
()
args
=
"--job_id test1 --log_dir {}"
.
format
(
log_dir
.
name
)
p
=
self
.
pdrun
(
args
)
p
.
wait
()
self
.
assertTrue
(
p
.
poll
()
==
0
)
log_dir
.
cleanup
()
def
test_collective_2
(
self
):
if
os
.
path
.
exists
(
'./log'
):
shutil
.
rmtree
(
'./log'
)
args
=
"--job_id test2 --devices 0,1,2"
log_dir
=
tempfile
.
TemporaryDirectory
()
args
=
"--job_id test2 --devices 0,1,2 --log_dir {}"
.
format
(
log_dir
.
name
)
p
=
self
.
pdrun
(
args
)
p
.
wait
()
self
.
assertTrue
(
p
.
poll
()
==
0
)
c
=
get_files
(
'log'
,
'test2'
)
c
=
get_files
(
log_dir
.
name
,
'test2'
)
self
.
assertTrue
(
len
(
c
)
==
4
)
log_dir
.
cleanup
()
def
test_collective_3
(
self
):
if
os
.
path
.
exists
(
'./log'
):
shutil
.
rmtree
(
'./log'
)
log_dir
=
tempfile
.
TemporaryDirectory
()
port
=
random
.
randrange
(
6000
,
8000
)
args
=
"--job_id test3 --devices 0,1 --
master 127.0.0.1:{} --np
2"
.
format
(
port
)
args
=
"--job_id test3 --devices 0,1 --
log_dir {} --master 127.0.0.1:{} --nnodes
2"
.
format
(
log_dir
.
name
,
port
)
p1
=
self
.
pdrun
(
args
)
p2
=
self
.
pdrun
(
args
)
p1
.
wait
()
...
...
@@ -102,47 +108,53 @@ class Collective_Test(unittest.TestCase):
self
.
assertTrue
(
p1
.
poll
()
==
0
)
self
.
assertTrue
(
p2
.
poll
()
==
0
)
c
=
get_files
(
'log'
,
'test3'
)
c
=
get_files
(
log_dir
.
name
,
'test3'
)
self
.
assertTrue
(
len
(
c
)
==
6
)
log_dir
.
cleanup
()
class
PS_Test
(
unittest
.
TestCase
):
def
setUp
(
self
):
write_file
(
pyname
,
pspyfile
)
self
.
temp_dir
=
tempfile
.
TemporaryDirectory
()
self
.
path
=
os
.
path
.
join
(
self
.
temp_dir
.
name
,
pyname
)
write_file
(
self
.
path
,
pspyfile
)
def
tearDown
(
self
):
self
.
temp_dir
.
cleanup
()
def
pdrun
(
self
,
args
,
env
=
None
):
cmd
=
[
sys
.
executable
.
split
(
'/'
)[
-
1
],
"-m"
,
"paddle.distributed.launch"
]
if
args
:
cmd
.
extend
(
args
.
split
(
" "
))
cmd
.
extend
([
pyname
])
cmd
.
extend
([
self
.
path
])
proc
=
subprocess
.
Popen
(
cmd
,
env
)
return
proc
def
test_ps_1
(
self
):
args
=
"--run_mode ps"
log_dir
=
tempfile
.
TemporaryDirectory
()
args
=
"--run_mode ps --log_dir {}"
.
format
(
log_dir
.
name
)
p
=
self
.
pdrun
(
args
)
p
.
wait
()
self
.
assertTrue
(
p
.
poll
()
==
0
)
log_dir
.
cleanup
()
def
test_ps_2
(
self
):
if
os
.
path
.
exists
(
'./log'
):
shutil
.
rmtree
(
'./log'
)
args
=
"--job_id ps2 --server_num=2 --trainer_num=2"
log_dir
=
tempfile
.
TemporaryDirectory
()
args
=
"--job_id ps2 --server_num=2 --trainer_num=2 --log_dir {}"
.
format
(
log_dir
.
name
)
p
=
self
.
pdrun
(
args
)
p
.
wait
()
self
.
assertTrue
(
p
.
poll
()
==
0
)
c
=
get_files
(
'log'
,
'ps2'
)
c
=
get_files
(
log_dir
.
name
,
'ps2'
)
self
.
assertTrue
(
len
(
c
)
==
5
)
log_dir
.
cleanup
()
def
test_ps_3
(
self
):
if
os
.
path
.
exists
(
'./log'
):
shutil
.
rmtree
(
'./log'
)
log_dir
=
tempfile
.
TemporaryDirectory
()
port
=
random
.
randrange
(
6000
,
8000
)
args
=
"--job_id ps3 --
master 127.0.0.1:{} --np
2 --server_num=1 --trainer_num=1"
.
format
(
port
)
args
=
"--job_id ps3 --
log_dir {} --master 127.0.0.1:{} --nnodes
2 --server_num=1 --trainer_num=1"
.
format
(
log_dir
.
name
,
port
)
p1
=
self
.
pdrun
(
args
)
p2
=
self
.
pdrun
(
args
)
p1
.
wait
()
...
...
@@ -150,20 +162,21 @@ class PS_Test(unittest.TestCase):
self
.
assertTrue
(
p1
.
poll
()
==
0
)
self
.
assertTrue
(
p2
.
poll
()
==
0
)
c
=
get_files
(
'log'
,
'ps3'
)
c
=
get_files
(
log_dir
.
name
,
'ps3'
)
self
.
assertTrue
(
len
(
c
)
==
6
)
log_dir
.
cleanup
()
def
test_ps_4
(
self
):
if
os
.
path
.
exists
(
'./log'
):
shutil
.
rmtree
(
'./log'
)
args
=
"--job_id ps4 --servers 127.0.0.1:8900,127.0.0.1:8901 --trainers 127.0.0.1:8902,127.0.0.1:8903"
log_dir
=
tempfile
.
TemporaryDirectory
()
args
=
"--job_id ps4 --log_dir {} --servers 127.0.0.1:8900,127.0.0.1:8901 --trainers 127.0.0.1:8902,127.0.0.1:8903"
.
format
(
log_dir
.
name
)
p1
=
self
.
pdrun
(
args
)
p1
.
wait
()
self
.
assertTrue
(
p1
.
poll
()
==
0
)
c
=
get_files
(
'log'
,
'ps4'
)
c
=
get_files
(
log_dir
.
name
,
'ps4'
)
self
.
assertTrue
(
len
(
c
)
==
5
)
log_dir
.
cleanup
()
if
__name__
==
'__main__'
:
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录