From a5ccc7135eaa9abb74f075f81bd17612d057b55c Mon Sep 17 00:00:00 2001 From: zhaoyingli <86812880+zhaoyinglia@users.noreply.github.com> Date: Mon, 20 Jun 2022 11:08:45 +0800 Subject: [PATCH] place all save/load path into temporary directory (#43652) --- .../unittests/test_collective_api_base.py | 20 +++-- .../tests/unittests/test_collective_base.py | 14 +++- .../unittests/test_communicator_ps_gpu.py | 7 +- .../fluid/tests/unittests/test_dist_base.py | 34 +++++--- .../unittests/test_dist_mnist_dgc_nccl.py | 11 ++- .../tests/unittests/test_dist_tree_index.py | 11 ++- ...est_distributed_fused_lamb_op_with_clip.py | 12 +-- .../tests/unittests/test_fleet_base_2.py | 14 ++-- .../test_fleet_elastic_collective.py | 13 +-- .../test_fleet_exe_dist_model_run.py | 20 ++--- .../tests/unittests/test_fleet_rolemaker_2.py | 16 +++- .../fluid/tests/unittests/test_monitor.py | 16 ++-- .../paddle/fluid/tests/unittests/test_run.py | 79 +++++++++++-------- 13 files changed, 175 insertions(+), 92 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_collective_api_base.py b/python/paddle/fluid/tests/unittests/test_collective_api_base.py index 00294bf6071..821702a30fe 100644 --- a/python/paddle/fluid/tests/unittests/test_collective_api_base.py +++ b/python/paddle/fluid/tests/unittests/test_collective_api_base.py @@ -23,6 +23,7 @@ import subprocess import traceback import functools import pickle +import tempfile from contextlib import closing import paddle import paddle.fluid as fluid @@ -97,6 +98,11 @@ class TestDistBase(unittest.TestCase): self._find_free_port(), self._find_free_port()) self._python_interp = sys.executable + self.temp_dir = tempfile.TemporaryDirectory() + + def tearDown(self): + self.temp_dir.cleanup() + def _find_free_port(self): def __free_port(): with closing(socket.socket(socket.AF_INET, @@ -155,9 +161,13 @@ class TestDistBase(unittest.TestCase): tr_cmd = "%s %s" tr0_cmd = tr_cmd % (self._python_interp, model_file) tr1_cmd = tr_cmd % (self._python_interp, model_file) - tr0_pipe = open("/tmp/tr0_err_%d.log" % os.getpid(), "w") - tr1_pipe = open("/tmp/tr1_err_%d.log" % os.getpid(), "w") - #print(tr0_cmd) + path0 = os.path.join(self.temp_dir.name, + "/tmp/tr0_err_%d.log" % os.getpid()) + path1 = os.path.join(self.temp_dir.name, + "/tmp/tr1_err_%d.log" % os.getpid()) + tr0_pipe = open(path0, "w") + tr1_pipe = open(path1, "w") + #print(tr0_cmd) tr0_proc = subprocess.Popen( tr0_cmd.strip().split(), stdout=subprocess.PIPE, @@ -177,9 +187,9 @@ class TestDistBase(unittest.TestCase): # close trainer file tr0_pipe.close() tr1_pipe.close() - with open("/tmp/tr0_err_%d.log" % os.getpid(), "r") as f: + with open(path0, "r") as f: sys.stderr.write('trainer 0 stderr file: %s\n' % f.read()) - with open("/tmp/tr1_err_%d.log" % os.getpid(), "r") as f: + with open(path1, "r") as f: sys.stderr.write('trainer 1 stderr file: %s\n' % f.read()) return pickle.loads(tr0_out), pickle.loads( tr1_out), tr0_proc.pid, tr1_proc.pid diff --git a/python/paddle/fluid/tests/unittests/test_collective_base.py b/python/paddle/fluid/tests/unittests/test_collective_base.py index 1b55395ede5..cc8a697f747 100644 --- a/python/paddle/fluid/tests/unittests/test_collective_base.py +++ b/python/paddle/fluid/tests/unittests/test_collective_base.py @@ -23,6 +23,7 @@ import subprocess import traceback import functools import pickle +import tempfile from contextlib import closing import paddle.fluid as fluid import paddle.fluid.unique_name as nameGen @@ -145,6 +146,11 @@ class TestDistBase(unittest.TestCase): self._find_free_port(), self._find_free_port()) self._python_interp = sys.executable + self.temp_dir = tempfile.TemporaryDirectory() + + def tearDown(self): + self.temp_dir.cleanup() + def _find_free_port(self): def __free_port(): with closing(socket.socket(socket.AF_INET, @@ -183,9 +189,11 @@ class TestDistBase(unittest.TestCase): tr_cmd = "%s %s" tr0_cmd = tr_cmd % (self._python_interp, model_file) tr1_cmd = tr_cmd % (self._python_interp, model_file) - tr0_pipe = open("/tmp/tr0_err.log", "wb") - tr1_pipe = open("/tmp/tr1_err.log", "wb") - #print(tr0_cmd) + path0 = os.path.join(self.temp_dir.name, "/tmp/tr0_err.log") + path1 = os.path.join(self.temp_dir.name, "/tmp/tr1_err.log") + tr0_pipe = open(path0, "wb") + tr1_pipe = open(path1, "wb") + #print(tr0_cmd) tr0_proc = subprocess.Popen( tr0_cmd.strip().split(), stdout=subprocess.PIPE, diff --git a/python/paddle/fluid/tests/unittests/test_communicator_ps_gpu.py b/python/paddle/fluid/tests/unittests/test_communicator_ps_gpu.py index 7b0c28e64bc..e23eba13bf2 100644 --- a/python/paddle/fluid/tests/unittests/test_communicator_ps_gpu.py +++ b/python/paddle/fluid/tests/unittests/test_communicator_ps_gpu.py @@ -19,6 +19,7 @@ import unittest import time import threading import numpy +import tempfile import paddle paddle.enable_static() @@ -30,7 +31,9 @@ import paddle.distributed.fleet as fleet class TestCommunicator(unittest.TestCase): def test_communicator_ps_gpu(self): - with open("test_communicator_ps_gpu.txt", "w") as f: + temp_dir = tempfile.TemporaryDirectory() + path = os.path.join(temp_dir.name, "test_communicator_ps_gpu.txt") + with open(path, "w") as f: data = "1 0.6 1 0.7\n" f.write(data) @@ -90,7 +93,7 @@ class TestCommunicator(unittest.TestCase): self.assertTrue(False) time.sleep(10) fleet.stop_worker() - os.remove("./test_communicator_ps_gpu.txt") + temp_dir.cleanup() if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py index 11972059c83..131b47bc57a 100755 --- a/python/paddle/fluid/tests/unittests/test_dist_base.py +++ b/python/paddle/fluid/tests/unittests/test_dist_base.py @@ -13,7 +13,7 @@ # limitations under the License. from __future__ import print_function -import time +import tempfile import ast import unittest @@ -867,6 +867,11 @@ class TestDistBase(unittest.TestCase): self._after_setup_config() + self.temp_dir = tempfile.TemporaryDirectory() + + def tearDown(self): + self.temp_dir.cleanup() + def _find_free_port(self): def __free_port(): with closing(socket.socket(socket.AF_INET, @@ -909,8 +914,10 @@ class TestDistBase(unittest.TestCase): print(ps0_cmd) print(ps1_cmd) - ps0_pipe = open(log_name + "_ps0_err.log", "wb") - ps1_pipe = open(log_name + "_ps1_err.log", "wb") + path0 = os.path.join(self.temp_dir.name, log_name + "_ps0_err.log") + path1 = os.path.join(self.temp_dir.name, log_name + "_ps1_err.log") + ps0_pipe = open(path0, "wb") + ps1_pipe = open(path1, "wb") print_to_err(type(self).__name__, "going to start pserver process 0") ps0_proc = subprocess.Popen( @@ -990,7 +997,8 @@ class TestDistBase(unittest.TestCase): print("local_cmd: {}, env: {}".format(cmd, env_local)) if check_error_log: - err_log = open(log_name + "_local.log", "wb") + path = os.path.join(self.temp_dir.name, log_name + "_local.log") + err_log = open(path, "wb") local_proc = subprocess.Popen( cmd.split(" "), stdout=subprocess.PIPE, @@ -1076,8 +1084,11 @@ class TestDistBase(unittest.TestCase): print("tr0_cmd: {}, env: {}".format(tr0_cmd, env0)) print("tr1_cmd: {}, env: {}".format(tr1_cmd, env1)) - tr0_pipe = open(log_name + "_tr0_err.log", "wb") - tr1_pipe = open(log_name + "_tr1_err.log", "wb") + + path0 = os.path.join(self.temp_dir.name, log_name + "_tr0_err.log") + path1 = os.path.join(self.temp_dir.name, log_name + "_tr1_err.log") + tr0_pipe = open(path0, "wb") + tr1_pipe = open(path1, "wb") print_to_err(type(self).__name__, "going to start trainer process 0") tr0_proc = subprocess.Popen( @@ -1293,7 +1304,9 @@ class TestDistBase(unittest.TestCase): print("use_hallreduce:{} tr_cmd:{}, env: {}".format( self._use_hallreduce, tr_cmd, tr_env)) - tr_pipe = open(log_name + "_tr{}_err.log".format(i), "wb") + path = os.path.join(self.temp_dir.name, + log_name + "_tr{}_err.log".format(i)) + tr_pipe = open(path, "wb") print_to_err( type(self).__name__, @@ -1355,7 +1368,9 @@ class TestDistBase(unittest.TestCase): print("use_hallreduce:{} tr_cmd:{}, env: {}".format( self._use_hallreduce, tr_cmd, tr_env)) - tr_pipe = open(log_name + "_tr{}_err.log".format(i), "wb") + path = os.path.join(self.temp_dir.name, + log_name + "_tr{}_err.log".format(i)) + tr_pipe = open(path, "wb") print_to_err( type(self).__name__, @@ -1401,7 +1416,8 @@ class TestDistBase(unittest.TestCase): tr_env['FLAGS_cudnn_deterministic'] = '0' print("tr_cmd:{}, env: {}".format(tr_cmd, tr_env)) - tr_pipe = open("/tmp/" + "tr{}_err.log".format(i), "wb") + path = os.path.join(self.temp_dir.name + "tr{}_err.log".format(i)) + tr_pipe = open(path, "wb") print_to_err( type(self).__name__, diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist_dgc_nccl.py b/python/paddle/fluid/tests/unittests/test_dist_mnist_dgc_nccl.py index eae19afb2ef..b4b33c4c773 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_mnist_dgc_nccl.py +++ b/python/paddle/fluid/tests/unittests/test_dist_mnist_dgc_nccl.py @@ -57,13 +57,15 @@ class TestDistMnistNCCL2DGC(TestDistBase): def tearDown(self): import paddle.fluid as fluid if fluid.core.is_compiled_with_cuda(): - result = count_of_sparse_all_reduce_calls( - 'test_dist_mnist_dgc_nccl_tr0_err.log') + log_file = os.path.join(self.temp_dir.name, + 'test_dist_mnist_dgc_nccl_tr0_err.log') + result = count_of_sparse_all_reduce_calls(log_file) # only 1 layer use dgc now, run_step=5, rampup_begin_step=2, so 1 * (5 - 2) = 3 # temp close this test. In python3 CI, the log is right, but the result # has a problem, may be in multi process mode, log is not written in time. # self.assertEqual(result, 3) + self.temp_dir.cleanup() class TestDistMnistNCCL2DGCMultiCards(TestDistBase): @@ -86,10 +88,13 @@ class TestDistMnistNCCL2DGCMultiCards(TestDistBase): def tearDown(self): import paddle.fluid as fluid if fluid.core.is_compiled_with_cuda(): - result = count_of_sparse_all_reduce_calls( + log_file = os.path.join( + self.temp_dir.name, 'test_dist_mnist_dgc_nccl_dgc_2cards_local.log') + result = count_of_sparse_all_reduce_calls(log_file) # same as above, but use two cards self.assertEqual(result, 6) + self.temp_dir.cleanup() if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_dist_tree_index.py b/python/paddle/fluid/tests/unittests/test_dist_tree_index.py index 6ea15319204..337dd434198 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_tree_index.py +++ b/python/paddle/fluid/tests/unittests/test_dist_tree_index.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +import os +import tempfile import unittest from paddle.dataset.common import download, DATA_HOME from paddle.distributed.fleet.dataset import TreeIndex @@ -102,6 +104,12 @@ class TestTreeIndex(unittest.TestCase): class TestIndexSampler(unittest.TestCase): + def setUp(self): + self.temp_dir = tempfile.TemporaryDirectory() + + def tearDown(self): + self.temp_dir.cleanup() + def test_layerwise_sampler(self): path = download( "https://paddlerec.bj.bcebos.com/tree-based/data/mini_tree.pb", @@ -109,7 +117,8 @@ class TestIndexSampler(unittest.TestCase): tdm_layer_counts = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] #tree = TreeIndex("demo", path) - file_name = "test_in_memory_dataset_tdm_sample_run.txt" + file_name = os.path.join(self.temp_dir.name, + "test_in_memory_dataset_tdm_sample_run.txt") with open(file_name, "w") as f: #data = "29 d 29 d 29 29 29 29 29 29 29 29 29 29 29 29\n" data = "1 1 1 15 15 15\n" diff --git a/python/paddle/fluid/tests/unittests/test_distributed_fused_lamb_op_with_clip.py b/python/paddle/fluid/tests/unittests/test_distributed_fused_lamb_op_with_clip.py index af99529adfa..5507cffb9f0 100644 --- a/python/paddle/fluid/tests/unittests/test_distributed_fused_lamb_op_with_clip.py +++ b/python/paddle/fluid/tests/unittests/test_distributed_fused_lamb_op_with_clip.py @@ -18,6 +18,7 @@ import sys import shutil import unittest import paddle +import tempfile def get_test_file(): @@ -35,12 +36,13 @@ def remove_file_if_exists(file_name): def run_test(clip_after_allreduce=True, max_global_norm=-1.0): + temp_dir = tempfile.TemporaryDirectory() if not paddle.is_compiled_with_cuda(): return if os.name == 'nt': return args = locals() - log_dir = 'log_{}'.format(os.getpid()) + log_dir = os.path.join(temp_dir.name, 'log_{}'.format(os.getpid())) cmd = [ sys.executable, '-u', @@ -57,15 +59,15 @@ def run_test(clip_after_allreduce=True, max_global_norm=-1.0): os.environ['MAX_GLOBAL_NORM'] = str(max_global_norm) touch_file_env = 'SUCCESS_TOUCH_FILE' - touch_file_name = 'distributed_fused_lamb_touch_file_{}'.format(os.getpid()) + touch_file_name = os.path.join( + temp_dir.name, + 'distributed_fused_lamb_touch_file_{}'.format(os.getpid())) os.environ[touch_file_env] = touch_file_name - remove_file_if_exists(touch_file_name) try: assert os.system(cmd) == 0 and os.path.exists( touch_file_name), 'Test failed when {}'.format(args) finally: - remove_file_if_exists(touch_file_name) - remove_file_if_exists(log_dir) + temp_dir.cleanup() class TestDistributedFusedLambWithClip(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/test_fleet_base_2.py b/python/paddle/fluid/tests/unittests/test_fleet_base_2.py index 9675a77d676..2d5fa764004 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_base_2.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_base_2.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import tempfile import unittest import paddle paddle.enable_static() @@ -69,14 +70,17 @@ class TestFleetBase(unittest.TestCase): compiled_prog = fluid.compiler.CompiledProgram( fluid.default_main_program()) + temp_dir = tempfile.TemporaryDirectory() fleet.init_worker() - fleet.fleet.save(dirname="/tmp", feed=['x', 'y'], fetch=[avg_cost]) fleet.fleet.save( - dirname="/tmp", feed=[input_x, input_y], fetch=[avg_cost]) - fleet.fleet.save(dirname="/tmp") + dirname=temp_dir.name, feed=['x', 'y'], fetch=[avg_cost]) + fleet.fleet.save( + dirname=temp_dir.name, feed=[input_x, input_y], fetch=[avg_cost]) + fleet.fleet.save(dirname=temp_dir.name) - fleet.load_model(path="/tmp", mode=0) - fleet.load_model(path="/tmp", mode=1) + fleet.load_model(path=temp_dir.name, mode=0) + fleet.load_model(path=temp_dir.name, mode=1) + temp_dir.cleanup() if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_fleet_elastic_collective.py b/python/paddle/fluid/tests/unittests/test_fleet_elastic_collective.py index 2d2f019c5ed..391ac869513 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_elastic_collective.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_elastic_collective.py @@ -33,12 +33,15 @@ print("test") class TestCollectiveLauncher(unittest.TestCase): def setUp(self): - file_dir = os.path.dirname(os.path.abspath(__file__)) - - self.code_path = os.path.join(file_dir, "fake_python_for_elastic.py") + self.temp_dir = tempfile.TemporaryDirectory() + self.code_path = os.path.join(self.temp_dir.name, + "fake_python_for_elastic.py") with open(self.code_path, "w") as f: f.write(fake_python_code) + def tearDown(self): + self.temp_dir.cleanup() + def test_launch(self): class Argument: elastic_server = "127.0.0.1:2379" @@ -56,7 +59,7 @@ class TestCollectiveLauncher(unittest.TestCase): run_mode = "cpuonly" servers = None rank_mapping_path = None - training_script = "fake_python_for_elastic.py" + training_script = self.code_path training_script_args = ["--use_amp false"] log_dir = None @@ -94,7 +97,7 @@ class TestCollectiveLauncher(unittest.TestCase): run_mode = "cpuonly" servers = None rank_mapping_path = None - training_script = "fake_python_for_elastic.py" + training_script = self.code_path training_script_args = ["--use_amp false"] log_dir = None diff --git a/python/paddle/fluid/tests/unittests/test_fleet_exe_dist_model_run.py b/python/paddle/fluid/tests/unittests/test_fleet_exe_dist_model_run.py index 544fe4dd43e..71a3c31bc10 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_exe_dist_model_run.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_exe_dist_model_run.py @@ -16,17 +16,25 @@ import unittest import paddle import numpy as np import os +import tempfile from paddle.fluid import core paddle.enable_static() class TestDistModelRun(unittest.TestCase): + def setUp(self): + self.temp_dir = tempfile.TemporaryDirectory() + + def tearDown(self): + # step 6: clean up the env, delete the saved model and params + print('cleaned up the env') + self.temp_dir.cleanup() + def test_dist_model_run(self): # step 0: declare folder to save the model and params - folder = './dist_model_run_test/' - file = 'inf' - path_prefix = folder + file + path_prefix = os.path.join(self.temp_dir.name, + "dist_model_run_test/inf") # step 1: saving the inference model and params x = paddle.static.data(name='x', shape=[28, 28], dtype='float32') @@ -75,12 +83,6 @@ class TestDistModelRun(unittest.TestCase): # step 5: compare two results self.assertTrue(np.allclose(dist_model_rst, load_inference_model_rst)) - # step 6: clean up the env, delete the saved model and params - os.remove(path_prefix + '.pdiparams') - os.remove(path_prefix + '.pdmodel') - os.rmdir(folder) - print('cleaned up the env') - if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_2.py b/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_2.py index cabd07a399a..179fd20c1e7 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_2.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_2.py @@ -17,6 +17,7 @@ from __future__ import print_function import paddle import os import unittest +import tempfile import paddle.fluid.incubate.fleet.base.role_maker as role_maker @@ -28,7 +29,10 @@ class TestCloudRoleMaker2(unittest.TestCase): def setUp(self): """Set up, set envs.""" - pass + self.temp_dir = tempfile.TemporaryDirectory() + + def tearDown(self): + self.temp_dir.cleanup() def test_pslib_2(self): """Test cases for pslib.""" @@ -37,6 +41,8 @@ class TestCloudRoleMaker2(unittest.TestCase): from paddle.fluid.incubate.fleet.base.role_maker import GeneralRoleMaker from paddle.fluid.incubate.fleet.base.role_maker import RoleMakerBase + paddle.enable_static() + os.environ["POD_IP"] = "127.0.0.1" os.environ["PADDLE_PORT"] = "36001" os.environ["TRAINING_ROLE"] = "TRAINER" @@ -155,17 +161,19 @@ class TestCloudRoleMaker2(unittest.TestCase): role23 = GeneralRoleMaker(path="./test_gloo_23") role23._get_size() role23._get_size() - with open("test_fleet_gloo_role_maker_1.txt", "w") as f: + + path = os.path.join(self.temp_dir.name, + "test_fleet_gloo_role_maker_1.txt") + with open(path, "w") as f: data = "1 1 1 1\n" f.write(data) dataset = paddle.distributed.InMemoryDataset() - dataset.set_filelist(["test_fleet_gloo_role_maker_1.txt"]) + dataset.set_filelist([path]) dataset._set_use_var([show, label]) dataset.load_into_memory() dataset.get_memory_data_size(fleet) dataset.get_shuffle_data_size(fleet) - os.remove("./test_fleet_gloo_role_maker_1.txt") class TmpClass(): """ diff --git a/python/paddle/fluid/tests/unittests/test_monitor.py b/python/paddle/fluid/tests/unittests/test_monitor.py index bea2f6c8b38..f8bcccc911f 100644 --- a/python/paddle/fluid/tests/unittests/test_monitor.py +++ b/python/paddle/fluid/tests/unittests/test_monitor.py @@ -24,6 +24,7 @@ import paddle.fluid.core as core import numpy as np import os import unittest +import tempfile class TestDatasetWithStat(unittest.TestCase): @@ -35,12 +36,15 @@ class TestDatasetWithStat(unittest.TestCase): self.drop_last = False def test_dataset_run_with_stat(self): - with open("test_in_memory_dataset_run_a.txt", "w") as f: + temp_dir = tempfile.TemporaryDirectory() + path_a = os.path.join(temp_dir.name, "test_in_memory_dataset_run_a.txt") + path_b = os.path.join(temp_dir.name, "test_in_memory_dataset_run_b.txt") + with open(path_a, "w") as f: data = "1 1 2 3 3 4 5 5 5 5 1 1\n" data += "1 2 2 3 4 4 6 6 6 6 1 2\n" data += "1 3 2 3 5 4 7 7 7 7 1 3\n" f.write(data) - with open("test_in_memory_dataset_run_b.txt", "w") as f: + with open(path_b, "w") as f: data = "1 4 2 3 3 4 5 5 5 5 1 4\n" data += "1 5 2 3 4 4 6 6 6 6 1 5\n" data += "1 6 2 3 5 4 7 7 7 7 1 6\n" @@ -62,10 +66,7 @@ class TestDatasetWithStat(unittest.TestCase): dataset = paddle.distributed.InMemoryDataset() dataset._set_batch_size(32) dataset._set_thread(3) - dataset.set_filelist([ - "test_in_memory_dataset_run_a.txt", - "test_in_memory_dataset_run_b.txt" - ]) + dataset.set_filelist([path_a, path_b]) dataset._set_pipe_command("cat") dataset._set_use_var(slots_vars) dataset.load_into_memory() @@ -99,8 +100,7 @@ class TestDatasetWithStat(unittest.TestCase): # total 56 keys print(int_stat["STAT_total_feasign_num_in_mem"]) - os.remove("./test_in_memory_dataset_run_a.txt") - os.remove("./test_in_memory_dataset_run_b.txt") + temp_dir.cleanup() if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_run.py b/python/paddle/fluid/tests/unittests/test_run.py index 365d3f931c2..386e722f105 100644 --- a/python/paddle/fluid/tests/unittests/test_run.py +++ b/python/paddle/fluid/tests/unittests/test_run.py @@ -17,6 +17,7 @@ import subprocess import sys, os import json import shutil +import tempfile import random @@ -57,13 +58,18 @@ def get_files(pth, prefix): class Collective_Test(unittest.TestCase): def setUp(self): - write_file(pyname, colpyfile) + self.temp_dir = tempfile.TemporaryDirectory() + self.path = os.path.join(self.temp_dir.name, pyname) + write_file(self.path, colpyfile) + + def tearDown(self): + self.temp_dir.cleanup() def pdrun(self, args, env=None): cmd = [sys.executable.split('/')[-1], "-m", "paddle.distributed.launch"] if args: cmd.extend(args.split(" ")) - cmd.extend([pyname]) + cmd.extend([self.path]) env = os.environ.copy() # virtual devies for testing env.update({'CUDA_VISIBLE_DEVICES': '0,1,2,3,4,5,6,7'}) @@ -71,30 +77,30 @@ class Collective_Test(unittest.TestCase): return proc def test_collective_1(self): - args = "--job_id test1" + log_dir = tempfile.TemporaryDirectory() + args = "--job_id test1 --log_dir {}".format(log_dir.name) p = self.pdrun(args) p.wait() self.assertTrue(p.poll() == 0) + log_dir.cleanup() def test_collective_2(self): - if os.path.exists('./log'): - shutil.rmtree('./log') - - args = "--job_id test2 --devices 0,1,2" + log_dir = tempfile.TemporaryDirectory() + args = "--job_id test2 --devices 0,1,2 --log_dir {}".format( + log_dir.name) p = self.pdrun(args) p.wait() self.assertTrue(p.poll() == 0) - c = get_files('log', 'test2') + c = get_files(log_dir.name, 'test2') self.assertTrue(len(c) == 4) + log_dir.cleanup() def test_collective_3(self): - if os.path.exists('./log'): - shutil.rmtree('./log') - + log_dir = tempfile.TemporaryDirectory() port = random.randrange(6000, 8000) - args = "--job_id test3 --devices 0,1 --master 127.0.0.1:{} --np 2".format( - port) + args = "--job_id test3 --devices 0,1 --log_dir {} --master 127.0.0.1:{} --nnodes 2".format( + log_dir.name, port) p1 = self.pdrun(args) p2 = self.pdrun(args) p1.wait() @@ -102,47 +108,53 @@ class Collective_Test(unittest.TestCase): self.assertTrue(p1.poll() == 0) self.assertTrue(p2.poll() == 0) - c = get_files('log', 'test3') + c = get_files(log_dir.name, 'test3') self.assertTrue(len(c) == 6) + log_dir.cleanup() class PS_Test(unittest.TestCase): def setUp(self): - write_file(pyname, pspyfile) + self.temp_dir = tempfile.TemporaryDirectory() + self.path = os.path.join(self.temp_dir.name, pyname) + write_file(self.path, pspyfile) + + def tearDown(self): + self.temp_dir.cleanup() def pdrun(self, args, env=None): cmd = [sys.executable.split('/')[-1], "-m", "paddle.distributed.launch"] if args: cmd.extend(args.split(" ")) - cmd.extend([pyname]) + cmd.extend([self.path]) proc = subprocess.Popen(cmd, env) return proc def test_ps_1(self): - args = "--run_mode ps" + log_dir = tempfile.TemporaryDirectory() + args = "--run_mode ps --log_dir {}".format(log_dir.name) p = self.pdrun(args) p.wait() self.assertTrue(p.poll() == 0) + log_dir.cleanup() def test_ps_2(self): - if os.path.exists('./log'): - shutil.rmtree('./log') - - args = "--job_id ps2 --server_num=2 --trainer_num=2" + log_dir = tempfile.TemporaryDirectory() + args = "--job_id ps2 --server_num=2 --trainer_num=2 --log_dir {}".format( + log_dir.name) p = self.pdrun(args) p.wait() self.assertTrue(p.poll() == 0) - c = get_files('log', 'ps2') + c = get_files(log_dir.name, 'ps2') self.assertTrue(len(c) == 5) + log_dir.cleanup() def test_ps_3(self): - if os.path.exists('./log'): - shutil.rmtree('./log') - + log_dir = tempfile.TemporaryDirectory() port = random.randrange(6000, 8000) - args = "--job_id ps3 --master 127.0.0.1:{} --np 2 --server_num=1 --trainer_num=1".format( - port) + args = "--job_id ps3 --log_dir {} --master 127.0.0.1:{} --nnodes 2 --server_num=1 --trainer_num=1".format( + log_dir.name, port) p1 = self.pdrun(args) p2 = self.pdrun(args) p1.wait() @@ -150,20 +162,21 @@ class PS_Test(unittest.TestCase): self.assertTrue(p1.poll() == 0) self.assertTrue(p2.poll() == 0) - c = get_files('log', 'ps3') + c = get_files(log_dir.name, 'ps3') self.assertTrue(len(c) == 6) + log_dir.cleanup() def test_ps_4(self): - if os.path.exists('./log'): - shutil.rmtree('./log') - - args = "--job_id ps4 --servers 127.0.0.1:8900,127.0.0.1:8901 --trainers 127.0.0.1:8902,127.0.0.1:8903" + log_dir = tempfile.TemporaryDirectory() + args = "--job_id ps4 --log_dir {} --servers 127.0.0.1:8900,127.0.0.1:8901 --trainers 127.0.0.1:8902,127.0.0.1:8903".format( + log_dir.name) p1 = self.pdrun(args) p1.wait() self.assertTrue(p1.poll() == 0) - c = get_files('log', 'ps4') + c = get_files(log_dir.name, 'ps4') self.assertTrue(len(c) == 5) + log_dir.cleanup() if __name__ == '__main__': -- GitLab