diff --git a/python/paddle/fluid/incubate/fleet/utils/fleet_util.py b/python/paddle/fluid/incubate/fleet/utils/fleet_util.py index bca602fb8bf66f57e784afd073e2935fe0c78c01..50fde2c47bffffa01d62d11a502b089b364cf3a5 100644 --- a/python/paddle/fluid/incubate/fleet/utils/fleet_util.py +++ b/python/paddle/fluid/incubate/fleet/utils/fleet_util.py @@ -920,7 +920,7 @@ class FleetUtil(object): feeded_var_names=feeded_var_names, target_vars=target_vars, executor=executor, - main_program=program, + main_program=program.clone(), params_filename="params") else: fluid.io.save_inference_model( @@ -928,7 +928,7 @@ class FleetUtil(object): feeded_var_names=feeded_var_names, target_vars=target_vars, executor=executor, - main_program=program) + main_program=program.clone()) configs = { "fs.default.name": hadoop_fs_name, diff --git a/python/paddle/fluid/incubate/fleet/utils/hdfs.py b/python/paddle/fluid/incubate/fleet/utils/hdfs.py index 7474d418911d905a38ccdf0dc9f9a62aa3be34c3..23a22531a45f11aa61e25304dfe973989aacbcf6 100644 --- a/python/paddle/fluid/incubate/fleet/utils/hdfs.py +++ b/python/paddle/fluid/incubate/fleet/utils/hdfs.py @@ -22,7 +22,7 @@ from datetime import datetime import re import copy import errno - +import time import logging __all__ = ["HDFSClient"] @@ -83,6 +83,7 @@ class HDFSClient(object): ret_code = 0 ret_out = None ret_err = None + retry_sleep_second = 3 whole_commands = " ".join(whole_commands) for x in range(retry_times + 1): proc = subprocess.Popen( @@ -99,6 +100,7 @@ class HDFSClient(object): if ret_code == 0: break + time.sleep(retry_sleep_second) return ret_code, ret_out, ret_err @@ -329,7 +331,7 @@ class HDFSClient(object): ls_commands = ['-ls', hdfs_path] returncode, output, errors = self.__run_hdfs_cmd( - ls_commands, retry_times=1) + ls_commands, retry_times=10) if returncode: _logger.error("HDFS list path: {} failed".format(hdfs_path))