Unverified commit 43622a20, authored by wangzhen38 and committed by GitHub

[RM FLUID] trainer_pass&heter_trainer_pass (#50610)

* [RM FLUID] trainer_pass&heter_trainer_pass
* [RM FLUID] rm distributed_strategy
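The change is a mechanical namespace migration: every import under paddle.fluid.incubate.fleet.parameter_server is rewritten to the corresponding paddle.incubate.fleet.parameter_server path, and the one fluid.framework call is switched to paddle.framework. A minimal before/after sketch of the pattern applied in the hunks below (module names taken directly from this diff):

# Old path, removed by this PR:
# from paddle.fluid.incubate.fleet.parameter_server.ir import trainer_pass as worker
# New path, used throughout the hunks below:
from paddle.incubate.fleet.parameter_server.ir import trainer_pass as worker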
Parent 47306c58
......@@ -96,7 +96,7 @@ class ParameterServerOptimizer(MetaOptimizerBase):
return strategy
def _build_trainer_programs(self, compiled_config):
from paddle.fluid.incubate.fleet.parameter_server.ir import (
from paddle.incubate.fleet.parameter_server.ir import (
trainer_pass as worker,
)
......@@ -106,7 +106,7 @@ class ParameterServerOptimizer(MetaOptimizerBase):
use_ps_gpu = self.user_defined_strategy.a_sync_configs["use_ps_gpu"]
if not compiled_config.is_geo_mode():
from paddle.fluid.incubate.fleet.parameter_server.ir.public import (
from paddle.incubate.fleet.parameter_server.ir.public import (
_add_lr_decay_table_pass,
)
......@@ -150,7 +150,7 @@ class ParameterServerOptimizer(MetaOptimizerBase):
compiled_config.set_origin_ps_startup_program(_startup)
# for heter program
if self.role_maker._is_heter_parameter_server_mode:
from paddle.fluid.incubate.fleet.parameter_server.ir import (
from paddle.incubate.fleet.parameter_server.ir import (
heter_trainer_pass as heter_worker,
)
......@@ -191,13 +191,13 @@ class ParameterServerOptimizer(MetaOptimizerBase):
_main = paddle.static.Program()
_startup = paddle.static.Program()
from paddle.fluid.incubate.fleet.parameter_server.ir import (
from paddle.incubate.fleet.parameter_server.ir import (
pserver_pass as server,
)
if not compiled_config.is_geo_mode():
from paddle.fluid.incubate.fleet.parameter_server.ir.public import (
from paddle.incubate.fleet.parameter_server.ir.public import (
_get_optimize_ops,
)
......@@ -209,7 +209,7 @@ class ParameterServerOptimizer(MetaOptimizerBase):
if len(ops) == 0:
return _main, _startup
from paddle.fluid.incubate.fleet.parameter_server.ir.public import (
from paddle.incubate.fleet.parameter_server.ir.public import (
_add_lr_decay_table_pass,
)
......@@ -299,9 +299,7 @@ class ParameterServerOptimizer(MetaOptimizerBase):
free = get_sys_free_mem()
from paddle.fluid.incubate.fleet.parameter_server.ir import (
vars_metatools,
)
from paddle.incubate.fleet.parameter_server.ir import vars_metatools
processed_var_names = set(["@EMPTY@"])
param_memory_size = 0
......@@ -371,9 +369,7 @@ class ParameterServerOptimizer(MetaOptimizerBase):
_origin_main_program = loss.block.program
_origin_startup_program = startup_program
from paddle.fluid.incubate.fleet.parameter_server.ir import (
public as public,
)
from paddle.incubate.fleet.parameter_server.ir import public as public
compiled_config = public.CompileTimeStrategy(
_origin_main_program,
......@@ -409,14 +405,14 @@ class ParameterServerOptimizer(MetaOptimizerBase):
}
else:
loss.block.program = main_program
fluid.framework.switch_startup_program(startup_program)
paddle.framework.switch_startup_program(startup_program)
elif self.role_maker._is_server():
main_program, startup_program = self._build_pserver_programs(
compiled_config
)
loss.block.program = main_program
fluid.framework.switch_startup_program(startup_program)
paddle.framework.switch_startup_program(startup_program)
return None, None
def _disable_strategy(self, dist_strategy):
......
......@@ -123,7 +123,7 @@ class Hogwild(DeviceWorker):
hogwild.stat_var_names.extend([i])
downpour.stat_var_names.extend([i])
from paddle.fluid.incubate.fleet.parameter_server import version
from paddle.incubate.fleet.parameter_server import version
if (
version.is_transpiler()
......@@ -271,7 +271,7 @@ class DownpourLite(DeviceWorker):
for i in opt_info["stat_var_names"]:
downpour.stat_var_names.extend([i])
from paddle.fluid.incubate.fleet.parameter_server import version
from paddle.incubate.fleet.parameter_server import version
if (
version.is_transpiler()
......
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
__all__ = [
"TrainerRuntimeConfig",
"DistributedStrategy",
"SyncStrategy",
"AsyncStrategy",
"HalfAsyncStrategy",
"GeoStrategy",
"StrategyFactory",
]
import os
import paddle.fluid as fluid
from paddle.fluid.transpiler.distribute_transpiler import (
DistributeTranspilerConfig,
ServerRuntimeConfig,
)
from paddle.fluid.incubate.fleet.parameter_server.mode import DistributedMode
class TrainerRuntimeConfig:
def __init__(self):
self.mode = None
num_threads = os.getenv("CPU_NUM", "1")
self.runtime_configs = {}
self.runtime_configs['communicator_max_merge_var_num'] = os.getenv(
"FLAGS_communicator_max_merge_var_num", num_threads
)
self.runtime_configs['communicator_send_queue_size'] = os.getenv(
"FLAGS_communicator_send_queue_size", num_threads
)
self.runtime_configs[
'communicator_independent_recv_thread'
] = os.getenv("FLAGS_communicator_independent_recv_thread", "1")
self.runtime_configs[
'communicator_min_send_grad_num_before_recv'
] = os.getenv(
"FLAGS_communicator_min_send_grad_num_before_recv", num_threads
)
self.runtime_configs['communicator_thread_pool_size'] = os.getenv(
"FLAGS_communicator_thread_pool_size", "5"
)
self.runtime_configs['communicator_send_wait_times'] = os.getenv(
"FLAGS_communicator_send_wait_times", "5"
)
self.runtime_configs['communicator_is_sgd_optimizer'] = os.getenv(
"FLAGS_communicator_is_sgd_optimizer", "1"
)
# not used
self.runtime_configs['rpc_deadline'] = os.getenv(
"FLAGS_rpc_deadline", "180000"
)
self.runtime_configs['rpc_retry_times'] = os.getenv(
"FLAGS_rpc_retry_times", "3"
)
def get_communicator_flags(self):
need_keys = []
num_threads = os.getenv("CPU_NUM", "1")
mode_str = ""
if self.mode is None or self.mode == DistributedMode.ASYNC:
need_keys = self.runtime_configs.keys()
mode_str = "async"
elif (
self.mode == DistributedMode.SYNC
or self.mode == DistributedMode.HALF_ASYNC
):
mode_str = "sync or half_async"
need_keys = [
'communicator_max_merge_var_num',
'communicator_send_wait_times',
'communicator_thread_pool_size',
'communicator_send_queue_size',
]
elif self.mode == DistributedMode.GEO:
mode_str = "GEO"
need_keys = [
'communicator_thread_pool_size',
'communicator_send_wait_times',
'communicator_max_merge_var_num',
'communicator_send_queue_size',
]
else:
raise ValueError("Unsupported Mode")
if (
self.mode == DistributedMode.SYNC
or self.mode == DistributedMode.HALF_ASYNC
):
max_merge_var_num = self.runtime_configs[
'communicator_max_merge_var_num'
]
send_queue_size = self.runtime_configs[
'communicator_send_queue_size'
]
if max_merge_var_num != num_threads:
print(
'WARNING: In {} mode, communicator_max_merge_var_num '
'must be equal to CPU_NUM. But received, '
'communicator_max_merge_var_num = {}, CPU_NUM = '
'{}. communicator_max_merge_var_num will be forced to {}.'.format(
mode_str, max_merge_var_num, num_threads, num_threads
)
)
self.runtime_configs[
'communicator_max_merge_var_num'
] = num_threads
if send_queue_size != num_threads:
print(
'WARNING: In {} mode, communicator_send_queue_size '
'must be equal to CPU_NUM. But received, '
'communicator_send_queue_size = {}, CPU_NUM = '
'{}. communicator_send_queue_size will be forced to {}.'.format(
mode_str, send_queue_size, num_threads, num_threads
)
)
self.runtime_configs[
'communicator_send_queue_size'
] = num_threads
return dict((key, str(self.runtime_configs[key])) for key in need_keys)
def display(self, configs):
raw0, raw1, length = 45, 5, 50
h_format = "{:^45s}{:<5s}\n"
l_format = "{:<45s}{:<5s}\n"
border = "".join(["="] * length)
line = "".join(["-"] * length)
draws = ""
draws += border + "\n"
draws += h_format.format("TrainerRuntimeConfig Overview", "Value")
draws += line + "\n"
for k, v in configs.items():
draws += l_format.format(k, v)
draws += border
_str = "\n{}\n".format(draws)
return _str
def __repr__(self):
return self.display(self.get_communicator_flags())
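For orientation, a minimal usage sketch of the class above. It assumes the module is importable from the paddle.incubate.fleet.parameter_server.distribute_transpiler path used elsewhere in this diff, and the CPU_NUM / FLAGS values are placeholders chosen only for the example:

import os

# The config reads these environment variables in __init__, so set them first.
os.environ["CPU_NUM"] = "4"
os.environ["FLAGS_communicator_thread_pool_size"] = "10"

from paddle.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import (
    TrainerRuntimeConfig,
)

cfg = TrainerRuntimeConfig()          # mode is None, so flags fall back to async behaviour
flags = cfg.get_communicator_flags()  # dict of str values, one per runtime_configs key
print(cfg)                            # __repr__ renders the bordered overview table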
class PSLibRuntimeConfig:
def __init__(self):
self.runtime_configs = {}
def get_runtime_configs(self):
return self.runtime_configs
class DistributedStrategy:
def __init__(self):
self._program_config = DistributeTranspilerConfig()
self._trainer_runtime_config = TrainerRuntimeConfig()
self._pslib_runtime_config = PSLibRuntimeConfig()
self._server_runtime_config = ServerRuntimeConfig()
num_threads = int(os.getenv("CPU_NUM", "1"))
self._execute_strategy = fluid.ExecutionStrategy()
self._build_strategy = fluid.BuildStrategy()
self._execute_strategy.num_threads = num_threads
if num_threads > 1:
self._build_strategy.reduce_strategy = (
fluid.BuildStrategy.ReduceStrategy.Reduce
)
self.debug_opt = None
self.use_ps_gpu = False
def set_debug_opt(self, opt_info):
self.debug_opt = opt_info
def get_debug_opt(self):
opt_info = dict()
if self.debug_opt is not None and isinstance(self.debug_opt, dict):
opt_info["dump_slot"] = bool(self.debug_opt.get("dump_slot", 0))
opt_info["dump_converter"] = str(
self.debug_opt.get("dump_converter", "")
)
opt_info["dump_fields"] = self.debug_opt.get("dump_fields", [])
opt_info["dump_file_num"] = self.debug_opt.get("dump_file_num", 16)
opt_info["dump_fields_path"] = self.debug_opt.get(
"dump_fields_path", ""
)
opt_info["dump_param"] = self.debug_opt.get("dump_param", [])
return opt_info
def get_program_config(self):
return self._program_config
def set_program_config(self, config):
if isinstance(config, DistributeTranspilerConfig):
self._program_config = config
elif isinstance(config, dict):
for key in config:
if hasattr(self._program_config, key):
setattr(self._program_config, key, config[key])
else:
raise ValueError(
"DistributeTranspilerConfig doesn't have key: {}".format(
key
)
)
else:
raise TypeError(
"program_config only accept input type: dict or DistributeTranspilerConfig"
)
self.check_program_config()
def check_program_config(self):
raise NotImplementedError(
"check_program_config must be implemented by derived class. You should use StrategyFactory to create DistributedStrategy."
)
def get_trainer_runtime_config(self):
return self._trainer_runtime_config
def set_trainer_runtime_config(self, config):
if isinstance(config, TrainerRuntimeConfig):
self._trainer_runtime_config = config
elif isinstance(config, dict):
for key, value in config.items():
if key in self._trainer_runtime_config.runtime_configs:
self._trainer_runtime_config.runtime_configs[key] = value
else:
raise ValueError(
"TrainerRuntimeConfig doesn't have key: {}".format(key)
)
else:
raise TypeError(
"trainer_runtime_config only accept input type: dict or TrainerRuntimeConfig"
)
self.check_trainer_runtime_config()
def check_trainer_runtime_config(self):
raise NotImplementedError(
"check_trainer_runtime_config must be implemented by derived class. You should use StrategyFactory to create DistributedStrategy."
)
def get_pslib_runtime_config(self):
return self._pslib_runtime_config
def set_pslib_runtime_config(self, config):
self._pslib_runtime_config.runtime_configs = config
def get_server_runtime_config(self):
return self._server_runtime_config
def set_server_runtime_config(self, config):
if isinstance(config, ServerRuntimeConfig):
self._server_runtime_config = config
elif isinstance(config, dict):
for key in config:
if hasattr(self._server_runtime_config, key):
setattr(self._server_runtime_config, key, config[key])
else:
raise ValueError(
"ServerRuntimeConfig doesn't have key: {}".format(key)
)
else:
raise TypeError(
"server_runtime_config only accept input type: dict or ServerRuntimeConfig"
)
self.check_server_runtime_config()
def check_server_runtime_config(self):
raise NotImplementedError(
"check_server_runtime_config must be implemented by derived class. You should use StrategyFactory to create DistributedStrategy."
)
def get_execute_strategy(self):
return self._execute_strategy
def set_execute_strategy(self, config):
if isinstance(config, fluid.ExecutionStrategy):
self._execute_strategy = config
elif isinstance(config, dict):
for key in config:
if hasattr(self._execute_strategy, key):
setattr(self._execute_strategy, key, config[key])
else:
raise ValueError(
"ExecutionStrategy doesn't have key: {}".format(key)
)
else:
raise TypeError(
"execute_strategy only accept input type: dict or ExecutionStrategy"
)
self.check_execute_strategy()
def check_execute_strategy(self):
raise NotImplementedError(
"check_execute_strategy must be implemented by derived class. You should use StrategyFactory to create DistributedStrategy."
)
def get_build_strategy(self):
return self._build_strategy
def set_build_strategy(self, config):
if isinstance(config, fluid.BuildStrategy):
self._build_strategy = config
elif isinstance(config, dict):
for key in config:
if hasattr(self._build_strategy, key):
setattr(self._build_strategy, key, config[key])
else:
raise ValueError(
"BuildStrategy doesn't have key: {}".format(key)
)
else:
raise TypeError(
"build_strategy only accept input type: dict or BuildStrategy"
)
self.check_build_strategy()
def check_build_strategy(self):
raise NotImplementedError(
"check_build_strategy must be implemented by derived class. You should use StrategyFactory to create DistributedStrategy."
)
class SyncStrategy(DistributedStrategy):
def __init__(self):
super().__init__()
self.check_program_config()
self.check_trainer_runtime_config()
self.check_server_runtime_config()
self.check_build_strategy()
self.check_execute_strategy()
def check_trainer_runtime_config(self):
self._trainer_runtime_config.mode = DistributedMode.SYNC
def check_program_config(self):
self._program_config.sync_mode = False
self._program_config.runtime_split_send_recv = True
self._program_config.half_async = True
self._program_config.completely_not_async = True
def check_server_runtime_config(self):
pass
def check_execute_strategy(self):
self._execute_strategy.use_thread_barrier = True
def check_build_strategy(self):
self._build_strategy.async_mode = True
class AsyncStrategy(DistributedStrategy):
def __init__(self):
super().__init__()
self.check_program_config()
self.check_trainer_runtime_config()
self.check_server_runtime_config()
self.check_build_strategy()
self.check_execute_strategy()
def check_trainer_runtime_config(self):
self._trainer_runtime_config.mode = DistributedMode.ASYNC
def check_program_config(self):
self._program_config.sync_mode = False
self._program_config.runtime_split_send_recv = True
def check_server_runtime_config(self):
pass
def check_execute_strategy(self):
pass
def check_build_strategy(self):
self._build_strategy.async_mode = True
class HalfAsyncStrategy(DistributedStrategy):
def __init__(self):
super().__init__()
self.check_program_config()
self.check_trainer_runtime_config()
self.check_server_runtime_config()
self.check_build_strategy()
self.check_execute_strategy()
def check_trainer_runtime_config(self):
self._trainer_runtime_config.mode = DistributedMode.HALF_ASYNC
def check_program_config(self):
self._program_config.sync_mode = False
self._program_config.runtime_split_send_recv = True
self._program_config.half_async = True
def check_server_runtime_config(self):
pass
def check_execute_strategy(self):
self._execute_strategy.use_thread_barrier = True
def check_build_strategy(self):
self._build_strategy.async_mode = True
class GeoStrategy(DistributedStrategy):
def __init__(self, update_frequency=100):
super().__init__()
self._program_config.geo_sgd_need_push_nums = update_frequency
self.check_program_config()
self.check_trainer_runtime_config()
self.check_server_runtime_config()
self.check_build_strategy()
self.check_execute_strategy()
def check_program_config(self):
self._program_config.sync_mode = False
self._program_config.runtime_split_send_recv = True
self._program_config.geo_sgd_mode = True
def check_trainer_runtime_config(self):
self._trainer_runtime_config.mode = DistributedMode.GEO
self._trainer_runtime_config.runtime_configs[
'communicator_send_queue_size'
] = self._program_config.geo_sgd_need_push_nums
self._trainer_runtime_config.runtime_configs[
'communicator_max_merge_var_num'
] = self._program_config.geo_sgd_need_push_nums
def check_server_runtime_config(self):
pass
def check_execute_strategy(self):
pass
def check_build_strategy(self):
self._build_strategy.async_mode = True
class StrategyFactory:
def __init__(self):
pass
@staticmethod
def create_sync_strategy():
return SyncStrategy()
@staticmethod
def create_half_async_strategy():
return HalfAsyncStrategy()
@staticmethod
def create_async_strategy():
return AsyncStrategy()
@staticmethod
def create_geo_strategy(update_frequency=100):
return GeoStrategy(update_frequency)
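To make the relationship between the factory and the strategy classes concrete, a short hedged usage sketch; the update_frequency and thread-pool values are arbitrary examples, and the import path is the one used elsewhere in this diff:

from paddle.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import (
    StrategyFactory,
)

strategy = StrategyFactory.create_geo_strategy(update_frequency=400)

# The setters also accept plain dicts; unknown keys raise ValueError.
strategy.set_trainer_runtime_config({"communicator_thread_pool_size": "10"})

print(strategy.get_trainer_runtime_config())  # bordered "TrainerRuntimeConfig Overview" table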
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import warnings
import paddle.framework.core as core
import paddle
from paddle.fluid.incubate.fleet.parameter_server.ir.trainer_pass import (
find_heter_ops,
)
from paddle.fluid.incubate.fleet.parameter_server.ir.trainer_pass import (
union_forward_gradient_op,
)
from paddle.fluid.incubate.fleet.parameter_server.ir.trainer_pass import (
create_heter_program,
)
from paddle.fluid.incubate.fleet.parameter_server.ir.trainer_pass import (
create_trainer_program,
)
from paddle.fluid.incubate.fleet.parameter_server.ir.trainer_pass import (
find_block_joints,
)
from paddle.fluid.incubate.fleet.parameter_server.ir.trainer_pass import (
find_op_input_output,
)
from paddle.fluid.incubate.fleet.parameter_server.ir.trainer_pass import (
get_vars_name_in_block,
)
def split_heter_worker_ops_pass(program, config, stage_id, device):
"""
Split the heter worker program out of the origin program:
1. find the heter ops (those placed on a different device)
2. find the inputs and outputs of every heter block
3. create the heter worker program and add the listen_and_serv op
"""
default_deveice = "cpu"
program, heter_ops, _, program_block_ops = find_heter_ops(
program, default_deveice
)
if len(heter_ops) == 0:
warnings.warn(
"Currently running in Heter Parameter Server mode, but no OP running on heterogeneous devices, Please check your code."
)
return program
program_block_ops = union_forward_gradient_op(program_block_ops)
block_vars_detail = find_block_joints(program, program_block_ops, heter_ops)
heter_program = paddle.static.Program()
create_heter_program(
program,
config,
heter_program,
program_block_ops,
heter_ops,
block_vars_detail,
device,
stage_id,
)
return heter_program
def split_trainer_ops_pass(program, config, default_device="cpu"):
"""
Split the CPU trainer program out of the origin program:
1. find the heter ops (those placed on a different device)
2. find the inputs and outputs of every heter block
3. create the CPU trainer program and add send/recv ops
"""
# TODO(MrChengmo): support a user-defined default_device
default_device_ = default_device
program, heter_ops, default_ops, program_block_ops = find_heter_ops(
program, default_device_
)
program_block_ops = union_forward_gradient_op(program_block_ops)
block_vars_detail = find_block_joints(program, program_block_ops, heter_ops)
trainer_program = program.clone()
create_trainer_program(
trainer_program, program, config, program_block_ops, block_vars_detail
)
return trainer_program
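For context, a hedged sketch of how the two passes above are typically chained in heter parameter-server mode. compiled_config stands in for the public.CompileTimeStrategy object built by the optimizer earlier in this diff, and the device / stage_id values are placeholders:

def build_heter_programs(main_program, compiled_config, device="gpu", stage_id=2):
    # CPU trainer side: strip the heter blocks and insert send/recv ops.
    trainer_program = split_trainer_ops_pass(main_program, compiled_config)
    # Heter worker side: gather the offloaded blocks into their own program
    # and attach the listen_and_serv entry point.
    heter_program = split_heter_worker_ops_pass(
        main_program, compiled_config, stage_id, device
    )
    return trainer_program, heter_program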
......@@ -24,7 +24,7 @@ main_program = default_main_program()
class TestFleetPS(unittest.TestCase):
def test_version(self):
from paddle.fluid.incubate.fleet.parameter_server import version
from paddle.incubate.fleet.parameter_server import version
transpiler = version.is_transpiler()
self.assertEqual(transpiler, True)
......
......@@ -68,6 +68,7 @@ from ..fluid.framework import in_dygraph_mode # noqa: F401
from ..fluid.framework import _global_flags # noqa: F401
from ..fluid.framework import _apply_pass # noqa: F401
from ..fluid.framework import switch_main_program
from ..fluid.framework import switch_startup_program
from ..fluid.framework import _set_expected_place # noqa: F401
from ..fluid.framework import Block, Program # noqa: F401
from ..fluid.framework import IrGraph # noqa: F401
......
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
......@@ -39,12 +39,12 @@ from paddle.fluid.incubate.fleet.base.fleet_base import Fleet
from paddle.fluid.incubate.fleet.base.mode import Mode
from paddle.fluid.incubate.fleet.base.role_maker import MPISymetricRoleMaker
from paddle.fluid.incubate.fleet.parameter_server import version
from paddle.fluid.incubate.fleet.parameter_server.ir.public import (
from paddle.incubate.fleet.parameter_server import version
from paddle.incubate.fleet.parameter_server.ir.public import (
get_sparse_tablenames,
)
from paddle.fluid.incubate.fleet.parameter_server.ir.public import _get_lr_ops
from paddle.fluid.incubate.fleet.parameter_server.ir.public import (
from paddle.incubate.fleet.parameter_server.ir.public import _get_lr_ops
from paddle.incubate.fleet.parameter_server.ir.public import (
_has_global_step,
)
from paddle.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import (
......@@ -60,16 +60,16 @@ from paddle.incubate.fleet.parameter_server.distribute_transpiler.distributed_st
from paddle.distributed.fleet.base.private_helper_function import (
wait_server_ready,
)
from paddle.fluid.incubate.fleet.parameter_server.mode import PSMode
from paddle.incubate.fleet.parameter_server.mode import PSMode
from paddle.fluid.incubate.fleet.base.fleet_base import DistributedOptimizer
from paddle.fluid.incubate.fleet.parameter_server.ir import (
from paddle.incubate.fleet.parameter_server.ir import (
trainer_pass as worker,
)
from paddle.fluid.incubate.fleet.parameter_server.ir import (
from paddle.incubate.fleet.parameter_server.ir import (
pserver_pass as server,
)
from paddle.fluid.incubate.fleet.parameter_server.ir import public as public
from paddle.incubate.fleet.parameter_server.ir import public as public
class FleetTranspiler(Fleet):
......
......@@ -25,11 +25,11 @@ __all__ = [
import os
import paddle.fluid as fluid
from paddle.fluid.incubate.fleet.parameter_server.mode import DistributedMode
from paddle.fluid.transpiler.distribute_transpiler import (
DistributeTranspilerConfig,
ServerRuntimeConfig,
)
from paddle.incubate.fleet.parameter_server.mode import DistributedMode
class TrainerRuntimeConfig:
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
......@@ -15,7 +15,7 @@
import warnings
import paddle
from paddle.fluid.incubate.fleet.parameter_server.ir.trainer_pass import (
from paddle.incubate.fleet.parameter_server.ir.trainer_pass import (
create_heter_program,
create_trainer_program,
find_block_joints,
......
......@@ -26,8 +26,8 @@ from paddle.fluid.incubate.fleet.parameter_server.ir.public import (
_get_optimize_ops,
get_sparse_tablenames,
)
from paddle.fluid.incubate.fleet.parameter_server.mode import DistributedMode
from paddle.framework import core
from paddle.incubate.fleet.parameter_server.mode import DistributedMode
OP_NAME_SCOPE = "op_namescope"
CLIP_OP_NAME_SCOPE = "gradient_clip"
......
......@@ -293,7 +293,7 @@ os.environ['CUDA_CACHE_MAXSIZE'] = '805306368'
write_cuda_env_config_py(filename='@PADDLE_BINARY_DIR@/python/paddle/cuda_env.py')
def write_distributed_training_mode_py(filename='paddle/fluid/incubate/fleet/parameter_server/version.py'):
def write_distributed_training_mode_py(filename='paddle/incubate/fleet/parameter_server/version.py'):
cnt = '''
# THIS FILE IS GENERATED FROM PADDLEPADDLE SETUP.PY
......@@ -320,7 +320,7 @@ def is_transpiler():
'mode': 'PSLIB' if '${WITH_PSLIB}' == 'ON' else 'TRANSPILER'
})
write_distributed_training_mode_py(filename='@PADDLE_BINARY_DIR@/python/paddle/fluid/incubate/fleet/parameter_server/version.py')
write_distributed_training_mode_py(filename='@PADDLE_BINARY_DIR@/python/paddle/incubate/fleet/parameter_server/version.py')
packages=['paddle',
......@@ -405,11 +405,10 @@ packages=['paddle',
'paddle.fluid.incubate.fleet',
'paddle.fluid.incubate.checkpoint',
'paddle.fluid.incubate.fleet.base',
'paddle.fluid.incubate.fleet.parameter_server',
'paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler',
'paddle.fluid.incubate.fleet.parameter_server.ir',
'paddle.fluid.incubate.fleet.collective',
'paddle.fluid.incubate.fleet.utils',
'paddle.fluid.incubate.fleet.parameter_server.ir',
'paddle.fluid.incubate.fleet.parameter_server',
'paddle.amp',
'paddle.cost_model',
'paddle.hapi',
......@@ -437,8 +436,10 @@ packages=['paddle',
'paddle.incubate.distributed.models',
'paddle.incubate.distributed.models.moe',
'paddle.incubate.distributed.models.moe.gate',
'paddle.incubate.fleet.parameter_server',
'paddle.incubate.fleet.parameter_server.distribute_transpiler',
'paddle.incubate.fleet.parameter_server.pslib',
'paddle.incubate.fleet.parameter_server.ir',
'paddle.quantization',
'paddle.quantization.quanters',
'paddle.quantization.observers',
......
......@@ -572,7 +572,7 @@ os.environ['CUDA_CACHE_MAXSIZE'] = '805306368'
def write_parameter_server_version_py(
filename='paddle/fluid/incubate/fleet/parameter_server/version.py',
filename='paddle/incubate/fleet/parameter_server/version.py',
):
cnt = '''
......@@ -1298,11 +1298,10 @@ def get_setup_parameters():
'paddle.fluid.incubate.fleet',
'paddle.fluid.incubate.checkpoint',
'paddle.fluid.incubate.fleet.base',
'paddle.fluid.incubate.fleet.parameter_server',
'paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler',
'paddle.fluid.incubate.fleet.parameter_server.ir',
'paddle.fluid.incubate.fleet.collective',
'paddle.fluid.incubate.fleet.utils',
'paddle.fluid.incubate.fleet.parameter_server',
'paddle.fluid.incubate.fleet.parameter_server.ir',
'paddle.amp',
'paddle.cost_model',
'paddle.hapi',
......@@ -1330,7 +1329,9 @@ def get_setup_parameters():
'paddle.incubate.distributed.models',
'paddle.incubate.distributed.models.moe',
'paddle.incubate.distributed.models.moe.gate',
'paddle.incubate.fleet.parameter_server',
'paddle.incubate.fleet.parameter_server.distribute_transpiler',
'paddle.incubate.fleet.parameter_server.ir',
'paddle.incubate.fleet.parameter_server.pslib',
'paddle.quantization',
'paddle.quantization.quanters',
......@@ -1457,7 +1458,7 @@ def main():
filename='{}/python/paddle/cuda_env.py'.format(paddle_binary_dir)
)
write_parameter_server_version_py(
filename='{}/python/paddle/fluid/incubate/fleet/parameter_server/version.py'.format(
filename='{}/python/paddle/incubate/fleet/parameter_server/version.py'.format(
paddle_binary_dir
)
)
......