diff --git a/python/paddle/distributed/fleet/meta_optimizers/parameter_server_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/parameter_server_optimizer.py index 5bc27bfd9c032f346cbec3d24b80e65bd56b4c9e..bf18f9316de960347eaa473d8f891fdf10ddb952 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/parameter_server_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/parameter_server_optimizer.py @@ -74,7 +74,7 @@ class ParameterServerOptimizer(MetaOptimizerBase): } def _get_distributed_strategy(self): - from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import ( + from paddle.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import ( StrategyFactory, ) diff --git a/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py b/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py index 61e853a4435a6556e879e0b6281f058814f34ac8..44cd3904f3e3e314402f39e0fc7aebd999fd9bde 100644 --- a/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py +++ b/python/paddle/distributed/fleet/runtime/parameter_server_runtime.py @@ -51,7 +51,7 @@ class ParameterServerRuntime(RuntimeBase): def _get_distributed_strategy(self): strategy = None - from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import ( + from paddle.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import ( StrategyFactory, ) @@ -239,14 +239,14 @@ class ParameterServerRuntime(RuntimeBase): kwargs["sparse_attrs"] = get_sparse_attrs() return kwargs - from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import ( - GeoStrategy, - SyncStrategy, - ) from paddle.fluid.incubate.fleet.parameter_server.ir.public import ( _get_lr_ops, _has_global_step, ) + from paddle.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import ( + GeoStrategy, + SyncStrategy, + ) 
trainer_config = self.async_strategy.get_trainer_runtime_config() print(trainer_config) diff --git a/python/paddle/distributed/fleet/runtime/the_one_ps.py b/python/paddle/distributed/fleet/runtime/the_one_ps.py index 84c2592d130a95b339dc36b19f2c124d664d67e5..618f19a9562de550d9ba0f3177b23166d03364e5 100644 --- a/python/paddle/distributed/fleet/runtime/the_one_ps.py +++ b/python/paddle/distributed/fleet/runtime/the_one_ps.py @@ -692,7 +692,7 @@ class TheOnePSRuntime(RuntimeBase): def _get_distributed_strategy(self): strategy = None - from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import ( + from paddle.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import ( StrategyFactory, ) @@ -731,7 +731,7 @@ class TheOnePSRuntime(RuntimeBase): return compiled_config def _init_worker(self): - from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import ( + from paddle.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import ( SyncStrategy, ) diff --git a/python/paddle/fluid/incubate/fleet/tests/fleet_deep_ctr.py b/python/paddle/fluid/incubate/fleet/tests/fleet_deep_ctr.py index 4853e22712f164dce43092be1e66a1339626a208..9e70e108f6c4689205b51db52c53b79b939a3703 100644 --- a/python/paddle/fluid/incubate/fleet/tests/fleet_deep_ctr.py +++ b/python/paddle/fluid/incubate/fleet/tests/fleet_deep_ctr.py @@ -19,10 +19,10 @@ import time import paddle import paddle.fluid as fluid import paddle.fluid.incubate.fleet.base.role_maker as role_maker -from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import ( +from paddle.incubate.fleet.parameter_server.distribute_transpiler import ( fleet, ) -from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import ( +from paddle.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import ( StrategyFactory, ) diff --git 
a/python/paddle/fluid/incubate/fleet/utils/fleet_util.py b/python/paddle/fluid/incubate/fleet/utils/fleet_util.py index e0ae707be934d9dad09faf87914ce5714fb0f20a..7600b56d016a2f57b63b0598975f86a80ec74ff4 100644 --- a/python/paddle/fluid/incubate/fleet/utils/fleet_util.py +++ b/python/paddle/fluid/incubate/fleet/utils/fleet_util.py @@ -63,7 +63,7 @@ class FleetUtil: fleet = fleet_pslib elif mode == "transpiler": - from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import ( + from paddle.incubate.fleet.parameter_server.distribute_transpiler import ( fleet as fleet_transpiler, ) diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/test_distributed_strategy.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_distributed_strategy.py index 085cb293c0e46418fea8cf73850dc257d8e2aca0..0d7e94a7ce6838f0028c22b14563958a593d387b 100644 --- a/python/paddle/fluid/tests/unittests/collective/fleet/test_distributed_strategy.py +++ b/python/paddle/fluid/tests/unittests/collective/fleet/test_distributed_strategy.py @@ -18,16 +18,14 @@ import unittest import paddle import paddle.fluid as fluid import paddle.fluid.incubate.fleet.base.role_maker as role_maker -from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import ( - fleet, -) -from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import ( - StrategyFactory, -) from paddle.fluid.transpiler.distribute_transpiler import ( DistributeTranspilerConfig, ServerRuntimeConfig, ) +from paddle.incubate.fleet.parameter_server.distribute_transpiler import fleet +from paddle.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import ( + StrategyFactory, +) class TestStrategyFactor(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/dist_fleet_debug_gloo.py b/python/paddle/fluid/tests/unittests/dist_fleet_debug_gloo.py index 5b0c66538bbd70f58910a9bf652e74bac555bcb1..76cf50ea0caf23bc3b4b42efa58c4256dd41f332 
100644 --- a/python/paddle/fluid/tests/unittests/dist_fleet_debug_gloo.py +++ b/python/paddle/fluid/tests/unittests/dist_fleet_debug_gloo.py @@ -16,9 +16,7 @@ import logging # import paddle.fluid.incubate.fleet.base.role_maker as role_maker import paddle.distributed.fleet.base.role_maker as role_maker -from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import ( - fleet, -) +from paddle.incubate.fleet.parameter_server.distribute_transpiler import fleet logging.basicConfig(format="%(asctime)s - %(levelname)s - %(message)s") logger = logging.getLogger("fluid") diff --git a/python/paddle/fluid/tests/unittests/fleet_ps_training.py b/python/paddle/fluid/tests/unittests/fleet_ps_training.py index 47109d4ec27c167b34819d6322188b5b940ad439..7c6ac7d2e0ba65d1ad41608a124a0b150c64b3d9 100644 --- a/python/paddle/fluid/tests/unittests/fleet_ps_training.py +++ b/python/paddle/fluid/tests/unittests/fleet_ps_training.py @@ -18,9 +18,7 @@ from utils import gen_data import paddle import paddle.fluid as fluid from paddle.fluid.incubate.fleet.base import role_maker -from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import ( - fleet, -) +from paddle.incubate.fleet.parameter_server.distribute_transpiler import fleet input_x = paddle.static.data(name="x", shape=[-1, 32], dtype='float32') input_y = paddle.static.data(name="y", shape=[-1, 1], dtype='int64') diff --git a/python/paddle/fluid/tests/unittests/ps/ps_dnn_trainer.py b/python/paddle/fluid/tests/unittests/ps/ps_dnn_trainer.py index c5ed8ea484e62ac631165b5441ad465a7d65a5ee..47dd5a4154262ddbd9f4f76b63a6600183969466 100755 --- a/python/paddle/fluid/tests/unittests/ps/ps_dnn_trainer.py +++ b/python/paddle/fluid/tests/unittests/ps/ps_dnn_trainer.py @@ -233,7 +233,7 @@ def get_user_defined_strategy(config): def get_distributed_strategy(user_defined_strategy): # pslib - from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import ( + from 
paddle.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import ( StrategyFactory, ) diff --git a/python/paddle/fluid/tests/unittests/test_dataset.py b/python/paddle/fluid/tests/unittests/test_dataset.py index e1fbf3bf47cd4b2fe3230f9f9d29246c846cdaca..9d32695371758ed6774a1cb2809aa51ce4306268 100644 --- a/python/paddle/fluid/tests/unittests/test_dataset.py +++ b/python/paddle/fluid/tests/unittests/test_dataset.py @@ -1113,7 +1113,7 @@ class TestDataset2(unittest.TestCase): train_program = fluid.Program() startup_program = fluid.Program() scope = fluid.Scope() - from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import ( + from paddle.incubate.fleet.parameter_server.distribute_transpiler import ( fleet, ) diff --git a/python/paddle/fluid/tests/unittests/test_fleet_api_input.py b/python/paddle/fluid/tests/unittests/test_fleet_api_input.py index b57a30d75266d3676718b93099dbffb6b61c423a..55da7e2fddbb423d167aed2b8a3d671adafa76b1 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_api_input.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_api_input.py @@ -25,13 +25,11 @@ from paddle.fluid.incubate.fleet.base.role_maker import ( UserDefinedRoleMaker, ) from paddle.fluid.incubate.fleet.collective import CollectiveOptimizer -from paddle.fluid.incubate.fleet.parameter_server import TranspilerOptimizer -from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import ( - fleet, -) from paddle.fluid.transpiler.distribute_transpiler import ( DistributeTranspilerConfig, ) +from paddle.incubate.fleet.parameter_server import TranspilerOptimizer +from paddle.incubate.fleet.parameter_server.distribute_transpiler import fleet class DistributeTranspilerConfigTest(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/test_fleet_pyramid_hash.py b/python/paddle/fluid/tests/unittests/test_fleet_pyramid_hash.py index 10a7a7037e2f6d643c2c6a7c0c8cf47862fe0a1d..1c1ac972fc5217d6ed6200945ac60f0de7aa1b1b 
100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_pyramid_hash.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_pyramid_hash.py @@ -17,10 +17,8 @@ import unittest import paddle import paddle.fluid as fluid import paddle.fluid.incubate.fleet.base.role_maker as role_maker -from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import ( - fleet, -) -from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import ( +from paddle.incubate.fleet.parameter_server.distribute_transpiler import fleet +from paddle.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import ( StrategyFactory, ) diff --git a/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_2.py b/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_2.py index 8ee1de4df3f37c276ed7e9df2f4f9209336cb878..3ae506191bd8b323ab83f513d020426ff44617c3 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_2.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_2.py @@ -39,7 +39,7 @@ class TestCloudRoleMaker2(unittest.TestCase): GeneralRoleMaker, RoleMakerBase, ) - from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import ( + from paddle.incubate.fleet.parameter_server.distribute_transpiler import ( fleet, ) diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index 9532ece8503eef528de0572260db8a14ad113ef7..ac64448ea51e2799ac1556848af398d32969419c 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -169,7 +169,7 @@ class DistributeTranspilerConfig: We can use bandwidth efficiently when data size is larger than 2MB.If you want to change it, please be sure you have read the slice_variable function. 
You can find the definition of slice_variable in - https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/transpiler/distribute_transpiler.py + https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/transpiler/distribute_transpiler.py . Examples: diff --git a/python/paddle/incubate/fleet/parameter_server/distribute_transpiler/__init__.py b/python/paddle/incubate/fleet/parameter_server/distribute_transpiler/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..99e5dd2219e37d1da9a141a41991c28a0c3a0332 --- /dev/null +++ b/python/paddle/incubate/fleet/parameter_server/distribute_transpiler/__init__.py @@ -0,0 +1,978 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Convert the static program to distributed data-parallelism programs. 
+""" + +import os +import sys +import warnings + +import paddle +from paddle.framework import core +from paddle.static import ( + default_main_program, + default_startup_program, + Program, + Executor, +) +from paddle.fluid.compiler import CompiledProgram +from paddle.fluid.parallel_executor import ParallelExecutor +from paddle.fluid.optimizer import Optimizer + +from paddle.fluid.transpiler.distribute_transpiler import ( + DistributeTranspilerConfig, +) + +from paddle.fluid.incubate.fleet.base.fleet_base import Fleet +from paddle.fluid.incubate.fleet.base.mode import Mode +from paddle.fluid.incubate.fleet.base.role_maker import MPISymetricRoleMaker + +from paddle.fluid.incubate.fleet.parameter_server import version +from paddle.fluid.incubate.fleet.parameter_server.ir.public import ( + get_sparse_tablenames, +) +from paddle.fluid.incubate.fleet.parameter_server.ir.public import _get_lr_ops +from paddle.fluid.incubate.fleet.parameter_server.ir.public import ( + _has_global_step, +) +from paddle.incubate.fleet.parameter_server.distribute_transpiler.distributed_strategy import ( + TrainerRuntimeConfig, + DistributedStrategy, + SyncStrategy, + AsyncStrategy, + HalfAsyncStrategy, + GeoStrategy, + StrategyFactory, +) + +from paddle.fluid.transpiler.details.checkport import wait_server_ready + +from paddle.fluid.incubate.fleet.parameter_server.mode import PSMode +from paddle.fluid.incubate.fleet.base.fleet_base import DistributedOptimizer + +from paddle.fluid.incubate.fleet.parameter_server.ir import ( + trainer_pass as worker, +) +from paddle.fluid.incubate.fleet.parameter_server.ir import ( + pserver_pass as server, +) +from paddle.fluid.incubate.fleet.parameter_server.ir import public as public + + +class FleetTranspiler(Fleet): + """ + A subclass for compatibility with fluid.transpiler.DistributeTranspiler. 
+ """ + + def __init__(self): + super().__init__(Mode.TRANSPILER) + + self._inner_mode = None + + if version.is_transpiler(): + self._inner_mode = PSMode.TRANSPILER + else: + self._inner_mode = PSMode.PSLIB + + self._strategy = None + self._transpiler = None + self._origin_main_program = None + self._origin_startup_program = None + self._communicator = None + self.startup_program = None + self.main_program = None + + self._opt_info = None + self._local_ip = 0 + self._fleet_ptr = None + self._main_programs = [] + self._scopes = [] + self._client2client_request_timeout_ms = 500000 + self._client2client_connect_timeout_ms = 10000 + self._client2client_max_retry = 3 + + def init(self, role_maker=None): + if role_maker is None: + role_maker = MPISymetricRoleMaker() + super().init(role_maker) + if self._fleet_ptr is None: + self._fleet_ptr = core.Fleet() + + def _init_transpiler_worker(self): + """ + `init_worker` has many many functions to do before training, + first, wait for all parameter servers launch completely. + second, run executor to initialize startup program + third, wait for all worker initialize completely. 
+ + Returns: + None + """ + + def sync_strategy_envs(): + kwargs = {} + kwargs[ + "pserver_endpoints" + ] = self._role_maker.get_pserver_endpoints() + kwargs["trainer_id"] = self._role_maker.worker_index() + return kwargs + + def geo_strategy_envs(): + def get_sparse_attrs(): + opt_init_map = {} + opt_init_map["gaussian_random"] = ["seed", "mean", "std"] + opt_init_map["fill_constant"] = ["value"] + opt_init_map["uniform_random"] = ["seed", "min", "max"] + opt_init_map["truncated_gaussian_random"] = [ + "seed", + "mean", + "std", + ] + + dist_varnames = get_sparse_tablenames( + self._origin_main_program, True + ) + sparse_varnames = get_sparse_tablenames( + self._origin_main_program, False + ) + + if len(dist_varnames) != 0: + raise ValueError( + "GeoStrategy can not support large scale embeding now, please use fluid.layers.embedding" + ) + + init_attrs = [] + for value_name in sparse_varnames: + value_var = self._origin_main_program.global_block().vars[ + value_name + ] + value_attr = [ + value_name, + ",".join([str(dim) for dim in value_var.shape]), + ] + for op in self._origin_startup_program.global_block().ops: + if ( + op.type in opt_init_map.keys() + and value_name == op.output("Out")[0] + ): + init_attr = [op.type] + for attr in opt_init_map[op.type]: + init_attr.append(str(op.attr(attr))) + value_attr.append("&".join(init_attr)) + init_attrs.append(":".join(value_attr)) + break + return "#".join(init_attrs) + + kwargs = {} + kwargs["trainers"] = self.worker_num() + kwargs["sparse_attrs"] = get_sparse_attrs() + return kwargs + + # if MPISymetricRoleMaker is defined + # we suppose a user wants to submit job on mpi cluster + + if isinstance(self._role_maker, MPISymetricRoleMaker): + # check whether server has been initialized + wait_server_ready(self.server_endpoints(to_string=False)) + + trainer_config = self._strategy.get_trainer_runtime_config() + + print(trainer_config) + + lrs = _has_global_step(_get_lr_ops(self._origin_main_program)) + + if lrs > 0: + 
kwargs = {"need_global_step": "1"} + else: + kwargs = {"need_global_step": "0"} + + if isinstance(self._strategy, GeoStrategy): + geo_kwargs = geo_strategy_envs() + kwargs.update(geo_kwargs) + if isinstance(self._strategy, SyncStrategy): + sync_kwargs = sync_strategy_envs() + kwargs.update(sync_kwargs) + + kwargs = kwargs if kwargs else None + + send_ctx = fleet.compiled_config.get_communicator_send_context() + + if self.compiled_config.is_geo_mode(): + recv_ctx = fleet.compiled_config.get_communicator_recv_context( + recv_type=4 + ) + else: + recv_ctx = fleet.compiled_config.get_communicator_recv_context( + recv_type=1 + ) + + from paddle.distributed.communicator import Communicator + + self._communicator = Communicator( + trainer_config.mode, kwargs, trainer_config.get_communicator_flags() + ) + + self._communicator.init_with_ctx(send_ctx, recv_ctx) + + if not self._communicator.is_running(): + self._communicator.start() + else: + raise ValueError( + "Communicator can only be inited once, please check" + ) + + def init_worker(self): + """ + `init_worker` has many many functions to do before training, + first, wait for all parameter servers launch completely. + second, run executor to initialize startup program + third, wait for all worker initialize completely. 
+ + Returns: + None + """ + if self._inner_mode == PSMode.TRANSPILER: + self._init_transpiler_worker() + else: + raise NotImplementedError("add implement later") + + def _init_transpiler_server(self, model_dir=None): + if not self.startup_program: + raise ValueError( + "startup_program is None, need invoke DistributedOptimizer.minimize first" + ) + + self._executor.run(self.startup_program) + + if model_dir: + if not os.path.isdir(model_dir): + raise ValueError("There is no directory named '%s'", model_dir) + + sparse_varnames = self.compiled_config.get_sparse_varname_on_ps( + True + ) + distribtued_varnames = ( + self.compiled_config.get_sparse_varname_on_ps(False) + ) + + remaining_vars = list( + filter( + FleetTranspiler.__exclude_vars( + sparse_varnames + distribtued_varnames + ), + self.main_program.list_vars(), + ) + ) + + paddle.static.load_vars( + self._executor, + main_program=self.main_program, + dirname=model_dir, + vars=remaining_vars, + ) + + self._load_sparse_params( + dirname=model_dir, varnames=sparse_varnames + ) + + # todo(tangwei12) load distributed vars + # self._load_sparse_params(dirname=model_dir, varnames=distribtued_varnames) + + def init_server(self, model_dir=None, **kwargs): + """ + `init_server` has many many functions to do before start pserver, + first, run executor to initialize startup program, + second, if the `model_dir` is not empty, it will load parameters from it for increment training. + + Args: + model_dir(str): The directory path. + + Returns: + None + """ + + if self._inner_mode == PSMode.TRANSPILER: + self._init_transpiler_server(model_dir) + else: + raise NotImplementedError("add implement later") + + def run_server(self): + """ + `run_server` execute executor to start pserver main program. 
+ + Returns: + None + """ + + if self._inner_mode == PSMode.TRANSPILER: + if not self.main_program: + raise ValueError( + "main_program is None, need invoke DistributedOptimizer.minimize first" + ) + + self._executor.run(self.main_program) + else: + raise NotImplementedError("add implement later") + + def stop_worker(self): + """ + Close this executor. + + For the distributed training, this method would free the resource on PServers related to + the current Trainer. + + Returns: + None + """ + + if self._inner_mode == PSMode.TRANSPILER: + self._communicator.stop() + if isinstance(self._role_maker, MPISymetricRoleMaker): + self._role_maker._finalize() + self._executor.close() + else: + raise NotImplementedError("add implement later") + + def distributed_optimizer(self, optimizer, strategy=None): + """ + Optimizer for distributed training. + + For the distributed training, this method would rebuild a new instance of DistributedOptimizer. + Which has basic Optimizer function and special features for distributed training. + + Args: + optimizer(Optimizer): The executor to run for init server. + strategy(DistributeTranspilerConfig): Extra properties for distributed optimizer. + + Returns: + TranspilerOptimizer: subclass of DistributedOptimizer. 
+ """ + + if not isinstance(optimizer, Optimizer): + raise ValueError("optimizer must be an instance of Optimizer") + if not self._is_initialized: + raise ValueError( + "fleet.init(role) to initialize before optimizer.minimize(loss)" + ) + + if not strategy: + _strategy = StrategyFactory.create_async_strategy() + + if isinstance(strategy, DistributedStrategy): + _strategy = strategy + elif isinstance(strategy, DistributeTranspilerConfig): + if strategy.sync_mode: + _strategy = SyncStrategy() + else: + if strategy.runtime_split_send_recv: + if strategy.geo_sgd_mode: + _strategy = GeoStrategy(strategy.geo_sgd_need_push_nums) + elif strategy.half_async: + _strategy = HalfAsyncStrategy() + else: + _strategy = AsyncStrategy() + else: + _strategy = HalfAsyncStrategy() + # for half_async compatibility + strategy.half_async = True + strategy.runtime_split_send_recv = True + _strategy.set_program_config(strategy) + elif isinstance(strategy, dict): + if self._inner_mode != PSMode.PSLIB: + raise TypeError("Dict strategy can only be used at PSLIB Mode") + + _strategy = StrategyFactory.create_async_strategy() + _strategy.set_pslib_runtime_config(strategy) + else: + raise TypeError( + "strategy must be an instance of DistributeTranspilerConfig, DistributedStrategy" + ) + + self._strategy = _strategy + self._optimizer = ParameterServerOptimizer(optimizer, _strategy) + return self._optimizer + + def save_inference_model( + self, + executor, + dirname, + feeded_var_names, + target_vars, + main_program=None, + export_for_deployment=True, + ): + """ + Prune the given `main_program` to build a new program especially for inference, + and then save it and all related parameters to given `dirname` by the `executor`. 
+ """ + + if self._inner_mode == PSMode.PSLIB: + raise NotImplementedError("add implement later") + + if isinstance(executor, ParallelExecutor): + raise TypeError( + "in fleet.save_inference_model() function, executor must be as Executor type, ParallelExecutor is not allowed" + ) + + if not isinstance(executor, Executor): + raise TypeError( + "in fleet.save_inference_model() function, executor must be as Executor type" + ) + + # Todo(MrChengmo): support recv&save GPU-Kernel for ps-gpu model save + if not isinstance(executor.place, paddle.CPUPlace): + save_executor = Executor(paddle.CPUPlace()) + else: + save_executor = executor + + if main_program is not None: + if isinstance(main_program, CompiledProgram): + raise TypeError( + "in fleet.save_inference_model() function, main_program must be as Program type, CompiledProgram is not allowed" + ) + paddle.static.save_inference_model( + dirname, + feeded_var_names, + target_vars, + executor, + main_program, + None, + None, + export_for_deployment, + ) + else: + paddle.static.save_inference_model( + dirname, + feeded_var_names, + target_vars, + executor, + self._origin_main_program, + None, + None, + export_for_deployment, + True, + ) + + model_basename = "__model__" + model_filename = os.path.join(dirname, model_basename) + + with open(model_filename, "rb") as f: + program_desc_str = f.read() + + program = Program.parse_from_string(program_desc_str) + program._copy_dist_param_info_from(self.main_program) + self.save_persistables(executor, dirname, program) + + def _load_sparse_params(self, dirname, varnames): + from paddle.distributed.communicator import LargeScaleKV + + scale_kv = LargeScaleKV() + for varname in varnames: + origin_varname, _, _ = public._get_varname_parts(varname) + sparse_dir = os.path.join(dirname, origin_varname, varname) + scale_kv.load(varname, sparse_dir) + + def _get_optimizer_status(self, op, param_name): + supported_opts = [ + "sgd", + "adam", + "adagrad", + "adamax", + "momentum", + 
"lars_momentum", + "rmsprop", + "decayed_adagrad", + "ftrl", + ] + + reshaped_val_map = {} + reshaped_val_map["sgd"] = [] + reshaped_val_map["adam"] = ["moment1_0", "moment2_0"] + reshaped_val_map["adagrad"] = ["moment_0"] + reshaped_val_map["adamax"] = ["moment_0", "inf_norm_0"] + reshaped_val_map["momentum"] = ["velocity_0"] + reshaped_val_map["lars_momentum"] = ["velocity_0"] + reshaped_val_map["rmsprop"] = [ + "momentum_0", + "mean_square_0", + "mean_grad_0", + ] + reshaped_val_map["decayed_adagrad"] = ["moment_0"] + reshaped_val_map["ftrl"] = ["squared_0", "linear_0"] + + orishaped_val_map = {} + orishaped_val_map["adam"] = ["beta1_pow_acc_0", "beta2_pow_acc_0"] + orishaped_val_map["adamax"] = ["beta1_pow_acc_0"] + + if op not in supported_opts: + raise ValueError( + "fleet can not support optimizer: {}, only this can be supported: {}".format( + op, supported_opts + ) + ) + + reshaped_names = [ + param_name + "_" + val for val in reshaped_val_map[op] + ] + + if op not in orishaped_val_map: + origin_names = [] + else: + origin_names = [ + param_name + "_" + val for val in orishaped_val_map[op] + ] + return reshaped_names, origin_names + + def _get_optimizer_op(self, param_name): + opts = public._get_optimize_ops(self._origin_main_program) + for op in opts: + if ( + "Param" in op.input_names + and "LearningRate" in op.input_names + and op.input("Param")[0] == param_name + ): + return op + + def _save_dense_params(self, executor, dirname, context, main_program): + self._communicator.recv() + + prog = Program() + block = prog.global_block() + local_vars = [] + + for name, var_ctx in context.items(): + if len(var_ctx.origin_varnames()) != 1: + raise ValueError("Dense can not support split now.") + + varname = var_ctx.origin_varnames()[0] + local_vars.append(varname) + + optimizer = self._get_optimizer_op(varname) + reshaped_varnames, origin_varnames = self._get_optimizer_status( + optimizer.type, varname + ) + + for var_name in [varname] + reshaped_varnames + 
origin_varnames: + var = self._origin_main_program.global_block().vars[var_name] + block.append_op( + type='recv_save', + attrs={ + "trainer_id": self._role_maker.worker_index(), + "shape": var.shape, + "slice_shapes": [",".join([str(i) for i in var.shape])], + "slice_varnames": [var.name], + "remote_varnames": [var.name], + "is_sparse": False, + "endpoints": var_ctx.split_endpoints(), + "file_path": os.path.join(dirname, var.name), + }, + ) + + executor.run(prog) + return local_vars + + def _save_sparse_params(self, executor, dirname, context, main_program): + prog = Program() + block = prog.global_block() + local_vars = [] + + for name, var_ctx in context.items(): + if len(var_ctx.origin_varnames()) != 1: + raise ValueError("Dense can not support split now.") + + varname = var_ctx.origin_varnames()[0] + local_vars.append(varname) + + optimizer = self._get_optimizer_op(varname) + reshaped_varnames, origin_varnames = self._get_optimizer_status( + optimizer.type, varname + ) + + var = self._origin_main_program.global_block().vars[varname] + slice_shapes = [] + dims1 = ",".join([str(i) for i in var.shape[1:]]) + + for section in var_ctx.sections(): + slice_shapes.append(str(section) + dims1) + + block.append_op( + type='recv_save', + attrs={ + "trainer_id": self._role_maker.worker_index(), + "shape": var.shape, + "slice_shapes": slice_shapes, + "slice_varnames": var_ctx.split_varnames(), + "remote_varnames": var_ctx.split_varnames(), + "is_sparse": True, + "endpoints": var_ctx.split_endpoints(), + "pserver_num": len( + self._role_maker.get_pserver_endpoints() + ), + "file_path": os.path.join(dirname, var.name), + }, + ) + + for reshaped_varname in reshaped_varnames: + var = self._origin_main_program.global_block().vars[ + reshaped_varname + ] + + slice_varnames = [] + remote_varnames = [] + for i in range(len(var_ctx.split_varnames())): + slice_varnames.append( + "{}.block{}".format(reshaped_varname, i) + ) + remote_varnames.append(reshaped_varname) + + 
block.append_op( + type='recv_save', + attrs={ + "trainer_id": self._role_maker.worker_index(), + "shape": var.shape, + "slice_shapes": slice_shapes, + "slice_varnames": slice_varnames, + "remote_varnames": remote_varnames, + "is_sparse": True, + "endpoints": var_ctx.split_endpoints(), + "pserver_num": len( + self._role_maker.get_pserver_endpoints() + ), + "file_path": os.path.join(dirname, var.name), + }, + ) + + for origin_varname in origin_varnames: + var = self._origin_main_program.global_block().vars[ + origin_varname + ] + + block.append_op( + type='recv_save', + attrs={ + "trainer_id": self._role_maker.worker_index(), + "shape": var.shape, + "slice_shapes": [",".join([str(i) for i in var.shape])], + "slice_varnames": [origin_varname], + "remote_varnames": [origin_varname], + "is_sparse": False, + "endpoints": var_ctx.split_endpoints()[:1], + "file_path": os.path.join(dirname, var.name), + }, + ) + executor.run(prog) + return context.keys() + + def _save_distributed_params( + self, executor, dirname, context, main_program + ): + prog = Program() + block = prog.global_block() + + for name, var_ctx in context.items(): + block.append_op( + type='checkpoint_notify', + attrs={ + "varname": name, + "is_slice": True, + "slice_varnames": var_ctx.split_varnames(), + "remote_varnames": var_ctx.split_varnames(), + "endpoints": var_ctx.split_endpoints(), + "dirname": dirname, + }, + ) + + executor.run(prog) + return context.keys() + + def _save_distributed_persistables(self, executor, dirname, main_program): + dense_ctx = fleet.compiled_config.get_communicator_recv_context( + recv_type=1 + ) + + sparse_ctx = fleet.compiled_config.get_communicator_recv_context( + recv_type=2 + ) + + distributed_ctx = fleet.compiled_config.get_communicator_recv_context( + recv_type=3 + ) + + recv_dense_varnames = self._save_dense_params( + executor, dirname, dense_ctx, main_program + ) + + recv_sparse_varnames = self._save_sparse_params( + executor, dirname, sparse_ctx, main_program + ) + 
+ recv_distributed_varnames = self._save_distributed_params( + executor, dirname, distributed_ctx, main_program + ) + + saved_varnames = ( + recv_dense_varnames + + list(recv_sparse_varnames) + + list(recv_distributed_varnames) + ) + + remaining_vars = list( + filter( + FleetTranspiler.__exclude_vars(saved_varnames), + main_program.list_vars(), + ) + ) + + paddle.static.save_vars( + executor, + main_program=main_program, + dirname=dirname, + vars=remaining_vars, + ) + + def save_persistables(self, executor, dirname, main_program=None, **kwargs): + """ + This function filters out all variables with `persistable==True` from the + give `main_program` and then saves these variables to the folder `dirname` + or file `filename`. + + The `dirname` is used to specify the folder where persistable variables + are going to be saved. If you would like to save variables in separate + files, set `filename` None; + if you would like to save all variables in a + single file, use `filename` to specify the file name. 
+ """ + + if self._inner_mode == PSMode.PSLIB: + raise NotImplementedError("add implement later") + + if isinstance(executor, ParallelExecutor): + raise TypeError( + "in fleet.save_persistables() function, executor must be as Executor type, ParallelExecutor is not allowed" + ) + + if not isinstance(executor, Executor): + raise TypeError( + "in fleet.save_persistables() function, executor must be as Executor type" + ) + # Todo(MrChengmo): support recv&save GPU-Kernel for ps-gpu model save + if not isinstance(executor.place, paddle.CPUPlace): + save_executor = Executor(paddle.CPUPlace()) + else: + save_executor = executor + + if main_program is None: + main_program = self.main_program + + if isinstance(main_program, CompiledProgram): + raise TypeError( + "in fleet.save_persistables() function, main_program must be as Program type, CompiledProgram is not allowed" + ) + + self._save_distributed_persistables( + save_executor, dirname, main_program + ) + + @staticmethod + def __exclude_vars(exclude_var_names=[]): + def is_valid(var): + if var.name in exclude_var_names: + return False + + origin_varname, _, _ = public._get_varname_parts(var.name) + if origin_varname.endswith("@GRAD"): + return False + + if origin_varname == "learning_rate_0": + return False + + if ( + var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH + or var.desc.type() == core.VarDesc.VarType.FETCH_LIST + or var.desc.type() == core.VarDesc.VarType.READER + ): + return False + return var.persistable + + return is_valid + + +# fleet is a global instance for parameter server. +fleet = FleetTranspiler() + + +class ParameterServerOptimizer(DistributedOptimizer): + """ + DistributedOptimizer is a wrapper for paddle.fluid.optimizer + A user should pass a paddle.fluid.optimizer to DistributedOptimizer + minimize() function is implemented. + DistributedOptimizer is the starting point for a user who wants to + run distributed training. 
The optimized information will be stored in + Fleet() instance who holds the global information about current distributed + training. + + Args: + optimizer(Optimizer): subclass of Optimizer. + strategy(DistributeTranspilerConfig): instance of DistributeTranspilerConfig. + + Returns: + None + """ + + def __init__(self, optimizer, strategy, mode=PSMode.TRANSPILER): + super().__init__(optimizer, strategy) + self._mode = mode + if self._mode == PSMode.PSLIB: + self._optimizer_name = "Distributed%s" % optimizer.type.capitalize() + if optimizer.type != "adam": + print( + "Currently, distributed optimizer only support Adam" + "Will config built-in adam for you." + "We will support more functions in DistributedOptimizer", + sys.stderr, + ) + self._optimizer_name = "DistributedAdam" + + self._optimizer = globals()[self._optimizer_name](optimizer) + else: + self._optimizer = optimizer + + self._window = 1 + self.type = "downpour" + self.data_norm_name = [ + ".batch_size", + ".batch_square_sum", + ".batch_sum", + ".batch_size@GRAD", + ".batch_square_sum@GRAD", + ".batch_sum@GRAD", + ] + + def backward( + self, + loss, + startup_program=None, + parameter_list=None, + no_grad_set=None, + callbacks=None, + ): + raise NotImplementedError() + + def apply_gradients(self, params_grads): + raise NotImplementedError() + + def _build_trainer_programs(self, compiled_config): + _main = fleet._origin_main_program.clone() + _startup = fleet._origin_startup_program.clone() + + if not compiled_config.is_geo_mode(): + # for main program + _main = worker.delete_optimizer_pass(_main, compiled_config) + _main = worker.distributed_ops_pass(_main, compiled_config) + _main = worker.append_send_ops_pass(_main, compiled_config) + + # for startup program + _startup = worker.fake_init_ops_pass(_startup, compiled_config) + _startup = worker.init_from_server_pass(_startup, compiled_config) + _startup = worker.delet_extra_optimizes_pass( + _startup, compiled_config + ) + else: + _main = 
worker.append_send_ops_pass(_main, compiled_config) + _startup = _startup + + return _main, _startup + + def _build_pserver_programs(self, compiled_config): + _main = paddle.static.Program() + _startup = paddle.static.Program() + + if not compiled_config.is_geo_mode(): + _main = server.add_listen_and_serv_pass(_main, compiled_config) + _main = server.add_rpc_global_flags_pass(_main, compiled_config) + _main = server.add_optimizer_pass(_main, compiled_config) + _main = server.large_scale_sparse_pass( + _main, _main, compiled_config, False + ) + _startup = server.build_pserver_startup_program_pass( + _startup, _main, compiled_config + ) + _startup = server.large_scale_sparse_pass( + _startup, _main, compiled_config, True + ) + + if not compiled_config.is_sync_mode(): + _main = server.delete_unused_in_main_pass( + _main, compiled_config + ) + + _startup = server.delete_unused_in_startup_pass( + _startup, _main, compiled_config + ) + else: + _main = server.add_listen_and_serv_pass(_main, compiled_config) + _main = server.add_rpc_global_flags_pass(_main, compiled_config) + _main = server.add_geo_optimizer_pass(_main, compiled_config) + _main = server.large_scale_sparse_pass( + _main, _main, compiled_config, False + ) + _startup = server.build_pserver_startup_program_pass( + _startup, _main, compiled_config + ) + _startup = server.large_scale_sparse_pass( + _startup, _main, compiled_config, True + ) + _startup = server.delete_unused_in_startup_pass( + _startup, _main, compiled_config + ) + + return _main, _startup + + def minimize( + self, + losses, + scopes=None, + startup_programs=None, + parameter_list=None, + no_grad_set=None, + ): + + if isinstance(losses, list): + raise ValueError("need implement later") + + self._optimizer.minimize( + losses, startup_programs, parameter_list, no_grad_set + ) + + fleet._origin_main_program = default_main_program().clone( + for_test=False + ) + fleet._origin_startup_program = default_startup_program().clone( + for_test=False + ) + + 
compiled_config = public.CompileTimeStrategy( + fleet._origin_main_program, + fleet._origin_startup_program, + self._strategy, + fleet._role_maker, + ) + + fleet.compiled_config = compiled_config + fleet.main_program, fleet.startup_program = ( + self._build_trainer_programs(compiled_config) + if fleet.is_worker() + else self._build_pserver_programs(compiled_config) + ) diff --git a/python/paddle/incubate/fleet/parameter_server/distribute_transpiler/distributed_strategy.py b/python/paddle/incubate/fleet/parameter_server/distribute_transpiler/distributed_strategy.py new file mode 100644 index 0000000000000000000000000000000000000000..c6e2ef115a57dc7babbc2e70225f627fb8f0e3b5 --- /dev/null +++ b/python/paddle/incubate/fleet/parameter_server/distribute_transpiler/distributed_strategy.py @@ -0,0 +1,478 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +__all__ = [ + "TrainerRuntimeConfig", + "DistributedStrategy", + "SyncStrategy", + "AsyncStrategy", + "HalfAsyncStrategy", + "GeoStrategy", + "StrategyFactory", +] + +import os + +import paddle.fluid as fluid +from paddle.fluid.incubate.fleet.parameter_server.mode import DistributedMode +from paddle.fluid.transpiler.distribute_transpiler import ( + DistributeTranspilerConfig, + ServerRuntimeConfig, +) + + +class TrainerRuntimeConfig: + def __init__(self): + self.mode = None + num_threads = os.getenv("CPU_NUM", "1") + + self.runtime_configs = {} + self.runtime_configs['communicator_max_merge_var_num'] = os.getenv( + "FLAGS_communicator_max_merge_var_num", num_threads + ) + self.runtime_configs['communicator_send_queue_size'] = os.getenv( + "FLAGS_communicator_send_queue_size", num_threads + ) + self.runtime_configs[ + 'communicator_independent_recv_thread' + ] = os.getenv("FLAGS_communicator_independent_recv_thread", "1") + self.runtime_configs[ + 'communicator_min_send_grad_num_before_recv' + ] = os.getenv( + "FLAGS_communicator_min_send_grad_num_before_recv", num_threads + ) + self.runtime_configs['communicator_thread_pool_size'] = os.getenv( + "FLAGS_communicator_thread_pool_size", "5" + ) + self.runtime_configs['communicator_send_wait_times'] = os.getenv( + "FLAGS_communicator_send_wait_times", "5" + ) + self.runtime_configs['communicator_is_sgd_optimizer'] = os.getenv( + "FLAGS_communicator_is_sgd_optimizer", "1" + ) + + # not used + self.runtime_configs['rpc_deadline'] = os.getenv( + "FLAGS_rpc_deadline", "180000" + ) + self.runtime_configs['rpc_retry_times'] = os.getenv( + "FLAGS_rpc_retry_times", "3" + ) + + def get_communicator_flags(self): + need_keys = [] + num_threads = os.getenv("CPU_NUM", "1") + mode_str = "" + if self.mode is None or self.mode == DistributedMode.ASYNC: + need_keys = self.runtime_configs.keys() + mode_str = "async" + elif ( + self.mode == DistributedMode.SYNC + or self.mode == DistributedMode.HALF_ASYNC + ): + mode_str = "sync or 
half_async" + need_keys = [ + 'communicator_max_merge_var_num', + 'communicator_send_wait_times', + 'communicator_thread_pool_size', + 'communicator_send_queue_size', + ] + elif self.mode == DistributedMode.GEO: + mode_str = "GEO" + need_keys = [ + 'communicator_thread_pool_size', + 'communicator_send_wait_times', + 'communicator_max_merge_var_num', + 'communicator_send_queue_size', + ] + else: + raise ValueError("Unsupported Mode") + + if ( + self.mode == DistributedMode.SYNC + or self.mode == DistributedMode.HALF_ASYNC + ): + max_merge_var_num = self.runtime_configs[ + 'communicator_max_merge_var_num' + ] + send_queue_size = self.runtime_configs[ + 'communicator_send_queue_size' + ] + if max_merge_var_num != num_threads: + print( + 'WARNING: In {} mode, communicator_max_merge_var_num ' + 'must be equal to CPU_NUM. But received, ' + 'communicator_max_merge_var_num = {}, CPU_NUM = ' + '{}. communicator_max_merge_var_num will be fored to {}.'.format( + mode_str, max_merge_var_num, num_threads, num_threads + ) + ) + self.runtime_configs[ + 'communicator_max_merge_var_num' + ] = num_threads + if send_queue_size != num_threads: + print( + 'WARNING: In {} mode, communicator_send_queue_size ' + 'must be equal to CPU_NUM. But received, ' + 'communicator_send_queue_size = {}, CPU_NUM = ' + '{}. 
communicator_send_queue_size will be fored to {}.'.format( + mode_str, send_queue_size, num_threads, num_threads + ) + ) + self.runtime_configs[ + 'communicator_send_queue_size' + ] = num_threads + + return dict((key, str(self.runtime_configs[key])) for key in need_keys) + + def display(self, configs): + raw0, raw1, length = 45, 5, 50 + h_format = "{:^45s}{:<5s}\n" + l_format = "{:<45s}{:<5s}\n" + + border = "".join(["="] * length) + line = "".join(["-"] * length) + + draws = "" + draws += border + "\n" + draws += h_format.format("TrainerRuntimeConfig Overview", "Value") + draws += line + "\n" + + for k, v in configs.items(): + draws += l_format.format(k, v) + + draws += border + + _str = "\n{}\n".format(draws) + return _str + + def __repr__(self): + return self.display(self.get_communicator_flags()) + + +class PSLibRuntimeConfig: + def __init__(self): + self.runtime_configs = {} + + def get_runtime_configs(self): + return self.runtime_configs + + +class DistributedStrategy: + def __init__(self): + self._program_config = DistributeTranspilerConfig() + self._trainer_runtime_config = TrainerRuntimeConfig() + self._pslib_runtime_config = PSLibRuntimeConfig() + self._server_runtime_config = ServerRuntimeConfig() + num_threads = int(os.getenv("CPU_NUM", "1")) + + self._execute_strategy = fluid.ExecutionStrategy() + self._build_strategy = fluid.BuildStrategy() + + self._execute_strategy.num_threads = num_threads + if num_threads > 1: + self._build_strategy.reduce_strategy = ( + fluid.BuildStrategy.ReduceStrategy.Reduce + ) + self.debug_opt = None + self.use_ps_gpu = False + + def set_debug_opt(self, opt_info): + self.debug_opt = opt_info + + def get_debug_opt(self): + opt_info = dict() + if self.debug_opt is not None and isinstance(self.debug_opt, dict): + opt_info["dump_slot"] = bool(self.debug_opt.get("dump_slot", 0)) + opt_info["dump_converter"] = str( + self.debug_opt.get("dump_converter", "") + ) + opt_info["dump_fields"] = self.debug_opt.get("dump_fields", []) + 
opt_info["dump_file_num"] = self.debug_opt.get("dump_file_num", 16) + opt_info["dump_fields_path"] = self.debug_opt.get( + "dump_fields_path", "" + ) + opt_info["dump_param"] = self.debug_opt.get("dump_param", []) + return opt_info + + def get_program_config(self): + return self._program_config + + def set_program_config(self, config): + if isinstance(config, DistributeTranspilerConfig): + self._program_config = config + elif isinstance(config, dict): + for key in config: + if hasattr(self._program_config, key): + setattr(self._program_config, key, config[key]) + else: + raise ValueError( + "DistributeTranspilerConfig doesn't have key: {}".format( + key + ) + ) + else: + raise TypeError( + "program_config only accept input type: dict or DistributeTranspilerConfig" + ) + self.check_program_config() + + def check_program_config(self): + raise NotImplementedError( + "check_program_config must be implemented by derived class. You should use StrategyFactory to create DistributedStrategy." + ) + + def get_trainer_runtime_config(self): + return self._trainer_runtime_config + + def set_trainer_runtime_config(self, config): + if isinstance(config, TrainerRuntimeConfig): + self._trainer_runtime_config = config + elif isinstance(config, dict): + for key, Value in config.items(): + if key in self._trainer_runtime_config.runtime_configs: + self._trainer_runtime_config.runtime_configs[key] = Value + else: + raise ValueError( + "TrainerRuntimeConfig doesn't have key: {}".format(key) + ) + else: + raise TypeError( + "trainer_runtime_config only accept input type: dict or TrainerRuntimeConfig" + ) + self.check_trainer_runtime_config() + + def check_trainer_runtime_config(self): + raise NotImplementedError( + "check_trainer_runtime_config must be implemented by derived class. You should use StrategyFactory to create DistributedStrategy." 
+ ) + + def get_pslib_runtime_config(self): + return self._pslib_runtime_config + + def set_pslib_runtime_config(self, config): + self._pslib_runtime_config.runtime_configs = config + + def get_server_runtime_config(self): + return self._server_runtime_config + + def set_server_runtime_config(self, config): + if isinstance(config, ServerRuntimeConfig): + self._server_runtime_config = config + elif isinstance(config, dict): + for key in config: + if hasattr(self._server_runtime_config, key): + setattr(self._server_runtime_config, key, config[key]) + else: + raise ValueError( + "ServerRuntimeConfig doesn't have key: {}".format(key) + ) + else: + raise TypeError( + "server_runtime_config only accept input type: dict or ServerRuntimeConfig" + ) + self.check_server_runtime_config() + + def check_server_runtime_config(self): + raise NotImplementedError( + "check_server_runtime_config must be implemented by derived class. You should use StrategyFactory to create DistributedStrategy." + ) + + def get_execute_strategy(self): + return self._execute_strategy + + def set_execute_strategy(self, config): + if isinstance(config, fluid.ExecutionStrategy): + self._execute_strategy = config + elif isinstance(config, dict): + for key in config: + if hasattr(self._execute_strategy, key): + setattr(self._execute_strategy, key, config[key]) + else: + raise ValueError( + "ExecutionStrategy doesn't have key: {}".format(key) + ) + else: + raise TypeError( + "execute_strategy only accept input type: dict or ExecutionStrategy" + ) + self.check_execute_strategy() + + def check_execute_strategy(self): + raise NotImplementedError( + "check_execute_strategy must be implemented by derived class. You should use StrategyFactory to create DistributedStrategy." 
+ ) + + def get_build_strategy(self): + return self._build_strategy + + def set_build_strategy(self, config): + if isinstance(config, fluid.BuildStrategy): + self._build_strategy = config + elif isinstance(config, dict): + for key in config: + if hasattr(self._build_strategy, key): + setattr(self._build_strategy, key, config[key]) + else: + raise ValueError( + "BuildStrategy doesn't have key: {}".format(key) + ) + else: + raise TypeError( + "build_strategy only accept input type: dict or BuildStrategy" + ) + self.check_build_strategy() + + def check_build_strategy(self): + raise NotImplementedError( + "check_build_strategy must be implemented by derived class. You should use StrategyFactory to create DistributedStrategy." + ) + + +class SyncStrategy(DistributedStrategy): + def __init__(self): + super().__init__() + self.check_program_config() + self.check_trainer_runtime_config() + self.check_server_runtime_config() + self.check_build_strategy() + self.check_execute_strategy() + + def check_trainer_runtime_config(self): + self._trainer_runtime_config.mode = DistributedMode.SYNC + + def check_program_config(self): + self._program_config.sync_mode = False + self._program_config.runtime_split_send_recv = True + self._program_config.half_async = True + self._program_config.completely_not_async = True + + def check_server_runtime_config(self): + pass + + def check_execute_strategy(self): + self._execute_strategy.use_thread_barrier = True + + def check_build_strategy(self): + self._build_strategy.async_mode = True + + +class AsyncStrategy(DistributedStrategy): + def __init__(self): + super().__init__() + self.check_program_config() + self.check_trainer_runtime_config() + self.check_server_runtime_config() + self.check_build_strategy() + self.check_execute_strategy() + + def check_trainer_runtime_config(self): + self._trainer_runtime_config.mode = DistributedMode.ASYNC + + def check_program_config(self): + self._program_config.sync_mode = False + 
self._program_config.runtime_split_send_recv = True + + def check_server_runtime_config(self): + pass + + def check_execute_strategy(self): + pass + + def check_build_strategy(self): + self._build_strategy.async_mode = True + + +class HalfAsyncStrategy(DistributedStrategy): + def __init__(self): + super().__init__() + self.check_program_config() + self.check_trainer_runtime_config() + self.check_server_runtime_config() + self.check_build_strategy() + self.check_execute_strategy() + + def check_trainer_runtime_config(self): + self._trainer_runtime_config.mode = DistributedMode.HALF_ASYNC + + def check_program_config(self): + self._program_config.sync_mode = False + self._program_config.runtime_split_send_recv = True + self._program_config.half_async = True + + def check_server_runtime_config(self): + pass + + def check_execute_strategy(self): + self._execute_strategy.use_thread_barrier = True + + def check_build_strategy(self): + self._build_strategy.async_mode = True + + +class GeoStrategy(DistributedStrategy): + def __init__(self, update_frequency=100): + super().__init__() + self._program_config.geo_sgd_need_push_nums = update_frequency + self.check_program_config() + self.check_trainer_runtime_config() + self.check_server_runtime_config() + self.check_build_strategy() + self.check_execute_strategy() + + def check_program_config(self): + self._program_config.sync_mode = False + self._program_config.runtime_split_send_recv = True + self._program_config.geo_sgd_mode = True + + def check_trainer_runtime_config(self): + self._trainer_runtime_config.mode = DistributedMode.GEO + + self._trainer_runtime_config.runtime_configs[ + 'communicator_send_queue_size' + ] = self._program_config.geo_sgd_need_push_nums + + self._trainer_runtime_config.runtime_configs[ + 'communicator_max_merge_var_num' + ] = self._program_config.geo_sgd_need_push_nums + + def check_server_runtime_config(self): + pass + + def check_execute_strategy(self): + pass + + def check_build_strategy(self): 
+ self._build_strategy.async_mode = True + + +class StrategyFactory: + def __init_(self): + pass + + @staticmethod + def create_sync_strategy(): + return SyncStrategy() + + @staticmethod + def create_half_async_strategy(): + return HalfAsyncStrategy() + + @staticmethod + def create_async_strategy(): + return AsyncStrategy() + + @staticmethod + def create_geo_strategy(update_frequency=100): + return GeoStrategy(update_frequency) diff --git a/python/paddle/incubate/fleet/parameter_server/ir/heter_trainer_pass.py b/python/paddle/incubate/fleet/parameter_server/ir/heter_trainer_pass.py new file mode 100644 index 0000000000000000000000000000000000000000..40dea3e7f1eab5ee1c6ddccf91c8a5ecce7b3764 --- /dev/null +++ b/python/paddle/incubate/fleet/parameter_server/ir/heter_trainer_pass.py @@ -0,0 +1,79 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import warnings + +import paddle +from paddle.fluid.incubate.fleet.parameter_server.ir.trainer_pass import ( + create_heter_program, + create_trainer_program, + find_block_joints, + find_heter_ops, + union_forward_gradient_op, +) + + +def split_heter_worker_ops_pass(program, config, stage_id, device): + """ + split heter worker program from origin-program + 1. find heter op (located on different device) + 2. find input&output of every heter-block + 3. 
create heter worker program, add listen&serv op + """ + default_deveice = "cpu" + program, heter_ops, _, program_block_ops = find_heter_ops( + program, default_deveice + ) + if len(heter_ops) == 0: + warnings.warn( + "Currently running in Heter Parameter Server mode, but no OP running on heterogeneous devices, Please check your code." + ) + return program + + program_block_ops = union_forward_gradient_op(program_block_ops) + block_vars_detail = find_block_joints(program, program_block_ops, heter_ops) + heter_program = paddle.static.Program() + create_heter_program( + program, + config, + heter_program, + program_block_ops, + heter_ops, + block_vars_detail, + device, + stage_id, + ) + return heter_program + + +def split_trainer_ops_pass(program, config, default_device="cpu"): + """ + split cpu-trainer program from origin-program + 1. find heter op (located on different device) + 2. find input&output of every heter-block + 3. create cpu-trainer program, add send&recv op + """ + # Todo: support user define default_device (MrChengmo) + default_device_ = default_device + program, heter_ops, default_ops, program_block_ops = find_heter_ops( + program, default_device_ + ) + program_block_ops = union_forward_gradient_op(program_block_ops) + + block_vars_detail = find_block_joints(program, program_block_ops, heter_ops) + trainer_program = program.clone() + create_trainer_program( + trainer_program, program, config, program_block_ops, block_vars_detail + ) + return trainer_program diff --git a/python/paddle/incubate/fleet/parameter_server/ir/ps_dispatcher.py b/python/paddle/incubate/fleet/parameter_server/ir/ps_dispatcher.py new file mode 100644 index 0000000000000000000000000000000000000000..4155413cb59f9b4745522ef86fe56aa83e7ebb75 --- /dev/null +++ b/python/paddle/incubate/fleet/parameter_server/ir/ps_dispatcher.py @@ -0,0 +1,123 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


class PSDispatcher:
    """
    PSDispatcher is the base class for dispatching vars
    into different pserver instance.
    You need to implement the `dispatch` interface.
    """

    def __init__(self, pserver_endpoints):
        # List of "ip:port" endpoint strings; `_step` is the cursor used by
        # round-robin style subclasses.
        self._eps = pserver_endpoints
        self._step = 0

    @property
    def eps(self):
        return self._eps

    def reset(self):
        """
        reset the step counter, set it zero.
        """
        self._step = 0

    def dispatch(self, varlist):
        """
        Args:
            varlist(list): a list of Variables
        Returns:
            a map of pserver endpoint -> varname
        """
        raise NotImplementedError("Interface has not been implemented.")


class HashName(PSDispatcher):
    """
    Hash variable names to several endpoints using python
    "hash()" function.

    Args:
        pserver_endpoints (list): list of endpoint(ip:port).

    Examples:
        .. code-block:: python

            pserver_endpoints = ["127.0.0.1:6007", "127.0.0.1:6008"]
            vars = ["var1","var2","var3","var4","var5"]

            # DOC FIX: the original example wrongly instantiated RoundRobin here.
            hash_dispatcher = HashName(pserver_endpoints)
            hash_dispatcher.dispatch(vars)

    """

    def __init__(self, pserver_endpoints):
        super().__init__(pserver_endpoints)

    def _hash_block(self, block_str, total):
        # NOTE: str hash is salted per interpreter run (PYTHONHASHSEED), so
        # the mapping is only stable within a single process.
        return hash(block_str) % total

    def dispatch(self, varlist):
        """
        use `HashName` method to dispatch variables with each parameter server.
        Args:
            varlist (list): a list of Variables

        """
        eplist = []
        for var in varlist:
            # NOTE(review): `var.name` is invoked as a callable here —
            # confirm callers pass objects whose `name` is a method.
            server_id = self._hash_block(var.name(), len(self._eps))
            server_for_param = self._eps[server_id]
            eplist.append(server_for_param)
        return eplist


class RoundRobin(PSDispatcher):
    """
    Distribute variables to several endpoints using
    RondRobin method.

    Args:
        pserver_endpoints (list): list of endpoint(ip:port).

    Examples:
        .. code-block:: python

            pserver_endpoints = ["127.0.0.1:6007", "127.0.0.1:6008"]
            vars = ["var1","var2","var3","var4","var5"]

            rr = RoundRobin(pserver_endpoints)
            rr.dispatch(vars)

    """

    def __init__(self, pserver_endpoints):
        super().__init__(pserver_endpoints)

    def dispatch(self, varlist):
        """
        use `RoundRobin` method to dispatch variables with each parameter server.
        Args:
            varlist (list): a list of Variables

        """
        eplist = []
        for var in varlist:
            # The cursor persists across calls; `reset()` rewinds it.
            server_for_param = self._eps[self._step]
            eplist.append(server_for_param)
            self._step += 1
            if self._step >= len(self._eps):
                self._step = 0
        return eplist
# See the License for the specific language governing permissions and
# limitations under the License.

import collections

from paddle.fluid.incubate.fleet.parameter_server.ir.public import (
    _get_lr_ops,
    _get_optimize_ops,
    _get_varname_parts,
    _orig_varname,
    get_sparse_tablename,
    get_sparse_tablenames,
    is_distributed_sparse_op,
)
from paddle.framework import core

# Name of the counter variable consumed by learning-rate decay schedules.
LEARNING_RATE_DECAY_COUNTER = "@LR_DECAY_COUNTER@"
# Op attribute names/values used to tag optimizer and LR-schedule ops.
OP_ROLE_VAR_ATTR_NAME = core.op_proto_and_checker_maker.kOpRoleVarAttrName()
RPC_OP_ROLE_ATTR_NAME = core.op_proto_and_checker_maker.kOpRoleAttrName()
OPT_OP_ROLE_ATTR_VALUE = core.op_proto_and_checker_maker.OpRole.Optimize
LR_SCHED_OP_ROLE_ATTR_VALUE = core.op_proto_and_checker_maker.OpRole.LRSched


def _is_optimizer_op(op):
    # An op counts as an optimizer update if it consumes both a parameter
    # and a learning rate (e.g. sgd, adam, momentum).
    if "Param" in op.input_names and "LearningRate" in op.input_names:
        return True
    return False


def _same_or_split_var(p_name, var_name):
    # True when p_name is var_name itself or one of its split shards
    # (shards are suffixed ".blockN" by the transpiler).
    return p_name == var_name or p_name.startswith(var_name + ".block")


def _get_optimizer_input_shape(op_type, varkey, orig_shape, param_shape):
    """
    Returns the shape for optimizer inputs that need to be reshaped when
    Param and Grad is split to multiple servers.
    """
    # HACK(typhoonzero) : Should use functions of corresponding optimizer in
    # optimizer.py to get the shape, do not bind this in the transpiler.
    # Per-optimizer: accumulator tensors that mirror the parameter adopt the
    # (possibly split) parameter shape; everything else keeps orig_shape.
    if op_type == "adam":
        if varkey in ["Moment1", "Moment2"]:
            return param_shape
    elif op_type == "adagrad":
        if varkey == "Moment":
            return param_shape
    elif op_type == "adamax":
        if varkey in ["Moment", "InfNorm"]:
            return param_shape
    elif op_type in ["momentum", "lars_momentum"]:
        if varkey == "Velocity":
            return param_shape
    elif op_type == "rmsprop":
        if varkey in ["Moment", "MeanSquare"]:
            return param_shape
    elif op_type == "decayed_adagrad":
        if varkey == "Moment":
            return param_shape
    elif op_type == "ftrl":
        if varkey in ["SquaredAccumulator", "LinearAccumulator"]:
            return param_shape
    elif op_type == "sgd":
        pass
    else:
        raise ValueError(
            "Not supported optimizer for distributed training: %s" % op_type
        )
    return orig_shape


def _append_pserver_non_opt_ops(optimize_block, opt_op, origin_program, config):
    # Copy a non-optimizer op (e.g. clipping, weight decay) from the origin
    # program into the pserver's optimize_block, remapping each input/output
    # var to its pserver-side (possibly split) counterpart.
    def _get_pserver_grad_param_var(var, var_dict):
        """
        Return pserver side grad/param variable, return None
        if the variable is not grad/param, e.g.

        a@GRAD -> a@GRAD.block0
        a@GRAD -> a@GRAD (a is not split)
        fc_0.w_0 -> fc_0.w_0.block_0
        fc_0.w_0 -> fc_0.w_0 (weight is not split)
        _generated_var_123 -> None
        """

        grad_block = None
        for _, g in var_dict.items():
            if _orig_varname(g.name) == _orig_varname(var.name):
                # skip per trainer vars
                if g.name.find(".trainer_") == -1:
                    # only param or grads have split blocks
                    ovar_name = _orig_varname(g.name)
                    if ovar_name in config.param_grad_ep_mapping:
                        grad_block = g
                        break
                    elif ovar_name in config.grad_param_mapping:
                        grad_block = g
                        break

        return grad_block

    program = optimize_block.program
    # Append the ops for parameters that do not need to be optimized / updated
    inputs = _get_input_map_from_op(origin_program.global_block().vars, opt_op)
    for key, varlist in inputs.items():
        if not isinstance(varlist, list):
            varlist = [varlist]
        for i in range(len(varlist)):
            var = varlist[i]
            # for ops like clipping and weight decay, get the split var(xxx.block0)
            # for inputs / outputs
            grad_block = _get_pserver_grad_param_var(
                var, program.global_block().vars
            )
            if grad_block:
                varlist[i] = grad_block
            elif var.name not in program.global_block().vars:
                # Var not yet present on the pserver program: clone it over.
                tmpvar = program.global_block()._clone_variable(var)
                varlist[i] = tmpvar
            else:
                varlist[i] = program.global_block().vars[var.name]
        inputs[key] = varlist

    # Same remapping for the op's outputs.
    outputs = _get_output_map_from_op(
        origin_program.global_block().vars, opt_op
    )
    for key, varlist in outputs.items():
        if not isinstance(varlist, list):
            varlist = [varlist]
        for i in range(len(varlist)):
            var = varlist[i]
            grad_block = _get_pserver_grad_param_var(
                var, program.global_block().vars
            )
            if grad_block:
                varlist[i] = grad_block
            elif var.name not in program.global_block().vars:
                tmpvar = program.global_block()._clone_variable(var)
                varlist[i] = tmpvar
            else:
                varlist[i] = program.global_block().vars[var.name]
        outputs[key] = varlist

    return optimize_block.append_op(
        type=opt_op.type,
        inputs=inputs,
        outputs=outputs,
        attrs=opt_op.all_attrs(),
    )


def _append_pserver_ops(
    optimize_block,
    opt_op,
    endpoint,
    grad_to_block_id,
    origin_program,
    merged_var,
    sparse_grad_to_param,
    config,
):
    # Append an optimizer op to the pserver's optimize_block, rebuilding its
    # Grad/Param/LearningRate/accumulator inputs as pserver-side variables
    # sized for this endpoint's parameter shard.  Returns early (appending
    # nothing) when this endpoint does not host the op's parameter.
    program = optimize_block.program
    pserver_block = program.global_block()
    new_inputs = collections.OrderedDict()

    def _get_param_block(opt_op):
        # param is already created on global program
        # Resolve the op's "Param" input to the shard this endpoint hosts,
        # distinguishing merged variables from unmerged ones.
        unmerged_vars = []
        merged_vars = []
        merged_ordervars = []

        param_vars = [
            p for p in config.param_grad_ep_mapping[endpoint]["params"]
        ]

        for var in param_vars:
            name = var.name
            orig_varname = _orig_varname(name)

            for pairs in config.merged_variables_pairs:
                merged_p = pairs[0]
                if merged_p.merged_var.name == orig_varname:
                    if (
                        merged_p.merged_var.name
                        == merged_p.ordered_vars[0].name
                    ):
                        unmerged_vars.append(merged_p.ordered_vars[0])
                    else:
                        merged_vars.append(merged_p.merged_var)
                        merged_ordervars.append(merged_p.ordered_vars[0])
                    break

        param_name = opt_op.input("Param")[0]

        for i in range(len(unmerged_vars)):
            if _same_or_split_var(param_name, unmerged_vars[i].name):
                for var in param_vars:
                    if _same_or_split_var(var.name, unmerged_vars[i].name):
                        return var

        for i in range(len(merged_ordervars)):
            if _same_or_split_var(param_name, merged_ordervars[i].name):
                for var in param_vars:
                    if _same_or_split_var(var.name, merged_vars[i].name):
                        return var
        return None

    for key in opt_op.input_names:
        if key == "Grad":
            # Note !!This is for l2decay on sparse gradient, \
            # because it will create a new tensor for
            # decayed gradient but not inplace modify the origin one
            origin_grad_name = opt_op.input(key)[0]
            if (
                core.kNewGradSuffix() in origin_grad_name
                and pserver_block.has_var(origin_grad_name)
            ):
                new_grad = pserver_block.var(origin_grad_name)
                new_inputs[key] = new_grad
            else:
                new_inputs[key] = merged_var
        elif key == "Param":
            param_block = _get_param_block(opt_op)

            # This endpoint doesn't own the parameter: skip the whole op.
            if not param_block:
                return
            tmpvar = pserver_block.create_var(
                name=param_block.name,
                persistable=True,
                dtype=param_block.dtype,
                shape=param_block.shape,
            )
            new_inputs[key] = tmpvar

        elif key == "LearningRate":
            # learning rate variable has already be created by non - optimize op,
            # don't create it once again.
            lr_varname = opt_op.input(key)[0]
            if lr_varname in pserver_block.vars:
                new_inputs[key] = pserver_block.vars[opt_op.input(key)[0]]
            else:
                origin_var = origin_program.global_block().vars[lr_varname]
                tmpvar = pserver_block.create_var(
                    name=origin_var.name,
                    persistable=origin_var.persistable,
                    dtype=origin_var.dtype,
                    shape=origin_var.shape,
                )
                new_inputs[key] = tmpvar

    # Second pass: recreate accumulator inputs (moments, velocities, ...)
    # with shapes adjusted to the (possibly split) parameter shard.
    for key in opt_op.input_names:
        new_shape = None
        if key in [
            "Param",
            "Grad",
            "LearningRate",
            "MasterParam",
            "Beta1Tensor",
            "Beta2Tensor",
        ]:
            continue
        var = origin_program.global_block().vars[opt_op.input(key)[0]]
        param_var = new_inputs["Param"]
        # update accumulator variable shape
        new_shape = _get_optimizer_input_shape(
            opt_op.type, key, var.shape, param_var.shape
        )
        tmpvar = pserver_block.create_var(
            name=var.name,
            persistable=var.persistable,
            dtype=var.dtype,
            shape=new_shape,
        )
        new_inputs[key] = tmpvar

    # change output's ParamOut variable
    outputs = _get_output_map_from_op(
        origin_program.global_block().vars, opt_op
    )
    outputs["ParamOut"] = new_inputs["Param"]
    optimize_block.append_op(
        type=opt_op.type,
        inputs=new_inputs,
        outputs=outputs,
        attrs=opt_op.all_attrs(),
    )

    # record sparse grad to param name
    if new_inputs["Grad"].type == core.VarDesc.VarType.SELECTED_ROWS:
        sparse_grad_to_param.append(
            str(new_inputs["Grad"].name) + ":" + str(new_inputs["Param"].name)
        )


def _get_input_map_from_op(varmap, op):
    """Returns a dict from op input name to the vars in varmap."""
    # (definition continues beyond this chunk)
    iomap = collections.OrderedDict()
    for key in op.input_names:
        vars = []
for varname in op.input(key): + vars.append(varmap[varname]) + if len(vars) == 1: + iomap[key] = vars[0] + else: + iomap[key] = vars + return iomap + + +def _get_output_map_from_op(varmap, op): + """Returns a dict from op output name to the vars in varmap.""" + iomap = collections.OrderedDict() + for key in op.output_names: + vars = [] + for varname in op.output(key): + vars.append(varmap[varname]) + if len(vars) == 1: + iomap[key] = vars[0] + else: + iomap[key] = vars + return iomap + + +def get_op_by_type(block, op_type): + for op in block.ops: + if op.type == op_type: + return op + raise ValueError("add_listen_and_serv_pass must at first") + + +def add_listen_and_serv_pass(program, config): + attrs = { + "grad_to_block_id": None, + "sparse_grad_to_param": None, + "lr_decay_block_id": None, + "dense_optimize_blocks": None, + "sparse_optimize_blocks": None, + # runtime attribute + "endpoint": config.get_ps_endpoint(), + "pserver_id": config.get_role_id(), + "Fanin": config.get_trainers(), + "distributed_mode": config.get_distributed_mode(), + "rpc_get_thread_num": -1, + "rpc_send_thread_num": -1, + "rpc_prefetch_thread_num": -1, + } + + # step5 append the listen_and_serv op + program.global_block().append_op( + type="listen_and_serv", inputs={'X': []}, outputs={}, attrs=attrs + ) + + return program + + +def add_rpc_global_flags_pass(program, config): + server_runtime = config.get_server_runtime_config() + send_threads = server_runtime._rpc_send_thread_num + get_threads = server_runtime._rpc_get_thread_num + pull_threads = server_runtime._rpc_prefetch_thread_num + + op = get_op_by_type(program.global_block(), "listen_and_serv") + + if get_threads < 1 or send_threads < 1 or pull_threads < 1: + raise ValueError( + "error arguments in get_threads/send_threads/pull_threads" + ) + + op._set_attr("rpc_get_thread_num", get_threads) + op._set_attr("rpc_send_thread_num", send_threads) + op._set_attr("rpc_prefetch_thread_num", pull_threads) + + return program + + +def 
_clone_var(block, var, persistable=True): + return block.create_var( + name=var.name, + shape=var.shape, + dtype=var.dtype, + type=var.type, + lod_level=var.lod_level, + persistable=persistable, + ) + + +def add_optimizer_pass(program, config): + def _append_pserver_grad_merge_ops( + optimize_block, grad_varname_for_block, endpoint, grad_to_block_id + ): + trainers = config.get_trainers() + + program = optimize_block.program + pserver_block = program.global_block() + grad_block = None + + for g in config.param_grad_ep_mapping[endpoint]["grads"]: + if _orig_varname(g.name) == _orig_varname(grad_varname_for_block): + grad_block = g + break + + if not grad_block: + # do not append this op if current endpoint + # is not dealing with this grad block + return None + + orig_varname, block_name, trainer_name = _get_varname_parts( + grad_block.name + ) + + if block_name: + merged_var_name = '.'.join([orig_varname, block_name]) + else: + merged_var_name = orig_varname + + merged_var = pserver_block.create_var( + name=grad_block.name, + persistable=True, + type=grad_block.type, + dtype=grad_block.dtype, + shape=grad_block.shape, + ) + + grad_to_block_id.append(merged_var.name + ":" + str(optimize_block.idx)) + if config.is_sync_mode() and trainers > 1: + vars2merge = [] + for i in range(trainers): + per_trainer_name = "%s.trainer_%d" % (merged_var_name, i) + per_trainer_var = pserver_block.create_var( + name=per_trainer_name, + persistable=False, + type=grad_block.type, + dtype=grad_block.dtype, + shape=grad_block.shape, + ) + vars2merge.append(per_trainer_var) + + optimize_block.append_op( + type="sum", + inputs={"X": vars2merge}, + outputs={"Out": merged_var}, + attrs={"use_mkldnn": False}, + ) + optimize_block.append_op( + type="scale", + inputs={"X": merged_var}, + outputs={"Out": merged_var}, + attrs={"scale": 1.0 / float(trainers)}, + ) + return merged_var + + origin_program = config.get_origin_main_program() + origin_program = origin_program.clone() + ps_endpoint = 
config.get_ps_endpoint() + + opt_op_on_pserver = [] + # Iterate through the ops, and if an op and the optimize ops + # which located on current pserver are in one set, then + # append it into the sub program. + global_ops = [] + # sparse grad name to param name + sparse_grad_to_param = [] + + def _is_opt_op_on_pserver(endpoint, op): + param_names = [ + p.name for p in config.param_grad_ep_mapping[endpoint]["params"] + ] + + unmerged_varnames = [] + merged_varnames = [] + merged_ordernames = [] + + for name in param_names: + orig_varname = _orig_varname(name) + + for pairs in config.merged_variables_pairs: + merged_p = pairs[0] + if merged_p.merged_var.name == orig_varname: + if ( + merged_p.merged_var.name + == merged_p.ordered_vars[0].name + ): + unmerged_varnames.append(merged_p.ordered_vars[0].name) + else: + merged_varnames.append(merged_p.merged_var.name) + merged_ordernames.append(merged_p.ordered_vars[0].name) + break + + param = op.input("Param")[0] + + if param in unmerged_varnames: + return True + + for i in range(len(merged_ordernames)): + if param == merged_ordernames[i]: + merged_p = merged_varnames[i] + merged_g = "{}@GRAD".format(merged_varnames[i]) + op._set_attr(OP_ROLE_VAR_ATTR_NAME, [merged_p, merged_g]) + return True + return False + + def __append_optimize_op__(op, block, grad_to_block_id, merged_var, lr_ops): + if _is_optimizer_op(op): + _append_pserver_ops( + block, + op, + ps_endpoint, + grad_to_block_id, + origin_program, + merged_var, + sparse_grad_to_param, + config, + ) + elif op not in lr_ops: + _append_pserver_non_opt_ops(block, op, origin_program, config) + + optimize_ops = _get_optimize_ops(origin_program) + for _, op in enumerate(optimize_ops): + if _is_optimizer_op(op) and _is_opt_op_on_pserver(ps_endpoint, op): + opt_op_on_pserver.append(op) + + # append lr decay ops to the child block if exists + lr_ops = _get_lr_ops(origin_program) + has_lr_decay = True if len(lr_ops) > 0 else False + lr_decay_block_id = -1 + optimize_blocks = 
[]
+
+    if has_lr_decay > 0:
+        counter_increment_idx = -1
+        for idx, op in enumerate(lr_ops):
+            if op.type != 'increment':
+                continue
+            counter = op.input("X")[0]
+            if counter == LEARNING_RATE_DECAY_COUNTER:
+                counter_increment_idx = idx
+                break
+
+        if counter_increment_idx != -1:
+            lr_ops.pop(counter_increment_idx)
+
+        lr_decay_block = program._create_block(program.num_blocks - 1)
+        optimize_blocks.append(lr_decay_block)
+        for op in lr_ops:
+            cloned_op = _append_pserver_non_opt_ops(
+                lr_decay_block, op, origin_program, config
+            )
+            # append sub blocks to pserver_program in lr_decay_op
+            # TODO(tangwei12): __clone_lr_op_sub_block__
+        lr_decay_block_id = lr_decay_block.idx
+
+    # append op to the current block
+    grad_to_block_id = []
+    pre_block_idx = program.num_blocks - 1
+
+    for idx, opt_op in enumerate(opt_op_on_pserver):
+        per_opt_block = program._create_block(pre_block_idx)
+        optimize_blocks.append(per_opt_block)
+        optimize_target_param_name = opt_op.attr(OP_ROLE_VAR_ATTR_NAME)[0]
+        # append grad merging ops before clip and weight decay
+        # e.g. merge grad -> L2Decay op -> clip op -> optimize
+        merged_var = None
+        for _, op in enumerate(optimize_ops):
+            # find the origin grad var before clipping / L2Decay,
+            # merged_var should be the input var name of L2Decay
+            grad_varname_for_block = op.attr(OP_ROLE_VAR_ATTR_NAME)[1]
+            if op.attr(OP_ROLE_VAR_ATTR_NAME)[0] == optimize_target_param_name:
+                merged_var = _append_pserver_grad_merge_ops(
+                    per_opt_block,
+                    grad_varname_for_block,
+                    ps_endpoint,
+                    grad_to_block_id,
+                )
+            if merged_var:
+                break  # append optimize op once then append other ops.
+ + if merged_var: + for _, op in enumerate(optimize_ops): + # optimizer is connected to itself + if ( + op.attr(OP_ROLE_VAR_ATTR_NAME)[0] + == optimize_target_param_name + and op not in global_ops + ): + __append_optimize_op__( + op, per_opt_block, grad_to_block_id, merged_var, lr_ops + ) + + # dedup grad to ids list + grad_to_block_id = list(set(grad_to_block_id)) + # append global ops + if global_ops: + opt_state_block = program._create_block(program.num_blocks - 1) + optimize_blocks.append(opt_state_block) + for glb_op in global_ops: + __append_optimize_op__( + glb_op, opt_state_block, grad_to_block_id, None, lr_ops + ) + + if len(optimize_blocks) == 0: + pre_block_idx = program.num_blocks - 1 + empty_block = program._create_block(pre_block_idx) + optimize_blocks.append(empty_block) + + op = get_op_by_type(program.global_block(), "listen_and_serv") + op._set_attr("optimize_blocks", optimize_blocks) + op._set_attr("grad_to_block_id", grad_to_block_id) + op._set_attr("sparse_grad_to_param", sparse_grad_to_param) + op._set_attr("lr_decay_block_id", lr_decay_block_id) + return program + + +def large_scale_sparse_pass(program, main_program, config, is_startup=False): + opt_value_map = {} + opt_value_map["sgd"] = ["Param"] + opt_value_map["adam"] = ["Param", "Moment1", "Moment2"] + opt_value_map["adagrad"] = ["Param", "Moment"] + opt_value_map["adamax"] = ["Param", "Moment", "InfNorm"] + opt_value_map["momentum"] = ["Param", "Velocity"] + opt_value_map["lars_momentum"] = ["Param", "Velocity"] + opt_value_map["rmsprop"] = ["Param", "Moment", "MeanSquare"] + opt_value_map["decayed_adagrad"] = ["Param", "Moment"] + opt_value_map["ftrl"] = ["Param", "SquaredAccumulator", "LinearAccumulator"] + + geo_value_map = {} + geo_value_map["sum"] = "Param" + + opt_init_map = {} + opt_init_map["gaussian_random"] = ["seed", "mean", "std"] + opt_init_map["fill_constant"] = ["value"] + opt_init_map["uniform_random"] = ["seed", "min", "max"] + opt_init_map["truncated_gaussian_random"] 
= ["seed", "mean", "std"] + + def get_entry_attr(param_name): + origin_name = _orig_varname(param_name) + o_main_program = config.get_origin_main_program() + for op in o_main_program.global_block().ops: + if ( + is_distributed_sparse_op(op) + and get_sparse_tablename(op) == origin_name + ): + entry = op.attr("entry") + return entry + + def get_initializer_attrs(acture_value_names): + l_sep = "," + l_in = "&" + init_attrs = [] + o_startup_program = config.get_origin_startup_program() + + for value_name in acture_value_names: + origin_var_name = _orig_varname(value_name) + for op in o_startup_program.global_block().ops: + if ( + op.type in opt_init_map.keys() + and origin_var_name == op.output("Out")[0] + ): + init_attr = [op.type] + for attr in opt_init_map[op.type]: + init_attr.append(str(op.attr(attr))) + init_attrs.append(l_in.join(init_attr)) + break + + return l_sep.join(init_attrs) + + def get_optimizer_values(block): + value_names = [] + acture_names = [] + value_dims = [] + grad = None + opt_idx = -1 + fuse = False + + for op in block.ops: + opt_idx += 1 + + if op.type not in opt_value_map.keys(): + continue + + if op.type in ["sgd", "adam"]: + fuse = True + + grad = main_program.global_block().vars[op.input("Grad")[0]] + + for value in opt_value_map[op.type]: + var = main_program.global_block().vars[op.input(value)[0]] + if len(var.shape) != 2: + raise ValueError("sparse param's dimension must be 2") + + value_names.append(value) + value_dims.append(var.shape[1]) + acture_names.append(var.name) + + if value_names: + break + return grad, opt_idx, value_names, value_dims, acture_names, fuse + + def add_fuse_large_scale_op( + block, + global_block, + table_name, + value_names, + acture_names, + grad, + is_entry, + opt_idx, + ): + + op = block.ops[opt_idx] + + if op.type == "sgd": + grad = main_program.global_block().vars[op.input("Grad")[0]] + lr = main_program.global_block().vars[op.input("LearningRate")[0]] + + block._insert_op( + opt_idx, + 
type="lookup_sparse_table_fuse_sgd", + inputs={"Grad": grad, "LearningRate": lr}, + attrs={ + "is_entry": is_entry, + "tablename": table_name, + "value_names": value_names, + }, + ) + + elif op.type == "adam": + grad = main_program.global_block().vars[op.input("Grad")[0]] + lr = main_program.global_block().vars[op.input("LearningRate")[0]] + beta1_pow = main_program.global_block().vars[ + op.input("Beta1Pow")[0] + ] + beta2_pow = main_program.global_block().vars[ + op.input("Beta2Pow")[0] + ] + beta1_pow_o = main_program.global_block().vars[ + op.output("Beta1PowOut")[0] + ] + beta2_pow_o = main_program.global_block().vars[ + op.output("Beta2PowOut")[0] + ] + + beta1 = op.attr('beta1') + beta2 = op.attr('beta2') + epsilon = op.attr('epsilon') + + block._insert_op( + opt_idx, + type="lookup_sparse_table_fuse_adam", + inputs={ + "Grad": grad, + "LearningRate": lr, + "Beta1Pow": beta1_pow, + "Beta2Pow": beta2_pow, + }, + outputs={ + "Beta1PowOut": beta1_pow_o, + "Beta2PowOut": beta2_pow_o, + }, + attrs={ + "beta1": beta1, + "beta2": beta2, + "epsilon": epsilon, + "is_entry": is_entry, + "tablename": table_name, + "value_names": value_names, + }, + ) + else: + raise ValueError("only support sgd/adam optimizer now") + + def add_large_scale_op( + block, + global_block, + table_name, + value_names, + acture_names, + grad, + is_entry, + opt_idx, + ): + ids = global_block.create_var( + name="kSparseIDs@{}".format(table_name), + persistable=False, + dtype="int64", + shape=[1, 1], + lod_level=0, + ) + + # insert grad split to ids and tensor op + block._insert_op( + opt_idx, + type="lookup_sparse_table_grad_split", + inputs={"Grad": grad}, + outputs={"Row": ids, "Value": grad}, + attrs={"tablename": table_name, "is_entry": is_entry}, + ) + + # insert read at first + vars = [global_block.vars[acture_name] for acture_name in acture_names] + block._insert_op( + opt_idx + 1, + type="lookup_sparse_table_read", + inputs={"Ids": ids}, + outputs={"Out": vars}, + attrs={"tablename": 
table_name, "value_names": value_names}, + ) + + # append write at last + inputs = {"Ids": ids, "In": vars} + + block.append_op( + type="lookup_sparse_table_write", + inputs=inputs, + outputs={}, + attrs={"tablename": table_name, "value_names": value_names}, + ) + + op = get_op_by_type(main_program.global_block(), "listen_and_serv") + + param_blockid_map = {} + grad_blockid_map = {} + grad_to_params = op.attr('sparse_grad_to_param') + grad_to_block_ids = op.attr('grad_to_block_id') + + origin_program = config.get_origin_main_program() + sparse_varnames = get_sparse_tablenames(origin_program, False) + + for grad_to_block_id in grad_to_block_ids: + grad, blockid = grad_to_block_id.split(":") + grad_blockid_map[grad] = int(blockid) + + for grad_to_param in grad_to_params: + grad, param = grad_to_param.split(":") + + if _orig_varname(param) in sparse_varnames: + continue + + param_blockid_map[param] = grad_blockid_map[grad] + + if not is_startup: + for param, blockid in param_blockid_map.items(): + opt_block = program.block(blockid) + + ( + grad, + opt_idx, + value_names, + value_dims, + acture_names, + fuse, + ) = get_optimizer_values(opt_block) + + entry_attr = get_entry_attr(param) + is_entry = False if entry_attr == "none" else True + + if fuse: + add_fuse_large_scale_op( + opt_block, + program.global_block(), + param, + value_names, + acture_names, + grad, + is_entry, + opt_idx, + ) + else: + add_large_scale_op( + opt_block, + program.global_block(), + param, + value_names, + acture_names, + grad, + is_entry, + opt_idx, + ) + else: + large_scale_kv_metas = [] + for param, blockid in param_blockid_map.items(): + opt_block = main_program.block(blockid) + + ( + grad, + opt_idx, + value_names, + value_dims, + acture_names, + fuse, + ) = get_optimizer_values(opt_block) + + entry_attr = get_entry_attr(param) + + if fuse: + # remove origin optimzier op + opt_block._remove_op(opt_idx) + + # training/infer + mode = "0" + names_str = ",".join(value_names) + dims_str = 
",".join([str(dim) for dim in value_dims]) + ids_name = "kSparseIDs@{}".format(param) + cached_str = ",".join(acture_names + [ids_name]) + init_attr_str = get_initializer_attrs(acture_names) + + meta_str = ":".join( + [ + param, + names_str, + dims_str, + mode, + grad.name, + cached_str, + init_attr_str, + entry_attr, + ] + ) + print("large_scale_metas: {}".format(meta_str)) + large_scale_kv_metas.append(meta_str) + + program.global_block().append_op( + type="lookup_sparse_table_init", + inputs=None, + outputs=None, + attrs={"large_scale_metas": large_scale_kv_metas}, + ) + + # todo: need delete unused var. + return program + + +def get_distributed_from_listen_and_serv(program, origin_program): + op = get_op_by_type(program.global_block(), "listen_and_serv") + sparse_varnames = get_sparse_tablenames(origin_program, True) + sparse_params = [] + grad_to_params = op.attr('sparse_grad_to_param') + for grad_to_param in grad_to_params: + _, param = grad_to_param.split(":") + if _orig_varname(param) in sparse_varnames: + sparse_params.append(param) + return sparse_params + + +def delete_unused_in_main_pass(program, config): + origin_program = config.get_origin_main_program() + sparse_params = get_distributed_from_listen_and_serv( + program, origin_program + ) + + for var in sparse_params: + if program.global_block().has_var(var): + program.global_block()._remove_var(var) + return program + + +def delete_unused_in_startup_pass(program, main_program, config): + origin_program = config.get_origin_main_program() + sparse_params = get_distributed_from_listen_and_serv( + main_program, origin_program + ) + remove_ops = [] + + for op in program.global_block().ops: + if op.type in ["recv", "fetch_barrier", "concat"]: + continue + + for key in op.output_names: + if op.output(key)[0] in sparse_params: + remove_ops.append(op) + + all_ops = program.global_block().ops + op_idxs = [all_ops.index(op) for op in remove_ops] + + for idx in op_idxs[::-1]: + 
program.global_block()._remove_op(idx) + + for var in sparse_params: + if program.global_block().has_var(var): + program.global_block()._remove_var(var) + + return program + + +def build_pserver_startup_program_pass(program, p_main_program, config): + ps_endpoint = config.get_ps_endpoint() + o_startup_program = config.get_origin_startup_program() + program.random_seed = o_startup_program.random_seed + params = config.param_grad_ep_mapping[ps_endpoint]["params"] + merged_ordervars = [] + + for var in params: + name = var.name + orig_varname = _orig_varname(name) + + for pairs in config.merged_variables_pairs: + merged_p = pairs[0] + if merged_p.merged_var.name == orig_varname: + if merged_p.merged_var.name != merged_p.ordered_vars[0].name: + merged_ordervars.append(merged_p.ordered_vars[0]) + break + + def _get_splited_name_and_shape(varname): + for splited_param in params: + pname = splited_param.name + if _same_or_split_var(pname, varname) and varname != pname: + return pname, splited_param.shape + + for idx, ordered in enumerate(merged_ordervars): + if _same_or_split_var(varname, ordered.name): + return pname, splited_param.shape + + return "", [] + + # 1. create vars in pserver program to startup program + pserver_vars = p_main_program.global_block().vars + + created_var_map = collections.OrderedDict() + for _, var in pserver_vars.items(): + tmpvar = program.global_block()._clone_variable(var) + created_var_map[var.name] = tmpvar + + # 2. rename op outputs + for op in o_startup_program.global_block().ops: + new_outputs = collections.OrderedDict() + # do not append startup op if var is not on this pserver + op_on_pserver = False + # TODO(gongwb) : remove this line. 
+ if op.type not in ["recv", "fetch_barrier", "concat"]: + for key in op.output_names: + newname, _ = _get_splited_name_and_shape(op.output(key)[0]) + if newname: + op_on_pserver = True + new_outputs[key] = created_var_map[newname] + elif op.output(key)[0] in pserver_vars: + op_on_pserver = True + new_outputs[key] = pserver_vars[op.output(key)[0]] + + if op_on_pserver: + # most startup program ops have no inputs + new_inputs = _get_input_map_from_op(pserver_vars, op) + + if op.type in [ + "gaussian_random", + "fill_constant", + "uniform_random", + "truncated_gaussian_random", + ]: + op._set_attr("shape", list(new_outputs["Out"].shape)) + + program.global_block().append_op( + type=op.type, + inputs=new_inputs, + outputs=new_outputs, + attrs=op.all_attrs(), + ) + + return program + + +def add_geo_optimizer_pass(program, config): + endpoint = config.get_ps_endpoint() + params = [p for p in config.param_grad_ep_mapping[endpoint]["params"]] + + sparse_tablenames = get_sparse_tablenames( + config.get_origin_main_program(), False + ) + + for param in params: + _clone_var(program.global_block(), param) + + optimize_block = [] + sparse_grad_to_param = [] + param_to_block_id = [] + pre_block_idx = program.num_blocks - 1 + + for param in params: + per_opt_block = program._create_block(pre_block_idx) + optimize_block.append(per_opt_block) + var_name = param.name + pserver_block = per_opt_block.program.global_block() + param = pserver_block.vars[var_name] + + delta_var_name = "%s.delta" % (param.name) + origin_varname = _orig_varname(param.name) + + if origin_varname in sparse_tablenames: + sparse_grad_to_param.append(":".join([delta_var_name, param.name])) + + delta_var = pserver_block.create_var( + name=delta_var_name, + persistable=False, + type=param.type, + dtype=param.dtype, + shape=param.shape, + ) + + per_opt_block.append_op( + type="sum", inputs={"X": [param, delta_var]}, outputs={"Out": param} + ) + + param_to_block_id.append(delta_var_name + ":" + 
str(per_opt_block.idx)) + + op = get_op_by_type(program.global_block(), "listen_and_serv") + op._set_attr("optimize_blocks", optimize_block) + op._set_attr("grad_to_block_id", param_to_block_id) + op._set_attr("sparse_grad_to_param", sparse_grad_to_param) + + return program diff --git a/python/paddle/incubate/fleet/parameter_server/ir/public.py b/python/paddle/incubate/fleet/parameter_server/ir/public.py new file mode 100755 index 0000000000000000000000000000000000000000..ac9d2e6b58d027b7d1be5e4b5fadb9204ce24985 --- /dev/null +++ b/python/paddle/incubate/fleet/parameter_server/ir/public.py @@ -0,0 +1,1507 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import collections +import logging +import math +import warnings +from functools import reduce + +import paddle +from paddle.fluid.incubate.fleet.parameter_server.ir import vars_metatools +from paddle.fluid.incubate.fleet.parameter_server.ir.ps_dispatcher import ( + RoundRobin, +) +from paddle.fluid.incubate.fleet.parameter_server.mode import DistributedMode +from paddle.framework import core + +OP_NAME_SCOPE = "op_namescope" +CLIP_OP_NAME_SCOPE = "gradient_clip" +STEP_COUNTER = "@PS_STEP_COUNTER@" +LEARNING_RATE_DECAY_COUNTER = "@LR_DECAY_COUNTER@" + +OP_ROLE_VAR_ATTR_NAME = core.op_proto_and_checker_maker.kOpRoleVarAttrName() +RPC_OP_ROLE_ATTR_NAME = core.op_proto_and_checker_maker.kOpRoleAttrName() +RPC_OP_ROLE_ATTR_VALUE = core.op_proto_and_checker_maker.OpRole.RPC +op_role_attr_name = core.op_proto_and_checker_maker.kOpRoleAttrName() +LR_SCHED_OP_ROLE_ATTR_VALUE = core.op_proto_and_checker_maker.OpRole.LRSched +OPT_OP_ROLE_ATTR_VALUE = core.op_proto_and_checker_maker.OpRole.Optimize + +SPARSE_OP_LIST = ["lookup_table", "lookup_table_v2"] +SPARSE_OP_TYPE_DICT = {"lookup_table": "W", "lookup_table_v2": "W"} + + +def _get_lr_ops(program): + lr_ops = [] + for index, op in enumerate(program.global_block().ops): + role_id = int(op.attr(RPC_OP_ROLE_ATTR_NAME)) + if role_id == int(LR_SCHED_OP_ROLE_ATTR_VALUE) or role_id == int( + LR_SCHED_OP_ROLE_ATTR_VALUE + ) | int(OPT_OP_ROLE_ATTR_VALUE): + lr_ops.append(op) + return lr_ops + + +def _has_global_step(lr_ops): + if len(lr_ops) > 0: + for idx, op in enumerate(lr_ops): + if op.type != 'increment': + continue + counter = op.input("X")[0] + if counter == LEARNING_RATE_DECAY_COUNTER: + return True + return False + + +def is_sparse_op(op): + if ( + op.type in SPARSE_OP_LIST + and op.attr('is_sparse') is True + and op.attr('is_distributed') is False + ): + return True + + if ( + op.type == "distributed_lookup_table" + and op.attr('is_distributed') is False + ): + return True + + return False + + +def 
is_distributed_sparse_op(op): + if op.type in SPARSE_OP_LIST and op.attr('is_distributed') is True: + return True + + if ( + op.type == "distributed_lookup_table" + and op.attr('is_distributed') is True + ): + return True + + return False + + +def get_sparse_tablename(op): + return op.input("W")[0] + + +def get_sparse_tablenames(program, is_distributed): + tablenames = set() + if is_distributed: + for op in program.global_block().ops: + if is_distributed_sparse_op(op): + tablenames.add(get_sparse_tablename(op)) + else: + for op in program.global_block().ops: + if is_sparse_op(op): + tablenames.add(get_sparse_tablename(op)) + return list(tablenames) + + +class MergedVariable: + def __init__(self, merged, ordered, offsets): + self.merged_var = merged + self.ordered_vars = ordered + self.offsets = offsets + + +def Singleton(cls): + _instance = {} + + def _singleton(*args, **kargs): + if cls not in _instance: + _instance[cls] = cls(*args, **kargs) + return _instance[cls] + + return _singleton + + +@Singleton +class CompileTimeStrategy: + def __init__(self, main_program, startup_program, strategy, role_maker): + self.min_block_size = 81920 + + self.origin_main_program = main_program + self.origin_startup_program = startup_program + self.origin_ps_main_program = main_program + self.origin_ps_startup_program = startup_program + + self.strategy = strategy + self.role_maker = role_maker + self.use_ps_gpu = False + try: + self.is_heter_ps_mode = role_maker._is_heter_parameter_server_mode + except: + warnings.warn( + "Using paddle.distributed.fleet instead of paddle.fluid.incubate.fleet" + ) + self.is_heter_ps_mode = False + + self.origin_sparse_pairs = [] + self.origin_dense_pairs = [] + + self.merged_variables_pairs = [] + self.merged_dense_pairs = [] + self.merged_sparse_pairs = [] + + self.merged_variable_map = {} + self.param_name_to_grad_name = {} + self.grad_name_to_param_name = {} + + self.param_grad_ep_mapping = collections.OrderedDict() + self.grad_param_mapping = 
collections.OrderedDict() + + self._build_var_distributed() + + self.tensor_table_dict = {} + + # for heter-ps save variables + self.origin_merged_variables_pairs = list(self.merged_variables_pairs) + self.origin_merged_dense_pairs = list(self.merged_dense_pairs) + self.origin_merged_sparse_pairs = list(self.merged_sparse_pairs) + + def get_distributed_mode(self): + trainer = self.strategy.get_trainer_runtime_config() + return trainer.mode + + def is_sync_mode(self): + trainer = self.strategy.get_trainer_runtime_config() + return trainer.mode == DistributedMode.SYNC + + def is_geo_mode(self): + trainer = self.strategy.get_trainer_runtime_config() + return trainer.mode == DistributedMode.GEO + + def is_async_mode(self): + trainer = self.strategy.get_trainer_runtime_config() + return trainer.mode == DistributedMode.ASYNC + + def get_role_id(self): + try: + return self.role_maker._role_id() + except Exception: + return self.role_maker.role_id() + + def get_trainers(self): + try: + return self.role_maker._worker_num() + except Exception: + return self.role_maker.worker_num() + + def get_ps_endpoint(self): + try: + return self.role_maker._get_pserver_endpoints()[self.get_role_id()] + except Exception: + return self.role_maker.get_pserver_endpoints()[self.get_role_id()] + + def get_ps_endpoints(self): + try: + return self.role_maker._get_pserver_endpoints() + except Exception: + return self.role_maker.get_pserver_endpoints() + + def get_heter_worker_endpoints(self): + try: + return self.role_maker._get_heter_worker_endpoints() + except Exception: + return self.role_maker.get_heter_worker_endpoints() + + def get_next_stage_trainers(self): + try: + return self.role_maker._get_next_trainers() + except Exception: + return self.role_maker.get_next_trainers() + + def get_heter_worker_endpoint(self): + try: + return self.role_maker._get_heter_worker_endpoint() + except Exception: + return self.role_maker.get_heter_worker_endpoint() + + def get_trainer_endpoints(self): + try: + 
return self.role_maker._get_trainer_endpoints() + except Exception: + return self.role_maker.get_trainer_endpoints() + + def get_trainer_endpoint(self): + try: + return self.role_maker._get_trainer_endpoint() + except Exception: + return self.role_maker.get_trainer_endpoint() + + def get_previous_stage_trainers(self): + try: + return self.role_maker._get_previous_trainers() + except Exception: + return self.role_maker.get_previous_trainers() + + def get_origin_programs(self): + return self.origin_main_program, self.origin_startup_program + + def get_origin_main_program(self): + return self.origin_main_program + + def get_origin_startup_program(self): + return self.origin_startup_program + + def set_origin_ps_main_program(self, program): + self.origin_ps_main_program = program + + def set_origin_ps_startup_program(self, program): + self.origin_ps_startup_program = program + + def get_origin_ps_main_program(self): + return self.origin_ps_main_program + + def get_origin_ps_startup_program(self): + return self.origin_ps_startup_program + + def add_tensor_table( + self, + feed_var_name, + fetch_var_name="", + startup_program=None, + main_program=None, + tensor_table_class="", + ): + self.tensor_table_dict[feed_var_name] = {} + self.tensor_table_dict[feed_var_name]["feed_var_name"] = feed_var_name + self.tensor_table_dict[feed_var_name]["fetch_var_name"] = fetch_var_name + self.tensor_table_dict[feed_var_name][ + "startup_program" + ] = startup_program + self.tensor_table_dict[feed_var_name]["main_program"] = main_program + self.tensor_table_dict[feed_var_name][ + "tensor_table_class" + ] = tensor_table_class + + def get_tensor_table_dict(self): + return self.tensor_table_dict + + def get_sparse_varname_on_ps(self, is_distributed, endpoint=None): + if not endpoint: + endpoint = self.get_ps_endpoint() + varnames = get_sparse_tablenames( + self.get_origin_main_program(), is_distributed + ) + + ps_sparse_varnames = [] + for varname in varnames: + tables = 
self.get_var_distributed(varname, True) + for i in range(len(tables)): + table, ep, _ = tables[i] + if ep == endpoint: + ps_sparse_varnames.append(table) + return ps_sparse_varnames + + def get_optimize_varname_on_ps(self, param_name): + origin_param_name, _, _ = _get_varname_parts(param_name) + optimize_var_names = [] + for op in self.get_origin_main_program().global_block().ops: + # check all optimizer op + if int(op.all_attrs()["op_role"]) == 2: + # check param name + if op.input("Param")[0] != origin_param_name: + continue + # check all input + for key in op.input_names: + if key in [ + "Param", + "Grad", + "LearningRate", + "Beta1Tensor", + "Beta2Tensor", + ]: + continue + # check variable shape related param, e.g. Moment1 + optimize_var_names += ( + self._get_optimizer_param_related_var_name( + op, op.type, key + ) + ) + return optimize_var_names + + def _get_optimizer_param_related_var_name(self, op, op_type, varkey): + """ + Returns the names for optimizer inputs that need to be loaded + """ + related_var_names = [] + if op_type == "adam": + if varkey in ["Moment1", "Moment2"]: + related_var_names.append(op.input(varkey)[0]) + elif op_type == "adagrad": + if varkey == "Moment": + related_var_names.append(op.input(varkey)[0]) + elif op_type in ["momentum", "lars_momentum"]: + if varkey == "Velocity": + related_var_names.append(op.input(varkey)[0]) + elif op_type == "rmsprop": + if varkey in ["Moment", "MeanSquare"]: + related_var_names.append(op.input(varkey)[0]) + elif op_type == "ftrl": + if varkey in ["SquaredAccumulator", "LinearAccumulator"]: + related_var_names.append(op.input(varkey)[0]) + elif op_type == "sgd": + pass + else: + raise ValueError( + "Not supported optimizer for distributed training: %s" % op_type + ) + return related_var_names + + def build_ctx( + self, vars, mapping, is_grad, is_sparse, is_send, is_distributed=False + ): + def get_grad_var_ep(slices): + names = [] + eps = [] + sections = [] + + for slice in slices: + if 
self.is_geo_mode(): + if is_send: + names.append("{}.delta".format(slice.name)) + else: + names.append(slice.name) + elif ( + is_grad and self.is_sync_mode() and self.get_trainers() > 1 + ): + names.append( + "{}.trainer_{}".format(slice.name, self.get_role_id()) + ) + else: + names.append(slice.name) + + sections.append(slice.shape[0]) + + for ep, pairs in self.param_grad_ep_mapping.items(): + params, grads = pairs["params"], pairs["grads"] + + for var in params + grads: + if slice.name == var.name: + eps.append(ep) + break + return names, eps, sections + + if isinstance(vars, MergedVariable): + name = vars.merged_var.name + slices = mapping[name] + names, eps, sections = get_grad_var_ep(slices) + origin_varnames = [var.name for var in vars.ordered_vars] + else: + name = vars.name + slices = mapping[name] + names, eps, sections = get_grad_var_ep(slices) + origin_varnames = [vars.name] + + trainer_id = self.get_role_id() + aggregate = True + ctx = core.CommContext( + name, + names, + eps, + sections, + origin_varnames, + trainer_id, + aggregate, + is_sparse, + is_distributed, + [], + ) + return ctx + + def get_trainer_send_context(self): + send_ctx = {} + distibuted_varnames = get_sparse_tablenames( + self.origin_main_program, True + ) + idx = 0 + + if not self.is_geo_mode(): + for merged in self.merged_dense_pairs: + grad = merged[1] + ctx = self.build_ctx( + grad, self.grad_var_mapping, True, False, True + ) + send_ctx[ctx.var_name()] = ctx + + for merged in self.merged_sparse_pairs: + param = merged[0] + grad = merged[1] + + param_name = param.merged_var.name + + is_distributed = ( + True if param_name in distibuted_varnames else False + ) + + ctx = self.build_ctx( + grad, + self.grad_var_mapping, + True, + True, + True, + is_distributed, + ) + send_ctx[ctx.var_name()] = ctx + idx += 1 + + if self.is_async_mode(): + name, ctx = self._step_ctx(idx) + send_ctx[name] = ctx + else: + for pairs in self.origin_sparse_pairs: + param, grad = pairs + param_name = 
param.name + is_distributed = ( + True if param_name in distibuted_varnames else False + ) + + param_ctx = self.build_ctx( + param, + self.param_var_mapping, + False, + True, + True, + is_distributed, + ) + grad_ctx = self.build_ctx( + grad, + self.grad_var_mapping, + True, + True, + True, + is_distributed, + ) + + ctx = core.CommContext( + param_ctx.var_name(), + param_ctx.split_varnames(), + param_ctx.split_endpoints(), + param_ctx.sections(), + grad_ctx.origin_varnames(), + param_ctx.trainer_id(), + param_ctx.aggregate(), + param_ctx.is_sparse(), + param_ctx.is_distributed(), + [], + ) + + send_ctx[ctx.var_name()] = ctx + idx += 1 + name, ctx = self._step_ctx(idx) + send_ctx[name] = ctx + return send_ctx + + def get_communicator_send_context(self): + send_ctx = {} + distibuted_varnames = get_sparse_tablenames( + self.origin_main_program, True + ) + idx = 0 + + if self.is_geo_mode(): + for pairs in self.merged_dense_pairs: + param = pairs[0] + ctx = self.build_ctx( + param, self.param_var_mapping, False, False, True + ) + send_ctx[ctx.var_name()] = ctx + + for pairs in self.merged_sparse_pairs: + param = pairs[0] + param_name = param.merged_var.name + is_distributed = ( + True if param_name in distibuted_varnames else False + ) + + ctx = self.build_ctx( + param, + self.param_var_mapping, + False, + True, + True, + is_distributed, + ) + send_ctx[ctx.var_name()] = ctx + idx += 1 + name, ctx = self._step_ctx(idx) + send_ctx[name] = ctx + else: + for merged in self.merged_dense_pairs: + grad = merged[1] + ctx = self.build_ctx( + grad, self.grad_var_mapping, True, False, True + ) + send_ctx[ctx.var_name()] = ctx + + for merged in self.merged_sparse_pairs: + param, grad = merged + param_name = param.merged_var.name + + is_distributed = ( + True if param_name in distibuted_varnames else False + ) + + ctx = self.build_ctx( + grad, + self.grad_var_mapping, + True, + True, + True, + is_distributed, + ) + send_ctx[ctx.var_name()] = ctx + idx += 1 + + name, ctx = 
self._step_ctx(idx) + send_ctx[name] = ctx + return send_ctx + + def get_communicator_recv_context( + self, recv_type=1, use_origin_program=False + ): + # recv_type + # 1 : DENSE 2. SPARSE 3. DISTRIBUTED 4. ALL + distibuted_varnames = get_sparse_tablenames( + self.origin_main_program, True + ) + sparse_varnames = [] + for pairs in self.origin_sparse_pairs: + param, grad = pairs + sparse_varnames.append(param.name) + + dense_recv_ctx = {} + sparse_recv_ctx = {} + distributed_recv_ctx = {} + + variables_pairs = ( + self.merged_variables_pairs + if not use_origin_program + else self.origin_merged_variables_pairs + ) + for merged in variables_pairs: + params = merged[0] + if params.merged_var.name in sparse_varnames: + continue + + ctx = self.build_ctx( + params, self.param_var_mapping, False, False, False, False + ) + dense_recv_ctx[ctx.var_name()] = ctx + + for pairs in self.origin_sparse_pairs: + param, grad = pairs + + if param.name in distibuted_varnames: + ctx = self.build_ctx( + param, self.param_var_mapping, False, True, False, True + ) + distributed_recv_ctx[ctx.var_name()] = ctx + else: + ctx = self.build_ctx( + param, self.param_var_mapping, False, True, False, False + ) + sparse_recv_ctx[ctx.var_name()] = ctx + + if recv_type == 1: + return dense_recv_ctx + if recv_type == 2: + return sparse_recv_ctx + if recv_type == 3: + return distributed_recv_ctx + if recv_type == 4: + dense_recv_ctx.update(sparse_recv_ctx) + dense_recv_ctx.update(distributed_recv_ctx) + return dense_recv_ctx + assert ValueError( + "recv_type can only be 1/2/3/4, 1 : DENSE 2. SPARSE 3. DISTRIBUTED 4. 
ALL" + ) + + def get_the_one_trainer_send_context(self, split_dense_table): + if self.is_geo_mode(): + send_ctx = {} + trainer_id = self.get_role_id() + idx = 0 + + distibuted_varnames = get_sparse_tablenames( + self.origin_main_program, True + ) + for merged in self.merged_sparse_pairs: + param, grad = merged + grad_name = grad.merged_var.name + param_name = param.merged_var.name + is_distributed = ( + True if param_name in distibuted_varnames else False + ) + + var = self.origin_main_program.global_block().vars[ + grad.merged_var.name + ] + var_numel = reduce(lambda x, y: x * y, var.shape[1:]) + + sparse_ctx = core.CommContext( + grad_name, + [grad_name], + ["127.0.0.1:6071"], + [var_numel], + [grad_name], + trainer_id, + True, + True, + is_distributed, + idx, + False, + False, + -1, + [], + ) + idx += 1 + send_ctx[sparse_ctx.var_name()] = sparse_ctx + + if len(send_ctx) == 0: + raise ValueError( + "GeoSGD require sparse parameters in your net." + ) + + if len(self.tensor_table_dict) > 0 and self.role_maker._is_worker(): + name, ctx = self._step_ctx(idx) + send_ctx[name] = ctx + + return send_ctx + else: + return self.get_the_one_send_context(split_dense_table) + + def get_dense_send_context( + self, + send_ctx, + idx, + merged_dense_pairs, + trainer_id, + split_dense_table=False, + ): + if len(merged_dense_pairs) < 1: + return idx + if not split_dense_table: + origin_varnames = [] + var_numel = 0 + for merged in merged_dense_pairs: + grad = merged[1] + origin_varnames.append(grad.merged_var.name) + var = self.origin_main_program.global_block().vars[ + grad.merged_var.name + ] + var_numel += reduce(lambda x, y: x * y, var.shape) + grad_name = "Dense@Grad" + trainer_id = self.get_role_id() + aggregate = True + dense_ctx = core.CommContext( + grad_name, + [grad_name], + ["127.0.0.1:6071"], + [var_numel], + origin_varnames, + trainer_id, + aggregate, + False, + False, + idx, + False, + False, + -1, + [], + ) + send_ctx[grad_name] = dense_ctx + idx += 1 + else: + for 
merged in merged_dense_pairs: + grad = merged[1] + origin_varname = grad.merged_var.name + var = self.origin_main_program.global_block().vars[ + origin_varname + ] + var_numel = reduce(lambda x, y: x * y, var.shape) + grad_name = origin_varname + aggregate = True + dense_ctx = core.CommContext( + grad_name, + [grad_name], + ["127.0.0.1:6071"], + [var_numel], + [origin_varname], + trainer_id, + aggregate, + False, + False, + idx, + False, + False, + -1, + [], + ) + send_ctx[grad_name] = dense_ctx + idx += 1 + return idx + + def get_the_one_send_context( + self, split_dense_table=False, use_origin_program=False, ep_list=None + ): + if ep_list is None: + ep_list = ["127.0.0.1:6071"] + send_ctx = {} + trainer_id = self.get_role_id() + idx = 0 + + merged_dense_pairs = ( + self.origin_merged_dense_pairs + if use_origin_program + else self.merged_dense_pairs + ) + merged_sparse_pairs = ( + self.origin_merged_sparse_pairs + if use_origin_program + else self.merged_sparse_pairs + ) + + idx += self.get_dense_send_context( + send_ctx, idx, merged_dense_pairs, trainer_id, split_dense_table + ) + + distibuted_varnames = get_sparse_tablenames( + self.origin_main_program, True + ) + for merged in merged_sparse_pairs: + param, grad = merged + grad_name = grad.merged_var.name + param_name = param.merged_var.name + splited_varname = [] + + for i in range(len(ep_list)): + splited_varname.append("{}.block{}".format(param_name, i)) + + is_distributed = ( + True if param_name in distibuted_varnames else False + ) + + var = self.origin_main_program.global_block().vars[ + grad.merged_var.name + ] + + shape = list(var.shape) + shape[0] = 0 if is_distributed else shape[0] + + sparse_ctx = core.CommContext( + grad_name, + splited_varname, + ep_list, + shape, + [grad_name], + trainer_id, + True, + True, + is_distributed, + idx, + False, + False, + -1, + [], + ) + + idx += 1 + send_ctx[sparse_ctx.var_name()] = sparse_ctx + + if len(self.tensor_table_dict) > 0 and self.role_maker._is_worker(): 
+ name, ctx = self._step_ctx(idx) + send_ctx[name] = ctx + + return send_ctx + + def get_the_one_recv_context( + self, is_dense=True, split_dense_table=False, use_origin_program=False + ): + recv_id_maps = {} + if is_dense: + send_ctx = self.get_the_one_send_context( + split_dense_table=split_dense_table, + use_origin_program=use_origin_program, + ) + for idx, (name, ctx) in enumerate(send_ctx.items()): + if ctx.is_sparse(): + continue + if ctx.is_tensor_table(): + continue + + origin_grad_varnames = ctx.origin_varnames() + + param_names = [] + for grad_varname in origin_grad_varnames: + param_name = self.grad_name_to_param_name[grad_varname] + param_names.append(param_name) + recv_id_maps[ctx.table_id()] = param_names + else: + send_ctx = self.get_the_one_send_context() + for idx, (name, ctx) in enumerate(send_ctx.items()): + if not ctx.is_sparse(): + continue + + origin_grad_varnames = ctx.origin_varnames() + + param_names = [] + for grad_varname in origin_grad_varnames: + param_name = self.grad_name_to_param_name[grad_varname] + param_names.append(param_name) + recv_id_maps[ctx.table_id()] = param_names + return recv_id_maps + + def get_server_runtime_config(self): + return self.strategy.get_server_runtime_config() + + def get_var_distributed(self, varname, is_param): + var_distributed = [] + offset = 0 + if is_param: + params = self.param_var_mapping[varname] + param_varnames = [var.name for var in params] + for ep, pairs in self.param_grad_ep_mapping.items(): + for p in pairs["params"]: + if p.name in param_varnames: + offset += p.shape[0] + var_distributed.append((p.name, ep, p.shape[0])) + else: + grads = self.grad_var_mapping[varname] + grad_varnames = [var.name for var in grads] + for ep, pairs in self.param_grad_ep_mapping.items(): + for g in pairs["grads"]: + if g.name in grad_varnames: + var_distributed.append((g.name, ep, g.shape[0])) + return var_distributed + + def _step_ctx(self, idx): + name = STEP_COUNTER + trainer_id = self.get_role_id() + 
endpoints = self.get_ps_endpoints() + sections = [1] * len(endpoints) + names = [name] * len(endpoints) + ctx = core.CommContext( + name, + names, + endpoints, + sections, + [name], + trainer_id, + True, + False, + False, + idx, + True, + False, + -1, + [], + ) + return name, ctx + + def _create_vars_from_blocklist(self, block_list): + """ + Create vars for each split. + NOTE: only grads need to be named for different trainers, use + add_trainer_suffix to rename the grad vars. + Args: + block_list (list[(varname, block_id, block_size)]): List of gradient blocks. + Returns: + var_mapping (collections.OrderedDict(varname->[new_varname_variable])): A dict mapping + from original var name to each var split. + """ + + # varname->[(block_id, current_block_size)] + block_map = collections.OrderedDict() + var_mapping = collections.OrderedDict() + + for block_str in block_list: + varname, offset, size = block_str.split(":") + if varname not in block_map: + block_map[varname] = [] + block_map[varname].append((int(offset), int(size))) + + for varname, split in block_map.items(): + orig_var = self.merged_variable_map[varname] + + if len(split) == 1: + var_mapping[varname] = [orig_var] + self.var_distributed.add_distributed_var( + origin_var=orig_var, + slice_var=orig_var, + block_id=0, + offset=0, + is_slice=False, + vtype="Param", + ) + else: + var_mapping[varname] = [] + orig_shape = orig_var.shape + orig_dim1_flatten = 1 + + if len(orig_shape) >= 2: + orig_dim1_flatten = reduce( + lambda x, y: x * y, orig_shape[1:] + ) + + for i, block in enumerate(split): + size = block[1] + rows = size // orig_dim1_flatten + splited_shape = [rows] + if len(orig_shape) >= 2: + splited_shape.extend(orig_shape[1:]) + + new_var_name = "%s.block%d" % (varname, i) + slice_var = vars_metatools.VarStruct( + name=new_var_name, + shape=splited_shape, + dtype=orig_var.dtype, + type=orig_var.type, + 
lod_level=orig_var.lod_level, + persistable=False, + ) + var_mapping[varname].append(slice_var) + + self.var_distributed.add_distributed_var( + origin_var=orig_var, + slice_var=slice_var, + block_id=i, + offset=-1, + is_slice=False, + vtype="Param", + ) + + return var_mapping + + def _dispatcher(self): + ps_dispatcher = RoundRobin(self.get_ps_endpoints()) + ps_dispatcher.reset() + grad_var_mapping_items = list(self.grad_var_mapping.items()) + + sparse_gradnames = [grad.name for _, grad in self.origin_sparse_pairs] + + for grad_varname, splited_vars in grad_var_mapping_items: + if grad_varname in sparse_gradnames: + continue + + send_vars = [] + for _, var in enumerate(splited_vars): + send_vars.append(var) + + recv_vars = [] + for _, var in enumerate(send_vars): + recv_vars.append(self.grad_param_mapping[var]) + + eps = ps_dispatcher.dispatch(recv_vars) + + for i, ep in enumerate(eps): + self.param_grad_ep_mapping[ep]["params"].append(recv_vars[i]) + self.param_grad_ep_mapping[ep]["grads"].append(send_vars[i]) + + for grad_varname, splited_vars in grad_var_mapping_items: + if grad_varname not in sparse_gradnames: + continue + + ps_dispatcher.reset() + + send_vars = [] + for _, var in enumerate(splited_vars): + send_vars.append(var) + + recv_vars = [] + for _, var in enumerate(send_vars): + recv_vars.append(self.grad_param_mapping[var]) + + eps = ps_dispatcher.dispatch(recv_vars) + + for i, ep in enumerate(eps): + self.param_grad_ep_mapping[ep]["params"].append(recv_vars[i]) + self.param_grad_ep_mapping[ep]["grads"].append(send_vars[i]) + + def _slice_variable( + self, var_list, slice_count, min_block_size, uniform=False + ): + """ + We may need to split dense tensor to one or more blocks and put + them equally onto parameter server. One block is a sub-tensor + aligned by dim[0] of the tensor. + + We need to have a minimal block size so that the calculations in + the parameter server side can gain better performance. 
By default + minimum block size 8K elements (maybe 16bit or 32bit or 64bit). + + Args: + var_list (list): List of variables. + slice_count (int): Numel of count that variables will be sliced, which + could be the pserver services' count. + min_block_size (int): Minimum split block size. + Returns: + blocks (list[(varname, block_id, current_block_size)]): A list + of VarBlocks. Each VarBlock specifies a shard of the var. + """ + blocks = [] + for var in var_list: + if not uniform: + var_numel = reduce(lambda x, y: x * y, var.shape) + + split_count = 1 + + if min_block_size == -1: + split_count = 1 + else: + split_count = slice_count + max_pserver_count = int( + math.floor(var_numel / float(min_block_size)) + ) + if max_pserver_count == 0: + max_pserver_count = 1 + if max_pserver_count < slice_count: + split_count = max_pserver_count + block_size = int(math.ceil(var_numel / float(split_count))) + + if len(var.shape) >= 2: + # align by dim1(width) + dim1 = reduce(lambda x, y: x * y, var.shape[1:]) + remains = block_size % dim1 + if remains != 0: + block_size += dim1 - remains + # update split_count after aligning + split_count = int(math.ceil(var_numel / float(block_size))) + for block_id in range(split_count): + curr_block_size = min( + block_size, var_numel - ((block_id) * block_size) + ) + block = vars_metatools.VarBlock( + var.name, block_id, curr_block_size + ) + blocks.append(str(block)) + else: + block_size = var.shape[0] / slice_count + remainder = var.shape[0] % slice_count + + if block_size == 0: + dim0s = [block_size] * remainder + else: + dim0s = [block_size] * slice_count + for i in range(remainder): + dim0s[i] = dim0s[i] + 1 + + dim1 = reduce(lambda x, y: x * y, var.shape[1:]) + + for block_id in range(len(dim0s)): + numel = dim0s[block_id] * dim1 + block = vars_metatools.VarBlock(var.name, block_id, numel) + blocks.append(str(block)) + return blocks + + def _get_param_grad_blocks(self, pairs, min_block_size, uniform=False): + param_list = [] + grad_list 
= [] + param_grad_set = set() + for p, g in pairs: + # todo(tangwei12) skip parameter marked not trainable + # if type(p) == Parameter and p.trainable == False: + # continue + p = p.merged_var + g = g.merged_var + + if p.name not in param_grad_set: + param_list.append(p) + param_grad_set.add(p.name) + if g.name not in param_grad_set: + grad_list.append(g) + param_grad_set.add(g.name) + + # when we slice var up into blocks, we will slice the var according to + # pserver services' count. A pserver may have two or more listening ports. + grad_blocks = self._slice_variable( + grad_list, len(self.get_ps_endpoints()), min_block_size, uniform + ) + + param_blocks = self._slice_variable( + param_list, len(self.get_ps_endpoints()), min_block_size, uniform + ) + return param_blocks, grad_blocks + + def _var_slice_and_distribute(self): + # update these mappings for further transpile: + # 1. param_var_mapping : param var name->[split params vars] + # 2. grad_var_mapping : grad var name->[split grads vars] + # 3. grad_param_mapping : grad.blockx->param.blockx + # 4. 
param_grad_ep_mapping : ep->{"params" : [], "grads" : [] } + + dps, dgs = self._get_param_grad_blocks( + self.merged_dense_pairs, self.min_block_size, False + ) + sps, sgs = self._get_param_grad_blocks( + self.merged_sparse_pairs, self.min_block_size, True + ) + + param_blocks = dps + sps + grad_blocks = dgs + sgs + + assert len(grad_blocks) == len(param_blocks) + + # origin_param_name->[splited_param_vars] + self.param_var_mapping = self._create_vars_from_blocklist(param_blocks) + self.grad_var_mapping = self._create_vars_from_blocklist(grad_blocks) + + # dict(grad_splited_var->param_splited_var) + self.grad_param_mapping = collections.OrderedDict() + for g, p in zip(grad_blocks, param_blocks): + g_name, g_bid, _ = g.split(":") + p_name, p_bid, _ = p.split(":") + self.grad_param_mapping[ + self.grad_var_mapping[g_name][int(g_bid)] + ] = self.param_var_mapping[p_name][int(p_bid)] + + print_maps = {} + for k, v in self.grad_param_mapping.items(): + print_maps[str(k)] = str(v) + + # create mapping of endpoint->split var to create pserver side program + self.param_grad_ep_mapping = collections.OrderedDict() + [ + self.param_grad_ep_mapping.update({ep: {"params": [], "grads": []}}) + for ep in self.get_ps_endpoints() + ] + + def _build_var_distributed(self): + self.var_distributed = vars_metatools.VarsDistributed() + + sparse_pairs, dense_pairs = self.get_param_grads() + origin_for_sparse = [] + origin_for_dense = [] + param_name_grad_name = dict() + grad_name_to_param_name = dict() + + for param, grad in sparse_pairs: + param = vars_metatools.create_var_struct(param) + grad = vars_metatools.create_var_struct(grad) + origin_for_sparse.append((param, grad)) + + for param, grad in dense_pairs: + param = vars_metatools.create_var_struct(param) + grad = vars_metatools.create_var_struct(grad) + origin_for_dense.append((param, grad)) + + for dense_pair in origin_for_dense: + param, grad = dense_pair + + m_param = MergedVariable(param, [param], [0]) + m_grad = 
MergedVariable(grad, [grad], [0]) + self.merged_variables_pairs.append((m_param, m_grad)) + self.merged_dense_pairs.append((m_param, m_grad)) + + for sparse_pair in origin_for_sparse: + param, grad = sparse_pair + + m_param = MergedVariable(param, [param], [0]) + m_grad = MergedVariable(grad, [grad], [0]) + self.merged_variables_pairs.append((m_param, m_grad)) + self.merged_sparse_pairs.append((m_param, m_grad)) + + for merged in self.merged_variables_pairs: + m_param, m_grad = merged + self.merged_variable_map[ + m_param.merged_var.name + ] = m_param.merged_var + self.merged_variable_map[m_grad.merged_var.name] = m_grad.merged_var + + param_merges = [] + param_merges.extend(origin_for_sparse) + param_merges.extend(origin_for_dense) + + for param, grad in param_merges: + param_name_grad_name[param.name] = grad.name + grad_name_to_param_name[grad.name] = param.name + + self.origin_sparse_pairs = origin_for_sparse + self.origin_dense_pairs = origin_for_dense + self.param_name_to_grad_name = param_name_grad_name + self.grad_name_to_param_name = grad_name_to_param_name + + sparse_pair_map = collections.OrderedDict() + + for pair in self.origin_sparse_pairs + self.origin_dense_pairs: + param, grad = pair + sparse_pair_map[param.name] = str(param) + sparse_pair_map[grad.name] = str(grad) + + self._var_slice_and_distribute() + self._dispatcher() + + def get_param_grads(self): + origin_program = self.origin_main_program + + def _get_params_grads(sparse_varnames): + block = origin_program.global_block() + + dense_param_grads = [] + sparse_param_grads = [] + + optimize_params = set() + origin_var_dict = origin_program.global_block().vars + role_id = int(core.op_proto_and_checker_maker.OpRole.Backward) + for op in block.ops: + if _is_opt_role_op(op): + # delete clip op from opt_ops when run in Parameter Server mode + if ( + OP_NAME_SCOPE in op.all_attrs() + and CLIP_OP_NAME_SCOPE in op.attr(OP_NAME_SCOPE) + ): + op._set_attr("op_role", role_id) + continue + if 
op.attr(OP_ROLE_VAR_ATTR_NAME): + param_name = op.attr(OP_ROLE_VAR_ATTR_NAME)[0] + grad_name = op.attr(OP_ROLE_VAR_ATTR_NAME)[1] + if param_name not in optimize_params: + optimize_params.add(param_name) + param_grad = ( + origin_var_dict[param_name], + origin_var_dict[grad_name], + ) + + if param_name in sparse_varnames: + sparse_param_grads.append(param_grad) + else: + dense_param_grads.append(param_grad) + return sparse_param_grads, dense_param_grads + + def _get_sparse_varnames(): + varnames = [] + for op in origin_program.global_block().ops: + if ( + op.type in SPARSE_OP_TYPE_DICT.keys() + and op.attr('remote_prefetch') is True + ): + param_name = op.input(SPARSE_OP_TYPE_DICT[op.type])[0] + varnames.append(param_name) + + return list(set(varnames)) + + sparse_varnames = _get_sparse_varnames() + sparse_param_grads, dense_param_grads = _get_params_grads( + sparse_varnames + ) + + return sparse_param_grads, dense_param_grads + + def remove_var_pair_by_grad(self, var_name): + + for index, pair in enumerate(self.merged_variables_pairs): + var = pair[0] + var_grad = pair[1] + if var_grad.merged_var.name == var_name: + del self.merged_variables_pairs[index] + + for index, pair in enumerate(self.merged_dense_pairs): + var = pair[0] + var_grad = pair[1] + if var_grad.merged_var.name == var_name: + del self.merged_dense_pairs[index] + return + + for index, pair in enumerate(self.merged_sparse_pairs): + var = pair[0] + var_grad = pair[1] + if var_grad.merged_var.name == var_name: + del self.merged_sparse_pairs[index] + return + + print("Not find {} in self.merge_pairs".format(var_name)) + + +def _is_opt_role_op(op): + # NOTE : depend on oprole to find out whether this op is for + # optimize + op_maker = core.op_proto_and_checker_maker + optimize_role = core.op_proto_and_checker_maker.OpRole.Optimize + if op_maker.kOpRoleAttrName() in op.attr_names and int( + op.all_attrs()[op_maker.kOpRoleAttrName()] + ) == int(optimize_role): + return True + return False + + +def 
_get_optimize_ops(_program): + block = _program.global_block() + opt_ops = [] + for op in block.ops: + if _is_opt_role_op(op): + # delete clip op from opt_ops when run in Parameter Server mode + if ( + OP_NAME_SCOPE in op.all_attrs() + and CLIP_OP_NAME_SCOPE in op.attr(OP_NAME_SCOPE) + ): + op._set_attr( + "op_role", + int(core.op_proto_and_checker_maker.OpRole.Backward), + ) + continue + opt_ops.append(op) + return opt_ops + + +def _add_lr_decay_table_pass(main_program, compiled_config, lr_decay_steps): + if hasattr(compiled_config.origin_main_program, 'lr_sheduler'): + from paddle.optimizer.lr import LRScheduler + + assert isinstance( + compiled_config.origin_main_program.lr_sheduler, LRScheduler + ), "must be LRScheduler" + ops = _get_optimize_ops(compiled_config.origin_main_program) + lr_param_dict = _get_lr_param_dict(ops) + ( + lr_decay_main_program, + lr_decay_startup_program, + lr_name, + ) = _get_lr_sheduler_program( + compiled_config.origin_main_program.lr_sheduler, + lr_param_dict, + lr_decay_steps, + ) + compiled_config.add_tensor_table( + "@LR_DECAY_COUNTER@", + lr_name, + lr_decay_startup_program, + lr_decay_main_program, + "GlobalStepTable", + ) + + +def _get_lr_param_dict(opt_ops): + lr_param_dict = {} + for op in opt_ops: + lr_name = op.input("LearningRate")[0] + param_name = op.input("Param")[0] + if lr_name not in lr_param_dict: + lr_param_dict[lr_name] = [] + lr_param_dict[lr_name].append(param_name) + return lr_param_dict + + +def _get_lr_sheduler_program(lr_sheduler, lr_param_dict, lr_decay_steps): + schedler_decay = [ + 'NoamDecay', + 'NaturalExpDecay', + 'InverseTimeDecay', + 'ExponentialDecay', + ] + + from paddle.optimizer.lr import ( + ExponentialDecay, + InverseTimeDecay, + NaturalExpDecay, + NoamDecay, + ) + from paddle.static.learning_rate_scheduler import ( + exponential_decay, + inverse_time_decay, + natural_exp_decay, + noam_decay, + ) + + decay_main_program = paddle.static.Program() + decay_startup_program = paddle.static.Program() 
+ lr_name = "" + + if isinstance(lr_sheduler, ExponentialDecay): + with paddle.static.program_guard( + decay_main_program, decay_startup_program + ): + lr = exponential_decay(1.0, lr_decay_steps, lr_sheduler.gamma, True) + lr_name = lr.name + logging.warn( + "ExponentialDecay is set, staircase = True, global learning rate decay step is [ %d ], Change decay steps as follow: \n" + "\t strategy = paddle.distributed.fleet.DistributedStrategy() \n " + "\t strategy.a_sync = True \n" + "\t strategy.a_sync_configs= { 'lr_decay_steps' : YOUR_DECAY_STEP } \n" + % lr_decay_steps + ) + elif isinstance(lr_sheduler, NoamDecay): + with paddle.static.program_guard( + decay_main_program, decay_startup_program + ): + lr = noam_decay(lr_sheduler.d_model, lr_sheduler.warmup_steps, 1.0) + lr_name = lr.name + logging.warn( + "NoamDecay is set, warmup steps is [ %d ]" + % lr_sheduler.warmup_steps + ) + elif isinstance(lr_sheduler, NaturalExpDecay): + with paddle.static.program_guard( + decay_main_program, decay_startup_program + ): + lr = natural_exp_decay(1.0, lr_decay_steps, lr_sheduler.gamma, True) + lr_name = lr.name + logging.warn( + "NaturalExpDecay is set, staircase = True, global learning rate decay step is [ %d ], Change decay steps as follow: \n" + "\t strategy = paddle.distributed.fleet.DistributedStrategy() \n " + "\t strategy.a_sync = True \n" + "\t strategy.a_sync_configs= { 'lr_decay_steps' : YOUR_DECAY_STEP } \n" + % lr_decay_steps + ) + elif isinstance(lr_sheduler, InverseTimeDecay): + with paddle.static.program_guard( + decay_main_program, decay_startup_program + ): + lr = inverse_time_decay( + 1.0, lr_decay_steps, lr_sheduler.gamma, True + ) + lr_name = lr.name + logging.warn( + "InverseTimeDecay is set, staircase = True, global learning rate decay step is [ %d ], Change decay steps as follow: \n" + "\t strategy = paddle.distributed.fleet.DistributedStrategy() \n " + "\t strategy.a_sync = True \n" + "\t strategy.a_sync_configs= { 'lr_decay_steps' : YOUR_DECAY_STEP } 
\n" + % lr_decay_steps + ) + else: + raise ValueError( + "Not supported current LearningRate strategy, please use follow decay strategy: {}".format( + schedler_decay + ) + ) + + return decay_main_program, decay_startup_program, lr_name + + +def _get_varname_parts(varname): + # returns origin, blockid, trainerid + orig_var_name = "" + trainer_part = "" + block_part = "" + trainer_idx = varname.find(".trainer_") + if trainer_idx >= 0: + trainer_part = varname[trainer_idx + 1 :] + else: + trainer_idx = len(varname) + block_index = varname.find(".block") + if block_index >= 0: + block_part = varname[block_index + 1 : trainer_idx] + else: + block_index = len(varname) + orig_var_name = varname[0 : min(block_index, trainer_idx)] + return orig_var_name, block_part, trainer_part + + +def _orig_varname(varname): + orig, _, _ = _get_varname_parts(varname) + return orig diff --git a/python/paddle/incubate/fleet/parameter_server/ir/trainer_pass.py b/python/paddle/incubate/fleet/parameter_server/ir/trainer_pass.py new file mode 100644 index 0000000000000000000000000000000000000000..2795a1c8a951105df7c04adef4d80230c9e41eed --- /dev/null +++ b/python/paddle/incubate/fleet/parameter_server/ir/trainer_pass.py @@ -0,0 +1,2143 @@ +# -*- coding: UTF-8 -*- +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import collections +import os +import warnings +from functools import reduce + +import paddle +import paddle.framework as framework +from paddle.fluid.incubate.fleet.parameter_server.ir.public import ( + _get_lr_ops, + _get_optimize_ops, + get_sparse_tablenames, +) +from paddle.fluid.incubate.fleet.parameter_server.mode import DistributedMode +from paddle.fluid.transpiler.details.program_utils import delete_ops +from paddle.framework import core + +OP_NAME_SCOPE = "op_namescope" +CLIP_OP_NAME_SCOPE = "gradient_clip" +STEP_COUNTER = "@PS_STEP_COUNTER@" +OP_ROLE_VAR_ATTR_NAME = core.op_proto_and_checker_maker.kOpRoleVarAttrName() +RPC_OP_ROLE_ATTR_NAME = core.op_proto_and_checker_maker.kOpRoleAttrName() +RPC_OP_ROLE_ATTR_VALUE = core.op_proto_and_checker_maker.OpRole.RPC +LR_SCHED_OP_ROLE_ATTR_VALUE = core.op_proto_and_checker_maker.OpRole.LRSched +OPT_OP_ROLE_ATTR_VALUE = core.op_proto_and_checker_maker.OpRole.Optimize +op_role_attr_name = core.op_proto_and_checker_maker.kOpRoleAttrName() + +SPARSE_OP_TYPE_DICT = {"lookup_table": "W", "lookup_table_v2": "W"} +SPARSE_GRAD_OP_TYPE_DICT = { + "lookup_table_grad": "W", + "lookup_table_v2_grad": "W", +} +DEVICE_LIST = ["cpu", "gpu", "xpu"] +COMMUNICATE_OPS_TYPE = ["send", "recv", "fetch_barrier", "send_barrier"] +DEFAULT_DEVICE = 'cpu' + + +def delete_optimizer_pass(program, config): + def _delete_optimizer_op_and_vars(_program, optimize_ops): + optimize_vars = [] + optimize_op_role_vars = [] + optimize_need_delete_vars = [] + + for op in optimize_ops: + optimize_vars.extend(op.input_arg_names) + optimize_op_role_vars.extend(op.attr("op_role_var")) + + optimize_vars = list(set(optimize_vars)) + optimize_op_role_vars = list(set(optimize_op_role_vars)) + + for var in optimize_vars: + if var not in optimize_op_role_vars: + optimize_need_delete_vars.append(var) + need_delete_optimize_vars = list(set(optimize_need_delete_vars)) + + delete_ops(_program.global_block(), optimize_ops) + for var in need_delete_optimize_vars: + 
if _program.global_block().has_var(var): + _program.global_block()._remove_var(var) + + def _add_lr_var(main_program, compiled_config): + # Todo: hard code for pe + lr_var = compiled_config.origin_main_program.global_block().vars[ + "learning_rate_0" + ] + main_program.global_block().create_var( + name=lr_var.name, + shape=lr_var.shape, + dtype=lr_var.dtype, + type=lr_var.type, + lod_level=lr_var.lod_level, + persistable=True, + ) + + optimizer_ops = _get_optimize_ops(program) + lr_ops = _get_lr_ops(program) + optimizer_ops.extend(lr_ops) + _delete_optimizer_op_and_vars(program, optimizer_ops) + + if hasattr(config.origin_main_program, 'lr_sheduler'): + _add_lr_var(program, config) + + return program + + +def distributed_ops_pass(program, config, use_ps_gpu=False): + trainer_id = config.get_role_id() + send_ctx = config.get_the_one_send_context( + split_dense_table=config.is_heter_ps_mode + ) + w_2_table_id = {} + emb_size = {} + + def _get_pull_sparse_ops(_program): + pull_sparse_ops = {} + pull_sparse_ids = {} + push_sparse_ops = {} + ops = {} + for op in _program.global_block().ops: + if ( + op.type in SPARSE_OP_TYPE_DICT.keys() + and op.attr('remote_prefetch') is True + ): + param_name = op.input(SPARSE_OP_TYPE_DICT[op.type])[0] + if config.is_heter_ps_mode: + # trick for matchnet, need to modify + param_name += op.input("Ids")[0][0] + ops = pull_sparse_ops.get(param_name, []) + ops.append(op) + pull_sparse_ops[param_name] = ops + ids = pull_sparse_ids.get(param_name, []) + ids.append(op.input("Ids")[0]) + pull_sparse_ids[param_name] = ids + for op in _program.global_block().ops: + if op.type in SPARSE_GRAD_OP_TYPE_DICT.keys(): + param_name = op.input(SPARSE_GRAD_OP_TYPE_DICT[op.type])[0] + if ( + param_name in pull_sparse_ids + and op.input("Ids")[0] in pull_sparse_ids[param_name] + ): + ops = push_sparse_ops.get(param_name, []) + ops.append(op) + push_sparse_ops[param_name] = ops + return pull_sparse_ops, push_sparse_ops + + def _pull_sparse_fuse(_program, 
pull_sparse_ops, use_ps_gpu): + def dag_check_up_and_reorder(program, inputs, outputs): + global_block = program.global_block() + min_output_index = len(global_block.ops) + max_input_index = -1 + input_indexes = [0] * len(global_block.ops) + output_indexes = [0] * len(global_block.ops) + for idx, op in enumerate(global_block.ops): + for i in range(0, len(op.output_names)): + if input_indexes[idx] == 1: + break + outs = op.output(op.output_names[i]) + for in_id, in_var in enumerate(inputs): + if in_var.name in outs: + input_indexes[idx] = 1 + max_input_index = max(max_input_index, idx) + break + + for i in range(0, len(op.input_names)): + if output_indexes[idx] == 1: + break + ins = op.input(op.input_names[i]) + for out_id, out_var in enumerate(outputs): + if out_var.name in ins: + output_indexes[idx] = 1 + min_output_index = min(min_output_index, idx) + + for i in range(len(global_block.ops)): + if input_indexes[i] == 1 and output_indexes[i] == 1: + warnings.warn( + "unable to re-arrange dags order to combine distributed embedding ops because a op both needs embedding table's output as input and produces ids as the same embedding table's input" + ) + return + + if min_output_index < max_input_index: + move_ops = [] + for i in range(min_output_index + 1, len(input_indexes)): + if input_indexes[i] == 1: + move_ops.append((global_block.ops[i], i)) + for i, op in enumerate(move_ops): + queue = list() + visited = set() + queue.append(op[1]) + visited.add(op[0]) + start = 0 + while start < len(queue): + pos = queue[start] + op = global_block.ops[pos] + op_inputs = [] + for k in range(0, len(op.input_names)): + ins = op.input(op.input_names[k]) + op_inputs.append(ins) + for j in range(pos - 1, min_output_index - 1, -1): + op1 = global_block.ops[j] + if op1 in visited: + continue + found = False + for k in range(0, len(op1.output_names)): + outs = op1.output(op1.output_names[k]) + for t in range(len(op_inputs)): + for y in op_inputs[t]: + if y in outs: + found = True + 
break + if found: + break + if found: + break + if found: + if output_indexes[j] is True: + warnings.warn( + "unable to re-arrange dags order to combine distributed embedding ops" + ) + return + queue.append(j) + visited.add(global_block.ops[j]) + start = start + 1 + + queue.sort() + for index in queue: + desc = global_block.desc._insert_op(min_output_index) + desc.copy_from(global_block.ops[index].desc) + global_block.desc._remove_op(index + 1, index + 2) + global_block.ops[index].desc = desc + insert_op = global_block.ops.pop(index) + input_state = input_indexes.pop(index) + output_state = output_indexes.pop(index) + global_block.ops.insert(min_output_index, insert_op) + input_indexes.insert(min_output_index, input_state) + output_indexes.insert(min_output_index, output_state) + min_output_index = min_output_index + 1 + + assert global_block.desc.op_size() == len(global_block.ops) + for i in range(len(global_block.ops)): + assert global_block.desc.op(i) == global_block.ops[i].desc + + for param, ops in pull_sparse_ops.items(): + all_ops = program.global_block().ops + op_device = "" + if config.is_heter_ps_mode: + op_device = ops[0].attr("op_device") + inputs = [ + program.global_block().vars[op.input("Ids")[0]] for op in ops + ] + w = program.global_block().vars[ops[0].input("W")[0]] + emb_size[param] = w.shape[1] + + grad_name = config.param_name_to_grad_name[w.name] + + table_id = -1 + + for name, ctx in send_ctx.items(): + if grad_name in ctx.origin_varnames(): + table_id = ctx.table_id() + + if table_id == -1: + raise ValueError( + "can not find suitable sparse table, please check" + ) + + w_2_table_id[param] = table_id + padding_idx = ops[0].attr("padding_idx") + is_distributed = ops[0].attr("is_distributed") + op_type = ops[0].type + + outputs = [ + program.global_block().vars[op.output("Out")[0]] for op in ops + ] + + dag_check_up_and_reorder(program, inputs, outputs) + + op_idxs = [all_ops.index(op) for op in ops] + + for idx in op_idxs[::-1]: + 
program.global_block()._remove_op(idx) + + inputs_idxs = [-1] * len(inputs) + outputs_idxs = [len(program.global_block().ops) + 1] * len(outputs) + + for idx, op in enumerate(program.global_block().ops): + for i in range(0, len(op.output_names)): + outs = op.output(op.output_names[i]) + for in_id, in_var in enumerate(inputs): + if in_var.name in outs: + inputs_idxs[in_id] = max(idx, inputs_idxs[in_id]) + for i in range(0, len(op.input_names)): + ins = op.input(op.input_names[i]) + for out_id, out_var in enumerate(outputs): + if out_var.name in ins: + outputs_idxs[out_id] = min( + idx, outputs_idxs[out_id] + ) + + if min(outputs_idxs) - max(inputs_idxs) >= 1: + if max(inputs_idxs) == -1: + distributed_idx = min(op_idxs) + else: + distributed_idx = max(inputs_idxs) + 1 + + if use_ps_gpu: + program.global_block()._insert_op( + index=distributed_idx, + type="pull_gpups_sparse", + inputs={"Ids": inputs, 'W': w}, + outputs={"Out": outputs}, + attrs={ + "size": [w.shape[1] for i in inputs], + "is_distributed": True, + "is_sparse": True, + }, + ) + else: + program.global_block()._insert_op( + index=distributed_idx, + type="distributed_lookup_table", + inputs={"Ids": inputs, 'W': w}, + outputs={"Outputs": outputs}, + attrs={ + "is_distributed": is_distributed, + "padding_idx": padding_idx, + "table_id": table_id, + "lookup_table_version": op_type, + "op_device": op_device, + }, + ) + else: + for i in range(len(inputs_idxs)): + distributed_idx = op_idxs[i] + + program.global_block()._insert_op( + index=distributed_idx, + type="distributed_lookup_table", + inputs={"Ids": [inputs[i]], 'W': w}, + outputs={"Outputs": [outputs[i]]}, + attrs={ + "is_distributed": is_distributed, + "padding_idx": padding_idx, + "table_id": table_id, + "lookup_table_version": op_type, + "op_device": op_device, + }, + ) + + def _push_sparse_fuse(_program, push_sparse_ops, use_ps_gpu): + if use_ps_gpu: + # in ps_gpu_pass + return + if len(push_sparse_ops) == 0: + return + show = None + clk = None + 
use_entry = False + for param, ops in push_sparse_ops.items(): + op_first = ops[0] + break + print(op_first) + if op_first.has_attr("entry"): + entry = op_first.attr("entry") + entry = entry.split(':') + if len(entry) == 3 and entry[0] == 'show_click_entry': + show_var_name = entry[1] + click_var_name = entry[2] + if ( + show_var_name in program.global_block().vars + and click_var_name in program.global_block().vars + ): + show = program.global_block().vars[show_var_name] + clk = program.global_block().vars[click_var_name] + use_entry = True + else: + warnings.warn( + 'ShowClickEntry configured, but cannot find show/click var, will not use' + ) + + if not use_entry: + print('ShowClickEntry not configured, will not use') + show = program.global_block().create_var( + name="show", + dtype=core.VarDesc.VarType.INT64, + persistable=False, + stop_gradient=True, + ) + program.global_block()._insert_op( + index=0, + type='fill_constant', + inputs={}, + outputs={'Out': show}, + attrs={ + 'shape': [1], + 'dtype': show.dtype, + 'value': 1, + # OP_ROLE_KEY: OpRole.Forward + }, + ) + + clk = program.global_block().create_var( + name="clk", + dtype=core.VarDesc.VarType.INT64, + persistable=False, + stop_gradient=True, + ) + program.global_block()._insert_op( + index=0, + type='fill_constant', + inputs={}, + outputs={'Out': clk}, + attrs={ + 'shape': [1], + 'dtype': clk.dtype, + 'value': 0, + # OP_ROLE_KEY: OpRole.Forward + }, + ) + + for param, ops in push_sparse_ops.items(): + all_ops = program.global_block().ops + op_idxs = [all_ops.index(op) for op in ops] + inputs = [ + program.global_block().vars[op.input("Ids")[0]] for op in ops + ] + w = program.global_block().vars[ops[0].output("W@GRAD")[0]] + table_id = w_2_table_id[param] + + padding_idx = ops[0].attr("padding_idx") + is_distributed = ops[0].attr("is_distributed") + op_type = ops[0].type + outputs = [ + program.global_block().vars[op.input("Out@GRAD")[0]] + for op in ops + ] + + for idx in op_idxs[::-1]: + 
program.global_block()._remove_op(idx) + + # if use_ps_gpu: + # program.global_block().append_op( + # type="push_box_sparse", + # inputs={"Ids": inputs, + # 'Out': outputs}, + # outputs={"Out": outputs}, + # attrs={ + # "size": w.shape[1], + # "is_distributed": True, + # "is_sparse": True + # }) + # else: + program.global_block().append_op( + type="distributed_push_sparse", + inputs={ + "Ids": inputs, + 'W': w, + "Outputs": outputs, + "Shows": show, + "Clicks": clk, + }, + outputs={"Outputs": outputs}, + attrs={ + "is_distributed": is_distributed, + "padding_idx": padding_idx, + "table_id": table_id, + "size": emb_size[param], + }, + ) + + pull_sparse_ops, push_sparse_ops = _get_pull_sparse_ops(program) + _pull_sparse_fuse(program, pull_sparse_ops, use_ps_gpu) + _push_sparse_fuse(program, push_sparse_ops, use_ps_gpu) + return program + + +def append_send_ops_pass(program, config): + mode = config.get_distributed_mode() + trainer_id = config.get_role_id() + + def _append_send_op(union_vars, queue, is_sparse, table_id): + + if queue == STEP_COUNTER: + send_input_vars = [] + else: + send_input_vars = [ + program.global_block().vars[union_var] + for union_var in union_vars + ] + + dummy_output = [] + if mode in [DistributedMode.SYNC, DistributedMode.HALF_ASYNC]: + dummy_output = program.global_block().create_var( + name=framework.generate_control_dev_var_name() + ) + + program.global_block().append_op( + type="send", + inputs={"X": send_input_vars}, + outputs={"Out": dummy_output}, + attrs={ + "send_varnames": [queue], + "is_sparse": is_sparse, + "table_id": table_id, + RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE, + }, + ) + + return dummy_output + + def _append_barrier_op(dummys): + program.global_block().append_op( + type="send_barrier", + inputs={"X": dummys}, + outputs={"Out": []}, + attrs={ + "trainer_id": trainer_id, + "half_async": True, + RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE, + }, + ) + + dummys = [] + + sends = 
config.get_the_one_trainer_send_context( + split_dense_table=config.is_heter_ps_mode + ) + + for merged_name, send in sends.items(): + if send.is_sparse() and not config.is_geo_mode(): + continue + is_sparse = 1 if send.is_sparse() else 0 + is_sparse = 2 if send.is_distributed() else is_sparse + dummys.append( + _append_send_op( + send.origin_varnames(), merged_name, is_sparse, send.table_id() + ) + ) + + if mode in [DistributedMode.SYNC, DistributedMode.HALF_ASYNC]: + _append_barrier_op(dummys) + + return program + + +def init_from_server_pass(program, config): + # 0' trainer do not need barrier, it will call barrier at the end init_worker + if config.role_maker._is_first_worker(): + return program + + fetch_barrier_out = program.global_block().create_var( + name=framework.generate_control_dev_var_name() + ) + + program.global_block().append_op( + type="fetch_barrier", + inputs={}, + outputs={"Out": fetch_barrier_out}, + attrs={ + "endpoints": config.get_ps_endpoints(), + "trainer_id": config.get_role_id(), + RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE, + }, + ) + return program + + +def fake_init_ops_pass(program, config): + origin_program = config.get_origin_main_program() + + def _get_sparse_table_names(): + dist_varnames = get_sparse_tablenames(origin_program, True) + sparse_varnames = get_sparse_tablenames(origin_program, False) + return list(set(dist_varnames + sparse_varnames)) + + def _fake_init_sparsetable(sparse_table_names): + # delete table init op + for table_name in sparse_table_names: + table_var = program.global_block().vars[table_name] + table_param_init_op = [] + for op in program.global_block().ops: + if table_name in op.output_arg_names: + table_param_init_op.append(op) + init_op_num = len(table_param_init_op) + if init_op_num != 1: + raise ValueError( + "table init op num should be 1, now is " + str(init_op_num) + ) + table_init_op = table_param_init_op[0] + program.global_block().append_op( + type="fake_init", + inputs={}, + outputs={"Out": 
table_var}, + attrs={"shape": table_init_op.attr('shape')}, + ) + delete_ops(program.global_block(), table_param_init_op) + + sparse_tables = _get_sparse_table_names() + _fake_init_sparsetable(sparse_tables) + + return program + + +def ps_gpu_pass(program): + def _add_push_box_sparse_op(program): + op_role_attr_name = core.op_proto_and_checker_maker.kOpRoleAttrName() + backward = core.op_proto_and_checker_maker.OpRole.Backward + for op in program.global_block().ops: + if op.type != "pull_box_sparse" and op.type != "pull_gpups_sparse": + continue + grad_op_desc, op_grad_to_var = core.get_grad_op_desc( + op.desc, set(), [] + ) + for op_desc in grad_op_desc: + new_op_desc = program.global_block().desc.append_op() + new_op_desc.copy_from(op_desc) + new_op_desc._set_attr(op_role_attr_name, backward) + + def _remove_lookup_table_grad_op_and_var(program): + lookup_table_grad_var = {} + remove_op_index = [] + remove_var = [] + for idx, op in list(enumerate(program.global_block().ops)): + if op.type == "lookup_table_grad": + for name in op.output("W@GRAD"): + lookup_table_grad_var[name] = 1 + remove_op_index.append(idx) + remove_var.append(name) + for name in op.input("W"): + lookup_table_grad_var[name] = 1 + + for idx, op in list(enumerate(program.global_block().ops)): + if op.type == "pull_box_sparse" or op.type == "pull_gpups_sparse": + continue + for key_name in op.input_names: + for var in op.input(key_name): + if var in lookup_table_grad_var: + remove_op_index.append(idx) + break + + remove_op_index = list(set(remove_op_index)) + remove_op_index.sort(reverse=True) + for idx in remove_op_index: + program.global_block()._remove_op(idx) + for name in remove_var: + program.global_block()._remove_var(name) + + def _remove_optimizer_var(program): + + embedding_w = {} + for idx, op in list(enumerate(program.global_block().ops)): + if op.type == "lookup_table_grad": + for name in op.input("W"): + embedding_w[name] = 1 + + optimize_vars = [] + optimize_op_role_vars = [] + 
optimize_need_delete_vars = [] + for op in _get_optimize_ops(program): + for name in op.input("Param"): + if name in embedding_w: + optimize_op_role_vars.extend(op.attr("op_role_var")) + for key_name in op.input_names: + if key_name == "LearningRate": + continue + for var in op.input(key_name): + optimize_vars.append(var) + + optimize_vars = list(set(optimize_vars)) + optimize_op_role_vars = list(set(optimize_op_role_vars)) + + for var in optimize_vars: + if var not in optimize_op_role_vars: + optimize_need_delete_vars.append(var) + need_delete_optimize_vars = list(set(optimize_need_delete_vars)) + + for name in need_delete_optimize_vars: + if program.global_block().has_var(name): + program.global_block()._remove_var(name) + + _add_push_box_sparse_op(program) + _remove_optimizer_var(program) + _remove_lookup_table_grad_op_and_var(program) + return program + + +def delete_extra_optimizes_pass(program, config): + optimize_vars = [] + optimize_op_role_vars = [] + optimize_need_delete_vars = [] + + origin_program = config.get_origin_main_program() + for op in _get_optimize_ops(origin_program): + optimize_vars.extend(op.input_arg_names) + optimize_op_role_vars.extend(op.attr("op_role_var")) + + optimize_vars = list(set(optimize_vars)) + optimize_op_role_vars = list(set(optimize_op_role_vars)) + for var in optimize_vars: + if var not in optimize_op_role_vars: + optimize_need_delete_vars.append(var) + need_delete_optimize_vars = list(set(optimize_need_delete_vars)) + + init_ops = [] + for var in need_delete_optimize_vars: + param_init_op = [] + for op in program.global_block().ops: + if var in op.output_arg_names: + param_init_op.append(op) + init_ops.extend(param_init_op) + delete_ops(program.global_block(), init_ops) + + for var in need_delete_optimize_vars: + if program.global_block().has_var(var): + program.global_block()._remove_var(var) + + return program + + +def find_heter_ops(program, default_device="cpu"): + if default_device not in DEVICE_LIST: + raise 
ValueError( + "Given device {} is not in device list {}".format( + default_device, DEVICE_LIST + ) + ) + + def _is_heter_op(op, current_heter_device, default_device="cpu"): + heter_devices = list(DEVICE_LIST) + heter_devices.remove(default_device) + op_device = op.attr("op_device") + op_type = op.type + if op_device in heter_devices: + return True + elif ( + op_type in COMMUNICATE_OPS_TYPE + and current_heter_device != default_device + ): + # for distributed communciate ops: send & recv & barrier etc. + # Todo: need update this method + # op._set_attr('op_device', current_heter_device) + return True + elif op_device is None or op_device == default_device: + op._set_attr('op_device', default_device) + return False + return False + + def _is_same_device(op, pre_device, default_device="cpu"): + op_device = op.attr("op_device") + if op_device == pre_device: + return True + if pre_device == default_device: + return True + return False + + def _append_heter_op(op, current_heter_block_ops, heter_ops): + op_device = op.attr("op_device") + if op_device not in heter_ops: + heter_ops[op_device] = {} + current_heter_block_ops.append(op) + + origin_porgram = program.clone() + block = program.global_block() + ''' + re-place sum op to fix bug for union forward backward op + ''' + var2idx = {} + op_list = list(block.ops) + op_size = len(op_list) + + for i in range(op_size - 1, -1, -1): + op_list = list(block.ops) + op = op_list[i] + if "_grad" in op.type: + forward_op_type = op.type.split("_grad")[0] + if ( + forward_op_type in SPARSE_OP_TYPE_DICT.keys() + and op.attr('remote_prefetch') is True + ): + param_name = op.input(SPARSE_OP_TYPE_DICT[forward_op_type])[0] + if param_name in var2idx: + # insert sum op & remove sum op from var2idx and origin place + op_list = list(block.ops) + sum_op = op_list[var2idx[param_name]] + sum_op_inputs = { + sum_op.input_names[0]: [ + block.vars[input] + for input in sum_op.input_arg_names + ] + } + sum_op_outputs = { + sum_op.output_names[0]: [ + 
block.vars[output] + for output in sum_op.output_arg_names + ] + } + block._insert_op( + index=i + 1, + type=sum_op.type, + inputs=sum_op_inputs, + outputs=sum_op_outputs, + attrs=sum_op.all_attrs(), + ) + block._remove_op(var2idx[param_name] + 1) + var2idx.pop(param_name) + for var_ in var2idx: + var2idx[var_] += 1 + elif forward_op_type == "elementwise_mul": + """ + get output varname of pre op + + """ + output_vars_no_grad = [] + for key in op.output_names: + for varname in op.output(key): + if varname == "@EMPTY@": + continue + if "lod_tensor_blocking_queue" in varname: + continue + output_vars_no_grad.append(varname.split("@GRAD")[0]) + for no_grad_var in output_vars_no_grad: + if no_grad_var in var2idx: + """ + insert sum op & remove sum op from var2idx and origin place + + """ + op_list = list(block.ops) + sum_op = op_list[var2idx[no_grad_var]] + sum_op_inputs = { + sum_op.input_names[0]: [ + block.vars[input] + for input in sum_op.input_arg_names + ] + } + sum_op_outputs = { + sum_op.output_names[0]: [ + block.vars[output] + for output in sum_op.output_arg_names + ] + } + block._insert_op( + index=i + 1, + type=sum_op.type, + inputs=sum_op_inputs, + outputs=sum_op_outputs, + attrs=sum_op.all_attrs(), + ) + block._remove_op(var2idx[no_grad_var] + 1) + var2idx.pop(no_grad_var) + for var_ in var2idx: + var2idx[var_] += 1 + else: + if op.type == "sum": + var = op.output("Out")[0] + if "@GRAD" in var: + origin_var = var.split("@GRAD")[0] + pre_op = op_list[i - 1] + if "_grad" in pre_op.type: + forward_op_type = pre_op.type.split("_grad")[0] + if ( + forward_op_type in SPARSE_OP_TYPE_DICT.keys() + and pre_op.attr('remote_prefetch') is True + ): + param_name = pre_op.input( + SPARSE_OP_TYPE_DICT[forward_op_type] + )[0] + if param_name == origin_var and op.attr( + "op_device" + ) == pre_op.attr("op_device"): + continue + else: + var2idx[origin_var] = i + elif forward_op_type == "elementwise_mul": + output_vars = [] + for key in pre_op.output_names: + for varname in 
pre_op.output(key): + if varname == "@EMPTY@": + continue + if "lod_tensor_blocking_queue" in varname: + continue + output_vars.append(varname) + input_vars = [] + for key in op.input_names: + for varname in op.input(key): + if varname == "@EMPTY@": + continue + if "lod_tensor_blocking_queue" in varname: + continue + input_vars.append(varname) + is_match = False + for varname in output_vars: + if varname in input_vars: + is_match = True + break + if is_match: + continue + else: + var2idx[origin_var] = i + else: + var2idx[origin_var] = i + + origin_porgram = program.clone() + block = program.global_block() + + program_block_ops = [] + default_ops = {default_device: {}} + heter_ops = {} + block_index = 0 + + current_heter_block_ops = [] + current_default_block_ops = [] + current_heter_device = default_device + is_heter = False + for op in block.ops: + if _is_heter_op(op, current_heter_device, default_device): + # for gpu/xpu-op + is_heter = True + + # for cpu-op block append + if len(current_default_block_ops) > 1: + default_ops[default_device][ + block_index + ] = current_default_block_ops + program_block_ops.append(current_default_block_ops) + current_default_block_ops = [] + block_index += 1 + + if _is_same_device(op, current_heter_device, default_device): + # for gpu-op, gpu-op -> gpu-op,... + current_heter_device = op.attr("op_device") + _append_heter_op(op, current_heter_block_ops, heter_ops) + else: + # for gpu-op -> xpu-op, ... 
+ op_device = current_heter_block_ops[0].attr("op_device") + heter_ops[op_device][block_index] = current_heter_block_ops + program_block_ops.append(current_heter_block_ops) + block_index += 1 + current_heter_block_ops = [] + current_heter_device = op.attr("op_device") + _append_heter_op(op, current_heter_block_ops, heter_ops) + + elif is_heter: + # for gpu/xpu-op -> cpu-op + op_device = current_heter_block_ops[0].attr("op_device") + heter_ops[op_device][block_index] = current_heter_block_ops + program_block_ops.append(current_heter_block_ops) + block_index += 1 + current_heter_block_ops = [] + current_heter_device = default_device + is_heter = False + current_default_block_ops.append(op) + else: + # for cpu-op + current_default_block_ops.append(op) + + if current_default_block_ops != []: + default_ops[default_device][block_index] = current_default_block_ops + program_block_ops.append(current_default_block_ops) + + if current_heter_block_ops != []: + op_device = current_heter_block_ops[0].attr("op_device") + heter_ops[op_device][block_index] = current_heter_block_ops + program_block_ops.append(current_heter_block_ops) + + if len(heter_ops) == 0: + warnings.warn( + "No heterogeneous OP was found in your program , " + " please using paddle.static.device_guard() to run OPs on different device." 
+ ) + + total_heter_ops = 0 + heter_blocks = 0 + for device in heter_ops.keys(): + heter_block_dict = heter_ops[device] + heter_blocks += len(heter_block_dict) + for _, heter_block in heter_block_dict.items(): + total_heter_ops += len(heter_block) + print( + "There are {} OPs in your main_program, and contains {} heter-OPs which is made up of {} heter-blocks.".format( + len(block.ops), total_heter_ops, heter_blocks + ) + ) + + return origin_porgram, heter_ops, default_ops, program_block_ops + + +def create_heter_program( + program, + config, + heter_program, + program_block_ops_list, + heter_ops, + block_var_detail, + current_device, + stage_id, +): + # This function mainly includes the following contents: + # 1. For every heter block: + # a) copy heter device op from origin program + # b) create variables which belong to heter op: + # -> if variable is persistable, clone it in global_scope + # -> if variable is temp, create it in heter block + # c) create communicate related op as follow: + # joint_var.0_1 -> slice -> reshape -> origin_var + # origin_var -> origin_program + # reshape -> concat -> joint_var.1_2 + # d) copy send op from origin program for var@grad which loacted in current heter block + # e) re-check every op in current blcok if its device is not current heter devie + # 2. Create send op for step counter in last heter-block + # 3. Create Listen&Serv OP and Send&Recv OP for distributed training + # 4. 
update CompileTimeStrategy for heter_program + + optimizer_block = [] + grad_to_block_id = [] + send_grad_var_list = [] + + pre_block_idx = heter_program.num_blocks - 1 + stage_id = int(stage_id) + print("stage id", stage_id) + heter_block_ops_forward = program_block_ops_list[stage_id - 1]["forward"] + + heter_block_ops_backward = program_block_ops_list[stage_id - 1]["backward"] + + heter_block = heter_program._create_block(pre_block_idx) + optimizer_block.append(heter_block) + for _, op in enumerate(heter_block_ops_forward): + block_append_op(heter_program, program, heter_block, op) + + entrance_vars = block_var_detail[stage_id - 1]["forward"]["entrance"] + add_vars_by_var_list(entrance_vars, program, heter_program, heter_block) + exit_vars = block_var_detail[stage_id - 1]["forward"]["exit"] + add_vars_by_var_list(exit_vars, program, heter_program, heter_block) + + first_op_index_fp = len(heter_block.ops) + + if stage_id < len(program_block_ops_list): + + heter_block_bp = heter_program._create_block(pre_block_idx) + optimizer_block.append(heter_block_bp) + + for _, op in enumerate(heter_block_ops_backward): + block_append_op(heter_program, program, heter_block_bp, op) + + bp_entrance_vars = block_var_detail[stage_id - 1]["backward"][ + "entrance" + ] + add_vars_by_var_list( + bp_entrance_vars, program, heter_program, heter_block_bp + ) + bp_exit_vars = block_var_detail[stage_id - 1]["backward"]["exit"] + add_vars_by_var_list( + bp_exit_vars, program, heter_program, heter_block_bp + ) + backward_comm_info = get_communicate_var_info( + program, stage_id, bp_entrance_vars, type="backward" + ) + + grad_to_block_id.append( + backward_comm_info["block_input_var_name"] + + ":" + + str(heter_block_bp.idx) + ) + + else: + for _, op in enumerate(heter_block_ops_backward): + block_append_op(heter_program, program, heter_block, op) + + bp_entrance_vars = block_var_detail[stage_id - 1]["backward"][ + "entrance" + ] + add_vars_by_var_list( + bp_entrance_vars, program, 
heter_program, heter_block + ) + bp_exit_vars = block_var_detail[stage_id - 1]["backward"]["exit"] + add_vars_by_var_list(bp_exit_vars, program, heter_program, heter_block) + + heter_block_bp = heter_block + + forward_comm_info = get_communicate_var_info( + program, stage_id, entrance_vars, type="forward" + ) + + grad_to_block_id.append( + forward_comm_info["block_input_var_name"] + ":" + str(heter_block.idx) + ) + + first_op_index_bp = len(heter_block_bp.ops) + + if stage_id <= len(block_var_detail) - 1: + static_var = insert_communicate_op( + program, + config, + heter_block, + stage_id, + first_op_index_fp, + block_var_detail, + current_device, + ) + static_var_bp = insert_communicate_op( + program, + config, + heter_block_bp, + stage_id, + first_op_index_bp, + block_var_detail, + current_device, + False, + ) + + # add send op + send_grad_var_list = add_heter_send_op( + program, heter_program, heter_block_bp, block_var_detail[stage_id - 1] + ) + + # --------------- + # add step conter + send_input_vars = [] + dummy_output = [] + pserver_endpoints = config.get_ps_endpoints() + + # optimizer_block[-1].append_op( + # type="send", + # inputs={"X": send_input_vars}, + # outputs={"Out": dummy_output}, + # attrs={ + # "send_varnames": [STEP_COUNTER], + # "merge_add": True, + # "use_send_handler": False, + # "endpoints": pserver_endpoints + # }) + + # add info in listen&serv + attrs = { + # "mode": "sync", + # "trainers": config.get_trainers(), + # "trainer_id": config.get_role_id() + config.get_trainers(), + "message_to_block_id": grad_to_block_id, + "optimize_blocks": optimizer_block, + # runtime attribute + "endpoint": config.get_heter_worker_endpoint(), + "fanin": len(config.get_previous_stage_trainers()), + "pserver_id": config.get_role_id(), + "distributed_mode": config.get_distributed_mode(), + "rpc_exec_thread_num": int(os.getenv("CPU_NUM", 32)), + RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE, + } + # append the listen_and_serv op + 
heter_program.global_block().append_op( + type="heter_listen_and_serv", inputs={'X': []}, outputs={}, attrs=attrs + ) + check_heter_compile_time_strategy(program, config, send_grad_var_list) + + +def check_heter_compile_time_strategy(program, config, send_grad_var_list): + origin_grad_var_list = [] + for _, var_grad in config.merged_variables_pairs: + origin_grad_var_list.append(var_grad.merged_var.name) + + origin_grad_var_list = list(set(origin_grad_var_list)) + send_grad_var_list = list(set(send_grad_var_list)) + useless_grad_var_list = list( + set(origin_grad_var_list) - set(send_grad_var_list) + ) + + for useless_grad_var in useless_grad_var_list: + config.remove_var_pair_by_grad(useless_grad_var) + + +def create_trainer_program( + program, origin_program, config, program_block_ops_list, block_var_detail +): + # This function mainly includes the following contents: + # 1. For every heter block in origin program + # a) delete heter op and related variables + # b) add send&recv op + # c) add communicate ops as follows: + # origin_var -> reshape -> concat -> joint_var.0_1 + # send&recv op(send joint_var.0_1; recv joint_var.1_2) + # joint_var.1_2 -> slice -> reshape -> origin_var + # d) remove send op which related var@grad is not in trainer program + # 2. 
check every op's device + static_var = [] + for heter_block_index in range(1, len(program_block_ops_list)): + ops_list = ( + program_block_ops_list[heter_block_index]["forward"] + + program_block_ops_list[heter_block_index]["backward"] + ) + static_var += replace_ops_by_communicate_op( + program, config, heter_block_index, ops_list, block_var_detail + ) + remove_trainer_send_op( + program, config, heter_block_index, block_var_detail + ) + + optimizer_block = [] + grad_to_block_id = [] + + bp_ops_list = program_block_ops_list[0]["backward"] + delete_same_ops(program.global_block(), bp_ops_list) + delete_trainer_useless_var(config, program, static_var) + backward_block = create_backward_block( + program, origin_program, config, bp_ops_list, block_var_detail + ) + + bp_entrance_vars = block_var_detail[0]["backward"]["entrance"] + backward_comm_info = get_communicate_var_info( + origin_program, 1, bp_entrance_vars, type="backward" + ) + + grad_to_block_id.append( + backward_comm_info["block_input_var_name"] + + ":" + + str(backward_block.idx) + ) + optimizer_block.append(backward_block) + + attrs = { + # "mode": "sync", + # "trainers": config.get_trainers(), + # "trainer_id": config.get_role_id(), + "message_to_block_id": grad_to_block_id, + "optimize_blocks": optimizer_block, + # runtime attribute + "endpoint": config.get_trainer_endpoint(), # get trainer endpoint + "fanin": 0, # get heter worker + "pserver_id": config.get_role_id(), + "distributed_mode": config.get_distributed_mode(), + "rpc_exec_thread_num": int(os.getenv("CPU_NUM", 32)), + RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE, + } + # append the listen_and_serv op + program.global_block()._insert_op( + index=0, + type="heter_listen_and_serv", + inputs={'X': []}, + outputs={}, + attrs=attrs, + ) + + # TODO add check for bp block + check_op_device(program.global_block(), DEFAULT_DEVICE) + + +def insert_communicate_op( + orign_program, + config, + heter_block, + stage_id, + first_op_index, + block_var_detail, 
+ device, + is_forward=True, +): + + if is_forward: + next_heter_worker_endpoints = config.get_next_stage_trainers() + previous_heter_worker_endpoints = config.get_previous_stage_trainers() + entrance_var = block_var_detail[stage_id]["forward"]["entrance"] + comm_info = get_communicate_var_info( + orign_program, stage_id + 1, entrance_var + ) + + else: + next_heter_worker_endpoints = config.get_next_stage_trainers() + # if next_heter_worker_endpoints == "": + # next_heter_worker_endpoints = [] + previous_heter_worker_endpoints = config.get_previous_stage_trainers() + entrance_var = block_var_detail[stage_id - 1]["backward"]["exit"] + comm_info = get_communicate_var_info( + orign_program, stage_id - 1, entrance_var, "backward" + ) + + heter_block._insert_op( + index=first_op_index, + type="send_and_recv", + inputs={"X": heter_block.vars[entrance_var[0]]}, + outputs={"Out": []}, + attrs={ + "mode": "forward" if is_forward else "backward", + "send_var_name": entrance_var + ["microbatch_id"], + "recv_var_name": [], + "message_name": comm_info["block_input_var_name"], + "next_endpoints": next_heter_worker_endpoints, + "previous_endpoints": previous_heter_worker_endpoints, + "trainer_id": config.get_role_id(), + "op_device": device, + RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE, + }, + ) + + return entrance_var + + +def create_backward_block( + program, origin_program, config, bp_ops_list, block_var_detail +): + pre_block_idx = program.num_blocks - 1 + heter_block = program._create_block(pre_block_idx) + + for _, op in enumerate(bp_ops_list): + if op.type == "send": + send_varnames = op.attr('send_varnames') + is_skip = False + for varname in send_varnames: + if ( + varname not in program.global_block().vars + and varname not in heter_block.vars + ): + is_skip = True + break + if is_skip is True: + continue + block_append_op(program, origin_program, heter_block, op) + + entrance_vars = block_var_detail[0]["backward"]["entrance"] + add_vars_by_var_list(entrance_vars, 
origin_program, program, heter_block) + exit_vars = block_var_detail[0]["backward"]["exit"] + add_vars_by_var_list(exit_vars, origin_program, program, heter_block) + return heter_block + + +def replace_ops_by_communicate_op( + program, config, heter_block_index, ops_list, block_var_detail +): + all_op = program.global_block().ops + start_op = ops_list[0] + first_op_idx = -1 + for op in all_op: + if is_same_op(op, start_op): + first_op_idx = all_op.index(op) + break + assert first_op_idx != -1 + delete_same_ops(program.global_block(), ops_list) + + entrance_var = [] + + if heter_block_index == 1: + mode = config.get_distributed_mode() + next_heter_worker_endpoints = config.get_next_stage_trainers() + + entrance_var = block_var_detail[heter_block_index]["forward"][ + "entrance" + ] + + comm_info = get_communicate_var_info( + program, heter_block_index + 1, entrance_var + ) + program.global_block()._insert_op( + index=first_op_idx, + type="send_and_recv", + inputs={"X": program.global_block().vars[entrance_var[0]]}, + outputs={"Out": []}, + attrs={ + "mode": "forward", + "send_var_name": entrance_var + ["microbatch_id"], + "recv_var_name": [], + "message_name": comm_info["block_input_var_name"], + "next_endpoints": next_heter_worker_endpoints, + "previous_endpoints": [], + "trainer_id": config.get_role_id(), + RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE, + }, + ) + + return entrance_var + + +def remove_trainer_send_op( + program, config, heter_block_index, block_var_detail +): + + # if trainer do FF->BP->SEND, it has follow vars: var, var@GRAD + # if trainer only do SEND, it has one var: var@GRAD + # Delete Send op ,if trainer doesn't has pair var (var<->var@GRAD) + persistables = ( + block_var_detail[heter_block_index]["forward"]["persistables"] + + block_var_detail[heter_block_index]["backward"]["persistables"] + ) + need_remove_send_op = [] + need_remove_grad_var = [] + for op in find_send_op(program): + input_list, _ = find_op_input_output( + program, 
program.global_block(), op + ) + for var_name in input_list: + origin_var_name = var_name.split("@GRAD")[0] + if origin_var_name in persistables: + need_remove_send_op.append(op) + need_remove_grad_var.append(var_name) + need_remove_send_op = list(set(need_remove_send_op)) + delete_ops(program.global_block(), need_remove_send_op) + for grad_var_name in need_remove_grad_var: + config.remove_var_pair_by_grad(grad_var_name) + + +def add_heter_send_op(program, heter_program, block, block_var_detail): + def _get_send_op_dict(): + send_op_dict = {} + send_op_list = find_send_op(program) + for op in send_op_list: + input_list, _ = find_op_input_output( + program, program.global_block(), op + ) + for var in input_list: + send_op_dict[var] = op + return send_op_dict + + # send_Op = { inputs{'X':[]}, + # outputs{'Out':dummy_output}, + # attrs{'send_varnames'"[]", + # 'is_sparse':int, + # 'table_id':int } } + send_grad_var_list = [] + send_op_dict = _get_send_op_dict() + table_dict = {} + for persistable_var in block_var_detail["backward"]["persistables"]: + # check var_name == var@GRAD + if "@GRAD" not in persistable_var: + continue + if "GRAD" != persistable_var.split("@")[-1]: + continue + if persistable_var not in send_op_dict: + continue + send_op = send_op_dict[persistable_var] + is_sparse = send_op.attr('is_sparse') + table_id = send_op.attr('table_id') + send_varnames = send_op.attr('send_varnames') + send_grad_var_list.append(persistable_var) + if table_id not in table_dict: + table_dict[table_id] = {} + table_dict[table_id]['var_list'] = [] + table_dict[table_id]['is_sparse'] = is_sparse + table_dict[table_id]['send_varnames'] = send_varnames + table_dict[table_id]['var_list'].append(persistable_var) + + for table_id in table_dict: + dummy_output = block.create_var( + name=framework.generate_control_dev_var_name() + ) + send_input_vars = [ + block.vars[union_var] + for union_var in table_dict[table_id]['var_list'] + ] + block.append_op( + type="send", + inputs={"X": 
send_input_vars}, + outputs={"Out": dummy_output}, + attrs={ + "send_varnames": table_dict[table_id]['send_varnames'], + "is_sparse": is_sparse, + "table_id": table_id, + RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE, + }, + ) + + return send_grad_var_list + + +def find_send_op(program): + send_op_list = [] + for op in program.global_block().ops: + if op.type == "send": + send_op_list.append(op) + return send_op_list + + +def get_communicate_var_info( + program, block_index, entrance_var_list, type="forward" +): + input_var_reshape_dim = [] + input_var_reshape_name = [] + + if type == "forward": + block_input_var_name = "forward_joint_{}_{}@Heter".format( + block_index - 1, block_index + ) + else: + block_input_var_name = "backward_joint_{}_{}@Heter".format( + block_index + 1, block_index + ) + + entrance_var_list.sort() + # input + # Heter_SERVER_BLOCK_index@JOINT_VAR -> slice -> var@Heter_SERVER_BLOCK@INPUT_RESHAPE_VAR -> reshape -> var + for name in entrance_var_list: + var = program.global_block().vars[name] + shape = var.shape + # if len(shape) < 2 or shape[0] != -1: + # raise ValueError( + # "Variable {} not support heter training. its shape is {}". + # format(name, shape)) + recv_var_dim = -1 * reduce(lambda x, y: x * y, shape) + input_var_reshape_dim.append(recv_var_dim) + input_var_reshape_name.append("{}.input_reshape@Heter".format(name)) + + # output + # var -> reshape -> var@Heter_SERVER_BLOCK@INPUT_RESHAPE_VAR -> concat -> Heter_SERVER_BLOCK_index@JOINT_VAR + # for var_name in exit_var_list: + # var = program.global_block().vars[var_name] + # shape = var.shape + # # if len(shape) < 2 or shape[0] != -1: + # # raise ValueError( + # # "Variable {} not support heter training. its shape is {}". 
+ # # format(var_name, shape)) + # send_reshape_dim = -1 * reduce(lambda x, y: x * y, shape) + # output_var_reshape_dim.append(send_reshape_dim) + # output_var_reshape_name.append("{}.output_reshape@Heter".format( + # var_name)) + + info = { + "input_var_reshape_dim": input_var_reshape_dim, + "input_var_reshape_name": input_var_reshape_name, + "block_input_var_name": block_input_var_name, + # "output_var_reshape_dim": output_var_reshape_dim, + # "output_var_reshape_name": output_var_reshape_name, + # "block_output_var_name": block_output_var_name + } + + return info + + +def union_forward_gradient_op(program_block_ops_list): + """ + before analyzing the input & output of each block in program_block_list, we should + union the forward op and corresponding gradient op to elimincate the unnecessary variable + transmit + """ + """ + fix for 2emb model, re-place sum op + + """ + block_length = len(program_block_ops_list) + ''' + ## get the final part + final_part_idx = -1 + for i in range(block_length): + op_list = program_block_ops_list[i] + for op in op_list: + if "_grad" in op.type: + final_part_idx = i + break + if final_part_idx != -1: + break + + ## eliminate wrong partition because of sum op + ## lookup_table_v2_grad + ## every looup_table_v2_grad op block should follow a sum op + var2idx = {} + + for i in range(final_part_idx, block_length): + op_list = program_block_ops_list[i] + for j in range(len(op_list) - 1, -1, -1): + op = op_list[j] + #if op.type == "lookup_table_v2_grad": + # if j < len(op_list) - 1): + # else: + # ## get var and record place + if _grad in op.type: + forward_op_type = op.type.split("_grad")[0] + if forward_op_type in SPARSE_OP_TYPE_DICT.keys() \ + and op.attr('remote_prefetch') is True: + param_name = op.input(SPARSE_OP_TYPE_DICT[forward_op_type])[0] + + var2idx[] = [i,j] ## + + ''' + + union_program_block_ops_list = [] + assert ( + block_length % 2 != 0 + ), "the length of program_block_ops_list should be odd" + for i in range(0, 
block_length // 2): + block_op_list = {"forward": program_block_ops_list[i]} + block_op_list.update( + {"backward": program_block_ops_list[block_length - 1 - i]} + ) + union_program_block_ops_list.append(block_op_list) + + block_op_list = {"forward": [], "backward": []} + for op in program_block_ops_list[block_length // 2]: + if "_grad" not in op.type and not (op.type == "sum"): + block_op_list["forward"].append(op) + else: + block_op_list["backward"].append(op) + union_program_block_ops_list.append(block_op_list) + return union_program_block_ops_list + + +def find_block_joints(program, program_block_ops_list, heter_ops): + block_var_detail = find_entrance_exit_private( + program, program_block_ops_list + ) + block_var_detail = entrance_exit_check( + program, program_block_ops_list, block_var_detail, heter_ops + ) + block_var_detail = delete_block_useless_exit( + program, program_block_ops_list, block_var_detail + ) + + return block_var_detail + + +def find_entrance_exit_private(program, program_block_ops_list): + block_var_detail = [] + persistables = [] + for index, block_op_list in enumerate(program_block_ops_list): + # forward + block_input, block_output = find_ops_list_input_output( + program, block_op_list["forward"] + ) + persistables = screen_persistables( + program, block_input + ) + screen_persistables(program, block_output) + # find entrance & exit + block_private_vars = list(set(block_input) & set(block_output)) + block_entrance = list(set(block_input) - set(block_private_vars)) + block_exit = list(set(block_output) - set(block_private_vars)) + detail = { + "forward": { + "entrance": block_entrance, + "exit": block_exit, + "private": block_private_vars, + "persistables": persistables, + } + } + + # backward + bp_block_input, bp_block_output = find_ops_list_input_output( + program, block_op_list["backward"] + ) + bp_persistables = screen_persistables( + program, bp_block_input + ) + screen_persistables(program, bp_block_output) + # find entrance & exit + 
bp_block_private_vars = list(set(bp_block_input) & set(bp_block_output)) + bp_block_entrance = list( + set(bp_block_input) - set(bp_block_private_vars) + ) + bp_block_exit = list(set(bp_block_output) - set(bp_block_private_vars)) + detail.update( + { + "backward": { + "entrance": bp_block_entrance, + "exit": bp_block_exit, + "private": bp_block_private_vars, + "persistables": bp_persistables, + } + } + ) + block_var_detail.append(detail) + return block_var_detail + + +def entrance_exit_check( + program, program_block_ops_list, block_var_detail, heter_ops +): + for index in range(len(block_var_detail) - 1, -1, -1): + if index - 1 < 0: + break + previous_block_exit = block_var_detail[index - 1]["forward"]["exit"] + previous_block_exit.sort() + current_block_entrance = block_var_detail[index]["forward"]["entrance"] + + backward_entrance = block_var_detail[index]["backward"]["entrance"] + + forward_all = ( + block_var_detail[index]["forward"]["entrance"] + + block_var_detail[index]["forward"]["private"] + + block_var_detail[index]["forward"]["exit"] + ) + + for var in backward_entrance: + if not ("@GRAD" in var) and not (var in forward_all): + current_block_entrance.append(var) + + current_block_entrance.sort() + + if previous_block_exit == current_block_entrance: + continue + exist_vars = list( + set(previous_block_exit) & set(current_block_entrance) + ) + need_add_vars = list(set(current_block_entrance) - set(exist_vars)) + # var in different stage should not be ignored, since they are not placed in the same program & device + # need_add_vars = find_need_var_from_previous_block( + # need_add_vars, block_var_detail, index, heter_ops) + + previous_block_private = block_var_detail[index - 1]["forward"][ + "private" + ] + previous_block_entrance = block_var_detail[index - 1]["forward"][ + "entrance" + ] + for var in need_add_vars: + if ( + var not in previous_block_private + and var not in previous_block_entrance + ): + previous_block_entrance.append(var) + 
previous_block_exit.append(var) + if var not in current_block_entrance: + current_block_entrance.append(var) + + for index in range(0, len(block_var_detail) - 1, 1): + previous_block_exit = block_var_detail[index + 1]["backward"]["exit"] + previous_block_exit.sort() + current_block_entrance = block_var_detail[index]["backward"]["entrance"] + + current_block_entrance.sort() + + if previous_block_exit == current_block_entrance: + continue + exist_vars = list( + set(previous_block_exit) & set(current_block_entrance) + ) + need_add_vars = list(set(current_block_entrance) - set(exist_vars)) + need_ignore_vars = [] + for var in need_add_vars: + if "@GRAD" not in var: + need_ignore_vars.append(var) + need_add_vars = list( + set(need_add_vars).difference(set(need_ignore_vars)) + ) + previous_block_private = block_var_detail[index + 1]["backward"][ + "private" + ] + previous_block_entrance = block_var_detail[index + 1]["backward"][ + "entrance" + ] + for var in need_add_vars: + if ( + var not in previous_block_private + and var not in previous_block_entrance + ): + previous_block_entrance.append(var) + previous_block_exit.append(var) + return block_var_detail + + +def find_need_var_from_previous_block( + need_add_vars, block_var_detail, current_index, heter_ops +): + # create index_device_map + index_device_map = {} + for index in range(len(block_var_detail)): + index_device_map[index] = DEFAULT_DEVICE + for device in heter_ops: + for index in heter_ops[device].keys(): + if index < len(block_var_detail): + index_device_map[index] = device + + pre_index = current_index - 1 + need_ignore_var = [] + + # if need_add_var in current device, no need communicate + for var in need_add_vars: + while pre_index >= 0: + previous_block_private = block_var_detail[pre_index]["private"] + previous_block_exit = block_var_detail[pre_index]["exit"] + previous_block_entrance = block_var_detail[pre_index]["entrance"] + total_var = ( + previous_block_private + + previous_block_exit + + 
previous_block_entrance + ) + if var in total_var: + if ( + index_device_map[current_index] + == index_device_map[pre_index] + and index_device_map[current_index] == DEFAULT_DEVICE + ): + need_ignore_var.append(var) + break + pre_index -= 1 + + need_add_vars = list(set(need_add_vars).difference(set(need_ignore_var))) + return need_add_vars + + +def delete_block_useless_exit( + program, program_block_ops_list, block_var_detail +): + # forward + for index in range(len(block_var_detail)): + if index == len(block_var_detail) - 1: + break + current_block_exit = block_var_detail[index]["forward"]["exit"] + next_block_entrance = block_var_detail[index + 1]["forward"]["entrance"] + need_delete_var = [] + for var in current_block_exit: + if var not in next_block_entrance: + need_delete_var.append(var) + + for var in need_delete_var: + current_block_exit.remove(var) + # backward + for index in range(len(block_var_detail) - 1, -1, -1): + if index - 1 < 0: + break + current_block_exit = block_var_detail[index]["backward"]["exit"] + next_block_entrance = block_var_detail[index - 1]["backward"][ + "entrance" + ] + need_delete_var = [] + for var in current_block_exit: + if var not in next_block_entrance: + need_delete_var.append(var) + for var in need_delete_var: + current_block_exit.remove(var) + + return block_var_detail + + +def check_op_device(block, device): + for op in block.ops: + op._set_attr('op_device', device) + + +def screen_persistables(program, var_list): + need_remove = [] + for var_name in var_list: + if "@GRAD" in var_name: + if "GRAD" != var_name.split("@")[-1]: + continue + origin_var_name = var_name.split("@GRAD")[0] + var = program.global_block().vars[origin_var_name] + else: + var = program.global_block().vars[var_name] + + if paddle.static.is_persistable(var): + need_remove.append(var_name) + + for var_name in need_remove: + var_list.remove(var_name) + return need_remove + + +def insert_reshape_op( + program, block, index, var_name, new_var_name, 
new_var_shape=None +): + input_var = block.vars[var_name] + + if new_var_name not in block.vars: + out = block.create_var( + name=new_var_name, + shape=new_var_shape, + dtype=input_var.dtype, + type=input_var.type, + ) + else: + out = block.vars[new_var_name] + new_var_shape = out.shape + + x_shape = block.create_var( + name="{}.xshape@Heter".format(var_name), dtype=input_var.dtype + ) + block._insert_op( + index=index, + type="reshape2", + inputs={"X": input_var}, + attrs={'shape': new_var_shape}, + outputs={"Out": out, "XShape": x_shape}, + ) + + +def insert_send_concat_op( + program, block, index, var_name_list, new_var_name, new_var_shape +): + input_var_list = [block.vars[var_name] for var_name in var_name_list] + + out = program.global_block().create_var( + name=new_var_name, + shape=new_var_shape, + dtype=input_var_list[0].dtype, + type=input_var_list[0].type, + ) + + block._insert_op( + index=index, + type='concat', + inputs={"X": input_var_list}, + outputs={'Out': [out]}, + attrs={'axis': -1, 'use_stack': False}, + ) + + +def insert_recv_slice_op( + program, + block, + index, + var_name, + var_shape, + dtype, + type, + new_var_name_list, + new_var_shape_list, +): + if var_name not in program.global_block().vars: + input_var = program.global_block().create_var( + name=var_name, shape=var_shape, dtype=dtype, type=type + ) + else: + input_var = program.global_block().vars[var_name] + + out_list = [] + for i in range(len(new_var_name_list)): + if new_var_name_list[i] not in block.vars: + out = block.create_var( + name=new_var_name_list[i], + shape=new_var_shape_list[i], + dtype=input_var.dtype, + type=input_var.type, + ) + else: + out = block.vars[new_var_name_list[i]] + out_list.append(out) + + start_index = 0 + end_index = 0 + for i in range(len(new_var_name_list)): + starts = [] + ends = [] + attrs = {'axes': [1]} + end_index += new_var_shape_list[i][1] + starts.append(start_index) + ends.append(end_index) + attrs['starts'] = starts + attrs['ends'] = ends + 
+ block._insert_op( + index=index, + type='slice', + inputs={'Input': input_var}, + attrs=attrs, + outputs={'Out': out_list[i]}, + ) + start_index = end_index + index += 1 + + +def add_heter_trainer_useful_vars( + config, program, heter_program, heter_block, static_var +): + static_var = list(set(static_var)) + for var_name in static_var: + if ( + var_name not in heter_program.global_block().vars + and var_name not in heter_block.vars + ): + var = program.global_block().vars[var_name] + if var.persistable: + heter_program.global_block()._clone_variable( + var, force_persistable=False + ) + else: + heter_block._clone_variable(var, force_persistable=False) + + +def delete_trainer_useless_var(config, program, static_var): + static_var = list(set(static_var)) + program_useful_var_list = [] + for op in program.global_block().ops: + input_var_list, output_var_list = find_op_input_output( + program, program.global_block(), op + ) + op_var_list = list(set(input_var_list).union(set(output_var_list))) + program_useful_var_list = list( + set(program_useful_var_list).union(set(op_var_list)) + ) + program_useful_var_list += static_var + program_useless_var_list = list( + set(get_vars_name_in_block(program.global_block())).difference( + set(program_useful_var_list) + ) + ) + for var in program_useless_var_list: + program.global_block()._remove_var(var) + return program_useless_var_list + + +def block_append_op(program, origin_program, block, op): + + merge_ordereddict = origin_program.global_block().vars.copy() + merge_ordereddict.update(block.vars) + inputs = _get_input_map_from_op(merge_ordereddict, op) + for key, varlist in inputs.items(): + if not isinstance(varlist, list): + varlist = [varlist] + for var in varlist: + if ( + var.name not in program.global_block().vars + and var.name not in block.vars + ): + if var.persistable: + program.global_block()._clone_variable( + var, force_persistable=False + ) + else: + block._clone_variable(var, force_persistable=False) + + 
outputs = _get_output_map_from_op(origin_program.global_block().vars, op) + for key, varlist in outputs.items(): + if not isinstance(varlist, list): + varlist = [varlist] + for var in varlist: + if ( + var.name not in program.global_block().vars + and var.name not in block.vars + ): + if var.persistable: + program.global_block()._clone_variable( + var, force_persistable=False + ) + else: + block._clone_variable(var, force_persistable=False) + + if "_grad" not in op.type: + # for forward op + return block.append_op( + type=op.type, inputs=inputs, outputs=outputs, attrs=op.all_attrs() + ) + else: + # for grad op + op_desc = op.desc + op_role_attr_name = core.op_proto_and_checker_maker.kOpRoleAttrName() + backward = core.op_proto_and_checker_maker.OpRole.Backward + device_attr_name = core.op_proto_and_checker_maker.kOpDeviceAttrName() + + # append grad op + new_op_desc = block.desc.append_op() + new_op_desc.copy_from(op_desc) + new_op_desc._set_attr(op_role_attr_name, backward) + + # set device gard + if op.desc.has_attr(device_attr_name): + op_device = op_desc.attr(device_attr_name) + new_op_desc._set_attr(device_attr_name, op_device) + block._sync_with_cpp() + + +def add_vars_by_var_list(var_name_list, origin_program, program, block): + for var_name in var_name_list: + if ( + var_name not in program.global_block().vars + and var_name not in block.vars + ): + var = origin_program.global_block().vars[var_name] + if var.persistable: + program.global_block()._clone_variable( + var, force_persistable=False + ) + else: + block._clone_variable(var, force_persistable=False) + + +def get_varlist_from_op_map(var_map): + var_list = [] + for key, varlist in var_map.items(): + if not isinstance(varlist, list): + varlist = [varlist] + for i in range(len(varlist)): + var = varlist[i] + var_list.append(var.name) + return var_list + + +def find_ops_list_input_output(program, ops_list): + input_var_list = [] + output_var_list = [] + for op in ops_list: + inputs = 
_get_input_map_from_op(program.global_block().vars, op) + input_var_list += get_varlist_from_op_map(inputs) + outputs = _get_output_map_from_op(program.global_block().vars, op) + output_var_list += get_varlist_from_op_map(outputs) + + input_var_list = list(set(input_var_list)) + output_var_list = list(set(output_var_list)) + return input_var_list, output_var_list + + +def find_op_input_output(program, block, op): + input_var_list = [] + output_var_list = [] + inputs = _get_input_map_from_op(block.vars, op) + input_var_list += get_varlist_from_op_map(inputs) + outputs = _get_output_map_from_op(block.vars, op) + output_var_list += get_varlist_from_op_map(outputs) + input_var_list = list(set(input_var_list)) + output_var_list = list(set(output_var_list)) + return input_var_list, output_var_list + + +def get_vars_name_in_block(block): + vars_list = block.vars.keys() + vars_name_list = [var_name for var_name in vars_list] + return vars_name_list + + +def is_same_op(op1, op2): + if str(op1) != str(op2): + return False + return True + + +def _get_input_map_from_op(varmap, op): + """Returns a dict from op input name to the vars in varmap.""" + iomap = collections.OrderedDict() + for key in op.input_names: + vars = [] + for varname in op.input(key): + if varname == "@EMPTY@": + continue + if "lod_tensor_blocking_queue" in varname: + continue + vars.append(varmap[varname]) + if len(vars) == 1: + iomap[key] = vars[0] + else: + iomap[key] = vars + return iomap + + +def _get_output_map_from_op(varmap, op): + """Returns a dict from op output name to the vars in varmap.""" + iomap = collections.OrderedDict() + for key in op.output_names: + vars = [] + for varname in op.output(key): + if varname == "@EMPTY@": + continue + if "lod_tensor_blocking_queue" in varname: + continue + vars.append(varmap[varname]) + if len(vars) == 1: + iomap[key] = vars[0] + else: + iomap[key] = vars + return iomap + + +def delete_same_ops(block, ops): + for op in ops: + try: + for origin_op in block.ops: 
+ if is_same_op(origin_op, op): + idx = list(block.ops).index(origin_op) + block._remove_op(idx) + break + except Exception as e: + print(e) diff --git a/python/paddle/incubate/fleet/parameter_server/ir/ufind.py b/python/paddle/incubate/fleet/parameter_server/ir/ufind.py new file mode 100644 index 0000000000000000000000000000000000000000..5b22d008b779a86f7d96c628b3df9b17d7bc05f7 --- /dev/null +++ b/python/paddle/incubate/fleet/parameter_server/ir/ufind.py @@ -0,0 +1,64 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +class UnionFind: + """Union-find data structure. + + Union-find is a data structure that keeps track of a set of elements partitioned + into a number of disjoint (non-overlapping) subsets. + + Reference: + https://en.wikipedia.org/wiki/Disjoint-set_data_structure + + Args: + elements(list): The initialize element list. 
+    """
+
+    def __init__(self, elementes=None):
+        self._parents = []  # index -> parent index
+        self._index = {}  # element -> index
+        self._curr_idx = 0
+        if not elementes:
+            elementes = []
+        for ele in elementes:
+            self._parents.append(self._curr_idx)
+            self._index.update({ele: self._curr_idx})
+            self._curr_idx += 1
+
+    def find(self, x):
+        # Find the root index of given element x,
+        # execute the path compression while finding the root index
+        if x not in self._index:
+            return -1
+        idx = self._index[x]
+        while idx != self._parents[idx]:
+            t = self._parents[idx]
+            self._parents[idx] = self._parents[t]
+            idx = t
+        return idx
+
+    def union(self, x, y):
+        # Union two given elements
+        x_root = self.find(x)
+        y_root = self.find(y)
+
+        if x_root == y_root:
+            return
+        self._parents[x_root] = y_root
+
+    def is_connected(self, x, y):
+        # If two given elements have the same root index,
+        # then they are connected.
+        return self.find(x) == self.find(y)
diff --git a/python/paddle/incubate/fleet/parameter_server/ir/vars_metatools.py b/python/paddle/incubate/fleet/parameter_server/ir/vars_metatools.py
new file mode 100644
index 0000000000000000000000000000000000000000..090be625cb2602306783c63adc7ddcf7a7c1d5f0
--- /dev/null
+++ b/python/paddle/incubate/fleet/parameter_server/ir/vars_metatools.py
@@ -0,0 +1,231 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from functools import reduce + +from paddle.framework import core +from paddle.framework.io import Variable + +dtype_to_size = { + core.VarDesc.VarType.FP16: 2, + core.VarDesc.VarType.FP32: 4, + core.VarDesc.VarType.FP64: 8, + core.VarDesc.VarType.INT16: 2, + core.VarDesc.VarType.INT32: 4, + core.VarDesc.VarType.INT64: 8, + core.VarDesc.VarType.BOOL: 1, + core.VarDesc.VarType.UINT8: 1, +} + + +class VarBlock: + def __init__(self, varname, offset, size): + self.varname = varname + # NOTE: real offset is offset * size + self.offset = offset + self.size = size + + def __str__(self): + return "%s:%d:%d" % (self.varname, self.offset, self.size) + + +def create_var_struct(var): + if var.type == core.VarDesc.VarType.SELECTED_ROWS: + lod_level = None + elif var.type == core.VarDesc.VarType.LOD_TENSOR: + lod_level = var.lod_level + else: + raise ValueError("can only support SELECTED_ROWS/LOD_TENSOR now") + + return VarStruct( + var.name, var.shape, var.dtype, var.type, lod_level, var.persistable + ) + + +class VarStruct: + """ + record part properties of a Variable in python. + """ + + def __init__(self, name, shape, dtype, type, lod_level, persistable): + self.name = name + self.shape = shape + self.dtype = dtype + self.type = type + self.lod_level = lod_level + self.persistable = persistable + self.m_size = 1 + self.m_size = reduce(lambda x, y: x * y, shape) + self.m_size *= dtype_to_size[dtype] + + def __str__(self): + return "N: {}, S: {}, D: {}, T: {}, LL: {}, P: {}, M: {}".format( + self.name, + self.shape, + self.dtype, + self.type, + self.lod_level, + self.persistable, + self.m_size, + ) + + +class VarDistributed: + """ + a class to record the var distributed on parameter servers. + the class will record the relationship between origin var and slice var. + the slice var's properties, such as type/shape/offset/endpoint. 
+    """
+
+    def __init__(
+        self,
+        origin_var,
+        slice_var,
+        is_slice=None,
+        block_id=None,
+        offset=None,
+        vtype=None,
+        endpoint=None,
+    ):
+        """
+        Args:
+            origin_var(Variable|VarStruct): origin var properties
+            slice_var(Variable|VarStruct): slice var properties
+            is_slice(bool|None): slice or not, slice_var=True/False and its block size > 8192 are the judgement standard.
+            block_id(int|None): the number about the slice var.
+            offset(int|None): if the slice var is sliced, offset is the numel before the var.
+            vtype(str|None): a tag, such as Optimizer/Param/RemotePrefetch.
+            endpoint(str|None): which parameter server the slice var is on, such as "127.0.0.1:1001"
+        """
+
+        if isinstance(origin_var, Variable):
+            self.origin = create_var_struct(origin_var)
+        else:
+            self.origin = origin_var
+
+        if isinstance(slice_var, Variable):
+            self.slice = create_var_struct(slice_var)
+        else:
+            self.slice = slice_var
+
+        if self.equal(self.origin, self.slice):
+            self.is_slice = False
+            self.block_id = 0
+            self.offset = 0
+        else:
+            self.is_slice = True
+            self.block_id = 0
+            self.offset = 0
+
+        if is_slice is not None:
+            self.is_slice = is_slice
+        if block_id is not None:
+            self.block_id = block_id
+        if offset is not None:
+            self.offset = offset
+
+        self.vtype = vtype
+        self.endpoint = endpoint
+
+    @staticmethod
+    def equal(var1, var2):
+        """
+        the two vars are equal or not.
+ Returns: + bool: equal will return True else False + """ + assert isinstance(var1, VarStruct) and isinstance(var2, VarStruct) + + return ( + var1.name == var2.name + and var1.type == var2.type + and var1.shape == var2.shape + and var1.dtype == var2.dtype + and var1.lod_level == var2.lod_level + and var1.persistable == var2.persistable + ) + + def __str__(self): + origin_var_str = ( + "{name} : fluid.{type}.shape{shape}.astype({dtype})".format( + name=self.origin.name, + type=self.origin.type, + shape=self.origin.shape, + dtype=self.origin.dtype, + ) + ) + + slice_var_str = ( + "{name} : fluid.{type}.shape{shape}.astype({dtype})" + ".slice({is_slice}).block({block_id}).offset({offset})".format( + name=self.slice.name, + type=self.slice.type, + shape=self.slice.shape, + dtype=self.slice.dtype, + is_slice=self.is_slice, + block_id=self.block_id, + offset=self.offset, + ) + ) + + return "var owned: {}, origin var: ( {} ), slice var: ( {} ), endpoint: {} ".format( + self.vtype, origin_var_str, slice_var_str, self.endpoint + ) + + +class VarsDistributed: + """ + a gather about VarDistributed with many methods to find distributed vars. + through the class, we can get overview about the distributed parameters on parameter servers. + this class may centralized and convenient for developer to manage and get variable's distribute. + other module can also use this to find variables such io.py. + """ + + def __init__(self): + self.distributed_vars = [] + + def add_distributed_var( + self, + origin_var, + slice_var, + is_slice=None, + block_id=None, + offset=None, + vtype=None, + endpoint=None, + ): + """ + add distributed var in this. + + Args: + origin_var(Variable|VarStruct): origin var properties + slice_var(Variable|VarStruct): slice var properties + is_slice(bool|None): slice or not, slice_var=True/False and its block size > 8192 are the judgement standard. + block_id(int|None): the number about the slice var. 
+ offset(int|None): if the slice var is sliced, offset is the numel before the var. + vtype(str|None): a tag, such as Optimizer/Param/RemoteProfetch. + endpoint(str|None): which parameter the slice var on, such as "127.0.0.1:1001" + Returns: + None + """ + self.distributed_vars.append( + VarDistributed( + origin_var, + slice_var, + is_slice, + block_id, + offset, + vtype, + endpoint, + ) + ) diff --git a/python/paddle/incubate/fleet/parameter_server/mode.py b/python/paddle/incubate/fleet/parameter_server/mode.py new file mode 100644 index 0000000000000000000000000000000000000000..623e919ba35a8aa135875a2b266062c07964e53d --- /dev/null +++ b/python/paddle/incubate/fleet/parameter_server/mode.py @@ -0,0 +1,29 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +class PSMode: + """ + There are various mode for fleet, each of them is designed for different model. + """ + + TRANSPILER = 1 + PSLIB = 2 + + +class DistributedMode: + SYNC = 0 + ASYNC = 1 + HALF_ASYNC = 2 + GEO = 3 diff --git a/python/paddle/incubate/fleet/parameter_server/pslib/__init__.py b/python/paddle/incubate/fleet/parameter_server/pslib/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..75bac7f94ea93411b2023bd2bf825018bdb7ddfe --- /dev/null +++ b/python/paddle/incubate/fleet/parameter_server/pslib/__init__.py @@ -0,0 +1,1273 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+"""Definition of PSLib."""
+
+import os
+import sys
+from .optimizer_factory import *  # noqa: F403
+from google.protobuf import text_format
+from paddle.framework import core
+
+from paddle.fluid.incubate.fleet.base.fleet_base import Fleet
+from paddle.fluid.incubate.fleet.base.mode import Mode
+from paddle.fluid.incubate.fleet.base.fleet_base import DistributedOptimizer
+from paddle.fluid.incubate.fleet.base.role_maker import MPISymetricRoleMaker
+from paddle.fluid.incubate.fleet.base.role_maker import HeterRoleMaker
+
+
+class PSLib(Fleet):
+    """PSLib class."""
+
+    def __init__(self):
+        super().__init__(Mode.PSLIB)
+        self._opt_info = None
+        self._local_ip = 0
+        self._fleet_ptr = None
+        self._main_programs = []
+        self._scopes = []
+        self._client2client_request_timeout_ms = 500000
+        self._client2client_connect_timeout_ms = 10000
+        self._client2client_max_retry = 3
+
+    def init(self, role_maker=None):
+        if role_maker is None:
+            role_maker = MPISymetricRoleMaker()
+        super().init(role_maker)
+        self._fleet_ptr = core.Fleet()
+        self._heter_ptr = None
+        if isinstance(role_maker, HeterRoleMaker):
+            self._heter_ptr = core.Heter()
+
+    def _set_client_communication_config(
+        self, request_timeout_ms, connect_timeout_ms, max_retry
+    ):
+        self._client2client_request_timeout_ms = request_timeout_ms
+        self._client2client_connect_timeout_ms = connect_timeout_ms
+        self._client2client_max_retry = max_retry
+
+    def set_pull_local_thread_num(self, thread_num):
+
self._fleet_ptr.set_pull_local_thread_num(thread_num) + + def init_worker(self): + """ + init_worker(): will be called by user. When a user knows current process is_server(), he/she + should call init_worker() to initialize global information about worker and connect + worker with pserver. You should run startup program before init_worker. + Args: + executor(Executor): The executor to run for init server. + programs(Program|None): The program that need to run. + """ + + if len(self._main_programs) == 0: + raise ValueError( + "You should run DistributedOptimizer.minimize() first" + ) + + if self._opt_info: + if "fleet_desc" in self._opt_info: + self._dist_desc_str = text_format.MessageToString( + self._opt_info["fleet_desc"] + ) + self._dist_desc = self._opt_info["fleet_desc"] + else: + raise Exception( + "You should run DistributedOptimizer.minimize() first" + ) + # barrier_all for init_server, wait for server starts + if isinstance(self._role_maker, HeterRoleMaker): + if self._role_maker.is_xpu(): + local_endpoint = self._role_maker.get_local_endpoint() + local_endpoint = local_endpoint.split(":") + self._heter_ptr.start_xpu_service( + str(local_endpoint[0]), int(local_endpoint[1]) + ) + self._role_maker._barrier_all() + self.all_ips_ = self._role_maker._all_gather(self._local_ip) + # worker_index * 2 is for compatible with older versions of pslib + self._fleet_ptr.init_worker( + self._dist_desc_str, + self.all_ips_, + self._role_maker._get_size(), + self._role_maker.worker_index() * 2, + ) + if isinstance(self._role_maker, HeterRoleMaker): + if self._role_maker.is_worker(): + self._heter_ptr.set_xpu_list( + self._role_maker._xpu_endpoints + ) + self._heter_ptr.create_client2xpu_connection() + # barrier_all for init_worker + self._role_maker._barrier_all() + # prepare for client to client communication + if self._role_maker.is_worker(): + info = self._fleet_ptr.get_clients_info() + print("Client Info: {}".format(info)) + all_info = 
self._role_maker._worker_gather(info[0]) + print("All Client Info: {}".format(all_info)) + self._fleet_ptr.gather_clients(all_info) + self._fleet_ptr.set_client2client_config( + self._client2client_request_timeout_ms, + self._client2client_connect_timeout_ms, + self._client2client_max_retry, + ) + self._fleet_ptr.create_client2client_connection() + # barrier for init model + self._role_maker._barrier_worker() + if self._role_maker.is_first_worker(): + tables = [] + for tp in self._dist_desc.trainer_param: + for i in tp.dense_table: + tables.append(i) + for prog, scope in zip(self._main_programs, self._scopes): + prog_id = str(id(prog)) + prog_conf = self._opt_info['program_configs'][prog_id] + prog_tables = {} + for key in prog_conf: + if "dense" not in key: + continue + for table_id in prog_conf[key]: + prog_tables[int(table_id)] = 0 + for table in tables: + if int(table.table_id) not in prog_tables: + continue + var_name_list = [] + for i in range(0, len(table.dense_variable_name)): + var_name = table.dense_variable_name[i] + if scope.find_var(var_name) is None: + raise ValueError( + "var " + + var_name + + " not found in scope, " + + "you should run startup program first" + ) + var_name_list.append(var_name) + if not self._opt_info["use_ps_gpu"]: + self._fleet_ptr.init_model( + scope, int(table.table_id), var_name_list + ) + # barrier for init model done + self._role_maker._barrier_worker() + else: + raise NameError( + "You should run DistributedOptimizer.minimize() first" + ) + + def init_server(self, model_dir=None, **kwargs): + """ + init_server() will be called by user. It will load model from model_dir. + Args: + model_dir(str): load model path, can be local or hdfs/afs path. + kwargs: user-defined attributes, currently support following: + model(int): load model mode. + 0 is for load whole model, + 1 is for load delta model (load diff), + default is 0. 
+ Example: + >>> fleet.init_server("/you/path/to/model", mode = 0) + """ + mode = kwargs.get("mode", 0) + if isinstance(self._role_maker, HeterRoleMaker): + self._role_maker._barrier_xpu() + if self._role_maker.is_first_xpu(): + self._fleet_ptr.load_model(model_dir, mode) + self._role_maker._barrier_xpu() + else: + self._role_maker._barrier_worker() + if self._role_maker.is_first_worker(): + self._fleet_ptr.load_model(model_dir, mode) + self._role_maker._barrier_worker() + + def run_server(self): + """ + init_pserver(): will be called by user. When a user knows current process is_worker(), he/she + should call init_pserver() to initialize global information about parameter server + """ + if self._opt_info: + if "fleet_desc" in self._opt_info: + self._dist_desc_str = text_format.MessageToString( + self._opt_info["fleet_desc"] + ) + self._dist_desc = self._opt_info["fleet_desc"] + else: + raise Exception( + "You should run DistributedOptimizer.minimize() first" + ) + # server_index * 2 is for compatible with older versions of pslib + self._fleet_ptr.init_server( + self._dist_desc_str, self._role_maker.server_index() * 2 + ) + if isinstance(self._role_maker, MPISymetricRoleMaker): + self._local_ip = self._fleet_ptr.run_server() + else: + local_endpoint = self._role_maker.get_local_endpoint() + local_endpoint = local_endpoint.split(":") + self._local_ip = self._fleet_ptr.run_server( + str(local_endpoint[0]), int(local_endpoint[1]) + ) + + # barrier_all for init_server + self._role_maker._barrier_all() + self.all_ips_ = self._role_maker._all_gather(self._local_ip) + + self._fleet_ptr.gather_servers( + self.all_ips_, self._role_maker._get_size() + ) + # barrier_all for init_worker, wait all workers start + self._role_maker._barrier_all() + else: + raise Exception( + "You should run DistributedOptimizer.minimize() first" + ) + + def end_pass(self, scope): + if self._role_maker.worker_index() < self._role_maker.xpu_num(): + self._heter_ptr.end_pass(scope, 
self._role_maker.worker_index()) + self._heter_ptr.stop_xpu_service(self._role_maker.worker_index()) + + def train_from_dataset( + self, + executor, + program=None, + dataset=None, + scope=None, + thread=0, + debug=False, + fetch_list=None, + fetch_info=None, + print_period=100, + fetch_handler=None, + ): + """ """ + + if self._role_maker.is_worker(): + self._role_maker._barrier_heter() + executor.train_from_dataset( + program, + dataset, + scope, + thread, + debug, + fetch_list, + fetch_info, + print_period, + fetch_handler, + ) + + def start_heter_trainer( + self, + executor, + program=None, + scope=None, + debug=False, + fetch_list=None, + fetch_info=None, + print_period=100, + fetch_handler=None, + ): + """ """ + + trainer_instance = executor.start_heter_trainer( + program, + scope, + debug, + fetch_list, + fetch_info, + print_period, + fetch_handler, + ) + if self._role_maker.is_xpu(): + print("barrier heter") + self._role_maker._barrier_heter() + print("barrier heter") + executor._default_executor.release_trainer(trainer_instance) + + def stop_worker(self): + """ + stop(): will be called after a user finishes his/her training task. Fleet instance will be + destroyed when stop() is called. + """ + self._role_maker._barrier_worker() + # all worker should be finalize first + if self._role_maker.is_worker(): + self._fleet_ptr.finalize_worker() + self._role_maker._barrier_worker() + if self._role_maker.is_first_worker(): + self._fleet_ptr.stop_server() + if self._heter_ptr: + self._heter_ptr.stop_xpu_service() + self._role_maker._barrier_worker() + self._role_maker._barrier_all() + self._role_maker._finalize() + + def distributed_optimizer(self, optimizer, strategy={}): + """ + distributed_optimizer + Args: + optimizer(Optimizer): optimizer + strategy(dict): strategy + Examples: + .. 
code-block:: python + fleet.distributed_optimizer(optimizer) + Returns: + optimizer(DownpourOptimizer): downpour optimizer + """ + self._optimizer = DownpourOptimizer(optimizer, strategy) + return self._optimizer + + def save_inference_model( + self, + executor, + dirname, + feeded_var_names=None, + target_vars=None, + main_program=None, + export_for_deployment=True, + ): + """ + save pserver model called from a worker + Args: + executor(Executor): fluid executor + dirname(str): save model path + feeded_var_names(list): default None + target_vars(list): default None + main_program(Program): default None + export_for_deployment(bool): default None + Examples: + .. code-block:: python + fleet.save_inference_model(dirname="hdfs:/my/path") + """ + self._fleet_ptr.save_model(dirname, 0) + + def print_table_stat(self, table_id): + """ + print stat info of table_id, + format: tableid, feasign size, mf size + Args: + table_id(int): the id of table + Example: + .. code-block:: python + fleet.print_table_stat(0) + """ + self._role_maker._barrier_worker() + if self._role_maker.is_first_worker(): + self._fleet_ptr.print_table_stat(table_id) + self._role_maker._barrier_worker() + + def set_file_num_one_shard(self, table_id, file_num): + """ + set file_num in one shard + Args: + table_id(int): the id of table + file_num(int): file num in one shard + Example: + .. code-block:: python + fleet.set_file_num_one_shard(0, 5) + """ + self._role_maker._barrier_worker() + if self._role_maker.is_first_worker(): + self._fleet_ptr.set_file_num_one_shard(table_id, file_num) + self._role_maker._barrier_worker() + + def save_persistables(self, executor, dirname, main_program=None, **kwargs): + """ + save presistable parameters, + when using fleet, it will save sparse and dense feature + Args: + executor(Executor): fluid executor + dirname(str): save path. 
It can be hdfs/afs path or local path + main_program(Program): fluid program, default None + kwargs: use define property, current support following + mode(int): 0 means save all pserver model, + 1 means save delta pserver model (save diff), + 2 means save xbox base, + 3 means save batch model. + Example: + .. code-block:: python + fleet.save_persistables(dirname="/you/path/to/model", mode = 0) + """ + mode = kwargs.get("mode", 0) + self._fleet_ptr.client_flush() + self._role_maker._barrier_worker() + if self._role_maker.is_first_worker(): + self._fleet_ptr.save_model(dirname, mode) + self._role_maker._barrier_worker() + + def save_model_with_whitelist( + self, executor, dirname, whitelist_path, main_program=None, **kwargs + ): + """ + save whitelist, mode is consistent with fleet.save_persistables, + when using fleet, it will save sparse and dense feature + + Args: + executor(Executor): fluid executor + dirname(str): save path. It can be hdfs/afs path or local path + main_program(Program): fluid program, default None + kwargs: use define property, current support following + mode(int): 0 means save all pserver model, + 1 means save delta pserver model (save diff), + 2 means save xbox base, + 3 means save batch model. + + Example: + .. code-block:: python + + fleet.save_persistables(dirname="/you/path/to/model", mode = 0) + + """ + mode = kwargs.get("mode", 0) + table_id = kwargs.get("table_id", 0) + self._fleet_ptr.client_flush() + self._role_maker._barrier_worker() + if self._role_maker.is_first_worker(): + self._fleet_ptr.save_model_with_whitelist( + table_id, dirname, mode, whitelist_path + ) + self._role_maker._barrier_worker() + + def save_multi_table_one_path(self, table_ids, model_dir, **kwargs): + """ + save pslib multi sparse table in one path. + Args: + table_ids(list): table ids + model_dir(str): if you use hdfs, model_dir should starts with + 'hdfs:', otherwise means local dir + kwargs(dict): user-defined properties. 
+ mode(int): the modes illustrated above, default 0 + prefix(str): the parts to save can have prefix, + for example, part-prefix-000-00000 + Examples: + .. code-block:: python + fleet.save_multi_table_one_path("[0, 1]", "afs:/user/path/") + """ + mode = kwargs.get("mode", 0) + self._role_maker._barrier_worker() + if self._role_maker.is_first_worker(): + self._fleet_ptr.save_multi_table_one_path( + table_ids, model_dir, mode + ) + self._role_maker._barrier_worker() + + def save_cache_model(self, executor, dirname, main_program=None, **kwargs): + """ + save sparse cache table, + when using fleet, it will save sparse cache table + Args: + executor(Executor): fluid executor + dirname(str): save path. It can be hdfs/afs path or local path + main_program(Program): fluid program, default None + kwargs: use define property, current support following + mode(int): define for feature extension in the future, + currently no use, will pass a default value 0 + table_id(int): which table to save cache, default is 0 + Returns: + feasign_num(int): cache feasign num + Example: + .. 
code-block:: python + fleet.save_cache_model(None, dirname="/you/path/to/model", mode = 0) + """ + mode = kwargs.get("mode", 0) + table_id = kwargs.get("table_id", 0) + self._fleet_ptr.client_flush() + self._role_maker._barrier_worker() + cache_threshold = 0.0 + + if self._role_maker.is_first_worker(): + cache_threshold = self._fleet_ptr.get_cache_threshold(table_id) + # check cache threshold right or not + self._role_maker._barrier_worker() + + if self._role_maker.is_first_worker(): + self._fleet_ptr.cache_shuffle( + table_id, dirname, mode, cache_threshold + ) + + self._role_maker._barrier_worker() + + feasign_num = -1 + if self._role_maker.is_first_worker(): + feasign_num = self._fleet_ptr.save_cache(table_id, dirname, mode) + + self._role_maker._barrier_worker() + return feasign_num + + def shrink_sparse_table(self): + """ + shrink cvm of all sparse embedding in pserver, the decay rate + is defined as "show_click_decay_rate" in fleet_desc.prototxt + Example: + >>> fleet.shrink_sparse_table() + """ + self._role_maker._barrier_worker() + if self._role_maker.is_first_worker(): + tables = [] + for tp in self._opt_info["fleet_desc"].trainer_param: + for i in tp.sparse_table: + tables.append(i.table_id) + for i in list(set(tables)): + self._fleet_ptr.shrink_sparse_table(i) + self._role_maker._barrier_worker() + + def shrink_dense_table(self, decay, emb_dim=11, scope=None, table_id=None): + """ + shrink batch_sum in pserver by multiplying by decay + Args: + decay(float): the decay rate, usually range in (0, 1) + emb_dim(int): one element's length in datanorm layer + scope(Scope): Scope object, default is fluid.global_scope() + table_id(int): table id of shrinking dense table. None means shrink all, + you should specify it when using multiple scopes, + default is None. 
+ Example: + >>> fleet.shrink_dense_table(0.98, 11, myscope1, 1) + >>> fleet.shrink_dense_table(0.98, 11, myscope1, 2) + >>> fleet.shrink_dense_table(0.98, 11, myscope2, 3) + """ + if scope is None: + scope = fluid.global_scope() + self._role_maker._barrier_worker() + if self._role_maker.is_first_worker(): + for tp in self._opt_info["fleet_desc"].trainer_param: + for i in tp.dense_table: + if table_id is not None and table_id != i.table_id: + continue + var_list = [var for var in i.dense_variable_name] + skip = False + for var in var_list: + if scope.find_var(var) is None: + skip = True + break + if skip: + continue + self._fleet_ptr.shrink_dense_table( + i.table_id, scope, var_list, decay, emb_dim + ) + self._role_maker._barrier_worker() + + def clear_one_table(self, table_id): + """ + clear_one_table() will be called by user. It will clear one table. + Args: + table_id(int): table id + Examples: + .. code-block:: python + fleet.clear_one_table(0) + """ + self._role_maker._barrier_worker() + if self._role_maker.is_first_worker(): + self._fleet_ptr.clear_one_table(table_id) + self._role_maker._barrier_worker() + + def clear_model(self): + """ + clear_model() will be called by user. It will clear sparse model. + Examples: + .. code-block:: python + fleet.clear_model() + """ + self._role_maker._barrier_worker() + if self._role_maker.is_first_worker(): + self._fleet_ptr.clear_model() + self._role_maker._barrier_worker() + + def load_pslib_whitelist(self, table_id, model_path, **kwargs): + """ + load pslib model for one table with whitelist + + Args: + table_id(int): load table id + model_path(str): load model path, can be local or hdfs/afs path + kwargs(dict): user defined params, currently support following: + only for load pslib model for one table: + mode(int): load model mode. 0 is for load whole model, 1 is + for load delta model (load diff), default is 0. 
+ only for load params from paddle model: + scope(Scope): Scope object + model_proto_file(str): path of program desc proto binary + file, can be local or hdfs/afs file + var_names(list): var name list + load_combine(bool): load from a file or split param files + default False. + + Examples: + .. code-block:: python + + # load pslib model for one table + fleet.load_one_table(0, "hdfs:/my_fleet_model/20190714/0/") + fleet.load_one_table(1, "hdfs:/xx/xxx", mode = 0) + + # load params from paddle model + fleet.load_one_table(2, "hdfs:/my_paddle_model/", + scope = my_scope, + model_proto_file = "./my_program.bin", + load_combine = False) + + # below is how to save proto binary file + with open("my_program.bin", "wb") as fout: + my_program = fluid.default_main_program() + fout.write(my_program.desc.serialize_to_string()) + + """ + self._role_maker._barrier_worker() + mode = kwargs.get("mode", 0) + if self._role_maker.is_first_worker(): + self._fleet_ptr.load_table_with_whitelist( + table_id, model_path, mode + ) + self._role_maker._barrier_worker() + + def load_one_table(self, table_id, model_path, **kwargs): + """ + load pslib model for one table or load params from paddle model + Args: + table_id(int): load table id + model_path(str): load model path, can be local or hdfs/afs path + kwargs(dict): user defined params, currently support following: + only for load pslib model for one table: + mode(int): load model mode. 0 is for load whole model, 1 is + for load delta model (load diff), default is 0. + only for load params from paddle model: + scope(Scope): Scope object + model_proto_file(str): path of program desc proto binary + file, can be local or hdfs/afs file + var_names(list): var name list + load_combine(bool): load from a file or split param files + default False. + Examples: + .. 
code-block:: python + # load pslib model for one table + fleet.load_one_table(0, "hdfs:/my_fleet_model/20190714/0/") + fleet.load_one_table(1, "hdfs:/xx/xxx", mode = 0) + # load params from paddle model + fleet.load_one_table(2, "hdfs:/my_paddle_model/", + scope = my_scope, + model_proto_file = "./my_program.bin", + load_combine = False) + # below is how to save proto binary file + with open("my_program.bin", "wb") as fout: + my_program = fluid.default_main_program() + fout.write(my_program.desc.serialize_to_string()) + """ + self._role_maker._barrier_worker() + mode = kwargs.get("mode", 0) + scope = kwargs.get("scope", None) + model_proto_file = kwargs.get("model_proto_file", None) + var_names = kwargs.get("var_names", None) + load_combine = kwargs.get("load_combine", False) + self._role_maker._barrier_worker() + if scope is not None and model_proto_file is not None: + self._load_one_table_from_paddle_model( + scope, + table_id, + model_path, + model_proto_file, + var_names, + load_combine, + ) + elif self._role_maker.is_first_worker(): + self._fleet_ptr.load_model_one_table(table_id, model_path, mode) + self._role_maker._barrier_worker() + + def _load_one_table_from_paddle_model( + self, + scope, + table_id, + model_path, + model_proto_file, + var_names=None, + load_combine=False, + ): + """ + load params from paddle model, and push params to pserver + Args: + scope(Scope): Scope object + table_id(int): the id of table to load + model_path(str): path of paddle model, can be local or hdfs/afs file + model_proto_file(str): path of program desc proto binary file, + can be local or hdfs/afs file + var_names(list): load var names + load_combine(bool): load from a file or split param files + """ + self._role_maker._barrier_worker() + if self._role_maker.is_first_worker(): + # get fs config from fleet_desc + fs_name = self._opt_info["fleet_desc"].fs_client_param.uri + fs_ugi = ( + self._opt_info["fleet_desc"].fs_client_param.user + + "," + + 
self._opt_info["fleet_desc"].fs_client_param.passwd + ) + hadoop_bin = self._opt_info["fleet_desc"].fs_client_param.hadoop_bin + # download model_path if it's hdfs/afs + if model_path.startswith("hdfs:") or model_path.startswith("afs:"): + dest = "./model_for_load_table_%s" % table_id + cmd = ( + hadoop_bin + + " fs -D fs.default.name=" + + fs_name + + " -D hadoop.job.ugi=" + + fs_ugi + + " -get " + + model_path + + " " + + dest + ) + ret = os.system(cmd) + if ret != 0: + raise RuntimeError("download model failed") + model_path = dest + # download model_proto_file if it's hdfs/afs + if model_proto_file.startswith( + "hdfs:" + ) or model_proto_file.startswith("afs:"): + dest = "./model_proto_file_for_load_table_%s" % table_id + cmd = ( + hadoop_bin + + " fs -D fs.default.name=" + + fs_name + + " -D hadoop.job.ugi=" + + fs_ugi + + " -get " + + model_proto_file + + " " + + dest + ) + ret = os.system(cmd) + if ret != 0: + raise RuntimeError("download model proto file failed") + model_proto_file = dest + for tp in self._opt_info["fleet_desc"].trainer_param: + for i in tp.dense_table: + if table_id is not None and table_id != i.table_id: + continue + table_var_names = [var for var in i.dense_variable_name] + skip = False + for var in table_var_names: + if scope.find_var(var) is None: + skip = True + break + if skip: + continue + self._fleet_ptr.load_from_paddle_model( + scope, + table_id, + var_names, + model_path, + model_proto_file, + table_var_names, + load_combine, + ) + self._role_maker._barrier_worker() + + def confirm(self): + """ + confirm all the updated params in current pass + """ + self._role_maker._barrier_worker() + if self._role_maker.is_first_worker(): + self._fleet_ptr.confirm() + self._role_maker._barrier_worker() + + def revert(self): + """ + revert all the updated params in current pass + """ + self._role_maker._barrier_worker() + if self._role_maker.is_first_worker(): + self._fleet_ptr.revert() + self._role_maker._barrier_worker() + + def 
load_model(self, model_dir=None, **kwargs): + """ + load pslib model, there are at least 4 modes, these modes are the same + in load one table/save model/save one table: + 0: load checkpoint model + 1: load delta model (delta means diff, it's usually for online predict) + 2: load base model (base model filters some feasigns in checkpoint, it's + usually for online predict) + 3: load batch model (do some statistic works in checkpoint, such as + calculate unseen days of each feasign) + Args: + model_dir(str): if you use hdfs, model_dir should starts with + 'hdfs:', otherwise means local dir + kwargs(dict): user-defined properties. + mode(int): the modes illustrated above, default 0 + Examples: + .. code-block:: python + fleet.load_model("afs:/user/path/") + """ + mode = kwargs.get("mode", 0) + self._role_maker._barrier_worker() + if self._role_maker.is_first_worker(): + self._fleet_ptr.load_model(model_dir, mode) + self._role_maker._barrier_worker() + + def save_model(self, model_dir=None, **kwargs): + """ + save pslib model, the modes are same with load model. + Args: + model_dir(str): if you use hdfs, model_dir should starts with + 'hdfs:', otherwise means local dir + kwargs(dict): user-defined properties. + mode(int): the modes illustrated above, default 0 + Examples: + .. code-block:: python + fleet.save_model("afs:/user/path/") + """ + mode = kwargs.get("mode", 0) + prefix = kwargs.get("prefix", None) + self._role_maker._barrier_worker() + if self._role_maker.is_first_worker(): + self._fleet_ptr.save_model(model_dir, mode) + self._role_maker._barrier_worker() + + def save_one_table(self, table_id, model_dir, **kwargs): + """ + save pslib model's one table, the modes are same with load model. + Args: + table_id(int): table id + model_dir(str): if you use hdfs, model_dir should starts with + 'hdfs:', otherwise means local dir + kwargs(dict): user-defined properties. 
+ mode(int): the modes illustrated above, default 0 + prefix(str): the parts to save can have prefix, + for example, part-prefix-000-00000 + Examples: + .. code-block:: python + fleet.save_one_table("afs:/user/path/") + """ + mode = kwargs.get("mode", 0) + prefix = kwargs.get("prefix", None) + self._role_maker._barrier_worker() + if self._role_maker.is_first_worker(): + if prefix is not None: + self._fleet_ptr.save_model_one_table_with_prefix( + table_id, model_dir, mode, prefix + ) + else: + self._fleet_ptr.save_model_one_table(table_id, model_dir, mode) + self._role_maker._barrier_worker() + + def set_date(self, table_id, date): + """ + set_date, eg, 20210918 + """ + self._role_maker._barrier_worker() + if self._role_maker.is_first_worker(): + self._fleet_ptr.set_date(table_id, str(date)) + self._role_maker._barrier_worker() + + def _set_opt_info(self, opt_info): + """ + this function saves the result from DistributedOptimizer.minimize() + """ + self._opt_info = opt_info + + +fleet = PSLib() + + +def _prepare_params( + input, + size, + is_sparse=False, + is_distributed=False, + padding_idx=None, + param_attr=None, + dtype='float32', +): + """ + preprocess params, this interface is not for users. 
+ Args: + input(Variable|list of Variable): Input is a Tensor Variable + size(list of int): the embedding dim + is_sparse(bool): whether input is sparse ids + is_distributed(bool): whether in distributed mode + padding_idx(int): padding idx of input + param_attr(ParamAttr): To specify the weight parameter property + dtype(str): data type of output + """ + if param_attr is None: + raise ValueError("param_attr must be set") + name = param_attr.name + if name is None: + raise ValueError("embedding name must be set") + if not isinstance(size, list) and not isinstance(size, tuple): + raise ValueError("embedding size must be list or tuple") + size = size[-1] + global FLEET_GLOBAL_DICT + FLEET_GLOBAL_DICT["enable"] = True + d_table = FLEET_GLOBAL_DICT["emb_to_table"] + d_accessor = FLEET_GLOBAL_DICT["emb_to_accessor"] + d_size = FLEET_GLOBAL_DICT["emb_to_size"] + + # check embedding size + if d_size.get(name) is None: + d_size[name] = size + elif d_size[name] != size: + raise ValueError( + "embedding size error: %s vs %s" % (size, d_size[name]) + ) + + # check embedding accessor + accessor = FLEET_GLOBAL_DICT["cur_accessor"] + if d_accessor.get(name) is None: + d_accessor[name] = accessor + elif d_accessor[name] != accessor: + raise ValueError( + "embedding size error: %s vs %s" % (d_accessor[name], accessor) + ) + + # check embedding table id + if d_table.get(name) is None: + d_table[name] = FLEET_GLOBAL_DICT["cur_sparse_id"] + FLEET_GLOBAL_DICT["cur_sparse_id"] += 1 + + # check other params + if not is_sparse: + raise ValueError("is_sparse must be True") + elif not is_distributed: + raise ValueError("is_distributed must be True") + elif dtype != "float32": + raise ValueError("dtype must be float32") + + +def _fleet_embedding( + input, + size, + is_sparse=False, + is_distributed=False, + padding_idx=None, + param_attr=None, + dtype='float32', +): + """ + add fleet embedding, this interface is not for users. 
+ Args: + input(Variable|list of Variable): Input is a Tensor Variable + size(list of int): the embedding dim + is_sparse(bool): whether input is sparse ids + is_distributed(bool): whether in distributed mode + padding_idx(int): padding idx of input + param_attr(ParamAttr): To specify the weight parameter property + dtype(str): data type of output + """ + # check and set params + _prepare_params( + input, size, is_sparse, is_distributed, padding_idx, param_attr, dtype + ) + name = param_attr.name + size = size[-1] + if padding_idx is None: + padding_idx = 0 + global FLEET_GLOBAL_DICT + return fluid.layers.nn._pull_sparse( + input=input, + size=size, + table_id=FLEET_GLOBAL_DICT["emb_to_table"][name], + accessor_class=FLEET_GLOBAL_DICT["emb_to_accessor"][name], + name=name, + ctr_label_name=FLEET_GLOBAL_DICT["click_name"], + padding_id=padding_idx, + dtype=dtype, + scale_sparse_grad=FLEET_GLOBAL_DICT["scale_sparse_grad"], + ) + + +def _fleet_embedding_v2( + input, + size, + is_sparse=False, + is_distributed=False, + padding_idx=None, + param_attr=None, + dtype='float32', +): + """ + add fleet embedding v2, this interface is not for users. 
+ Args: + input(Variable|list of Variable): Input is a Tensor Variable + size(list of int): the embedding dim + is_sparse(bool): whether input is sparse ids + is_distributed(bool): whether in distributed mode + padding_idx(int): padding idx of input + param_attr(ParamAttr): To specify the weight parameter property + dtype(str): data type of output + """ + # check and set params + _prepare_params( + input, size, is_sparse, is_distributed, padding_idx, param_attr, dtype + ) + name = param_attr.name + size = size[-1] + if padding_idx is None: + padding_idx = 0 + + return fluid.layers.nn._pull_sparse_v2( + input=input, + size=size, + table_id=FLEET_GLOBAL_DICT["emb_to_table"][name], + accessor_class=FLEET_GLOBAL_DICT["emb_to_accessor"][name], + name=name, + ctr_label_name=FLEET_GLOBAL_DICT["click_name"], + padding_id=padding_idx, + dtype=dtype, + scale_sparse_grad=FLEET_GLOBAL_DICT["scale_sparse_grad"], + ) + + +class fleet_embedding: + """ + fleet embedding class, it is used as a wrapper + Example: + .. 
code-block:: python + with fleet_embedding(click_name=label.name): + emb = fluid.layers.embedding( + input=var, + size=[-1, 11], + is_sparse=True, + is_distributed=True, + param_attr=fluid.ParamAttr(name="embedding")) + """ + + def __init__(self, click_name, scale_sparse_grad=True): + """Init.""" + self.origin_emb = fluid.layers.embedding + self.origin_emb_v2 = fluid.embedding + # if user uses cvm layer after embedding, click_name can be None + self.click_name = "" if click_name is None else click_name + self.scale_sparse_grad = scale_sparse_grad + # it's default value, will be modified in minimize + self.accessor = "DownpourCtrAccessor" + + def __enter__(self): + """Enter.""" + fluid.layers.embedding = _fleet_embedding + fluid.embedding = _fleet_embedding_v2 + FLEET_GLOBAL_DICT["cur_accessor"] = self.accessor + FLEET_GLOBAL_DICT["click_name"] = self.click_name + FLEET_GLOBAL_DICT["scale_sparse_grad"] = self.scale_sparse_grad + + def __exit__(self, exc_type, exc_val, exc_tb): + """Exit.""" + fluid.layers.embedding = self.origin_emb + fluid.embedding = self.origin_emb_v2 + FLEET_GLOBAL_DICT["cur_accessor"] = "" + FLEET_GLOBAL_DICT["click_name"] = "" + FLEET_GLOBAL_DICT["scale_sparse_grad"] = None + + +class DownpourOptimizer(DistributedOptimizer): + """ + DistributedOptimizer is a wrapper for paddle.fluid.optimizer + A user should pass a paddle.fluid.optimizer to DistributedOptimizer + minimize() function is implemented. + DistributedOptimizer is the starting point for a user who wants to + run distributed training. The optimized information will be stored in + Fleet() instance who holds the global information about current distributed + training. + Args: + optimizer(Optimizer): subclass of Optimizer. + strategy(any): config for DownpourOptimizer. 
+    Returns:
+        None
+    """
+
+    def __init__(self, optimizer, strategy=None):
+        super().__init__(optimizer, strategy)
+
+        self._optimizer = optimizer
+        self._optimizer_name = "Distributed%s" % optimizer.type.capitalize()
+        if optimizer.type != "adam":
+            print(
+                "Currently, distributed optimizer only support Adam"
+                "Will config built-in adam for you."
+                "We will support more functions in DistributedOptimizer",
+                sys.stderr,
+            )
+            self._optimizer_name = "DistributedAdam"
+
+        self._distributed_optimizer = globals()[self._optimizer_name](optimizer)
+
+    def backward(
+        self,
+        loss,
+        startup_program=None,
+        parameter_list=None,
+        no_grad_set=None,
+        callbacks=None,
+    ):
+        """
+        Currently, backward function can not be called through DistributedOptimizer
+        """
+        raise NotImplementedError()
+
+    def _remove_collective_ops(self, program, name):
+        """
+        The collective init op should only be called once, so remove the extra calls.
+        """
+        block = program.global_block()
+        for ids, op in list(enumerate(block.ops)):
+            if op.type == name:
+                block._remove_op(ids)
+                return
+
+    def apply_gradients(self, params_grads):
+        """
+        Currently, apply_gradients function can not be called through DistributedOptimizer
+        """
+        raise NotImplementedError()
+
+    def get_dist_env(self):
+        trainer_id = int(os.getenv('PADDLE_TRAINER_ID', '0'))
+        trainer_endpoints = ''
+        current_endpoint = ''
+        num_trainers = 0
+        if os.getenv('PADDLE_TRAINER_ENDPOINTS') and os.getenv(
+            'PADDLE_CURRENT_ENDPOINT'
+        ):
+            trainer_endpoints = os.getenv('PADDLE_TRAINER_ENDPOINTS')
+            current_endpoint = os.getenv('PADDLE_CURRENT_ENDPOINT')
+            num_trainers = len(trainer_endpoints.split(','))
+
+        return {
+            'trainer_id': trainer_id,
+            'num_trainers': num_trainers,
+            'current_endpoint': current_endpoint,
+            'trainer_endpoints': trainer_endpoints,
+        }
+
+    def _remove_collective_op_for_embedding(self, loss, table_name):
+        """
+        find multi-sparse-table
+        """
+        table_name = [name + "@GRAD" for name in table_name]
+        need_remove_op_index = []
+        block = 
loss.block.program.global_block()
+        collective_ops = ["c_sync_calc_stream", "c_allreduce_sum"]
+        for ids, op in list(enumerate(block.ops)):
+            if op.type in collective_ops:
+                if op.input("X")[0] in table_name:
+                    need_remove_op_index.append(ids)
+            if op.type == "lookup_table_grad":
+                need_remove_op_index.append(ids)
+            try:
+                if op.output("Out")[0] in table_name:
+                    need_remove_op_index.append(ids)
+            except:
+                pass
+
+        need_remove_op_index.sort(reverse=True)
+        for index in need_remove_op_index:
+            block._remove_op(index)
+
+    def minimize(
+        self,
+        losses,
+        scopes=None,
+        startup_programs=None,
+        parameter_list=None,
+        no_grad_set=None,
+        program_mode="all_reduce",
+    ):
+        """
+        minimize a program through loss, loss can be a list in DistributedOptimizer.
+        Note that in parameter server mode, a worker will not get anything about optimize_ops
+        Because optimizer algorithms run on pserver side. We will make this usable in pserver
+        process, but currently the optimization part is written into Fleet(). A user does not
+        need to care about how to start up a pserver node.
+        Args:
+            losses (Variable|Variable List): loss variable or loss variable list to run optimization.
+            scopes (Scope| Scope List): scope instance.
+            startup_programs (Program|Program List): startup_program for initializing parameters
+                in `parameter_list`.
+            parameter_list (list): list of Variables to update.
+            no_grad_set (set|None): set of Variables should be ignored.
+            program_mode (str|"all_reduce"): grad action for program when use_ps_gpu.
+        Returns:
+            tuple: (optimize_ops, params_grads) which are, list of operators appended;
+            and list of (param, grad) Variables pair for optimization. 
+ """ + + if not isinstance(losses, list): + losses = [losses] + + ( + optimize_ops, + param_grads, + opt_info, + ) = self._distributed_optimizer._minimize( + losses, + startup_programs, + parameter_list, + no_grad_set, + self._strategy, + ) + opt_info["mpi_rank"] = fleet.worker_index() + opt_info["mpi_size"] = fleet.worker_num() + fleet._set_opt_info(opt_info) + + programs = [loss.block.program for loss in losses] + + if scopes is None: + scopes = [fluid.global_scope()] * len(programs) + + if len(scopes) != len(programs): + raise ValueError( + "You should make sure len(scopes) == len(programs) or set scopes None" + ) + + fleet._main_programs = programs + fleet._scopes = scopes + if opt_info["use_ps_gpu"]: + from paddle.fluid.transpiler.collective import MultiThread + + # check start program + if program_mode not in [ + "all_reduce", + "fuse_all_reduce", + "all_gather", + "all_reduce_xpu", + ]: + raise ValueError( + "You should set program_mode in [ all_reduce, \ + fuse_all_reduce, all_gather, all_reduce_xpu ]" + ) + env = self.get_dist_env() + if not isinstance(losses, list): + startup_programs = [startup_programs] + for i in range(0, len(startup_programs)): + + t = MultiThread(trans_mode=program_mode) + start_program = startup_programs[i] + main_program = programs[i] + t.transpile( + startup_program=start_program, + main_program=main_program, + rank=env["trainer_id"], + endpoints=env["trainer_endpoints"], + current_endpoint=env['current_endpoint'], + wait_port=False, + ) + if i > 0: + self._remove_collective_ops( + start_program, "c_comm_init_all" + ) + for i in range(0, len(losses)): + loss = losses[i] + embedding_table = self._distributed_optimizer._find_multi_distributed_lookup_table( + [loss] + ) + self._remove_collective_op_for_embedding(loss, embedding_table) + + return [optimize_ops, param_grads] diff --git a/python/paddle/incubate/fleet/parameter_server/pslib/node.py b/python/paddle/incubate/fleet/parameter_server/pslib/node.py new file mode 100644 index 
0000000000000000000000000000000000000000..94df56f4efe2e8715518437ef93edf86bb637157
--- /dev/null
+++ b/python/paddle/incubate/fleet/parameter_server/pslib/node.py
@@ -0,0 +1,803 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+"""Definition of Server and Worker."""
+
+# NOTE: in python3, reduce was moved into functools
+from functools import reduce
+
+from . import ps_pb2 as pslib
+
+
+class Server:
+    """
+    A Server basic class
+    it's a base class, does not have implementation
+    """
+
+    def __init__(self):
+        pass
+
+
+class Worker:
+    """
+    A Worker basic class. 
+ it's a base class, does not have implementation + """ + + def __init__(self): + pass + + +class DownpourServer(Server): + """ + DownpourServer class is used to generate server program_desc + Args: + server: it is pslib.ServerParameter() + Examples: + server = DownpourServer() + """ + + def __init__(self): + self._server = pslib.ServerParameter() + self._server.downpour_server_param.service_param.server_class = ( + "DownpourBrpcPsServer" + ) + self._server.downpour_server_param.service_param.client_class = ( + "DownpourBrpcPsClient" + ) + self._server.downpour_server_param.service_param.service_class = ( + "DownpourPsService" + ) + self._server.downpour_server_param.service_param.start_server_port = 0 + self._server.downpour_server_param.service_param.server_thread_num = 12 + + def add_sparse_table(self, table_id, strategy): + """ + Args: + table_id(int): id of sparse params table + strategy(dict): the config dict. + Returns: + return None + """ + + for table in self._server.downpour_server_param.downpour_table_param: + if table.table_id == table_id: + if table.type == pslib.PS_SPARSE_TABLE: + return + else: + raise ValueError( + "expect table %s type=%s, but actual type=%s" + % (table_id, pslib.PS_SPARSE_TABLE, table.type) + ) + if strategy is None: + strategy = dict() + table = self._server.downpour_server_param.downpour_table_param.add() + table.table_id = table_id + table.type = pslib.PS_SPARSE_TABLE + + support_sparse_key_list = [ + 'sparse_table_class', + 'sparse_compress_in_save', + 'sparse_shard_num', + 'sparse_accessor_class', + 'sparse_learning_rate', + 'sparse_initial_g2sum', + 'sparse_initial_range', + 'sparse_weight_bounds', + 'sparse_embedx_dim', + 'sparse_embedx_threshold', + 'sparse_nonclk_coeff', + 'sparse_click_coeff', + 'sparse_base_threshold', + 'sparse_delta_threshold', + 'sparse_delta_keep_days', + 'sparse_delete_after_unseen_days', + 'sparse_show_click_decay_rate', + 'sparse_delete_threshold', + 'sparse_converter', + 'sparse_deconverter', + 
'sparse_enable_cache', + 'sparse_cache_rate', + 'sparse_cache_file_num', + 'sparse_beta1_decay_rate', + 'sparse_beta2_decay_rate', + 'sparse_ada_epsilon', + 'sparse_optimizer', + 'sparse_ssd_unseenday_threshold', + 'embed_sparse_optimizer', + 'embed_sparse_learning_rate', + 'embed_sparse_weight_bounds', + 'embed_sparse_initial_range', + 'embed_sparse_initial_g2sum', + 'embed_sparse_beta1_decay_rate', + 'embed_sparse_beta2_decay_rate', + 'embedx_sparse_optimizer', + 'embedx_sparse_learning_rate', + 'embedx_sparse_weight_bounds', + 'embedx_sparse_initial_range', + 'embedx_sparse_initial_g2sum', + 'embedx_sparse_beta1_decay_rate', + 'embedx_sparse_beta2_decay_rate', + ] + + for key in strategy: + if key not in support_sparse_key_list: + raise ValueError("strategy key '%s' not support" % (key)) + + support_table_calss = ['DownpourSparseTable', 'DownpourSparseSSDTable'] + if strategy.get('sparse_table_class') is not None: + table_class = strategy.get('sparse_table_class') + if table_class not in support_table_calss: + raise ValueError( + "support sparse_table_class: [ 'DownpourSparseTable', 'DownpourSparseSSDTable'], \ + but actual %s" + % (table_class) + ) + else: + table_class = 'DownpourSparseTable' + + table.table_class = table_class + + if ( + table_class == 'DownpourSparseTable' + or table_class == 'DownpourSparseSSDTable' + ): + table.enable_sparse_table_cache = strategy.get( + 'sparse_enable_cache', True + ) + table.sparse_table_cache_rate = strategy.get( + 'sparse_cache_rate', 0.00055 + ) + table.sparse_table_cache_file_num = strategy.get( + 'sparse_cache_file_num', 16 + ) + table.compress_in_save = strategy.get( + 'sparse_compress_in_save', True + ) + table.shard_num = strategy.get('sparse_shard_num', 1000) + # DownpourFeatureValueAccessor: for ctr task, has cvm, embedding and sgd info + # DownpourCtrAccessor : for ctr task, has cvm, slot, embedding and sgd info + # DownpourSparseValueAccessor : for general task, has embedding and sgd info + # 
DownpourCtrDoubleAccessor : for ctr task, which show clk are in double + # DownpourUnitAccessor : for ctr task, has cvm, slot, embedding and sgd info + + support_accessor_class = [ + 'DownpourFeatureValueAccessor', + 'DownpourCtrAccessor', + 'DownpourCtrDymfAccessor', + 'DownpourSparseValueAccessor', + 'DownpourCtrDoubleAccessor', + 'DownpourUnitAccessor', + 'DownpourDoubleUnitAccessor', + ] + if strategy.get('sparse_accessor_class') is not None: + accessor_class = strategy.get('sparse_accessor_class') + if accessor_class not in support_accessor_class: + raise ValueError( + "support sparse_accessor_class: ['DownpourFeatureValueAccessor', 'DownpourCtrAccessor', 'DownpourCtrDymfAccessor', \ + 'DownpourSparseValueAccessor', 'DownpourCtrDoubleAccessor'], \ + but actual %s" + % (accessor_class) + ) + else: + accessor_class = 'DownpourCtrAccessor' + + table.accessor.accessor_class = accessor_class + + if ( + accessor_class == 'DownpourFeatureValueAccessor' + or accessor_class == 'DownpourCtrAccessor' + or accessor_class == 'DownpourCtrDymfAccessor' + or accessor_class == 'DownpourCtrDoubleAccessor' + ): + table.accessor.sparse_sgd_param.learning_rate = strategy.get( + 'sparse_learning_rate', 0.05 + ) + table.accessor.sparse_sgd_param.initial_g2sum = strategy.get( + 'sparse_initial_g2sum', 3 + ) + table.accessor.sparse_sgd_param.initial_range = strategy.get( + 'sparse_initial_range', 1e-4 + ) + if strategy.get('sparse_weight_bounds') is None: + table.accessor.sparse_sgd_param.weight_bounds.extend( + [-10, 10] + ) + else: + table.accessor.sparse_sgd_param.weight_bounds.extend( + strategy.get('sparse_weight_bounds') + ) + table.accessor.embedx_dim = strategy.get('sparse_embedx_dim', 8) + table.accessor.embedx_threshold = strategy.get( + 'sparse_embedx_threshold', 10 + ) + table.accessor.fea_dim = int(table.accessor.embedx_dim) + 3 + table.accessor.downpour_accessor_param.nonclk_coeff = ( + strategy.get('sparse_nonclk_coeff', 0.1) + ) + 
table.accessor.downpour_accessor_param.click_coeff = ( + strategy.get('sparse_click_coeff', 1) + ) + table.accessor.downpour_accessor_param.base_threshold = ( + strategy.get('sparse_base_threshold', 1.5) + ) + table.accessor.downpour_accessor_param.delta_threshold = ( + strategy.get('sparse_delta_threshold', 0.25) + ) + table.accessor.downpour_accessor_param.delta_keep_days = ( + strategy.get('sparse_delta_keep_days', 16) + ) + table.accessor.downpour_accessor_param.delete_after_unseen_days = strategy.get( + 'sparse_delete_after_unseen_days', 30 + ) + table.accessor.downpour_accessor_param.ssd_unseenday_threshold = strategy.get( + 'sparse_ssd_unseenday_threshold', 1 + ) + table.accessor.downpour_accessor_param.show_click_decay_rate = ( + strategy.get('sparse_show_click_decay_rate', 0.98) + ) + table.accessor.downpour_accessor_param.delete_threshold = ( + strategy.get('sparse_delete_threshold', 0.8) + ) + converter = strategy.get( + 'sparse_converter', + "(scripts/xbox_compressor_mf.py | bin/xbox_pb_converter)", + ) + deconverter = strategy.get( + 'sparse_deconverter', + "(bin/xbox_pb_deconverter | scripts/xbox_decompressor_mf.awk)", + ) + + table1 = table.accessor.table_accessor_save_param.add() + table1.param = 1 + table1.converter = converter + table1.deconverter = deconverter + + table2 = table.accessor.table_accessor_save_param.add() + table2.param = 2 + table2.converter = converter + table2.deconverter = deconverter + elif accessor_class == 'DownpourSparseValueAccessor': + optimizer_name = strategy.get("sparse_optimizer", "adam") + table.accessor.sparse_commonsgd_param.name = optimizer_name + table.accessor.embedx_dim = strategy.get('sparse_embedx_dim', 8) + table.accessor.fea_dim = int(table.accessor.embedx_dim) + if optimizer_name == "naive": + table.accessor.sparse_commonsgd_param.naive.learning_rate = strategy.get( + 'sparse_learning_rate', 0.05 + ) + table.accessor.sparse_commonsgd_param.naive.initial_range = strategy.get( + 'sparse_initial_range', 1e-4 + 
) + if strategy.get('sparse_weight_bounds') is None: + table.accessor.sparse_commonsgd_param.naive.weight_bounds.extend( + [-10, 10] + ) + else: + table.accessor.sparse_commonsgd_param.naive.weight_bounds.extend( + strategy.get('sparse_weight_bounds') + ) + elif optimizer_name == "adagrad": + table.accessor.sparse_commonsgd_param.adagrad.learning_rate = strategy.get( + 'sparse_learning_rate', 0.05 + ) + table.accessor.sparse_commonsgd_param.adagrad.initial_range = strategy.get( + 'sparse_initial_range', 1e-4 + ) + table.accessor.sparse_commonsgd_param.adagrad.initial_g2sum = strategy.get( + 'sparse_initial_g2sum', 3 + ) + if strategy.get('sparse_weight_bounds') is None: + table.accessor.sparse_commonsgd_param.adagrad.weight_bounds.extend( + [-10, 10] + ) + else: + table.accessor.sparse_commonsgd_param.adagrad.weight_bounds.extend( + strategy.get('sparse_weight_bounds') + ) + elif optimizer_name == "adam": + table.accessor.sparse_commonsgd_param.adam.learning_rate = ( + strategy.get('sparse_learning_rate', 0.001) + ) + table.accessor.sparse_commonsgd_param.adam.initial_range = ( + strategy.get('sparse_initial_range', 1e-4) + ) + table.accessor.sparse_commonsgd_param.adam.beta1_decay_rate = strategy.get( + 'sparse_beta1_decay_rate', 0.9 + ) + table.accessor.sparse_commonsgd_param.adam.beta2_decay_rate = strategy.get( + 'sparse_beta2_decay_rate', 0.999 + ) + table.accessor.sparse_commonsgd_param.adam.ada_epsilon = ( + strategy.get('sparse_ada_epsilon', 1e-8) + ) + if strategy.get('sparse_weight_bounds') is None: + table.accessor.sparse_commonsgd_param.adam.weight_bounds.extend( + [-10, 10] + ) + else: + table.accessor.sparse_commonsgd_param.adam.weight_bounds.extend( + strategy.get('sparse_weight_bounds') + ) + converter = strategy.get( + 'sparse_converter', + "(scripts/xbox_compressor_mf.py | bin/xbox_pb_converter)", + ) + deconverter = strategy.get( + 'sparse_deconverter', + "(bin/xbox_pb_deconverter | scripts/xbox_decompressor_mf.awk)", + ) + + table1 = 
table.accessor.table_accessor_save_param.add() + table1.param = 1 + table1.converter = converter + table1.deconverter = deconverter + + table2 = table.accessor.table_accessor_save_param.add() + table2.param = 2 + table2.converter = converter + table2.deconverter = deconverter + elif ( + accessor_class == 'DownpourUnitAccessor' + or accessor_class == 'DownpourDoubleUnitAccessor' + ): + self.add_sparse_table_common_config(table, strategy) + self.add_sparse_optimizer( + table.accessor.embed_sgd_param, strategy, "embed_" + ) + self.add_sparse_optimizer( + table.accessor.embedx_sgd_param, strategy, "embedx_" + ) + + def add_dense_table( + self, table_id, param_var, grad_var, strategy, sparse_table_names + ): + """ + Args: + table_id(int): id of sparse params table + param_var(list): param vars + grad_var(list): param grad vars + strategy(dict): the dense config dict + sparse_table_names(list): sparse table names + Returns: + return None + """ + fea_dim = 0 + dense_param_vars = [] + for p in param_var: + if p.name not in sparse_table_names: + dense_param_vars.append(p) + + for param in dense_param_vars: + fea_dim += reduce(lambda x, y: x * y, param.shape, 1) + + for table in self._server.downpour_server_param.downpour_table_param: + if table.table_id == table_id: + if table.type == pslib.PS_DENSE_TABLE: + table.accessor.fea_dim = fea_dim + return + else: + raise ValueError( + "expect table %s type=%s, but actual type=%s" + % (table_id, pslib.PS_DENSE_TABLE, table.type) + ) + + if strategy is None: + strategy = dict() + table = self._server.downpour_server_param.downpour_table_param.add() + table.table_id = table_id + support_dense_key_list = [ + 'dense_table_class', + 'dense_compress_in_save', + 'dense_accessor_class', + 'dense_optimizer', + 'dense_learning_rate', + 'dense_avg_decay', + 'dense_ada_decay', + 'dense_ada_epsilon', + 'dense_mom_decay', + 'dense_naive_lr', + ] + + for key in strategy: + if key not in support_dense_key_list: + raise ValueError("strategy key 
'%s' not support" % (key)) + + table.table_class = strategy.get( + 'dense_table_class', "DownpourDenseTable" + ) + table.type = pslib.PS_DENSE_TABLE + table.compress_in_save = strategy.get('dense_compress_in_save', True) + table.accessor.accessor_class = strategy.get( + 'dense_accessor_class', "DownpourDenseValueAccessor" + ) + table.accessor.dense_sgd_param.name = strategy.get( + 'dense_optimizer', "adam" + ) + table.accessor.dense_sgd_param.adam.learning_rate = strategy.get( + 'dense_learning_rate', 5e-06 + ) + table.accessor.dense_sgd_param.adam.avg_decay_rate = strategy.get( + 'dense_avg_decay', 0.999993 + ) + table.accessor.dense_sgd_param.adam.ada_decay_rate = strategy.get( + 'dense_ada_decay', 0.9999 + ) + table.accessor.dense_sgd_param.adam.ada_epsilon = strategy.get( + 'dense_ada_epsilon', 1e-8 + ) + table.accessor.dense_sgd_param.adam.mom_decay_rate = strategy.get( + 'dense_mom_decay', 0.99 + ) + table.accessor.dense_sgd_param.naive.learning_rate = strategy.get( + 'dense_naive_lr', 0.0002 + ) + table.accessor.fea_dim = fea_dim + + def add_data_norm_table( + self, + table_id, + learning_rate, + param_var, + grad_var, + strategy, + sparse_table_names, + ): + """ + Args: + table_id(int): id of datanorm table + learning_rate(float): the learning rate used to update parameters + param_var(list): param vars + grad_var(list): param grad vars + strategy(dict): the datanorm config dict + sparse_table_names(list): sparse table names + Returns: + return None + """ + fea_dim = 0 + dense_param_vars = [] + for p in param_var: + if p.name not in sparse_table_names: + dense_param_vars.append(p) + + for param in dense_param_vars: + fea_dim += reduce(lambda x, y: x * y, param.shape, 1) + + for table in self._server.downpour_server_param.downpour_table_param: + if table.table_id == table_id: + if table.type == pslib.PS_DENSE_TABLE: + table.accessor.fea_dim = fea_dim + return + else: + raise ValueError( + "expect table %s type=%s, but actual type=%s" + % (table_id, 
pslib.PS_DENSE_TABLE, table.type) + ) + if strategy is None: + strategy = dict() + + support_datanorm_key_list = [ + 'datanorm_table_class', + 'datanorm_compress_in_save', + 'datanorm_accessor_class', + 'datanorm_operation', + 'datanorm_decay_rate', + ] + + for key in strategy: + if key not in support_datanorm_key_list: + raise ValueError("strategy key '%s' not support" % (key)) + + table = self._server.downpour_server_param.downpour_table_param.add() + table.table_id = table_id + table.table_class = strategy.get( + 'datanorm_table_class', 'DownpourDenseTable' + ) + table.type = pslib.PS_DENSE_TABLE + table.compress_in_save = strategy.get('datanorm_compress_in_save', True) + table.accessor.accessor_class = strategy.get( + 'datanorm_accessor_class', 'DownpourDenseValueAccessor' + ) + table.accessor.dense_sgd_param.name = strategy.get( + 'datanorm_operation', 'summary' + ) + table.accessor.dense_sgd_param.summary.summary_decay_rate = ( + strategy.get('datanorm_decay_rate', 0.999999) + ) + table.accessor.fea_dim = fea_dim + + def add_sparse_optimizer(self, sgd, strategy, prefix): + optimizer_name = strategy.get(prefix + "sparse_optimizer", "adagrad") + sgd.name = optimizer_name + if optimizer_name == "naive": + sgd.naive.learning_rate = strategy.get( + prefix + 'sparse_learning_rate', 0.05 + ) + sgd.naive.initial_range = strategy.get( + prefix + 'sparse_initial_range', 1e-4 + ) + bounds = strategy.get(prefix + 'sparse_weight_bounds', [-10, 10]) + sgd.naive.weight_bounds.extend(bounds) + elif optimizer_name == "adagrad": + sgd.adagrad.learning_rate = strategy.get( + prefix + 'sparse_learning_rate', 0.05 + ) + sgd.adagrad.initial_range = strategy.get( + prefix + 'sparse_initial_range', 1e-4 + ) + if prefix == "embed_": + sgd.adagrad.initial_range = 0 + sgd.adagrad.initial_g2sum = strategy.get( + prefix + 'sparse_initial_g2sum', 3 + ) + bounds = strategy.get(prefix + 'sparse_weight_bounds', [-10, 10]) + sgd.adagrad.weight_bounds.extend(bounds) + elif optimizer_name == 
"std_adagrad": + sgd.adagrad.learning_rate = strategy.get( + prefix + 'sparse_learning_rate', 0.05 + ) + sgd.adagrad.initial_range = strategy.get( + prefix + 'sparse_initial_range', 1e-4 + ) + if prefix == "embed_": + sgd.adagrad.initial_range = 0 + sgd.adagrad.initial_g2sum = strategy.get( + prefix + 'sparse_initial_g2sum', 3 + ) + bounds = strategy.get(prefix + 'sparse_weight_bounds', [-10, 10]) + sgd.adagrad.weight_bounds.extend(bounds) + elif optimizer_name == "adam": + sgd.adam.learning_rate = strategy.get( + prefix + 'sparse_learning_rate', 0.001 + ) + sgd.adam.initial_range = strategy.get( + prefix + 'sparse_initial_range', 1e-4 + ) + sgd.adam.beta1_decay_rate = strategy.get( + prefix + 'sparse_beta1_decay_rate', 0.9 + ) + sgd.adam.beta2_decay_rate = strategy.get( + prefix + 'sparse_beta2_decay_rate', 0.999 + ) + sgd.adam.ada_epsilon = strategy.get( + prefix + 'sparse_ada_epsilon', 1e-8 + ) + bounds = strategy.get(prefix + 'sparse_weight_bounds', [-10, 10]) + sgd.adam.weight_bounds.extend(bounds) + + def add_sparse_table_common_config(self, table, strategy): + table.accessor.embedx_dim = strategy.get('sparse_embedx_dim', 8) + table.accessor.embedx_threshold = strategy.get( + 'sparse_embedx_threshold', 10 + ) + table.accessor.fea_dim = int(table.accessor.embedx_dim) + 3 + table.accessor.downpour_accessor_param.nonclk_coeff = strategy.get( + 'sparse_nonclk_coeff', 0.1 + ) + table.accessor.downpour_accessor_param.click_coeff = strategy.get( + 'sparse_click_coeff', 1 + ) + table.accessor.downpour_accessor_param.base_threshold = strategy.get( + 'sparse_base_threshold', 1.5 + ) + table.accessor.downpour_accessor_param.delta_threshold = strategy.get( + 'sparse_delta_threshold', 0.25 + ) + table.accessor.downpour_accessor_param.delta_keep_days = strategy.get( + 'sparse_delta_keep_days', 16 + ) + table.accessor.downpour_accessor_param.delete_after_unseen_days = ( + strategy.get('sparse_delete_after_unseen_days', 30) + ) + 
table.accessor.downpour_accessor_param.show_click_decay_rate = ( + strategy.get('sparse_show_click_decay_rate', 0.98) + ) + table.accessor.downpour_accessor_param.delete_threshold = strategy.get( + 'sparse_delete_threshold', 0.8 + ) + converter = strategy.get( + 'sparse_converter', + "(scripts/xbox_compressor_mf.py | bin/xbox_pb_converter)", + ) + deconverter = strategy.get( + 'sparse_deconverter', + "(bin/xbox_pb_deconverter | scripts/xbox_decompressor_mf.awk)", + ) + + table1 = table.accessor.table_accessor_save_param.add() + table1.param = 1 + table1.converter = converter + table1.deconverter = deconverter + + table2 = table.accessor.table_accessor_save_param.add() + table2.param = 2 + table2.converter = converter + table2.deconverter = deconverter + + def get_desc(self): + """ + Return downpour server program_desc + """ + return self._server + + +class DownpourWorker(Worker): + """ + DownpourWorker class is used to generate worker program_desc + Args: + window (int): push params frequency + worker: it is pslib.DownpourTrainerParameter + Examples: + worker = DownpourWorker(1) + """ + + def __init__(self, window): + self.window = window + self._worker = pslib.DownpourTrainerParameter() + + def add_sparse_table( + self, table_id, slot_key_vars, slot_value_vars, slot_value_grads=None + ): + """ + Args: + table_id(int): id of sparse params table + slot_key_vars(list): slot key id + slot_value_vars(list): slot key value after embedding + slot_value_grads(list): grad of all params, default is None + Returns: + return None + """ + if slot_value_grads is None: + slot_value_grad_names = [ + var.name + "@GRAD" for var in slot_value_vars + ] + else: + value_to_key = {} + for i in range(len(slot_key_vars)): + value_to_key[slot_value_vars[i].name] = slot_key_vars[i] + slot_value_grad_names = [] + all_grad_names = [var.name for var in slot_value_grads] + for var in slot_value_vars: + if var.name + "@GRAD" in all_grad_names: + slot_value_grad_names.append(var.name + "@GRAD") + 
sorted_slot_value_vars = [ + i + for i in slot_value_vars + if i.name + "@GRAD" in slot_value_grad_names + ] + sorted_slot_value_vars += [ + i + for i in slot_value_vars + if i.name + "@GRAD" not in slot_value_grad_names + ] + sorted_slot_key_vars = [ + value_to_key[v.name] for v in sorted_slot_value_vars + ] + + target_table = None + for table in self._worker.sparse_table: + if table.table_id == table_id: + keys = table.slot_key + key_names = [var.name for var in sorted_slot_key_vars] + for key_name in key_names: + if key_name not in keys: + raise ValueError( + "sparse table %s slot_key error" % table_id + ) + target_table = table + break + + table = target_table + if table is not None: + self._worker.sparse_table.remove(table) + table = self._worker.sparse_table.add() + table.table_id = table_id + table.slot_key.extend([var.name for var in sorted_slot_key_vars]) + table.slot_value.extend([var.name for var in sorted_slot_value_vars]) + table.slot_gradient.extend(slot_value_grad_names) + + def add_dense_table( + self, + table_id, + learning_rate, + param_vars, + grad_vars, + dense_start_table_id, + sparse_table_names, + ): + r""" + Args: + table_id(int): id of sparse params table + learning_rate(float): the learning rate used to update parameters. \ + Can be a float value + param_vars(list): all dense param. it is a list. + grad_vars(list): all dense grad parm it is a list. 
+ dense_start_table_id(int): dense table start index + sparse_table_names(list): sparse table names + Returns: + return None + """ + sparse_table_name_grad = [] + for name in sparse_table_names: + sparse_table_name_grad.append(name + "@GRAD") + + dense_param_name = [] + for p in param_vars: + if p.name not in sparse_table_names: + dense_param_name.append(p.name) + + dense_grad_name = [] + for g in grad_vars: + if g.name not in sparse_table_name_grad: + dense_grad_name.append(g.name) + + dense_param_name.sort() + dense_grad_name.sort() + + for table in self._worker.dense_table: + if table.table_id == table_id: + desc_dense_param_name = list(table.dense_variable_name) + desc_dense_param_name.sort() + + if dense_param_name == desc_dense_param_name: + desc_dense_grad_name = list( + table.dense_gradient_variable_name + ) + desc_dense_grad_name.sort() + if dense_grad_name == desc_dense_grad_name: + return + else: + raise ValueError( + "dense table %s dense_gradient_variable_name " + "error" % table_id + ) + else: + raise ValueError( + "dense table %s dense_variable_name error" % table_id + ) + + table = self._worker.dense_table.add() + table.table_id = table_id + + # def cmp_fc(x, y): + # if x.startswith("fc_") and y.startswith("fc_"): + # index_x = x.find('.') + # index_y = y.find('.') + # if index_x > 0 and index_y > 0: + # num_x = x[3:index_x] + # num_y = y[3:index_y] + # if num_x.isdigit() and num_y.isdigit(): + # if int(num_x) < int(num_y): + # return -1 + # if int(num_x) > int(num_y): + # return 1 + # if x[index_x + 1] == 'w' and y[index_y + 1] == 'b': + # return -1 + # if x[index_x + 1] == 'b' and y[index_y + 1] == 'w': + # return 1 + # if x < y: + # return -1 + # else: + # return 1 + + # table.dense_variable_name.extend(sorted(dense_param_name, cmp_fc)) + # table.dense_gradient_variable_name.extend( + # sorted(dense_grad_name, cmp_fc)) + table.dense_variable_name.extend(dense_param_name) + table.dense_gradient_variable_name.extend(dense_grad_name) + + def 
get_desc(self): + """ + Return downpour worker program_desc + """ + return self._worker diff --git a/python/paddle/incubate/fleet/parameter_server/pslib/optimizer_factory.py b/python/paddle/incubate/fleet/parameter_server/pslib/optimizer_factory.py new file mode 100644 index 0000000000000000000000000000000000000000..3a480a36a45976b8dd825daf998c538df709fff3 --- /dev/null +++ b/python/paddle/incubate/fleet/parameter_server/pslib/optimizer_factory.py @@ -0,0 +1,1071 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Optimizer Factory.""" + +__all__ = ["DistributedAdam", "FLEET_GLOBAL_DICT"] +import copy +import logging +import os +from collections import OrderedDict + +from google.protobuf import text_format + +import paddle +from paddle.framework import core + +from . import ps_pb2 as pslib +from .node import DownpourServer, DownpourWorker + +OpRole = core.op_proto_and_checker_maker.OpRole +# this dict is for store info about pull/push sparse ops. 
+FLEET_GLOBAL_DICT = { + # global settings + "enable": False, + "emb_to_table": {}, + "emb_to_accessor": {}, + "emb_to_size": {}, + # current embedding settings + "cur_sparse_id": 0, + "cur_accessor": "", + "click_name": "", + "scale_sparse_grad": None, +} + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) +formatter = logging.Formatter(fmt='%(asctime)s - %(levelname)s - %(message)s') +ch = logging.StreamHandler() +ch.setFormatter(formatter) +logger.addHandler(ch) + + +class DistributedOptimizerImplBase: + """ + DistributedOptimizerImplBase + base class of optimizers + """ + + def __init__(self, optimizer): + self._optimizer = optimizer + self._learning_rate = optimizer._learning_rate + self._regularization = optimizer.regularization + + def minimize( + self, + losses, + startup_program=None, + parameter_list=None, + no_grad_set=None, + ): + """ + Args: + losses(Variable): loss variable defined by user + startup_program(Program): startup program that defined by user + parameter_list(str list): parameter names defined by users + no_grad_set(set): a set of variables that is defined by users + so that these variables do not need gradient computation + """ + pass + + +class DistributedAdam(DistributedOptimizerImplBase): + """ + DistributedAdam + adam optimizer in distributed training + """ + + def __init__(self, optimizer): + # todo(guru4elephant): add more optimizers here as argument + # todo(guru4elephant): make learning_rate as a variable + super().__init__(optimizer) + self._window = 1 + self.type = "downpour" + self.data_norm_name = [ + ".batch_size", + ".batch_square_sum", + ".batch_sum", + ".batch_size@GRAD", + ".batch_square_sum@GRAD", + ".batch_sum@GRAD", + ] + self.supported_embedding_types = [ + "lookup_table", + "pull_sparse", + "pull_sparse_v2", + "pull_box_sparse", + "pull_gpups_sparse", + ] + self.supported_embedding_grad_types = [ + "lookup_table_grad", + "push_sparse", + "push_sparse_v2", + ] + op_maker = 
core.op_proto_and_checker_maker + self.op_role_key = op_maker.kOpRoleAttrName() + + def _find_distributed_lookup_table_inputs(self, program, table_names): + """ + Find input variable of distribute lookup table in program. + We could support multi-distribute table now. + Args: + program(Program): given program, locate distributed lookup table + table_name(str): given table names that is found beforehand + Returns: + inputs + """ + local_vars = program.current_block().vars + inputs_dict = dict() + for table_name in table_names: + inputs_dict[table_name] = [] + + for op in program.global_block().ops: + if op.type in self.supported_embedding_types: + if op.input("W")[0] in table_names: + inputs_dict[op.input("W")[0]].extend( + [local_vars[name] for name in op.input("Ids")] + ) + return inputs_dict + + def _find_distributed_lookup_table_outputs(self, program, table_names): + """ + Find output variable of distribute lookup table in program. + We could support multi-distribute table now. + Args: + programs(Program): given program, locate distributed lookup table + table_name(str): given table name that is found beforehand + Returns: + outputs + """ + local_vars = program.current_block().vars + outputs_dict = dict() + for table_name in table_names: + outputs_dict[table_name] = [] + + for op in program.global_block().ops: + if op.type in self.supported_embedding_types: + if op.input("W")[0] in table_names: + outputs_dict[op.input("W")[0]].extend( + [local_vars[name] for name in op.output("Out")] + ) + return outputs_dict + + def _find_distributed_lookup_table_grads(self, program, table_names): + local_vars = program.current_block().vars + grads_dict = dict() + for table_name in table_names: + grads_dict[table_name] = [] + + for op in program.global_block().ops: + if op.type in self.supported_embedding_grad_types: + if op.input("W")[0] in table_names: + grads_dict[op.input("W")[0]].extend( + [local_vars[name] for name in op.input("Out@GRAD")] + ) + return grads_dict + + def 
_is_optimizer_op(self, op): + return self.op_role_key in op.attr_names and int( + op.all_attrs()[self.op_role_key] + ) & int(OpRole.Optimize) + + def _remove_optimize_op_for_embedding(self, loss, table_name): + """ + find multi-sparse-table + """ + table_name = [name + "@GRAD" for name in table_name] + need_remove_op_index = [] + block = loss.block.program.global_block() + for ids, op in list(enumerate(block.ops)): + if self._is_optimizer_op(op): + if op.input("Grad")[0] in table_name: + need_remove_op_index.append(ids) + + need_remove_op_index.sort(reverse=True) + for index in need_remove_op_index: + block._remove_op(index) + + def _find_multi_distributed_lookup_table(self, losses): + """ + find multi-sparse-table + """ + table_names = set() + cnt = 0 + tmp_list = [] + ret_list = [] + for loss in losses: + for op in loss.block.program.global_block().ops: + if op.type in self.supported_embedding_types: + if op.attr('is_distributed') is True: + table_name = op.input("W")[0] + if table_name not in table_names: + table_names.add(table_name) + tmp_list.append([table_name, cnt]) + cnt += 1 + tmp_list.sort(key=lambda k: k[1]) + for x in tmp_list: + ret_list.append(x[0]) + return ret_list + + def _if_last_block(self, op, _equal_dict): + # for conditional_block op + cond_str = op.input('Cond')[0] + bool_test = False + if cond_str.startswith('equal'): + bool_test = True + vars_ = op.input('Input') + equal_keys = _equal_dict.keys() + for var_cond in vars_: + if var_cond in equal_keys: + if bool_test: + print("the conditional block is error") + return False + return True + + def _generte_cond_para_map( + self, op, _fill_value_dict, _equal_fill_dict, _now_program, _all_params + ): + # generate cond value to parameter map recursively + cond_str = op.input('Cond')[0] + vars_ = op.input('Input') + + if self._if_last_block(op, _equal_fill_dict): + vars_ = op.input('Input') + cond_key = "" + if cond_str.startswith('equal'): + cond_key = 
int(_fill_value_dict[_equal_fill_dict[cond_str]]) + else: + cond_key = -1 + p_list = [] + for var_cond in vars_: + if var_cond in _all_params: + p_list.append(var_cond) + + self._cond_params[cond_key] = p_list + self._other_params.extend(p_list) + else: + ops_cond = _now_program.block(int(op.attr('sub_block').id)).ops + for op in ops_cond: + if op.type == 'conditional_block': + self._generte_cond_para_map( + op, + _fill_value_dict, + _equal_fill_dict, + _now_program, + _all_params, + ) + + def _has_conditional_block(self, loss): + now_program = loss.block.program + root_block = now_program.block(0) + ops_ = root_block.ops + for op in ops_: + if op.type == 'conditional_block': + return True + return False + + def _check_params_grads(self, params, grads): + if len(params) != len(grads): + raise ValueError( + "params size != grads size, %s vs %s" + % (len(params), len(grads)) + ) + + pname2grad = dict() + for i in range(len(params)): + pname = params[i].name + gname = grads[i].name + if pname != gname[:-5]: + raise ValueError(" params != grads , %s vs %s" % (pname, gname)) + pname2grad[pname] = grads[i] + + return pname2grad + + def _generate_multi_dense_table( + self, + params, + grads, + cond_params, + other_params, + sparse_table_names, + dense_table_id=0, + ): + # generate multi dense table by cond value + pname2grad = self._check_params_grads(params, grads) + root_params_list = [] + root_grads_list = [] + dense_tables = [] + for i, p in enumerate(params): + if p.name not in other_params and p.name not in sparse_table_names: + root_params_list.append(p) + root_grads_list.append(grads[i]) + if len(root_params_list) > 0: + dense_tables.append(dense_table_id) + dense_table_id += 1 + lists_params = [[] for i in range(len(cond_params.keys()))] + lists_grads = [[] for i in range(len(cond_params.keys()))] + + key_id = 0 + name2key = dict() + cond2denseid = dict() + for key, value in cond_params.items(): + cond2denseid[key] = dense_table_id + 
dense_tables.append(dense_table_id) + dense_table_id += 1 + for v in value: + name2key[v] = key_id + key_id += 1 + + for p in params: + if p.name in other_params: + lists_params[name2key[p.name]].append(p) + lists_grads[name2key[p.name]].append(pname2grad[p.name]) + + return ( + dense_tables, + cond2denseid, + lists_params, + lists_grads, + root_params_list, + root_grads_list, + ) + + def _gen_distributed_emb_to_size_dict(self, program): + d_size = dict() + local_vars = program.current_block().vars + + for op in program.global_block().ops: + if op.type in self.supported_embedding_types: + if op.attr('is_distributed') is True: + table_name = op.input("W")[0] + emb_size = local_vars[table_name].shape[-1] + if d_size.get(table_name) is None: + d_size[table_name] = emb_size + elif d_size[table_name] != emb_size: + raise ValueError( + "embedding size error: %s vs %s" + % (emb_size, d_size[table_name]) + ) + + return d_size + + def _check_config_fleet_with_program_op( + self, strategy, table_name, emb_to_size + ): + if strategy.get(table_name) is None: + strategy[table_name] = dict() + st = strategy[table_name] + + accessor = "DownpourCtrAccessor" + if st.get("sparse_accessor_class") is not None: + accessor = st["sparse_accessor_class"] + + # set sparse_embedx_dim in the strategy according to accessor and use_cvm config + if ( + accessor == "DownpourFeatureValueAccessor" + or accessor == "DownpourCtrAccessor" + or accessor == "DownpourCtrDymfAccessor" + or accessor == "DownpourDoubleUnitAccessor" + or accessor == "DownpourUnitAccessor" + ): + if ( + st.get("sparse_embedx_dim") is not None + and strategy.get("use_cvm") is True + and st["sparse_embedx_dim"] != emb_to_size[table_name] - 3 + ): + raise ValueError( + "fleet config sparse_embedx_dim=%s not" + " equal to embedding dim - 3 = %s" + % (st["sparse_embedx_dim"], emb_to_size[table_name] - 3) + ) + if ( + st.get("sparse_embedx_dim") is not None + and strategy.get("use_cvm") is False + and st["sparse_embedx_dim"] != 
emb_to_size[table_name] - 1 + ): + raise ValueError( + "fleet config sparse_embedx_dim=%s not" + " equal to embedding dim - 1 = %s" + % (st["sparse_embedx_dim"], emb_to_size[table_name] - 1) + ) + if ( + st.get("sparse_embedx_dim") is None + and strategy.get("use_cvm") is True + ): + logger.warning( + "sparse embedding dim for table name '{}' is: {}, while sparse_embedx_dim " + "with same sparse table name is not set in config_fleet.py. " + "Hence automatically set sparse_embedx_dim = {} - 3.".format( + table_name, + emb_to_size[table_name], + emb_to_size[table_name], + ) + ) + st["sparse_embedx_dim"] = emb_to_size[table_name] - 3 + if ( + st.get("sparse_embedx_dim") is None + and strategy.get("use_cvm") is False + ): + logger.warning( + "sparse embedding dim for table name '{}' is: {}, while sparse_embedx_dim " + "with same sparse table name is not set in config_fleet.py. " + "Hence automatically set sparse_embedx_dim = {} - 1.".format( + table_name, + emb_to_size[table_name], + emb_to_size[table_name], + ) + ) + st["sparse_embedx_dim"] = emb_to_size[table_name] - 1 + elif accessor == "DownpourSparseValueAccessor": + if ( + st.get("sparse_embedx_dim") is not None + and st["sparse_embedx_dim"] != emb_to_size[table_name] + ): + raise ValueError( + "fleet config sparse_embedx_dim=%s not" + " equal to embedding dim = %s" + % (st["sparse_embedx_dim"], emb_to_size[table_name]) + ) + if st.get("sparse_embedx_dim") is None: + logger.warning( + "sparse embedding dim for table name '{}' is: {}, while sparse_embedx_dim " + "with same sparse table name is not set in config_fleet.py. 
" + "Hence automatically set sparse_embedx_dim = {}.".format( + table_name, + emb_to_size[table_name], + emb_to_size[table_name], + ) + ) + st["sparse_embedx_dim"] = emb_to_size[table_name] + + return strategy + + def _minimize( + self, + losses, + startup_program=None, + parameter_list=None, + no_grad_set=None, + strategy={}, + ): + """ + DownpounSGD is a distributed optimizer so + that user can call minimize to generate backward + operators and optimization operators within minimize function + Args: + loss(Variable): loss variable defined by user + startup_program(Program): startup program that defined by user + parameter_list(str list): parameter names defined by users + no_grad_set(set): a set of variables that is defined by users + so that these variables do not need gradient computation + strategy(dict): user-defined properties + Returns: + [optimize_ops, grads_and_weights] + """ + # sparse table names of each program + prog_id_to_sparse_table = OrderedDict() + # inputs_dict and outputs_dict of sparse tables of each program + prog_id_to_inputs_dict = OrderedDict() + prog_id_to_outputs_dict = OrderedDict() + # related to PSParameter + ps_param = pslib.PSParameter() + # related to ServerParameter + server = DownpourServer() + # program to worker (related to DownpourTrainerParameter) + prog_id_to_worker = OrderedDict() + # param_grads of each program + prog_id_to_param_grads = OrderedDict() + # sparse_grads of each program + prog_id_to_sparse_grads = OrderedDict() + # unique program set + program_id_set = set() + + sparse_table_to_index = OrderedDict() + sparse_table_index = 0 + for num in range(len(losses)): + loss = losses[num] + parameters = None + if parameter_list is not None: + parameters = parameter_list[num] + prog_id = str(id(loss.block.program)) + # param_grads of program + params_grads = sorted( + paddle.static.append_backward(loss, parameters, no_grad_set), + key=lambda x: x[0].name, + ) + + flag_use_ps_gpu = strategy.get("use_ps_gpu", False) + if 
flag_use_ps_gpu: + if not isinstance(startup_program, list): + startup_program = [startup_program] + optimizer = copy.deepcopy(self._optimizer) + optimize_ops = optimizer.apply_optimize( + loss, + startup_program=startup_program[num], + params_grads=params_grads, + ) + embedding_table = self._find_multi_distributed_lookup_table( + [loss] + ) + self._remove_optimize_op_for_embedding(loss, embedding_table) + # has condition_block op means multi-task + flag_multi_task = self._has_conditional_block(loss) + if flag_multi_task: + self._cond_params = dict() + self._other_params = [] + now_program = loss.block.program + root_block = now_program.block(0) + all_params = [] + for par in root_block.all_parameters(): + all_params.append(par.name) + + ops_ = root_block.ops + fill_value_dict = dict() + equal_fill_dict = dict() + for op in ops_: + # conditional_block op must has fill_constant and equal op + if op.type == 'fill_constant': + fill_value_dict[op.output('Out')[0]] = op.attr('value') + if op.type == 'equal': + equal_fill_dict[op.output('Out')[0]] = op.input('Y')[0] + if op.type == 'conditional_block': + self._generte_cond_para_map( + op, + fill_value_dict, + equal_fill_dict, + now_program, + all_params, + ) + + if prog_id not in program_id_set: + program_id_set.add(prog_id) + sparse_table = self._find_multi_distributed_lookup_table([loss]) + prog_id_to_sparse_table[prog_id] = sparse_table + + # get sparse_table_to_index + for tn in sparse_table: + if sparse_table_to_index.get(tn) is None: + sparse_table_to_index[tn] = sparse_table_index + sparse_table_index += 1 + + # get {table_name: emb_size} dict from program ops + emb_to_size = self._gen_distributed_emb_to_size_dict( + loss.block.program + ) + + # get inputs_dict + inputs_dict = self._find_distributed_lookup_table_inputs( + loss.block.program, sparse_table + ) + prog_id_to_inputs_dict[prog_id] = inputs_dict + # get outputs_dict + outputs_dict = self._find_distributed_lookup_table_outputs( + loss.block.program, 
sparse_table + ) + prog_id_to_outputs_dict[prog_id] = outputs_dict + + prog_id_to_worker[prog_id] = DownpourWorker(self._window) + + grads_dict = self._find_distributed_lookup_table_grads( + loss.block.program, sparse_table + ) + prog_id_to_sparse_grads[prog_id] = grads_dict + + if prog_id not in prog_id_to_param_grads: + prog_id_to_param_grads[prog_id] = [] + prog_id_to_param_grads[prog_id].append(params_grads) + + # if strategy.get("parallel_compute") + + # if user specify a fleet_desc.prototxt file, then load the file + # instead of creating default fleet_desc.prototxt. + # user can specify server_param or trainer_param or fs_client_param. + if strategy.get("fleet_desc_file") is not None: + fleet_desc_file = strategy["fleet_desc_file"] + with open(fleet_desc_file) as f: + text_format.Merge(f.read(), ps_param) + server.get_desc().CopyFrom(ps_param.server_param) + if len(ps_param.trainer_param) == 1: + for k in prog_id_to_worker: + prog_id_to_worker[k].get_desc().CopyFrom( + ps_param.trainer_param[0] + ) + else: + if len(ps_param.trainer_param) != len(prog_id_to_worker): + raise ValueError( + "trainer param size != program size, %s vs %s" + % (len(ps_param.trainer_param), len(prog_id_to_worker)) + ) + idx = 0 + # prog_id_to_worker is OrderedDict + for k in prog_id_to_worker: + prog_id_to_worker[k].get_desc().CopyFrom( + ps_param.trainer_param[idx] + ) + idx += 1 + + # check config in op defination and fleet config + if FLEET_GLOBAL_DICT["enable"]: + one_slot = None + strategy["device_worker"] = "Hogwild" + emb_to_table = FLEET_GLOBAL_DICT["emb_to_table"] + emb_to_accessor = FLEET_GLOBAL_DICT["emb_to_accessor"] + emb_to_size = FLEET_GLOBAL_DICT["emb_to_size"] + if len(sparse_table_to_index) != len(emb_to_table): + raise ValueError( + "sparse tables from program != sparse tables from op: %s " + "vs %s" % (len(sparse_table_to_index), len(emb_to_table)) + ) + for key in sparse_table_to_index: + if ( + key not in emb_to_table + or sparse_table_to_index[key] != 
emb_to_table[key] + ): + print("sparse_table_to_index ", sparse_table_to_index) + print("emb_to_table ", emb_to_table) + raise ValueError("key error: %s" % key) + if strategy.get(key) is None: + strategy[key] = dict() + st = strategy[key] + + accessor = None + if st.get("sparse_accessor_class") is not None: + accessor = st["sparse_accessor_class"] + tables = ( + server.get_desc().downpour_server_param.downpour_table_param + ) + for table in tables: + if table.table_id == sparse_table_to_index[key]: + accessor = table.accessor.accessor_class + break + + for loss in losses: + for op in loss.block.program.global_block().ops: + if op.type in self.supported_embedding_types: + if accessor is not None and op.has_attr( + "AccessorClass" + ): + op._set_attr("AccessorClass", accessor) + if one_slot is None: + one_slot = ( + loss.block.program.global_block().var( + op.input("Ids")[0] + ) + ) + + # if accessor is None, use default accessor in op definition + if accessor is None: + accessor = emb_to_accessor[key] + # set sparse_embedx_dim in strategy, + # user do not have to set it in config_fleet + if ( + accessor == "DownpourFeatureValueAccessor" + or accessor == "DownpourCtrDymfAccessor" + or accessor == "DownpourCtrAccessor" + or accessor == "DownpourDoubleUnitAccessor" + or accessor == "DownpourUnitAccessor" + ): + if ( + st.get("sparse_embedx_dim") is not None + and st["sparse_embedx_dim"] != emb_to_size[key] - 3 + ): + raise ValueError( + "fleet config sparse_embedx_dim=%s not" + " equal to embedding size - 3 = %s" + % (st["sparse_embedx_dim"], emb_to_size[key] - 3) + ) + st["sparse_embedx_dim"] = emb_to_size[key] - 3 + elif accessor == "DownpourSparseValueAccessor": + if ( + st.get("sparse_embedx_dim") is not None + and st["sparse_embedx_dim"] != emb_to_size[key] + ): + raise ValueError( + "fleet config sparse_embedx_dim=%s not" + " equal to embedding size = %s" + % (st["sparse_embedx_dim"], emb_to_size[key]) + ) + st["sparse_embedx_dim"] = emb_to_size[key] + + # 
ServerParameter add all sparse tables + for tn in sparse_table_to_index: + sparse_table_index = sparse_table_to_index[tn] + st = self._check_config_fleet_with_program_op( + strategy, tn, emb_to_size + ) + if st.get(tn) is not None: + server.add_sparse_table(sparse_table_index, st[tn]) + else: + server.add_sparse_table(sparse_table_index, None) + + # each DownpourTrainerParameter add its own sparse tables + program_id_set.clear() + for loss in losses: + prog_id = str(id(loss.block.program)) + if prog_id not in program_id_set: + program_id_set.add(prog_id) + worker = prog_id_to_worker[prog_id] + inputs_dict = prog_id_to_inputs_dict[prog_id] + outputs_dict = prog_id_to_outputs_dict[prog_id] + for tn in prog_id_to_sparse_table[prog_id]: + sparse_table_index = sparse_table_to_index[tn] + grads_dict = prog_id_to_sparse_grads[prog_id] + worker.add_sparse_table( + sparse_table_index, + inputs_dict[tn], + outputs_dict[tn], + grads_dict[tn], + ) + + dense_start_table_id = len(sparse_table_to_index) + dense_table_index = len(sparse_table_to_index) + program_configs = {} + # ServerParameter add all dense tables + # each DownpourTrainerParameter add its own dense tables + program_id_set.clear() + for loss_index in range(len(losses)): + program_id = str(id(losses[loss_index].block.program)) + if program_id not in program_id_set: + program_id_set.add(program_id) + worker = prog_id_to_worker[program_id] + sparse_table_names = prog_id_to_sparse_table[program_id] + sparse_table_index = [ + sparse_table_to_index[i] for i in sparse_table_names + ] + + program_configs[program_id] = { + "pull_sparse": [t_index for t_index in sparse_table_index], + "push_sparse": [t_index for t_index in sparse_table_index], + } + + params_grads = prog_id_to_param_grads[program_id] + for pg in params_grads: + params = [] + grads = [] + data_norm_params = [] + data_norm_grads = [] + for i in pg: + is_data_norm_data = False + for data_norm_name in self.data_norm_name: + if 
i[0].name.endswith(data_norm_name): + is_data_norm_data = True + data_norm_params.append(i[0]) + if not is_data_norm_data: + params.append(i[0]) + + for i in pg: + is_data_norm_data = False + for data_norm_grad in self.data_norm_name: + if i[0].name.endswith(data_norm_grad): + is_data_norm_data = True + data_norm_grads.append(i[1]) + if not is_data_norm_data: + grads.append(i[1]) + # for new dense table + multi_task_dense_tables_push = [] + multi_task_dense_tables_pull = [] + if flag_multi_task: + ( + dense_tables, + cond2denseid, + lists_params, + lists_grads, + root_params_list, + root_grads_list, + ) = self._generate_multi_dense_table( + params, + grads, + self._cond_params, + self._other_params, + sparse_table_names, + dense_table_index, + ) + program_configs[program_id][ + 'cond2denseid' + ] = cond2denseid + multi_task_dense_tables_push = dense_tables + multi_task_dense_tables_pull = dense_tables[:] + + if strategy.get('dense_table') is not None: + if flag_multi_task: + server_dense_table_index = dense_table_index + if len(root_params_list) > 0: + server.add_dense_table( + server_dense_table_index, + root_params_list, + root_grads_list, + strategy['dense_table'], + sparse_table_names, + ) + server_dense_table_index += 1 + + for i in range(len(lists_params)): + server.add_dense_table( + server_dense_table_index, + lists_params[i], + lists_grads[i], + strategy['dense_table'], + sparse_table_names, + ) + server_dense_table_index += 1 + else: + server.add_dense_table( + dense_table_index, + params, + grads, + strategy['dense_table'], + sparse_table_names, + ) + + else: + server.add_dense_table( + dense_table_index, + params, + grads, + None, + sparse_table_names, + ) + + if flag_multi_task: + + if len(root_params_list) > 0: + worker.add_dense_table( + dense_table_index, + self._learning_rate, + root_params_list, + root_grads_list, + dense_start_table_id, + sparse_table_names, + ) + dense_table_index += 1 + + for i in range(len(lists_params)): + 
worker.add_dense_table( + dense_table_index, + self._learning_rate, + lists_params[i], + lists_grads[i], + dense_start_table_id, + sparse_table_names, + ) + dense_table_index += 1 + + dense_table_index -= 1 + else: + worker.add_dense_table( + dense_table_index, + self._learning_rate, + params, + grads, + dense_start_table_id, + sparse_table_names, + ) + + if FLEET_GLOBAL_DICT["enable"]: + cur_prog = losses[loss_index].block.program + cur_prog.global_block().append_op( + type="push_dense", + inputs={"Ids": one_slot}, + attrs={ + "InputNames": [i.name for i in grads], + "TableId": dense_table_index, + "ScaleDataNorm": strategy.get( + "scale_datanorm", -1 + ), + }, + ) + + if ( + "pull_dense" in program_configs[program_id] + and "push_dense" in program_configs[program_id] + and len(program_configs[program_id]["pull_dense"]) > 0 + ): + if flag_multi_task: + program_configs[program_id]["pull_dense"].extend( + multi_task_dense_tables_pull + ) + program_configs[program_id]["push_dense"].extend( + multi_task_dense_tables_push + ) + else: + program_configs[program_id]["pull_dense"].extend( + [dense_table_index] + ) + program_configs[program_id]["push_dense"].extend( + [dense_table_index] + ) + else: + if flag_multi_task: + program_configs[program_id][ + "pull_dense" + ] = multi_task_dense_tables_pull + program_configs[program_id][ + "push_dense" + ] = multi_task_dense_tables_push + else: + program_configs[program_id]["pull_dense"] = [ + dense_table_index + ] + program_configs[program_id]["push_dense"] = [ + dense_table_index + ] + + if len(data_norm_params) != 0 and len(data_norm_grads) != 0: + dense_table_index += 1 + if strategy.get('datanorm_table') is not None: + server.add_data_norm_table( + dense_table_index, + self._learning_rate, + data_norm_params, + data_norm_grads, + strategy['datanorm_table'], + sparse_table_names, + ) + else: + server.add_data_norm_table( + dense_table_index, + self._learning_rate, + data_norm_params, + data_norm_grads, + None, + 
sparse_table_names, + ) + + worker.add_dense_table( + dense_table_index, + self._learning_rate, + data_norm_params, + data_norm_grads, + dense_start_table_id, + sparse_table_names, + ) + + if FLEET_GLOBAL_DICT["enable"]: + cur_prog = losses[loss_index].block.program + cur_prog.global_block().append_op( + type="push_dense", + inputs={"Ids": one_slot}, + attrs={ + "InputNames": [ + i.name for i in data_norm_grads + ], + "TableId": dense_table_index, + "ScaleDataNorm": strategy.get( + "scale_datanorm", -1 + ), + }, + ) + + program_configs[program_id]["pull_dense"].extend( + [dense_table_index] + ) + program_configs[program_id]["push_dense"].extend( + [dense_table_index] + ) + dense_table_index += 1 + + # Todo(guru4elephant): figure out how to support more sparse parameters + # currently only support lookup_table + worker_skipped_ops = ["lookup_table", "lookup_table_grad"] + if len(worker.get_desc().skip_op) == 0: + worker.get_desc().skip_op.extend(worker_skipped_ops) + + ps_param.server_param.CopyFrom(server.get_desc()) + # prog_id_to_worker is OrderedDict + if len(ps_param.trainer_param) == 0: + for k in prog_id_to_worker: + tp = ps_param.trainer_param.add() + tp.CopyFrom(prog_id_to_worker[k].get_desc()) + + if strategy.get("fs_uri") is not None: + ps_param.fs_client_param.uri = strategy["fs_uri"] + elif ps_param.fs_client_param.uri == "": + ps_param.fs_client_param.uri = "hdfs://your_hdfs_uri" + if strategy.get("fs_user") is not None: + ps_param.fs_client_param.user = strategy["fs_user"] + elif ps_param.fs_client_param.user == "": + ps_param.fs_client_param.user = "your_hdfs_user" + if strategy.get("fs_passwd") is not None: + ps_param.fs_client_param.passwd = strategy["fs_passwd"] + elif ps_param.fs_client_param.passwd == "": + ps_param.fs_client_param.passwd = "your_hdfs_passwd" + if strategy.get("fs_hadoop_bin") is not None: + ps_param.fs_client_param.hadoop_bin = strategy["fs_hadoop_bin"] + elif ps_param.fs_client_param.hadoop_bin == "": + 
ps_param.fs_client_param.hadoop_bin = "$HADOOP_HOME/bin/hadoop" + + opt_info = {} + opt_info["program_id_to_worker"] = prog_id_to_worker + opt_info["program_configs"] = program_configs + opt_info["trainer"] = strategy.get("trainer", "DistMultiTrainer") + opt_info["device_worker"] = strategy.get("device_worker", "DownpourSGD") + opt_info["optimizer"] = "DownpourSGD" + opt_info["fleet_desc"] = ps_param + opt_info["worker_skipped_ops"] = worker_skipped_ops + opt_info["use_cvm"] = strategy.get("use_cvm", False) + opt_info["no_cvm"] = strategy.get("no_cvm", False) + opt_info["scale_sparse_gradient_with_batch_size"] = strategy.get( + "scale_sparse_gradient_with_batch_size", True + ) + opt_info["worker_class"] = strategy.get( + "worker_class", "DownpourWorker" + ) + opt_info["stat_var_names"] = strategy.get("stat_var_names", []) + opt_info["local_tables"] = strategy.get("local_tables", []) + opt_info["async_tables"] = strategy.get("async_tables", []) + opt_info["async_tables"] = strategy.get("async_tables", []) + opt_info["scale_datanorm"] = strategy.get("scale_datanorm", -1) + opt_info["check_nan_var_names"] = strategy.get( + "check_nan_var_names", [] + ) + opt_info["dump_slot"] = False + opt_info["dump_converter"] = "" + opt_info["dump_fields"] = strategy.get("dump_fields", []) + opt_info["dump_file_num"] = strategy.get("dump_file_num", 16) + opt_info["user_define_dump_filename"] = strategy.get( + "user_define_dump_filename", "" + ) + opt_info["dump_fields_path"] = strategy.get("dump_fields_path", "") + opt_info["dump_param"] = strategy.get("dump_param", []) + gpus_env = os.getenv("FLAGS_selected_gpus", "0") + opt_info["worker_places"] = [int(s) for s in gpus_env.split(",")] + opt_info["use_ps_gpu"] = strategy.get("use_ps_gpu", False) + if server._server.downpour_server_param.downpour_table_param[ + 0 + ].accessor.accessor_class in [ + "DownpourCtrAccessor", + "DownpourCtrDoubleAccessor", + "DownpourUnitAccessor", + "DownpourDoubleUnitAccessor", + 
"DownpourCtrDymfAccessor", + ]: + opt_info["dump_slot"] = True + elif ( + server._server.downpour_server_param.downpour_table_param[ + 0 + ].accessor.accessor_class + == "DownpourSparseValueAccessor" + ): + opt_info["no_cvm"] = True + opt_info["adjust_ins_weight"] = strategy.get("adjust_ins_weight", {}) + opt_info["copy_table"] = strategy.get("copy_table", {}) + opt_info["loss_names"] = strategy.get("loss_names", []) + + for loss in losses: + loss.block.program._fleet_opt = opt_info + + param_grads_list = [] + for loss in losses: + prog_id = str(id(loss.block.program)) + param_grads_list.append(prog_id_to_param_grads[prog_id]) + return None, param_grads_list, opt_info diff --git a/python/setup.py.in b/python/setup.py.in index aee0e69a06d6c43b380007f2ce649ffb1ee3e5ba..44a09489560a3b51d4e0d902a9097b6878d8e454 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -438,6 +438,7 @@ packages=['paddle', 'paddle.incubate.distributed.models', 'paddle.incubate.distributed.models.moe', 'paddle.incubate.distributed.models.moe.gate', + 'paddle.incubate.fleet.parameter_server.distribute_transpiler', 'paddle.quantization', 'paddle.quantization.quanters', 'paddle.sparse', diff --git a/setup.py b/setup.py index 9b7cdc6d7f1fbe035648eebab9e55b8030226ab0..50ef5ee480199d303db12a56463026f63590e56a 100644 --- a/setup.py +++ b/setup.py @@ -1324,6 +1324,7 @@ def get_setup_parameters(): 'paddle.incubate.distributed.models', 'paddle.incubate.distributed.models.moe', 'paddle.incubate.distributed.models.moe.gate', + 'paddle.incubate.fleet.parameter_server.distribute_transpiler', 'paddle.quantization', 'paddle.quantization.quanters', 'paddle.sparse',