Unverified commit 60c3ef3a authored by 123malin, committed by GitHub

【paddle.fleet】parameter_server_optimizer support auto_strategy (#27181)

* parameter_server_optimizer support auto_strategy
Parent fde5cfe8
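
For context, a minimal usage sketch of the auto strategy path this commit enables, adapted from the new tests added at the bottom of this diff (the PaddleCloudRoleMaker environment setup is omitted; the exact k_steps chosen depends on the model and on free host memory):

    import paddle
    import paddle.distributed.fleet as fleet
    import paddle.distributed.fleet.base.role_maker as role_maker

    # assumes the usual PADDLE_* environment variables for a trainer are set
    fleet.init(role_maker.PaddleCloudRoleMaker())

    input_x = paddle.fluid.layers.data(name="x", shape=[32], dtype='float32')
    input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')
    fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
    prediction = paddle.fluid.layers.fc(input=fc_1, size=2, act='softmax')
    cost = paddle.fluid.layers.cross_entropy(input=prediction, label=input_y)
    avg_cost = paddle.fluid.layers.mean(x=cost)

    strategy = paddle.distributed.fleet.DistributedStrategy()
    strategy.auto = True  # let ParameterServerOptimizer pick the a_sync mode

    optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
    optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
    optimizer.minimize(avg_cost)

    # After minimize(), a_sync is True and a_sync_configs["k_steps"] is 800
    # (geo) when the dense model fits comfortably in host memory, or 0 (async)
    # otherwise, e.g. for the huge sparse-embedding model in the tests below.
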
@@ -231,7 +231,7 @@ class Fleet(object):
Returns:
int: worker numbers
Examples:
.. code-block:: python
@@ -737,7 +737,7 @@ class Fleet(object):
"""
Set the value of the learning rate manually in the optimizer.
Only work in dygraph mode
Args:
value (float|Tensor): the value of learning rate
@@ -877,7 +877,7 @@ class Fleet(object):
"""
Execute the optimizer once.
Only work in dygraph mode
Returns: None
Examples:
@@ -1019,7 +1019,7 @@ class Fleet(object):
if self.user_defined_strategy._is_strict_auto():
# turn on all the strategy for each optimizer
for opt in distributed_optimizer_list:
- opt._enable_strategy(self.user_defined_strategy)
+ opt._enable_strategy(self.user_defined_strategy, context)
valid_optimizer_list = []
valid_graph_optimizer_list = []
...
@@ -34,6 +34,9 @@ class AMPOptimizer(MetaOptimizerBase):
loss, role_maker, user_defined_optimizer, user_defined_strategy)
def _can_apply(self):
+ if not self.role_maker._is_collective:
+ return False
if self.user_defined_strategy.amp:
return True
return False
@@ -42,7 +45,7 @@ class AMPOptimizer(MetaOptimizerBase):
dist_strategy.amp = False
dist_strategy.amp_configs = {}
- def _enable_strategy(self, dist_strategy):
+ def _enable_strategy(self, dist_strategy, context):
dist_strategy.amp = True
dist_strategy.amp_configs = {
"init_loss_scaling": 32768.0,
...
@@ -53,6 +53,9 @@ class DGCOptimizer(MetaOptimizerBase):
name=opt._name)
def _can_apply(self):
+ if not self.role_maker._is_collective:
+ return False
if self.user_defined_strategy.dgc:
if not isinstance(self.inner_opt, Momentum):
logging.warn("dgc only works on Momentum optimizer")
@@ -69,7 +72,7 @@ class DGCOptimizer(MetaOptimizerBase):
dist_strategy.dgc = False
dist_strategy.dgc_configs = {}
- def _enable_strategy(self, dist_strategy):
+ def _enable_strategy(self, dist_strategy, context):
dist_strategy.dgc = True
dist_strategy.dgc_configs = {"rampup_begin_step": 0, "rampup_step": 1}
@@ -89,5 +92,5 @@ class DGCOptimizer(MetaOptimizerBase):
no_grad_set=None):
optimize_ops, params_grads = \
self.dgc_opt.minimize(loss, startup_program,
parameter_list, no_grad_set)
return optimize_ops, params_grads
@@ -37,15 +37,18 @@ class GradientMergeOptimizer(MetaOptimizerBase):
self.user_defined_strategy.gradient_merge_configs["avg"])
def _can_apply(self):
+ if not self.role_maker._is_collective:
+ return False
can_apply = (self.user_defined_strategy.gradient_merge == True) and \
self.user_defined_strategy.gradient_merge_configs["k_steps"] > 1
return can_apply
def _disable_strategy(self, dist_strategy):
dist_strategy.gradient_merge = False
dist_strategy.gradient_merge_configs = {}
- def _enable_strategy(self, dist_strategy):
+ def _enable_strategy(self, dist_strategy, context):
# we currently do not support auto-enable gradient merge
return
...
@@ -48,7 +48,7 @@ class GraphExecutionOptimizer(MetaOptimizerBase):
callbacks=None):
pass
# should fix the variable
def _setup_nccl_op(self, startup_program, main_program, build_strategy):
trainer_endpoints = self.role_maker.get_trainer_endpoints()
trainers = trainer_endpoints
@@ -94,31 +94,31 @@ class GraphExecutionOptimizer(MetaOptimizerBase):
dist_strategy = self.user_defined_strategy
local_build_strategy = paddle.fluid.BuildStrategy()
local_build_strategy.enable_sequential_execution = \
dist_strategy.build_strategy.enable_sequential_execution
local_build_strategy.fuse_elewise_add_act_ops = \
dist_strategy.build_strategy.fuse_elewise_add_act_ops
local_build_strategy.fuse_bn_act_ops = \
dist_strategy.build_strategy.fuse_bn_act_ops
local_build_strategy.enable_auto_fusion = \
dist_strategy.build_strategy.enable_auto_fusion
local_build_strategy.fuse_relu_depthwise_conv = \
dist_strategy.build_strategy.fuse_relu_depthwise_conv
local_build_strategy.fuse_broadcast_ops = \
dist_strategy.build_strategy.fuse_broadcast_ops
local_build_strategy.fuse_all_optimizer_ops = \
dist_strategy.build_strategy.fuse_all_optimizer_ops
local_build_strategy.enable_inplace = \
dist_strategy.build_strategy.enable_inplace
local_build_strategy.use_hierarchical_allreduce = \
dist_strategy.use_hierarchical_allreduce
local_build_strategy.hierarchical_allreduce_inter_nranks = \
dist_strategy.hierarchical_allreduce_inter_nranks
local_build_strategy.sync_batch_norm = \
dist_strategy.sync_batch_norm
local_build_strategy.fuse_all_reduce_ops = \
dist_strategy.fuse_all_reduce_ops
local_build_strategy.nccl_comm_num = \
dist_strategy.nccl_comm_num
if self.user_defined_strategy.recompute == True:
logging.warn(
@@ -190,7 +190,7 @@ class GraphExecutionOptimizer(MetaOptimizerBase):
# TODO(guru4elephant): should close all PE related flags here
return
- def _enable_strategy(self, dist_strategy):
+ def _enable_strategy(self, dist_strategy, context):
# by default, graph execution strategy is enabled
return
...
@@ -62,6 +62,9 @@ class LambOptimizer(MetaOptimizerBase):
name=opt._name)
def _can_apply(self):
+ if not self.role_maker._is_collective:
+ return False
if self.user_defined_strategy.lamb:
if not isinstance(self.inner_opt, AdamOptimizer):
logging.warn(
@@ -75,7 +78,7 @@ class LambOptimizer(MetaOptimizerBase):
dist_strategy.lamb = False
dist_strategy.lamb_configs = {}
- def _enable_strategy(self, dist_strategy):
+ def _enable_strategy(self, dist_strategy, context):
dist_strategy.lamb = True
dist_strategy.lamb_configs = {
"lamb_weight_decay": 0.01,
@@ -102,5 +105,5 @@ class LambOptimizer(MetaOptimizerBase):
no_grad_set=None):
optimize_ops, params_grads = \
self.lamb_opt.minimize(loss, startup_program,
parameter_list, no_grad_set)
return optimize_ops, params_grads
@@ -49,6 +49,9 @@ class LarsOptimizer(MetaOptimizerBase):
epsilon=configs['epsilon'])
def _can_apply(self):
+ if not self.role_maker._is_collective:
+ return False
if self.user_defined_strategy.lars:
if not isinstance(self.inner_opt, Momentum):
logging.warn(
@@ -62,7 +65,7 @@ class LarsOptimizer(MetaOptimizerBase):
dist_strategy.lars = False
dist_strategy.lars_configs = {}
- def _enable_strategy(self, dist_strategy):
+ def _enable_strategy(self, dist_strategy, context):
dist_strategy.lars = True
dist_strategy.lars_configs = {
"lars_coeff": 0.01,
@@ -89,5 +92,5 @@ class LarsOptimizer(MetaOptimizerBase):
no_grad_set=None):
optimize_ops, params_grads = \
self.lars_opt.minimize(loss, startup_program,
parameter_list, no_grad_set)
return optimize_ops, params_grads
@@ -29,6 +29,9 @@ class LocalSGDOptimizer(MetaOptimizerBase):
self.snapshot_key = '@SNAPSHOT'
def _can_apply(self):
+ if not self.role_maker._is_collective:
+ return False
if not self.user_defined_strategy.localsgd:
return False
@@ -36,15 +39,15 @@ class LocalSGDOptimizer(MetaOptimizerBase):
return False
return isinstance(self.inner_opt, paddle.optimizer.momentum.Momentum) \
or isinstance(self.inner_opt, paddle.fluid.optimizer.Momentum) \
or isinstance(self.inner_opt, paddle.optimizer.sgd.SGD) \
or isinstance(self.inner_opt, paddle.fluid.optimizer.SGD)
def _disable_strategy(self, dist_strategy):
dist_strategy.localsgd = False
dist_strategy.localsgd_configs = {}
- def _enable_strategy(self, dist_strategy):
+ def _enable_strategy(self, dist_strategy, context):
dist_strategy.localsgd = True
dist_strategy.localsgd_configs = {"k_steps": 1}
...
@@ -48,7 +48,7 @@ class MetaOptimizerBase(Optimizer):
raise NotImplementedError("you should implement disable strategy in {}".
format(type(self).__name__))
- def _enable_strategy(self, dist_strategy):
+ def _enable_strategy(self, dist_strategy, context=None):
raise NotImplementedError("you should implement enable strategy in {}".
format(type(self).__name__))
...
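
Every meta optimizer therefore has to accept the extra context argument now. A minimal sketch of the updated contract, using a hypothetical class name that is not part of this diff:

    class NoAutoMetaOptimizer(object):
        # Mirrors the updated MetaOptimizerBase interface: _enable_strategy
        # receives the compile-time context (e.g. context["origin_main_program"])
        # in addition to the strategy object.
        def _enable_strategy(self, dist_strategy, context=None):
            # Optimizers that cannot be enabled automatically simply return,
            # leaving dist_strategy untouched (see GradientMergeOptimizer,
            # PipelineOptimizer and RecomputeOptimizer in this diff).
            return

        def _disable_strategy(self, dist_strategy):
            return
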
@@ -24,6 +24,9 @@ class ParameterServerGraphOptimizer(ParameterServerOptimizer):
self.meta_optimizers_white_list = []
def _can_apply(self):
+ if self.role_maker._is_collective:
+ return False
k_steps = self.user_defined_strategy.a_sync_configs["k_steps"]
if k_steps < 0:
return False
@@ -37,12 +40,11 @@ class ParameterServerGraphOptimizer(ParameterServerOptimizer):
return True
def _disable_strategy(self, dist_strategy):
- dist_strategy.a_sync_configs = {}
+ return
- def _enable_strategy(self, dist_strategy):
+ def _enable_strategy(self, dist_strategy, context):
# only open up the async mode for auto-parallel
- dist_strategy.a_sync = True
- dist_strategy.a_sync_configs = {}
+ return
def _is_graph_out(self):
return True
...
@@ -32,8 +32,6 @@ class ParameterServerOptimizer(MetaOptimizerBase):
def _can_apply(self):
if self.role_maker._is_collective:
return False
- if self.user_defined_strategy.auto == True:
- return True
k_steps = self.user_defined_strategy.a_sync_configs["k_steps"]
return True if k_steps >= 0 else False
@@ -134,7 +132,7 @@ class ParameterServerOptimizer(MetaOptimizerBase):
return _main, _startup
- def _try_auto_apply_geo(self, program, compiled_config):
+ def _can_apply_geo(self, dist_strategy, program):
def get_sys_free_mem():
plat = platform.system()
if platform.system() == "Darwin":
@@ -163,36 +161,28 @@ class ParameterServerOptimizer(MetaOptimizerBase):
"%s platform is unsupported is parameter server optimizer" %
(platform.system()))
- if self.user_defined_strategy.auto == False:
- return
- a_sync_configs = self.user_defined_strategy.a_sync_configs
- if a_sync_configs["k_steps"] >= 0:
- return
- self.user_defined_strategy.a_sync = True
if not isinstance(self.inner_opt, fluid.optimizer.SGDOptimizer):
- # auto async
+ return False
- a_sync_configs["k_steps"] = 0
- self.user_defined_strategy.a_sync_configs = a_sync_configs
- return
- from paddle.fluid.incubate.fleet.parameter_server.ir.vars_metatools import dtype_to_size
free = get_sys_free_mem()
- param_grad_pairs = compiled_config.origin_sparse_pairs + compiled_config.origin_dense_pairs
+ from paddle.fluid.incubate.fleet.parameter_server.ir import vars_metatools
processed_var_names = set(["@EMPTY@"])
param_memory_size = 0
- for param_grad_pair in param_grad_pairs:
+ for varname in program.global_block().vars:
- param, grad = param_grad_pair
+ var = program.global_block().vars[varname]
+ if not var.persistable or var.desc.type(
+ ) != core.VarDesc.VarType.LOD_TENSOR:
+ continue
+ param = vars_metatools.create_var_struct(var)
param_memory_size += param.m_size
- processed_var_names.add(param.name)
+ processed_var_names.add(varname)
upper_mem_use = param_memory_size * 5.0
program_tmp_vars = dict()
- batch_size = 1024
+ eval_batch_size = 1024
for op in program.global_block().ops:
for var_name in op.output_arg_names:
if var_name in processed_var_names:
@@ -215,23 +205,21 @@ class ParameterServerOptimizer(MetaOptimizerBase):
data_count *= (-x)
else:
data_count *= x
- program_tmp_vars[var_name] = (data_count, neg_dim_count,
- dtype_to_size[var.dtype])
+ program_tmp_vars[var_name] = (
+ data_count, neg_dim_count,
+ vars_metatools.dtype_to_size[var.dtype])
for varname in program_tmp_vars:
data_count, neg_dim_count, type_size = program_tmp_vars[varname]
if neg_dim_count == 1:
- data_count *= batch_size
+ data_count *= eval_batch_size
var_memory = data_count * type_size
upper_mem_use += var_memory
if upper_mem_use < free:
- # auto geo
+ return True
- a_sync_configs["k_steps"] = 800
else:
- # auto async
+ return False
- a_sync_configs["k_steps"] = 0
- self.user_defined_strategy.a_sync_configs = a_sync_configs
def minimize_impl(self,
loss,
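
The geo decision above comes down to a host-memory estimate. A condensed restatement of that heuristic as a standalone helper (hypothetical name; the real logic lives in _can_apply_geo and additionally requires the inner optimizer to be SGD):

    def fits_in_host_memory(param_memory_size, tmp_var_memory, free_mem):
        # Reserve roughly 5x the dense parameter memory, plus the temporary
        # variables estimated at an evaluation batch size of 1024; geo mode
        # (k_steps = 800) is chosen only when this upper bound stays below
        # the free memory reported by the system, otherwise async (k_steps = 0).
        upper_mem_use = param_memory_size * 5.0 + tmp_var_memory
        return upper_mem_use < free_mem
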
@@ -240,6 +228,7 @@ class ParameterServerOptimizer(MetaOptimizerBase):
no_grad_set=None):
self.inner_opt.minimize(loss, startup_program, parameter_list,
no_grad_set)
+ strategy = self._get_distributed_strategy()
_origin_main_program = loss.block.program
_origin_startup_program = startup_program
@@ -247,11 +236,7 @@ class ParameterServerOptimizer(MetaOptimizerBase):
compiled_config = public.CompileTimeStrategy(_origin_main_program,
_origin_startup_program,
- None, self.role_maker)
+ strategy, self.role_maker)
- self._try_auto_apply_geo(_origin_main_program, compiled_config)
- strategy = self._get_distributed_strategy()
compiled_config.strategy = strategy
if self.role_maker.is_worker() or self.role_maker._is_heter_worker():
@@ -267,9 +252,24 @@ class ParameterServerOptimizer(MetaOptimizerBase):
return None, None
def _disable_strategy(self, dist_strategy):
- dist_strategy.a_sync_configs = {}
+ dist_strategy.a_sync = False
- self.user_defined_strategy.a_sync_configs = {}
+ a_sync_configs = dist_strategy.a_sync_configs
+ a_sync_configs["k_steps"] = -1
+ dist_strategy.a_sync_configs = a_sync_configs
+ def _enable_strategy(self, dist_strategy, context):
+ a_sync_configs = dist_strategy.a_sync_configs
+ if a_sync_configs["k_steps"] >= 0:
+ return
- def _enable_strategy(self, dist_strategy):
dist_strategy.a_sync = True
- dist_strategy.a_sync_configs = {}
+ a_sync_configs = dist_strategy.a_sync_configs
+ is_geo = self._can_apply_geo(dist_strategy,
+ context["origin_main_program"])
+ if is_geo:
+ a_sync_configs["k_steps"] = 800
+ else:
+ a_sync_configs["k_steps"] = 0
+ dist_strategy.a_sync_configs = a_sync_configs
@@ -103,6 +103,9 @@ class PipelineOptimizer(MetaOptimizerBase):
self.wrapped_opt = PO(self.inner_opt, num_microbatches=num_microbatches)
def _can_apply(self):
+ if not self.role_maker._is_collective:
+ return False
if self.user_defined_strategy.pipeline == True:
return True
return False
@@ -111,7 +114,7 @@ class PipelineOptimizer(MetaOptimizerBase):
dist_strategy.pipeline = False
dist_strategy.pipeline_configs = {}
- def _enable_strategy(self, dist_strategy):
+ def _enable_strategy(self, dist_strategy, context):
# we do not support enable pipeline automatically right now
return
@@ -180,7 +183,7 @@ class PipelineOptimizer(MetaOptimizerBase):
grad = None
for idx, op in reversed(list(enumerate(block.ops))):
if is_backward_op(op) and \
OP_ROLE_VAR_KEY in op.attr_names:
op_role_var = op.all_attrs()[OP_ROLE_VAR_KEY]
if len(op_role_var) == 0:
continue
...
@@ -38,6 +38,9 @@ class RecomputeOptimizer(MetaOptimizerBase):
list(user_defined_strategy.recompute_configs["checkpoints"]))
def _can_apply(self):
+ if self.role_maker._is_collective:
+ return False
if self.user_defined_strategy.recompute == True:
if len(self.user_defined_strategy.recompute_configs[
"checkpoints"]) == 0:
@@ -49,7 +52,7 @@ class RecomputeOptimizer(MetaOptimizerBase):
dist_strategy.recompute = False
dist_strategy.recompute_configs = {}
- def _enable_strategy(self, dist_strategy):
+ def _enable_strategy(self, dist_strategy, context):
# we do not support automatically recompute checkpoints currently
return
...
@@ -441,8 +441,6 @@ if(WITH_DISTRIBUTE)
# FIXME(seiriosX) will fix this
list(REMOVE_ITEM DIST_TEST_OPS "test_dist_fleet_sparse_embedding_ctr")
list(REMOVE_ITEM DIST_TEST_OPS "test_dist_fleet_gloo")
- list(REMOVE_ITEM DIST_TEST_OPS "test_dist_fleet_a_sync_optimizer_auto")
- list(REMOVE_ITEM DIST_TEST_OPS "test_dist_fleet_ctr")
py_test_modules(test_recv_save_op MODULES test_recv_save_op ENVS ${dist_ENVS})
py_test_modules(test_transpiler_ops MODULES test_transpiler_ops ENVS ${dist_ENVS})
...
@@ -62,82 +62,6 @@ class TestFleetGradientMergeMetaOptimizer(unittest.TestCase):
a_sync_configs = optimizer.user_defined_strategy.a_sync_configs
self.assertTrue(a_sync_configs['k_steps'] == 0)
def test_a_sync_optimizer2(self):
os.environ["TRAINING_ROLE"] = "TRAINER"
import paddle.distributed.fleet as fleet
main_program = paddle.fluid.Program()
startup_program = paddle.fluid.Program()
paddle.fluid.framework.switch_main_program(main_program)
paddle.fluid.framework.switch_startup_program(startup_program)
fleet.init(role_maker.PaddleCloudRoleMaker())
input_x = paddle.fluid.layers.data(
name="x", shape=[32], dtype='float32')
input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')
fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax')
cost = paddle.fluid.layers.cross_entropy(
input=prediction, label=input_y)
avg_cost = paddle.fluid.layers.mean(x=cost)
strategy = paddle.distributed.fleet.DistributedStrategy()
strategy.auto = True
optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
optimizer.minimize(avg_cost)
self.assertTrue(optimizer.user_defined_strategy.a_sync)
a_sync_configs = optimizer.user_defined_strategy.a_sync_configs
self.assertTrue(a_sync_configs['k_steps'] == 800)
def test_a_sync_optimizer3(self):
os.environ["TRAINING_ROLE"] = "TRAINER"
import paddle.distributed.fleet as fleet
main_program = paddle.fluid.Program()
startup_program = paddle.fluid.Program()
paddle.fluid.framework.switch_main_program(main_program)
paddle.fluid.framework.switch_startup_program(startup_program)
fleet.init(role_maker.PaddleCloudRoleMaker())
input_x = paddle.fluid.layers.data(
name="x",
shape=[-1, 1],
dtype="int64",
lod_level=1,
append_batch_size=False)
x_embedding = paddle.fluid.layers.embedding(
is_distributed=False,
input=input_x,
size=[1000000000, 100000],
param_attr=paddle.fluid.ParamAttr(
name="embedding",
initializer=paddle.fluid.initializer.Constant(value=0.01)),
is_sparse=True)
input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')
fc_1 = paddle.fluid.layers.fc(input=x_embedding, size=64, act='tanh')
fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax')
cost = paddle.fluid.layers.cross_entropy(
input=prediction, label=input_y)
avg_cost = paddle.fluid.layers.mean(x=cost)
strategy = paddle.distributed.fleet.DistributedStrategy()
strategy.auto = True
optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
optimizer.minimize(avg_cost)
self.assertTrue(optimizer.user_defined_strategy.a_sync)
a_sync_configs = optimizer.user_defined_strategy.a_sync_configs
self.assertTrue(a_sync_configs['k_steps'] == 0)
if __name__ == "__main__":
unittest.main()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import paddle
import os
import paddle.distributed.fleet.base.role_maker as role_maker
import time
class TestFleetGradientMergeMetaOptimizer(unittest.TestCase):
def setUp(self):
os.environ["PADDLE_PSERVER_NUMS"] = "2"
os.environ["PADDLE_TRAINERS_NUM"] = "2"
os.environ["POD_IP"] = "127.0.0.1"
os.environ["PADDLE_PORT"] = "36001"
os.environ["PADDLE_TRAINER_ID"] = "0"
os.environ["PADDLE_TRAINERS_NUM"] = "2"
os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = \
"127.0.0.1:36001,127.0.0.2:36001"
def test_a_sync_optimizer3(self):
os.environ["TRAINING_ROLE"] = "TRAINER"
import paddle.distributed.fleet as fleet
main_program = paddle.fluid.Program()
startup_program = paddle.fluid.Program()
paddle.fluid.framework.switch_main_program(main_program)
paddle.fluid.framework.switch_startup_program(startup_program)
fleet.init(role_maker.PaddleCloudRoleMaker())
input_x = paddle.fluid.layers.data(
name="x",
shape=[-1, 1],
dtype="int64",
lod_level=1,
append_batch_size=False)
x_embedding = paddle.fluid.layers.embedding(
is_distributed=False,
input=input_x,
size=[1000000000, 100000],
param_attr=paddle.fluid.ParamAttr(
name="embedding",
initializer=paddle.fluid.initializer.Constant(value=0.01)),
is_sparse=True)
input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')
fc_1 = paddle.fluid.layers.fc(input=x_embedding, size=64, act='tanh')
fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax')
cost = paddle.fluid.layers.cross_entropy(
input=prediction, label=input_y)
avg_cost = paddle.fluid.layers.mean(x=cost)
strategy = paddle.distributed.fleet.DistributedStrategy()
strategy.auto = True
optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
optimizer.minimize(avg_cost)
self.assertTrue(optimizer.user_defined_strategy.a_sync)
a_sync_configs = optimizer.user_defined_strategy.a_sync_configs
self.assertTrue(a_sync_configs['k_steps'] == 0)
if __name__ == "__main__":
unittest.main()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import paddle
import os
import paddle.distributed.fleet.base.role_maker as role_maker
import time
class TestFleetGradientMergeMetaOptimizer(unittest.TestCase):
def setUp(self):
os.environ["PADDLE_PSERVER_NUMS"] = "2"
os.environ["PADDLE_TRAINERS_NUM"] = "2"
os.environ["POD_IP"] = "127.0.0.1"
os.environ["PADDLE_PORT"] = "36001"
os.environ["PADDLE_TRAINER_ID"] = "0"
os.environ["PADDLE_TRAINERS_NUM"] = "2"
os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = \
"127.0.0.1:36001,127.0.0.2:36001"
def test_a_sync_optimizer2(self):
os.environ["TRAINING_ROLE"] = "TRAINER"
import paddle.distributed.fleet as fleet
main_program = paddle.fluid.Program()
startup_program = paddle.fluid.Program()
paddle.fluid.framework.switch_main_program(main_program)
paddle.fluid.framework.switch_startup_program(startup_program)
fleet.init(role_maker.PaddleCloudRoleMaker())
input_x = paddle.fluid.layers.data(
name="x", shape=[32], dtype='float32')
input_y = paddle.fluid.layers.data(name="y", shape=[1], dtype='int64')
fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh')
fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh')
prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax')
cost = paddle.fluid.layers.cross_entropy(
input=prediction, label=input_y)
avg_cost = paddle.fluid.layers.mean(x=cost)
strategy = paddle.distributed.fleet.DistributedStrategy()
strategy.auto = True
optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
optimizer.minimize(avg_cost)
self.assertTrue(optimizer.user_defined_strategy.a_sync)
a_sync_configs = optimizer.user_defined_strategy.a_sync_configs
self.assertTrue(a_sync_configs['k_steps'] == 800)
if __name__ == "__main__":
unittest.main()