Unverified commit 4d7af372, authored by WangXi, committed by GitHub

[hybrid] pp+dp support fp16 allreduce (#34762)

Parent 3f962e77
No related merge requests
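Context for the change: with pipeline parallelism (pp) plus data parallelism (dp), this commit lets per-micro-batch gradients be accumulated and allreduced in fp16, deferring the cast back to fp32 until just before the optimizer ops. A minimal standalone sketch of that ordering, using plain NumPy and made-up shapes rather than Paddle's ops:

```python
# Illustrative sketch (not Paddle code): accumulate micro-batch gradients in
# fp16, "allreduce" them while still fp16, and only cast to fp32 right before
# the optimizer update. Names and shapes are invented for the example.
import numpy as np

def accumulate_fp16(micro_grads_fp16):
    """Sum per-micro-batch fp16 gradients into one fp16 merged buffer
    (the @GRAD@MERGED@FP16 variable in the real pass)."""
    merged = np.zeros_like(micro_grads_fp16[0], dtype=np.float16)
    for g in micro_grads_fp16:
        merged += g
    return merged

def allreduce_sum(buffers):
    """Stand-in for c_allreduce_sum across data-parallel ranks:
    every rank ends up with the same summed fp16 buffer."""
    total = np.sum(np.stack(buffers), axis=0, dtype=np.float16)
    return [total.copy() for _ in buffers]

rng = np.random.default_rng(0)
# two dp ranks, four accumulate steps each
rank_grads = [[rng.standard_normal(8).astype(np.float16) for _ in range(4)]
              for _ in range(2)]
merged_fp16 = [accumulate_fp16(gs) for gs in rank_grads]   # fp16 accumulation
reduced_fp16 = allreduce_sum(merged_fp16)                  # fp16 allreduce
grads_fp32 = [g.astype(np.float32) for g in reduced_fp16]  # cast before optimizer
print(grads_fp32[0].dtype)  # float32
```

The real pass does the same thing at the program level: accumulate into `@GRAD@MERGED@FP16` buffers, `c_allreduce_sum` them on the dp ring, then insert cast ops in the optimize phase.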
......@@ -14,7 +14,7 @@
import paddle
from paddle.fluid import core, unique_name
from functools import reduce
from paddle.distributed.fleet.meta_optimizers.common import is_loss_grad_op
from paddle.distributed.fleet.meta_optimizers.common import is_loss_grad_op, is_backward_op
from paddle.distributed.fleet.meta_optimizers.common import OpRole, OP_ROLE_KEY, OP_ROLE_VAR_KEY
import re
......@@ -431,15 +431,19 @@ def insert_reduce_ops(block,
reduce_vars,
shard,
op_role=OpRole.Backward,
use_calc_stream=False):
use_calc_stream=False,
rank=None):
"""
_add_allreduce_ops
"""
grad_in_this_device = []
for var in reduce_vars:
root_id = get_grad_device(var, shard)
assert root_id >= 0, "root id should be a positive int, but now root id is {}".format(
root_id)
if rank is not None and rank == root_id:
grad_in_this_device.append(var)
block._insert_op_without_sync(
insert_idx,
type='c_reduce_sum',
......@@ -451,16 +455,23 @@ def insert_reduce_ops(block,
'use_calc_stream': use_calc_stream,
OP_ROLE_KEY: op_role
})
return
return grad_in_this_device
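The new rank argument makes insert_reduce_ops report back which gradients are rooted on this device, so the caller can later allreduce only those across the dp group. Roughly, the bookkeeping is a filter over the reduce list; the grad-to-device mapping below is invented for illustration:

```python
# Rough equivalent of the new bookkeeping: insert_reduce_ops now also returns
# the gradients whose reduce root equals this rank. The mapping is hypothetical.
grad_to_device = {'fc_0.w_0@GRAD': 0, 'fc_0.b_0@GRAD': 1, 'fc_1.w_0@GRAD': 0}

def grads_in_this_device(reduce_vars, rank):
    return [g for g in reduce_vars if grad_to_device[g] == rank]

print(grads_in_this_device(grad_to_device, rank=0))
# ['fc_0.w_0@GRAD', 'fc_1.w_0@GRAD']
```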
def get_grad_device(grad_name, shard):
assert "@GRAD" in grad_name, "[{}] should be a grad variable.".format(
grad_name)
base_name = None
# mind the traversal order
# NOTE: mind the traversal order
possible_suffixes = [
'.cast_fp16@GRAD@MERGED', '.cast_fp16@GRAD', '@GRAD@MERGED', '@GRAD'
# sharding gm
'.cast_fp16@GRAD@MERGED',
'.cast_fp16@GRAD',
# pipeline
'@GRAD@MERGED@FP16',
'@GRAD@MERGED',
'@GRAD',
]
for suffix in possible_suffixes:
if suffix in grad_name:
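The traversal order in possible_suffixes matters because the longest, most specific suffix has to be stripped first; matching plain '@GRAD' too early would leave '@MERGED@FP16' glued to the base name and the shard lookup would fail. A quick illustration with made-up variable names (re.escape is added here for safety; the real helper passes the suffix to re.sub directly):

```python
import re

possible_suffixes = [
    '.cast_fp16@GRAD@MERGED', '.cast_fp16@GRAD',
    '@GRAD@MERGED@FP16', '@GRAD@MERGED', '@GRAD',
]

def base_param_name(grad_name):
    # strip the first (most specific) suffix that appears in the grad name
    for suffix in possible_suffixes:
        if suffix in grad_name:
            return re.sub(re.escape(suffix), '', grad_name)
    return grad_name

print(base_param_name('fc_0.w_0@GRAD@MERGED@FP16'))       # fc_0.w_0
print(base_param_name('fc_1.w_0.cast_fp16@GRAD@MERGED'))  # fc_1.w_0
# checking '@GRAD' first would have left 'fc_0.w_0@MERGED@FP16' behind
```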
......@@ -487,6 +498,15 @@ def get_first_check_finite_and_unscale_op_idx(block, raise_error=True):
return -1
def get_first_optimize_op_idx(block):
first_opt_op_idx = None
for index, op in reversed(tuple(enumerate(block.ops))):
if is_backward_op(op) and first_opt_op_idx is None:
first_opt_op_idx = index + 1
break
return first_opt_op_idx
def insert_broadcast_ops(block, insert_idx, ring_id, broadcast2root):
"""
_add_broadcast_ops
......@@ -672,23 +692,6 @@ def save_persistables(exe, dirname, main_program, filename=None):
return
def get_grad_device(grad_name, shard):
assert "@GRAD" in grad_name, "[{}] should be a grad variable.".format(
grad_name)
base_name = None
# mind the traversal order
possible_suffixes = ['.cast_fp16@GRAD', '@GRAD']
for suffix in possible_suffixes:
if suffix in grad_name:
base_name = re.sub(suffix, '', grad_name)
break
assert base_name in shard.global_param2device, "[{}] should be a param variable.".format(
base_name)
return shard.global_param2device[base_name]
def append_naive_sync(block, sync_var, ring_id):
# NOTE (JZ-LIANG) update this to use barrier sync for more elegent logic
# sync within global
......
......@@ -294,6 +294,8 @@ class ShardingOptimizer(MetaOptimizerBase):
if self.pp_degree == 1: return
strategy = self.user_defined_strategy
fp16_allreduce = strategy.fp16_allreduce
main_block = self._main_program.global_block()
startup_block = self._startup_program.global_block()
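For reference, fp16_allreduce is a DistributedStrategy switch and is enabled from user code the same way the new unit tests below do; a trimmed-down configuration sketch (network setup and fleet.init omitted):

```python
# Enabling the new flag, mirroring the unit tests in this commit:
# pp_degree=2 / dp_degree=2 gives the pp+dp hybrid, and fp16_allreduce turns
# on the fp16 gradient accumulation + allreduce added here.
import paddle.distributed.fleet as fleet

strategy = fleet.DistributedStrategy()
strategy.amp = True
strategy.sharding = True
strategy.sharding_configs = {
    "sharding_degree": 1, "mp_degree": 1, "pp_degree": 2, "dp_degree": 2,
}
strategy.pipeline = True
strategy.pipeline_configs = {
    "schedule_mode": "1F1B", "micro_batch_size": 2, "accumulate_steps": 4,
}
strategy.fp16_allreduce = True
```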
......@@ -317,33 +319,44 @@ class ShardingOptimizer(MetaOptimizerBase):
main_block._remove_op(idx)
accumulated_grad_names = self._pp_optimizer._accumulate_gradients(
main_block)
# accumulated_grad_names = sorted(accumulated_grad_names)
main_block, fp16_allreduce=fp16_allreduce)
len_of_ops = len(main_block.ops)
first_optimize_op_index = get_first_optimize_op_idx(main_block)
if self.pp_allreduce_in_optimize:
print("persistable FP32 grad: ")
print(accumulated_grad_names)
first_optimize_op_index = get_first_check_finite_and_unscale_op_idx(
main_block, raise_error=strategy.amp)
insert_reduce_ops(
logger.info("Pipeline Persistable grad is {}".format(
accumulated_grad_names))
# FIXME(wangxi): accumulated_grad get from pipeline is not
# include sharding's param@BroadCast grad when
# pp_allreduce_in_optimize
accumulated_grad_names = insert_reduce_ops(
main_block,
first_optimize_op_index,
self.sharding_ring_id,
accumulated_grad_names,
self._shard,
core.op_proto_and_checker_maker.OpRole.Optimize,
use_calc_stream=True)
use_calc_stream=True,
rank=self.sharding_rank)
logger.info("PP-Sharding grad is {}".format(accumulated_grad_names))
first_optimize_op_index += (len(main_block.ops) - len_of_ops)
len_of_ops = len(main_block.ops)
if self.hybrid_dp and self.hybrid_dp_mode == "pp_hybrid_dp":
first_optimize_op_index = get_first_check_finite_and_unscale_op_idx(
main_block, raise_error=strategy.amp)
if first_optimize_op_index >= 0:
insert_allreduce_ops(
main_block,
first_optimize_op_index,
self.dp_ring_id,
accumulated_grad_names,
core.op_proto_and_checker_maker.OpRole.Optimize,
use_calc_stream=True,
user_defined_strategy=strategy)
insert_allreduce_ops(
main_block,
first_optimize_op_index,
self.dp_ring_id,
accumulated_grad_names,
core.op_proto_and_checker_maker.OpRole.Optimize,
use_calc_stream=True,
user_defined_strategy=strategy)
first_optimize_op_index += (len(main_block.ops) - len_of_ops)
len_of_ops = len(main_block.ops)
# FIXME(wangxi): if fp16_allreduce, put cast fp16->fp32 to there?
def _adapt_amp_clip_without_sharding(self):
if self.sharding_degree > 1: return
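Both insertion passes above end with `first_optimize_op_index += (len(main_block.ops) - len_of_ops)`; since the reduce/allreduce ops are inserted in place, every later op index shifts by the number of inserted ops. A toy version of that bookkeeping with a plain list (op names are only illustrative):

```python
# Why the pass keeps re-computing the index: in-place insertion shifts
# everything after the insertion point.
ops = ['mul_grad', 'sum', 'check_finite_and_unscale', 'momentum']
insert_at = 2                      # first optimize-phase op
old_len = len(ops)
ops[insert_at:insert_at] = ['c_reduce_sum', 'c_allreduce_sum']  # inserted comm ops
insert_at += len(ops) - old_len    # re-point at check_finite_and_unscale
assert ops[insert_at] == 'check_finite_and_unscale'
```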
......
......@@ -4528,7 +4528,7 @@ class PipelineOptimizer(object):
op._rename_input(old_name, new_name)
op._rename_output(old_name, new_name)
def _create_var(self, block, ref_var, name):
def _create_var(self, block, ref_var, name, dtype=None):
"""
Create a new var for block, which has the same type,
shape and dtype as ref_var, then rename it with the
......@@ -4537,7 +4537,7 @@ class PipelineOptimizer(object):
new_var = block.create_var(
name=name,
shape=ref_var.shape,
dtype=ref_var.dtype,
dtype=ref_var.dtype if dtype is None else dtype,
type=ref_var.type,
lod_level=ref_var.lod_level,
persistable=ref_var.persistable,
......@@ -5044,7 +5044,10 @@ class PipelineOptimizer(object):
new_grad_name = name + "@MERGED"
self._rename_arg(op, name, new_grad_name)
def _accumulate_gradients(self, block, pp_allreduce_in_optimize=False):
def _accumulate_gradients(self,
block,
pp_allreduce_in_optimize=False,
fp16_allreduce=False):
"""
Create a new merged gradient for each parameter and accumulate the
corresponding gradient to it.
......@@ -5052,6 +5055,9 @@ class PipelineOptimizer(object):
merged_gradient_names = []
first_opt_op_idx = None
merged_suffix = '@MERGED@FP16' if fp16_allreduce else '@MERGED'
dtype = paddle.float16 if fp16_allreduce else None
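With fp16_allreduce on, the merged gradient buffer gets both a distinct '@MERGED@FP16' suffix and an fp16 dtype; the later cast-insertion loop recovers the fp32 name by dropping '@FP16'. A small sketch of that naming convention (the parameter name is hypothetical):

```python
# Naming used by the accumulation pass (illustrative): '@MERGED@FP16' when
# accumulating in fp16, plain '@MERGED' otherwise; the fp16 name maps back to
# the fp32 one by removing '@FP16'.
def merged_grad_name(param_name, fp16_allreduce):
    suffix = '@MERGED@FP16' if fp16_allreduce else '@MERGED'
    return param_name + '@GRAD' + suffix

fp16_name = merged_grad_name('fc_6.b_0', fp16_allreduce=True)
print(fp16_name)                                    # fc_6.b_0@GRAD@MERGED@FP16
print(fp16_name.replace('@FP16', ''))               # fc_6.b_0@GRAD@MERGED
print(fp16_name.replace('@GRAD@MERGED@FP16', ''))   # fc_6.b_0
```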
for index, op in reversed(tuple(enumerate(list(block.ops)))):
# remove the cast op of fp16 grad to fp32 grad
if self._is_optimize_op(op) and op.type == 'cast':
......@@ -5062,12 +5068,10 @@ class PipelineOptimizer(object):
block._remove_op(index)
continue
if self._is_backward_op(op) and not first_opt_op_idx:
if self._is_backward_op(op) and first_opt_op_idx is None:
first_opt_op_idx = index + 1
# no optimize phase
if first_opt_op_idx == len(block.ops): return
if block.ops[first_opt_op_idx].type == "c_sync_comm_stream":
first_opt_op_idx += 1
if self._is_backward_op(op) and (
self._op_role_var_key in op.attr_names):
......@@ -5079,12 +5083,14 @@ class PipelineOptimizer(object):
param_name = op_role_var[i]
if not block.has_var(param_name): continue
if '@BroadCast' in param_name: continue
param_grad_name = param_name + core.grad_var_suffix()
merged_param_grad_name = param_grad_name + '@MERGED'
merged_param_grad_name = param_grad_name + merged_suffix
if not block.has_var(merged_param_grad_name):
self._create_var(block, block.vars[param_name],
merged_param_grad_name)
merged_param_grad_name, dtype)
assert block.has_var(merged_param_grad_name)
param_grad_var = block.var(param_grad_name)
merged_param_grad_var = block.var(merged_param_grad_name)
merged_param_grad_var.persistable = True
......@@ -5103,22 +5109,18 @@ class PipelineOptimizer(object):
offset += 1
grad_name = op_role_var[i + 1]
grad_var = block.vars[grad_name]
if not 'cast_fp16' in grad_name:
block._insert_op(
index=first_opt_op_idx + offset,
type='sum',
inputs={'X': [grad_var, merged_param_grad_var]},
outputs={'Out': merged_param_grad_var},
attrs={
self._op_role_key: self._op_role.Backward,
})
offset += 1
merged_gradient_names.append(merged_param_grad_name)
else:
# cast gradient to fp32 to accumulate to merged gradient
is_fp16_grad = 'cast_fp16' in grad_name
need_cast = (is_fp16_grad is not fp16_allreduce)
if need_cast:
# if fp16_allreduce:
# cast grad to fp16 to accumulate to merged gradient
# else:
# cast grad to fp32 to accumulate to merged gradient
cast_grad_var_name = param_grad_name + '@TMP'
cast_grad_var = self._create_var(block, param_grad_var,
cast_grad_var_name)
cast_grad_var = self._create_var(
block, param_grad_var, cast_grad_var_name, dtype)
cast_grad_var.persistable = False
block._insert_op(
index=first_opt_op_idx + offset,
......@@ -5131,18 +5133,52 @@ class PipelineOptimizer(object):
self._op_role_key: self._op_role.Backward,
})
offset += 1
block._insert_op(
index=first_opt_op_idx + offset,
type='sum',
inputs={
'X': [merged_param_grad_var, cast_grad_var]
},
outputs={'Out': merged_param_grad_var},
attrs={
self._op_role_key: self._op_role.Backward,
})
offset += 1
merged_gradient_names.append(merged_param_grad_name)
grad_var = cast_grad_var
block._insert_op(
index=first_opt_op_idx + offset,
type='sum',
inputs={'X': [merged_param_grad_var, grad_var]},
outputs={'Out': merged_param_grad_var},
attrs={self._op_role_key: self._op_role.Backward, })
offset += 1
merged_gradient_names.append(merged_param_grad_name)
if not fp16_allreduce: return merged_gradient_names
first_opt_op_idx = None
for index, op in reversed(tuple(enumerate(list(block.ops)))):
if self._is_backward_op(op) and first_opt_op_idx is None:
first_opt_op_idx = index + 1
break
assert first_opt_op_idx is not None
# insert cast op from fp16->fp32
# FIXME(wangxi): maybe put in sharding is better, for some grad
# is not in sharding device.
for fp16_grad_name in merged_gradient_names:
grad_name = fp16_grad_name.replace('@FP16', '')
param_name = fp16_grad_name.replace('@GRAD@MERGED@FP16', '')
if not block.has_var(grad_name):
self._create_var(block, block.vars[param_name], grad_name)
assert block.has_var(grad_name)
fp16_grad_var = block.var(fp16_grad_name)
grad_var = block.var(grad_name)
grad_var.persistable = False
block._insert_op(
index=first_opt_op_idx,
type='cast',
inputs={'X': fp16_grad_var},
outputs={'Out': grad_var},
attrs={
'in_dtype': fp16_grad_var.dtype,
'out_dtype': grad_var.dtype,
self._op_role_key: self._op_role.Optimize,
})
return merged_gradient_names
def _add_sub_blocks(self, main_block, program_list):
......
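The rewritten accumulation loop only inserts a cast when the incoming gradient's dtype disagrees with the accumulation dtype, and the fp16 merged buffers are cast back to fp32 in a single pass right before the optimize ops. A small check of the need_cast rule, with illustrative variable names:

```python
# Sketch of the cast decision in _accumulate_gradients: cast only when the
# gradient's dtype differs from the accumulation dtype. With fp16_allreduce
# the merged buffer is fp16, so fp32 grads get cast down; without it the
# buffer is fp32, so fp16 grads get cast up.
def needs_cast(grad_name, fp16_allreduce):
    is_fp16_grad = 'cast_fp16' in grad_name
    return is_fp16_grad is not fp16_allreduce

assert needs_cast('fc_0.w_0.cast_fp16@GRAD', fp16_allreduce=False)      # fp16 -> fp32
assert not needs_cast('fc_0.w_0.cast_fp16@GRAD', fp16_allreduce=True)   # already fp16
assert needs_cast('fc_6.b_0@GRAD', fp16_allreduce=True)                 # fp32 -> fp16
```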
......@@ -552,9 +552,9 @@ class TestFleetShardingHybridOptimizer(TestFleetMetaOptimizer):
'c_sync_calc_stream', 'c_reduce_sum', 'c_reduce_sum',
'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum',
'c_reduce_sum', 'c_reduce_sum', 'c_sync_comm_stream',
'c_sync_comm_stream', 'fill_constant', 'sum', 'fill_constant',
'fill_constant', 'sum', 'fill_constant', 'sum', 'fill_constant',
'sum', 'fill_constant', 'sum', 'fill_constant', 'sum',
'fill_constant', 'sum', 'momentum', 'momentum', 'momentum',
'c_sync_comm_stream', 'momentum', 'momentum', 'momentum',
'momentum', 'momentum'
])
......@@ -694,6 +694,171 @@ class TestFleetShardingHybridOptimizer(TestFleetMetaOptimizer):
self.assertEqual(pp_group_waiting_ports, ['127.0.0.1:36002'])
def test_hybrid_with_pp_dp_amp_fp16allreduce(self):
train_prog, startup_prog = paddle.fluid.Program(), paddle.fluid.Program(
)
avg_cost, strategy = self.pp_net(train_prog, startup_prog)
strategy.amp = True
strategy.amp_configs = {'custom_black_varnames': ['fc_6.b_0'], }
strategy.sharding = True
strategy.sharding_configs = {
"sharding_degree": 1,
"mp_degree": 1,
"pp_degree": 2,
"dp_degree": 2,
}
strategy.pipeline = True
strategy.pipeline_configs = {
"schedule_mode": "1F1B",
"micro_batch_size": 2,
"accumulate_steps": 4,
}
strategy.fp16_allreduce = True
self.optimizer(avg_cost, strategy, train_prog, startup_prog)
train_prog = train_prog._pipeline_opt['section_program']
startup_prog = startup_prog._pipeline_opt['startup_program']
startup_prog_ops = startup_prog.global_block().ops
main_prog_ops = train_prog.global_block().ops
# check program
startup_prog_op_types = [op.type for op in startup_prog_ops]
main_prog_op_types = [op.type for op in main_prog_ops]
# ring: mp, pp_group, pp_pair, pp_pair
self.assertEqual(startup_prog_op_types, [
'uniform_random', 'fill_constant', 'uniform_random',
'fill_constant', 'uniform_random', 'fill_constant',
'uniform_random', 'fill_constant', 'fill_constant', 'fill_constant',
'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant',
'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant',
'fill_constant', 'fill_constant', 'c_gen_nccl_id', 'c_comm_init',
'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init',
'c_gen_nccl_id', 'c_comm_init', 'c_sync_comm_stream'
])
self.assertEqual(main_prog_op_types, [
'recv_v2', 'cast', 'mul', 'cast', 'elementwise_add', 'tanh', 'cast',
'mul', 'cast', 'elementwise_add', 'tanh', 'cast', 'mul', 'cast',
'elementwise_add', 'tanh', 'cast', 'mul', 'cast', 'elementwise_add',
'softmax', 'cross_entropy2', 'mean', 'elementwise_mul',
'fill_constant', 'scale', 'scale', 'elementwise_mul_grad',
'mean_grad', 'cross_entropy_grad2', 'softmax_grad',
'elementwise_add_grad', 'cast', 'mul_grad', 'tanh_grad',
'elementwise_add_grad', 'mul_grad', 'tanh_grad',
'elementwise_add_grad', 'mul_grad', 'tanh_grad',
'elementwise_add_grad', 'mul_grad', 'c_sync_calc_stream', 'send_v2',
'fill_constant', 'cast', 'sum', 'fill_constant', 'sum',
'fill_constant', 'sum', 'fill_constant', 'sum', 'fill_constant',
'sum', 'fill_constant', 'sum', 'fill_constant', 'sum',
'fill_constant', 'sum', 'coalesce_tensor', 'c_allreduce_sum',
'cast', 'cast', 'cast', 'cast', 'cast', 'cast', 'cast', 'cast',
'c_sync_comm_stream', 'check_finite_and_unscale', 'cast',
'c_allreduce_max', 'cast', 'update_loss_scaling', 'momentum',
'momentum', 'momentum', 'momentum', 'momentum', 'momentum',
'momentum', 'momentum'
])
# amp check_finite_and_unscale, allreduce(pp)
self.assertEqual(main_prog_op_types.count('c_allreduce_max'), 1)
# should has ring id for pp
created_ring_ids = [
op.desc.attr("ring_id") for op in startup_prog_ops
if op.type == "c_comm_init"
]
self.assertIn(self.pp_pair_ring_id, created_ring_ids)
self.assertIn(self.dp_ring_id, created_ring_ids)
# check correctness of pp group
for op in startup_prog_ops:
if op.type == "c_gen_nccl_id" and op.desc.output_arg_names()[
0] == "comm_id_0":
pp_group_waiting_ports = op.desc.attr("other_endpoints")
self.assertEqual(pp_group_waiting_ports, ['127.0.0.1:36003'])
# check correctness of dp group
for op in startup_prog_ops:
if op.type == "c_gen_nccl_id" and op.desc.output_arg_names()[
0] == "comm_id_3":
dp_group_waiting_ports = op.desc.attr("other_endpoints")
self.assertEqual(dp_group_waiting_ports, ['127.0.0.1:36002'])
def test_hybrid_with_sharding_pp_amp_fp16allreduce_in_optimize(self):
train_prog, startup_prog = paddle.fluid.Program(), paddle.fluid.Program(
)
avg_cost, strategy = self.pp_net(train_prog, startup_prog)
strategy.amp = True
strategy.amp_configs = {'custom_black_varnames': ['fc_6.b_0'], }
strategy.sharding = True
strategy.sharding_configs = {
"segment_broadcast_MB": 0.1,
"sharding_degree": 2,
"mp_degree": 1,
"pp_degree": 2,
"dp_degree": 1,
'pp_allreduce_in_optimize': True,
}
strategy.pipeline = True
strategy.pipeline_configs = {
"schedule_mode": "1F1B",
"micro_batch_size": 2,
"accumulate_steps": 4,
}
strategy.fp16_allreduce = True
self.optimizer(avg_cost, strategy, train_prog, startup_prog)
train_prog = train_prog._pipeline_opt['section_program']
startup_prog = startup_prog._pipeline_opt['startup_program']
startup_prog_ops = startup_prog.global_block().ops
main_prog_ops = train_prog.global_block().ops
# check program
startup_prog_op_types = [op.type for op in startup_prog_ops]
main_prog_op_types = [op.type for op in main_prog_ops]
# ring: sharding, pp_group, pp_pair, pp_pair
self.assertEqual(startup_prog_op_types, [
'fill_constant', 'uniform_random', 'fill_constant',
'uniform_random', 'fill_constant', 'fill_constant', 'fill_constant',
'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant',
'fill_constant', 'fill_constant', 'fill_constant', 'c_gen_nccl_id',
'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id',
'c_comm_init', 'c_gen_nccl_id', 'c_comm_init'
])
# FIXME(wangxi): some bug in sharding+pp with pp_allreduce_in_optimize
# self.assertEqual(main_prog_op_types, [])
# amp check_finite_and_unscale, allreduce(pp)
self.assertEqual(main_prog_op_types.count('c_allreduce_max'), 2)
# should has ring id for pp
created_ring_ids = [
op.desc.attr("ring_id") for op in startup_prog_ops
if op.type == "c_comm_init"
]
self.assertIn(self.sharding_ring_id, created_ring_ids)
self.assertIn(self.pp_pair_ring_id, created_ring_ids)
# check correctness of sharding group
for op in startup_prog_ops:
if op.type == "c_gen_nccl_id" and op.desc.output_arg_names()[
0] == "comm_id_0":
sharding_group_waiting_ports = op.desc.attr("other_endpoints")
self.assertEqual(sharding_group_waiting_ports, ['127.0.0.1:36003'])
# check correctness of pp group
for op in startup_prog_ops:
if op.type == "c_gen_nccl_id" and op.desc.output_arg_names()[
0] == "comm_id_1":
pp_group_waiting_ports = op.desc.attr("other_endpoints")
self.assertEqual(pp_group_waiting_ports, ['127.0.0.1:36002'])
if __name__ == "__main__":
unittest.main()