Unverified · Commit 4d2994cb authored by Yuang Liu, committed by GitHub

Optimize fused allreduce in raw program (#34509)

Parent 6a9fac14
No related merge requests
@@ -217,9 +217,13 @@ class RawProgramOptimizer(MetaOptimizerBase):
         block = self.main_program.global_block()
         ring_id = self.global_ring_id
         param_grads = []
+        first_backward_idx = -1
         # find all grad params
-        for op in reversed(block.ops):
+        for idx, op in enumerate(block.ops):
+            if first_backward_idx == -1 and \
+                    is_backward_op(op):
+                first_backward_idx = idx
             if is_backward_op(op) and \
                     OP_ROLE_VAR_KEY in op.attr_names:
                 op_role_var = op.attr(OP_ROLE_VAR_KEY)
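The added lines above switch the scan from reversed(block.ops) to a forward enumerate, so the pass can record first_backward_idx (the index of the first backward op) while still collecting the (param, grad) pairs announced through OP_ROLE_VAR. Below is a minimal sketch of that scan in plain Python; the Op class and the role strings are hypothetical stand-ins for Paddle's block.ops, is_backward_op and op.attr(OP_ROLE_VAR_KEY).

class Op:
    def __init__(self, role, role_var=()):
        self.role = role          # 'forward' | 'backward' | 'optimize'
        self.role_var = role_var  # flat tuple: (param0, grad0, param1, grad1, ...)

def collect_param_grads(ops):
    first_backward_idx = -1
    param_grads = []
    for idx, op in enumerate(ops):
        if first_backward_idx == -1 and op.role == 'backward':
            first_backward_idx = idx  # remember where the backward section starts
        if op.role == 'backward' and op.role_var:
            # the role-var list pairs every parameter with its gradient
            for i in range(0, len(op.role_var), 2):
                param_grads.append((op.role_var[i], op.role_var[i + 1]))
    return first_backward_idx, param_grads

ops = [Op('forward'),
       Op('backward', ('w0', 'w0@GRAD')),
       Op('backward', ('w1', 'w1@GRAD'))]
print(collect_param_grads(ops))
# -> (1, [('w0', 'w0@GRAD'), ('w1', 'w1@GRAD')])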
@@ -234,70 +238,100 @@ class RawProgramOptimizer(MetaOptimizerBase):
                     grad = block.var(grad_name)
                     if param.is_distributed:
                         continue
-                    param_grads.append(grad)
+                    param_grads.append((param, grad))
+        outputs_name_to_idx = self.__get_ouputs_name_to_idx(first_backward_idx,
+                                                            block)
-        segments = []
+        # structure of grad_param_segments is
+        # [([grad0, grad1], [param0, param1]), ([grad2, grad3], [param2, param3])]
+        # each entry of the list is a tuple stores the grads segment list and
+        # the corresponding params segment list
+        grad_param_segments = []
         last_dtype = None
         # split the grad based on dtype and fused size
-        for var in param_grads:
-            if len(segments) == 0 \
-                    or len(segments[-1]) == self.fuse_grad_size_in_num \
-                    or var.dtype != last_dtype:
-                segments.append([var])
-                last_dtype = var.dtype
+        for param, grad in param_grads:
+            if len(grad_param_segments) == 0 \
+                    or len(grad_param_segments[-1][0]) == self.fuse_grad_size_in_num \
+                    or grad.dtype != last_dtype:
+                grad_param_segments.append(([grad], [param]))
+                last_dtype = grad.dtype
             else:
-                segments[-1].append(var)
+                grad_param_segments[-1][0].append(grad)
+                grad_param_segments[-1][1].append(param)
-        fused_vars = []
-        for idx, op in enumerate(block.ops):
-            if is_optimizer_op(op):
-                for segment in segments:
-                    # insert coalesce tensor
-                    tmp_var = block.create_var(
-                        name=unique_name.generate('FusedOutput_{}'.format(
-                            segment[0].name)),
-                        dtype=segment[0].dtype,
-                        persistable=True,
-                        stop_gradient=True)
-                    fused_vars.append(tmp_var)
-                    block._insert_op_without_sync(
-                        idx,
-                        type="coalesce_tensor",
-                        inputs={"Input": segment},
-                        outputs={"Output": segment,
-                                 "FusedOutput": tmp_var},
-                        attrs={
-                            "copy_data": True,
-                            "use_align": True,
-                            "dtype": segment[0].dtype,
-                            OP_ROLE_KEY: OpRole.Backward
-                        })
-                break
+        if len(grad_param_segments) == 0:
+            return
-        # insert the allreduce_sum op
-        for idx, op in enumerate(block.ops):
-            if is_optimizer_op(op):
-                for fused_var in fused_vars:
-                    block._insert_op_without_sync(
-                        idx,
-                        type='c_allreduce_sum',
-                        inputs={'X': fused_var},
-                        outputs={'Out': fused_var},
-                        attrs={
-                            'ring_id': ring_id,
-                            'use_calc_stream': self.calc_comm_same_stream,
-                            OP_ROLE_KEY: OpRole.Backward
-                        })
-                    if not self.calc_comm_same_stream:
-                        block._insert_op_without_sync(
-                            idx,
-                            type='c_sync_calc_stream',
-                            inputs={'X': fused_var},
-                            outputs={'Out': fused_var},
-                            attrs={OP_ROLE_KEY: OpRole.Backward})
-                break
+        fused_vars = [None] * len(grad_param_segments)
+        for i in range(len(grad_param_segments) - 1, -1, -1):
+            # travers the grad_param_segments in backward
+            # not to use reversed since needs the absolute index value
+            grad_segment, param_segment = grad_param_segments[i]
+            # insert coalesce tensor
+            fused_var = block.create_var(
+                name=unique_name.generate('FusedOutput_{}'.format(grad_segment[
+                    0].name)),
+                dtype=grad_segment[0].dtype,
+                persistable=False,
+                stop_gradient=True)
+            fused_vars[i] = fused_var
+            after_idx = outputs_name_to_idx[grad_segment[-1]][1]
+            block._insert_op_without_sync(
+                after_idx + 1,
+                type='c_allreduce_sum',
+                inputs={'X': fused_var},
+                outputs={'Out': fused_var},
+                attrs={
+                    'ring_id': ring_id,
+                    'use_calc_stream': self.calc_comm_same_stream,
+                    OP_ROLE_KEY: OpRole.Backward
+                })
+            if not self.calc_comm_same_stream:
+                block._insert_op_without_sync(
+                    after_idx + 1,
+                    type='c_sync_calc_stream',
+                    inputs={'X': fused_var},
+                    outputs={'Out': fused_var},
+                    attrs={OP_ROLE_KEY: OpRole.Backward})
-        if len(fused_vars) == 0:
+        # update the outputs_name_to_idx after insertion of sync/allreduce ops
+        outputs_name_to_idx = self.__get_ouputs_name_to_idx(first_backward_idx,
+                                                            block)
+        # the before_idx is not guaranteed sorted, therefore we have to find the
+        # topology to insert the coalesce ops
+        pos_for_coalesce = {}
+        for i in range(len(grad_param_segments) - 1, -1, -1):
+            # We separate the insertion of coalesce op and the insertion of sync/allreduce op,
+            # since that the coalesce op's insertion may invalidate the outputs_name_to_idx
+            grad_segment, param_segment = grad_param_segments[i]
+            before_idx = len(block.ops)
+            for grad in outputs_name_to_idx:
+                before_idx = min(before_idx, outputs_name_to_idx[grad][0])
+            pos_for_coalesce[i] = before_idx
+        # insert the coalesce op based on the sorted before_idx
+        pos_for_coalesce = sorted(
+            pos_for_coalesce.items(),
+            key=lambda kv: (kv[1], kv[0]),
+            reverse=True)
+        for i, before_idx in pos_for_coalesce:
+            grad_segment, param_segment = grad_param_segments[i]
+            fused_var = fused_vars[i]
+            block._insert_op_without_sync(
+                before_idx,
+                type="coalesce_tensor",
+                inputs={"Input": param_segment},
+                outputs={"Output": grad_segment,
+                         "FusedOutput": fused_var},
+                attrs={
+                    "copy_data": False,
+                    "use_align": True,
+                    "dtype": grad_segment[0].dtype,
+                    OP_ROLE_KEY: OpRole.Backward
+                })
+        if self.calc_comm_same_stream:
             block._sync_with_cpp()
             return
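Two pieces of the new code above deserve a closer look. The grads are first packed into grad_param_segments: a new segment is started whenever the dtype changes or the current segment already holds fuse_grad_size_in_num grads. The sketch below shows that grouping with grads modeled as hypothetical (name, dtype) tuples instead of Paddle variables.

def split_into_segments(param_grads, fuse_grad_size_in_num):
    grad_param_segments = []  # [([grad names], [param names]), ...]
    last_dtype = None
    for param, (grad_name, grad_dtype) in param_grads:
        if (not grad_param_segments
                or len(grad_param_segments[-1][0]) == fuse_grad_size_in_num
                or grad_dtype != last_dtype):
            # start a new segment on a dtype change or when the size cap is reached
            grad_param_segments.append(([grad_name], [param]))
            last_dtype = grad_dtype
        else:
            grad_param_segments[-1][0].append(grad_name)
            grad_param_segments[-1][1].append(param)
    return grad_param_segments

pairs = [('w0', ('w0@GRAD', 'fp32')), ('w1', ('w1@GRAD', 'fp32')),
         ('w2', ('w2@GRAD', 'fp16')), ('w3', ('w3@GRAD', 'fp16'))]
print(split_into_segments(pairs, fuse_grad_size_in_num=2))
# -> [(['w0@GRAD', 'w1@GRAD'], ['w0', 'w1']), (['w2@GRAD', 'w3@GRAD'], ['w2', 'w3'])]

The second piece is the insertion order of the coalesce_tensor ops: pos_for_coalesce is sorted by before_idx in reverse, so ops are inserted at the largest indices first and earlier positions that still need an insertion are never shifted. A sketch of that ordering trick on a plain list, with made-up op names in place of real operators:

def insert_coalesce_ops(ops, pos_for_coalesce):
    # sort by (before_idx, segment id) descending: inserting at the largest
    # index first keeps every smaller index valid for later insertions
    for seg_id, before_idx in sorted(pos_for_coalesce.items(),
                                     key=lambda kv: (kv[1], kv[0]),
                                     reverse=True):
        ops.insert(before_idx, 'coalesce_tensor_seg{}'.format(seg_id))
    return ops

ops = ['op0', 'op1', 'op2', 'op3']
# segment 0 must be coalesced before op1, segment 1 before op3
print(insert_coalesce_ops(ops, {0: 1, 1: 3}))
# -> ['op0', 'coalesce_tensor_seg0', 'op1', 'op2', 'coalesce_tensor_seg1', 'op3']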
@@ -307,9 +341,31 @@ class RawProgramOptimizer(MetaOptimizerBase):
                 block._insert_op_without_sync(
                     idx,
                     type='c_sync_comm_stream',
-                    inputs={'X': fused_vars[0]},
-                    outputs={'Out': fused_vars[0]},
+                    inputs={'X': grad_segment[0]},
+                    outputs={'Out': grad_segment[0]},
                     attrs={'ring_id': ring_id,
                            OP_ROLE_KEY: OpRole.Backward})
                 break
         block._sync_with_cpp()
+
+    def __get_ouputs_name_to_idx(self, first_backward_idx, block):
+        # Each item of outputs_name_to_idx is a pair of idx.
+        # The first entry of this pair is the idx of the first op generates the grad,
+        # which is used to indicate the position to insert coalesce op.
+        # The second entry of this pair is the idx of the last op generates the grad,
+        # which is used to indicate the position to insert sync and allreduce op.
+        outputs_name_to_idx = {}
+        for idx in range(first_backward_idx, len(block.ops)):
+            op = block.ops[idx]
+            if is_optimizer_op(op):
+                break
+            for name in op.output_arg_names:
+                var = block.var(name)
+                if not outputs_name_to_idx.get(var):
+                    # if the grad only be generated by one op
+                    # the first idx and the last ids are identical
+                    outputs_name_to_idx[var] = (idx, idx)
+                else:
+                    outputs_name_to_idx[var] = (outputs_name_to_idx[var][0],
+                                                idx)
+        return outputs_name_to_idx
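The new helper __get_ouputs_name_to_idx records, for every variable written after the first backward op, the indices of the first and the last op that produce it; the first index bounds where a coalesce_tensor op may be inserted, the second where the allreduce and sync ops may follow. A toy illustration of that bookkeeping, with ops assumed to be plain lists of output names rather than Paddle ops:

def outputs_name_to_idx(ops_outputs, first_backward_idx):
    mapping = {}
    for idx in range(first_backward_idx, len(ops_outputs)):
        for name in ops_outputs[idx]:
            if name not in mapping:
                # first and last producer coincide when only one op writes the var
                mapping[name] = (idx, idx)
            else:
                mapping[name] = (mapping[name][0], idx)
    return mapping

# w0@GRAD is produced at op 2 and accumulated again at op 4, so a coalesce op
# must sit before index 2 while the allreduce has to come after index 4
ops_outputs = [['loss'], ['tmp'], ['w0@GRAD'], ['w1@GRAD'], ['w0@GRAD']]
print(outputs_name_to_idx(ops_outputs, first_backward_idx=1))
# -> {'tmp': (1, 1), 'w0@GRAD': (2, 4), 'w1@GRAD': (3, 3)}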