Commit 6655077e authored by sandyhouse

fix: set @Bcast@Grad to non persistable

Parent a6879219
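Background sketch (not part of the commit; variable names below are made up): the commit hinges on the `persistable` flag of block variables in Paddle's static graph. The per-parameter mini-batch gradient accumulator must stay persistable so it survives across micro-batch steps, while scratch variables such as the fp16-to-fp32 cast output should be non-persistable so the executor can reclaim them.

# Minimal illustration with assumed names ("fc_w@GRAD", "fc_w@GRAD@tmp"),
# not taken from the commit itself.
import paddle

paddle.enable_static()
block = paddle.static.default_main_program().global_block()

acc_grad = block.create_var(
    name="fc_w@GRAD", shape=[16, 16], dtype="float32",
    persistable=True)    # kept alive for the whole mini-batch
tmp_cast = block.create_var(
    name="fc_w@GRAD@tmp", shape=[16, 16], dtype="float32",
    persistable=False)   # scratch var, freed after use

print(acc_grad.persistable, tmp_cast.persistable)  # True False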
@@ -4843,8 +4843,8 @@ class PipelineOptimizer(object):
         Accumulate the gradients generated in microbatch to the one in mini-batch.
         """
         first_optimize_op_index = None
+        accumulated_grad_names = []
         for index, op in reversed(tuple(enumerate(list(block.ops)))):
-            # device = op.attr(self._op_device_key)
             # remove the cast op of fp16 grad to fp32 grad
             if self._is_optimize_op(op) and op.type == 'cast':
                 in_name = op.input_arg_names[0]
@@ -4872,13 +4872,15 @@ class PipelineOptimizer(object):
                 for i in range(0, len(op_role_var), 2):
                     offset = 0
                     param_name = op_role_var[i]
-                    if not block.has_var(param_name): continue
+                    # if not block.has_var(param_name): continue
+                    if '@BroadCast' in param_name:
+                        param_name = param_name[0:param_name.find('@BroadCast')]
                     # clear gradient
                     param_grad_name = self._append_grad_suffix(param_name)
-                    # if not main_block.has_var(grad_name): continue
-                    if not block.has_var(param_grad_name):
-                        self._create_var(block, block.vars[param_name],
-                                         param_grad_name)
+                    accumulated_grad_names.append(param_grad_name)
+                    #if not block.has_var(param_grad_name):
+                    #    self._create_var(block, block.vars[param_name],
+                    #                     param_grad_name)
                     assert block.has_var(param_grad_name)
                     param_grad_var = block.var(param_grad_name)
                     param_grad_var.persistable = True
@@ -4898,10 +4900,10 @@ class PipelineOptimizer(object):
                         #offset += 1
                         grad_name = op_role_var[i + 1]  # with _0 suffix
                         grad_var = block.vars[grad_name]
-                        real_grad_name = grad_name[0:grad_name.find(
-                            '@GRAD')] + '@GRAD'  # without _0 suffix
-                        real_grad_var = block.vars[
-                            real_grad_name]  # without _0 suffix
+                        #real_grad_name = grad_name[0:grad_name.find(
+                        #    '@GRAD')] + '@GRAD'  # without _0 suffix
+                        #real_grad_var = block.vars[
+                        #    real_grad_name]  # without _0 suffix
                         # new_grad_var_name = unique_name.generate(grad_name)
                         # new_var = self._create_var(block, grad_var,
                         #                            new_grad_var_name)
@@ -4911,7 +4913,7 @@ class PipelineOptimizer(object):
                         block._insert_op(
                             index=index + 1,
                             type='sum',
-                            inputs={'X': [grad_var, real_grad_var]},
+                            inputs={'X': [grad_var, param_grad_var]},
                             outputs={'Out': real_grad_var},
                             attrs={
                                 #self._op_device_key: device,
@@ -4922,13 +4924,13 @@ class PipelineOptimizer(object):
                     else:
                         grad_name = op_role_var[i + 1]  # with _0 suffix
                         grad_var = block.vars[grad_name]
-                        fp32_grad_var_name = param_name + core.grad_var_suffix(
-                        )  # without _0 suffix
-                        fp32_grad_var = block.vars[fp32_grad_var_name]
-                        fp32_grad_var.persistable = True
+                        #fp32_grad_var_name = param_name + core.grad_var_suffix(
+                        #)  # without _0 suffix
+                        #fp32_grad_var = block.vars[fp32_grad_var_name]
+                        #fp32_grad_var.persistable = True
                         cast_grad_var_name = unique_name.generate(
-                            fp32_grad_var_name)
-                        cast_grad_var = self._create_var(block, fp32_grad_var,
+                            param_grad_name)
+                        cast_grad_var = self._create_var(block, param_grad_var,
                                                          cast_grad_var_name)
                         cast_grad_var.persistable = False
                         block._insert_op(
@@ -4947,7 +4949,7 @@ class PipelineOptimizer(object):
                         block._insert_op(
                             index=index + 2,
                             type='sum',
-                            inputs={'X': [fp32_grad_var, cast_grad_var]},
+                            inputs={'X': [param_grad_var, cast_grad_var]},
                             outputs={'Out': fp32_grad_var},
                             attrs={
                                 # self._op_device_key: device,
@@ -4995,6 +4997,7 @@ class PipelineOptimizer(object):
                         # self._op_role_key: self._op_role.Backward,
                         # # self._op_role_var_key: op_role_var
                         # })
+        return first_optimize_op_index, accumulated_grad_names

     def _add_sub_blocks(self, main_block, program_list):
         main_program = main_block.program
...
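For readers outside the Paddle codebase, here is a framework-free sketch of the accumulation pattern the hunks above implement: each micro-batch produces a `_0`-suffixed gradient, fp16 gradients are first cast to fp32 through a temporary (non-persistable) variable, and an inserted `sum` op adds the result into the persistable per-parameter gradient. This is an illustration of the semantics only, not the Paddle pass; names and shapes are invented.

import numpy as np

def accumulate_microbatch_grads(micro_grads):
    """Illustrative only: mimic the inserted cast + sum ops.
    micro_grads plays the role of the *_0-suffixed per-microbatch vars;
    the returned array plays the role of the persistable param@GRAD."""
    acc = None
    for g in micro_grads:
        g32 = g.astype("float32")               # temporary cast var (non-persistable)
        acc = g32 if acc is None else acc + g32  # the inserted 'sum' op
    return acc

grads = [np.full((2, 3), 0.5, dtype="float16") for _ in range(4)]
print(accumulate_microbatch_grads(grads))  # every entry == 2.0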