未验证 提交 02b0be08 编写于 作者: W WangXi 提交者: GitHub

[hybrid] remove scale op in insert_scale_loss_grad_ops (#35775)

上级 1dcf22a8
...@@ -738,14 +738,13 @@ def insert_scale_loss_grad_ops(block, scale=1.0): ...@@ -738,14 +738,13 @@ def insert_scale_loss_grad_ops(block, scale=1.0):
''' '''
for idx, op in reversed(list(enumerate(block.ops))): for idx, op in reversed(list(enumerate(block.ops))):
if is_loss_grad_op(op): if is_loss_grad_op(op):
loss_grad_var = block.vars[op.output_arg_names[0]] assert op.type == 'fill_constant', \
block._insert_op_without_sync( "loss_grad_op must be fill_constant op, " \
idx + 1, "but this op is {}".format(op.type)
type='scale', assert op.has_attr('value')
inputs={'X': loss_grad_var}, loss_scale = float(op.attr('value'))
outputs={'Out': loss_grad_var}, loss_scale = loss_scale / scale
attrs={'scale': scale, op._set_attr('value', loss_scale)
OP_ROLE_KEY: OpRole.Backward})
break break
......
...@@ -455,7 +455,7 @@ class ShardingOptimizer(MetaOptimizerBase): ...@@ -455,7 +455,7 @@ class ShardingOptimizer(MetaOptimizerBase):
global_dp_degree = self.sharding_degree * self.dp_degree global_dp_degree = self.sharding_degree * self.dp_degree
assert int(global_dp_degree) == global_dp_degree assert int(global_dp_degree) == global_dp_degree
if global_dp_degree > 1: if global_dp_degree > 1:
insert_scale_loss_grad_ops(main_block, scale=1.0 / global_dp_degree) insert_scale_loss_grad_ops(main_block, scale=global_dp_degree)
main_block._sync_with_cpp() main_block._sync_with_cpp()
......
...@@ -5019,16 +5019,13 @@ class PipelineOptimizer(object): ...@@ -5019,16 +5019,13 @@ class PipelineOptimizer(object):
if self._num_microbatches == 1: return if self._num_microbatches == 1: return
for index, op in reversed(tuple(enumerate(list(block.ops)))): for index, op in reversed(tuple(enumerate(list(block.ops)))):
if self._is_loss_grad_op(op): if self._is_loss_grad_op(op):
loss_grad_var = block.vars[op.output_arg_names[0]] assert op.type == 'fill_constant', \
block._insert_op( "loss_grad_op must be fill_constant op, " \
index=index + 1, "but this op is {}".format(op.type)
type='scale', assert op.has_attr('value')
inputs={'X': loss_grad_var}, loss_scale = float(op.attr('value'))
outputs={'Out': loss_grad_var}, loss_scale = loss_scale / self._num_microbatches
attrs={ op._set_attr('value', loss_scale)
'scale': 1.0 / self._num_microbatches,
self._op_role_key: self._op_role.Backward
})
break break
def _rename_gradient_var_name(self, block): def _rename_gradient_var_name(self, block):
......
...@@ -18,6 +18,7 @@ import paddle.static as static ...@@ -18,6 +18,7 @@ import paddle.static as static
import unittest import unittest
from fleet_meta_optimizer_base import TestFleetMetaOptimizer from fleet_meta_optimizer_base import TestFleetMetaOptimizer
from paddle.distributed.fleet.meta_optimizers.common import is_loss_grad_op
paddle.enable_static() paddle.enable_static()
...@@ -77,10 +78,10 @@ class TestFleetHybridOptimizer(TestFleetMetaOptimizer): ...@@ -77,10 +78,10 @@ class TestFleetHybridOptimizer(TestFleetMetaOptimizer):
'recv_v2', 'mul', 'elementwise_add', 'tanh', 'mul', 'recv_v2', 'mul', 'elementwise_add', 'tanh', 'mul',
'elementwise_add', 'tanh', 'mul', 'elementwise_add', 'tanh', 'mul', 'elementwise_add', 'tanh', 'mul', 'elementwise_add', 'tanh', 'mul',
'elementwise_add', 'softmax', 'cross_entropy2', 'mean', 'elementwise_add', 'softmax', 'cross_entropy2', 'mean',
'fill_constant', 'scale', 'scale', 'mean_grad', 'fill_constant', 'mean_grad', 'cross_entropy_grad2', 'softmax_grad',
'cross_entropy_grad2', 'softmax_grad', 'elementwise_add_grad', 'elementwise_add_grad', 'mul_grad', 'tanh_grad',
'mul_grad', 'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'elementwise_add_grad', 'mul_grad', 'tanh_grad',
'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'tanh_grad',
'elementwise_add_grad', 'mul_grad', 'c_sync_calc_stream', 'send_v2', 'elementwise_add_grad', 'mul_grad', 'c_sync_calc_stream', 'send_v2',
'fill_constant', 'sum', 'fill_constant', 'sum', 'fill_constant', 'fill_constant', 'sum', 'fill_constant', 'sum', 'fill_constant',
'sum', 'fill_constant', 'sum', 'fill_constant', 'sum', 'sum', 'fill_constant', 'sum', 'fill_constant', 'sum',
...@@ -158,10 +159,10 @@ class TestFleetHybridOptimizer(TestFleetMetaOptimizer): ...@@ -158,10 +159,10 @@ class TestFleetHybridOptimizer(TestFleetMetaOptimizer):
'recv_v2', 'mul', 'elementwise_add', 'tanh', 'mul', 'recv_v2', 'mul', 'elementwise_add', 'tanh', 'mul',
'elementwise_add', 'tanh', 'mul', 'elementwise_add', 'tanh', 'mul', 'elementwise_add', 'tanh', 'mul', 'elementwise_add', 'tanh', 'mul',
'elementwise_add', 'softmax', 'cross_entropy2', 'mean', 'elementwise_add', 'softmax', 'cross_entropy2', 'mean',
'fill_constant', 'scale', 'scale', 'mean_grad', 'fill_constant', 'mean_grad', 'cross_entropy_grad2', 'softmax_grad',
'cross_entropy_grad2', 'softmax_grad', 'elementwise_add_grad', 'elementwise_add_grad', 'mul_grad', 'tanh_grad',
'mul_grad', 'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'elementwise_add_grad', 'mul_grad', 'tanh_grad',
'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'tanh_grad',
'elementwise_add_grad', 'mul_grad', 'c_sync_calc_stream', 'send_v2', 'elementwise_add_grad', 'mul_grad', 'c_sync_calc_stream', 'send_v2',
'fill_constant', 'sum', 'fill_constant', 'sum', 'fill_constant', 'fill_constant', 'sum', 'fill_constant', 'sum', 'fill_constant',
'sum', 'fill_constant', 'sum', 'fill_constant', 'sum', 'sum', 'fill_constant', 'sum', 'fill_constant', 'sum',
...@@ -220,8 +221,8 @@ class TestFleetHybridOptimizer(TestFleetMetaOptimizer): ...@@ -220,8 +221,8 @@ class TestFleetHybridOptimizer(TestFleetMetaOptimizer):
'tanh', 'cast', 'cast', 'mul', 'cast', 'elementwise_add', 'cast', 'tanh', 'cast', 'cast', 'mul', 'cast', 'elementwise_add', 'cast',
'tanh', 'cast', 'cast', 'mul', 'cast', 'elementwise_add', 'softmax', 'tanh', 'cast', 'cast', 'mul', 'cast', 'elementwise_add', 'softmax',
'cast', 'cross_entropy2', 'mean', 'elementwise_mul', 'cast', 'cross_entropy2', 'mean', 'elementwise_mul',
'fill_constant', 'scale', 'scale', 'elementwise_mul_grad', 'fill_constant', 'elementwise_mul_grad', 'mean_grad',
'mean_grad', 'cross_entropy_grad2', 'cast', 'softmax_grad', 'cross_entropy_grad2', 'cast', 'softmax_grad',
'elementwise_add_grad', 'mul_grad', 'cast', 'tanh_grad', 'cast', 'elementwise_add_grad', 'mul_grad', 'cast', 'tanh_grad', 'cast',
'elementwise_add_grad', 'mul_grad', 'cast', 'tanh_grad', 'cast', 'elementwise_add_grad', 'mul_grad', 'cast', 'tanh_grad', 'cast',
'elementwise_add_grad', 'mul_grad', 'cast', 'tanh_grad', 'cast', 'elementwise_add_grad', 'mul_grad', 'cast', 'tanh_grad', 'cast',
...@@ -293,23 +294,23 @@ class TestFleetHybridOptimizer(TestFleetMetaOptimizer): ...@@ -293,23 +294,23 @@ class TestFleetHybridOptimizer(TestFleetMetaOptimizer):
'tanh', 'cast', 'cast', 'mul', 'cast', 'elementwise_add', 'softmax', 'tanh', 'cast', 'cast', 'mul', 'cast', 'elementwise_add', 'softmax',
'cast', 'cross_entropy2', 'mean', 'elementwise_mul', 'cast', 'cross_entropy2', 'mean', 'elementwise_mul',
'coalesce_tensor', 'coalesce_tensor', 'coalesce_tensor', 'coalesce_tensor', 'coalesce_tensor', 'coalesce_tensor',
'coalesce_tensor', 'fill_constant', 'scale', 'scale', 'coalesce_tensor', 'fill_constant', 'elementwise_mul_grad',
'elementwise_mul_grad', 'mean_grad', 'cross_entropy_grad2', 'cast', 'mean_grad', 'cross_entropy_grad2', 'cast', 'softmax_grad',
'softmax_grad', 'elementwise_add_grad', 'mul_grad', 'cast', 'elementwise_add_grad', 'mul_grad', 'cast', 'tanh_grad', 'cast',
'tanh_grad', 'cast', 'elementwise_add_grad', 'mul_grad', 'cast', 'elementwise_add_grad', 'mul_grad', 'cast', 'tanh_grad', 'cast',
'tanh_grad', 'cast', 'elementwise_add_grad', 'mul_grad', 'cast', 'elementwise_add_grad', 'mul_grad', 'cast', 'tanh_grad', 'cast',
'tanh_grad', 'cast', 'elementwise_add_grad', 'mul_grad', 'cast', 'elementwise_add_grad', 'mul_grad', 'cast', 'c_sync_calc_stream',
'c_sync_calc_stream', 'send_v2', 'cast', 'sum', 'cast', 'sum', 'send_v2', 'cast', 'sum', 'cast', 'sum', 'c_reduce_sum',
'c_reduce_sum', 'c_reduce_sum', 'c_sync_comm_stream', 'c_reduce_sum', 'c_sync_comm_stream', 'check_finite_and_unscale',
'check_finite_and_unscale', 'cast', 'c_allreduce_max', 'cast', 'c_allreduce_max', 'c_allreduce_max', 'cast',
'c_allreduce_max', 'cast', 'update_loss_scaling', 'squared_l2_norm', 'update_loss_scaling', 'squared_l2_norm', 'squared_l2_norm',
'squared_l2_norm', 'squared_l2_norm', 'squared_l2_norm', 'squared_l2_norm', 'squared_l2_norm', 'squared_l2_norm', 'sum',
'squared_l2_norm', 'sum', 'c_allreduce_sum', 'c_allreduce_sum', 'c_allreduce_sum', 'c_allreduce_sum', 'sqrt', 'fill_constant',
'sqrt', 'fill_constant', 'elementwise_max', 'elementwise_div', 'elementwise_max', 'elementwise_div', 'elementwise_mul',
'elementwise_mul', 'elementwise_mul', 'elementwise_mul', 'elementwise_mul', 'elementwise_mul', 'elementwise_mul',
'elementwise_mul', 'elementwise_mul', 'momentum', 'momentum', 'elementwise_mul', 'momentum', 'momentum', 'momentum', 'momentum',
'momentum', 'momentum', 'momentum', 'coalesce_tensor', 'momentum', 'coalesce_tensor', 'c_broadcast', 'coalesce_tensor',
'c_broadcast', 'coalesce_tensor', 'c_broadcast' 'c_broadcast'
]) ])
...@@ -327,7 +328,10 @@ class TestFleetHybridOptimizerBoundary(TestFleetMetaOptimizer): ...@@ -327,7 +328,10 @@ class TestFleetHybridOptimizerBoundary(TestFleetMetaOptimizer):
self._debug = False self._debug = False
def test_opt_sharding_with_pp_amp_gclip_boundary(self): def test_opt_sharding_with_pp_amp_gclip_boundary(self):
""" test optimizer sharding without parameter """ """
test optimizer sharding without parameter
test loss grad scale value
"""
train_prog, startup_prog = static.Program(), static.Program() train_prog, startup_prog = static.Program(), static.Program()
avg_cost, strategy = self.boundary_net(train_prog, startup_prog) avg_cost, strategy = self.boundary_net(train_prog, startup_prog)
...@@ -357,6 +361,16 @@ class TestFleetHybridOptimizerBoundary(TestFleetMetaOptimizer): ...@@ -357,6 +361,16 @@ class TestFleetHybridOptimizerBoundary(TestFleetMetaOptimizer):
startup_prog_op_types = [op.type for op in startup_prog_ops] startup_prog_op_types = [op.type for op in startup_prog_ops]
main_prog_op_types = [op.type for op in main_prog_ops] main_prog_op_types = [op.type for op in main_prog_ops]
# check loss scale for hybrid
for op in main_prog_ops:
if is_loss_grad_op(op):
self.assertEqual(op.type, 'fill_constant')
self.assertTrue(op.has_attr('value'))
scale = strategy.pipeline_configs[
'accumulate_steps'] * strategy.sharding_configs['dp_degree']
loss_scale = 1.0 / scale
self.assertAlmostEqual(float(op.attr('value')), loss_scale)
# global, sharding, pp_send, pp_recv # global, sharding, pp_send, pp_recv
self.assertEqual(startup_prog_op_types, [ self.assertEqual(startup_prog_op_types, [
'uniform_random', 'fill_constant', 'fill_constant', 'fill_constant', 'uniform_random', 'fill_constant', 'fill_constant', 'fill_constant',
...@@ -367,14 +381,13 @@ class TestFleetHybridOptimizerBoundary(TestFleetMetaOptimizer): ...@@ -367,14 +381,13 @@ class TestFleetHybridOptimizerBoundary(TestFleetMetaOptimizer):
self.assertEqual(main_prog_op_types, [ self.assertEqual(main_prog_op_types, [
'recv_v2', 'cast', 'matmul', 'cast', 'reduce_mean', 'recv_v2', 'cast', 'matmul', 'cast', 'reduce_mean',
'elementwise_mul', 'fill_constant', 'scale', 'scale', 'elementwise_mul', 'fill_constant', 'elementwise_mul_grad',
'elementwise_mul_grad', 'reduce_mean_grad', 'cast', 'matmul_grad', 'reduce_mean_grad', 'cast', 'matmul_grad', 'c_sync_calc_stream',
'c_sync_calc_stream', 'send_v2', 'fill_constant', 'cast', 'sum', 'send_v2', 'fill_constant', 'cast', 'sum', 'c_reduce_sum',
'c_reduce_sum', 'c_sync_comm_stream', 'check_finite_and_unscale', 'c_sync_comm_stream', 'check_finite_and_unscale', 'cast',
'cast', 'c_allreduce_max', 'c_allreduce_max', 'cast', 'c_allreduce_max', 'c_allreduce_max', 'cast', 'update_loss_scaling',
'update_loss_scaling', 'fill_constant', 'c_allreduce_sum', 'fill_constant', 'c_allreduce_sum', 'c_allreduce_sum', 'sqrt',
'c_allreduce_sum', 'sqrt', 'fill_constant', 'elementwise_max', 'fill_constant', 'elementwise_max', 'elementwise_div', 'c_broadcast'
'elementwise_div', 'c_broadcast'
]) ])
def test_opt_sharding_with_pp_amp_gclip_boundary_card1(self): def test_opt_sharding_with_pp_amp_gclip_boundary_card1(self):
...@@ -419,14 +432,14 @@ class TestFleetHybridOptimizerBoundary(TestFleetMetaOptimizer): ...@@ -419,14 +432,14 @@ class TestFleetHybridOptimizerBoundary(TestFleetMetaOptimizer):
self.assertEqual(main_prog_op_types, [ self.assertEqual(main_prog_op_types, [
'recv_v2', 'cast', 'matmul', 'cast', 'reduce_mean', 'recv_v2', 'cast', 'matmul', 'cast', 'reduce_mean',
'elementwise_mul', 'fill_constant', 'scale', 'scale', 'elementwise_mul', 'fill_constant', 'elementwise_mul_grad',
'elementwise_mul_grad', 'reduce_mean_grad', 'cast', 'matmul_grad', 'reduce_mean_grad', 'cast', 'matmul_grad', 'c_sync_calc_stream',
'c_sync_calc_stream', 'send_v2', 'fill_constant', 'cast', 'sum', 'send_v2', 'fill_constant', 'cast', 'sum', 'c_reduce_sum',
'c_reduce_sum', 'c_sync_comm_stream', 'check_finite_and_unscale', 'c_sync_comm_stream', 'check_finite_and_unscale', 'cast',
'cast', 'c_allreduce_max', 'c_allreduce_max', 'cast', 'c_allreduce_max', 'c_allreduce_max', 'cast', 'update_loss_scaling',
'update_loss_scaling', 'squared_l2_norm', 'sum', 'c_allreduce_sum', 'squared_l2_norm', 'sum', 'c_allreduce_sum', 'c_allreduce_sum',
'c_allreduce_sum', 'sqrt', 'fill_constant', 'elementwise_max', 'sqrt', 'fill_constant', 'elementwise_max', 'elementwise_div',
'elementwise_div', 'elementwise_mul', 'momentum', 'c_broadcast' 'elementwise_mul', 'momentum', 'c_broadcast'
]) ])
......
...@@ -16,12 +16,11 @@ import unittest ...@@ -16,12 +16,11 @@ import unittest
import paddle import paddle
import os import os
import paddle.distributed.fleet as fleet import paddle.distributed.fleet as fleet
import paddle.distributed.fleet.base.role_maker as role_maker
import paddle.fluid.core as core
import paddle.fluid as fluid import paddle.fluid as fluid
from fleet_meta_optimizer_base import TestFleetMetaOptimizer from fleet_meta_optimizer_base import TestFleetMetaOptimizer
import paddle.distributed.fleet.meta_optimizers.sharding as sharding import paddle.distributed.fleet.meta_optimizers.sharding as sharding
from paddle.distributed.fleet.meta_optimizers.common import is_loss_grad_op
paddle.enable_static() paddle.enable_static()
...@@ -52,8 +51,8 @@ class TestFleetShardingMetaOptimizer(TestFleetMetaOptimizer): ...@@ -52,8 +51,8 @@ class TestFleetShardingMetaOptimizer(TestFleetMetaOptimizer):
'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_sync_comm_stream', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_sync_comm_stream',
'mul', 'elementwise_add', 'tanh', 'mul', 'elementwise_add', 'tanh', 'mul', 'elementwise_add', 'tanh', 'mul', 'elementwise_add', 'tanh',
'mul', 'elementwise_add', 'softmax', 'cross_entropy2', 'mean', 'mul', 'elementwise_add', 'softmax', 'cross_entropy2', 'mean',
'fill_constant', 'scale', 'mean_grad', 'cross_entropy_grad2', 'fill_constant', 'mean_grad', 'cross_entropy_grad2', 'softmax_grad',
'softmax_grad', 'elementwise_add_grad', 'mul_grad', 'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'tanh_grad',
'elementwise_add_grad', 'mul_grad', 'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'tanh_grad',
'elementwise_add_grad', 'mul_grad', 'c_sync_calc_stream', 'elementwise_add_grad', 'mul_grad', 'c_sync_calc_stream',
'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum',
...@@ -91,16 +90,16 @@ class TestFleetShardingMetaOptimizer(TestFleetMetaOptimizer): ...@@ -91,16 +90,16 @@ class TestFleetShardingMetaOptimizer(TestFleetMetaOptimizer):
'c_sync_comm_stream', 'cast', 'mul', 'elementwise_add', 'cast', 'c_sync_comm_stream', 'cast', 'mul', 'elementwise_add', 'cast',
'tanh', 'cast', 'mul', 'elementwise_add', 'cast', 'tanh', 'cast', 'tanh', 'cast', 'mul', 'elementwise_add', 'cast', 'tanh', 'cast',
'mul', 'elementwise_add', 'softmax', 'cast', 'cross_entropy2', 'mul', 'elementwise_add', 'softmax', 'cast', 'cross_entropy2',
'mean', 'elementwise_mul', 'fill_constant', 'scale', 'mean', 'elementwise_mul', 'fill_constant', 'elementwise_mul_grad',
'elementwise_mul_grad', 'mean_grad', 'cross_entropy_grad2', 'cast', 'mean_grad', 'cross_entropy_grad2', 'cast', 'softmax_grad',
'softmax_grad', 'elementwise_add_grad', 'mul_grad', 'cast', 'elementwise_add_grad', 'mul_grad', 'cast', 'tanh_grad', 'cast',
'tanh_grad', 'cast', 'elementwise_add_grad', 'mul_grad', 'cast', 'elementwise_add_grad', 'mul_grad', 'cast', 'tanh_grad', 'cast',
'tanh_grad', 'cast', 'elementwise_add_grad', 'mul_grad', 'elementwise_add_grad', 'mul_grad', 'c_sync_calc_stream',
'c_sync_calc_stream', 'c_reduce_sum', 'c_reduce_sum',
'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum',
'c_sync_comm_stream', 'cast', 'cast', 'cast', 'c_reduce_sum', 'c_reduce_sum', 'c_sync_comm_stream', 'cast',
'check_finite_and_unscale', 'cast', 'c_allreduce_max', 'cast', 'cast', 'cast', 'check_finite_and_unscale', 'cast',
'update_loss_scaling', 'momentum', 'momentum', 'momentum' 'c_allreduce_max', 'cast', 'update_loss_scaling', 'momentum',
'momentum', 'momentum'
]) ])
def test_sharding_recompute_optimizer(self): def test_sharding_recompute_optimizer(self):
...@@ -132,11 +131,11 @@ class TestFleetShardingMetaOptimizer(TestFleetMetaOptimizer): ...@@ -132,11 +131,11 @@ class TestFleetShardingMetaOptimizer(TestFleetMetaOptimizer):
'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_sync_comm_stream', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_sync_comm_stream',
'mul', 'elementwise_add', 'tanh', 'mul', 'elementwise_add', 'tanh', 'mul', 'elementwise_add', 'tanh', 'mul', 'elementwise_add', 'tanh',
'mul', 'elementwise_add', 'softmax', 'cross_entropy2', 'mean', 'mul', 'elementwise_add', 'softmax', 'cross_entropy2', 'mean',
'fill_constant', 'scale', 'mean_grad', 'cross_entropy_grad2', 'fill_constant', 'mean_grad', 'cross_entropy_grad2', 'softmax_grad',
'softmax_grad', 'elementwise_add_grad', 'mul_grad', 'mul', 'elementwise_add_grad', 'mul_grad', 'mul', 'elementwise_add',
'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'mul',
'elementwise_add', 'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'elementwise_add', 'tanh_grad', 'elementwise_add_grad', 'mul_grad',
'mul', 'elementwise_add', 'tanh_grad', 'elementwise_add_grad', 'c_sync_calc_stream', 'c_reduce_sum', 'c_reduce_sum',
'mul_grad', 'c_sync_calc_stream', 'c_reduce_sum', 'c_reduce_sum',
'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum',
'c_sync_comm_stream', 'momentum', 'momentum', 'momentum' 'c_sync_comm_stream', 'momentum', 'momentum', 'momentum'
]) ])
...@@ -177,7 +176,7 @@ class TestFleetShardingMetaOptimizer(TestFleetMetaOptimizer): ...@@ -177,7 +176,7 @@ class TestFleetShardingMetaOptimizer(TestFleetMetaOptimizer):
'cast', 'mul', 'elementwise_add', 'cast', 'tanh', 'cast', 'mul', 'cast', 'mul', 'elementwise_add', 'cast', 'tanh', 'cast', 'mul',
'elementwise_add', 'cast', 'tanh', 'cast', 'mul', 'elementwise_add', 'elementwise_add', 'cast', 'tanh', 'cast', 'mul', 'elementwise_add',
'softmax', 'cast', 'cross_entropy2', 'mean', 'elementwise_mul', 'softmax', 'cast', 'cross_entropy2', 'mean', 'elementwise_mul',
'fill_constant', 'scale', 'elementwise_mul_grad', 'mean_grad', 'fill_constant', 'elementwise_mul_grad', 'mean_grad',
'cross_entropy_grad2', 'cast', 'softmax_grad', 'cross_entropy_grad2', 'cast', 'softmax_grad',
'elementwise_add_grad', 'mul_grad', 'cast', 'cast', 'mul', 'elementwise_add_grad', 'mul_grad', 'cast', 'cast', 'mul',
'elementwise_add', 'cast', 'tanh_grad', 'cast', 'elementwise_add', 'cast', 'tanh_grad', 'cast',
...@@ -222,8 +221,8 @@ class TestFleetShardingMetaOptimizer(TestFleetMetaOptimizer): ...@@ -222,8 +221,8 @@ class TestFleetShardingMetaOptimizer(TestFleetMetaOptimizer):
'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_sync_comm_stream', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_sync_comm_stream',
'mul', 'elementwise_add', 'tanh', 'mul', 'elementwise_add', 'tanh', 'mul', 'elementwise_add', 'tanh', 'mul', 'elementwise_add', 'tanh',
'mul', 'elementwise_add', 'softmax', 'cross_entropy2', 'mean', 'mul', 'elementwise_add', 'softmax', 'cross_entropy2', 'mean',
'fill_constant', 'scale', 'mean_grad', 'cross_entropy_grad2', 'fill_constant', 'mean_grad', 'cross_entropy_grad2', 'softmax_grad',
'softmax_grad', 'elementwise_add_grad', 'mul_grad', 'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'tanh_grad',
'elementwise_add_grad', 'mul_grad', 'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'tanh_grad',
'elementwise_add_grad', 'mul_grad', 'c_sync_calc_stream', 'elementwise_add_grad', 'mul_grad', 'c_sync_calc_stream',
'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum',
...@@ -259,8 +258,8 @@ class TestFleetShardingMetaOptimizer(TestFleetMetaOptimizer): ...@@ -259,8 +258,8 @@ class TestFleetShardingMetaOptimizer(TestFleetMetaOptimizer):
'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_sync_comm_stream', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_sync_comm_stream',
'mul', 'elementwise_add', 'tanh', 'mul', 'elementwise_add', 'tanh', 'mul', 'elementwise_add', 'tanh', 'mul', 'elementwise_add', 'tanh',
'mul', 'elementwise_add', 'softmax', 'cross_entropy2', 'mean', 'mul', 'elementwise_add', 'softmax', 'cross_entropy2', 'mean',
'fill_constant', 'scale', 'mean_grad', 'cross_entropy_grad2', 'fill_constant', 'mean_grad', 'cross_entropy_grad2', 'softmax_grad',
'softmax_grad', 'elementwise_add_grad', 'mul_grad', 'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'tanh_grad',
'elementwise_add_grad', 'mul_grad', 'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'tanh_grad',
'elementwise_add_grad', 'mul_grad', 'c_sync_calc_stream', 'elementwise_add_grad', 'mul_grad', 'c_sync_calc_stream',
'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum',
...@@ -397,11 +396,14 @@ class TestFleetShardingHybridOptimizer(TestFleetMetaOptimizer): ...@@ -397,11 +396,14 @@ class TestFleetShardingHybridOptimizer(TestFleetMetaOptimizer):
self.assertEqual(dp_group_waiting_ports, ['127.0.0.1:36002']) self.assertEqual(dp_group_waiting_ports, ['127.0.0.1:36002'])
# check loss scale for sharding hybrid dp # check loss scale for sharding hybrid dp
scale_ = -1
for op in main_prog_ops: for op in main_prog_ops:
if op.type == "scale": if is_loss_grad_op(op):
scale_ = float(op.desc.attr("scale")) self.assertEqual(op.type, 'fill_constant')
self.assertEqual(scale_, 0.25) self.assertTrue(op.has_attr('value'))
scale = strategy.sharding_configs[
'sharding_degree'] * strategy.sharding_configs['dp_degree']
loss_scale = 1.0 / scale
self.assertAlmostEqual(float(op.attr('value')), loss_scale)
# check program (allreudce) # check program (allreudce)
ops = [op.type for op in main_prog_ops] ops = [op.type for op in main_prog_ops]
...@@ -411,8 +413,8 @@ class TestFleetShardingHybridOptimizer(TestFleetMetaOptimizer): ...@@ -411,8 +413,8 @@ class TestFleetShardingHybridOptimizer(TestFleetMetaOptimizer):
'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_sync_comm_stream', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_sync_comm_stream',
'mul', 'elementwise_add', 'tanh', 'mul', 'elementwise_add', 'tanh', 'mul', 'elementwise_add', 'tanh', 'mul', 'elementwise_add', 'tanh',
'mul', 'elementwise_add', 'softmax', 'cross_entropy2', 'mean', 'mul', 'elementwise_add', 'softmax', 'cross_entropy2', 'mean',
'fill_constant', 'scale', 'mean_grad', 'cross_entropy_grad2', 'fill_constant', 'mean_grad', 'cross_entropy_grad2', 'softmax_grad',
'softmax_grad', 'elementwise_add_grad', 'mul_grad', 'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'tanh_grad',
'elementwise_add_grad', 'mul_grad', 'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'tanh_grad',
'elementwise_add_grad', 'mul_grad', 'c_sync_calc_stream', 'elementwise_add_grad', 'mul_grad', 'c_sync_calc_stream',
'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum',
...@@ -474,8 +476,8 @@ class TestFleetShardingHybridOptimizer(TestFleetMetaOptimizer): ...@@ -474,8 +476,8 @@ class TestFleetShardingHybridOptimizer(TestFleetMetaOptimizer):
'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_sync_comm_stream', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_sync_comm_stream',
'mul', 'elementwise_add', 'tanh', 'mul', 'elementwise_add', 'tanh', 'mul', 'elementwise_add', 'tanh', 'mul', 'elementwise_add', 'tanh',
'mul', 'elementwise_add', 'softmax', 'cross_entropy2', 'mean', 'mul', 'elementwise_add', 'softmax', 'cross_entropy2', 'mean',
'fill_constant', 'scale', 'mean_grad', 'cross_entropy_grad2', 'fill_constant', 'mean_grad', 'cross_entropy_grad2', 'softmax_grad',
'softmax_grad', 'elementwise_add_grad', 'mul_grad', 'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'tanh_grad',
'elementwise_add_grad', 'mul_grad', 'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'tanh_grad',
'elementwise_add_grad', 'mul_grad', 'c_sync_calc_stream', 'elementwise_add_grad', 'mul_grad', 'c_sync_calc_stream',
'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum',
...@@ -543,11 +545,10 @@ class TestFleetShardingHybridOptimizer(TestFleetMetaOptimizer): ...@@ -543,11 +545,10 @@ class TestFleetShardingHybridOptimizer(TestFleetMetaOptimizer):
'c_broadcast', 'c_sync_comm_stream', 'recv_v2', 'mul', 'c_broadcast', 'c_sync_comm_stream', 'recv_v2', 'mul',
'elementwise_add', 'tanh', 'mul', 'elementwise_add', 'tanh', 'mul', 'elementwise_add', 'tanh', 'mul', 'elementwise_add', 'tanh', 'mul',
'elementwise_add', 'tanh', 'mul', 'elementwise_add', 'softmax', 'elementwise_add', 'tanh', 'mul', 'elementwise_add', 'softmax',
'cross_entropy2', 'mean', 'fill_constant', 'scale', 'scale', 'cross_entropy2', 'mean', 'fill_constant', 'mean_grad',
'mean_grad', 'cross_entropy_grad2', 'softmax_grad', 'cross_entropy_grad2', 'softmax_grad', 'elementwise_add_grad',
'elementwise_add_grad', 'mul_grad', 'tanh_grad', 'mul_grad', 'tanh_grad', 'elementwise_add_grad', 'mul_grad',
'elementwise_add_grad', 'mul_grad', 'tanh_grad', 'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'tanh_grad',
'elementwise_add_grad', 'mul_grad', 'tanh_grad',
'elementwise_add_grad', 'mul_grad', 'c_sync_calc_stream', 'send_v2', 'elementwise_add_grad', 'mul_grad', 'c_sync_calc_stream', 'send_v2',
'c_sync_calc_stream', 'c_reduce_sum', 'c_reduce_sum', 'c_sync_calc_stream', 'c_reduce_sum', 'c_reduce_sum',
'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum',
...@@ -742,11 +743,10 @@ class TestFleetShardingHybridOptimizer(TestFleetMetaOptimizer): ...@@ -742,11 +743,10 @@ class TestFleetShardingHybridOptimizer(TestFleetMetaOptimizer):
'mul', 'cast', 'elementwise_add', 'tanh', 'cast', 'mul', 'cast', 'mul', 'cast', 'elementwise_add', 'tanh', 'cast', 'mul', 'cast',
'elementwise_add', 'tanh', 'cast', 'mul', 'cast', 'elementwise_add', 'elementwise_add', 'tanh', 'cast', 'mul', 'cast', 'elementwise_add',
'softmax', 'cross_entropy2', 'mean', 'elementwise_mul', 'softmax', 'cross_entropy2', 'mean', 'elementwise_mul',
'fill_constant', 'scale', 'scale', 'elementwise_mul_grad', 'fill_constant', 'elementwise_mul_grad', 'mean_grad',
'mean_grad', 'cross_entropy_grad2', 'softmax_grad', 'cross_entropy_grad2', 'softmax_grad', 'elementwise_add_grad',
'elementwise_add_grad', 'cast', 'mul_grad', 'tanh_grad', 'cast', 'mul_grad', 'tanh_grad', 'elementwise_add_grad', 'mul_grad',
'elementwise_add_grad', 'mul_grad', 'tanh_grad', 'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'tanh_grad',
'elementwise_add_grad', 'mul_grad', 'tanh_grad',
'elementwise_add_grad', 'mul_grad', 'c_sync_calc_stream', 'send_v2', 'elementwise_add_grad', 'mul_grad', 'c_sync_calc_stream', 'send_v2',
'fill_constant', 'cast', 'sum', 'fill_constant', 'sum', 'fill_constant', 'cast', 'sum', 'fill_constant', 'sum',
'fill_constant', 'sum', 'fill_constant', 'sum', 'fill_constant', 'fill_constant', 'sum', 'fill_constant', 'sum', 'fill_constant',
...@@ -908,10 +908,10 @@ class TestFleetShardingHybridOptimizer(TestFleetMetaOptimizer): ...@@ -908,10 +908,10 @@ class TestFleetShardingHybridOptimizer(TestFleetMetaOptimizer):
'recv_v2', 'mul', 'elementwise_add', 'tanh', 'mul', 'recv_v2', 'mul', 'elementwise_add', 'tanh', 'mul',
'elementwise_add', 'tanh', 'mul', 'elementwise_add', 'tanh', 'mul', 'elementwise_add', 'tanh', 'mul', 'elementwise_add', 'tanh', 'mul',
'cast', 'elementwise_add', 'softmax', 'cross_entropy2', 'mean', 'cast', 'elementwise_add', 'softmax', 'cross_entropy2', 'mean',
'elementwise_mul', 'fill_constant', 'scale', 'scale', 'elementwise_mul', 'fill_constant', 'elementwise_mul_grad',
'elementwise_mul_grad', 'mean_grad', 'cross_entropy_grad2', 'mean_grad', 'cross_entropy_grad2', 'softmax_grad',
'softmax_grad', 'elementwise_add_grad', 'cast', 'mul_grad', 'elementwise_add_grad', 'cast', 'mul_grad', 'tanh_grad',
'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'tanh_grad',
'elementwise_add_grad', 'mul_grad', 'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'tanh_grad',
'elementwise_add_grad', 'mul_grad', 'c_sync_calc_stream', 'send_v2', 'elementwise_add_grad', 'mul_grad', 'c_sync_calc_stream', 'send_v2',
'fill_constant', 'cast', 'sum', 'fill_constant', 'sum', 'fill_constant', 'cast', 'sum', 'fill_constant', 'sum',
...@@ -1003,10 +1003,10 @@ class TestFleetShardingHybridOptimizer(TestFleetMetaOptimizer): ...@@ -1003,10 +1003,10 @@ class TestFleetShardingHybridOptimizer(TestFleetMetaOptimizer):
'recv_v2', 'mul', 'elementwise_add', 'tanh', 'mul', 'recv_v2', 'mul', 'elementwise_add', 'tanh', 'mul',
'elementwise_add', 'tanh', 'mul', 'elementwise_add', 'tanh', 'mul', 'elementwise_add', 'tanh', 'mul', 'elementwise_add', 'tanh', 'mul',
'cast', 'elementwise_add', 'softmax', 'cross_entropy2', 'mean', 'cast', 'elementwise_add', 'softmax', 'cross_entropy2', 'mean',
'elementwise_mul', 'fill_constant', 'scale', 'scale', 'elementwise_mul', 'fill_constant', 'elementwise_mul_grad',
'elementwise_mul_grad', 'mean_grad', 'cross_entropy_grad2', 'mean_grad', 'cross_entropy_grad2', 'softmax_grad',
'softmax_grad', 'elementwise_add_grad', 'cast', 'mul_grad', 'elementwise_add_grad', 'cast', 'mul_grad', 'tanh_grad',
'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'tanh_grad',
'elementwise_add_grad', 'mul_grad', 'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'tanh_grad',
'elementwise_add_grad', 'mul_grad', 'c_sync_calc_stream', 'send_v2', 'elementwise_add_grad', 'mul_grad', 'c_sync_calc_stream', 'send_v2',
'fill_constant', 'cast', 'sum', 'fill_constant', 'sum', 'fill_constant', 'cast', 'sum', 'fill_constant', 'sum',
...@@ -1102,8 +1102,8 @@ class TestFleetShardingHybridOptimizer(TestFleetMetaOptimizer): ...@@ -1102,8 +1102,8 @@ class TestFleetShardingHybridOptimizer(TestFleetMetaOptimizer):
'elementwise_add', 'tanh', 'mul', 'elementwise_add', 'tanh', 'mul', 'elementwise_add', 'tanh', 'mul', 'elementwise_add', 'tanh', 'mul',
'cast', 'elementwise_add', 'softmax', 'cross_entropy2', 'mean', 'cast', 'elementwise_add', 'softmax', 'cross_entropy2', 'mean',
'elementwise_mul', 'coalesce_tensor', 'coalesce_tensor', 'elementwise_mul', 'coalesce_tensor', 'coalesce_tensor',
'coalesce_tensor', 'coalesce_tensor', 'fill_constant', 'scale', 'coalesce_tensor', 'coalesce_tensor', 'fill_constant',
'scale', 'elementwise_mul_grad', 'mean_grad', 'cross_entropy_grad2', 'elementwise_mul_grad', 'mean_grad', 'cross_entropy_grad2',
'softmax_grad', 'elementwise_add_grad', 'cast', 'mul_grad', 'softmax_grad', 'elementwise_add_grad', 'cast', 'mul_grad',
'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'tanh_grad', 'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'tanh_grad',
'elementwise_add_grad', 'mul_grad', 'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'tanh_grad',
...@@ -1193,10 +1193,10 @@ class TestFleetShardingHybridOptimizer(TestFleetMetaOptimizer): ...@@ -1193,10 +1193,10 @@ class TestFleetShardingHybridOptimizer(TestFleetMetaOptimizer):
'elementwise_add', 'tanh', 'cast', 'mul', 'cast', 'elementwise_add', 'elementwise_add', 'tanh', 'cast', 'mul', 'cast', 'elementwise_add',
'softmax', 'cross_entropy2', 'mean', 'elementwise_mul', 'softmax', 'cross_entropy2', 'mean', 'elementwise_mul',
'coalesce_tensor', 'coalesce_tensor', 'coalesce_tensor', 'coalesce_tensor', 'coalesce_tensor', 'coalesce_tensor',
'coalesce_tensor', 'fill_constant', 'scale', 'scale', 'coalesce_tensor', 'fill_constant', 'elementwise_mul_grad',
'elementwise_mul_grad', 'mean_grad', 'cross_entropy_grad2', 'mean_grad', 'cross_entropy_grad2', 'softmax_grad',
'softmax_grad', 'elementwise_add_grad', 'cast', 'mul_grad', 'elementwise_add_grad', 'cast', 'mul_grad', 'tanh_grad',
'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'tanh_grad',
'elementwise_add_grad', 'mul_grad', 'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'tanh_grad',
'elementwise_add_grad', 'mul_grad', 'c_sync_calc_stream', 'send_v2', 'elementwise_add_grad', 'mul_grad', 'c_sync_calc_stream', 'send_v2',
'cast', 'sum', 'sum', 'c_allreduce_sum', 'c_allreduce_sum', 'cast', 'sum', 'sum', 'c_allreduce_sum', 'c_allreduce_sum',
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册