diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding/offload_helper.py b/python/paddle/distributed/fleet/meta_optimizers/sharding/offload_helper.py index 8aee34960332ac5789face7806960f2606dcad3f..3816e9b3051abfc026dd426d6988537d64185de0 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/sharding/offload_helper.py +++ b/python/paddle/distributed/fleet/meta_optimizers/sharding/offload_helper.py @@ -213,6 +213,7 @@ class OffloadHelper(object): if out_name in param_name_to_offload_name: var_name = out_name + # FIXME(wangxi): offload should insert after broadcast param if offload: offload_var_name = param_name_to_offload_name[var_name] self._insert_offload_op(startup_block, idx + 1, diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py index f14f1e06624028e31f1bd87f973d7859fa27ec83..1af646b3959e014a99713ccb984e1fc12a320c7e 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py @@ -1380,10 +1380,18 @@ class ShardingOptimizer(MetaOptimizerBase): return startup_block = self._startup_program.global_block() - - params = [] - for param in startup_block.iter_parameters(): - params.append(param) + params = startup_block.all_parameters() + + broadcast_params = [] + for param in params: + broadcast_params.append(param) + # optimize_cast need broadcast fp16 param + fp16_param_name = param.name + '.cast_fp16' + if startup_block.has_var(fp16_param_name): + fp16_param = startup_block.var(fp16_param_name) + broadcast_params.append(fp16_param) + + for param in broadcast_params: startup_block.append_op( type='c_broadcast', inputs={'X': param}, @@ -1395,8 +1403,8 @@ class ShardingOptimizer(MetaOptimizerBase): }) startup_block.append_op( type='c_sync_comm_stream', - inputs={'X': params}, - outputs={'Out': params}, + inputs={'X': broadcast_params}, + outputs={'Out': broadcast_params}, attrs={'ring_id': self.dp_ring_id, OP_ROLE_KEY: OpRole.Forward}) diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index f809f1bda079daece81510f576a7f0513a7b0fdd..00188168727f964ae2f7234ef6391464d64af019 100755 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -4381,6 +4381,18 @@ class PipelineOptimizer(object): name=var, type=core.VarDesc.VarType.READER, persistable=source_var.persistable) + elif isinstance(source_var, Parameter): + dest_var = block.create_parameter( + name=source_var.name, + shape=source_var.shape, + dtype=source_var.dtype, + type=source_var.type, + lod_level=source_var.lod_level, + stop_gradient=source_var.stop_gradient, + trainable=source_var.trainable, + optimize_attr=source_var.optimize_attr, + regularizer=source_var.regularizer, + error_clip=source_var.error_clip) else: dest_var = block._clone_variable(source_var, False) self._clone_var_attr(dest_var, source_var) diff --git a/python/paddle/fluid/tests/unittests/test_fleet_hybrid_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_hybrid_meta_optimizer.py index 6de2d2fb09228a546ff015766b9ffb0eea44df44..db8689c14c30f3785a1e38593acbb09756f7692f 100755 --- a/python/paddle/fluid/tests/unittests/test_fleet_hybrid_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_hybrid_meta_optimizer.py @@ -71,6 +71,8 @@ class TestFleetHybridOptimizer(TestFleetMetaOptimizer): 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', + 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', + 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_sync_comm_stream' ]) @@ -152,6 +154,8 @@ class TestFleetHybridOptimizer(TestFleetMetaOptimizer): 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', + 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', + 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_sync_comm_stream' ]) @@ -212,7 +216,9 @@ class TestFleetHybridOptimizer(TestFleetMetaOptimizer): 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', - 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_sync_comm_stream' + 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_broadcast', + 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', + 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_sync_comm_stream' ]) self.assertEqual(main_prog_op_types, [ @@ -284,7 +290,9 @@ class TestFleetHybridOptimizer(TestFleetMetaOptimizer): 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', - 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_sync_comm_stream' + 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_broadcast', + 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', + 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_sync_comm_stream' ]) self.assertEqual(main_prog_op_types, [ @@ -376,7 +384,7 @@ class TestFleetHybridOptimizerBoundary(TestFleetMetaOptimizer): 'uniform_random', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', - 'c_comm_init', 'c_sync_comm_stream' + 'c_comm_init', 'c_broadcast', 'c_sync_comm_stream' ]) self.assertEqual(main_prog_op_types, [ @@ -427,7 +435,7 @@ class TestFleetHybridOptimizerBoundary(TestFleetMetaOptimizer): 'uniform_random', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', - 'c_gen_nccl_id', 'c_comm_init', 'c_sync_comm_stream' + 'c_gen_nccl_id', 'c_comm_init', 'c_broadcast', 'c_sync_comm_stream' ]) self.assertEqual(main_prog_op_types, [ diff --git a/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py index 6b0a7b79c232cca3e69a037f9ece6ef0f168d083..61d98d32ec5fd7f2512a54ef998ae7b8ef392f2e 100755 --- a/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py @@ -762,7 +762,9 @@ class TestFleetShardingHybridOptimizer(TestFleetMetaOptimizer): 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', - 'c_gen_nccl_id', 'c_comm_init', 'c_sync_comm_stream' + 'c_gen_nccl_id', 'c_comm_init', 'c_broadcast', 'c_broadcast', + 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', + 'c_broadcast', 'c_broadcast', 'c_sync_comm_stream' ]) self.assertEqual(main_prog_op_types, [ @@ -928,7 +930,10 @@ class TestFleetShardingHybridOptimizer(TestFleetMetaOptimizer): 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', - 'c_sync_comm_stream' + 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', + 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', + 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', + 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_sync_comm_stream' ]) self.assertEqual(main_prog_op_types, [ @@ -1023,7 +1028,11 @@ class TestFleetShardingHybridOptimizer(TestFleetMetaOptimizer): 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', - 'c_gen_nccl_id', 'c_comm_init', 'c_sync_comm_stream' + 'c_gen_nccl_id', 'c_comm_init', 'c_broadcast', 'c_broadcast', + 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', + 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', + 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', + 'c_broadcast', 'c_sync_comm_stream' ]) self.assertEqual(main_prog_op_types, [ @@ -1121,7 +1130,10 @@ class TestFleetShardingHybridOptimizer(TestFleetMetaOptimizer): 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', - 'c_sync_comm_stream' + 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', + 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', + 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', + 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_sync_comm_stream' ]) self.assertEqual(main_prog_op_types, [ @@ -1211,7 +1223,9 @@ class TestFleetShardingHybridOptimizer(TestFleetMetaOptimizer): 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', - 'c_gen_nccl_id', 'c_comm_init', 'c_sync_comm_stream' + 'c_gen_nccl_id', 'c_comm_init', 'c_broadcast', 'c_broadcast', + 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', + 'c_broadcast', 'c_broadcast', 'c_sync_comm_stream' ]) self.assertEqual(main_prog_op_types, [