Commit 70eb21c5 authored by sandyhouse

update, test=develop

Parent 594bbcb1
@@ -32,6 +32,9 @@ message ShardingConfig {
   optional float fuse_broadcast_MB = 1 [ default = 32.0 ];
   optional bool hybrid_dp = 2 [ default = false ];
   optional int32 sharding_group_size = 3 [ default = 8 ];
+  optional bool as_outer_parallelism = 4 [ default = false ];
+  optional int32 inner_parallelism_size = 5 [ default = 8 ];
+  optional bool use_pipeline = 6 [ default = false ];
 }
 
 message AMPConfig {
@@ -117,6 +120,8 @@ message AsyncConfig {
 
 message PipelineConfig { optional int32 micro_batch = 1 [ default = 1 ]; }
 
+message ModelParallelConfig { optional int32 parallelism = 1 [ default = 1 ]; }
+
 message DistributedStrategy {
   // bool options
   optional Mode mode = 1 [ default = COLLECTIVE ];
@@ -146,6 +151,7 @@ message DistributedStrategy {
   optional bool fp16_allreduce = 25 [ default = false ];
   optional bool sharding = 26 [ default = false ];
   optional float last_comm_group_size_MB = 27 [ default = 1 ];
+  optional bool model_parallel = 28 [ default = false ];
 
   optional RecomputeConfig recompute_configs = 101;
   optional AMPConfig amp_configs = 102;
@@ -158,6 +164,7 @@ message DistributedStrategy {
   optional LambConfig lamb_configs = 109;
   optional AdaptiveLocalSGDConfig adaptive_localsgd_configs = 110;
   optional ShardingConfig sharding_configs = 111;
+  optional ModelParallelConfig model_parallel_configs = 112;
 
   optional BuildStrategy build_strategy = 201;
   optional ExecutionStrategy execution_strategy = 202;
 }
...
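The new ShardingConfig fields and the `model_parallel` switch would normally surface through `paddle.distributed.fleet.DistributedStrategy`, which maps attribute and config-dict assignments onto these proto fields. A minimal usage sketch, assuming the Python-side wrapper exposes the new names (`model_parallel` and `model_parallel_configs` are assumptions based on this proto change, not a tested API):

```python
# Sketch: configuring the fields added in this commit via fleet's strategy object.
# Keys mirror the proto field names shown above; treat the new ones as assumptions.
import paddle.distributed.fleet as fleet

strategy = fleet.DistributedStrategy()

strategy.sharding = True
strategy.sharding_configs = {
    "fuse_broadcast_MB": 32.0,
    "sharding_group_size": 8,
    "as_outer_parallelism": True,   # new field 4
    "inner_parallelism_size": 8,    # new field 5
    "use_pipeline": True,           # new field 6
}

# New top-level switch and config message (assumed to be surfaced like
# sharding / sharding_configs).
strategy.model_parallel = True
strategy.model_parallel_configs = {"parallelism": 2}
```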
@@ -107,7 +107,7 @@ void SectionWorker::TrainFiles() {
       int op_role = op->Attr<int>(std::string("op_role"));
       if (op_role == static_cast<int>(OpRole::kOptimize)) {
         VLOG(3) << "Update: running op " << op->Type();
-        op->Run(*microbatch_scopes_[0], place_);
+        op->Run(*microbatch_scopes_[num_microbatches_ - 1], place_);
         if (gc) {
           DeleteUnusedTensors(*microbatch_scopes_[0], op.get(), unused_vars_,
                               gc.get());
...
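The change above runs optimizer-role ops against the last micro-batch scope instead of scope 0. A loose conceptual sketch of that scheduling idea (plain Python with stand-in classes, not the actual C++ SectionWorker loop):

```python
# Conceptual sketch only: "Op" is a stand-in, not a Paddle class.
class Op:
    def __init__(self, name, role):
        self.name, self.role = name, role

    def run(self, scope, place):
        print(f"run {self.name} in scope {scope} on {place}")


def train_one_step(ops, microbatch_scopes, place):
    num_microbatches = len(microbatch_scopes)
    # Forward/backward ops execute once per micro-batch, each in its own scope.
    for i in range(num_microbatches):
        for op in ops:
            if op.role != "optimize":
                op.run(microbatch_scopes[i], place)
    # Optimizer ops execute once per step; after this commit they run against
    # the last micro-batch scope rather than scope 0.
    for op in ops:
        if op.role == "optimize":
            op.run(microbatch_scopes[num_microbatches - 1], place)


train_one_step(
    [Op("fwd", "forward"), Op("bwd", "backward"), Op("sgd", "optimize")],
    microbatch_scopes=["mb0", "mb1", "mb2"],
    place="gpu:0")
```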
@@ -56,9 +56,10 @@ class AMPOptimizer(MetaOptimizerBase):
         # add is_distributed to optimize amp, overlap communication and
         # computation by split the check_finite_and_unscale op.
         is_distributed = self.role_maker._worker_num() > 1
-        if self.user_defined_strategy.sharding:
-            # FIXME(wangxi). sharding failed when split check_finite_and_unscale
-            is_distributed = False
+        #if self.user_defined_strategy.sharding or self.user_defined_strategy.model_parallel:
+        #    # FIXME(wangxi). sharding failed when split check_finite_and_unscale
+        #    # FIXME(JZ-LIANG). To support Sharding-Megatron-AMP, Megatron should follow Sharding's behavior
+        #    is_distributed = False
         self.wrapped_opt._set_distributed(is_distributed)
 
     def _can_apply(self):
...
@@ -154,8 +154,10 @@ class PipelineOptimizer(MetaOptimizerBase):
     def __init__(self, optimizer):
         super(PipelineOptimizer, self).__init__(optimizer)
         self.inner_opt = optimizer
-        # we do not allow meta optimizer to be inner optimizer currently
-        self.meta_optimizers_white_list = []
+        self.meta_optimizers_white_list = [
+            "RecomputeOptimizer",
+            "AMPOptimizer",
+        ]
         self.meta_optimizers_black_list = ["GraphExecutionOptimizer", ]
 
     def _set_basic_info(self, loss, role_maker, user_defined_optimizer,
...
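With RecomputeOptimizer and AMPOptimizer now whitelisted as inner meta optimizers for PipelineOptimizer, pipeline training can in principle be combined with recompute and AMP through the fleet strategy. An illustrative sketch, not a tested recipe; the `micro_batch` key follows the PipelineConfig proto above:

```python
# Sketch: enabling pipeline together with AMP and recompute via fleet.
import paddle
import paddle.distributed.fleet as fleet

fleet.init(is_collective=True)

strategy = fleet.DistributedStrategy()
strategy.pipeline = True
strategy.pipeline_configs = {"micro_batch": 4}
strategy.amp = True
strategy.recompute = True
# strategy.recompute_configs = {"checkpoints": [...]}  # model-specific checkpoint vars

optimizer = paddle.optimizer.SGD(learning_rate=0.01)
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
```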