From 4ec51e0205545031c98746e22ed5b22311948a98 Mon Sep 17 00:00:00 2001
From: Dong Daxiang <35550832+guru4elephant@users.noreply.github.com>
Date: Mon, 17 Aug 2020 22:07:00 +0800
Subject: [PATCH] =?UTF-8?q?=E3=80=90paddle.fleet=E3=80=91Clear=20disable?=
 =?UTF-8?q?=20(#26334)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* add check approval
test=develop
---
 .../distributed/fleet/base/strategy_compiler.py    | 12 ++++++++++++
 .../fleet/meta_optimizers/amp_optimizer.py         |  1 +
 .../async_graph_execution_optimizer.py             |  3 +++
 .../fleet/meta_optimizers/async_optimizer.py       |  2 +-
 .../fleet/meta_optimizers/dgc_optimizer.py         |  6 +-----
 .../meta_optimizers/gradient_merge_optimizer.py    |  2 +-
 .../fleet/meta_optimizers/lamb_optimizer.py        |  5 +----
 .../fleet/meta_optimizers/lars_optimizer.py        |  5 +----
 .../fleet/meta_optimizers/localsgd_optimizer.py    |  2 +-
 .../fleet/meta_optimizers/meta_optimizer_base.py   |  1 +
 .../fleet/meta_optimizers/pipeline_optimizer.py    |  2 +-
 .../fleet/meta_optimizers/recompute_optimizer.py   |  2 +-
 12 files changed, 25 insertions(+), 18 deletions(-)

diff --git a/python/paddle/distributed/fleet/base/strategy_compiler.py b/python/paddle/distributed/fleet/base/strategy_compiler.py
index f0e23713e4f..a5ff247a21f 100644
--- a/python/paddle/distributed/fleet/base/strategy_compiler.py
+++ b/python/paddle/distributed/fleet/base/strategy_compiler.py
@@ -76,6 +76,18 @@ class StrategyCompiler(StrategyCompilerBase):
             opt._disable_strategy(valid_strategy)
         return valid_strategy
 
+    """
+    Meta Optimizer Type A: rewrite forward and backward, e.g. recompute, async, sync, pipeline.
+                           results will be split in async, sync, pipeline
+    Meta Optimizer Type B: rewrite forward,
+                           e.g. AMP; the corresponding backward is generated by the rewritten forward
+    Meta Optimizer Type C: rewrite backward, e.g. gradient fusion
+    Meta Optimizer Type D: rewrite optimize, e.g. lars, lamb, localsgd, gradient merge, dgc
+    Meta Optimizer Type E: only transpile to Graph structure for runtime;
+                           currently grad fusion, kernel fusion and sync batch-norm are included.
+                           we will remove grad fusion and sync batch-norm
+    """
+
     def generate_optimizer(self, loss, role_maker, optimizer,
                            user_defined_strategy, meta_optimizer_list,
                            graph_optimizer_list):
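A minimal sketch, not part of the patch: the taxonomy above corresponds to switches on fleet.DistributedStrategy. The attribute names below all appear in the diffs of this patch; the values are illustrative only, and the import path is assumed from the package this patch modifies.

    # sketch: enabling strategies of Types A, B and D from user code
    import paddle.distributed.fleet as fleet

    strategy = fleet.DistributedStrategy()
    strategy.recompute = True                         # Type A: rewrites forward and backward
    strategy.recompute_configs = {"checkpoints": []}  # illustrative; real use lists checkpoint vars
    strategy.amp = True                               # Type B: rewrites forward
    strategy.lamb = True                              # Type D: rewrites the optimize step
    strategy.lamb_configs = {"lamb_weight_decay": 0.01}
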
diff --git a/python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py
index 8316d807fa8..6b1756c3695 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py
@@ -37,6 +37,7 @@ class AMPOptimizer(MetaOptimizerBase):
 
     def _disable_strategy(self, dist_strategy):
         dist_strategy.amp = False
+        dist_strategy.amp_configs = {}
 
     def minimize_impl(self,
                       loss,
diff --git a/python/paddle/distributed/fleet/meta_optimizers/async_graph_execution_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/async_graph_execution_optimizer.py
index 890eae2c143..c0dee220aaf 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/async_graph_execution_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/async_graph_execution_optimizer.py
@@ -33,6 +33,9 @@ class AsyncGraphExecutionOptimizer(AsyncMetaOptimizer):
 
         return True
 
+    def _disable_strategy(self, dist_strategy):
+        dist_strategy.a_sync_configs = {}
+
     def _is_graph_out(self):
         return True
 
diff --git a/python/paddle/distributed/fleet/meta_optimizers/async_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/async_optimizer.py
index b88e863d7be..b6543549728 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/async_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/async_optimizer.py
@@ -139,4 +139,4 @@ class AsyncMetaOptimizer(MetaOptimizerBase):
         return None, None
 
     def _disable_strategy(self, dist_strategy):
-        self.user_defined_strategy.a_sync_configs["k_steps"] = -1
+        self.user_defined_strategy.a_sync_configs = {}
diff --git a/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py
index c9a28fdaf11..361175a11c5 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py
@@ -68,11 +68,7 @@ class DGCOptimizer(MetaOptimizerBase):
 
     def _disable_strategy(self, dist_strategy):
         dist_strategy.dgc = False
-        dist_strategy.dgc_configs = {
-            'rampup_begin_step': 0,
-            'rampup_step': 1,
-            'sparsity': [0.999]
-        }
+        dist_strategy.dgc_configs = {}
 
     def backward(self,
                  loss,
diff --git a/python/paddle/distributed/fleet/meta_optimizers/gradient_merge_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/gradient_merge_optimizer.py
index 668cf605def..28cbce317a9 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/gradient_merge_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/gradient_merge_optimizer.py
@@ -40,7 +40,7 @@ class GradientMergeOptimizer(MetaOptimizerBase):
 
     def _disable_strategy(self, dist_strategy):
         dist_strategy.gradient_merge = False
-        dist_strategy.gradient_merge_configs = {"k_steps": 1, "avg": True}
+        dist_strategy.gradient_merge_configs = {}
 
     def minimize_impl(self,
                       loss,
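Every _disable_strategy hunk in this patch makes the same change. As a self-contained illustration of the new contract, using hypothetical Toy* names rather than real Paddle classes:

    # Toy sketch: disabling now turns the switch off and empties the config
    # dict, instead of writing hard-coded defaults back into the strategy
    # (the pre-patch behavior shown in the removed lines above).
    class ToyStrategy:
        def __init__(self):
            self.dgc = True
            self.dgc_configs = {"rampup_begin_step": 0, "rampup_step": 1}

    class ToyDGCOptimizer:
        def _disable_strategy(self, dist_strategy):
            dist_strategy.dgc = False
            dist_strategy.dgc_configs = {}  # was: a dict duplicating DGC defaults

    s = ToyStrategy()
    ToyDGCOptimizer()._disable_strategy(s)
    assert s.dgc is False and s.dgc_configs == {}

Clearing to an empty dict presumably leaves the default values to DistributedStrategy itself rather than duplicating them in every meta optimizer.
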
diff --git a/python/paddle/distributed/fleet/meta_optimizers/lamb_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/lamb_optimizer.py
index cf4b479b523..d9a31c17e0d 100755
--- a/python/paddle/distributed/fleet/meta_optimizers/lamb_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/lamb_optimizer.py
@@ -74,10 +74,7 @@ class LambOptimizer(MetaOptimizerBase):
 
     def _disable_strategy(self, dist_strategy):
         dist_strategy.lamb = False
-        dist_strategy.lamb_configs = {
-            'lamb_weight_decay': 0.01,
-            'exclude_from_weight_decay': [],
-        }
+        dist_strategy.lamb_configs = {}
 
     def backward(self,
                  loss,
diff --git a/python/paddle/distributed/fleet/meta_optimizers/lars_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/lars_optimizer.py
index ff535e3ebf2..a54a4fc5599 100755
--- a/python/paddle/distributed/fleet/meta_optimizers/lars_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/lars_optimizer.py
@@ -58,10 +58,7 @@ class LarsOptimizer(MetaOptimizerBase):
 
     def _disable_strategy(self, dist_strategy):
         dist_strategy.lars = False
-        dist_strategy.lars_configs = {
-            'lars_coeff': 0.001,
-            'lars_weight_decay': 0.0005,
-        }
+        dist_strategy.lars_configs = {}
 
     def backward(self,
                  loss,
diff --git a/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py
index 9a5c6745164..c807815ff46 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py
@@ -39,7 +39,7 @@ class LocalSGDOptimizer(MetaOptimizerBase):
 
     def _disable_strategy(self, dist_strategy):
         dist_strategy.localsgd = False
-        dist_strategy.localsgd_configs = {'k_steps': 1}
+        dist_strategy.localsgd_configs = {}
 
     def snapshot_name(self, param_name):
         return param_name + self.snapshot_key
diff --git a/python/paddle/distributed/fleet/meta_optimizers/meta_optimizer_base.py b/python/paddle/distributed/fleet/meta_optimizers/meta_optimizer_base.py
index 9ba184fb008..04800cefdda 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/meta_optimizer_base.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/meta_optimizer_base.py
@@ -38,6 +38,7 @@ class MetaOptimizerBase(object):
     def _can_update(self, optimizer):
         if str(optimizer.__class__.__name__) in self.meta_optimizers_white_list:
             return True
+        return False
 
     def _disable_strategy(self, dist_strategy):
         raise NotImplementedError("you should implement disable strategy in {}".
diff --git a/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py
index a42c7e63cc6..8a0c48aa544 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py
@@ -110,7 +110,7 @@ class PipelineOptimizer(MetaOptimizerBase):
 
     def _disable_strategy(self, dist_strategy):
         dist_strategy.pipeline = False
-        dist_strategy.pipeline_configs = {"micro_batch": 1}
+        dist_strategy.pipeline_configs = {}
 
     def minimize_impl(self,
                       loss,
diff --git a/python/paddle/distributed/fleet/meta_optimizers/recompute_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/recompute_optimizer.py
index 73119d81094..96247474927 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/recompute_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/recompute_optimizer.py
@@ -42,7 +42,7 @@ class RecomputeOptimizer(MetaOptimizerBase):
 
     def _disable_strategy(self, dist_strategy):
         dist_strategy.recompute = False
-        dist_strategy.recompute_configs = {"checkpoints": []}
+        dist_strategy.recompute_configs = {}
 
     def backward(self,
                  loss,
-- 
GitLab
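For context, a hedged sketch of how the compiler consumes these hooks. The loop body is verbatim from the strategy_compiler.py context lines above; the function name and signature are assumptions:

    import copy

    def _get_valid_strategy(dist_strategy, can_not_apply_optimizer_list):
        valid_strategy = copy.deepcopy(dist_strategy)
        for opt in can_not_apply_optimizer_list:
            # after this patch, each call also clears the optimizer's
            # *_configs dict in addition to its on/off switch
            opt._disable_strategy(valid_strategy)
        return valid_strategy

The meta_optimizer_base.py hunk is related: _can_update previously fell off the end of the function and returned an implicit None for non-whitelisted optimizers; the added "return False" makes the boolean contract explicit.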