diff --git a/python/paddle/amp/grad_scaler.py b/python/paddle/amp/grad_scaler.py
index b25c1ff7cf58c541cbf5c6ef2d99e8bf08de84b3..1680b6ace858289b27bcbfc2ced37bc3c3e0cb8f 100644
--- a/python/paddle/amp/grad_scaler.py
+++ b/python/paddle/amp/grad_scaler.py
@@ -75,8 +75,8 @@ class AmpScaler:
             data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32')
             model = paddle.nn.Conv2D(3, 2, 3)
-            optimizer = paddle.optimizer.SGDOptimizer(
-                learning_rate=0.01, parameter_list=model.parameters())
+            optimizer = paddle.optimizer.SGD(
+                learning_rate=0.01, parameters=model.parameters())
             scaler = paddle.amp.AmpScaler(init_loss_scaling=1024)
             data = paddle.to_tensor(data)
             with paddle.amp.amp_guard():
@@ -168,8 +168,8 @@ class AmpScaler:
             data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32')
             model = paddle.nn.Conv2D(3, 2, 3)
-            optimizer = paddle.optimizer.SGDOptimizer(
-                learning_rate=0.01, parameter_list=model.parameters())
+            optimizer = paddle.optimizer.SGD(
+                learning_rate=0.01, parameters=model.parameters())
             scaler = paddle.amp.AmpScaler(init_loss_scaling=1024)
             data = paddle.to_tensor(data)
             with paddle.amp.amp_guard():
@@ -221,8 +221,8 @@ class AmpScaler:
             data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32')
             model = paddle.nn.Conv2D(3, 2, 3)
-            optimizer = paddle.optimizer.SGDOptimizer(
-                learning_rate=0.01, parameter_list=model.parameters())
+            optimizer = paddle.optimizer.SGD(
+                learning_rate=0.01, parameters=model.parameters())
             scaler = paddle.amp.AmpScaler(init_loss_scaling=1024)
             data = paddle.to_tensor(data)
             with paddle.amp.amp_guard():
diff --git a/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py
index f1967b5e1dea5e5845578008c6df3f79c6567fbb..f3f3bf950d507bf570c7730186f01e43be69f6c0 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py
@@ -21,9 +21,10 @@ __all__ = []
 import paddle
 from paddle.common_ops_import import LayerHelper
 from paddle.fluid.dygraph import base as imperative_base
-from paddle.fluid.optimizer import Momentum, Optimizer
+from paddle.fluid.optimizer import Optimizer
 from paddle.framework import core, in_dynamic_mode
 from paddle.nn.clip import ClipGradByNorm, append_gradient_clip_ops
+from paddle.optimizer import Momentum
 from paddle.regularizer import L1Decay, L2Decay
 from paddle.static import create_global_var
diff --git a/python/paddle/distributed/fleet/meta_optimizers/fp16_allreduce_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/fp16_allreduce_optimizer.py
index 618465d401b8b689ed0e243f02fa93cc02fc7b12..86516bcc510fcfacfb71af73a08626a5526acef9 100755
--- a/python/paddle/distributed/fleet/meta_optimizers/fp16_allreduce_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/fp16_allreduce_optimizer.py
@@ -152,6 +152,6 @@ class FP16AllReduceOptimizer(MetaOptimizerBase):
     def apply_optimize(self, loss, startup_program, params_grads):
         new_params_grads = self.fp16_compression(params_grads)
-        return self.inner_opt.apply_optimize(
+        return self.inner_opt._apply_optimize(
             loss, startup_program=startup_program, params_grads=new_params_grads
         )
diff --git a/python/paddle/distributed/fleet/meta_optimizers/gradient_merge_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/gradient_merge_optimizer.py
index 858949b6a44527ce91063353fba06f4abc62a80d..fa100111e88b108d76dca627fc7a87d8bb507f8a 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/gradient_merge_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/gradient_merge_optimizer.py
@@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
-from paddle.fluid.optimizer import GradientMergeOptimizer as GM
+from paddle.incubate.optimizer import GradientMergeOptimizer as GM
 from .meta_optimizer_base import MetaOptimizerBase
diff --git a/python/paddle/distributed/fleet/meta_optimizers/lamb_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/lamb_optimizer.py
index 9083c659bfccaabd7832cb4618ab69546673301d..8c6474cf200f36ef51824d15a0e10d44ff5a6281 100755
--- a/python/paddle/distributed/fleet/meta_optimizers/lamb_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/lamb_optimizer.py
@@ -14,7 +14,7 @@ import logging
 import paddle
-from paddle.fluid.optimizer import AdamOptimizer
+from paddle.optimizer import Adam
 from .meta_optimizer_base import MetaOptimizerBase
@@ -38,7 +38,7 @@ class LambOptimizer(MetaOptimizerBase):
         )
         opt = self.inner_opt
-        if not isinstance(opt, AdamOptimizer):
+        if not isinstance(opt, Adam):
             return
         configs = self.user_defined_strategy.lamb_configs
@@ -72,7 +72,7 @@ class LambOptimizer(MetaOptimizerBase):
             return False
         if self.user_defined_strategy.lamb:
-            if not isinstance(self.inner_opt, AdamOptimizer):
+            if not isinstance(self.inner_opt, Adam):
                 logging.warn(
                     "lamb need the inner optimizer to be AdamOptimizer optimizer but got {}.".format(
                         self.inner_opt.type
diff --git a/python/paddle/distributed/fleet/meta_optimizers/lars_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/lars_optimizer.py
index a81305cecaf093ea90f79ce4c1af4af02ac31664..53541e4a809fdeaa97cb1a97abd1fdf507b091a3 100755
--- a/python/paddle/distributed/fleet/meta_optimizers/lars_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/lars_optimizer.py
@@ -13,7 +13,8 @@ import logging
-from paddle.fluid.optimizer import LarsMomentumOptimizer, Momentum
+from paddle.incubate.optimizer import LarsMomentumOptimizer
+from paddle.optimizer import Momentum
 from .meta_optimizer_base import MetaOptimizerBase
@@ -98,7 +99,7 @@ class LarsOptimizer(MetaOptimizerBase):
         return self.lars_opt.apply_gradients(params_grads=params_grads)
     def apply_optimize(self, loss, startup_program, params_grads):
-        return self.lars_opt.apply_optimize(
+        return self.lars_opt._apply_optimize(
             loss, startup_program=startup_program, params_grads=params_grads
         )
diff --git a/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py
index fc9cf107f866779fb1cf8593fb4f2ed2ccc8525f..f3be337fedb77315ddbff492216591817cc7b8ef 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/localsgd_optimizer.py
@@ -49,9 +49,7 @@ class LocalSGDOptimizer(MetaOptimizerBase):
             self.inner_opt,
             (
                 paddle.optimizer.momentum.Momentum,
-                paddle.fluid.optimizer.Momentum,
                 paddle.optimizer.sgd.SGD,
-                paddle.fluid.optimizer.SGD,
             ),
         )
@@ -235,9 +233,7 @@ class AdaptiveLocalSGDOptimizer(MetaOptimizerBase):
             self.inner_opt,
             (
                 paddle.optimizer.Momentum,
-                paddle.fluid.optimizer.Momentum,
                 paddle.optimizer.sgd.SGD,
-                paddle.fluid.optimizer.SGD,
             ),
         )
diff --git a/python/paddle/distributed/fleet/meta_optimizers/meta_optimizer_base.py b/python/paddle/distributed/fleet/meta_optimizers/meta_optimizer_base.py
index 9a7660ebd7dc1fd85ac8386bdaf17f95710d0f98..79bcc134656f5bc79ebeb78b66cda94b1f2ae5b6 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/meta_optimizer_base.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/meta_optimizer_base.py
@@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and
 # limitations under the License.
-from paddle.fluid.optimizer import Optimizer
+from paddle.optimizer import Optimizer
 __all__ = []
@@ -81,7 +81,7 @@ class MetaOptimizerBase(Optimizer):
         )
     def apply_optimize(self, loss, startup_program, params_grads):
-        return self.inner_opt.apply_optimize(
+        return self.inner_opt._apply_optimize(
             loss, startup_program=startup_program, params_grads=params_grads
         )
diff --git a/python/paddle/distributed/fleet/meta_optimizers/parameter_server_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/parameter_server_optimizer.py
index a33d66a13a6dcc4dc946286af0c7b358e5fc0ef0..3ce159385fa5a43a8611a805b21e546881aa8065 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/parameter_server_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/parameter_server_optimizer.py
@@ -17,7 +17,6 @@ import re
 import subprocess
 import paddle
-from paddle import fluid
 from paddle.framework import core
 from ..base.private_helper_function import wait_server_ready
@@ -293,7 +292,7 @@ class ParameterServerOptimizer(MetaOptimizerBase):
                 % (platform.system())
             )
-        if not isinstance(self.inner_opt, fluid.optimizer.SGDOptimizer):
+        if not isinstance(self.inner_opt, paddle.optimizer.SGD):
             return False
         free = get_sys_free_mem()
diff --git a/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py
index eff7f0ebbba8465a52e7ccaa12b0f0865e7855f0..26056b70d3663904b28c661df7a04a8ccb337a94 100755
--- a/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py
@@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and
 import paddle
-from paddle.fluid.optimizer import PipelineOptimizer as PO
+from paddle.incubate.optimizer import PipelineOptimizer as PO
 from .common import (
     OP_ROLE_KEY,
diff --git a/python/paddle/distributed/fleet/meta_optimizers/ps_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/ps_optimizer.py
index 01a111a899063ac2c37af5241d293a7e3ccc08bf..c6549d71079919b106744ec0cbd30730b068f34f 100755
--- a/python/paddle/distributed/fleet/meta_optimizers/ps_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/ps_optimizer.py
@@ -204,7 +204,7 @@ class ParameterServerOptimizer(MetaOptimizerBase):
                 % (platform.system())
             )
-        if not isinstance(self.inner_opt, paddle.fluid.optimizer.SGDOptimizer):
+        if not isinstance(self.inner_opt, paddle.optimizer.SGD):
            return False
         free = get_sys_free_mem()
diff --git a/python/paddle/distributed/fleet/meta_optimizers/qat_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/qat_optimizer.py
index f4667e0ddc218d6b848336b932de42b149310f90..2662749733ad35a0c667cc9fde83e2e3f6d3e125 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/qat_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/qat_optimizer.py
@@ -96,9 +96,9 @@ class QATOptimizer(MetaOptimizerBase):
         ):
             optimize_ops, params_grads = self.inner_opt.minimize(
                 loss,
-                startup_program=startup_program,
-                parameter_list=parameter_list,
-                no_grad_set=no_grad_set,
+                startup_program,
+                parameter_list,
+                no_grad_set,
             )
             device = paddle.device.get_device()
             place = paddle.set_device(device)
diff --git a/python/paddle/distributed/fleet/meta_optimizers/recompute_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/recompute_optimizer.py
index 7c7fbecf7008dcaf65ca90747729254e38384683..41ee41b4bfd47a69ef2d5a61d978569c0730fba7 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/recompute_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/recompute_optimizer.py
@@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
-from paddle.fluid.optimizer import RecomputeOptimizer as RO
+from paddle.incubate.optimizer import RecomputeOptimizer as RO
 from .meta_optimizer_base import MetaOptimizerBase
diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py
index 2452f6882baed7780c4187714834033681f4a63a..ad126d5348633685fa6ffe40c1c05646206cb8fe 100755
--- a/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py
@@ -15,7 +15,7 @@ import os
 from paddle.fluid import core
-from paddle.fluid.optimizer import PipelineOptimizer
+from paddle.incubate.optimizer import PipelineOptimizer
 from paddle.static import (
     create_global_var,
     default_startup_program,
diff --git a/python/paddle/distributed/passes/auto_parallel_fp16.py b/python/paddle/distributed/passes/auto_parallel_fp16.py
index 3e3b4cce74c5631de6d505498b2bd642c72f367c..da80fb2a6373d15d251dfa6cb75252e6985f6000 100644
--- a/python/paddle/distributed/passes/auto_parallel_fp16.py
+++ b/python/paddle/distributed/passes/auto_parallel_fp16.py
@@ -917,7 +917,7 @@ class FP16Pass(AMPPass):
         if self.target_dtype == "fp16":
             if isinstance(
-                base_opt, (paddle.static.Adam, paddle.optimizer.AdamW)
+                base_opt, (paddle.optimizer.Adam, paddle.optimizer.AdamW)
             ):
                 with main_program._optimized_guard([]):
                     # found_inf = paddle.tensor.creation._memcpy(
diff --git a/python/paddle/distributed/transpiler/distribute_transpiler.py b/python/paddle/distributed/transpiler/distribute_transpiler.py
index a4447347e8aeab19c4cbdc8c04811297be0d69ce..38892c1a6e92eaad114a1ea68b153f218d2cef3a 100644
--- a/python/paddle/distributed/transpiler/distribute_transpiler.py
+++ b/python/paddle/distributed/transpiler/distribute_transpiler.py
@@ -295,7 +295,7 @@ class DistributeTranspiler:
             cost =paddle.nn.functional.square_error_cost(input=y_predict, label=y)
             avg_loss = paddle.mean(cost)
-            sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
+            sgd_optimizer = paddle.optimizer.SGD(learning_rate=0.001)
             sgd_optimizer.minimize(avg_loss)
             # for pserver mode
diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
index 3f27d310624d3f32e9a8018ffd9d1275b9a27206..c986d39aafe46a923797952491d6528d570103da 100755
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -58,27 +58,7 @@ from ..fluid.framework import (
     _current_expected_place,
 )
-__all__ = [
-    'SGD',
-    'Momentum',
-    'Adam',
-    'Dpsgd',
-    'DecayedAdagrad',
-    'Ftrl',
-    'SGDOptimizer',
-    'MomentumOptimizer',
-    'AdamOptimizer',
-    'DpsgdOptimizer',
-    'DecayedAdagradOptimizer',
-    'FtrlOptimizer',
-    'ModelAverage',
-    'LarsMomentum',
-    'LarsMomentumOptimizer',
-    'ExponentialMovingAverage',
-
'PipelineOptimizer', - 'LookaheadOptimizer', - 'RecomputeOptimizer', -] +__all__ = [] class Optimizer: @@ -202,7 +182,7 @@ class Optimizer: with fluid.dygraph.guard(): emb = paddle.nn.Embedding(10, 10) - adam = fluid.optimizer.Adam(0.001, parameter_list=emb.parameters()) + adam = paddle.optimizer.Adam(0.001, parameters=emb.parameters()) state_dict = adam.state_dict() ''' @@ -466,13 +446,13 @@ class Optimizer: with fluid.dygraph.guard(): linear = paddle.nn.Linear(10, 10) - adam = fluid.optimizer.Adam(0.1, parameter_list=linear.parameters()) + adam = paddle.optimizer.Adam(0.1, parameters=linear.parameters()) # set learning rate manually by python float value lr_list = [0.2, 0.3, 0.4, 0.5, 0.6] for i in range(5): adam.set_lr(lr_list[i]) - lr = adam.current_step_lr() + lr = adam.get_lr() print("current lr is {}".format(lr)) # Print: # current lr is 0.2 @@ -482,14 +462,6 @@ class Optimizer: # current lr is 0.6 - # set learning rate manually by framework Variable - lr_var = paddle.static.create_global_var( - shape=[1], value=0.7, dtype='float32') - adam.set_lr(lr_var) - lr = adam.current_step_lr() - print("current lr is {}".format(lr)) - # Print: - # current lr is 0.7 @@ -557,8 +529,8 @@ class Optimizer: # example1: LearningRateDecay is not used, return value is all the same with fluid.dygraph.guard(): emb = paddle.nn.Embedding(10, 10) - adam = fluid.optimizer.Adam(0.001, parameter_list = emb.parameters()) - lr = adam.current_step_lr() + adam = paddle.optimizer.Adam(0.001, parameters = emb.parameters()) + lr = adam.get_lr() print(lr) # 0.001 # example2: PiecewiseDecay is used, return the step learning rate @@ -1324,7 +1296,7 @@ class Optimizer: import paddle.fluid as fluid loss = network() - optimizer = fluid.optimizer.SGD(learning_rate=0.1) + optimizer = paddle.optimizer.SGD(learning_rate=0.1) params_grads = optimizer.backward(loss) # you may append operations for params_grads here # ... @@ -1415,8 +1387,8 @@ class Optimizer: a = fluid.dygraph.to_variable(value) linear = paddle.nn.Linear(13, 5) # This can be any optimizer supported by dygraph. - adam = fluid.optimizer.Adam(learning_rate = 0.01, - parameter_list = linear.parameters()) + adam = paddle.optimizer.Adam(learning_rate = 0.01, + parameters = linear.parameters()) out = linear(a) out.backward() adam.minimize(out) @@ -1474,5255 +1446,3 @@ class Optimizer: ) return optimize_ops, params_grads - - -class SGDOptimizer(Optimizer): - r""" - Optimizer of the stochastic gradient descent algorithm. - - .. math:: - - param\_out = param - learning\_rate * grad - - Parameters: - learning_rate (float|Variable): The learning rate used to update parameters. \ - Can be a float value or a Variable with one float value as data element. - parameter_list (Iterable, optional): Iterable of ``Variable`` names to update to minimize ``loss``. \ - This parameter is required in dygraph mode. \ - The default value is None in static graph mode, at this time all parameters will be updated. - regularization (WeightDecayRegularizer, optional): The strategy of regularization. There are two method: \ - :ref:`api_fluid_regularizer_L1Decay` , :ref:`api_fluid_regularizer_L2Decay` . If a parameter has set \ - regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \ - ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect. \ - Default None, meaning there is no regularization. 
- grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of - some derived class of ``GradientClipBase`` . There are three cliping strategies - ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , - :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping. - name (str, optional): This parameter is used by developers to print debugging information. \ - For details, please refer to :ref:`api_guide_Name`. Default is None. - - Examples: - .. code-block:: python - - import paddle - import paddle.fluid as fluid - import numpy as np - - paddle.enable_static() - place = fluid.CPUPlace() - main = fluid.Program() - with fluid.program_guard(main): - x = paddle.static.data(name='x', shape=[-1, 13], dtype='float32') - y = paddle.static.data(name='y', shape=[-1, 1], dtype='float32') - y_predict = paddle.static.nn.fc(x, size=1, activation=None) - cost = paddle.nn.functional.square_error_cost(input=y_predict, label=y) - avg_cost = paddle.mean(cost) - - sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001) - sgd_optimizer.minimize(avg_cost) - - fetch_list = [avg_cost] - train_reader = paddle.batch( - paddle.dataset.uci_housing.train(), batch_size=1) - feeder = fluid.DataFeeder(place=place, feed_list=[x, y]) - exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) - for data in train_reader(): - exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list) - - """ - - def __init__( - self, - learning_rate, - parameter_list=None, - regularization=None, - grad_clip=None, - multi_precision=False, - name=None, - ): - assert learning_rate is not None - super().__init__( - learning_rate=learning_rate, - parameter_list=parameter_list, - regularization=regularization, - grad_clip=grad_clip, - name=name, - ) - self.type = "sgd" - self._use_mkldnn = False - self._multi_precision = multi_precision - self._master_weights = {} - - def _create_accumulators(self, block, parameters): - assert isinstance(block, framework.Block) - if isinstance(parameters, dict): - parameters = self._update_param_group(parameters) - - # Create accumulator tensors for first and second moments - for p in parameters: - if self._multi_precision and self._is_dtype_fp16_or_bf16(p.dtype): - master_p = self._create_master_weight(p) - continue - if ( - self._is_dtype_fp16_or_bf16(p.dtype) - and not self._multi_precision - ): - warnings.warn( - "Accumulating with FP16/BF16 in optimizer can lead to poor accuracy or slow convergence." - "Consider using multi_precision=True option of the Adam optimizer." 
- ) - - @no_grad - def _append_optimize_op(self, block, param_and_grad): - find_master = self._multi_precision and self._is_dtype_fp16_or_bf16( - param_and_grad[0].dtype - ) - master_weight = ( - self._master_weights[param_and_grad[0].name] - if find_master - else None - ) - - lr = self._create_param_lr(param_and_grad) - if in_dygraph_mode(): - _C_ops.sgd_( - param_and_grad[0], - lr, - param_and_grad[1], - master_weight, - find_master, - ) - return None - else: - assert isinstance(block, framework.Block) - # create the optimize op - inputs = { - "Param": param_and_grad[0], - "Grad": param_and_grad[1], - "LearningRate": lr, - } - - outputs = {"ParamOut": param_and_grad[0]} - - attrs = {"multi_precision": find_master} - - if find_master: - inputs["MasterParam"] = master_weight - outputs["MasterParamOut"] = master_weight - - sgd_op = block.append_op( - type=self.type, - inputs=inputs, - outputs=outputs, - attrs=attrs, - stop_gradient=True, - ) - - return sgd_op - - -class MomentumOptimizer(Optimizer): - r""" - - Simple Momentum optimizer with velocity state - - This optimizer has a flag for Nestrov Momentum. - - The update equations are as follows: - - .. math:: - - & velocity = mu * velocity + gradient - - & if (use\_nesterov): - - &\quad param = param - (gradient + mu * velocity) * learning\_rate - - & else: - - &\quad param = param - learning\_rate * velocity - - Parameters: - learning_rate (float|Variable): The learning rate used to update parameters. \ - Can be a float value or a Variable with one float value as data element. - momentum (float): Momentum factor - parameter_list (Iterable, optional): Iterable of ``Variable`` names to update to minimize ``loss``. \ - This parameter is required in dygraph mode. \ - The default value is None in static graph mode, at this time all parameters will be updated. - use_nesterov (bool, optional): Enables Nesterov momentum, default is false. - regularization (WeightDecayRegularizer, optional): The strategy of regularization. There are two method: \ - :ref:`api_fluid_regularizer_L1Decay` , :ref:`api_fluid_regularizer_L2Decay` . If a parameter has set \ - regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \ - ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect. \ - Default None, meaning there is no regularization. - grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of - some derived class of ``GradientClipBase`` . There are three cliping strategies - ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , - :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping. - name (str, optional): This parameter is used by developers to print debugging information. \ - For details, please refer to :ref:`api_guide_Name`. Default is None. - - Examples: - .. 
code-block:: python - - import paddle - import paddle.fluid as fluid - import numpy as np - - paddle.enable_static() - place = fluid.CPUPlace() - main = fluid.Program() - with fluid.program_guard(main): - x = paddle.static.data(name='x', shape=[-1, 13], dtype='float32') - y = paddle.static.data(name='y', shape=[-1, 1], dtype='float32') - y_predict = paddle.static.nn.fc(x, size=1, activation=None) - cost = paddle.nn.functional.square_error_cost(input=y_predict, label=y) - avg_cost = paddle.mean(cost) - - moment_optimizer = fluid.optimizer.MomentumOptimizer(learning_rate=0.001, momentum=0.9) - moment_optimizer.minimize(avg_cost) - - fetch_list = [avg_cost] - train_reader = paddle.batch( - paddle.dataset.uci_housing.train(), batch_size=1) - feeder = fluid.DataFeeder(place=place, feed_list=[x, y]) - exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) - for data in train_reader(): - exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list) - - """ - _velocity_acc_str = "velocity" - - def __init__( - self, - learning_rate, - momentum, - parameter_list=None, - use_nesterov=False, - regularization=None, - grad_clip=None, - name=None, - ): - assert learning_rate is not None - assert momentum is not None - super().__init__( - learning_rate=learning_rate, - parameter_list=parameter_list, - regularization=regularization, - grad_clip=grad_clip, - name=name, - ) - self.type = "momentum" - self._momentum = momentum - self._use_nesterov = bool(use_nesterov) - - def _create_accumulators(self, block, parameters): - assert isinstance(block, framework.Block) - - for p in parameters: - self._add_accumulator(self._velocity_acc_str, p) - - def _append_optimize_op(self, block, param_and_grad): - assert isinstance(block, framework.Block) - - velocity_acc = self._get_accumulator( - self._velocity_acc_str, param_and_grad[0] - ) - lr = self._create_param_lr(param_and_grad) - master_weight = None - if in_dygraph_mode(): - _, _, _ = _legacy_C_ops.momentum( - param_and_grad[0], - param_and_grad[1], - velocity_acc, - lr, - master_weight, - param_and_grad[0], - velocity_acc, - master_weight, - 'mu', - self._momentum, - 'use_nesterov', - self._use_nesterov, - ) - return None - else: - attrs = {"mu": self._momentum, "use_nesterov": self._use_nesterov} - inputs = { - "Param": [param_and_grad[0]], - "Grad": [param_and_grad[1]], - "Velocity": [velocity_acc], - "LearningRate": [lr], - } - - outputs = { - "ParamOut": [param_and_grad[0]], - "VelocityOut": [velocity_acc], - } - # create the momentum optimize op - momentum_op = block.append_op( - type=self.type, - inputs=inputs, - outputs=outputs, - attrs=attrs, - stop_gradient=True, - ) - - return momentum_op - - -class LarsMomentumOptimizer(Optimizer): - r""" - Momentum optimizer with LARS support - - The update equations are as follows: - - .. math:: - - & local\_learning\_rate = learning\_rate * lars\_coeff * \\ - \\frac{||param||}{||gradient|| + lars\_weight\_decay * ||param||} - - & velocity = mu * velocity + local\_learning\_rate * (gradient + lars\_weight\_decay * param + epsilon) - - & param = param - velocity - - Parameters: - learning_rate (float|Variable): The learning rate used to update parameters. \ - Can be a float value or a Variable with one float value as data element. \ - momentum (float): momentum factor - lars_coeff (float): Defines how much we trust the layer to change its weights. - lars_weight_decay (float): Weight decay coefficient for decaying using LARS. 
- parameter_list (Iterable, optional): Iterable of ``Variable`` names to update to minimize ``loss``. \ - This parameter is required in dygraph mode. \ - The default value is None in static graph mode, at this time all parameters will be updated. - regularization (WeightDecayRegularizer, optional): The strategy of regularization. There are two method: \ - :ref:`api_fluid_regularizer_L1Decay` , :ref:`api_fluid_regularizer_L2Decay` . If a parameter has set \ - regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \ - ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect. \ - Default None, meaning there is no regularization. - grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of - some derived class of ``GradientClipBase`` . There are three cliping strategies - ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , - :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping. - name (str, optional): This parameter is used by developers to print debugging information. \ - For details, please refer to :ref:`api_guide_Name`. Default is None. - exclude_from_weight_decay (list[str], optional): Name string of layers which will be exclude from lars weight decay. Default is None. - epsilon (float, optional): Epsilon to avoid Division by Zero when calculate local lr. Default is 0. - multi_precision (bool, optional): Whether to use multi-precision during weight updating. - rescale_grad (float, optional): Multiply the gradient with `rescale_grad` \ - before updating. Often choose to be `1.0/batch_size`. - - Examples: - .. code-block:: python - - import paddle - import paddle.fluid as fluid - import numpy as np - - paddle.enable_static() - np_inp = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32) - inp = paddle.static.data( - name="inp", shape=[2, 2], dtype='float32') - out = paddle.static.nn.fc(inp, size=3) - out = paddle.sum(out) - optimizer = fluid.optimizer.LarsMomentumOptimizer(learning_rate=0.001, momentum=0.9) - optimizer.minimize(out) - - exe = fluid.Executor(fluid.CPUPlace()) - exe.run(fluid.default_startup_program()) - exe.run( - feed={"inp": np_inp}, - fetch_list=[out.name]) - """ - _velocity_acc_str = "velocity" - - def __init__( - self, - learning_rate, - momentum, - lars_coeff=0.001, - lars_weight_decay=0.0005, - parameter_list=None, - regularization=None, - grad_clip=None, - name=None, - exclude_from_weight_decay=None, - epsilon=0, - multi_precision=False, - rescale_grad=1.0, - ): - assert learning_rate is not None - assert momentum is not None - super().__init__( - learning_rate=learning_rate, - parameter_list=parameter_list, - regularization=regularization, - grad_clip=grad_clip, - name=name, - ) - self.type = "lars_momentum" - self._momentum = momentum - self._lars_coeff = float(lars_coeff) - self._lars_weight_decay = float(lars_weight_decay) - self._epsilon = float(epsilon) - if exclude_from_weight_decay is None: - self._exclude_from_weight_decay = [] - else: - self._exclude_from_weight_decay = exclude_from_weight_decay - self._multi_precision = multi_precision - self._rescale_grad = float(rescale_grad) - self._master_weights = {} - - def _create_accumulators(self, block, parameters): - assert isinstance(block, framework.Block) - - for p in parameters: - if self._multi_precision and self._is_dtype_fp16_or_bf16(p.dtype): - master_p = self._create_master_weight(p) - 
self._add_accumulator(self._velocity_acc_str, master_p) - continue - if ( - self._is_dtype_fp16_or_bf16(p.dtype) - and not self._multi_precision - ): - warnings.warn( - "Accumulating with FP16/BF16 in optimizer can lead to poor accuracy or slow convergence." - "Consider using multi_precision=True option of the Lars optimizer." - ) - self._add_accumulator(self._velocity_acc_str, p) - - def _append_optimize_op(self, block, param_and_grad): - assert isinstance(block, framework.Block) - _lars_weight_decay = self._lars_weight_decay - param_name = param_and_grad[0].name - if len(self._exclude_from_weight_decay) > 0: - for name in self._exclude_from_weight_decay: - if name in param_name: - _lars_weight_decay = 0.0 - break - - velocity_acc = self._get_accumulator_master( - self._velocity_acc_str, param_and_grad[0] - ) - lr = self._create_param_lr(param_and_grad) - - find_master = self._multi_precision and self._is_dtype_fp16_or_bf16( - param_and_grad[0].dtype - ) - master_weight = ( - self._master_weights[param_and_grad[0].name] - if find_master - else None - ) - - attrs = { - "mu": self._momentum, - "lars_coeff": self._lars_coeff, - "lars_weight_decay": [_lars_weight_decay], - "multi_precision": find_master, - "epsilon": self._epsilon, - "rescale_grad": self._rescale_grad, - } - - inputs = { - "Param": param_and_grad[0], - "Grad": param_and_grad[1], - "Velocity": velocity_acc, - "LearningRate": lr, - } - - outputs = {"ParamOut": param_and_grad[0], "VelocityOut": velocity_acc} - - if find_master: - inputs["MasterParam"] = master_weight - outputs["MasterParamOut"] = master_weight - - if in_dygraph_mode(): - tmp, tmp2 = _legacy_C_ops.lars_momentum( - [param_and_grad[0]], - [param_and_grad[1]], - [velocity_acc], - [lr], - [param_and_grad[0]], - [velocity_acc], - "mu", - self._momentum, - "lars_coeff", - self._lars_coeff, - "lars_weight_decay", - [_lars_weight_decay], - "multi_precision", - find_master, - "epsilon", - self._epsilon, - "rescale_grad", - self._rescale_grad, - ) - else: - # create the momentum optimize op - momentum_op = block.append_op( - type=self.type, - inputs=inputs, - outputs=outputs, - attrs=attrs, - stop_gradient=True, - ) - - return momentum_op - - -class AdamOptimizer(Optimizer): - r""" - The Adam optimizer uses an optimization described at the end - of section 2 of `Adam paper `_ , - it can dynamically adjusts the learning rate of each parameter using - the 1st moment estimates and the 2nd moment estimates of the gradient. - - The parameter ``param_out`` update rule with gradient ``grad``: - - .. math:: - - t & = t + 1 - - moment\_1\_out & = {\\beta}_1 * moment\_1 + (1 - {\\beta}_1) * grad - - moment\_2\_out & = {\\beta}_2 * moment\_2 + (1 - {\\beta}_2) * grad * grad - - learning\_rate & = learning\_rate * \\ - \\frac{\sqrt{1 - {\\beta}_2^t}}{1 - {\\beta}_1^t} - - param\_out & = param - learning\_rate * \\frac{moment\_1}{\sqrt{moment\_2} + \epsilon} - - Related paper: `Adam: A Method for Stochastic Optimization `_ - - Args: - learning_rate (float|Variable, optional): The learning rate used to update ``Parameter``. - It can be a float value or a ``Variable`` with a float type. The default value is 0.001. - beta1 (float|Variable, optional): The exponential decay rate for the 1st moment estimates. - It should be a float number or a Variable with shape [1] and data type as float32. - The default value is 0.9. - beta2 (float|Variable, optional): The exponential decay rate for the 2nd moment estimates. 
- It should be a float number or a Variable with shape [1] and data type as float32. - The default value is 0.999. - epsilon (float|Tensor, optional): A small float value for numerical stability. - It should be a float number or a Variable with shape [1] and data type as float32. - The default value is 1e-08. - parameter_list (Iterable, optional): Iterable of ``Variable`` names to update to minimize ``loss``. \ - This parameter is required in dygraph mode. \ - The default value is None in static graph mode, at this time all parameters will be updated. - regularization (WeightDecayRegularizer, optional): The strategy of regularization. There are two method: \ - :ref:`api_fluid_regularizer_L1Decay` , :ref:`api_fluid_regularizer_L2Decay` . If a parameter has set \ - regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \ - ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect. \ - Default None, meaning there is no regularization. - grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of - some derived class of ``GradientClipBase`` . There are three cliping strategies - ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , - :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping. - name (str, optional): Normally there is no need for user to set this property. - For more information, please refer to :ref:`api_guide_Name`. - The default value is None. - lazy_mode (bool, optional): The official Adam algorithm has two moving-average accumulators. - The accumulators are updated at every step. Every element of the two moving-average - is updated in both dense mode and sparse mode. If the size of parameter is very large, - then the update may be very slow. The lazy mode only update the element that has - gradient in current mini-batch, so it will be much more faster. But this mode has - different semantics with the original Adam algorithm and may lead to different result. - The default value is False. - use_global_beta_pow (bool, optional): Whether to use global beta_pow. If true, Adam will use global beta_pow - for whole model instead of creating beta_pow for each parameter. Default is false. - flatten_param_grads (bool, optional): Whether to flatten all parameters and gradients. Default is false. - align_size (int, optional): The alignment size when flatten parameters and gradients. Default is -1, which means - use same align_size as allocator. - - Examples: - .. code-block:: python - - import paddle - import paddle.fluid as fluid - - paddle.enable_static() - place = fluid.CPUPlace() - main = fluid.Program() - with fluid.program_guard(main): - x = paddle.static.data(name='x', shape=[None, 13], dtype='float32') - y = paddle.static.data(name='y', shape=[None, 1], dtype='float32') - y_predict = paddle.static.nn.fc(x, size=1, activation=None) - cost = paddle.nn.functional.square_error_cost(input=y_predict, label=y) - avg_cost = paddle.mean(cost) - - adam_optimizer = fluid.optimizer.AdamOptimizer(0.01) - adam_optimizer.minimize(avg_cost) - - fetch_list = [avg_cost] - train_reader = paddle.batch( - paddle.dataset.uci_housing.train(), batch_size=1) - feeder = fluid.DataFeeder(place=place, feed_list=[x, y]) - exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) - for data in train_reader(): - exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list) - - .. 
code-block:: python - - # Adam with beta1/beta2 as Variable - import paddle - import paddle.fluid as fluid - import paddle.fluid.layers.learning_rate_scheduler as lr_scheduler - - paddle.enable_static() - place = fluid.CPUPlace() - main = fluid.Program() - with fluid.program_guard(main): - x = paddle.static.data(name='x', shape=[None, 13], dtype='float32') - y = paddle.static.data(name='y', shape=[None, 1], dtype='float32') - y_predict = paddle.static.nn.fc(x, size=1, activation=None) - cost = paddle.nn.functional.square_error_cost(input=y_predict, label=y) - avg_cost = paddle.mean(cost) - - # define beta decay variable - def get_decayed_betas(beta1_init, beta2_init, decay_steps, decay_rate, epsilon_init): - global_step = lr_scheduler._decay_step_counter() - - beta1 = paddle.static.create_global_var( - shape=[1], - value=float(beta1_init), - dtype='float32', - # set persistable for save checkpoints and resume - persistable=True, - name="beta1") - beta2 = paddle.static.create_global_var( - shape=[1], - value=float(beta2_init), - dtype='float32', - # set persistable for save checkpoints and resume - persistable=True, - name="beta2") - epsilon = paddle.static.create_global_var( - shape=[1], - value=float(epsilon_init), - dtype='float32', - # set persistable for save checkpoints and resume - persistable=True, - name="epsilon") - - div_res = global_step / decay_steps - decayed_beta1 = beta1_init * (decay_rate**div_res) - decayed_beta2 = beta2_init * (decay_rate**div_res) - paddle.assign(decayed_beta1, beta1) - paddle.assign(decayed_beta2, beta2) - - return beta1, beta2, epsilon - - beta1, beta2, epsilon = get_decayed_betas(0.9, 0.99, 1e5, 0.9, 1e-8) - adam_optimizer = fluid.optimizer.AdamOptimizer( - learning_rate=0.01, - beta1=beta1, - beta2=beta2, - epsilon=epsilon) - adam_optimizer.minimize(avg_cost) - - fetch_list = [avg_cost] - train_reader = paddle.batch( - paddle.dataset.uci_housing.train(), batch_size=1) - feeder = fluid.DataFeeder(place=place, feed_list=[x, y]) - exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) - for data in train_reader(): - exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list) - """ - _moment1_acc_str = "moment1" - _moment2_acc_str = "moment2" - _beta1_pow_acc_str = "beta1_pow_acc" - _beta2_pow_acc_str = "beta2_pow_acc" - - def __init__( - self, - learning_rate=0.001, - beta1=0.9, - beta2=0.999, - epsilon=1e-8, - parameter_list=None, - regularization=None, - grad_clip=None, - name=None, - lazy_mode=False, - use_global_beta_pow=False, - flatten_param_grads=False, - align_size=-1, - ): - assert learning_rate is not None - assert beta1 is not None - assert beta2 is not None - assert epsilon is not None - super().__init__( - learning_rate=learning_rate, - parameter_list=parameter_list, - regularization=regularization, - grad_clip=grad_clip, - flatten_param_grads=flatten_param_grads, - align_size=align_size, - name=name, - ) - self.type = "adam" - self._beta1 = beta1 - self._beta2 = beta2 - self._epsilon = epsilon - self._lazy_mode = lazy_mode - self._use_global_beta_pow = use_global_beta_pow - - def _create_accumulators(self, block, parameters): - assert isinstance(block, framework.Block) - - # Create accumulator tensors for first and second moments - for p in parameters: - self._add_accumulator(self._moment1_acc_str, p) - self._add_accumulator(self._moment2_acc_str, p) - if not self._use_global_beta_pow: - self._add_accumulator( - name=self._beta1_pow_acc_str, - param=p, - fill_value=0.9 - if isinstance(self._beta1, Variable) - else 
self._beta1, - shape=[1], - type=core.VarDesc.VarType.LOD_TENSOR, - device='cpu', - ) - self._add_accumulator( - name=self._beta2_pow_acc_str, - param=p, - fill_value=0.999 - if isinstance(self._beta2, Variable) - else self._beta2, - shape=[1], - type=core.VarDesc.VarType.LOD_TENSOR, - device='cpu', - ) - if self._use_global_beta_pow: - self._add_global_accumulator( - name=self._beta1_pow_acc_str, - fill_value=0.9 - if isinstance(self._beta1, Variable) - else self._beta1, - shape=[1], - type=core.VarDesc.VarType.LOD_TENSOR, - device='cpu', - ) - self._add_global_accumulator( - name=self._beta2_pow_acc_str, - fill_value=0.999 - if isinstance(self._beta2, Variable) - else self._beta2, - shape=[1], - type=core.VarDesc.VarType.LOD_TENSOR, - device='cpu', - ) - - def _append_optimize_op(self, block, param_and_grad): - assert isinstance(block, framework.Block) - - moment1 = self._get_accumulator( - self._moment1_acc_str, param_and_grad[0] - ) - moment2 = self._get_accumulator( - self._moment2_acc_str, param_and_grad[0] - ) - if self._use_global_beta_pow: - beta1_pow_acc = self._get_global_accumulator( - self._beta1_pow_acc_str - ) - beta2_pow_acc = self._get_global_accumulator( - self._beta2_pow_acc_str - ) - else: - beta1_pow_acc = self._get_accumulator( - self._beta1_pow_acc_str, param_and_grad[0] - ) - beta2_pow_acc = self._get_accumulator( - self._beta2_pow_acc_str, param_and_grad[0] - ) - lr = self._create_param_lr(param_and_grad) - # create the adam optimize op - - if in_dygraph_mode(): - _beta1 = ( - self._beta1 - if not isinstance(self._beta1, Variable) - else self._beta1.item(0) - ) - _beta2 = ( - self._beta2 - if not isinstance(self._beta2, Variable) - else self._beta2.item(0) - ) - master_weight = None - _, _, _, _, _, _ = _legacy_C_ops.adam( - param_and_grad[0], - param_and_grad[1], - lr, - moment1, - moment2, - beta1_pow_acc, - beta2_pow_acc, - master_weight, - param_and_grad[0], - moment1, - moment2, - beta1_pow_acc, - beta2_pow_acc, - master_weight, - 'epsilon', - self._epsilon, - 'lazy_mode', - self._lazy_mode, - 'min_row_size_to_use_multithread', - 1000, - 'beta1', - _beta1, - 'beta2', - _beta2, - 'use_global_beta_pow', - self._use_global_beta_pow, - ) - - return None - - inputs = { - "Param": [param_and_grad[0]], - "Grad": [param_and_grad[1]], - "LearningRate": [lr], - "Moment1": [moment1], - "Moment2": [moment2], - "Beta1Pow": [beta1_pow_acc], - "Beta2Pow": [beta2_pow_acc], - } - - # Pass found_inf to adam, to skip update for not only param, but also momentum and beta_pow - found_inf = self._get_auxiliary_var('found_inf') - - if found_inf: - inputs['SkipUpdate'] = found_inf - - outputs = { - "ParamOut": [param_and_grad[0]], - "Moment1Out": [moment1], - "Moment2Out": [moment2], - "Beta1PowOut": [beta1_pow_acc], - "Beta2PowOut": [beta2_pow_acc], - } - attrs = { - "lazy_mode": self._lazy_mode, - "min_row_size_to_use_multithread": 1000, - 'use_global_beta_pow': self._use_global_beta_pow, - } - - if isinstance(self._beta1, Variable): - inputs['Beta1Tensor'] = self._beta1 - else: - attrs['beta1'] = self._beta1 - if isinstance(self._beta2, Variable): - inputs['Beta2Tensor'] = self._beta2 - else: - attrs['beta2'] = self._beta2 - if isinstance(self._epsilon, Variable): - inputs['EpsilonTensor'] = self._epsilon - else: - attrs['epsilon'] = self._epsilon - - adam_op = block.append_op( - type=self.type, - inputs=inputs, - outputs=outputs, - attrs=attrs, - stop_gradient=True, - ) - - return adam_op - - def _finish_update(self, block, parameters_and_grads): - r"""Update beta1_pow and 
beta2_pow accumulator""" - assert isinstance(block, framework.Block) - if self._use_global_beta_pow: - beta1_pow_acc = self._get_global_accumulator( - self._beta1_pow_acc_str - ) - beta2_pow_acc = self._get_global_accumulator( - self._beta2_pow_acc_str - ) - - with block.program._optimized_guard([]): - inputs = {"X": beta1_pow_acc} - outputs = {"Out": beta1_pow_acc} - attrs = {} - if isinstance(self._beta1, Variable): - inputs["Y"] = self._beta1 - # use elementwise_mul for better performance - block.append_op( - type="elementwise_mul", - inputs=inputs, - outputs=outputs, - attrs=attrs, - stop_gradient=True, - ) - else: - attrs['scale'] = self._beta1 - block.append_op( - type="scale", - inputs=inputs, - outputs=outputs, - attrs=attrs, - stop_gradient=True, - ) - - inputs = {"X": beta2_pow_acc} - outputs = {"Out": beta2_pow_acc} - attrs = {} - if isinstance(self._beta2, Variable): - inputs["Y"] = self._beta2 - # use elementwise_mul for better performance - block.append_op( - type="elementwise_mul", - inputs=inputs, - outputs=outputs, - attrs=attrs, - stop_gradient=True, - ) - else: - attrs['scale'] = self._beta2 - block.append_op( - type="scale", - inputs=inputs, - outputs=outputs, - attrs=attrs, - stop_gradient=True, - ) - - -class DpsgdOptimizer(Optimizer): - r""" - We implement the Dpsgd optimizer according to CCS16 paper - - Deep Learning with Differential Privacy. - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - import numpy - import paddle - paddle.enable_static() - - # First create the Executor. - place = fluid.CPUPlace() # fluid.CUDAPlace(0) - exe = fluid.Executor(place) - - train_program = fluid.Program() - startup_program = fluid.Program() - with fluid.program_guard(train_program, startup_program): - data = paddle.static.data(name='X', shape=[-1,1], dtype='float32') - hidden = paddle.static.nn.fc(x=data, size=10) - loss = paddle.mean(hidden) - optimizer = fluid.optimizer.Dpsgd(learning_rate=0.01, clip=10.0, batch_size=16.0, sigma=1.0) - optimizer.minimize(loss) - - # Run the startup program once and only once. - exe.run(startup_program) - - x = numpy.random.random(size=(10, 1)).astype('float32') - outs = exe.run(program=train_program, - feed={'X': x}, - fetch_list=[loss.name]) - - Args: - learning_rate (float|Variable): the learning rate used to update parameters. \ - Can be a float value or a Variable with one float value as data element. - clip (float): clipping threshold - batch_size (float): batch size. - sigma (float): for gaussian noise. - parameter_list (Iterable, optional): Iterable of ``Variable`` names to update to minimize ``loss``. \ - This parameter is required in dygraph mode. \ - The default value is None in static graph mode, at this time all parameters will be updated. - Notes: - Currently, DpsgdOptimizer doesn't support sparse parameter optimization. - """ - - def __init__( - self, - learning_rate=0.001, - clip=0.9, - batch_size=0.999, - sigma=1e-8, - parameter_list=None, - ): - assert learning_rate is not None - assert clip is not None - assert batch_size is not None - assert sigma is not None - super().__init__( - learning_rate=learning_rate, parameter_list=parameter_list - ) - self.type = "dpsgd" - self._clip = clip - self._batch_size = batch_size - self._sigma = sigma - ''' - Note(wangzhongpu): - This property is only used for debugging, do not need to set it! - Dpsgd operator use time(NULL) as random seed to generate random number. - However, during debugging, we need determinated result, so we will set self._seed to a fixed number. 
- ''' - self._seed = None - - def _append_optimize_op(self, block, param_and_grad): - assert isinstance(block, framework.Block) - - # create the dpsgd optimize op - if self._seed is None: - self._seed = 0 - - if in_dygraph_mode(): - _legacy_C_ops.dpsgd( - param_and_grad[0], - param_and_grad[1], - self._create_param_lr(param_and_grad), - param_and_grad[0], - "clip", - self._clip, - "batch_size", - self._batch_size, - "sigma", - self._sigma, - "seed", - self._seed, - ) - else: - dpsgd_op = block.append_op( - type=self.type, - inputs={ - "Param": param_and_grad[0], - "Grad": param_and_grad[1], - "LearningRate": self._create_param_lr(param_and_grad), - }, - outputs={"ParamOut": param_and_grad[0]}, - attrs={ - "clip": self._clip, - "batch_size": self._batch_size, - "sigma": self._sigma, - "seed": self._seed, - }, - stop_gradient=True, - ) - - return dpsgd_op - - -class DecayedAdagradOptimizer(Optimizer): - r""" - The Decayed Adagrad optimizer can be seen as an Adagrad algorithm that introduces - the decay rate to solve the problem of a sharp drop in the learning rate - during model training when using the AdagradOptimizer. - - The parameter ``param_out`` update rule with gradient ``grad``: - - .. math:: - - moment\_out & = decay * moment + (1 - decay) * grad * grad - - param\_out & = param - \\frac{learning\_rate * grad}{\sqrt{moment\_out} + \epsilon} - - Related paper: `Adaptive Subgradient Methods for Online Learning and Stochastic - Optimization `_. - - The original paper does not have an ``epsilon`` attribute. It is added here for numerical - stability to avoid the division by zero error. - - Args: - learning_rate (float|Variable): The learning rate used to update ``Parameter``. - It can be a float value or a ``Variable`` with a float type. - decay (float, optional): The decay rate. The default value is 0.95. - epsilon (float, optional): A small float value for numerical stability. - The default value is 1e-06. - parameter_list (Iterable, optional): Iterable of ``Variable`` names to update to minimize ``loss``. \ - This parameter is required in dygraph mode. \ - The default value is None in static graph mode, at this time all parameters will be updated. - regularization (WeightDecayRegularizer, optional): The strategy of regularization. There are two method: \ - :ref:`api_fluid_regularizer_L1Decay` , :ref:`api_fluid_regularizer_L2Decay` . If a parameter has set \ - regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \ - ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect. \ - Default None, meaning there is no regularization. - grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of - some derived class of ``GradientClipBase`` . There are three cliping strategies - ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , - :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping. - name (str, optional): Normally there is no need for user to set this property. - For more information, please refer to :ref:`api_guide_Name`. - The default value is None. - - **Notes**: - **Currently, DecayedAdagradOptimizer doesn't support sparse parameter optimization.** - - Examples: - .. 
code-block:: python - - import paddle - import paddle.fluid as fluid - - paddle.enable_static() - x = paddle.static.data(name='x', shape=[None, 10], dtype='float32') - trans = paddle.static.nn.fc(x, 100) - cost = paddle.mean(trans) - optimizer = fluid.optimizer.DecayedAdagradOptimizer(learning_rate=0.2) - optimizer.minimize(cost) - """ - _moment_acc_str = "moment" - - def __init__( - self, - learning_rate, - decay=0.95, - epsilon=1.0e-6, - parameter_list=None, - regularization=None, - grad_clip=None, - name=None, - ): - assert learning_rate is not None - assert decay is not None - assert epsilon is not None - - super().__init__( - learning_rate=learning_rate, - parameter_list=parameter_list, - regularization=regularization, - grad_clip=grad_clip, - name=name, - ) - self.type = "decayed_adagrad" - self._decay = decay - self._epsilon = epsilon - - def _create_accumulators(self, block, parameters): - assert isinstance(block, framework.Block) - - for p in parameters: - self._add_accumulator(self._moment_acc_str, p) - - def _append_optimize_op(self, block, param_and_grad): - assert isinstance(block, framework.Block) - - moment_acc = self._get_accumulator( - self._moment_acc_str, param_and_grad[0] - ) - - if in_dygraph_mode(): - _legacy_C_ops.decayed_adagrad( - param_and_grad[0], - param_and_grad[1], - moment_acc, - self._create_param_lr(param_and_grad), - param_and_grad[0], - moment_acc, - "epsilon", - self._epsilon, - "decay", - self._decay, - ) - else: - # Create the decayed adagrad optimizer op - decayed_adagrad_op = block.append_op( - type=self.type, - inputs={ - "Param": param_and_grad[0], - "Grad": param_and_grad[1], - "Moment": moment_acc, - "LearningRate": self._create_param_lr(param_and_grad), - }, - outputs={ - "ParamOut": param_and_grad[0], - "MomentOut": moment_acc, - }, - attrs={"epsilon": self._epsilon, "decay": self._decay}, - stop_gradient=True, - ) - - return decayed_adagrad_op - - -class FtrlOptimizer(Optimizer): - r""" - FTRL (Follow The Regularized Leader) Optimizer. - - The paper that proposed Follow The Regularized Leader (FTRL): - (https://www.eecs.tufts.edu/~dsculley/papers/ad-click-prediction.pdf) - - .. math:: - - &new\_accum = squared\_accum + grad^2 - - &if (lr\_power == -0.5): - - &\quad linear\_accum += grad - \\frac{\\sqrt{new\_accum} - \\sqrt{squared\_accum}}{learning\_rate * param} - - &else: - - &\quad linear\_accum += grad - \\frac{new\_accum^{-lr\_power} - accum^{-lr\_power}}{learning\_rate * param} - - - &x = l1 * sign(linear\_accum) - linear\_accum - - &if (lr\_power == -0.5): - - &\quad y = \\frac{\\sqrt{new\_accum}}{learning\_rate} + (2 * l2) - - &\quad pre\_shrink = \\frac{x}{y} - - &\quad param = (abs(linear\_accum) > l1).select(pre\_shrink, 0.0) - - &else: - - &\quad y = \\frac{new\_accum^{-lr\_power}}{learning\_rate} + (2 * l2) - - &\quad pre\_shrink = \\frac{x}{y} - - &\quad param = (abs(linear\_accum) > l1).select(pre\_shrink, 0.0) - - &squared\_accum += grad^2 - - Parameters: - learning_rate (float|Variable): Global learning rate. - l1 (float): L1 regularization strength, default is 0.0. - l2 (float): L2 regularization strength, default is 0.0. - lr_power (float): Learning Rate Power, default is -0.5. - parameter_list (Iterable, optional): Iterable of ``Variable`` names to update to minimize ``loss``. \ - This parameter is required in dygraph mode. \ - The default value is None in static graph mode, at this time all parameters will be updated. - regularization (WeightDecayRegularizer, optional): The strategy of regularization. 
There are two method: \ - :ref:`api_fluid_regularizer_L1Decay` , :ref:`api_fluid_regularizer_L2Decay` . If a parameter has set \ - regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \ - ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect. \ - Default None, meaning there is no regularization. - grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of - some derived class of ``GradientClipBase`` . There are three cliping strategies - ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , - :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping. - name (str, optional): This parameter is used by developers to print debugging information. \ - For details, please refer to :ref:`api_guide_Name`. Default is None. - - Raises: - ValueError: If learning_rate, rho, epsilon, momentum are None. - - Examples: - .. code-block:: python - - import paddle - import paddle.fluid as fluid - import numpy as np - - paddle.enable_static() - - place = fluid.CPUPlace() - main = fluid.Program() - with fluid.program_guard(main): - x = paddle.static.data(name='x', shape=[-1, 13], dtype='float32') - y = paddle.static.data(name='y', shape=[-1, 1], dtype='float32') - y_predict = paddle.static.nn.fc(x, size=1, activation=None) - cost = paddle.nn.functional.square_error_cost(input=y_predict, label=y) - avg_cost = paddle.mean(cost) - - ftrl_optimizer = fluid.optimizer.Ftrl(learning_rate=0.1) - ftrl_optimizer.minimize(avg_cost) - - fetch_list = [avg_cost] - train_reader = paddle.batch( - paddle.dataset.uci_housing.train(), batch_size=1) - feeder = fluid.DataFeeder(place=place, feed_list=[x, y]) - exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) - for data in train_reader(): - exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list) - - NOTE: - Currently, FtrlOptimizer doesn't support sparse parameter optimization. 
- """ - - _squared_acc_str = "squared" - _linear_acc_str = "linear" - - def __init__( - self, - learning_rate, - l1=0.0, - l2=0.0, - lr_power=-0.5, - parameter_list=None, - regularization=None, - grad_clip=None, - name=None, - ): - super().__init__( - learning_rate=learning_rate, - parameter_list=parameter_list, - regularization=regularization, - grad_clip=grad_clip, - name=name, - ) - if learning_rate is None: - raise ValueError("learning_rate is not set.") - - self.type = "ftrl" - self._l1 = l1 - self._l2 = l2 - self._lr_power = lr_power - - def _create_accumulators(self, block, parameters): - if not isinstance(block, framework.Block): - raise TypeError("block is not instance of framework.Block.") - - for p in parameters: - self._add_accumulator(self._squared_acc_str, p) - self._add_accumulator(self._linear_acc_str, p) - - def _append_optimize_op(self, block, param_and_grad): - if not isinstance(block, framework.Block): - raise TypeError("block is not instance of framework.Block.") - - squared_acc = self._get_accumulator( - self._squared_acc_str, param_and_grad[0] - ) - linear_acc = self._get_accumulator( - self._linear_acc_str, param_and_grad[0] - ) - if in_dygraph_mode(): - _legacy_C_ops.ftrl( - param_and_grad[0], - squared_acc, - linear_acc, - param_and_grad[1], - self._create_param_lr(param_and_grad), - param_and_grad[0], - squared_acc, - linear_acc, - "l1", - self._l1, - "l2", - self._l2, - "lr_power", - self._lr_power, - ) - - else: - ftrl_op = block.append_op( - type=self.type, - inputs={ - "Param": param_and_grad[0], - "Grad": param_and_grad[1], - "SquaredAccumulator": squared_acc, - "LinearAccumulator": linear_acc, - "LearningRate": self._create_param_lr(param_and_grad), - }, - outputs={ - "ParamOut": param_and_grad[0], - "SquaredAccumOut": squared_acc, - "LinearAccumOut": linear_acc, - }, - attrs={ - "l1": self._l1, - "l2": self._l2, - "lr_power": self._lr_power, - }, - stop_gradient=True, - ) - - return ftrl_op - - -# We short the class name, since users will use the optimizer with the package -# name. The sample code: -# -# import paddle.fluid as fluid -# -# sgd = fluid.optimizer.SGD(...) -# -# It is no need to add an `Optimizer` as the class suffix -SGD = SGDOptimizer -Momentum = MomentumOptimizer -Adam = AdamOptimizer -Dpsgd = DpsgdOptimizer -DecayedAdagrad = DecayedAdagradOptimizer -Ftrl = FtrlOptimizer -LarsMomentum = LarsMomentumOptimizer - - -class ModelAverage(Optimizer): - r""" - :api_attr: Static Graph - - The ModelAverage optimizer accumulates specific continuous historical parameters - during training. The accumulated historical range can be controlled by the passed - ``average_window_rate`` argument. The averaged ``Parameter`` are used in the prediction, - which usually can improve the accuracy of the prediction. - - Accumulate the average of the ``Parameter`` in the sliding window, the result will be saved - in a temporary variable, can be applied to the current model's ``Parameter`` by calling - the ``apply()`` method, and the current model ``Parameter`` can be restored by calling - the ``restore()`` method. - - The window size for calculating the average is determined by ``average_window_rate``, - ``min_average_window``, ``max_average_window`` and the current ``Parameter`` update times (num_updates). - - When the cumulative times (num_accumulates) is greater than the specific window - threshold (average_window), the accumulated ``Parameter`` temporary variable is set to 0.0. 
- The following example will help to understand the role of these arguments: - - :: - - if num_accumulates >= min_average_window and num_accumulates >= min(max_average_window, num_updates * average_window_rate): - num_accumulates = 0 - - In the above conditional judgment statement, ``num_accumulates`` indicates the current - accumulated number, which can be abstractly understood as the length of the cumulative window. - The length of the window must be at least the length set by the ``min_average_window`` argument, - and cannot exceed the length specified by the ``max_average_window`` argument or - ``num_updates * average_window_rate``, where ``num_updates`` indicates the current ``Parameter`` - update times, ``average_window_rate`` is a coefficient that calculates the length of the window. - - Args: - average_window_rate (float): The calculate ratio of the window length relative to ``Parameter`` update times. - min_average_window (int, optional): the minimum size of average window length. The default value is 10000. - max_average_window (int, optional): The maximum size of average window length. The default value is 10000. - regularization (WeightDecayRegularizer, optional): The strategy of regularization. There are two method: \ - :ref:`api_fluid_regularizer_L1Decay` , :ref:`api_fluid_regularizer_L2Decay` . If a parameter has set \ - regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \ - ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect. \ - Default None, meaning there is no regularization. - name (str, optional): Normally there is no need for user to set this property. - For more information, please refer to :ref:`api_guide_Name`. - The default value is None. - - Examples: - - .. code-block:: python - - import paddle - import paddle.fluid as fluid - import numpy - paddle.enable_static() - - # First create the Executor. 
- place = fluid.CPUPlace() # fluid.CUDAPlace(0) - exe = fluid.Executor(place) - - train_program = fluid.Program() - startup_program = fluid.Program() - with fluid.program_guard(train_program, startup_program): - # build net - data = paddle.static.data(name='X', shape=[None, 1], dtype='float32') - hidden = paddle.static.nn.fc(x=data, size=10) - loss = paddle.mean(hidden) - optimizer = fluid.optimizer.Momentum(learning_rate=0.2, momentum=0.1) - optimizer.minimize(loss) - - # build ModelAverage optimizer - model_average = fluid.optimizer.ModelAverage(0.15, - min_average_window=10000, - max_average_window=12500) - - exe.run(startup_program) - for i in range(12500): - x = numpy.random.random(size=(10, 1)).astype('float32') - outs = exe.run(program=train_program, - feed={'X': x}, - fetch_list=[loss.name]) - - # apply ModelAverage - with model_average.apply(exe): - x = numpy.random.random(size=(10, 1)).astype('float32') - exe.run(program=train_program, - feed={'X': x}, - fetch_list=[loss.name]) - """ - - def __init__( - self, - average_window_rate, - min_average_window=10000, - max_average_window=10000, - regularization=None, - name=None, - ): - if in_dygraph_mode(): - raise Exception("In dygraph, don't support ModelAverage.") - super().__init__(0.0, regularization=regularization, name=name) - self.average_window = average_window_rate - self.min_average_window = min_average_window - self.max_average_window = max_average_window - - self.params_grads = [] - for param in ( - framework.default_main_program().global_block().all_parameters() - ): - if param.do_model_average != False: - grad = param.block.create_var( - name=unique_name.generate_with_ignorable_key( - ".".join([param.name, 'tmp']) - ), - dtype=param.dtype, - persistable=False, - stop_gradient=True, - ) - self.params_grads.append((param, grad)) - - for param, grad in self.params_grads: - if grad is None: - continue - with param.block.program._optimized_guard( - [param, grad] - ), name_scope('move_average'): - self._append_average_accumulate_op(param) - - self.apply_program = Program() - block = self.apply_program.global_block() - with program_guard(main_program=self.apply_program): - for param_grad in self.params_grads: - self._add_average_apply_op(block, param_grad) - - self.restore_program = Program() - block = self.restore_program.global_block() - with program_guard(main_program=self.restore_program): - for param_grad in self.params_grads: - self._add_average_restore_op(block, param_grad) - - def _add_average_apply_op(self, block, param_grad): - param = block._clone_variable(param_grad[0]) - grad = block._clone_variable(param_grad[1]) - sum_1 = block._clone_variable(self._get_accumulator('sum_1', param)) - sum_2 = block._clone_variable(self._get_accumulator('sum_2', param)) - sum_3 = block._clone_variable(self._get_accumulator('sum_3', param)) - num_accumulates = block._clone_variable( - self._get_accumulator('num_accumulates', param) - ) - old_num_accumulates = block._clone_variable( - self._get_accumulator('old_num_accumulates', param) - ) - num_updates = block._clone_variable( - self._get_accumulator('num_updates', param) - ) - # backup param value to grad - paddle.assign(param, output=grad) - # param = (sum_1 + sum_2 + sum_3) / (num_accumulates + old_num_accumulates) - tmp = paddle.add_n([num_accumulates, old_num_accumulates]) - sum = paddle.add_n([sum_1, sum_2, sum_3]) - tmp = paddle.cast( - x=tmp, dtype='float32' if self._dtype is None else self._dtype - ) - sum = paddle.cast( - x=sum, dtype='float32' if self._dtype is None else 
self._dtype - ) - paddle.assign(paddle.divide(sum, tmp), output=param) - - def _add_average_restore_op(self, block, param_grad): - param = block._clone_variable(param_grad[0]) - grad = block._clone_variable(param_grad[1]) - paddle.assign(grad, output=param) - - def _append_average_accumulate_op(self, param): - self.helper = LayerHelper("average_accumulate") - sum_1 = self._add_accumulator('sum_1', param) - sum_2 = self._add_accumulator('sum_2', param) - sum_3 = self._add_accumulator('sum_3', param) - num_accumulates = self._add_accumulator( - 'num_accumulates', param, dtype='int64', shape=[1] - ) - old_num_accumulates = self._add_accumulator( - 'old_num_accumulates', param, dtype='int64', shape=[1] - ) - num_updates = self._add_accumulator( - 'num_updates', param, dtype='int64', shape=[1] - ) - - self.helper.append_op( - type='average_accumulates', - inputs={ - "param": param, - "in_sum_1": sum_1, - "in_sum_2": sum_2, - "in_sum_3": sum_3, - "in_num_accumulates": num_accumulates, - "in_old_num_accumulates": old_num_accumulates, - "in_num_updates": num_updates, - }, - outputs={ - "out_sum_1": sum_1, - "out_sum_2": sum_2, - "out_sum_3": sum_3, - "out_num_accumulates": num_accumulates, - "out_old_num_accumulates": old_num_accumulates, - "out_num_updates": num_updates, - }, - attrs={ - "average_window": self.average_window, - "min_average_window": self.min_average_window, - "max_average_window": self.max_average_window, - }, - stop_gradient=True, - ) - - @signature_safe_contextmanager - def apply(self, executor, need_restore=True): - """ - Apply the average of the cumulative ``Parameter`` to the parameters of the current model. - - Args: - executor(fluid.Executor): The current network executor. - need_restore(bool): Restore flag variable, if set to True, the network will restore - the parameters of the network to the default value, if set to False, - it will not be restored. The default value is True. - - Examples: - - .. code-block:: python - - import paddle.fluid as fluid - import numpy - import paddle - paddle.enable_static() - - # First create the Executor. - place = fluid.CPUPlace() # fluid.CUDAPlace(0) - exe = fluid.Executor(place) - - train_program = fluid.Program() - startup_program = fluid.Program() - with fluid.program_guard(train_program, startup_program): - # build net - data = paddle.static.data(name='X', shape=[None, 1], dtype='float32') - hidden = paddle.static.nn.fc(x=data, size=10) - loss = paddle.mean(hidden) - optimizer = fluid.optimizer.Momentum(learning_rate=0.2, momentum=0.1) - optimizer.minimize(loss) - - # build ModelAverage optimizer - model_average = fluid.optimizer.ModelAverage(0.15, - min_average_window=10000, - max_average_window=12500) - - exe.run(startup_program) - for i in range(12500): - x = numpy.random.random(size=(10, 1)).astype('float32') - outs = exe.run(program=train_program, - feed={'X': x}, - fetch_list=[loss.name]) - - # apply ModelAverage - with model_average.apply(exe): - x = numpy.random.random(size=(10, 1)).astype('float32') - exe.run(program=train_program, - feed={'X': x}, - fetch_list=[loss.name]) - """ - executor.run(self.apply_program) - try: - yield - finally: - if need_restore: - self.restore(executor) - - def restore(self, executor): - """ - Restore ``Parameter`` values of current model. - - Args: - executor(fluid.Executor): The current network executor. - - Examples: - - .. code-block:: python - - import paddle.fluid as fluid - import numpy - import paddle - paddle.enable_static() - - # First create the Executor. 
- place = fluid.CPUPlace() # fluid.CUDAPlace(0) - exe = fluid.Executor(place) - - train_program = fluid.Program() - startup_program = fluid.Program() - with fluid.program_guard(train_program, startup_program): - # build net - data = paddle.static.data(name='X', shape=[None, 1], dtype='float32') - hidden = paddle.static.nn.fc(x=data, size=10) - loss = paddle.mean(hidden) - optimizer = fluid.optimizer.Momentum(learning_rate=0.2, momentum=0.1) - optimizer.minimize(loss) - - # build ModelAverage optimizer - model_average = fluid.optimizer.ModelAverage(0.15, - min_average_window=10000, - max_average_window=12500) - - exe.run(startup_program) - for i in range(12500): - x = numpy.random.random(size=(10, 1)).astype('float32') - outs = exe.run(program=train_program, - feed={'X': x}, - fetch_list=[loss.name]) - - # apply ModelAverage - with model_average.apply(exe, False): - x = numpy.random.random(size=(10, 1)).astype('float32') - exe.run(program=train_program, - feed={'X': x}, - fetch_list=[loss.name]) - - # restore Parameters - model_average.restore(exe) - """ - executor.run(self.restore_program) - - -class ExponentialMovingAverage: - r""" - - Compute the moving average of parameters with exponential decay. - Given a parameter :math:`\\theta`, its exponential moving average (EMA) - will be - - .. math:: - - \text{EMA}_0 & = 0 - - \text{EMA}_t & = \text{decay} * \text{EMA}_{t-1} + (1 - \text{decay}) * \theta_t - - The average results calculated by **update()** method will be saved in - temporary variables which are created and maintained by the object, and can - be applied to parameters of current model by calling **apply()** method. And - the **restore()** method is used to restore the parameters. - - **Bias correction**. All EMAs are initialized to :math:`0` and hence they will be - zero biased, which can be corrected by divided by a factor - :math:`(1 - \text{decay}^t)` , i.e., the actual EMAs applied to parameters - when calling **apply()** method would be - - .. math:: - - \widehat{\text{EMA}}_t = \frac{\text{EMA}_t}{1 - \text{decay}^t} - - **Decay rate scheduling**. A large decay rate very close to 1 would result - in that the averages move very slowly. And a better strategy is to set a - relative smaller decay rate in the very beginning. The argument **thres_steps** - allows users to pass a Variable to schedule the decay rate, in this case, - the actual decay rate becomes - - .. math:: - - \min(\text{decay}, \frac{1 + \text{thres_steps}}{10 + \text{thres_steps}}) - - Usually **thres_steps** can be the global training steps. - - - Args: - decay (float, optional): The exponential decay rate, usually close to 1, such as 0.999, 0.9999, ... . Default 0.999. - thres_steps (Variable|None, optional): If not `None`, schedule the decay rate. Default None. - name (str|None, optional): For detailed information, please refer to :ref:`api_guide_Name`. Usually name is no need to set and None by default. - - - Examples: - - .. 
code-block:: python - - import numpy - import paddle - import paddle.static as static - from paddle.static import ExponentialMovingAverage - - paddle.enable_static() - - data = static.data(name='x', shape=[-1, 5], dtype='float32') - hidden = static.nn.fc(x=data, size=10) - cost = paddle.mean(hidden) - - test_program = static.default_main_program().clone(for_test=True) - optimizer = paddle.optimizer.Adam(learning_rate=0.001) - optimizer.minimize(cost) - - ema = ExponentialMovingAverage(0.999) - ema.update() - - place = paddle.CPUPlace() - exe = static.Executor(place) - exe.run(static.default_startup_program()) - - for pass_id in range(3): - for batch_id in range(6): - data = numpy.random.random(size=(10, 5)).astype('float32') - exe.run(program=static.default_main_program(), - feed={'x': data}, - fetch_list=[cost.name]) - - # usage 1 - with ema.apply(exe): - data = numpy.random.random(size=(10, 5)).astype('float32') - exe.run(program=test_program, - feed={'x': data}, - fetch_list=[hidden.name]) - - # usage 2 - with ema.apply(exe, need_restore=False): - data = numpy.random.random(size=(10, 5)).astype('float32') - exe.run(program=test_program, - feed={'x': data}, - fetch_list=[hidden.name]) - ema.restore(exe) - - """ - - def __init__(self, decay=0.999, thres_steps=None, name=None): - if in_dygraph_mode(): - raise Exception( - "In dygraph, don't support ExponentialMovingAverage." - ) - self._decay = decay - self._thres_steps = thres_steps - self._name = name if name is not None else '' - self._decay_var = self._get_ema_decay() - - self._step_counter_name = "@EMA_STEP_COUNTER@" - self._params_tmps = [] - for param in default_main_program().global_block().all_parameters(): - if param.do_model_average != False: - tmp = param.block.create_var( - name=unique_name.generate( - ".".join([self._name + param.name, 'ema_tmp']) - ), - dtype=param.dtype, - persistable=False, - stop_gradient=True, - ) - self._params_tmps.append((param, tmp)) - - self._ema_vars = {} - for param, tmp in self._params_tmps: - with param.block.program._optimized_guard([param, tmp]), name_scope( - 'moving_average' - ): - self._ema_vars[param.name] = self._create_ema_vars(param) - - self.apply_program = Program() - block = self.apply_program.global_block() - with program_guard(main_program=self.apply_program): - decay_pow, global_step = self._get_decay_pow(block) - for param, tmp in self._params_tmps: - param = block._clone_variable(param) - tmp = block._clone_variable(tmp) - ema = block._clone_variable(self._ema_vars[param.name]) - paddle.assign(param, output=tmp) - # bias correction - param_val = paddle.static.nn.cond( - global_step > 0, - lambda: ema / (1.0 - decay_pow), - lambda: ema, - ) - paddle.assign(param_val, output=param) - self.restore_program = Program() - block = self.restore_program.global_block() - with program_guard(main_program=self.restore_program): - for param, tmp in self._params_tmps: - tmp = block._clone_variable(tmp) - param = block._clone_variable(param) - paddle.assign(tmp, output=param) - - def _get_ema_decay(self): - with default_main_program()._lr_schedule_guard(): - decay_var = paddle.static.create_global_var( - shape=[1], - value=self._decay, - dtype='float32', - persistable=True, - name="scheduled_ema_decay_rate", - ) - - if self._thres_steps is not None: - decay_t = (self._thres_steps + 1.0) / (self._thres_steps + 10.0) - decay_val = paddle.static.nn.cond( - decay_t < self._decay, - lambda: decay_t, - lambda: np.array([self._decay], dtype=np.float32), - ) - paddle.assign(decay_val, decay_var) - 
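# Numeric illustration of the schedule assigned just above, i.e.
# min(decay, (1 + thres_steps) / (10 + thres_steps)) with decay = 0.999
# (values below are worked examples, not taken from a real run):
#   thres_steps = 0     -> min(0.999, 1/10)     = 0.100
#   thres_steps = 90    -> min(0.999, 91/100)   = 0.910
#   thres_steps = 990   -> min(0.999, 991/1000) = 0.991
#   thres_steps >= 8990 -> (t + 1) / (t + 10) >= 0.999, so the cap takes over
# Equivalent scalar form (sketch only):
#   def scheduled_decay(decay, t):
#       return min(decay, (1.0 + t) / (10.0 + t))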
return decay_var - - def _get_decay_pow(self, block): - global_step = paddle.static.create_global_var( - name=self._step_counter_name, - shape=[1], - value=0, - dtype='int64', - persistable=True, - ) - global_step = paddle.cast(global_step, "float32") - decay_var = block._clone_variable(self._decay_var) - decay_pow_acc = paddle.pow(decay_var, global_step) - return decay_pow_acc, global_step - - def _create_ema_vars(self, param): - param_ema = paddle.static.create_global_var( - name=unique_name.generate(self._name + param.name + '_ema'), - shape=param.shape, - value=0.0, - dtype=param.dtype, - persistable=True, - ) - - return param_ema - - def update(self): - """ - Update Exponential Moving Average. Should only call this method in - train program. - """ - global_step = layers.autoincreased_step_counter( - counter_name=self._step_counter_name - ) - param_master_emas = [] - for param, tmp in self._params_tmps: - with param.block.program._optimized_guard([param, tmp]), name_scope( - 'moving_average' - ): - param_ema = self._ema_vars[param.name] - if param.name + '.master' in self._ema_vars: - master_ema = self._ema_vars[param.name + '.master'] - param_master_emas.append([param_ema, master_ema]) - else: - ema_t = param_ema * self._decay_var + param * ( - 1 - self._decay_var - ) - paddle.assign(ema_t, output=param_ema) - - # for fp16 params - for param_ema, master_ema in param_master_emas: - default_main_program().global_block().append_op( - type="cast", - inputs={"X": master_ema}, - outputs={"Out": param_ema}, - attrs={ - "in_dtype": master_ema.dtype, - "out_dtype": param_ema.dtype, - }, - ) - - @signature_safe_contextmanager - def apply(self, executor, need_restore=True): - """ - Apply moving average to parameters for evaluation. - - Args: - executor (Executor): The Executor to execute applying. - need_restore (bool, optional): Whether to restore parameters after - applying. Default True. - """ - executor.run(self.apply_program) - try: - yield - finally: - if need_restore: - self.restore(executor) - - def restore(self, executor): - """Restore parameters. - - Args: - executor (Executor): The Executor to execute restoring. - """ - executor.run(self.restore_program) - - -class PipelineOptimizer: - """ - :api_attr: Static Graph - - Pipeline Optimizer: Make a program to run as pipeline, that is splitting a - program into multiple sections (sub-programs) and each section run on a - device to enable the training of large scale models and the use of - heterogeneous devices. Meanwhile, all sections run in the stype of pipeline. - - Args: - optimizer (Optimizer): The optimizer to use, such as SGD. - num_microbatches (int): Number of microbatches. [Optional. Default:1]. - start_cpu_core_id (int): The first cpu core id to use. [Optional. Default:0]. - - Examples: - .. 
code-block:: python - - import paddle - import paddle.fluid as fluid - import paddle.fluid.layers as layers - import numpy as np - - paddle.enable_static() - with fluid.device_guard("gpu:0"): - x = paddle.static.data(name='x', shape=[-1, 1], dtype='int64', lod_level=0) - y = paddle.static.data(name='y', shape=[-1, 1], dtype='int64', lod_level=0) - data_loader = fluid.io.DataLoader.from_generator( - feed_list=[x, y], - capacity=64, - use_double_buffer=True, - iterable=False) - - emb_x = layers.embedding(input=x, param_attr=fluid.ParamAttr(name="embx"), size=[10,2], is_sparse=False) - emb_y = layers.embedding(input=y, param_attr=fluid.ParamAttr(name="emby",learning_rate=0.9), size=[10,2], is_sparse=False) - - with fluid.device_guard("gpu:1"): - concat = layers.concat([emb_x, emb_y], axis=1) - fc = paddle.static.nn.fc(x=concat, name="fc", size=1, num_flatten_dims=1, bias_attr=False) - loss = paddle.mean(fc) - optimizer = fluid.optimizer.SGD(learning_rate=0.5) - optimizer = fluid.optimizer.PipelineOptimizer(optimizer) - optimizer.minimize(loss) - - def train_reader(): - for _ in range(4): - x = np.random.random(size=[1]).astype('int64') - y = np.random.random(size=[1]).astype('int64') - yield x, y - data_loader.set_sample_generator(train_reader, batch_size=1) - - place = fluid.CUDAPlace(0) - exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) - batch_size = 1 - data_loader.start() - exe.train_from_dataset( - fluid.default_main_program()) - data_loader.reset() - """ - - def __init__(self, optimizer, num_microbatches=1, start_cpu_core_id=0): - self._device = 'cpu' - if core.is_compiled_with_cuda(): - self._device = "gpu" - if in_dygraph_mode(): - raise Exception("In dygraph, don't support PipelineOptimizer.") - valid_optimizers = ( - Optimizer, - paddle.optimizer.Optimizer, - paddle.static.amp.decorator.OptimizerWithMixedPrecision, - ) - if not isinstance(optimizer, valid_optimizers): - raise ValueError( - "The 'optimizer' parameter for " - "PipelineOptimizer must be an instance of " - "{}, but the given type is {}.".format( - valid_optimizers, type(optimizer) - ) - ) - self._optimizer = optimizer - - # Get the original optimizer defined by users, such as SGD - self._origin_optimizer = self._optimizer - while hasattr(self._origin_optimizer, "inner_opt"): - self._origin_optimizer = self._origin_optimizer.inner_opt - - assert ( - num_microbatches >= 1 - ), "num_microbatches must be a positive value." - self._num_microbatches = num_microbatches - assert ( - start_cpu_core_id >= 0 - ), "start_cpu_core_id must be a non-negative integer." - self._start_cpu_core_id = start_cpu_core_id - self._place_list = None - op_maker = core.op_proto_and_checker_maker - self._op_role = op_maker.OpRole - self._op_role_key = op_maker.kOpRoleAttrName() - self._op_role_var_key = op_maker.kOpRoleVarAttrName() - self._op_device_key = op_maker.kOpDeviceAttrName() - self._param_device_map = None - self._pipeline_pair = [] - self._pp_ring_map = dict() - self.output_var_to_op = None - self.input_var_to_op = None - - # insert allreduce op to sync global information for global - # gradient clip and amp - def _insert_allreduce_op(self, op_idx, block): - """ - Insert allreduce op to sync global information for global - gradient clip and amp. 
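# A small sketch of what the inserted collectives compute across pipeline
# stages. The helper names are assumptions for illustration; `stage_flags` and
# `stage_partial_sums` stand for the per-stage values that the c_allreduce_*
# ops combine over the global ring.
def sync_found_inf(stage_flags):
    # reduce_any output is bool, so it is cast to int32, combined with
    # c_allreduce_max, then cast back: "was inf/nan seen on any stage?"
    return bool(max(int(flag) for flag in stage_flags))

def sync_global_norm_sum(stage_partial_sums):
    # the `sum` used by global gradient clip is combined with c_allreduce_sum
    return float(sum(stage_partial_sums))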
- """ - op = block.ops[op_idx] - out_name = op.desc.output_arg_names()[0] - out_var = block.var(out_name) - offset = 0 - if op.type == "reduce_any": - # cast the bool var to int32 to use allreduce_max op - temp_var_name = unique_name.generate(out_name + "_cast_int32") - temp_var = block.create_var( - name=temp_var_name, shape=[1], dtype="int32" - ) - block._insert_op( - op_idx + 1 + offset, - type='cast', - inputs={'X': out_var}, - outputs={'Out': temp_var}, - attrs={ - 'in_dtype': out_var.dtype, - 'out_dtype': temp_var.dtype, - self._op_role_key: self._op_role.Optimize, - }, - ) - offset += 1 - block._insert_op( - op_idx + 1 + offset, - type='c_allreduce_max' - if op.type == "reduce_any" - else 'c_allreduce_sum', - inputs={'X': temp_var if op.type == "reduce_any" else out_var}, - outputs={'Out': temp_var if op.type == "reduce_any" else out_var}, - attrs={ - 'ring_id': self.global_ring_id, - self._op_role_key: self._op_role.Optimize, - 'use_calc_stream': True, - }, - ) - offset += 1 - if op.type == "reduce_any": - block._insert_op( - op_idx + 1 + offset, - type='cast', - inputs={'X': temp_var}, - outputs={'Out': out_var}, - attrs={ - 'in_dtype': temp_var.dtype, - 'out_dtype': out_var.dtype, - self._op_role_key: self._op_role.Optimize, - }, - ) - offset += 1 - return offset - - def _create_vars(self, block, ori_block): - # Create vars for block, copied from ori_block - used_var_set = set() - added_op_num = 0 - op_idx = 0 - op_size = block.desc.op_size() - while op_idx < op_size + added_op_num: - # Whether to insert allreduce_sum or allreduce_max op. - # For amp and global gradient clip strategies, we should - # get the global information, so allreduce op is needed. - should_insert = False - op = block.ops[op_idx] - # For op process vars on all devices, remove its input - # vars not in this block - reserved_x = [] - if op.type == 'reduce_any' and self._is_optimize_op(op): - should_insert = True - elif op.type == 'concat' and self._is_optimize_op(op): - for input_name in op.desc.input("X"): - if block._find_var_recursive(input_name): - reserved_x.append(input_name) - op.desc.set_input('X', reserved_x) - elif op.type == 'update_loss_scaling': - for input_name in op.desc.input("X"): - if block._find_var_recursive(input_name): - reserved_x.append(input_name) - op.desc.set_input('X', reserved_x) - op.desc.set_output('Out', reserved_x) - elif op.type == 'check_finite_and_unscale': - for input_name in op.desc.input("X"): - if block._find_var_recursive(input_name): - reserved_x.append(input_name) - op.desc.set_input('X', reserved_x) - op.desc.set_output('Out', reserved_x) - if len(reserved_x) == 0: - block._remove_op(op_idx) - op_size -= 1 - continue - elif op.type == 'sum' and self._is_gradient_clip_op(op): - for input_name in op.desc.input("X"): - if block._find_var_recursive(input_name): - reserved_x.append(input_name) - op.desc.set_input('X', reserved_x) - should_insert = True - - vars = op.desc.input_arg_names() + op.desc.output_arg_names() - for var in vars: - # a var whose name contains "blocking_queue" - # only exists in startup program - if var in used_var_set or "_blocking_queue" in var: - continue - used_var_set.add(var) - if block._find_var_recursive(str(var)): - continue - source_var = ori_block._var_recursive(str(var)) - if source_var.type == core.VarDesc.VarType.READER: - dest_var = block.create_var( - name=var, - type=core.VarDesc.VarType.READER, - persistable=source_var.persistable, - ) - elif isinstance(source_var, Parameter): - dest_var = block.create_parameter( - 
name=source_var.name, - shape=source_var.shape, - dtype=source_var.dtype, - type=source_var.type, - lod_level=source_var.lod_level, - stop_gradient=source_var.stop_gradient, - trainable=source_var.trainable, - optimize_attr=source_var.optimize_attr, - regularizer=source_var.regularizer, - error_clip=source_var.error_clip, - ) - else: - dest_var = block._clone_variable(source_var, False) - self._clone_var_attr(dest_var, source_var) - # When use with sharding, allreduce_sum and allreduce_max - # used for global gradient clip and amp will be added by sharding. - op_idx += 1 - if self.use_sharding or not should_insert: - continue - inserted_ops = self._insert_allreduce_op(op_idx - 1, block) - added_op_num += inserted_ops - op_idx += inserted_ops - block._sync_with_cpp() - - def _is_loss_grad_op(self, op): - assert self._op_role_key in op.attr_names - op_role = int(op.attr(self._op_role_key)) - return op_role & int(self._op_role.Backward) and op_role & int( - self._op_role.Loss - ) - - def _is_forward_op(self, op): - return self._op_role_key in op.attr_names and ( - int(op.attr(self._op_role_key)) == int(self._op_role.Forward) - ) - - def _is_backward_op(self, op): - return self._op_role_key in op.attr_names and ( - int(op.attr(self._op_role_key)) & int(self._op_role.Backward) - ) - - def _is_loss_op(self, op): - assert self._op_role_key in op.attr_names - return int(op.attr(self._op_role_key)) == int(self._op_role.Loss) - - def _is_optimize_op(self, op): - return self._op_role_key in op.attr_names and ( - int(op.attr(self._op_role_key)) & int(self._op_role.Optimize) - ) - - def _is_update_op(self, op): - return ( - 'Param' in op.input_names - and 'Grad' in op.input_names - and ("LearningRate" in op.input_names) - ) - - def _split_program(self, main_program, devices): - """ - Split a program into sections according to devices that ops run on. - The op whose op_device attr is "gpu:all" is copied to all sections. - - Args: - main_program (Program): the main program - devices: all used devices - """ - # Map from device to its corresponding section program info - device_program_map = defaultdict(Program) - - block = main_program.block(0) - for op in block.ops: - device = op.attr(self._op_device_key) - # Copy ops whose op_device set to "gpu:all" to all sections. - if device == f"{self._device}:all": - for device in devices: - program = device_program_map[device] - op_desc = op.desc - ap_op = program.global_block().desc.append_op() - ap_op.copy_from(op_desc) - ap_op._set_attr(self._op_device_key, "") - else: - program = device_program_map[device] - op_desc = op.desc - ap_op = program.global_block().desc.append_op() - ap_op.copy_from(op_desc) - ap_op._set_attr(self._op_device_key, "") - - program_list = [] - for key in devices: - program = device_program_map[key] - program._sync_with_cpp() - program_list.append(program) - - return program_list - - def _get_op_device_for_startup_program(self, var_name): - """ - For adam optimizer, it will add accumulators and initialize them - with fill_constant, and force the op device to cpu. Hence, we should - get the real op_device attribute of the fill_constant as the device - where the corresponding parameters on. - """ - assert "beta1_pow_acc" in var_name or "beta2_pow_acc" in var_name, ( - 'For accumulators for Adam, the name must contain beta1_pow_acc ' - 'or beta2_pow_acc.' 
- ) - param_name = var_name[0 : var_name.index('_beta')] - device = self._param_device_map[param_name] - return device - - def _split_startup_program(self, startup_program, device_id): - block = startup_program.global_block() - new_startup_program = Program() - for op in block.ops: - device = op.attr(self._op_device_key) - if device == "cpu": - assert op.type == "fill_constant", ( - "For ops in startup program with the op_device attribute " - "of cpu, they must be of type fill_constant." - ) - output_var = op.output_arg_names[0] - device = self._get_op_device_for_startup_program(output_var) - - if device: - device_index = int(device.split(':')[1]) - else: - # LR related ops - device = None - if device and device_index != device_id: - continue - op_desc = op.desc - ap_op = new_startup_program.global_block().desc.append_op() - ap_op.copy_from(op_desc) - ap_op._set_attr(self._op_device_key, "") - new_startup_program._sync_with_cpp() - self._create_vars(new_startup_program.global_block(), block) - return new_startup_program - - def _find_post_op(self, index, var_name): - """ - Find the post op that has variable named var_name as input. - """ - # bugfix for uniform hybrid parallelism - if '.cast_fp32' in var_name: - var_name = var_name.replace('.cast_fp32', '') - if '.cast_fp16' in var_name: - var_name = var_name.replace('.cast_fp16', '') - - post_ops = self.input_var_to_op[var_name] - if post_ops is None: - return None - result_op = None - for post_op, post_idx in reversed(post_ops): - if post_idx > index: - result_op = post_op - break - return result_op - - def _find_prev_op(self, index, var_name): - """ - Find the previous op of op with index that outputs - variable named var_name. - """ - prev_ops = self.output_var_to_op[var_name] - if prev_ops is None: - return None - result_op = None - for prev_op, prev_idx in reversed(prev_ops): - if prev_idx < index: - result_op = prev_op - break - return result_op - - def _rename_arg(self, op, old_name, new_name): - op._rename_input(old_name, new_name) - op._rename_output(old_name, new_name) - - def _create_var(self, block, ref_var, name, dtype=None): - """ - Create a new var for block, which has the same type, - shape and dtype as ref_var, then rename it with the - name `name`. - """ - new_var = block.create_var( - name=name, - shape=ref_var.shape, - dtype=ref_var.dtype if dtype is None else dtype, - type=ref_var.type, - lod_level=ref_var.lod_level, - persistable=ref_var.persistable, - is_data=ref_var.is_data, - need_check_feed=ref_var.desc.need_check_feed(), - ) - self._clone_var_attr(new_var, ref_var) - return new_var - - def _clone_var_attr(self, dest, src): - dest.stop_gradient = src.stop_gradient - if hasattr(src, 'is_distributed'): - dest.is_distributed = src.is_distributed - - def _strip_grad_suffix(self, name): - """ - Strip the grad suffix from the given variable name - """ - pos = name.find(core.grad_var_suffix()) - return name[:pos] if pos != -1 else name - - def _append_grad_suffix(self, name): - """ - Append grad suffix to the given variable name - """ - return name + core.grad_var_suffix() - - def _get_op_device_attr(self, op): - """ - Get the op_device attribute of a op. - """ - device = ( - op.attr(self._op_device_key) - if op.has_attr(self._op_device_key) - else None - ) - if device: - assert device[0:3] == 'gpu', ( - "Now, only gpu devices are " - "supported in pipeline parallemism." - ) - return device - - def _add_op_device_attr_for_op(self, op, idx, block): - """ - Add op_device attrribute for ops that have not that attribute set. 
- We use "gpu:all" to represent the op should be put on all - sub-programs, such as lr-related ops. Note that: "gpu:all" - is only used by pipeline as an indicator. - """ - lrsched_role = int(self._op_role.LRSched) - if op.attr(self._op_role_key) == lrsched_role: - # For LRSched ops, we should put them on all sub-programs to - # make sure each sub-program update the lr correctly - op._set_attr(self._op_device_key, f"{self._device}:all") - # bugfix in hybrid parallelism - elif op.type == "sum" and self._is_backward_op(op): - # For sum ops that compute the sum of @RENAMED@ vars - for name in op.desc.input_arg_names(): - assert ( - '@RENAME@' in name - ), "The op must be sum used to accumulate renamed vars." - assert len(op.desc.output_arg_names()) == 1 - out_name = op.desc.output_arg_names()[0] - post_op = self._find_post_op(idx, out_name) - assert post_op.has_attr( - 'op_device' - ), "{} has no op_device attr for var {}".format( - post_op.type, out_name - ) - device = post_op.attr(self._op_device_key) - assert device, "The post op must have op_device set." - op._set_attr(self._op_device_key, device) - elif (op.type == "cast" or op.type == "scale") and ( - self._is_backward_op(op) or self._is_forward_op(op) - ): - prev_op = self._find_prev_op(idx, op.desc.input("X")[0]) - op._set_attr(self._op_device_key, prev_op.attr(self._op_device_key)) - elif op.type == "memcpy" and not self._is_optimize_op(op): - # for checkpoint offloading - assert ( - len(op.input_arg_names) == 1 and len(op.output_arg_names) == 1 - ) - input_name = op.input_arg_names[0] - output_name = op.output_arg_names[0] - if '@Fetch' in output_name: - post_op = self._find_post_op(idx, output_name) - op._set_attr( - self._op_device_key, post_op.attr(self._op_device_key) - ) - else: - prev_op = self._find_prev_op(idx, op.desc.input("X")[0]) - op._set_attr( - self._op_device_key, prev_op.attr(self._op_device_key) - ) - elif self._is_loss_op(op): - # For loss * loss_scaling op added by AMP - offset = 1 - while not block.ops[idx + offset].has_attr( - self._op_device_key - ) or not block.ops[idx + offset].attr(self._op_device_key): - offset += 1 - device = block.ops[idx + offset].attr(self._op_device_key) - assert device, "Please put you program within device_guard scope." - for i in range(offset): - block.ops[idx + i]._set_attr(self._op_device_key, device) - elif self._is_optimize_op(op) and op.type == "cast": - # For fp16-->fp32 cast added by AMP - grad_name = op.output('Out') - assert len(grad_name) == 1 - param_name = self._strip_grad_suffix(grad_name[0]) - device = self._param_device_map[param_name] - op._set_attr(self._op_device_key, device) - elif self._is_gradient_clip_op(op) or self._is_regularization_op(op): - # For gradient clip and regularization ops, we set their op_device - # attribute to the device where their corresponding parameters on. - assert self._op_role_var_key in op.attr_names, ( - "gradient_clip " - "and regularization ops must have op_role_var attribute." - ) - op_role_var = op.attr(self._op_role_var_key) - assert len(op_role_var) == 2, ( - "op_role_var for gradient_clip " - "regularization ops must have two elements." 
- ) - param_name = op_role_var[0] - device = self._param_device_map[param_name] - # For sum op added by global gradient clip, it must be - # put on all devices - if ( - op.type == 'sum' - or op.type == 'sqrt' - or op.type == 'fill_constant' - or op.type == 'elementwise_max' - or op.type == 'elementwise_div' - ): - device = f"{self._device}:all" - op._set_attr(self._op_device_key, device) - elif op.type == "alloc_float_status" or op.type == "clear_float_status": - op._set_attr(self._op_device_key, f"{self._device}:all") - # NOTE(wangxi): NPU should only clear the float status - # once at each batch step - op._set_attr(self._op_role_key, self._op_role.LRSched) - - float_status_name = op.output_arg_names[0] - float_status_var = block.var(float_status_name) - # FIXME(wangxi): pipeline lr schedule will exec on sub_scope(0) - # while update will exec on sub_scope(last_micro_step), should - # set persistable to use global scope - float_status_var.persistable = True - else: - other_known_ops = [ - 'update_loss_scaling', - 'reduce_any', - 'concat', - 'sum', - 'check_finite_and_unscale', - 'memcpy', - ] - assert op.type in other_known_ops, ( - "For other ops without " - "op_device set, they must be one of {}, but it " - "is {}".format(other_known_ops, op.type) - ) - assert self._is_optimize_op(op) - op._set_attr(self._op_device_key, f"{self._device}:all") - - def _add_op_device_attr(self, block): - """ - Add op_device attrribute for ops in block that have - not that attribute set. - """ - for idx, op in enumerate(list(block.ops)): - if ( - op.type == "create_py_reader" - or op.type == "read" - or op.type == "create_double_buffer_reader" - ): - # Copy read related ops to all section to make them exit - # after each epoch. - # We use "gpu:all" to represent the op should be put on all - # sub-programs, such as lr-related ops. Note that: "gpu:all" - # is only used by pipeline as an indicator. - op._set_attr(self._op_device_key, f"{self._device}:all") - continue - # op_device attribute has been set - if self._get_op_device_attr(op): - continue - self._add_op_device_attr_for_op(op, idx, block) - - def _check_validation(self, block): - """ - Check whether ops in a block have both the op_device and the - op_role attributes set. - Then, return all devices in order. - """ - device_list = [] - # Section worker only supports the following op_role - valid_op_role_value = [ - int(self._op_role.LRSched), - int(self._op_role.Forward), - int(self._op_role.Backward), - int(self._op_role.Loss), - int(self._op_role.Optimize), - int(self._op_role.Backward) | int(self._op_role.Loss), - ] - for op in block.ops: - if not op._has_kernel(op.type): - assert op.type == "conditional_block" and ( - op.attr(self._op_role_key) == int(self._op_role.LRSched) - ), ( - "Now, the only supported op without kernel is " - "conditional_block, and its op role must be LRSched." 
- ) - assert op.has_attr( - self._op_role_key - ), "op ({}) has no {} attribute.".format(op.type, self._op_role_key) - op_role = op.attr(self._op_role_key) - assert ( - int(op_role) in valid_op_role_value - ), "op_role {} for op {} must be one of {}".format( - op_role, op.type, valid_op_role_value - ) - - assert op.has_attr( - self._op_device_key - ), "op ({}) has no {} attribute.".format( - op.type, self._op_device_key - ) - - device = op.attr(self._op_device_key) - assert ( - device - ), "op_device attribute for op " "{} has not been set.".format( - op.type - ) - if device == f"{self._device}:all": - continue - - dev_type = device.split(':')[0] - assert dev_type == "gpu", ( - "Now only gpu devices are supported " - "for pipeline parallelism." - ) - - if device not in device_list: - device_list.append(device) - - return device_list - - def _insert_sendrecv_ops_for_boundaries(self, block): - """ - Insert a pair of send and recv ops for every two - consecutive ops on different devices. - """ - # A map from var to device where op takes it as input, - # avoiding multiple send and recv ops. - input_var_to_device = dict() - # bugfix hybrid parallelism - first_optimize_index = None - for index, op in enumerate(list(block.ops)): - if self._is_optimize_op(op): - first_optimize_index = index - break - extra_index_info = { - 'index': 0, - 'first_optimize_index': first_optimize_index, - } - - for index, op in enumerate(list(block.ops)): - cur_device = op.attr(self._op_device_key) - if cur_device == f"{self._device}:all": - continue - for var_name in op.input_arg_names: - var = block.var(var_name) - # skip data var - if var.is_data: - continue - prev_device = None - - prev_op = self._find_prev_op(index, var_name) - if prev_op is None: - if var_name not in self._param_device_map: - continue - prev_device = self._param_device_map[var_name] - - if not prev_device: - prev_device = ( - prev_op.attr(self._op_device_key) if prev_op else None - ) - - if prev_device is None or prev_device == f"{self._device}:all": - continue - - if prev_device == cur_device: - continue - - if var_name not in input_var_to_device: - input_var_to_device[var_name] = [] - if (cur_device, prev_device) in input_var_to_device[var_name]: - continue - - device_type = cur_device.split(':')[0] + ':' - - def _check_stage(cur_id, prev_id): - # check send/recv stage valid - is_forward = self._is_forward_op(op) - is_backward = self._is_backward_op(op) - assert is_forward or is_backward, ( - 'send/recv in pipeline should only be inserted in forward or backward,' - 'please check the op_role of op={}'.format(op) - ) - - if is_forward: - assert prev_id < cur_id, ( - "In forward, send/recv can only be passed forward, but now " - "prev_stage={} great than cur_stage={}, please check op_device of op={}".format( - prev_id, cur_id, op - ) - ) - elif is_backward: - assert prev_id > cur_id, ( - "In backward, send/recv can only be passed backward, but now " - "prev_stage={} less than cur_stage={}, please check op_device of op={}".format( - prev_id, cur_id, op - ) - ) - - def _insert_send_recv(cur_id, prev_id): - cur_dev = device_type + str(cur_id) - prev_dev = device_type + str(prev_id) - if (cur_dev, prev_dev) in input_var_to_device[var_name]: - return - - if cur_id - prev_id > 1: - _insert_send_recv(cur_id - 1, prev_id) - _insert_send_recv(cur_id, cur_id - 1) - input_var_to_device[var_name].append( - (cur_dev, prev_dev) - ) - return - elif cur_id - prev_id < -1: - _insert_send_recv(cur_id + 1, prev_id) - _insert_send_recv(cur_id, cur_id + 1) - 
input_var_to_device[var_name].append( - (cur_dev, prev_dev) - ) - return - - assert abs(cur_id - prev_id) == 1 - input_var_to_device[var_name].append((cur_dev, prev_dev)) - - op_role = op.attr(self._op_role_key) - var = block.vars[var_name] - pair = (prev_id, cur_id) - # 1000 is just a magic number - pair_key = prev_id * 1000 + cur_id - if pair not in self._pipeline_pair: - self._pipeline_pair.append(pair) - self._pp_ring_map[pair_key] = self.ring_id - ring_id = self.ring_id - self.ring_id += 1 - else: - ring_id = self._pp_ring_map[pair_key] - - if self.schedule_mode == 'F-then-B': # F-then-B - block._insert_op_without_sync( - index=index + extra_index_info['index'], - type='send_v2', - inputs={'X': var}, - attrs={ - self._op_device_key: prev_dev, - self._op_role_key: op_role, - 'use_calc_stream': True, - 'peer': 1, - 'ring_id': ring_id, - }, - ) - extra_index_info['index'] += 1 - var_shape = list(var.shape) - var_shape[0] = ( - self.micro_batch_size - if var_shape[0] < 0 - else var_shape[0] - ) - block._insert_op_without_sync( - index=index + extra_index_info['index'], - type='recv_v2', - outputs={'Out': [var]}, - attrs={ - 'out_shape': var_shape, - 'dtype': var.dtype, - self._op_device_key: cur_dev, - self._op_role_key: op_role, - 'use_calc_stream': True, - 'peer': 0, - 'ring_id': ring_id, - }, - ) - extra_index_info['index'] += 1 - elif self.schedule_mode == '1F1B': # 1F1B - var_shape = list(var.shape) - var_shape[0] = ( - self.micro_batch_size - if var_shape[0] < 0 - else var_shape[0] - ) - - numel = np.prod(var_shape) - use_mp = (self.mp_degree > 1) and ( - numel % self.mp_degree == 0 - ) - - if 'subprog' in var.name: - # For recompute, if the checkpoints var is layer_norm_6.tmp_2 - # this var will be sent twice, layer_norm_6.tmp_2 for forward pass, - # layer_norm_6.tmp_2.subprog_* for recompute pass. - # We can store the first sent var and copy the value to the - # second one to reduce one send/recv op. - # The origin_ckpt_name is layer_norm_6.tmp_2, which will be used - # to find the stored var for the forward pass. 
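# Worked trace of the name handling right below, using the example name from
# the comment above (with `_0` standing in for the `subprog_*` suffix):
#   var.name                     -> 'layer_norm_6.tmp_2.subprog_0'
#   var.name.split('subprog')[0] -> 'layer_norm_6.tmp_2.'
#   ...[0:-1]                    -> 'layer_norm_6.tmp_2'   (origin checkpoint)
# The `assign` inserted below copies the value already received for the
# forward pass into the recompute copy, saving one send/recv pair.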
- origin_name = var.name.split('subprog')[0][0:-1] - associate_var = block.var(origin_name) - block._insert_op_without_sync( - index=index + extra_index_info['index'], - type='assign', - inputs={'X': [associate_var]}, - outputs={'Out': [var]}, - attrs={ - 'out_shape': var_shape, - 'dtype': var.dtype, - self._op_device_key: cur_dev, - self._op_role_key: op_role, - 'use_calc_stream': True, - }, - ) - extra_index_info['index'] += 1 - return - - _check_stage(cur_id, prev_id) - - block._insert_op_without_sync( - index=index + extra_index_info['index'], - type='c_sync_calc_stream', - inputs={'X': [var]}, - outputs={'Out': [var]}, - attrs={ - self._op_device_key: prev_dev, - self._op_role_key: op_role, - }, - ) - extra_index_info['index'] += 1 - prefix_name = var.name.split('@')[0] - prefix_var = block.var(prefix_name) - is_param = ( - True if isinstance(prefix_var, Parameter) else False - ) - block._insert_op_without_sync( - index=index + extra_index_info['index'], - type='send_v2' - if not use_mp or is_param - else 'partial_send', - inputs={'X': var}, - attrs={ - self._op_device_key: prev_dev, - self._op_role_key: op_role, - 'use_calc_stream': False, - 'ring_id': ring_id, - 'peer': 1, - # if send_v2, num&id attr is not in op_attrs, will not insert - 'num': self.mp_degree, - 'id': self.mp_rank, - }, - ) - extra_index_info['index'] += 1 - insert_index = None - if int(op_role) == int(self._op_role.Backward): - insert_index = extra_index_info[ - 'first_optimize_index' - ] - new_op_role = self._op_role.Optimize - else: - insert_index = index - new_op_role = self._op_role.Backward - sync_comm_op = block._insert_op_without_sync( - index=insert_index + extra_index_info['index'], - type='c_sync_comm_stream', - inputs={'X': [var]}, - outputs={'Out': [var]}, - attrs={ - self._op_device_key: prev_dev, - self._op_role_key: new_op_role, - 'ring_id': ring_id, - }, - ) - if int(op_role) == int(self._op_role.Forward): - sync_comm_op._set_attr('pipeline_flag', '') - extra_index_info['index'] += 1 - block._insert_op_without_sync( - index=index + extra_index_info['index'], - type='recv_v2' - if not use_mp or is_param - else 'partial_recv', - outputs={'Out': [var]}, - attrs={ - 'out_shape': var_shape, - 'dtype': var.dtype, - self._op_device_key: cur_dev, - self._op_role_key: op_role, - 'use_calc_stream': True, - 'peer': 0, - 'ring_id': ring_id, - # if recv_v2, num&id attr is not in op_attrs, will not insert - 'num': self.mp_degree, - 'id': self.mp_rank, - }, - ) - extra_index_info['index'] += 1 - if use_mp and not is_param: - block._insert_op_without_sync( - index=index + extra_index_info['index'], - type='partial_allgather', - inputs={'X': [var]}, - outputs={'Out': [var]}, - attrs={ - self._op_device_key: cur_dev, - self._op_role_key: op_role, - 'use_calc_stream': True, - 'ring_id': 0, - # if recv_v2, num&id attr is not in op_attrs, will not insert - 'nranks': self.mp_degree, - 'rank': self.mp_rank, - }, - ) - extra_index_info['index'] += 1 - else: - raise ValueError( - "Now only 'F-then-B' and '1F1B' are supported." - "The given value is {}.".format(self.schedule_mode) - ) - - _insert_send_recv( - int(cur_device.split(':')[1]), - int(prev_device.split(':')[1]), - ) - block._sync_with_cpp() - - def _insert_loss_scale(self, block): - """ - Scale the loss corresponding to number of micro-batches. 
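# Worked example of the rescaling performed below (numbers are illustrative):
#   num_microbatches = 4, original fill_constant value for loss@GRAD = 1.0
#   rescaled value   = 1.0 / 4 = 0.25
# so the gradients summed over the 4 micro-batches effectively average to the
# full mini-batch gradient instead of being 4x too large.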
- """ - if self._num_microbatches == 1: - return - for index, op in reversed(tuple(enumerate(list(block.ops)))): - if self._is_loss_grad_op(op): - assert op.type == 'fill_constant', ( - "loss_grad_op must be fill_constant op, " - "but this op is {}".format(op.type) - ) - assert op.has_attr('value') - loss_scale = float(op.attr('value')) - loss_scale = loss_scale / self._num_microbatches - op._set_attr('value', loss_scale) - break - - def _rename_gradient_var_name(self, block): - for index, op in enumerate(block.ops): - if not self._is_optimize_op(op): - continue - input_names = op.input_arg_names - output_names = op.output_arg_names - in_out_names = input_names + output_names - if op.type == 'cast' or op.type == "c_sync_comm_stream": - continue - # append "MERGED" to the names of parameter gradients, - # and mofify the op_role_var attribute (by rename_arg func). - for name in in_out_names: - if not core.grad_var_suffix() in name: - continue - param_name = name.strip(core.grad_var_suffix()) - new_grad_name = name + "@MERGED" - self._rename_arg(op, name, new_grad_name) - - def _accumulate_gradients( - self, block, pp_allreduce_in_optimize=False, strategy=None, shard=None - ): - """ - Create a new merged gradient for each parameter and accumulate the - corresponding gradient to it. - """ - fp16_allreduce = strategy.fp16_allreduce if strategy else False - if strategy and strategy.fuse_grad_merge: - fused_gradient_names = self._accumulate_gradients_with_fuse( - block, fp16_allreduce, strategy.fuse_grad_size_in_MB, shard - ) - return fused_gradient_names - - merged_gradient_names = [] - first_opt_op_idx = None - - merged_suffix = '@MERGED@FP16' if fp16_allreduce else '@MERGED' - dtype = paddle.float16 if fp16_allreduce else None - - for index, op in reversed(tuple(enumerate(list(block.ops)))): - # remove the cast op of fp16 grad to fp32 grad - if self._is_optimize_op(op) and op.type == 'cast': - in_name = op.input_arg_names[0] - out_name = op.output_arg_names[0] - if out_name.strip('@GRAD') in self._param_device_map: - assert in_name.replace('.cast_fp16', '') == out_name - block._remove_op(index) - continue - - if self._is_backward_op(op) and first_opt_op_idx is None: - first_opt_op_idx = index + 1 - # maybe have no optimize - # if first_opt_op_idx == len(block.ops): return - - if self._is_backward_op(op) and ( - self._op_role_var_key in op.attr_names - ): - op_role_var = op.attr(self._op_role_var_key) - if len(op_role_var) == 0: - continue - assert len(op_role_var) % 2 == 0 - for i in range(0, len(op_role_var), 2): - offset = 0 - param_name = op_role_var[i] - if not block.has_var(param_name): - continue - if '@BroadCast' in param_name: - continue - - param_grad_name = param_name + core.grad_var_suffix() - merged_param_grad_name = param_grad_name + merged_suffix - if not block.has_var(merged_param_grad_name): - self._create_var( - block, - block.vars[param_name], - merged_param_grad_name, - dtype, - ) - assert block.has_var(merged_param_grad_name) - - param_grad_var = block.var(param_grad_name) - merged_param_grad_var = block.var(merged_param_grad_name) - merged_param_grad_var.persistable = True - block._insert_op( - index=first_opt_op_idx + offset, - type='fill_constant', - inputs={}, - outputs={'Out': [merged_param_grad_var]}, - attrs={ - 'shape': merged_param_grad_var.shape, - 'dtype': merged_param_grad_var.dtype, - 'value': float(0), - # a trick to run this op once per mini-batch - self._op_role_key: self._op_role.Optimize.LRSched, - }, - ) - offset += 1 - grad_name = op_role_var[i + 1] - 
grad_var = block.vars[grad_name] - - is_fp16_grad = 'cast_fp16' in grad_name - need_cast = is_fp16_grad is not fp16_allreduce - - if need_cast: - # if fp16_allreduce: - # cast grad to fp16 to accumulate to merged gradient - # else: - # cast grad to fp32 to accumulate to merged gradient - cast_grad_var_name = param_grad_name + '@TMP' - cast_grad_var = self._create_var( - block, param_grad_var, cast_grad_var_name, dtype - ) - cast_grad_var.persistable = False - block._insert_op( - index=first_opt_op_idx + offset, - type='cast', - inputs={'X': grad_var}, - outputs={'Out': cast_grad_var}, - attrs={ - 'in_dtype': grad_var.dtype, - 'out_dtype': cast_grad_var.dtype, - self._op_role_key: self._op_role.Backward, - }, - ) - offset += 1 - grad_var = cast_grad_var - - block._insert_op( - index=first_opt_op_idx + offset, - type='sum', - inputs={'X': [merged_param_grad_var, grad_var]}, - outputs={'Out': merged_param_grad_var}, - attrs={ - self._op_role_key: self._op_role.Backward, - }, - ) - offset += 1 - merged_gradient_names.append(merged_param_grad_name) - - if not fp16_allreduce: - return merged_gradient_names - - first_opt_op_idx = None - for index, op in reversed(tuple(enumerate(list(block.ops)))): - if self._is_backward_op(op) and first_opt_op_idx is None: - first_opt_op_idx = index + 1 - break - assert first_opt_op_idx is not None - - # insert cast op from fp16->fp32 - # FIXME(wangxi): maybe put in sharding is better, for some grad - # is not in sharding device. - for fp16_grad_name in merged_gradient_names: - grad_name = fp16_grad_name.replace('@FP16', '') - param_name = fp16_grad_name.replace('@GRAD@MERGED@FP16', '') - - if not block.has_var(grad_name): - self._create_var(block, block.vars[param_name], grad_name) - assert block.has_var(grad_name) - - fp16_grad_var = block.var(fp16_grad_name) - grad_var = block.var(grad_name) - grad_var.persistable = False - - block._insert_op( - index=first_opt_op_idx, - type='cast', - inputs={'X': fp16_grad_var}, - outputs={'Out': grad_var}, - attrs={ - 'in_dtype': fp16_grad_var.dtype, - 'out_dtype': grad_var.dtype, - self._op_role_key: self._op_role.Optimize, - }, - ) - - return merged_gradient_names - - def _insert_accumulate_gradients_with_fuse( - self, main_block, fp16, fused_size, grad_param_pairs, first_opt_op_idx - ): - grad_param_pairs = self._sort_grad_param_by_dtype( - main_block, grad_param_pairs - ) - - grad_param_segments = [] - merged_suffix = '@MERGED@FP16' if fp16 else '@MERGED' - dtype = paddle.float16 if fp16 else paddle.float32 - cur_size = 0.0 - last_dtype = None - # split the grad based on dtype and fused size - for grad, param in grad_param_pairs: - real_grad = main_block.var(grad) - # create the gradient merged var for each grad - merged_grad_var = main_block.create_var( - name=param + core.grad_var_suffix() + merged_suffix, - dtype=dtype, - shape=real_grad.shape, - persistable=True, - stop_gradient=False, - ) - real_param = main_block.var(param) - if hasattr(real_param, 'is_distributed'): - merged_grad_var.is_distributed = real_param.is_distributed - tmp_size = self._get_var_size(real_grad) - # two strategies for splitting the grad - # 1. the current segment's size reach the user defined grad_size_in_MB - # 2. 
the upcoming grad holds different dtype compared with grads in current segment - if ( - len(grad_param_segments) == 0 - or cur_size + tmp_size > fused_size - or real_grad.dtype != last_dtype - ): - grad_param_segments.append( - ([real_grad], [real_param], [merged_grad_var]) - ) - last_dtype = real_grad.dtype - cur_size = 0.0 - else: - grad_param_segments[-1][0].append(real_grad) - grad_param_segments[-1][1].append(real_param) - grad_param_segments[-1][2].append(merged_grad_var) - cur_size += tmp_size - - fused_gradients = [] - fused_merged_gradients = [] - # create fused vars for grad and param - for grad_param_segment in grad_param_segments: - grad_segment = grad_param_segment[0] - merged_grad_segment = grad_param_segment[2] - fused_grad = main_block.create_var( - name='FusedGrad_{}'.format(grad_segment[0].name), - dtype=grad_segment[0].dtype, - persistable=False, - stop_gradient=False, - ) - # keep the '.cast_fp16' info in the fuse var name - fused_merged_grad_name_prefix = ( - 'FusedMergedGrad.cast_fp16.' - if merged_grad_segment[0].dtype == paddle.float16 - else 'FusedMergedGrad' - ) - fused_merged_grad_name = ( - fused_merged_grad_name_prefix - + '_{}'.format(merged_grad_segment[0].name) - ) - fused_merged_grad = main_block.create_var( - name=fused_merged_grad_name, - dtype=merged_grad_segment[0].dtype, - persistable=True, - stop_gradient=False, - ) - fused_gradients.append(fused_grad) - fused_merged_gradients.append(fused_merged_grad) - - assert len(fused_gradients) == len(grad_param_segments) - assert len(fused_merged_gradients) == len(grad_param_segments) - - # insert coalesce op at the start of the backward pass - # use param as the coalesce input to make sure the two Fused vars are in same shape - first_back_op_idx = None - for index, op in enumerate(main_block.ops): - if self._is_backward_op(op) and first_back_op_idx is None: - first_back_op_idx = index - break - assert first_back_op_idx is not None - offset = 0 - for i in range(len(grad_param_segments)): - fused_grad = fused_gradients[i] - fused_merged_grad = fused_merged_gradients[i] - grads = grad_param_segments[i][0] - params = grad_param_segments[i][1] - merged_grads = grad_param_segments[i][2] - main_block._insert_op_without_sync( - first_back_op_idx + offset, - type="coalesce_tensor", - inputs={"Input": params}, - outputs={"Output": grads, "FusedOutput": fused_grad}, - attrs={ - # Explanation of user_defined_size_of_dtype: - # In coalesce op, the align size is 256 bytes - # the float takes 4 bytes while fp16 takes 2 bytes. - # To meet the requirement, 128 fp16 or 64 float will be aligned - # Think the total shape of the input tensors if [64], - # if the dtype is float, then the shape of the fuse var is [64] - # however if the dytpe if fp16, the shape of the fuse var is [128], - # which will cause the fused vars' shape vary between each other. - # To make sure the shape of the fused vars are identical, - # we set the dtype of float and fp16 both to 2. - # Under this way, the fused vars' shape for float and fp16 are all [128] - "user_defined_size_of_dtype": 2, - "copy_data": False, - "use_align": True, - "dtype": grads[0].dtype, - self._op_role_key: self._op_role.Backward, - # On npu, the nan/inf check login is different with gpu. - # If there are some not initialized sections in the fused var, - # and the value in those sections are nan/inf, it will trigger the nan/inf check. 
- # To avoid these problematic triggers, set constant is needed for npu - "set_constant": core.is_compiled_with_custom_device('npu'), - "constant": float(0.0), - }, - ) - offset += 1 - # For the gradient_merged_fused_var, given a init value during the coalesce op - # this will remove a problematic fill_constant op. This op role of this coalesce - # is set to be LRSched to make this coalesce (with init) only run once - main_block._insert_op_without_sync( - first_back_op_idx + offset, - type="coalesce_tensor", - inputs={"Input": params}, - outputs={ - "Output": merged_grads, - "FusedOutput": fused_merged_grad, - }, - attrs={ - "user_defined_size_of_dtype": 2, - "set_constant": True, - "constant": float(0.0), - "copy_data": False, - "use_align": True, - "dtype": merged_grads[0].dtype, - self._op_role_key: self._op_role.Optimize.LRSched, - }, - ) - offset += 1 - - # insert gradient merge relating ops - first_opt_op_idx += offset - offset = 0 - for i in range(len(fused_gradients)): - fused_grad = fused_gradients[i] - fused_merged_grad = fused_merged_gradients[i] - is_fp16_grad = 'cast_fp16' in fused_grad.name - need_cast = is_fp16_grad is not fp16 - if need_cast: - # for fp16 allreduce, cast fp32 grad to fp16 - # for fp32 allreduce, cast fp16 grad to fp32 - cast_grad_var_name = fused_grad.name + '@TMP' - cast_grad_var = main_block.create_var( - name=cast_grad_var_name, - dtype=dtype, - persistable=False, - stop_gradient=False, - ) - main_block._insert_op( - index=first_opt_op_idx + offset, - type='cast', - inputs={'X': fused_grad}, - outputs={'Out': cast_grad_var}, - attrs={ - 'in_dtype': fused_grad.dtype, - 'out_dtype': cast_grad_var.dtype, - self._op_role_key: self._op_role.Backward, - }, - ) - offset += 1 - fused_grad = cast_grad_var - main_block._insert_op( - index=first_opt_op_idx + offset, - type='sum', - inputs={'X': [fused_merged_grad, fused_grad]}, - outputs={'Out': fused_merged_grad}, - attrs={self._op_role_key: self._op_role.Backward}, - ) - offset += 1 - - if fp16: - # if using fp16 allreduce, the optimizer needs fp32 grads, cast them back to fp32 - for grad, param in grad_param_pairs: - real_grad = main_block.var(grad) - fp16_grad_name = param + core.grad_var_suffix() + '@MERGED@FP16' - assert main_block.has_var(fp16_grad_name) - fp16_grad = main_block.var(fp16_grad_name) - fp32_grad_name = param + core.grad_var_suffix() + '@MERGED' - fp32_grad = main_block.create_var( - name=fp32_grad_name, - dtype=paddle.float32, - shape=real_grad.shape, - persistable=False, - stop_gradient=False, - ) - main_block._insert_op( - index=first_opt_op_idx + offset, - type='cast', - inputs={'X': fp16_grad}, - outputs={'Out': fp32_grad}, - attrs={ - 'in_dtype': paddle.float16, - 'out_dtype': paddle.float32, - self._op_role_key: self._op_role.Optimize, - }, - ) - offset += 1 - - # replace the var with it's name, which will be used for inserting allreduce - for i in range(len(fused_merged_gradients)): - fused_merged_gradients[i] = fused_merged_gradients[i].name - - return fused_merged_gradients, first_opt_op_idx - - def _accumulate_gradients_with_fuse( - self, main_block, fp16, fused_size, shard=None - ): - first_opt_op_idx = None - grad_param_pairs = [] - # obtain all param/grad pairs that needed to be fused - for index, op in reversed(tuple(enumerate(list(main_block.ops)))): - # remove the cast op of fp16 grad to fp32 grad - if self._is_optimize_op(op) and op.type == 'cast': - in_name = op.input_arg_names[0] - out_name = op.output_arg_names[0] - if out_name.strip('@GRAD') in self._param_device_map: - 
assert in_name.replace('.cast_fp16', '') == out_name - main_block._remove_op(index) - continue - - if self._is_backward_op(op) and first_opt_op_idx is None: - first_opt_op_idx = index + 1 - # no optimize phase - if first_opt_op_idx == len(main_block.ops): - return - - if self._is_backward_op(op) and ( - self._op_role_var_key in op.attr_names - ): - op_role_var = op.attr(self._op_role_var_key) - if len(op_role_var) == 0: - continue - assert len(op_role_var) % 2 == 0 - for i in range(0, len(op_role_var), 2): - param_name = op_role_var[i] - if not main_block.has_var(param_name): - continue - if '@BroadCast' in param_name: - continue - grad_param_pairs.append( - (op_role_var[i + 1], op_role_var[i]) - ) - - if len(grad_param_pairs) == 0: - return - - nranks = shard.worker_num if shard else 1 - device_to_pairs = [[] for _ in range(nranks)] - for pair in grad_param_pairs: - root_id = shard.device(pair[1]) if shard else 0 - assert 0 <= root_id < nranks - device_to_pairs[root_id].append(pair) - - all_fused_merged_gradients = [] - for pairs in device_to_pairs: - ( - fused_merged_gradients, - first_opt_op_idx, - ) = self._insert_accumulate_gradients_with_fuse( - main_block, fp16, fused_size, pairs, first_opt_op_idx - ) - all_fused_merged_gradients += fused_merged_gradients - - main_block._sync_with_cpp() - return all_fused_merged_gradients - - def _sort_grad_param_by_dtype(self, main_block, grad_param_pairs): - # sort the grad param paris by the dtype - fp16_pairs = [] - fp32_pairs = [] - other_pairs = [] - for pairs in grad_param_pairs: - dtype = main_block.var(pairs[0]).dtype - if dtype == paddle.float32: - fp32_pairs.append(pairs) - elif dtype == paddle.float16: - fp16_pairs.append(pairs) - else: - other_pairs.append(pairs) - sorted_pairs = fp16_pairs - sorted_pairs.extend(fp32_pairs) - sorted_pairs.extend(other_pairs) - return sorted_pairs - - def _get_var_size(self, var): - dtype_to_size = { - core.VarDesc.VarType.FP16: 2, - core.VarDesc.VarType.BF16: 2, - core.VarDesc.VarType.FP32: 4, - core.VarDesc.VarType.FP64: 8, - core.VarDesc.VarType.INT16: 2, - core.VarDesc.VarType.INT32: 4, - core.VarDesc.VarType.INT64: 8, - core.VarDesc.VarType.BOOL: 1, - core.VarDesc.VarType.UINT8: 1, - } - assert -1 not in var.shape - return ( - reduce(lambda x, y: x * y, var.shape, 1) - * dtype_to_size[var.dtype] - / 1024.0 - / 1024.0 - ) - - def _add_sub_blocks(self, main_block, program_list): - main_program = main_block.program - for prog in program_list: - for op in prog.block(0).ops: - if not op.has_attr('sub_block'): - continue - origin_sub_block_id = op.attr('sub_block').id - origin_sub_block = main_program.block(origin_sub_block_id) - new_sub_block = prog._create_block(parent_idx=0) - for sub_op in origin_sub_block.ops: - op_desc = sub_op.desc - ap_op = new_sub_block.desc.append_op() - ap_op.copy_from(op_desc) - new_sub_block._sync_with_cpp() - self._create_vars(new_sub_block, origin_sub_block) - op._set_attr('sub_block', new_sub_block) - - def _get_device_info(self, block): - for op in block.ops: - if not op._has_kernel(op.type): - continue - op_device = op.attr(self._op_device_key) - return op_device - - def _process_persistable_vars_in_multi_sections( - self, main_program, startup_prog, program_list - ): - """ - Special Case: process persistable vars that exist in - multiple sections, e.g., shared weight - """ - # var_info = {var_name: [program1, program2...]}, - # persistable var only - var_info = dict() - for prog in program_list: - block = prog.block(0) - for var_name in block.vars: - if var_name == 
"double_buffer_0": - continue - var = block.var(var_name) - if not var.persistable: - continue - if not var_name in var_info: - var_info[var_name] = [] - if not prog in var_info[var_name]: - var_info[var_name].append(prog) - for var_name in list(var_info.keys()): - if len(var_info[var_name]) == 1: - var_info.pop(var_name) - - # write_info = {var_name: program}, where program is the only program - # in which the var named var_name is written. - write_info = dict() - for var_name in var_info.keys(): - for prog in var_info[var_name]: - block = prog.block(0) - for op in block.ops: - if ( - op.type == "recv_v2" - or op.type == "create_py_reader" - or op.type == "read" - or op.type == "update_loss_scaling" - ): - continue - # We have processed lr related vars - if op.attr(self._op_role_key) == int( - self._op_role.Optimize.LRSched - ): - continue - if var_name in op.desc.output_arg_names(): - assert var_name not in write_info, ( - "two sections write the same var({}): second " - "op {}.".format(var_name, op) - ) - write_info[var_name] = prog - break - - for var_name in var_info.keys(): - # Case 1: read only variables, no special process - if not var_name in write_info: - continue - - # Case 2: one write multiple reads - write_prog = write_info[var_name] - write_block = write_prog.block(0) - write_device = self._get_device_info(write_block) - write_dev_index = int(write_device.split(':')[1]) - all_progs = var_info[var_name] - for prog in all_progs: - if prog == write_prog: - continue - read_block = prog.block(0) - read_device = self._get_device_info(read_block) - read_dev_index = int(read_device.split(':')[1]) - pair = (write_dev_index, read_dev_index) - pair_key = write_dev_index * 1000 + read_dev_index - if pair not in self._pipeline_pair: - self._pipeline_pair.append(pair) - self._pp_ring_map[pair_key] = self.ring_id - ring_id = self.ring_id - self.ring_id += 1 - else: - ring_id = self._pp_ring_map[pair_key] - - write_block._insert_op( - index=0, - type='send_v2', - inputs={ - 'X': write_block.var(var_name), - }, - attrs={ - self._op_device_key: write_device, - 'use_calc_stream': False, - # A trick to make the role LRSched to avoid copy every - # microbatch - self._op_role_key: self._op_role.LRSched, - 'peer': read_dev_index, - 'ring_id': ring_id, - }, - ) - read_block._insert_op( - index=0, - type='recv_v2', - outputs={'Out': [read_block.var(var_name)]}, - attrs={ - 'out_shape': read_block.var(var_name).shape, - 'dtype': read_block.var(var_name).dtype, - self._op_device_key: read_device, - 'use_calc_stream': False, - # A trick to make the role LRSched to avoid copy every - # microbatch - self._op_role_key: self._op_role.LRSched, - 'peer': write_dev_index, - 'ring_id': ring_id, - }, - ) - read_block._insert_op( - index=1, - type='c_sync_comm_stream', - inputs={'X': [read_block.var(var_name)]}, - outputs={'Out': [read_block.var(var_name)]}, - attrs={ - self._op_device_key: read_device, - # A trick to make the role LRSched to avoid copy every - # microbatch - self._op_role_key: self._op_role.LRSched, - 'ring_id': ring_id, - }, - ) - - def _is_gradient_clip_op(self, op): - return op.desc.has_attr("op_namescope") and op.desc.attr( - "op_namescope" - ).startswith("/gradient_clip") - - def _is_regularization_op(self, op): - return op.desc.has_attr("op_namescope") and op.desc.attr( - "op_namescope" - ).startswith("/regularization") - - def _is_weight_decay_op(self, op): - # in AdamW namescope is /optimizer_*/weight decay/ - return op.desc.has_attr( - "op_namescope" - ) and 'weight decay' in 
op.desc.attr("op_namescope") - - def _get_input_output_info(self, block): - ''' - Get info of op input and output. - ''' - # A map from output var to op which generate it. - output_var_to_op = defaultdict(list) - # A map from var to op which takes it as input. - input_var_to_op = defaultdict(list) - - for index, op in enumerate(block.ops): - for var_name in op.input_arg_names: - input_var_to_op[var_name].append([op, index]) - for var_name in op.output_arg_names: - output_var_to_op[var_name].append([op, index]) - - return output_var_to_op, input_var_to_op - - def _optimize_forward_send_sync(self, program): - """ - optimize forward send's sync_comm_stream schedule - """ - if self.schedule_mode != '1F1B': - return - - block = program.block(0) - - recv_type = 'recv_v2' if self.mp_degree == 1 else 'partial_recv' - backward_recv_index = None - for index, op in enumerate(block.ops): - if op.type == recv_type and self._is_backward_op(op): - backward_recv_index = index - break - - # last pipeline stage - if backward_recv_index is None: - return - - offset = 0 - for index, op in enumerate(list(block.ops)): - if index >= backward_recv_index: - break - if op.type == 'c_sync_comm_stream' and op.has_attr('pipeline_flag'): - var_name = op.input_arg_names[0] - var = block.var(var_name) - block._remove_op(index + offset, sync=False) - offset -= 1 - # NOTE: - # 1. When the backward recv is completed, it indicates - # that the forward send is completed too. So we only need - # to use the NOP op to prevent memory release. - # 2. Because we removed sync_comm_op, - # we will insert NOP after recv_op. - block._insert_op_without_sync( - index=backward_recv_index, - type='nop', - inputs={'X': [var]}, - outputs={'Out': [var]}, - attrs={self._op_role_key: self._op_role.Backward}, - ) - block._sync_with_cpp() - - def _mv_head_recv(self, program): - """ - A pass to move the recv op to the beginning of - the forward/backward phase - """ - forward_insert_index = 0 - backward_insert_index = None - block = program.global_block() - num_ops = len(program.global_block().ops) - for i in range(num_ops): - insert_index = None - op = program.global_block().ops[i] - op_role = int(op.attr(self._op_role_key)) - if ( - op_role == int(self._op_role.Backward) - and backward_insert_index is None - ): - backward_insert_index = i - if ( - op.type != "partial_recv" - and op.type != "partial_allgather" - and op.type != "nop" - and op.type != "recv_v2" - ): - continue - if op_role == int(self._op_role.Forward): - if i == forward_insert_index: - forward_insert_index += 1 - continue - insert_index = forward_insert_index - elif op_role == int(self._op_role.Backward): - if i == backward_insert_index: - backward_insert_index += 1 - continue - insert_index = backward_insert_index - else: - raise ValueError("Unknown op_role: {}".format(op_role)) - op_inputs = dict() - for name in op.input_names: - op_inputs[name] = op.input(name) - op_outputs = dict() - for name in op.output_names: - op_outputs[name] = op.output(name) - block._insert_op_without_sync( - index=insert_index, - type=op.type, - inputs=op_inputs, - outputs=op_outputs, - attrs=op.all_attrs(), - ) - block._remove_op(i + 1) - if op_role == int(self._op_role.Forward): - forward_insert_index += 1 - elif op_role == int(self._op_role.Backward): - backward_insert_index += 1 - block._sync_with_cpp() - - def _check_pipeline_persist_var(self, program): - """ - Pipeline may need multiple forward before - """ - block = program.global_block() - - persist_output = set() - used_in_backward = set() - 
for op in block.ops: - if self._is_forward_op(op): - for var_name in op.output_arg_names: - var = block.vars[var_name] - if var.persistable: - persist_output.add(var_name) - elif self._is_backward_op(op): - for var_name in op.input_arg_names: - if var_name in persist_output: - used_in_backward.add(var_name) - if len(used_in_backward) == 0: - return - warnings.warn( - "The pipeline requires multiple forward calculations before backward, " - "so when the persistable var is changed in the forward, it may cause " - "errors in the backward calculation who using this persistable var. " - "However, some backward op don't need this var(NoNeedBufferVars), " - "there will be no error at this time.\n" - "So please check these persistable vars which changed in " - "forward and used in backward:\n{}".format(used_in_backward) - ) - - def minimize( - self, loss, startup_program=None, parameter_list=None, no_grad_set=None - ): - main_block = loss.block - self.origin_main_block = main_block - main_program = main_block.program - if startup_program is None: - startup_program = default_startup_program() - - pipeline_opt = main_program._pipeline_opt - assert pipeline_opt, 'Please use pipeline with fleet.' - required_keys = [ - 'local_rank', - 'schedule_mode', - 'micro_batch_size', - 'ring_id', - 'global_ring_id', - 'use_sharding', - 'mp_degree', - 'mp_rank', - ] - for key in required_keys: - assert ( - key in pipeline_opt - ), 'Please use pipeline with fleet to use {}.'.format(key) - self.local_rank = pipeline_opt['local_rank'] - self.schedule_mode = pipeline_opt['schedule_mode'] - self.micro_batch_size = pipeline_opt['micro_batch_size'] - self.use_sharding = pipeline_opt['use_sharding'] - self.ring_id = pipeline_opt['ring_id'] - self.global_ring_id = pipeline_opt['global_ring_id'] - self.mp_degree = pipeline_opt['mp_degree'] - self.mp_rank = pipeline_opt['mp_rank'] - self.scale_gradient = pipeline_opt.get('scale_gradient', False) - assert self.mp_degree >= 1 - assert 0 <= self.mp_rank < self.mp_degree - - optimize_ops, params_grads = self._optimizer.minimize( - loss, startup_program, parameter_list, no_grad_set - ) - self._param_device_map = self._origin_optimizer._param_device_map - - ( - self.output_var_to_op, - self.input_var_to_op, - ) = self._get_input_output_info(main_block) - # Step1: add default op_device attribute for ops. - self._add_op_device_attr(main_block) - device_list = self._check_validation(main_block) - - def device_cmp(device1, device2): - dev1_id = int(device1.split(':')[1]) - dev2_id = int(device2.split(':')[1]) - if dev1_id < dev2_id: - return -1 - elif dev1_id > dev2_id: - return 1 - else: - return 0 - - sorted_device_list = sorted(device_list, key=cmp_to_key(device_cmp)) - assert sorted_device_list == device_list, ( - "With pipeline parallelism, you must use gpu devices one after " - "another in the order of their ids." - ) - # Step2: add send and recv ops between section boundaries - self._insert_sendrecv_ops_for_boundaries(main_block) - - # Step3: split program into sections and add pairs of - # send and recv ops for data var. - main_program = main_block.program - program_list = self._split_program(main_program, device_list) - for p in program_list: - self._create_vars(p.global_block(), main_block) - - if os.getenv("PADDLE_MANUAL_PIPELINE_STAGE", None): - self.local_rank = int(os.getenv("PADDLE_MANUAL_PIPELINE_STAGE")) - assert self.local_rank < len(device_list), ( - "Manually specified " - "pipeline stage must be less than total number of pipeline " - "stages." 
- ) - else: - self.local_rank %= len(device_list) - # Step3.5: optimize forward send sync_comm to overlap send and recv - self._optimize_forward_send_sync(program_list[self.local_rank]) - - # Step4: Special Case: process persistable vars that exist in - # multiple sections - # FIXME - # self._process_persistable_vars_in_multi_sections( - # main_program, startup_program, program_list) - - # Step5: Add sub blocks for section programs - self._add_sub_blocks(main_block, program_list) - - place_list = [] - for dev in device_list: - dev_index = int(dev.split(":")[1]) - if core.is_compiled_with_cuda(): - place_list.append(core.CUDAPlace(dev_index % 1)) - - # Step6: Split startup program - new_startup_program = self._split_startup_program( - startup_program, self.local_rank - ) - - startup_program._pipeline_opt = { - "startup_program": new_startup_program, - } - real_block = program_list[self.local_rank].global_block() - if not self.scale_gradient: - self._insert_loss_scale(real_block) - if not self.use_sharding: - # Step7: clear gradients before each mini-batch and - # accumulate gradients during backward - self._rename_gradient_var_name(real_block) - real_block._sync_with_cpp() - self._accumulate_gradients(real_block) - real_block._sync_with_cpp() - - if core.is_compiled_with_cuda(): - place_id = int(os.getenv("FLAGS_selected_gpus", "0")) - # A pass to move the recv op to the beginning of - # the forward/backward phase - self._mv_head_recv(program_list[self.local_rank]) - - # A pass to check pipeline persist var which changed in - # forward and used in backward - self._check_pipeline_persist_var(program_list[self.local_rank]) - - main_program._pipeline_opt = { - "trainer": "PipelineTrainer", - "device_worker": "Section", - "pipeline_stage": self.local_rank, - "num_pipeline_stages": len(device_list), - "schedule_mode": self.schedule_mode, - "inner_parallelism": len(device_list), - "section_program": program_list[self.local_rank], - "place": place_list[self.local_rank], - "place_id": place_id, - "sync_steps": -1, - "num_microbatches": self._num_microbatches, - "start_cpu_core_id": self._start_cpu_core_id, - } - return ( - optimize_ops, - params_grads, - program_list, - self._pipeline_pair, - self._pp_ring_map, - ) - - -class RecomputeOptimizer(Optimizer): - """ - :api_attr: Static Graph - - Recompute Optimizer Wrapper - - Normally, a training step contains three sub-steps: first, run forward - Operators to calculate the loss; second, run backward Operators to - calculate gradient of the parameters; third, apply optimization method - to update the value of the parameters. - - In the forward computation process, all variables that are needed by - backward computation process will be kept in memory, which occupy a great - amount of memory when the network becomes very deep. - - Recompute split the network to k segments. In each segment, It will - recompute the forward Operators, before running backward operators. It is - very helpful for saving memory. - - The Variables that separate a network to segments are called as checkpoints, - and users should set it manually. The usage is very simple: - - Args: - optimizer (Optimizer): The optimizer that is applied to parameters. - - Examples: - .. 
code-block:: python - - import paddle - import paddle.fluid as fluid - import numpy as np - - paddle.enable_static() - - def gen_data(): - return {"x": np.random.random(size=(32, 32)).astype('float32'), - "y": np.random.randint(2, size=(32, 1)).astype('int64')} - def mlp(input_x, input_y, hid_dim=128, label_dim=2): - print(input_x) - fc_1 = paddle.static.nn.fc(x=input_x, size=hid_dim) - prediction = paddle.static.nn.fc(x=[fc_1], size=label_dim, activation='softmax') - cost = paddle.nn.functional.cross_entropy( - input=prediction, label=input_y, - reduction='none', use_softmax=False - ) - sum_cost = paddle.mean(cost) - return sum_cost, fc_1, prediction - input_x = paddle.static.data(name="x", shape=[-1,32], dtype='float32') - input_y = paddle.static.data(name="y", shape=[-1,1], dtype='int64') - cost, fc_1, pred = mlp(input_x, input_y) - - sgd = fluid.optimizer.Adam(learning_rate=0.01) - sgd = fluid.optimizer.RecomputeOptimizer(sgd) - sgd._set_checkpoints([fc_1, pred]) - sgd.minimize(cost) - - print("Finished optimize") - place = fluid.CPUPlace() - exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) - step = 10 - - for i in range(step): - cost_val = exe.run(feed=gen_data(), - program=fluid.default_main_program(), - fetch_list=[cost.name]) - print("step=%d cost=%f" % (i, cost_val[0])) - - """ - - def __init__(self, optimizer): - if in_dygraph_mode(): - raise Exception("In dygraph, don't support RecomputeOptimizer.") - self._optimizer = optimizer - self._checkpoints = None - self._learning_rate = self._optimizer._learning_rate - self._learning_rate_map = self._optimizer._learning_rate_map - self.enable_offload = False - - def _set_checkpoints(self, checkpoints): - """ - Args: - checkpoints (list): List of Variable or string - """ - assert isinstance( - checkpoints, list - ), "_checkpoints should be a list of Variable or a list of String" - for ckpt in checkpoints: - assert isinstance(ckpt, str) or isinstance( - ckpt, Variable - ), "_checkpoints should be a list of Variable or a list of String" - self._checkpoints = checkpoints - - # should enable offload before calling backward - def _enable_offload(self): - self.enable_offload = True - - @framework.deprecate_stat_dict - def load(self, state_dict): - """ - :api_attr: Static Graph - - load function is not supported by Recompute Optimizer for now. - :return: None - - Args: - state_dict: the dict load by load_persistable method - - Examples: - .. 
code-block:: python - - import paddle - import paddle.fluid as fluid - - paddle.enable_static() - def mlp(input_x, input_y, hid_dim=128, label_dim=2): - fc_1 = paddle.static.nn.fc(x=input_x, size=hid_dim) - prediction = paddle.static.nn.fc(x=[fc_1], size=label_dim, activation='softmax') - cost = paddle.nn.functional.cross_entropy( - input=prediction, label=input_y, - reduction='none', use_softmax=False - ) - sum_cost = paddle.mean(cost) - return sum_cost, fc_1, prediction - - input_x = paddle.static.data(name="x", shape=[-1,32], dtype='float32') - input_y = paddle.static.data(name="y", shape=[-1,1], dtype='int64') - cost, fc_1, pred = mlp(input_x, input_y) - print("Finished FF") - - sgd = fluid.optimizer.Adam(learning_rate=0.01) - sgd = fluid.optimizer.RecomputeOptimizer(sgd) - sgd._set_checkpoints([fc_1, pred]) - try: - state_dict = {} - sgd.load(state_dict) - except NotImplementedError as e: - print(e) - """ - raise NotImplementedError( - "load function is not supported by Recompute Optimizer for now" - ) - - def apply_gradients(self, params_grads): - """ - call apply_gradients function of self._optimizer. - - Args: - params_grads (list): list of (param, grad) pair to do optimization. - - Returns: - list: A list of operators appended to the current program. - - Examples: - .. code-block:: python - - import paddle - import paddle.fluid as fluid - import paddle.fluid.framework as framework - - paddle.enable_static() - - def mlp(input_x, input_y, hid_dim=128, label_dim=2): - fc_1 = paddle.static.nn.fc(x=input_x, size=hid_dim) - prediction = paddle.static.nn.fc(x=[fc_1], size=label_dim, activation='softmax') - cost = paddle.nn.functional.cross_entropy( - input=prediction, label=input_y, - reduction='none', use_softmax=False - ) - sum_cost = paddle.mean(cost) - return sum_cost, fc_1, prediction - - - input_x = paddle.static.data(name="x", shape=[-1,32], dtype='float32') - input_y = paddle.static.data(name="y", shape=[-1,1], dtype='int64') - cost, fc_1, pred = mlp(input_x, input_y) - print("Finished FF") - - sgd = fluid.optimizer.Adam(learning_rate=0.01) - sgd = fluid.optimizer.RecomputeOptimizer(sgd) - sgd._set_checkpoints([fc_1, pred]) - params_grads = sgd.backward( - cost, - startup_program=None, - parameter_list=None, - no_grad_set=None) - - program = cost.block.program - with framework.program_guard(program, None): - optimize_ops = sgd.apply_gradients(params_grads) - - print("Finished apply gradients") - """ - - return self._optimizer.apply_gradients(params_grads=params_grads) - - def _creat_vars(self, varname): - pinned_var_name = unique_name.generate(varname + "@Pinned") - fetched_var_name = unique_name.generate(varname + "@Fetch") - - pinned_var = self._main_program.global_block().create_var( - name=pinned_var_name, - shape=self.checkpoint_shape, - dtype=self._main_program.global_block().var(varname).dtype, - persistable=False, - stop_gradient=True, - ) - - fetch_var = self._main_program.global_block().create_var( - name=fetched_var_name, - shape=self.checkpoint_shape, - dtype=self._main_program.global_block().var(varname).dtype, - persistable=False, - stop_gradient=False, - ) - - return pinned_var_name, fetched_var_name - - def _append_fill_constant_ops(self, startup_program): - """ - add fill_constant_ops to the end of the prog - - we should fill the pinned vars before runing the main_prog - to instantiate their tensor hold_, which could tell us whether - the host memory could hold all the checkpoints from all the - GPU devices in this node. 
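# --- Editor's aside: illustrative sketch only, not part of this diff ----------
# The docstring above explains why the pinned buffers are filled eagerly at
# startup: allocating them all up front shows immediately whether host memory
# can hold every offloaded checkpoint from every GPU in the node. The helper
# below only estimates that requirement; the dtype table and the gpus_per_node
# argument are illustrative assumptions, not part of the optimizer's API.
from functools import reduce

_DTYPE_BYTES = {"float16": 2, "bfloat16": 2, "float32": 4, "float64": 8}

def pinned_mem_required_mb(checkpoint_shape, dtype, num_checkpoints,
                           gpus_per_node=1):
    """Rough CUDAPinned footprint in MB, ignoring allocator alignment."""
    numel = reduce(lambda a, b: a * b, checkpoint_shape, 1)
    per_checkpoint = numel * _DTYPE_BYTES[dtype]
    return per_checkpoint * num_checkpoints * gpus_per_node / (1024.0 * 1024.0)

# e.g. 20 checkpoints of shape [12, 512, 1024] in float32 on an 8-GPU node:
# pinned_mem_required_mb([12, 512, 1024], "float32", 20, gpus_per_node=8)  # ~3840 MB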
- """ - op_role = 0 - block = startup_program.global_block() - fill_constant_vars = self.checkpoint_name2pinned_name.values() - OP_ROLE_KEY = core.op_proto_and_checker_maker.kOpRoleAttrName() - for varname in fill_constant_vars: - var = self._main_program.global_block().var(varname) - # NOTE (JZ-LIANG) to pre-allocate the CUDAPinned MEM - pinned_var = block.create_var( - name=varname, - shape=self.checkpoint_shape, - dtype=self._main_program.global_block().var(var.name).dtype, - persistable=False, - stop_gradient=True, - ) - block.append_op( - type='fill_constant', - outputs={'Out': varname}, - attrs={ - "shape": var.shape, - "dtype": var.dtype, - "value": 0.0, - "place_type": 2, - OP_ROLE_KEY: op_role, - }, - ) - - return - - def _insert_async_memcpy_op( - self, insert_idx, src_varname, dst_varname, op_role, dst_place_type - ): - OP_ROLE_KEY = core.op_proto_and_checker_maker.kOpRoleAttrName() - self.block._insert_op_without_sync( - insert_idx, - type='memcpy', - inputs={'X': [self._main_program.global_block().var(src_varname)]}, - outputs={ - 'Out': [self._main_program.global_block().var(dst_varname)] - }, - attrs={"dst_place_type": int(dst_place_type), OP_ROLE_KEY: op_role}, - ) - - def _insert_fetch_op(self, idx, varname): - assert ( - varname in self.checkpoint_name2pinned_name - ), "Try to fetch {} from Pinned Memory, but it is NOT a checkpoint".format( - varname - ) - - pinned_varname = self.checkpoint_name2pinned_name[varname] - fetch_varname = self.checkpoint_name2fetch_name[varname] - self._insert_async_memcpy_op(idx, pinned_varname, fetch_varname, 1, 1) - - def _insert_offload_op(self, idx, varname): - assert ( - varname in self.checkpoint_name2pinned_name - ), "Try to offload {} to Pinned Memory, but it is NOT a checkpoint".format( - varname - ) - pinned_varname = self.checkpoint_name2pinned_name[varname] - self._insert_async_memcpy_op(idx, varname, pinned_varname, 0, 2) - - def _insert_sync_op(self, op_idx, checkpoint_name): - # single stream offload no need sync - pass - - def _record_fetch_op(self, idx): - assert ( - len(self.un_fetch_checkpoint_names) > 0 - ), "Could NOT found checkpoint to fetch" - checkpoint_name = self.un_fetch_checkpoint_names.pop(-1) - logging.debug("Record fetch [{}]".format(checkpoint_name)) - self.idx2insertions[idx] = ("fetch", checkpoint_name) - - return checkpoint_name - - def _record_offload_op(self, idx, checkpoint_name): - expected_checkpoint_name = self.un_offload_checkpoint_names.pop(0) - assert ( - checkpoint_name == expected_checkpoint_name - ), "expected to offload [{}] but got [{}]".format( - expected_checkpoint_name, checkpoint_name - ) - logging.debug("Record offload [{}]".format(checkpoint_name)) - self.idx2insertions[idx] = ("offload", checkpoint_name) - - def _record_sync_op(self, idx, checkpoint_name): - assert ( - checkpoint_name not in self.synced_checkpoints - ), "Try to sync the checkpoint [{}] twice".format(checkpoint_name) - self.synced_checkpoints.add(checkpoint_name) - logging.debug("Record offload sync [{}]".format(checkpoint_name)) - self.idx2insertions[idx] = ("sync", checkpoint_name) - - def _parse_backward(self): - self.idx2insertions = {} - # don't offload the last checkpoints, to favor throughput - self.un_fetch_checkpoint_names = self.sorted_checkpoint_names[:] - self.un_fetch_checkpoint_names.pop(-1) - need_fetch_checkpoint_names = self.un_fetch_checkpoint_names[:] - self.checkpoint_usage_count = {} - for checkpoint_name in self.un_fetch_checkpoint_names: - self.checkpoint_usage_count[checkpoint_name] = 0 - - 
self.bw_strart_op_idx = len(self.block.ops) - for idx, op in enumerate(self.block.ops): - if int(op.desc.attr("op_role")) == 1: - self.bw_strart_op_idx = idx - break - - assert self.bw_strart_op_idx < len( - self.block.ops - ), "Could NOT found backword op in prog" - - # fetch second to last checkpoint at the beginning of BW - fetched_checkpoint_varname = self._record_fetch_op( - self.bw_strart_op_idx - ) - last_last_fetch_checkpoint = None - - for i, op in enumerate(self.block.ops[self.bw_strart_op_idx :]): - idx = self.bw_strart_op_idx + i - input_vars = op.desc.input_arg_names() - - for input_var in input_vars: - if input_var in need_fetch_checkpoint_names: - if input_var not in self.un_fetch_checkpoint_names: - # fetch the offloade checkpoint when the first usage of its previous one - if self.checkpoint_usage_count[input_var] == 0: - # TODO (JZ-LIANG) sync memcpy_stream if extra stream for memcpy - second_to_last_fetch_checkpoint = ( - fetched_checkpoint_varname - ) - # there is NO fetch ahead the first checkpoint - if input_var != self.sorted_checkpoint_names[0]: - fetched_checkpoint_varname = ( - self._record_fetch_op(idx) - ) - - # should check the current used checkpoint is ths last fetch one - assert ( - second_to_last_fetch_checkpoint == input_var - ), "Current recompute segment should use [{}] BUT got [{}]".format( - second_to_last_fetch_checkpoint, input_var - ) - # rename - self.block.ops[idx]._rename_input( - input_var, - self.checkpoint_name2fetch_name[input_var], - ) - self.checkpoint_usage_count[input_var] += 1 - else: - raise ValueError( - "use checkpoint [{}] before fetch in BW".format( - input_var - ) - ) - - assert ( - len(self.un_fetch_checkpoint_names) == 0 - ), "{} checkpoints have NOT been Recorded".format( - self.un_fetch_checkpoint_names - ) - - def _update_backward(self): - if len(self.idx2insertions) == 0: - return - total_op = len(self.block.ops) - for op_idx in reversed(range(self.bw_strart_op_idx, total_op)): - if op_idx in self.idx2insertions: - operation, checkpoint_name = self.idx2insertions[op_idx] - if operation == "fetch": - self._insert_fetch_op(op_idx, checkpoint_name) - logging.debug( - "Insert [{}] fetch op.".format(checkpoint_name) - ) - del self.idx2insertions[op_idx] - elif operation == "sync": - self._insert_sync_op(op_idx, checkpoint_name) - logging.debug("Sync [{}] fetch op.".format(checkpoint_name)) - self.block._sync_with_cpp() - assert ( - len(self.idx2insertions) == 0 - ), "{} checkpoints left un-Fecthed".format( - [ele[1] for ele in self.idx2insertions.values()] - ) - - def _parse_forward(self): - self.idx2insertions = {} - # don't offload the last checkpoints, faster, less memory saving - self.un_offload_checkpoint_names = self.sorted_checkpoint_names[:] - last_checkpoint = self.un_offload_checkpoint_names.pop(-1) - need_offload_checkpoint_names = self.un_offload_checkpoint_names[:] - self.checkpoint_usage_count_and_idx = {} - for checkpoint_name in self.un_offload_checkpoint_names: - self.checkpoint_usage_count_and_idx[checkpoint_name] = { - 'count': 0, - 'idx': -1, - } - self.synced_checkpoints = set() - self.fw_strart_op_idx = len(self.block.ops) - for idx, op in enumerate(self.block.ops): - if int(op.desc.attr("op_role")) == 0: - self.fw_strart_op_idx = idx - break - - assert self.fw_strart_op_idx < len( - self.block.ops - ), "Could NOT found Forward op in prog" - last_offload_checkpoint = None - - for i, op in enumerate( - self.block.ops[self.fw_strart_op_idx : self.bw_strart_op_idx] - ): - idx = self.fw_strart_op_idx + i - 
output_vars = op.desc.output_arg_names() - input_vars = op.desc.input_arg_names() - - for output_var in output_vars: - if output_var in need_offload_checkpoint_names: - assert ( - len(output_vars) == 1 - ), "chekpoint should be the only Output of a certain op, but [{}] is from [{}]".format( - output_var, op - ) - - if output_var in self.un_offload_checkpoint_names: - # insert sync op if last checkpoint has not been sync - if last_offload_checkpoint is not None: - if ( - self.checkpoint_usage_count_and_idx[ - last_offload_checkpoint - ]['count'] - == 0 - ): - self._record_sync_op( - idx, last_offload_checkpoint - ) - else: - last_usage_idx = ( - self.checkpoint_usage_count_and_idx[ - last_offload_checkpoint - ]['idx'] - ) - assert ( - last_usage_idx > 0 - ), "last_usage_idx of checkpoint [{}] should large than 0".format( - last_offload_checkpoint - ) - self._record_sync_op( - last_usage_idx + 1, last_offload_checkpoint - ) - # insert offload op after the checkpoint's generation op - self._record_offload_op(idx + 1, output_var) - last_offload_checkpoint = output_var - else: - raise ValueError( - "There should be just ONE op that output checkpoint [{}]".format( - output_var - ) - ) - # need to sync the last need to offload checkpoint before the last checkpoint as output op - if output_var == last_checkpoint: - assert ( - len(output_vars) == 1 - ), "chekpoint should be the only Output of a certain op, but [{}] is from [{}]".format( - output_var, op - ) - assert ( - last_offload_checkpoint - == self.sorted_checkpoint_names[-2] - ), "the last offload chekpoint before [{}] is suppose to be [{}], but got [{}]".format( - last_checkpoint, - self.sorted_checkpoint_names[-2], - last_offload_checkpoint, - ) - # sync if last checkpoint has not been sync - if ( - self.checkpoint_usage_count_and_idx[ - last_offload_checkpoint - ]['idx'] - == 0 - ): - self._record_sync_op(idx, last_offload_checkpoint) - else: - last_usage_idx = self.checkpoint_usage_count_and_idx[ - last_offload_checkpoint - ]['idx'] - assert ( - last_usage_idx > 0 - ), "last_usage_idx of checkpoint [{}] should large than 0".format( - last_offload_checkpoint - ) - self._record_sync_op( - last_usage_idx + 1, last_offload_checkpoint - ) - # record checkpoint usage - for input_var in input_vars: - if input_var in need_offload_checkpoint_names: - assert ( - input_var not in self.synced_checkpoints - ), "checkpoint [{}] used after sync".format(input_var) - self.checkpoint_usage_count_and_idx[input_var]['count'] += 1 - self.checkpoint_usage_count_and_idx[input_var]['idx'] = idx - - assert ( - len(self.un_offload_checkpoint_names) == 0 - ), "{} checkpoints have NOT been Recorded".format( - self.un_fetch_checkpoint_names - ) - assert len(self.synced_checkpoints) == len( - need_offload_checkpoint_names - ), "{} checkpoints have NOT been Recorded".format( - set(need_offload_checkpoint_names) - set(self.synced_checkpoints) - ) - - def _update_forward(self): - if len(self.idx2insertions) == 0: - return - for op_idx in reversed( - range(self.fw_strart_op_idx, self.bw_strart_op_idx) - ): - if op_idx in self.idx2insertions: - operation, checkpoint_name = self.idx2insertions[op_idx] - if operation == "offload": - self._insert_offload_op(op_idx, checkpoint_name) - logging.debug( - "Insert [{}] offload op.".format(checkpoint_name) - ) - del self.idx2insertions[op_idx] - elif operation == "sync": - self._insert_sync_op(op_idx, checkpoint_name) - logging.debug( - "Insert [{}] offload_sync op.".format(checkpoint_name) - ) - del self.idx2insertions[op_idx] - - 
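# --- Editor's aside: illustrative sketch only, not part of this diff ----------
# The forward-pass parsing above mirrors the backward one: each offloadable
# checkpoint gets a device-to-pinned copy right after the op that produces it,
# and the previous offload is synchronized either at the next checkpoint's
# producer (if it was never read again in forward) or just after its last
# forward read. The toy below emits that event order for a forward trace of
# (op_index, produced_vars, consumed_vars) tuples; the real pass also syncs
# the final offloaded checkpoint before the last checkpoint's producer, which
# is omitted here.
def plan_forward_offloads(trace, offloadable):
    """Return (event, checkpoint, insert_index) tuples in recording order."""
    events, last_offloaded, last_read_idx = [], None, {}
    for idx, produced, consumed in trace:
        for var in consumed:
            if var in offloadable:
                last_read_idx[var] = idx
        for var in produced:
            if var not in offloadable:
                continue
            if last_offloaded is not None:
                # unread since its offload -> sync here; otherwise after its last read
                sync_at = last_read_idx.get(last_offloaded, idx - 1) + 1
                events.append(("sync", last_offloaded, sync_at))
            events.append(("offload", var, idx + 1))   # right after the producer op
            last_offloaded = var
    return events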
self.block._sync_with_cpp() - assert ( - len(self.idx2insertions) == 0 - ), "{} checkpoints left un-Offloaded".format( - [ele[1] for ele in self.idx2insertions.values()] - ) - - def _check_offload_fetch(self): - # TODO(JZ-LIANG) the single stream offload need no sync - pass - - def _offload(self, loss, startup_program=None): - """ - core steps for recompute offload - 1. create pinned vars and temp vars - 2. parse & update Forward pass: offload, sync - 3. parse & update Backward pass: rename, fetch, sync - 4. verify the correctness - """ - self._main_program = loss.block.program - self.block = loss.block - if startup_program is None: - startup_program = paddle.static.default_startup_program() - - with program_guard(self._main_program, startup_program): - assert ( - len(self.checkpoint_shape) > 0 - ), "checkpoints shape {} should be an non empty list like: [12, 512, 1024]".format( - self.checkpoint_shape - ) - assert all( - [ele > 0 for ele in self.checkpoint_shape] - ), "all ele in checkpoints shape {} should be a determined integer larger than 0".format( - self.checkpoint_shape - ) - self.checkpoint_name2pinned_name = dict() - self.checkpoint_name2fetch_name = dict() - for checkpoint_varname in self.sorted_checkpoint_names: - pinned_var_name, fetch_var_name = self._creat_vars( - checkpoint_varname - ) - self.checkpoint_name2pinned_name[ - checkpoint_varname - ] = pinned_var_name - self.checkpoint_name2fetch_name[ - checkpoint_varname - ] = fetch_var_name - self._append_fill_constant_ops(startup_program) - # TODO (JZ-LIANG) to provide two offload stragtegy in future - # step 2. parse & update FW: rename, offload, sync - self._parse_backward() - self._update_backward() - # step 3. parse & update BW: rename, offload, sync - self._parse_forward() - self._update_forward() - # step 4. verify the correctness - self._check_offload_fetch() - - return - - def backward( - self, - loss, - startup_program=None, - parameter_list=None, - no_grad_set=None, - callbacks=None, - ): - """ - call append_backward with checkpoints. - - Args: - loss (Variable): loss variable to run optimizations. - startup_program (Program): startup_program for initializing parameters - in `parameter_list`. - parameter_list (list): list of Variables or Variable.names to update. - no_grad_set (set|None): set of Variables or Variables.names should be ignored. - callbacks (list|None): list of callables to run when appending backward - operator for one parameter. - checkpoints (list): list of Variables as checkpoints - - Examples: - .. 
code-block:: python - - import paddle - import paddle.fluid as fluid - - paddle.enable_static() - - def mlp(input_x, input_y, hid_dim=128, label_dim=2): - fc_1 = paddle.static.nn.fc(x=input_x, size=hid_dim) - prediction = paddle.static.nn.fc(x=[fc_1], size=label_dim, activation='softmax') - cost = paddle.nn.functional.cross_entropy( - input=prediction, label=input_y, - reduction='none', use_softmax=False - ) - sum_cost = paddle.mean(cost) - return sum_cost, fc_1, prediction - - - input_x = paddle.static.data(name="x", shape=[-1,32], dtype='float32') - input_y = paddle.static.data(name="y", shape=[-1,1], dtype='int64') - cost, fc_1, pred = mlp(input_x, input_y) - print("Finished FF") - - sgd = fluid.optimizer.Adam(learning_rate=0.01) - sgd = fluid.optimizer.RecomputeOptimizer(sgd) - sgd._set_checkpoints([fc_1, pred]) - params_grads = sgd.backward( - cost, - startup_program=None, - parameter_list=None, - no_grad_set=None) - print("Finished backward") - """ - assert ( - self._checkpoints is not None - ), "You should call _set_checkpoints first" - - if in_dygraph_mode(): - raise NotImplementedError( - "DyGraph current does not support recompute" - ) - - self._dtype = loss.dtype - program = loss.block.program - with program_guard(program, startup_program): - checkpoint_vars = [] - for ckpt in self._checkpoints: - if isinstance(ckpt, Variable): - checkpoint_vars.append(ckpt) - else: - checkpoint_vars.append(loss.block.var(ckpt)) - - # allow return to non-recompute when checkpoints is empty - if len(checkpoint_vars) > 0: - params_grads, sorted_checkpoint_names = append_backward( - loss, - parameter_list, - no_grad_set, - checkpoints=checkpoint_vars, - ) - else: - params_grads = append_backward( - loss, - parameter_list, - no_grad_set, - checkpoints=checkpoint_vars, - ) - - if self.enable_offload: - self.sorted_checkpoint_names = sorted_checkpoint_names - self._offload(loss, startup_program=startup_program) - - return params_grads - - def apply_optimize(self, loss, startup_program, params_grads): - """ - call the apply_optimize function of self._optimizer - Args: - loss (Variable): loss variable to run optimizations. - startup_program (Program): startup_program for initializing parameters - in `parameter_list`. - params_grads (list): list of (param, grad) pair to do optimization. - Examples: - .. 
code-block:: python - import paddle - import paddle.fluid as fluid - - paddle.enable_static() - - def mlp(input_x, input_y, hid_dim=128, label_dim=2): - fc_1 = paddle.static.nn.fc(x=input_x, size=hid_dim) - prediction = paddle.static.nn.fc(x=[fc_1], size=label_dim, activation='softmax') - cost = paddle.nn.functional.cross_entropy( - input=prediction, label=input_y, - reduction='none', use_softmax=False - ) - sum_cost = paddle.mean(cost) - return sum_cost, fc_1, prediction - - input_x = paddle.static.data(name="x", shape=[-1,32], dtype='float32') - input_y = paddle.static.data(name="y", shape=[-1,1], dtype='int64') - cost, fc_1, pred = mlp(input_x, input_y) - print("Finished FF") - - sgd = fluid.optimizer.Adam(learning_rate=0.01) - sgd = fluid.optimizer.RecomputeOptimizer(sgd) - sgd._set_checkpoints([fc_1, pred]) - params_grads = sgd.backward( - cost, - startup_program=None, - parameter_list=None, - no_grad_set=None) - - optimize_ops = sgd.apply_optimize( - cost, startup_program=None, params_grads=params_grads) - - print("Finished apply_optimize") - """ - - func = ( - self._optimizer.apply_optimize - if hasattr(self._optimizer, 'apply_optimize') - else self._optimizer._apply_optimize - ) - return func( - loss, startup_program=startup_program, params_grads=params_grads - ) - - def minimize( - self, loss, startup_program=None, parameter_list=None, no_grad_set=None - ): - assert isinstance(loss, Variable), "The loss should be an Variable." - assert ( - self._checkpoints is not None - ), "You should call _set_checkpoints first" - if in_dygraph_mode(): - raise NotImplementedError( - "DyGraph current does not support recompute" - ) - params_grads = self.backward( - loss, - startup_program=startup_program, - parameter_list=parameter_list, - no_grad_set=no_grad_set, - ) - - optimize_ops = self.apply_optimize( - loss, startup_program=startup_program, params_grads=params_grads - ) - - return optimize_ops, params_grads - - -class LookaheadOptimizer: - r""" - :api_attr: Static Graph - - This implements the Lookahead optimizer of the - paper : https://arxiv.org/abs/1907.08610. - - Lookahead keeps two sets of params: the fast_params and - the slow_params. inner_optimizer update fast_params every - training step. Lookahead updates the slow_params and fast_params - every k training steps as follows: - - .. math:: - - slow\_param_t &= slow\_param_{t-1} + \\alpha * (fast\_param_{t-1} - slow\_param_{t-1}) - - fast\_param_t &= slow\_param_t - - Args: - inner_optimizer (Optimizer): The optimizer that update fast params step by step. - alpha (float): The learning rate of Lookahead. - k (int): The slow params is updated every k steps. - - Examples: - .. 
code-block:: python - - import paddle - import paddle.fluid as fluid - import numpy as np - import numpy.random as random - - paddle.enable_static() - - x = paddle.static.data(name='x', shape=[-1,2], dtype='float32') - label = paddle.static.data(name="label", shape=[-1,1], dtype="int64") - y = paddle.static.nn.fc(x=[x], size=2, activation="softmax") - loss = paddle.nn.functional.cross_entropy( - input=y, label=label, - reduction='none', use_softmax=False - ) - loss = paddle.mean(x=loss) - sgd = fluid.optimizer.SGD(learning_rate=0.01) - optimizer = fluid.optimizer.LookaheadOptimizer(sgd, - alpha=0.5, - k=5) - optimizer.minimize(loss) - main_program = fluid.default_main_program() - place = fluid.CPUPlace() - exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) - - def train_reader(limit=5): - for i in range(limit): - yield random.random([2]).astype('float32'), random.random([1]).astype('int64') - - feeder = fluid.DataFeeder(feed_list=[x, label], place=place) - reader = paddle.batch(paddle.reader.shuffle(train_reader, buf_size=50000),batch_size=1) - - for batch_data in reader(): - exe.run(fluid.default_main_program(), - feed=feeder.feed(batch_data)) - - """ - - def __init__(self, inner_optimizer, alpha=0.5, k=5): - if in_dygraph_mode(): - raise Exception("In dygraph, don't support LookaheadOptimizer.") - assert inner_optimizer is not None, "inner optimizer can not be None" - assert ( - 0.0 <= alpha <= 1.0 - ), "alpha should be larger or equal to 0.0, and less or equal than 1.0" - assert isinstance(k, int) and k > 0, "k should be a positive integer" - - self.inner_optimizer = inner_optimizer - self.alpha = alpha - self.k = k - self.type = "lookahead" - - def minimize(self, loss, startup_program=None): - # Apply inner optimizer to the main_program - mini_out = self.inner_optimizer.minimize( - loss, startup_program=startup_program - ) - - # Get startup_program and main_program - if startup_program is None: - startup_program = default_startup_program() - main_block = loss.block - - # add some vars to the main_program - params = [param.name for param in main_block.all_parameters()] - param_to_slow = {} - for param in params: - fast_var = main_block.var(param) - assert fast_var is not None - slow_var = main_block.create_var( - name=param + "@SLOW", - shape=fast_var.shape, - dtype=fast_var.dtype, - persistable=True, - ) - param_to_slow[param] = slow_var - - # add some vars to the startup_program - startup_block = startup_program.global_block() - for param in params: - fast_var = startup_block.var(param) - assert fast_var is not None - slow_var = startup_block.create_var( - name=param + "@SLOW", - shape=fast_var.shape, - dtype=fast_var.dtype, - persistable=True, - ) - - startup_block.append_op( - type="assign", inputs={"X": fast_var}, outputs={"Out": slow_var} - ) - - with framework.program_guard(main_block.program, startup_program): - # Add Var k to main prog and startup prog - k = paddle.static.create_global_var( - name="lookahead_k", - shape=[1], - value=int(self.k), - dtype='int32', - persistable=True, - ) - - # Add Var alpha to main prog and startup prog - alpha = paddle.static.create_global_var( - name="lookahead_alpha", - shape=[1], - value=float(self.alpha), - dtype='float32', - persistable=True, - ) - - # Add Var step - step = paddle.static.create_global_var( - name="lookahead_step", - shape=[1], - value=int(0), - dtype='int32', - persistable=True, - ) - paddle.increment(x=step, value=1.0) - - # lookahead - zero_var = paddle.tensor.fill_constant( - shape=[1], 
dtype='float32', value=0.0 - ) - - one_var = paddle.tensor.fill_constant( - shape=[1], dtype='float32', value=1.0 - ) - - mod = paddle.remainder(step, k) - for param_name in params: - fast_var = main_block.var(param_name) - slow_var = param_to_slow[param_name] - tmp_var = paddle.add( - paddle.multiply(fast_var, alpha), - paddle.multiply(slow_var, paddle.subtract(one_var, alpha)), - ) - slow_val = paddle.static.nn.case( - [ - (step == one_var, lambda: fast_var), - (mod == zero_var, lambda: tmp_var), - ], - default=lambda: slow_var, - ) - paddle.assign(slow_val, slow_var) - - fast_val = paddle.static.nn.case( - [ - (mod == zero_var, lambda: tmp_var), - ], - default=lambda: fast_var, - ) - paddle.assign(fast_val, fast_var) - - return mini_out - - -class GradientMergeOptimizer: - """ - Gradient Merge, also called as Gradient Accumulation, - is a training strategy for larger batches. With this strategy, - the parameter will not be updated until specific steps. - - For each step, the forward network and the backward network - will run to calculate the gradient of the parameters. - - For every k step, the optimization network will run, - applying a specific optimization method (such as SGD, Adam) - to the parameters. - - Args: - inner_optimizer (Optimizer): The specific optimization (such as SGD, Adam) - which update the parameters - k_steps (int): the update period of the parameters - avg (bool): whether to average the gradients of each mini-batch, - the default value is `True` - - Examples: - .. code-block:: python - - import paddle - import paddle.fluid as fluid - import numpy as np - - def gen_data(batch_size): - return {"x": np.random.random(size=(batch_size, 32)).astype('float32'), - "y": np.random.random(size=(batch_size, 1)).astype('int64')} - - def mlp(input_x, input_y, hid_dim=128, label_dim=2): - fc_1 = paddle.static.nn.fc(x=input_x, size=hid_dim) - prediction = paddle.static.nn.fc(x=[fc_1], size=label_dim, activation='softmax') - cost = paddle.nn.functional.cross_entropy( - input=prediction, label=input_y, - reduction='none', use_softmax=False - ) - sum_cost = paddle.mean(cost) - return sum_cost, fc_1, prediction - - input_x = paddle.static.data(name="x", shape=[-1,32], dtype='float32') - input_y = paddle.static.data(name="y", shape=[-1,1], dtype='int64') - cost, fc_1, pred = mlp(input_x, input_y) - sgd = fluid.optimizer.Adam(learning_rate=0.01) - sgd = fluid.optimizer.GradientMergeOptimizer(sgd, k_steps=4, avg=True) - sgd.minimize(cost) - - place = fluid.CPUPlace() - exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) - - for i in range(10): - cost_val = exe.run(feed=gen_data(32), - program=fluid.default_main_program(), - fetch_list=[cost.name]) - print("step=%d, cost=%f" % (i, cost_val[0])) - """ - - GRAD_MERGE_COND_NAME = "grad_merge_cond_name" - - def __init__(self, inner_optimizer, k_steps=1, avg=True): - if in_dygraph_mode(): - raise Exception( - "In dygraph, we don't support GradientMergeOptimizer." 
- "You can do Gradient merge by yourself with k-times forward + backward, " - "and one-time optimizer.minimize()" - ) - - assert inner_optimizer is not None, "inner optimizer can not be None" - assert ( - isinstance(k_steps, int) and k_steps > 0 - ), "k_steps should be a positive integer" - - self.inner_optimizer = inner_optimizer - self.k_steps = k_steps - self.type = "gradient_merge" - self.avg = avg - self._optimize_ops = None - - def _set_k_steps(self, k_steps): - self.k_steps = k_steps - - def _set_avg(self, avg): - self.avg = avg - - def backward( - self, - loss, - startup_program=None, - parameter_list=None, - no_grad_set=None, - callbacks=None, - ): - assert isinstance(loss, Variable), "The loss should be an Variable." - assert ( - parameter_list is None - ), "The parameter_list should be None when using GradientMergeOptimizer" - assert ( - no_grad_set is None - ), "The no_grad_set should be None when using GradientMergeOptimizer" - - params_grads = self.inner_optimizer.backward( - loss, startup_program=startup_program - ) - return params_grads - - def apply_optimize(self, loss, startup_program, params_grads): - program = loss.block.program - with program_guard(program, startup_program): - optimize_ops = self.apply_gradients(params_grads) - return optimize_ops - - def _is_the_backward_op(self, op): - op_maker = core.op_proto_and_checker_maker - backward = core.op_proto_and_checker_maker.OpRole.Backward - if op_maker.kOpRoleVarAttrName() in op.attr_names and int( - op.all_attrs()[op_maker.kOpRoleAttrName()] - ) == int(backward): - return True - return False - - def _remove_op_role_var(self, param, grad): - op_maker = core.op_proto_and_checker_maker - op = grad.op - assert self._is_the_backward_op( - op - ), 'grad.op={} is not the backward op which produces the grad={}'.format( - op, grad.name - ) - - block = grad.block - var_attr = op.all_attrs()[op_maker.kOpRoleVarAttrName()] - assert ( - param.name in var_attr - ), 'when using GradientMergeOptimizer, param={} must be in var_attr={}'.format( - param.name, var_attr - ) - assert ( - grad.name in var_attr - ), 'when using GradientMergeOptimizer, grad={} must be in var_attr={}'.format( - param.name, var_attr - ) - - # remove (param, grad) from op_role_var - var_attr.remove(param.name) - var_attr.remove(grad.name) - if len(var_attr) > 1: - op._set_attr(op_maker.kOpRoleVarAttrName(), var_attr) - else: - op._remove_attr(op_maker.kOpRoleVarAttrName()) - - def _add_gm_op_role_var(self, op, param, grad, cond): - grad.op = op - op_maker = core.op_proto_and_checker_maker - backward = op_maker.OpRole.Backward - - # NOTE(wangxi). When distributed, we will insert grad_merge_all_reduce_op_handle - # in multi_devices_graph_pass, which will allreduce(grad) if cond is True, else - # do nothing. - # In this way, the gradient can be merged first, and then communicate when the - # condition is met, reducing the number of communications to increase the - # speed. 
- op._set_attr(self.GRAD_MERGE_COND_NAME, cond.name) - op._set_attr(op_maker.kOpRoleAttrName(), backward) - op._set_attr(op_maker.kOpRoleVarAttrName(), [param.name, grad.name]) - - def _get_gm_cond_var(self, main_block): - # Add const var - k_step_var = paddle.static.create_global_var( - name="gradient_merge_k", - shape=[1], - value=int(self.k_steps), - dtype='int32', - persistable=True, - force_cpu=True, - ) - - zero_var = paddle.static.create_global_var( - name="gradient_merge_zero", - shape=[1], - value=int(0), - dtype='int32', - persistable=True, - force_cpu=True, - ) - - # Add step var & cond var - step_var = paddle.static.create_global_var( - name="gradient_merge_step", - shape=[1], - value=int(0), - dtype='int32', - persistable=True, - force_cpu=True, - ) - - cond_var = main_block.create_var( - name="gradient_merge_cond", shape=[1], dtype='bool' - ) - - with device_guard("cpu"): - # step_var = (step_var + 1) % k_step - paddle.increment(x=step_var, value=1.0) - main_block.append_op( - type='elementwise_mod', - inputs={'X': step_var, 'Y': k_step_var}, - outputs={'Out': step_var}, - attrs={'axis': -1, 'use_mkldnn': False}, - ) - - # cond_var = (step_var == 0) - main_block.append_op( - type='equal', - inputs={'X': step_var, 'Y': zero_var}, - outputs={'Out': cond_var}, - ) - - return cond_var - - def apply_gradients(self, params_grads): - main_program = default_main_program() - startup_program = default_startup_program() - main_block = main_program.global_block() - startup_block = startup_program.global_block() - - cond = self._get_gm_cond_var(main_block) - - # TODO(mapingshuo) support sparse embedding - # step1: remove grad.op's op_role_var - for param, grad in params_grads: - assert ( - param.type != core.VarDesc.VarType.SELECTED_ROWS - ), "SELECTED_ROWS is not supported in GradientMergeOptimizer for now" - - self._remove_op_role_var(param, grad) - - param_to_grad = {k.name: v for (k, v) in params_grads} - param_names = param_to_grad.keys() - param_to_gradient_merge = {} - - new_params_grads = [] - # step2: create gradient_merge var and init with 0 - # and update op_role_var - for param, grad in params_grads: - param_name = param.name - param_var = main_block.var(param_name) - assert param_var is not None - gradient_merge_var = main_block.create_var( - name=param_name + "@GRAD@GradientMerge", - shape=param_var.shape, - dtype=param_var.dtype, - persistable=True, - ) - param_to_gradient_merge[param_name] = gradient_merge_var - - startup_gradient_merge_var = startup_block.create_var( - name=param_name + "@GRAD@GradientMerge", - shape=param_var.shape, - dtype=param_var.dtype, - persistable=True, - ) - startup_block.append_op( - type="fill_constant", - outputs={"Out": startup_gradient_merge_var}, - attrs={ - "shape": param_var.shape, - "dtype": param_var.dtype, - "value": float(0), - }, - ) - - # grad_merge += grad - new_grad_op = main_block.append_op( - type="elementwise_add", - inputs={'X': grad, 'Y': gradient_merge_var}, - outputs={'Out': gradient_merge_var}, - attrs={'axis': -1, 'use_mkldnn': False}, - ) - self._add_gm_op_role_var( - new_grad_op, param, gradient_merge_var, cond - ) - new_params_grads.append([param, gradient_merge_var]) - - def true_apply_gradient(): - cur_block_idx = main_program.current_block_idx - cur_block = main_program.current_block() - - # cur_block's forward_block & backward_block is itself - cur_block._set_forward_block_idx(cur_block_idx) - op_maker = core.op_proto_and_checker_maker - - if self.avg: - for param, new_grad in new_params_grads: - # grad /= k_steps 
- cur_block.append_op( - type='scale', - inputs={'X': new_grad}, - outputs={'Out': new_grad}, - attrs={ - 'scale': 1.0 / self.k_steps, - 'bias': 0.0, - 'bias_after_scale': False, - }, - ) - new_grad.op._set_attr( - op_maker.kOpRoleAttrName(), op_maker.OpRole.Backward - ) - - for param, new_grad in new_params_grads: - # NOTE. regularization will append ops to grad.block, - # while new_grad's real block is global_block, - # but we want append regularization ops to cur_block, - # so we set new_grad.block = cur_block - new_grad.block = cur_block - - self._optimize_ops = self.inner_optimizer.apply_gradients( - new_params_grads - ) - - # clear gradient_merge_vars - for param, new_grad in new_params_grads: - paddle.tensor.fill_constant( - shape=new_grad.shape, - dtype=new_grad.dtype, - value=0.0, - out=new_grad, - ) - new_grad.op._set_attr( - op_maker.kOpRoleAttrName(), op_maker.OpRole.Optimize - ) - - # step3. apply gradient - paddle.static.nn.cond(cond, true_fn=true_apply_gradient, false_fn=None) - - return self._optimize_ops - - def minimize( - self, loss, startup_program=None, parameter_list=None, no_grad_set=None - ): - assert isinstance(loss, Variable), "The loss should be an Variable." - - params_grads = self.backward( - loss, - startup_program=startup_program, - parameter_list=parameter_list, - no_grad_set=no_grad_set, - ) - - optimize_ops = self.apply_optimize( - loss, startup_program=startup_program, params_grads=params_grads - ) - - return optimize_ops, params_grads diff --git a/python/paddle/incubate/distributed/fleet/base.py b/python/paddle/incubate/distributed/fleet/base.py index c87f8a0cdea9fb663496b4883de5bf754c6c6bb8..ad00ebdb95e2b3d6c8b6a573ee3da206c94625c9 100644 --- a/python/paddle/incubate/distributed/fleet/base.py +++ b/python/paddle/incubate/distributed/fleet/base.py @@ -17,8 +17,7 @@ import abc from paddle import fluid from paddle.distributed.fleet.base.role_maker import RoleMakerBase from paddle.fluid.executor import Executor -from paddle.fluid.optimizer import SGD -from paddle.optimizer import SGD as SGD_v2 +from paddle.optimizer import SGD from paddle.static.amp.decorator import OptimizerWithMixedPrecision __all__ = [] @@ -293,8 +292,8 @@ class DistributedOptimizer(metaclass=abc.ABCMeta): def __init__(self, optimizer, strategy=None): if ( not isinstance(optimizer, SGD.__bases__) + and not isinstance(optimizer, fluid.optimizer.Optimizer) and not isinstance(optimizer, OptimizerWithMixedPrecision) - and not isinstance(optimizer, SGD_v2.__base__) ): raise TypeError("optimizer must be an instance of Optimizer") diff --git a/python/paddle/incubate/distributed/fleet/collective.py b/python/paddle/incubate/distributed/fleet/collective.py index 3a520b51899a8c8f2d95059de492cf3c3cfa2bc2..3c0d7831898841876c3bd90be9b27097b236f6a7 100644 --- a/python/paddle/incubate/distributed/fleet/collective.py +++ b/python/paddle/incubate/distributed/fleet/collective.py @@ -533,7 +533,7 @@ class CollectiveOptimizer(DistributedOptimizer): "forward_recompute", self._optimizer.__class__.__name__ ) - self._optimizer = fluid.optimizer.RecomputeOptimizer( + self._optimizer = paddle.incubate.optimizer.RecomputeOptimizer( self._optimizer ) self._optimizer._set_checkpoints(self._recompute_checkpoints) diff --git a/python/paddle/incubate/distributed/fleet/parameter_server/distribute_transpiler/__init__.py b/python/paddle/incubate/distributed/fleet/parameter_server/distribute_transpiler/__init__.py index 800950e78f21993d87ac546b7844c4db3a7da43c..4b2baca637c7c7cb24dceac4747358646e0de7d9 100644 --- 
a/python/paddle/incubate/distributed/fleet/parameter_server/distribute_transpiler/__init__.py +++ b/python/paddle/incubate/distributed/fleet/parameter_server/distribute_transpiler/__init__.py @@ -367,7 +367,7 @@ class FleetTranspiler(Fleet): TranspilerOptimizer: subclass of DistributedOptimizer. """ - if not isinstance(optimizer, Optimizer): + if not isinstance(optimizer, paddle.optimizer.Optimizer): raise ValueError("optimizer must be an instance of Optimizer") if not self._is_initialized: raise ValueError( diff --git a/python/paddle/incubate/optimizer/__init__.py b/python/paddle/incubate/optimizer/__init__.py index 549702ab6cf531fdcdc0b9d367f017a86e1cf4e8..4c22c410add055047ab69f9615ae0e8aef8367e7 100644 --- a/python/paddle/incubate/optimizer/__init__.py +++ b/python/paddle/incubate/optimizer/__init__.py @@ -14,6 +14,10 @@ from .lookahead import LookAhead # noqa: F401 from .modelaverage import ModelAverage # noqa: F401 +from .lars_momentum import LarsMomentumOptimizer # noqa: F401 +from .recompute import RecomputeOptimizer # noqa: F401 +from .pipeline import PipelineOptimizer # noqa: F401 +from .gradient_merge import GradientMergeOptimizer # noqa: F401 from .distributed_fused_lamb import DistributedFusedLamb # noqa: F401 from .lbfgs import LBFGS # noqa: F401 from . import functional # noqa: F401 diff --git a/python/paddle/incubate/optimizer/gradient_merge.py b/python/paddle/incubate/optimizer/gradient_merge.py new file mode 100644 index 0000000000000000000000000000000000000000..bd99b1aebb77978ed09c5616e7301169484882e6 --- /dev/null +++ b/python/paddle/incubate/optimizer/gradient_merge.py @@ -0,0 +1,383 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import paddle +from paddle.fluid import core +from paddle.fluid.framework import ( + Variable, + default_main_program, + default_startup_program, + device_guard, + in_dygraph_mode, + program_guard, +) + +__all__ = [] + + +class GradientMergeOptimizer: + """ + Gradient Merge, also called as Gradient Accumulation, + is a training strategy for larger batches. With this strategy, + the parameter will not be updated until specific steps. + + For each step, the forward network and the backward network + will run to calculate the gradient of the parameters. + + For every k step, the optimization network will run, + applying a specific optimization method (such as SGD, Adam) + to the parameters. + + Args: + inner_optimizer (Optimizer): The specific optimization (such as SGD, Adam) + which update the parameters + k_steps (int): the update period of the parameters + avg (bool): whether to average the gradients of each mini-batch, + the default value is `True` + + Examples: + .. 
code-block:: python + + import paddle + import paddle.fluid as fluid + import numpy as np + + def gen_data(batch_size): + return {"x": np.random.random(size=(batch_size, 32)).astype('float32'), + "y": np.random.random(size=(batch_size, 1)).astype('int64')} + + def mlp(input_x, input_y, hid_dim=128, label_dim=2): + fc_1 = paddle.static.nn.fc(x=input_x, size=hid_dim) + prediction = paddle.static.nn.fc(x=[fc_1], size=label_dim, activation='softmax') + cost = paddle.nn.functional.cross_entropy( + input=prediction, label=input_y, + reduction='none', use_softmax=False + ) + sum_cost = paddle.mean(cost) + return sum_cost, fc_1, prediction + + input_x = paddle.static.data(name="x", shape=[-1,32], dtype='float32') + input_y = paddle.static.data(name="y", shape=[-1,1], dtype='int64') + cost, fc_1, pred = mlp(input_x, input_y) + sgd = paddle.optimizer.Adam(learning_rate=0.01) + sgd = paddle.incubate.optimizer.GradientMergeOptimizer(sgd, k_steps=4, avg=True) + sgd.minimize(cost) + + place = fluid.CPUPlace() + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + + for i in range(10): + cost_val = exe.run(feed=gen_data(32), + program=fluid.default_main_program(), + fetch_list=[cost.name]) + print("step=%d, cost=%f" % (i, cost_val[0])) + """ + + GRAD_MERGE_COND_NAME = "grad_merge_cond_name" + + def __init__(self, inner_optimizer, k_steps=1, avg=True): + if in_dygraph_mode(): + raise Exception( + "In dygraph, we don't support GradientMergeOptimizer." + "You can do Gradient merge by yourself with k-times forward + backward, " + "and one-time optimizer.minimize()" + ) + + assert inner_optimizer is not None, "inner optimizer can not be None" + assert ( + isinstance(k_steps, int) and k_steps > 0 + ), "k_steps should be a positive integer" + + self.inner_optimizer = inner_optimizer + self.k_steps = k_steps + self.type = "gradient_merge" + self.avg = avg + self._optimize_ops = None + + def _set_k_steps(self, k_steps): + self.k_steps = k_steps + + def _set_avg(self, avg): + self.avg = avg + + def backward( + self, + loss, + startup_program=None, + parameter_list=None, + no_grad_set=None, + callbacks=None, + ): + assert isinstance(loss, Variable), "The loss should be an Variable." 
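+        # backward() is delegated to the inner optimizer; parameter_list and no_grad_set are not supported here and must be None (checked below).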
+ assert ( + parameter_list is None + ), "The parameter_list should be None when using GradientMergeOptimizer" + assert ( + no_grad_set is None + ), "The no_grad_set should be None when using GradientMergeOptimizer" + + params_grads = self.inner_optimizer.backward( + loss, startup_program=startup_program + ) + return params_grads + + def apply_optimize(self, loss, startup_program, params_grads): + program = loss.block.program + with program_guard(program, startup_program): + optimize_ops = self.apply_gradients(params_grads) + return optimize_ops + + def _is_the_backward_op(self, op): + op_maker = core.op_proto_and_checker_maker + backward = core.op_proto_and_checker_maker.OpRole.Backward + if op_maker.kOpRoleVarAttrName() in op.attr_names and int( + op.all_attrs()[op_maker.kOpRoleAttrName()] + ) == int(backward): + return True + return False + + def _remove_op_role_var(self, param, grad): + op_maker = core.op_proto_and_checker_maker + op = grad.op + assert self._is_the_backward_op( + op + ), 'grad.op={} is not the backward op which produces the grad={}'.format( + op, grad.name + ) + + block = grad.block + var_attr = op.all_attrs()[op_maker.kOpRoleVarAttrName()] + assert ( + param.name in var_attr + ), 'when using GradientMergeOptimizer, param={} must be in var_attr={}'.format( + param.name, var_attr + ) + assert ( + grad.name in var_attr + ), 'when using GradientMergeOptimizer, grad={} must be in var_attr={}'.format( + param.name, var_attr + ) + + # remove (param, grad) from op_role_var + var_attr.remove(param.name) + var_attr.remove(grad.name) + if len(var_attr) > 1: + op._set_attr(op_maker.kOpRoleVarAttrName(), var_attr) + else: + op._remove_attr(op_maker.kOpRoleVarAttrName()) + + def _add_gm_op_role_var(self, op, param, grad, cond): + grad.op = op + op_maker = core.op_proto_and_checker_maker + backward = op_maker.OpRole.Backward + + # NOTE(wangxi). When distributed, we will insert grad_merge_all_reduce_op_handle + # in multi_devices_graph_pass, which will allreduce(grad) if cond is True, else + # do nothing. + # In this way, the gradient can be merged first, and then communicate when the + # condition is met, reducing the number of communications to increase the + # speed. 
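+        # Tag the accumulation op: record the cond var name, mark the op role as Backward, and attach the (param, grad) pair via op_role_var.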
+ op._set_attr(self.GRAD_MERGE_COND_NAME, cond.name) + op._set_attr(op_maker.kOpRoleAttrName(), backward) + op._set_attr(op_maker.kOpRoleVarAttrName(), [param.name, grad.name]) + + def _get_gm_cond_var(self, main_block): + # Add const var + k_step_var = paddle.static.create_global_var( + name="gradient_merge_k", + shape=[1], + value=int(self.k_steps), + dtype='int32', + persistable=True, + force_cpu=True, + ) + + zero_var = paddle.static.create_global_var( + name="gradient_merge_zero", + shape=[1], + value=int(0), + dtype='int32', + persistable=True, + force_cpu=True, + ) + + # Add step var & cond var + step_var = paddle.static.create_global_var( + name="gradient_merge_step", + shape=[1], + value=int(0), + dtype='int32', + persistable=True, + force_cpu=True, + ) + + cond_var = main_block.create_var( + name="gradient_merge_cond", shape=[1], dtype='bool' + ) + + with device_guard("cpu"): + # step_var = (step_var + 1) % k_step + paddle.increment(x=step_var, value=1.0) + main_block.append_op( + type='elementwise_mod', + inputs={'X': step_var, 'Y': k_step_var}, + outputs={'Out': step_var}, + attrs={'axis': -1, 'use_mkldnn': False}, + ) + + # cond_var = (step_var == 0) + main_block.append_op( + type='equal', + inputs={'X': step_var, 'Y': zero_var}, + outputs={'Out': cond_var}, + ) + + return cond_var + + def apply_gradients(self, params_grads): + main_program = default_main_program() + startup_program = default_startup_program() + main_block = main_program.global_block() + startup_block = startup_program.global_block() + + cond = self._get_gm_cond_var(main_block) + + # TODO(mapingshuo) support sparse embedding + # step1: remove grad.op's op_role_var + for param, grad in params_grads: + assert ( + param.type != core.VarDesc.VarType.SELECTED_ROWS + ), "SELECTED_ROWS is not supported in GradientMergeOptimizer for now" + + self._remove_op_role_var(param, grad) + + param_to_grad = {k.name: v for (k, v) in params_grads} + param_names = param_to_grad.keys() + param_to_gradient_merge = {} + + new_params_grads = [] + # step2: create gradient_merge var and init with 0 + # and update op_role_var + for param, grad in params_grads: + param_name = param.name + param_var = main_block.var(param_name) + assert param_var is not None + gradient_merge_var = main_block.create_var( + name=param_name + "@GRAD@GradientMerge", + shape=param_var.shape, + dtype=param_var.dtype, + persistable=True, + ) + param_to_gradient_merge[param_name] = gradient_merge_var + + startup_gradient_merge_var = startup_block.create_var( + name=param_name + "@GRAD@GradientMerge", + shape=param_var.shape, + dtype=param_var.dtype, + persistable=True, + ) + startup_block.append_op( + type="fill_constant", + outputs={"Out": startup_gradient_merge_var}, + attrs={ + "shape": param_var.shape, + "dtype": param_var.dtype, + "value": float(0), + }, + ) + + # grad_merge += grad + new_grad_op = main_block.append_op( + type="elementwise_add", + inputs={'X': grad, 'Y': gradient_merge_var}, + outputs={'Out': gradient_merge_var}, + attrs={'axis': -1, 'use_mkldnn': False}, + ) + self._add_gm_op_role_var( + new_grad_op, param, gradient_merge_var, cond + ) + new_params_grads.append([param, gradient_merge_var]) + + def true_apply_gradient(): + cur_block_idx = main_program.current_block_idx + cur_block = main_program.current_block() + + # cur_block's forward_block & backward_block is itself + cur_block._set_forward_block_idx(cur_block_idx) + op_maker = core.op_proto_and_checker_maker + + if self.avg: + for param, new_grad in new_params_grads: + # grad /= k_steps 
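+                    # Averaging is implemented as an in-place 'scale' op (scale = 1.0 / k_steps) inside the conditional block, so it only runs on update steps.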
+ cur_block.append_op( + type='scale', + inputs={'X': new_grad}, + outputs={'Out': new_grad}, + attrs={ + 'scale': 1.0 / self.k_steps, + 'bias': 0.0, + 'bias_after_scale': False, + }, + ) + new_grad.op._set_attr( + op_maker.kOpRoleAttrName(), op_maker.OpRole.Backward + ) + + for param, new_grad in new_params_grads: + # NOTE. regularization will append ops to grad.block, + # while new_grad's real block is global_block, + # but we want append regularization ops to cur_block, + # so we set new_grad.block = cur_block + new_grad.block = cur_block + + self._optimize_ops = self.inner_optimizer.apply_gradients( + new_params_grads + ) + + # clear gradient_merge_vars + for param, new_grad in new_params_grads: + paddle.tensor.fill_constant( + shape=new_grad.shape, + dtype=new_grad.dtype, + value=0.0, + out=new_grad, + ) + new_grad.op._set_attr( + op_maker.kOpRoleAttrName(), op_maker.OpRole.Optimize + ) + + # step3. apply gradient + paddle.static.nn.cond(cond, true_fn=true_apply_gradient, false_fn=None) + + return self._optimize_ops + + def minimize( + self, loss, startup_program=None, parameter_list=None, no_grad_set=None + ): + assert isinstance(loss, Variable), "The loss should be an Variable." + + params_grads = self.backward( + loss, + startup_program=startup_program, + parameter_list=parameter_list, + no_grad_set=no_grad_set, + ) + + optimize_ops = self.apply_optimize( + loss, startup_program=startup_program, params_grads=params_grads + ) + + return optimize_ops, params_grads diff --git a/python/paddle/incubate/optimizer/lars_momentum.py b/python/paddle/incubate/optimizer/lars_momentum.py new file mode 100644 index 0000000000000000000000000000000000000000..57055b4a923cc09578f269a093314c584ec3c654 --- /dev/null +++ b/python/paddle/incubate/optimizer/lars_momentum.py @@ -0,0 +1,219 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import warnings + +from paddle import _legacy_C_ops +from paddle.fluid import framework +from paddle.fluid.framework import in_dygraph_mode +from paddle.optimizer import Optimizer + + +class LarsMomentumOptimizer(Optimizer): + r""" + Momentum optimizer with LARS support + + The update equations are as follows: + + .. math:: + + & local\_learning\_rate = learning\_rate * lars\_coeff * \\ + \\frac{||param||}{||gradient|| + lars\_weight\_decay * ||param||} + + & velocity = mu * velocity + local\_learning\_rate * (gradient + lars\_weight\_decay * param + epsilon) + + & param = param - velocity + + Parameters: + learning_rate (float|Variable): The learning rate used to update parameters. \ + Can be a float value or a Variable with one float value as data element. \ + momentum (float): momentum factor + lars_coeff (float): Defines how much we trust the layer to change its weights. + lars_weight_decay (float): Weight decay coefficient for decaying using LARS. + parameter_list (Iterable, optional): Iterable of ``Variable`` names to update to minimize ``loss``. \ + This parameter is required in dygraph mode. 
\ + The default value is None in static graph mode, at this time all parameters will be updated. + regularization (WeightDecayRegularizer, optional): The strategy of regularization. There are two method: \ + :ref:`api_fluid_regularizer_L1Decay` , :ref:`api_fluid_regularizer_L2Decay` . If a parameter has set \ + regularizer using :ref:`api_fluid_ParamAttr` already, the regularization setting here in optimizer will be \ + ignored for this parameter. Otherwise, the regularization setting here in optimizer will take effect. \ + Default None, meaning there is no regularization. + grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of + some derived class of ``GradientClipBase`` . There are three cliping strategies + ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , + :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping. + name (str, optional): This parameter is used by developers to print debugging information. \ + For details, please refer to :ref:`api_guide_Name`. Default is None. + exclude_from_weight_decay (list[str], optional): Name string of layers which will be exclude from lars weight decay. Default is None. + epsilon (float, optional): Epsilon to avoid Division by Zero when calculate local lr. Default is 0. + multi_precision (bool, optional): Whether to use multi-precision during weight updating. + rescale_grad (float, optional): Multiply the gradient with `rescale_grad` \ + before updating. Often choose to be `1.0/batch_size`. + + Examples: + .. code-block:: python + + import paddle + import paddle.fluid as fluid + import numpy as np + + paddle.enable_static() + np_inp = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32) + inp = paddle.static.data( + name="inp", shape=[2, 2], dtype='float32') + out = paddle.static.nn.fc(inp, size=3) + out = paddle.sum(out) + optimizer = fluid.optimizer.LarsMomentumOptimizer(learning_rate=0.001, momentum=0.9) + optimizer.minimize(out) + + exe = fluid.Executor(fluid.CPUPlace()) + exe.run(fluid.default_startup_program()) + exe.run( + feed={"inp": np_inp}, + fetch_list=[out.name]) + """ + _velocity_acc_str = "velocity" + + def __init__( + self, + learning_rate, + momentum, + lars_coeff=0.001, + lars_weight_decay=0.0005, + parameter_list=None, + regularization=None, + grad_clip=None, + name=None, + exclude_from_weight_decay=None, + epsilon=0, + multi_precision=False, + rescale_grad=1.0, + ): + assert learning_rate is not None + assert momentum is not None + super().__init__( + learning_rate=learning_rate, + parameters=parameter_list, + weight_decay=regularization, + grad_clip=grad_clip, + name=name, + ) + self.type = "lars_momentum" + self._momentum = momentum + self._lars_coeff = float(lars_coeff) + self._lars_weight_decay = float(lars_weight_decay) + self._epsilon = float(epsilon) + if exclude_from_weight_decay is None: + self._exclude_from_weight_decay = [] + else: + self._exclude_from_weight_decay = exclude_from_weight_decay + self._multi_precision = multi_precision + self._rescale_grad = float(rescale_grad) + self._master_weights = {} + + def _create_accumulators(self, block, parameters): + assert isinstance(block, framework.Block) + + for p in parameters: + if self._multi_precision and self._is_dtype_fp16_or_bf16(p.dtype): + master_p = self._create_master_weight(p) + self._add_accumulator(self._velocity_acc_str, master_p) + continue + if ( + self._is_dtype_fp16_or_bf16(p.dtype) + and not self._multi_precision + ): + warnings.warn( 
+ "Accumulating with FP16/BF16 in optimizer can lead to poor accuracy or slow convergence." + "Consider using multi_precision=True option of the Lars optimizer." + ) + self._add_accumulator(self._velocity_acc_str, p) + + def _append_optimize_op(self, block, param_and_grad): + assert isinstance(block, framework.Block) + _lars_weight_decay = self._lars_weight_decay + param_name = param_and_grad[0].name + if len(self._exclude_from_weight_decay) > 0: + for name in self._exclude_from_weight_decay: + if name in param_name: + _lars_weight_decay = 0.0 + break + + velocity_acc = self._get_accumulator_master( + self._velocity_acc_str, param_and_grad[0] + ) + lr = self._create_param_lr(param_and_grad) + + find_master = self._multi_precision and self._is_dtype_fp16_or_bf16( + param_and_grad[0].dtype + ) + master_weight = ( + self._master_weights[param_and_grad[0].name] + if find_master + else None + ) + + attrs = { + "mu": self._momentum, + "lars_coeff": self._lars_coeff, + "lars_weight_decay": [_lars_weight_decay], + "multi_precision": find_master, + "epsilon": self._epsilon, + "rescale_grad": self._rescale_grad, + } + + inputs = { + "Param": param_and_grad[0], + "Grad": param_and_grad[1], + "Velocity": velocity_acc, + "LearningRate": lr, + } + + outputs = {"ParamOut": param_and_grad[0], "VelocityOut": velocity_acc} + + if find_master: + inputs["MasterParam"] = master_weight + outputs["MasterParamOut"] = master_weight + + if in_dygraph_mode(): + tmp, tmp2 = _legacy_C_ops.lars_momentum( + [param_and_grad[0]], + [param_and_grad[1]], + [velocity_acc], + [lr], + [param_and_grad[0]], + [velocity_acc], + "mu", + self._momentum, + "lars_coeff", + self._lars_coeff, + "lars_weight_decay", + [_lars_weight_decay], + "multi_precision", + find_master, + "epsilon", + self._epsilon, + "rescale_grad", + self._rescale_grad, + ) + else: + # create the momentum optimize op + momentum_op = block.append_op( + type=self.type, + inputs=inputs, + outputs=outputs, + attrs=attrs, + stop_gradient=True, + ) + + return momentum_op diff --git a/python/paddle/incubate/optimizer/pipeline.py b/python/paddle/incubate/optimizer/pipeline.py new file mode 100644 index 0000000000000000000000000000000000000000..4e903c07c673b991ef5e42aab65746aab812f646 --- /dev/null +++ b/python/paddle/incubate/optimizer/pipeline.py @@ -0,0 +1,1979 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import warnings +from collections import defaultdict +from functools import cmp_to_key, reduce + +import numpy as np + +import paddle +from paddle.fluid import core, unique_name +from paddle.fluid.framework import ( + Parameter, + Program, + default_startup_program, + in_dygraph_mode, +) +from paddle.fluid.optimizer import Optimizer + +__all__ = [] + + +class PipelineOptimizer: + """ + :api_attr: Static Graph + + Pipeline Optimizer: Make a program to run as pipeline, that is splitting a + program into multiple sections (sub-programs) and each section run on a + device to enable the training of large scale models and the use of + heterogeneous devices. Meanwhile, all sections run in the stype of pipeline. + + Args: + optimizer (Optimizer): The optimizer to use, such as SGD. + num_microbatches (int): Number of microbatches. [Optional. Default:1]. + start_cpu_core_id (int): The first cpu core id to use. [Optional. Default:0]. + + Examples: + .. code-block:: python + + import paddle + import paddle.fluid as fluid + import paddle.fluid.layers as layers + import numpy as np + + paddle.enable_static() + with fluid.device_guard("gpu:0"): + x = paddle.static.data(name='x', shape=[-1, 1], dtype='int64', lod_level=0) + y = paddle.static.data(name='y', shape=[-1, 1], dtype='int64', lod_level=0) + data_loader = fluid.io.DataLoader.from_generator( + feed_list=[x, y], + capacity=64, + use_double_buffer=True, + iterable=False) + + emb_x = layers.embedding(input=x, param_attr=fluid.ParamAttr(name="embx"), size=[10,2], is_sparse=False) + emb_y = layers.embedding(input=y, param_attr=fluid.ParamAttr(name="emby",learning_rate=0.9), size=[10,2], is_sparse=False) + + with fluid.device_guard("gpu:1"): + concat = layers.concat([emb_x, emb_y], axis=1) + fc = paddle.static.nn.fc(x=concat, name="fc", size=1, num_flatten_dims=1, bias_attr=False) + loss = paddle.mean(fc) + optimizer = paddle.optimizer.SGD(learning_rate=0.5) + optimizer = paddle.incubate.optimizer.PipelineOptimizer(optimizer) + optimizer.minimize(loss) + + def train_reader(): + for _ in range(4): + x = np.random.random(size=[1]).astype('int64') + y = np.random.random(size=[1]).astype('int64') + yield x, y + data_loader.set_sample_generator(train_reader, batch_size=1) + + place = fluid.CUDAPlace(0) + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + batch_size = 1 + data_loader.start() + exe.train_from_dataset( + fluid.default_main_program()) + data_loader.reset() + """ + + def __init__(self, optimizer, num_microbatches=1, start_cpu_core_id=0): + self._device = 'cpu' + if core.is_compiled_with_cuda(): + self._device = "gpu" + if in_dygraph_mode(): + raise Exception("In dygraph, don't support PipelineOptimizer.") + valid_optimizers = ( + Optimizer, + paddle.optimizer.Optimizer, + paddle.static.amp.decorator.OptimizerWithMixedPrecision, + ) + if not isinstance(optimizer, valid_optimizers): + raise ValueError( + "The 'optimizer' parameter for " + "PipelineOptimizer must be an instance of " + "{}, but the given type is {}.".format( + valid_optimizers, type(optimizer) + ) + ) + self._optimizer = optimizer + + # Get the original optimizer defined by users, such as SGD + self._origin_optimizer = self._optimizer + while hasattr(self._origin_optimizer, "inner_opt"): + self._origin_optimizer = self._origin_optimizer.inner_opt + + assert ( + num_microbatches >= 1 + ), "num_microbatches must be a positive value." 
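+        # num_microbatches controls how many micro-batches each mini-batch is split into for the pipeline schedule.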
+ self._num_microbatches = num_microbatches + assert ( + start_cpu_core_id >= 0 + ), "start_cpu_core_id must be a non-negative integer." + self._start_cpu_core_id = start_cpu_core_id + self._place_list = None + op_maker = core.op_proto_and_checker_maker + self._op_role = op_maker.OpRole + self._op_role_key = op_maker.kOpRoleAttrName() + self._op_role_var_key = op_maker.kOpRoleVarAttrName() + self._op_device_key = op_maker.kOpDeviceAttrName() + self._param_device_map = None + self._pipeline_pair = [] + self._pp_ring_map = {} + self.output_var_to_op = None + self.input_var_to_op = None + + # insert allreduce op to sync global information for global + # gradient clip and amp + def _insert_allreduce_op(self, op_idx, block): + """ + Insert allreduce op to sync global information for global + gradient clip and amp. + """ + op = block.ops[op_idx] + out_name = op.desc.output_arg_names()[0] + out_var = block.var(out_name) + offset = 0 + if op.type == "reduce_any": + # cast the bool var to int32 to use allreduce_max op + temp_var_name = unique_name.generate(out_name + "_cast_int32") + temp_var = block.create_var( + name=temp_var_name, shape=[1], dtype="int32" + ) + block._insert_op( + op_idx + 1 + offset, + type='cast', + inputs={'X': out_var}, + outputs={'Out': temp_var}, + attrs={ + 'in_dtype': out_var.dtype, + 'out_dtype': temp_var.dtype, + self._op_role_key: self._op_role.Optimize, + }, + ) + offset += 1 + block._insert_op( + op_idx + 1 + offset, + type='c_allreduce_max' + if op.type == "reduce_any" + else 'c_allreduce_sum', + inputs={'X': temp_var if op.type == "reduce_any" else out_var}, + outputs={'Out': temp_var if op.type == "reduce_any" else out_var}, + attrs={ + 'ring_id': self.global_ring_id, + self._op_role_key: self._op_role.Optimize, + 'use_calc_stream': True, + }, + ) + offset += 1 + if op.type == "reduce_any": + block._insert_op( + op_idx + 1 + offset, + type='cast', + inputs={'X': temp_var}, + outputs={'Out': out_var}, + attrs={ + 'in_dtype': temp_var.dtype, + 'out_dtype': out_var.dtype, + self._op_role_key: self._op_role.Optimize, + }, + ) + offset += 1 + return offset + + def _create_vars(self, block, ori_block): + # Create vars for block, copied from ori_block + used_var_set = set() + added_op_num = 0 + op_idx = 0 + op_size = block.desc.op_size() + while op_idx < op_size + added_op_num: + # Whether to insert allreduce_sum or allreduce_max op. + # For amp and global gradient clip strategies, we should + # get the global information, so allreduce op is needed. 
+ should_insert = False + op = block.ops[op_idx] + # For op process vars on all devices, remove its input + # vars not in this block + reserved_x = [] + if op.type == 'reduce_any' and self._is_optimize_op(op): + should_insert = True + elif op.type == 'concat' and self._is_optimize_op(op): + for input_name in op.desc.input("X"): + if block._find_var_recursive(input_name): + reserved_x.append(input_name) + op.desc.set_input('X', reserved_x) + elif op.type == 'update_loss_scaling': + for input_name in op.desc.input("X"): + if block._find_var_recursive(input_name): + reserved_x.append(input_name) + op.desc.set_input('X', reserved_x) + op.desc.set_output('Out', reserved_x) + elif op.type == 'check_finite_and_unscale': + for input_name in op.desc.input("X"): + if block._find_var_recursive(input_name): + reserved_x.append(input_name) + op.desc.set_input('X', reserved_x) + op.desc.set_output('Out', reserved_x) + if len(reserved_x) == 0: + block._remove_op(op_idx) + op_size -= 1 + continue + elif op.type == 'sum' and self._is_gradient_clip_op(op): + for input_name in op.desc.input("X"): + if block._find_var_recursive(input_name): + reserved_x.append(input_name) + op.desc.set_input('X', reserved_x) + should_insert = True + + vars = op.desc.input_arg_names() + op.desc.output_arg_names() + for var in vars: + # a var whose name contains "blocking_queue" + # only exists in startup program + if var in used_var_set or "_blocking_queue" in var: + continue + used_var_set.add(var) + if block._find_var_recursive(str(var)): + continue + source_var = ori_block._var_recursive(str(var)) + if source_var.type == core.VarDesc.VarType.READER: + dest_var = block.create_var( + name=var, + type=core.VarDesc.VarType.READER, + persistable=source_var.persistable, + ) + elif isinstance(source_var, Parameter): + dest_var = block.create_parameter( + name=source_var.name, + shape=source_var.shape, + dtype=source_var.dtype, + type=source_var.type, + lod_level=source_var.lod_level, + stop_gradient=source_var.stop_gradient, + trainable=source_var.trainable, + optimize_attr=source_var.optimize_attr, + regularizer=source_var.regularizer, + error_clip=source_var.error_clip, + ) + else: + dest_var = block._clone_variable(source_var, False) + self._clone_var_attr(dest_var, source_var) + # When use with sharding, allreduce_sum and allreduce_max + # used for global gradient clip and amp will be added by sharding. 
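+            # so the allreduce insertion below is skipped when sharding is in use, or when this op does not need one (should_insert is False).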
+ op_idx += 1 + if self.use_sharding or not should_insert: + continue + inserted_ops = self._insert_allreduce_op(op_idx - 1, block) + added_op_num += inserted_ops + op_idx += inserted_ops + block._sync_with_cpp() + + def _is_loss_grad_op(self, op): + assert self._op_role_key in op.attr_names + op_role = int(op.attr(self._op_role_key)) + return op_role & int(self._op_role.Backward) and op_role & int( + self._op_role.Loss + ) + + def _is_forward_op(self, op): + return self._op_role_key in op.attr_names and ( + int(op.attr(self._op_role_key)) == int(self._op_role.Forward) + ) + + def _is_backward_op(self, op): + return self._op_role_key in op.attr_names and ( + int(op.attr(self._op_role_key)) & int(self._op_role.Backward) + ) + + def _is_loss_op(self, op): + assert self._op_role_key in op.attr_names + return int(op.attr(self._op_role_key)) == int(self._op_role.Loss) + + def _is_optimize_op(self, op): + return self._op_role_key in op.attr_names and ( + int(op.attr(self._op_role_key)) & int(self._op_role.Optimize) + ) + + def _is_update_op(self, op): + return ( + 'Param' in op.input_names + and 'Grad' in op.input_names + and ("LearningRate" in op.input_names) + ) + + def _split_program(self, main_program, devices): + """ + Split a program into sections according to devices that ops run on. + The op whose op_device attr is "gpu:all" is copied to all sections. + + Args: + main_program (Program): the main program + devices: all used devices + """ + # Map from device to its corresponding section program info + device_program_map = defaultdict(Program) + + block = main_program.block(0) + for op in block.ops: + device = op.attr(self._op_device_key) + # Copy ops whose op_device set to "gpu:all" to all sections. + if device == f"{self._device}:all": + for device in devices: + program = device_program_map[device] + op_desc = op.desc + ap_op = program.global_block().desc.append_op() + ap_op.copy_from(op_desc) + ap_op._set_attr(self._op_device_key, "") + else: + program = device_program_map[device] + op_desc = op.desc + ap_op = program.global_block().desc.append_op() + ap_op.copy_from(op_desc) + ap_op._set_attr(self._op_device_key, "") + + program_list = [] + for key in devices: + program = device_program_map[key] + program._sync_with_cpp() + program_list.append(program) + + return program_list + + def _get_op_device_for_startup_program(self, var_name): + """ + For adam optimizer, it will add accumulators and initialize them + with fill_constant, and force the op device to cpu. Hence, we should + get the real op_device attribute of the fill_constant as the device + where the corresponding parameters on. + """ + assert "beta1_pow_acc" in var_name or "beta2_pow_acc" in var_name, ( + 'For accumulators for Adam, the name must contain beta1_pow_acc ' + 'or beta2_pow_acc.' + ) + param_name = var_name[0 : var_name.index('_beta')] + device = self._param_device_map[param_name] + return device + + def _split_startup_program(self, startup_program, device_id): + block = startup_program.global_block() + new_startup_program = Program() + for op in block.ops: + device = op.attr(self._op_device_key) + if device == "cpu": + assert op.type == "fill_constant", ( + "For ops in startup program with the op_device attribute " + "of cpu, they must be of type fill_constant." 
+ ) + output_var = op.output_arg_names[0] + device = self._get_op_device_for_startup_program(output_var) + + if device: + device_index = int(device.split(':')[1]) + else: + # LR related ops + device = None + if device and device_index != device_id: + continue + op_desc = op.desc + ap_op = new_startup_program.global_block().desc.append_op() + ap_op.copy_from(op_desc) + ap_op._set_attr(self._op_device_key, "") + new_startup_program._sync_with_cpp() + self._create_vars(new_startup_program.global_block(), block) + return new_startup_program + + def _find_post_op(self, index, var_name): + """ + Find the post op that has variable named var_name as input. + """ + # bugfix for uniform hybrid parallelism + if '.cast_fp32' in var_name: + var_name = var_name.replace('.cast_fp32', '') + if '.cast_fp16' in var_name: + var_name = var_name.replace('.cast_fp16', '') + + post_ops = self.input_var_to_op[var_name] + if post_ops is None: + return None + result_op = None + for post_op, post_idx in reversed(post_ops): + if post_idx > index: + result_op = post_op + break + return result_op + + def _find_prev_op(self, index, var_name): + """ + Find the previous op of op with index that outputs + variable named var_name. + """ + prev_ops = self.output_var_to_op[var_name] + if prev_ops is None: + return None + result_op = None + for prev_op, prev_idx in reversed(prev_ops): + if prev_idx < index: + result_op = prev_op + break + return result_op + + def _rename_arg(self, op, old_name, new_name): + op._rename_input(old_name, new_name) + op._rename_output(old_name, new_name) + + def _create_var(self, block, ref_var, name, dtype=None): + """ + Create a new var for block, which has the same type, + shape and dtype as ref_var, then rename it with the + name `name`. + """ + new_var = block.create_var( + name=name, + shape=ref_var.shape, + dtype=ref_var.dtype if dtype is None else dtype, + type=ref_var.type, + lod_level=ref_var.lod_level, + persistable=ref_var.persistable, + is_data=ref_var.is_data, + need_check_feed=ref_var.desc.need_check_feed(), + ) + self._clone_var_attr(new_var, ref_var) + return new_var + + def _clone_var_attr(self, dest, src): + dest.stop_gradient = src.stop_gradient + if hasattr(src, 'is_distributed'): + dest.is_distributed = src.is_distributed + + def _strip_grad_suffix(self, name): + """ + Strip the grad suffix from the given variable name + """ + pos = name.find(core.grad_var_suffix()) + return name[:pos] if pos != -1 else name + + def _append_grad_suffix(self, name): + """ + Append grad suffix to the given variable name + """ + return name + core.grad_var_suffix() + + def _get_op_device_attr(self, op): + """ + Get the op_device attribute of a op. + """ + device = ( + op.attr(self._op_device_key) + if op.has_attr(self._op_device_key) + else None + ) + if device: + assert device[0:3] == 'gpu', ( + "Now, only gpu devices are " + "supported in pipeline parallemism." + ) + return device + + def _add_op_device_attr_for_op(self, op, idx, block): + """ + Add op_device attrribute for ops that have not that attribute set. + We use "gpu:all" to represent the op should be put on all + sub-programs, such as lr-related ops. Note that: "gpu:all" + is only used by pipeline as an indicator. 
+ """ + lrsched_role = int(self._op_role.LRSched) + if op.attr(self._op_role_key) == lrsched_role: + # For LRSched ops, we should put them on all sub-programs to + # make sure each sub-program update the lr correctly + op._set_attr(self._op_device_key, f"{self._device}:all") + # bugfix in hybrid parallelism + elif op.type == "sum" and self._is_backward_op(op): + # For sum ops that compute the sum of @RENAMED@ vars + for name in op.desc.input_arg_names(): + assert ( + '@RENAME@' in name + ), "The op must be sum used to accumulate renamed vars." + assert len(op.desc.output_arg_names()) == 1 + out_name = op.desc.output_arg_names()[0] + post_op = self._find_post_op(idx, out_name) + assert post_op.has_attr( + 'op_device' + ), "{} has no op_device attr for var {}".format( + post_op.type, out_name + ) + device = post_op.attr(self._op_device_key) + assert device, "The post op must have op_device set." + op._set_attr(self._op_device_key, device) + elif (op.type == "cast" or op.type == "scale") and ( + self._is_backward_op(op) or self._is_forward_op(op) + ): + prev_op = self._find_prev_op(idx, op.desc.input("X")[0]) + op._set_attr(self._op_device_key, prev_op.attr(self._op_device_key)) + elif op.type == "memcpy" and not self._is_optimize_op(op): + # for checkpoint offloading + assert ( + len(op.input_arg_names) == 1 and len(op.output_arg_names) == 1 + ) + input_name = op.input_arg_names[0] + output_name = op.output_arg_names[0] + if '@Fetch' in output_name: + post_op = self._find_post_op(idx, output_name) + op._set_attr( + self._op_device_key, post_op.attr(self._op_device_key) + ) + else: + prev_op = self._find_prev_op(idx, op.desc.input("X")[0]) + op._set_attr( + self._op_device_key, prev_op.attr(self._op_device_key) + ) + elif self._is_loss_op(op): + # For loss * loss_scaling op added by AMP + offset = 1 + while not block.ops[idx + offset].has_attr( + self._op_device_key + ) or not block.ops[idx + offset].attr(self._op_device_key): + offset += 1 + device = block.ops[idx + offset].attr(self._op_device_key) + assert device, "Please put you program within device_guard scope." + for i in range(offset): + block.ops[idx + i]._set_attr(self._op_device_key, device) + elif self._is_optimize_op(op) and op.type == "cast": + # For fp16-->fp32 cast added by AMP + grad_name = op.output('Out') + assert len(grad_name) == 1 + param_name = self._strip_grad_suffix(grad_name[0]) + device = self._param_device_map[param_name] + op._set_attr(self._op_device_key, device) + elif self._is_gradient_clip_op(op) or self._is_regularization_op(op): + # For gradient clip and regularization ops, we set their op_device + # attribute to the device where their corresponding parameters on. + assert self._op_role_var_key in op.attr_names, ( + "gradient_clip " + "and regularization ops must have op_role_var attribute." + ) + op_role_var = op.attr(self._op_role_var_key) + assert len(op_role_var) == 2, ( + "op_role_var for gradient_clip " + "regularization ops must have two elements." 
+ ) + param_name = op_role_var[0] + device = self._param_device_map[param_name] + # For sum op added by global gradient clip, it must be + # put on all devices + if ( + op.type == 'sum' + or op.type == 'sqrt' + or op.type == 'fill_constant' + or op.type == 'elementwise_max' + or op.type == 'elementwise_div' + ): + device = f"{self._device}:all" + op._set_attr(self._op_device_key, device) + elif op.type == "alloc_float_status" or op.type == "clear_float_status": + op._set_attr(self._op_device_key, f"{self._device}:all") + # NOTE(wangxi): NPU should only clear the float status + # once at each batch step + op._set_attr(self._op_role_key, self._op_role.LRSched) + + float_status_name = op.output_arg_names[0] + float_status_var = block.var(float_status_name) + # FIXME(wangxi): pipeline lr schedule will exec on sub_scope(0) + # while update will exec on sub_scope(last_micro_step), should + # set persistable to use global scope + float_status_var.persistable = True + else: + other_known_ops = [ + 'update_loss_scaling', + 'reduce_any', + 'concat', + 'sum', + 'check_finite_and_unscale', + 'memcpy', + ] + assert op.type in other_known_ops, ( + "For other ops without " + "op_device set, they must be one of {}, but it " + "is {}".format(other_known_ops, op.type) + ) + assert self._is_optimize_op(op) + op._set_attr(self._op_device_key, f"{self._device}:all") + + def _add_op_device_attr(self, block): + """ + Add op_device attrribute for ops in block that have + not that attribute set. + """ + for idx, op in enumerate(list(block.ops)): + if ( + op.type == "create_py_reader" + or op.type == "read" + or op.type == "create_double_buffer_reader" + ): + # Copy read related ops to all section to make them exit + # after each epoch. + # We use "gpu:all" to represent the op should be put on all + # sub-programs, such as lr-related ops. Note that: "gpu:all" + # is only used by pipeline as an indicator. + op._set_attr(self._op_device_key, f"{self._device}:all") + continue + # op_device attribute has been set + if self._get_op_device_attr(op): + continue + self._add_op_device_attr_for_op(op, idx, block) + + def _check_validation(self, block): + """ + Check whether ops in a block have both the op_device and the + op_role attributes set. + Then, return all devices in order. + """ + device_list = [] + # Section worker only supports the following op_role + valid_op_role_value = [ + int(self._op_role.LRSched), + int(self._op_role.Forward), + int(self._op_role.Backward), + int(self._op_role.Loss), + int(self._op_role.Optimize), + int(self._op_role.Backward) | int(self._op_role.Loss), + ] + for op in block.ops: + if not op._has_kernel(op.type): + assert op.type == "conditional_block" and ( + op.attr(self._op_role_key) == int(self._op_role.LRSched) + ), ( + "Now, the only supported op without kernel is " + "conditional_block, and its op role must be LRSched." + ) + assert op.has_attr( + self._op_role_key + ), f"op ({op.type}) has no {self._op_role_key} attribute." 
+ op_role = op.attr(self._op_role_key) + assert ( + int(op_role) in valid_op_role_value + ), "op_role {} for op {} must be one of {}".format( + op_role, op.type, valid_op_role_value + ) + + assert op.has_attr( + self._op_device_key + ), "op ({}) has no {} attribute.".format( + op.type, self._op_device_key + ) + + device = op.attr(self._op_device_key) + assert ( + device + ), "op_device attribute for op " "{} has not been set.".format( + op.type + ) + if device == f"{self._device}:all": + continue + + dev_type = device.split(':')[0] + assert dev_type == "gpu", ( + "Now only gpu devices are supported " + "for pipeline parallelism." + ) + + if device not in device_list: + device_list.append(device) + + return device_list + + def _insert_sendrecv_ops_for_boundaries(self, block): + """ + Insert a pair of send and recv ops for every two + consecutive ops on different devices. + """ + # A map from var to device where op takes it as input, + # avoiding multiple send and recv ops. + input_var_to_device = {} + # bugfix hybrid parallelism + first_optimize_index = None + for index, op in enumerate(list(block.ops)): + if self._is_optimize_op(op): + first_optimize_index = index + break + extra_index_info = { + 'index': 0, + 'first_optimize_index': first_optimize_index, + } + + for index, op in enumerate(list(block.ops)): + cur_device = op.attr(self._op_device_key) + if cur_device == f"{self._device}:all": + continue + for var_name in op.input_arg_names: + var = block.var(var_name) + # skip data var + if var.is_data: + continue + prev_device = None + + prev_op = self._find_prev_op(index, var_name) + if prev_op is None: + if var_name not in self._param_device_map: + continue + prev_device = self._param_device_map[var_name] + + if not prev_device: + prev_device = ( + prev_op.attr(self._op_device_key) if prev_op else None + ) + + if prev_device is None or prev_device == f"{self._device}:all": + continue + + if prev_device == cur_device: + continue + + if var_name not in input_var_to_device: + input_var_to_device[var_name] = [] + if (cur_device, prev_device) in input_var_to_device[var_name]: + continue + + device_type = cur_device.split(':')[0] + ':' + + def _check_stage(cur_id, prev_id): + # check send/recv stage valid + is_forward = self._is_forward_op(op) + is_backward = self._is_backward_op(op) + assert is_forward or is_backward, ( + 'send/recv in pipeline should only be inserted in forward or backward,' + 'please check the op_role of op={}'.format(op) + ) + + if is_forward: + assert prev_id < cur_id, ( + "In forward, send/recv can only be passed forward, but now " + "prev_stage={} great than cur_stage={}, please check op_device of op={}".format( + prev_id, cur_id, op + ) + ) + elif is_backward: + assert prev_id > cur_id, ( + "In backward, send/recv can only be passed backward, but now " + "prev_stage={} less than cur_stage={}, please check op_device of op={}".format( + prev_id, cur_id, op + ) + ) + + def _insert_send_recv(cur_id, prev_id): + cur_dev = device_type + str(cur_id) + prev_dev = device_type + str(prev_id) + if (cur_dev, prev_dev) in input_var_to_device[var_name]: + return + + if cur_id - prev_id > 1: + _insert_send_recv(cur_id - 1, prev_id) + _insert_send_recv(cur_id, cur_id - 1) + input_var_to_device[var_name].append( + (cur_dev, prev_dev) + ) + return + elif cur_id - prev_id < -1: + _insert_send_recv(cur_id + 1, prev_id) + _insert_send_recv(cur_id, cur_id + 1) + input_var_to_device[var_name].append( + (cur_dev, prev_dev) + ) + return + + assert abs(cur_id - prev_id) == 1 + 
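+                    # adjacent pipeline stages: record the (cur_dev, prev_dev) pair and insert the concrete communication ops below according to schedule_mode ('F-then-B' or '1F1B').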
input_var_to_device[var_name].append((cur_dev, prev_dev)) + + op_role = op.attr(self._op_role_key) + var = block.vars[var_name] + pair = (prev_id, cur_id) + # 1000 is just a magic number + pair_key = prev_id * 1000 + cur_id + if pair not in self._pipeline_pair: + self._pipeline_pair.append(pair) + self._pp_ring_map[pair_key] = self.ring_id + ring_id = self.ring_id + self.ring_id += 1 + else: + ring_id = self._pp_ring_map[pair_key] + + if self.schedule_mode == 'F-then-B': # F-then-B + block._insert_op_without_sync( + index=index + extra_index_info['index'], + type='send_v2', + inputs={'X': var}, + attrs={ + self._op_device_key: prev_dev, + self._op_role_key: op_role, + 'use_calc_stream': True, + 'peer': 1, + 'ring_id': ring_id, + }, + ) + extra_index_info['index'] += 1 + var_shape = list(var.shape) + var_shape[0] = ( + self.micro_batch_size + if var_shape[0] < 0 + else var_shape[0] + ) + block._insert_op_without_sync( + index=index + extra_index_info['index'], + type='recv_v2', + outputs={'Out': [var]}, + attrs={ + 'out_shape': var_shape, + 'dtype': var.dtype, + self._op_device_key: cur_dev, + self._op_role_key: op_role, + 'use_calc_stream': True, + 'peer': 0, + 'ring_id': ring_id, + }, + ) + extra_index_info['index'] += 1 + elif self.schedule_mode == '1F1B': # 1F1B + var_shape = list(var.shape) + var_shape[0] = ( + self.micro_batch_size + if var_shape[0] < 0 + else var_shape[0] + ) + + numel = np.prod(var_shape) + use_mp = (self.mp_degree > 1) and ( + numel % self.mp_degree == 0 + ) + + if 'subprog' in var.name: + # For recompute, if the checkpoints var is layer_norm_6.tmp_2 + # this var will be sent twice, layer_norm_6.tmp_2 for forward pass, + # layer_norm_6.tmp_2.subprog_* for recompute pass. + # We can store the first sent var and copy the value to the + # second one to reduce one send/recv op. + # The origin_ckpt_name is layer_norm_6.tmp_2, which will be used + # to find the stored var for the forward pass. 
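+                        # Reuse the value already received for the forward-pass var through an 'assign' op instead of issuing a second send/recv.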
+ origin_name = var.name.split('subprog')[0][0:-1] + associate_var = block.var(origin_name) + block._insert_op_without_sync( + index=index + extra_index_info['index'], + type='assign', + inputs={'X': [associate_var]}, + outputs={'Out': [var]}, + attrs={ + 'out_shape': var_shape, + 'dtype': var.dtype, + self._op_device_key: cur_dev, + self._op_role_key: op_role, + 'use_calc_stream': True, + }, + ) + extra_index_info['index'] += 1 + return + + _check_stage(cur_id, prev_id) + + block._insert_op_without_sync( + index=index + extra_index_info['index'], + type='c_sync_calc_stream', + inputs={'X': [var]}, + outputs={'Out': [var]}, + attrs={ + self._op_device_key: prev_dev, + self._op_role_key: op_role, + }, + ) + extra_index_info['index'] += 1 + prefix_name = var.name.split('@')[0] + prefix_var = block.var(prefix_name) + is_param = ( + True if isinstance(prefix_var, Parameter) else False + ) + block._insert_op_without_sync( + index=index + extra_index_info['index'], + type='send_v2' + if not use_mp or is_param + else 'partial_send', + inputs={'X': var}, + attrs={ + self._op_device_key: prev_dev, + self._op_role_key: op_role, + 'use_calc_stream': False, + 'ring_id': ring_id, + 'peer': 1, + # if send_v2, num&id attr is not in op_attrs, will not insert + 'num': self.mp_degree, + 'id': self.mp_rank, + }, + ) + extra_index_info['index'] += 1 + insert_index = None + if int(op_role) == int(self._op_role.Backward): + insert_index = extra_index_info[ + 'first_optimize_index' + ] + new_op_role = self._op_role.Optimize + else: + insert_index = index + new_op_role = self._op_role.Backward + sync_comm_op = block._insert_op_without_sync( + index=insert_index + extra_index_info['index'], + type='c_sync_comm_stream', + inputs={'X': [var]}, + outputs={'Out': [var]}, + attrs={ + self._op_device_key: prev_dev, + self._op_role_key: new_op_role, + 'ring_id': ring_id, + }, + ) + if int(op_role) == int(self._op_role.Forward): + sync_comm_op._set_attr('pipeline_flag', '') + extra_index_info['index'] += 1 + block._insert_op_without_sync( + index=index + extra_index_info['index'], + type='recv_v2' + if not use_mp or is_param + else 'partial_recv', + outputs={'Out': [var]}, + attrs={ + 'out_shape': var_shape, + 'dtype': var.dtype, + self._op_device_key: cur_dev, + self._op_role_key: op_role, + 'use_calc_stream': True, + 'peer': 0, + 'ring_id': ring_id, + # if recv_v2, num&id attr is not in op_attrs, will not insert + 'num': self.mp_degree, + 'id': self.mp_rank, + }, + ) + extra_index_info['index'] += 1 + if use_mp and not is_param: + block._insert_op_without_sync( + index=index + extra_index_info['index'], + type='partial_allgather', + inputs={'X': [var]}, + outputs={'Out': [var]}, + attrs={ + self._op_device_key: cur_dev, + self._op_role_key: op_role, + 'use_calc_stream': True, + 'ring_id': 0, + # if recv_v2, num&id attr is not in op_attrs, will not insert + 'nranks': self.mp_degree, + 'rank': self.mp_rank, + }, + ) + extra_index_info['index'] += 1 + else: + raise ValueError( + "Now only 'F-then-B' and '1F1B' are supported." + "The given value is {}.".format(self.schedule_mode) + ) + + _insert_send_recv( + int(cur_device.split(':')[1]), + int(prev_device.split(':')[1]), + ) + block._sync_with_cpp() + + def _insert_loss_scale(self, block): + """ + Scale the loss corresponding to number of micro-batches. 
+ """ + if self._num_microbatches == 1: + return + for index, op in reversed(tuple(enumerate(list(block.ops)))): + if self._is_loss_grad_op(op): + assert op.type == 'fill_constant', ( + "loss_grad_op must be fill_constant op, " + "but this op is {}".format(op.type) + ) + assert op.has_attr('value') + loss_scale = float(op.attr('value')) + loss_scale = loss_scale / self._num_microbatches + op._set_attr('value', loss_scale) + break + + def _rename_gradient_var_name(self, block): + for index, op in enumerate(block.ops): + if not self._is_optimize_op(op): + continue + input_names = op.input_arg_names + output_names = op.output_arg_names + in_out_names = input_names + output_names + if op.type == 'cast' or op.type == "c_sync_comm_stream": + continue + # append "MERGED" to the names of parameter gradients, + # and mofify the op_role_var attribute (by rename_arg func). + for name in in_out_names: + if not core.grad_var_suffix() in name: + continue + param_name = name.strip(core.grad_var_suffix()) + new_grad_name = name + "@MERGED" + self._rename_arg(op, name, new_grad_name) + + def _accumulate_gradients( + self, block, pp_allreduce_in_optimize=False, strategy=None, shard=None + ): + """ + Create a new merged gradient for each parameter and accumulate the + corresponding gradient to it. + """ + fp16_allreduce = strategy.fp16_allreduce if strategy else False + if strategy and strategy.fuse_grad_merge: + fused_gradient_names = self._accumulate_gradients_with_fuse( + block, fp16_allreduce, strategy.fuse_grad_size_in_MB, shard + ) + return fused_gradient_names + + merged_gradient_names = [] + first_opt_op_idx = None + + merged_suffix = '@MERGED@FP16' if fp16_allreduce else '@MERGED' + dtype = paddle.float16 if fp16_allreduce else None + + for index, op in reversed(tuple(enumerate(list(block.ops)))): + # remove the cast op of fp16 grad to fp32 grad + if self._is_optimize_op(op) and op.type == 'cast': + in_name = op.input_arg_names[0] + out_name = op.output_arg_names[0] + if out_name.strip('@GRAD') in self._param_device_map: + assert in_name.replace('.cast_fp16', '') == out_name + block._remove_op(index) + continue + + if self._is_backward_op(op) and first_opt_op_idx is None: + first_opt_op_idx = index + 1 + # maybe have no optimize + # if first_opt_op_idx == len(block.ops): return + + if self._is_backward_op(op) and ( + self._op_role_var_key in op.attr_names + ): + op_role_var = op.attr(self._op_role_var_key) + if len(op_role_var) == 0: + continue + assert len(op_role_var) % 2 == 0 + for i in range(0, len(op_role_var), 2): + offset = 0 + param_name = op_role_var[i] + if not block.has_var(param_name): + continue + if '@BroadCast' in param_name: + continue + + param_grad_name = param_name + core.grad_var_suffix() + merged_param_grad_name = param_grad_name + merged_suffix + if not block.has_var(merged_param_grad_name): + self._create_var( + block, + block.vars[param_name], + merged_param_grad_name, + dtype, + ) + assert block.has_var(merged_param_grad_name) + + param_grad_var = block.var(param_grad_name) + merged_param_grad_var = block.var(merged_param_grad_name) + merged_param_grad_var.persistable = True + block._insert_op( + index=first_opt_op_idx + offset, + type='fill_constant', + inputs={}, + outputs={'Out': [merged_param_grad_var]}, + attrs={ + 'shape': merged_param_grad_var.shape, + 'dtype': merged_param_grad_var.dtype, + 'value': float(0), + # a trick to run this op once per mini-batch + self._op_role_key: self._op_role.Optimize.LRSched, + }, + ) + offset += 1 + grad_name = op_role_var[i + 1] + 
grad_var = block.vars[grad_name] + + is_fp16_grad = 'cast_fp16' in grad_name + need_cast = is_fp16_grad is not fp16_allreduce + + if need_cast: + # if fp16_allreduce: + # cast grad to fp16 to accumulate to merged gradient + # else: + # cast grad to fp32 to accumulate to merged gradient + cast_grad_var_name = param_grad_name + '@TMP' + cast_grad_var = self._create_var( + block, param_grad_var, cast_grad_var_name, dtype + ) + cast_grad_var.persistable = False + block._insert_op( + index=first_opt_op_idx + offset, + type='cast', + inputs={'X': grad_var}, + outputs={'Out': cast_grad_var}, + attrs={ + 'in_dtype': grad_var.dtype, + 'out_dtype': cast_grad_var.dtype, + self._op_role_key: self._op_role.Backward, + }, + ) + offset += 1 + grad_var = cast_grad_var + + block._insert_op( + index=first_opt_op_idx + offset, + type='sum', + inputs={'X': [merged_param_grad_var, grad_var]}, + outputs={'Out': merged_param_grad_var}, + attrs={ + self._op_role_key: self._op_role.Backward, + }, + ) + offset += 1 + merged_gradient_names.append(merged_param_grad_name) + + if not fp16_allreduce: + return merged_gradient_names + + first_opt_op_idx = None + for index, op in reversed(tuple(enumerate(list(block.ops)))): + if self._is_backward_op(op) and first_opt_op_idx is None: + first_opt_op_idx = index + 1 + break + assert first_opt_op_idx is not None + + # insert cast op from fp16->fp32 + # FIXME(wangxi): maybe put in sharding is better, for some grad + # is not in sharding device. + for fp16_grad_name in merged_gradient_names: + grad_name = fp16_grad_name.replace('@FP16', '') + param_name = fp16_grad_name.replace('@GRAD@MERGED@FP16', '') + + if not block.has_var(grad_name): + self._create_var(block, block.vars[param_name], grad_name) + assert block.has_var(grad_name) + + fp16_grad_var = block.var(fp16_grad_name) + grad_var = block.var(grad_name) + grad_var.persistable = False + + block._insert_op( + index=first_opt_op_idx, + type='cast', + inputs={'X': fp16_grad_var}, + outputs={'Out': grad_var}, + attrs={ + 'in_dtype': fp16_grad_var.dtype, + 'out_dtype': grad_var.dtype, + self._op_role_key: self._op_role.Optimize, + }, + ) + + return merged_gradient_names + + def _insert_accumulate_gradients_with_fuse( + self, main_block, fp16, fused_size, grad_param_pairs, first_opt_op_idx + ): + grad_param_pairs = self._sort_grad_param_by_dtype( + main_block, grad_param_pairs + ) + + grad_param_segments = [] + merged_suffix = '@MERGED@FP16' if fp16 else '@MERGED' + dtype = paddle.float16 if fp16 else paddle.float32 + cur_size = 0.0 + last_dtype = None + # split the grad based on dtype and fused size + for grad, param in grad_param_pairs: + real_grad = main_block.var(grad) + # create the gradient merged var for each grad + merged_grad_var = main_block.create_var( + name=param + core.grad_var_suffix() + merged_suffix, + dtype=dtype, + shape=real_grad.shape, + persistable=True, + stop_gradient=False, + ) + real_param = main_block.var(param) + if hasattr(real_param, 'is_distributed'): + merged_grad_var.is_distributed = real_param.is_distributed + tmp_size = self._get_var_size(real_grad) + # two strategies for splitting the grad + # 1. the current segment's size reach the user defined grad_size_in_MB + # 2. 
the upcoming grad holds a different dtype than the grads in the current segment
+ if (
+ len(grad_param_segments) == 0
+ or cur_size + tmp_size > fused_size
+ or real_grad.dtype != last_dtype
+ ):
+ grad_param_segments.append(
+ ([real_grad], [real_param], [merged_grad_var])
+ )
+ last_dtype = real_grad.dtype
+ cur_size = 0.0
+ else:
+ grad_param_segments[-1][0].append(real_grad)
+ grad_param_segments[-1][1].append(real_param)
+ grad_param_segments[-1][2].append(merged_grad_var)
+ cur_size += tmp_size
+
+ fused_gradients = []
+ fused_merged_gradients = []
+ # create fused vars for grad and param
+ for grad_param_segment in grad_param_segments:
+ grad_segment = grad_param_segment[0]
+ merged_grad_segment = grad_param_segment[2]
+ fused_grad = main_block.create_var(
+ name=f'FusedGrad_{grad_segment[0].name}',
+ dtype=grad_segment[0].dtype,
+ persistable=False,
+ stop_gradient=False,
+ )
+ # keep the '.cast_fp16' info in the fuse var name
+ fused_merged_grad_name_prefix = (
+ 'FusedMergedGrad.cast_fp16.'
+ if merged_grad_segment[0].dtype == paddle.float16
+ else 'FusedMergedGrad'
+ )
+ fused_merged_grad_name = (
+ fused_merged_grad_name_prefix
+ + f'_{merged_grad_segment[0].name}'
+ )
+ fused_merged_grad = main_block.create_var(
+ name=fused_merged_grad_name,
+ dtype=merged_grad_segment[0].dtype,
+ persistable=True,
+ stop_gradient=False,
+ )
+ fused_gradients.append(fused_grad)
+ fused_merged_gradients.append(fused_merged_grad)
+
+ assert len(fused_gradients) == len(grad_param_segments)
+ assert len(fused_merged_gradients) == len(grad_param_segments)
+
+ # insert coalesce op at the start of the backward pass
+ # use param as the coalesce input to make sure the two Fused vars have the same shape
+ first_back_op_idx = None
+ for index, op in enumerate(main_block.ops):
+ if self._is_backward_op(op) and first_back_op_idx is None:
+ first_back_op_idx = index
+ break
+ assert first_back_op_idx is not None
+ offset = 0
+ for i in range(len(grad_param_segments)):
+ fused_grad = fused_gradients[i]
+ fused_merged_grad = fused_merged_gradients[i]
+ grads = grad_param_segments[i][0]
+ params = grad_param_segments[i][1]
+ merged_grads = grad_param_segments[i][2]
+ main_block._insert_op_without_sync(
+ first_back_op_idx + offset,
+ type="coalesce_tensor",
+ inputs={"Input": params},
+ outputs={"Output": grads, "FusedOutput": fused_grad},
+ attrs={
+ # Explanation of user_defined_size_of_dtype:
+ # In the coalesce op, the align size is 256 bytes;
+ # a float takes 4 bytes while an fp16 takes 2 bytes.
+ # To meet the requirement, 128 fp16 or 64 float will be aligned.
+ # Suppose the total shape of the input tensors is [64]:
+ # if the dtype is float, then the shape of the fuse var is [64],
+ # however if the dtype is fp16, the shape of the fuse var is [128],
+ # which would cause the fused vars' shapes to vary between each other.
+ # To make sure the shapes of the fused vars are identical,
+ # we set the dtype size of both float and fp16 to 2.
+ # In this way, the fused var's shape is [128] for both float and fp16.
+ "user_defined_size_of_dtype": 2,
+ "copy_data": False,
+ "use_align": True,
+ "dtype": grads[0].dtype,
+ self._op_role_key: self._op_role.Backward,
+ # On npu, the nan/inf check logic is different from gpu.
+ # If there are some uninitialized sections in the fused var,
+ # and the values in those sections are nan/inf, it will trigger the nan/inf check.
+ # To avoid these problematic triggers, set constant is needed for npu + "set_constant": core.is_compiled_with_custom_device('npu'), + "constant": float(0.0), + }, + ) + offset += 1 + # For the gradient_merged_fused_var, given a init value during the coalesce op + # this will remove a problematic fill_constant op. This op role of this coalesce + # is set to be LRSched to make this coalesce (with init) only run once + main_block._insert_op_without_sync( + first_back_op_idx + offset, + type="coalesce_tensor", + inputs={"Input": params}, + outputs={ + "Output": merged_grads, + "FusedOutput": fused_merged_grad, + }, + attrs={ + "user_defined_size_of_dtype": 2, + "set_constant": True, + "constant": float(0.0), + "copy_data": False, + "use_align": True, + "dtype": merged_grads[0].dtype, + self._op_role_key: self._op_role.Optimize.LRSched, + }, + ) + offset += 1 + + # insert gradient merge relating ops + first_opt_op_idx += offset + offset = 0 + for i in range(len(fused_gradients)): + fused_grad = fused_gradients[i] + fused_merged_grad = fused_merged_gradients[i] + is_fp16_grad = 'cast_fp16' in fused_grad.name + need_cast = is_fp16_grad is not fp16 + if need_cast: + # for fp16 allreduce, cast fp32 grad to fp16 + # for fp32 allreduce, cast fp16 grad to fp32 + cast_grad_var_name = fused_grad.name + '@TMP' + cast_grad_var = main_block.create_var( + name=cast_grad_var_name, + dtype=dtype, + persistable=False, + stop_gradient=False, + ) + main_block._insert_op( + index=first_opt_op_idx + offset, + type='cast', + inputs={'X': fused_grad}, + outputs={'Out': cast_grad_var}, + attrs={ + 'in_dtype': fused_grad.dtype, + 'out_dtype': cast_grad_var.dtype, + self._op_role_key: self._op_role.Backward, + }, + ) + offset += 1 + fused_grad = cast_grad_var + main_block._insert_op( + index=first_opt_op_idx + offset, + type='sum', + inputs={'X': [fused_merged_grad, fused_grad]}, + outputs={'Out': fused_merged_grad}, + attrs={self._op_role_key: self._op_role.Backward}, + ) + offset += 1 + + if fp16: + # if using fp16 allreduce, the optimizer needs fp32 grads, cast them back to fp32 + for grad, param in grad_param_pairs: + real_grad = main_block.var(grad) + fp16_grad_name = param + core.grad_var_suffix() + '@MERGED@FP16' + assert main_block.has_var(fp16_grad_name) + fp16_grad = main_block.var(fp16_grad_name) + fp32_grad_name = param + core.grad_var_suffix() + '@MERGED' + fp32_grad = main_block.create_var( + name=fp32_grad_name, + dtype=paddle.float32, + shape=real_grad.shape, + persistable=False, + stop_gradient=False, + ) + main_block._insert_op( + index=first_opt_op_idx + offset, + type='cast', + inputs={'X': fp16_grad}, + outputs={'Out': fp32_grad}, + attrs={ + 'in_dtype': paddle.float16, + 'out_dtype': paddle.float32, + self._op_role_key: self._op_role.Optimize, + }, + ) + offset += 1 + + # replace the var with it's name, which will be used for inserting allreduce + for i in range(len(fused_merged_gradients)): + fused_merged_gradients[i] = fused_merged_gradients[i].name + + return fused_merged_gradients, first_opt_op_idx + + def _accumulate_gradients_with_fuse( + self, main_block, fp16, fused_size, shard=None + ): + first_opt_op_idx = None + grad_param_pairs = [] + # obtain all param/grad pairs that needed to be fused + for index, op in reversed(tuple(enumerate(list(main_block.ops)))): + # remove the cast op of fp16 grad to fp32 grad + if self._is_optimize_op(op) and op.type == 'cast': + in_name = op.input_arg_names[0] + out_name = op.output_arg_names[0] + if out_name.strip('@GRAD') in self._param_device_map: + 
assert in_name.replace('.cast_fp16', '') == out_name + main_block._remove_op(index) + continue + + if self._is_backward_op(op) and first_opt_op_idx is None: + first_opt_op_idx = index + 1 + # no optimize phase + if first_opt_op_idx == len(main_block.ops): + return + + if self._is_backward_op(op) and ( + self._op_role_var_key in op.attr_names + ): + op_role_var = op.attr(self._op_role_var_key) + if len(op_role_var) == 0: + continue + assert len(op_role_var) % 2 == 0 + for i in range(0, len(op_role_var), 2): + param_name = op_role_var[i] + if not main_block.has_var(param_name): + continue + if '@BroadCast' in param_name: + continue + grad_param_pairs.append( + (op_role_var[i + 1], op_role_var[i]) + ) + + if len(grad_param_pairs) == 0: + return + + nranks = shard.worker_num if shard else 1 + device_to_pairs = [[] for _ in range(nranks)] + for pair in grad_param_pairs: + root_id = shard.device(pair[1]) if shard else 0 + assert 0 <= root_id < nranks + device_to_pairs[root_id].append(pair) + + all_fused_merged_gradients = [] + for pairs in device_to_pairs: + ( + fused_merged_gradients, + first_opt_op_idx, + ) = self._insert_accumulate_gradients_with_fuse( + main_block, fp16, fused_size, pairs, first_opt_op_idx + ) + all_fused_merged_gradients += fused_merged_gradients + + main_block._sync_with_cpp() + return all_fused_merged_gradients + + def _sort_grad_param_by_dtype(self, main_block, grad_param_pairs): + # sort the grad param paris by the dtype + fp16_pairs = [] + fp32_pairs = [] + other_pairs = [] + for pairs in grad_param_pairs: + dtype = main_block.var(pairs[0]).dtype + if dtype == paddle.float32: + fp32_pairs.append(pairs) + elif dtype == paddle.float16: + fp16_pairs.append(pairs) + else: + other_pairs.append(pairs) + sorted_pairs = fp16_pairs + sorted_pairs.extend(fp32_pairs) + sorted_pairs.extend(other_pairs) + return sorted_pairs + + def _get_var_size(self, var): + dtype_to_size = { + core.VarDesc.VarType.FP16: 2, + core.VarDesc.VarType.BF16: 2, + core.VarDesc.VarType.FP32: 4, + core.VarDesc.VarType.FP64: 8, + core.VarDesc.VarType.INT16: 2, + core.VarDesc.VarType.INT32: 4, + core.VarDesc.VarType.INT64: 8, + core.VarDesc.VarType.BOOL: 1, + core.VarDesc.VarType.UINT8: 1, + } + assert -1 not in var.shape + return ( + reduce(lambda x, y: x * y, var.shape, 1) + * dtype_to_size[var.dtype] + / 1024.0 + / 1024.0 + ) + + def _add_sub_blocks(self, main_block, program_list): + main_program = main_block.program + for prog in program_list: + for op in prog.block(0).ops: + if not op.has_attr('sub_block'): + continue + origin_sub_block_id = op.attr('sub_block').id + origin_sub_block = main_program.block(origin_sub_block_id) + new_sub_block = prog._create_block(parent_idx=0) + for sub_op in origin_sub_block.ops: + op_desc = sub_op.desc + ap_op = new_sub_block.desc.append_op() + ap_op.copy_from(op_desc) + new_sub_block._sync_with_cpp() + self._create_vars(new_sub_block, origin_sub_block) + op._set_attr('sub_block', new_sub_block) + + def _get_device_info(self, block): + for op in block.ops: + if not op._has_kernel(op.type): + continue + op_device = op.attr(self._op_device_key) + return op_device + + def _process_persistable_vars_in_multi_sections( + self, main_program, startup_prog, program_list + ): + """ + Special Case: process persistable vars that exist in + multiple sections, e.g., shared weight + """ + # var_info = {var_name: [program1, program2...]}, + # persistable var only + var_info = {} + for prog in program_list: + block = prog.block(0) + for var_name in block.vars: + if var_name == 
"double_buffer_0": + continue + var = block.var(var_name) + if not var.persistable: + continue + if var_name not in var_info: + var_info[var_name] = [] + if prog not in var_info[var_name]: + var_info[var_name].append(prog) + for var_name in list(var_info.keys()): + if len(var_info[var_name]) == 1: + var_info.pop(var_name) + + # write_info = {var_name: program}, where program is the only program + # in which the var named var_name is written. + write_info = {} + for var_name in var_info.keys(): + for prog in var_info[var_name]: + block = prog.block(0) + for op in block.ops: + if ( + op.type == "recv_v2" + or op.type == "create_py_reader" + or op.type == "read" + or op.type == "update_loss_scaling" + ): + continue + # We have processed lr related vars + if op.attr(self._op_role_key) == int( + self._op_role.Optimize.LRSched + ): + continue + if var_name in op.desc.output_arg_names(): + assert var_name not in write_info, ( + "two sections write the same var({}): second " + "op {}.".format(var_name, op) + ) + write_info[var_name] = prog + break + + for var_name in var_info.keys(): + # Case 1: read only variables, no special process + if var_name not in write_info: + continue + + # Case 2: one write multiple reads + write_prog = write_info[var_name] + write_block = write_prog.block(0) + write_device = self._get_device_info(write_block) + write_dev_index = int(write_device.split(':')[1]) + all_progs = var_info[var_name] + for prog in all_progs: + if prog == write_prog: + continue + read_block = prog.block(0) + read_device = self._get_device_info(read_block) + read_dev_index = int(read_device.split(':')[1]) + pair = (write_dev_index, read_dev_index) + pair_key = write_dev_index * 1000 + read_dev_index + if pair not in self._pipeline_pair: + self._pipeline_pair.append(pair) + self._pp_ring_map[pair_key] = self.ring_id + ring_id = self.ring_id + self.ring_id += 1 + else: + ring_id = self._pp_ring_map[pair_key] + + write_block._insert_op( + index=0, + type='send_v2', + inputs={ + 'X': write_block.var(var_name), + }, + attrs={ + self._op_device_key: write_device, + 'use_calc_stream': False, + # A trick to make the role LRSched to avoid copy every + # microbatch + self._op_role_key: self._op_role.LRSched, + 'peer': read_dev_index, + 'ring_id': ring_id, + }, + ) + read_block._insert_op( + index=0, + type='recv_v2', + outputs={'Out': [read_block.var(var_name)]}, + attrs={ + 'out_shape': read_block.var(var_name).shape, + 'dtype': read_block.var(var_name).dtype, + self._op_device_key: read_device, + 'use_calc_stream': False, + # A trick to make the role LRSched to avoid copy every + # microbatch + self._op_role_key: self._op_role.LRSched, + 'peer': write_dev_index, + 'ring_id': ring_id, + }, + ) + read_block._insert_op( + index=1, + type='c_sync_comm_stream', + inputs={'X': [read_block.var(var_name)]}, + outputs={'Out': [read_block.var(var_name)]}, + attrs={ + self._op_device_key: read_device, + # A trick to make the role LRSched to avoid copy every + # microbatch + self._op_role_key: self._op_role.LRSched, + 'ring_id': ring_id, + }, + ) + + def _is_gradient_clip_op(self, op): + return op.desc.has_attr("op_namescope") and op.desc.attr( + "op_namescope" + ).startswith("/gradient_clip") + + def _is_regularization_op(self, op): + return op.desc.has_attr("op_namescope") and op.desc.attr( + "op_namescope" + ).startswith("/regularization") + + def _is_weight_decay_op(self, op): + # in AdamW namescope is /optimizer_*/weight decay/ + return op.desc.has_attr( + "op_namescope" + ) and 'weight decay' in 
op.desc.attr("op_namescope") + + def _get_input_output_info(self, block): + ''' + Get info of op input and output. + ''' + # A map from output var to op which generate it. + output_var_to_op = defaultdict(list) + # A map from var to op which takes it as input. + input_var_to_op = defaultdict(list) + + for index, op in enumerate(block.ops): + for var_name in op.input_arg_names: + input_var_to_op[var_name].append([op, index]) + for var_name in op.output_arg_names: + output_var_to_op[var_name].append([op, index]) + + return output_var_to_op, input_var_to_op + + def _optimize_forward_send_sync(self, program): + """ + optimize forward send's sync_comm_stream schedule + """ + if self.schedule_mode != '1F1B': + return + + block = program.block(0) + + recv_type = 'recv_v2' if self.mp_degree == 1 else 'partial_recv' + backward_recv_index = None + for index, op in enumerate(block.ops): + if op.type == recv_type and self._is_backward_op(op): + backward_recv_index = index + break + + # last pipeline stage + if backward_recv_index is None: + return + + offset = 0 + for index, op in enumerate(list(block.ops)): + if index >= backward_recv_index: + break + if op.type == 'c_sync_comm_stream' and op.has_attr('pipeline_flag'): + var_name = op.input_arg_names[0] + var = block.var(var_name) + block._remove_op(index + offset, sync=False) + offset -= 1 + # NOTE: + # 1. When the backward recv is completed, it indicates + # that the forward send is completed too. So we only need + # to use the NOP op to prevent memory release. + # 2. Because we removed sync_comm_op, + # we will insert NOP after recv_op. + block._insert_op_without_sync( + index=backward_recv_index, + type='nop', + inputs={'X': [var]}, + outputs={'Out': [var]}, + attrs={self._op_role_key: self._op_role.Backward}, + ) + block._sync_with_cpp() + + def _mv_head_recv(self, program): + """ + A pass to move the recv op to the beginning of + the forward/backward phase + """ + forward_insert_index = 0 + backward_insert_index = None + block = program.global_block() + num_ops = len(program.global_block().ops) + for i in range(num_ops): + insert_index = None + op = program.global_block().ops[i] + op_role = int(op.attr(self._op_role_key)) + if ( + op_role == int(self._op_role.Backward) + and backward_insert_index is None + ): + backward_insert_index = i + if ( + op.type != "partial_recv" + and op.type != "partial_allgather" + and op.type != "nop" + and op.type != "recv_v2" + ): + continue + if op_role == int(self._op_role.Forward): + if i == forward_insert_index: + forward_insert_index += 1 + continue + insert_index = forward_insert_index + elif op_role == int(self._op_role.Backward): + if i == backward_insert_index: + backward_insert_index += 1 + continue + insert_index = backward_insert_index + else: + raise ValueError(f"Unknown op_role: {op_role}") + op_inputs = {} + for name in op.input_names: + op_inputs[name] = op.input(name) + op_outputs = {} + for name in op.output_names: + op_outputs[name] = op.output(name) + block._insert_op_without_sync( + index=insert_index, + type=op.type, + inputs=op_inputs, + outputs=op_outputs, + attrs=op.all_attrs(), + ) + block._remove_op(i + 1) + if op_role == int(self._op_role.Forward): + forward_insert_index += 1 + elif op_role == int(self._op_role.Backward): + backward_insert_index += 1 + block._sync_with_cpp() + + def _check_pipeline_persist_var(self, program): + """ + Pipeline may need multiple forward before + """ + block = program.global_block() + + persist_output = set() + used_in_backward = set() + for op in 
block.ops: + if self._is_forward_op(op): + for var_name in op.output_arg_names: + var = block.vars[var_name] + if var.persistable: + persist_output.add(var_name) + elif self._is_backward_op(op): + for var_name in op.input_arg_names: + if var_name in persist_output: + used_in_backward.add(var_name) + if len(used_in_backward) == 0: + return + warnings.warn( + "The pipeline requires multiple forward calculations before backward, " + "so when the persistable var is changed in the forward, it may cause " + "errors in the backward calculation who using this persistable var. " + "However, some backward op don't need this var(NoNeedBufferVars), " + "there will be no error at this time.\n" + "So please check these persistable vars which changed in " + "forward and used in backward:\n{}".format(used_in_backward) + ) + + def minimize( + self, loss, startup_program=None, parameter_list=None, no_grad_set=None + ): + main_block = loss.block + self.origin_main_block = main_block + main_program = main_block.program + if startup_program is None: + startup_program = default_startup_program() + + pipeline_opt = main_program._pipeline_opt + assert pipeline_opt, 'Please use pipeline with fleet.' + required_keys = [ + 'local_rank', + 'schedule_mode', + 'micro_batch_size', + 'ring_id', + 'global_ring_id', + 'use_sharding', + 'mp_degree', + 'mp_rank', + ] + for key in required_keys: + assert ( + key in pipeline_opt + ), f'Please use pipeline with fleet to use {key}.' + self.local_rank = pipeline_opt['local_rank'] + self.schedule_mode = pipeline_opt['schedule_mode'] + self.micro_batch_size = pipeline_opt['micro_batch_size'] + self.use_sharding = pipeline_opt['use_sharding'] + self.ring_id = pipeline_opt['ring_id'] + self.global_ring_id = pipeline_opt['global_ring_id'] + self.mp_degree = pipeline_opt['mp_degree'] + self.mp_rank = pipeline_opt['mp_rank'] + self.scale_gradient = pipeline_opt.get('scale_gradient', False) + assert self.mp_degree >= 1 + assert 0 <= self.mp_rank < self.mp_degree + + optimize_ops, params_grads = self._optimizer.minimize( + loss, startup_program, parameter_list, no_grad_set + ) + self._param_device_map = self._origin_optimizer._param_device_map + + ( + self.output_var_to_op, + self.input_var_to_op, + ) = self._get_input_output_info(main_block) + # Step1: add default op_device attribute for ops. + self._add_op_device_attr(main_block) + device_list = self._check_validation(main_block) + + def device_cmp(device1, device2): + dev1_id = int(device1.split(':')[1]) + dev2_id = int(device2.split(':')[1]) + if dev1_id < dev2_id: + return -1 + elif dev1_id > dev2_id: + return 1 + else: + return 0 + + sorted_device_list = sorted(device_list, key=cmp_to_key(device_cmp)) + assert sorted_device_list == device_list, ( + "With pipeline parallelism, you must use gpu devices one after " + "another in the order of their ids." + ) + # Step2: add send and recv ops between section boundaries + self._insert_sendrecv_ops_for_boundaries(main_block) + + # Step3: split program into sections and add pairs of + # send and recv ops for data var. + main_program = main_block.program + program_list = self._split_program(main_program, device_list) + for p in program_list: + self._create_vars(p.global_block(), main_block) + + if os.getenv("PADDLE_MANUAL_PIPELINE_STAGE", None): + self.local_rank = int(os.getenv("PADDLE_MANUAL_PIPELINE_STAGE")) + assert self.local_rank < len(device_list), ( + "Manually specified " + "pipeline stage must be less than total number of pipeline " + "stages." 
+ ) + else: + self.local_rank %= len(device_list) + # Step3.5: optimize forward send sync_comm to overlap send and recv + self._optimize_forward_send_sync(program_list[self.local_rank]) + + # Step4: Special Case: process persistable vars that exist in + # multiple sections + # FIXME + # self._process_persistable_vars_in_multi_sections( + # main_program, startup_program, program_list) + + # Step5: Add sub blocks for section programs + self._add_sub_blocks(main_block, program_list) + + place_list = [] + for dev in device_list: + dev_index = int(dev.split(":")[1]) + if core.is_compiled_with_cuda(): + place_list.append(core.CUDAPlace(dev_index % 1)) + + # Step6: Split startup program + new_startup_program = self._split_startup_program( + startup_program, self.local_rank + ) + + startup_program._pipeline_opt = { + "startup_program": new_startup_program, + } + real_block = program_list[self.local_rank].global_block() + if not self.scale_gradient: + self._insert_loss_scale(real_block) + if not self.use_sharding: + # Step7: clear gradients before each mini-batch and + # accumulate gradients during backward + self._rename_gradient_var_name(real_block) + real_block._sync_with_cpp() + self._accumulate_gradients(real_block) + real_block._sync_with_cpp() + + if core.is_compiled_with_cuda(): + place_id = int(os.getenv("FLAGS_selected_gpus", "0")) + # A pass to move the recv op to the beginning of + # the forward/backward phase + self._mv_head_recv(program_list[self.local_rank]) + + # A pass to check pipeline persist var which changed in + # forward and used in backward + self._check_pipeline_persist_var(program_list[self.local_rank]) + + main_program._pipeline_opt = { + "trainer": "PipelineTrainer", + "device_worker": "Section", + "pipeline_stage": self.local_rank, + "num_pipeline_stages": len(device_list), + "schedule_mode": self.schedule_mode, + "inner_parallelism": len(device_list), + "section_program": program_list[self.local_rank], + "place": place_list[self.local_rank], + "place_id": place_id, + "sync_steps": -1, + "num_microbatches": self._num_microbatches, + "start_cpu_core_id": self._start_cpu_core_id, + } + return ( + optimize_ops, + params_grads, + program_list, + self._pipeline_pair, + self._pp_ring_map, + ) diff --git a/python/paddle/incubate/optimizer/recompute.py b/python/paddle/incubate/optimizer/recompute.py new file mode 100644 index 0000000000000000000000000000000000000000..dde8d723e0d1e15d06bdd6f21aabad10d899b8c2 --- /dev/null +++ b/python/paddle/incubate/optimizer/recompute.py @@ -0,0 +1,816 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+import logging
+
+import paddle
+from paddle.fluid import core, framework, unique_name
+from paddle.fluid.backward import append_backward
+from paddle.fluid.framework import Variable, in_dygraph_mode, program_guard
+from paddle.optimizer import Optimizer
+
+
+class RecomputeOptimizer(Optimizer):
+ """
+ :api_attr: Static Graph
+
+ Recompute Optimizer Wrapper
+
+ Normally, a training step contains three sub-steps: first, run forward
+ Operators to calculate the loss; second, run backward Operators to
+ calculate gradients of the parameters; third, apply the optimization method
+ to update the values of the parameters.
+
+ In the forward computation process, all variables that are needed by
+ the backward computation process will be kept in memory, which occupies a great
+ amount of memory when the network becomes very deep.
+
+ Recompute splits the network into k segments. In each segment, it will
+ recompute the forward Operators before running the backward Operators. It is
+ very helpful for saving memory.
+
+ The Variables that separate a network into segments are called checkpoints,
+ and users should set them manually. The usage is very simple:
+
+ Args:
+ optimizer (Optimizer): The optimizer that is applied to parameters.
+
+ Examples:
+ .. code-block:: python
+
+ import paddle
+ import paddle.fluid as fluid
+ import numpy as np
+
+ paddle.enable_static()
+
+ def gen_data():
+ return {"x": np.random.random(size=(32, 32)).astype('float32'),
+ "y": np.random.randint(2, size=(32, 1)).astype('int64')}
+ def mlp(input_x, input_y, hid_dim=128, label_dim=2):
+ print(input_x)
+ fc_1 = paddle.static.nn.fc(x=input_x, size=hid_dim)
+ prediction = paddle.static.nn.fc(x=[fc_1], size=label_dim, activation='softmax')
+ cost = paddle.nn.functional.cross_entropy(
+ input=prediction, label=input_y,
+ reduction='none', use_softmax=False
+ )
+ sum_cost = paddle.mean(cost)
+ return sum_cost, fc_1, prediction
+ input_x = paddle.static.data(name="x", shape=[-1,32], dtype='float32')
+ input_y = paddle.static.data(name="y", shape=[-1,1], dtype='int64')
+ cost, fc_1, pred = mlp(input_x, input_y)
+
+ sgd = paddle.optimizer.Adam(learning_rate=0.01)
+ sgd = paddle.incubate.optimizer.RecomputeOptimizer(sgd)
+ sgd._set_checkpoints([fc_1, pred])
+ sgd.minimize(cost)
+
+ print("Finished optimize")
+ place = fluid.CPUPlace()
+ exe = fluid.Executor(place)
+ exe.run(fluid.default_startup_program())
+ step = 10
+
+ for i in range(step):
+ cost_val = exe.run(feed=gen_data(),
+ program=fluid.default_main_program(),
+ fetch_list=[cost.name])
+ print("step=%d cost=%f" % (i, cost_val[0]))
+
+ """
+
+ def __init__(self, optimizer):
+ if in_dygraph_mode():
+ raise Exception("In dygraph, don't support RecomputeOptimizer.")
+ self._optimizer = optimizer
+ self._checkpoints = None
+ self._learning_rate = self._optimizer._learning_rate
+ self._learning_rate_map = self._optimizer._learning_rate_map
+ self.enable_offload = False
+
+ def _set_checkpoints(self, checkpoints):
+ """
+ Args:
+ checkpoints (list): List of Variable or string
+ """
+ assert isinstance(
+ checkpoints, list
+ ), "_checkpoints should be a list of Variable or a list of String"
+ for ckpt in checkpoints:
+ assert isinstance(ckpt, str) or isinstance(
+ ckpt, Variable
+ ), "_checkpoints should be a list of Variable or a list of String"
+ self._checkpoints = checkpoints
+
+ # should enable offload before calling backward
+ def _enable_offload(self):
+ self.enable_offload = True
+
+ @framework.deprecate_stat_dict
+ def load(self, state_dict):
+ """
+ :api_attr: Static Graph
+
+ load
function is not supported by Recompute Optimizer for now. + :return: None + + Args: + state_dict: the dict load by load_persistable method + + Examples: + .. code-block:: python + + import paddle + import paddle.fluid as fluid + + paddle.enable_static() + def mlp(input_x, input_y, hid_dim=128, label_dim=2): + fc_1 = paddle.static.nn.fc(x=input_x, size=hid_dim) + prediction = paddle.static.nn.fc(x=[fc_1], size=label_dim, activation='softmax') + cost = paddle.nn.functional.cross_entropy( + input=prediction, label=input_y, + reduction='none', use_softmax=False + ) + sum_cost = paddle.mean(cost) + return sum_cost, fc_1, prediction + + input_x = paddle.static.data(name="x", shape=[-1,32], dtype='float32') + input_y = paddle.static.data(name="y", shape=[-1,1], dtype='int64') + cost, fc_1, pred = mlp(input_x, input_y) + print("Finished FF") + + sgd = paddle.optimizer.Adam(learning_rate=0.01) + sgd = paddle.incubate.optimizer.RecomputeOptimizer(sgd) + sgd._set_checkpoints([fc_1, pred]) + try: + state_dict = {} + sgd.load(state_dict) + except NotImplementedError as e: + print(e) + """ + raise NotImplementedError( + "load function is not supported by Recompute Optimizer for now" + ) + + def apply_gradients(self, params_grads): + """ + call apply_gradients function of self._optimizer. + + Args: + params_grads (list): list of (param, grad) pair to do optimization. + + Returns: + list: A list of operators appended to the current program. + + Examples: + .. code-block:: python + + import paddle + import paddle.fluid as fluid + import paddle.fluid.framework as framework + + paddle.enable_static() + + def mlp(input_x, input_y, hid_dim=128, label_dim=2): + fc_1 = paddle.static.nn.fc(x=input_x, size=hid_dim) + prediction = paddle.static.nn.fc(x=[fc_1], size=label_dim, activation='softmax') + cost = paddle.nn.functional.cross_entropy( + input=prediction, label=input_y, + reduction='none', use_softmax=False + ) + sum_cost = paddle.mean(cost) + return sum_cost, fc_1, prediction + + + input_x = paddle.static.data(name="x", shape=[-1,32], dtype='float32') + input_y = paddle.static.data(name="y", shape=[-1,1], dtype='int64') + cost, fc_1, pred = mlp(input_x, input_y) + print("Finished FF") + + sgd = paddle.optimizer.Adam(learning_rate=0.01) + sgd = paddle.incubate.optimizer.RecomputeOptimizer(sgd) + sgd._set_checkpoints([fc_1, pred]) + params_grads = sgd.backward( + cost, + startup_program=None, + parameter_list=None, + no_grad_set=None) + + program = cost.block.program + with framework.program_guard(program, None): + optimize_ops = sgd.apply_gradients(params_grads) + + print("Finished apply gradients") + """ + + return self._optimizer.apply_gradients(params_grads=params_grads) + + def _creat_vars(self, varname): + pinned_var_name = unique_name.generate(varname + "@Pinned") + fetched_var_name = unique_name.generate(varname + "@Fetch") + + pinned_var = self._main_program.global_block().create_var( + name=pinned_var_name, + shape=self.checkpoint_shape, + dtype=self._main_program.global_block().var(varname).dtype, + persistable=False, + stop_gradient=True, + ) + + fetch_var = self._main_program.global_block().create_var( + name=fetched_var_name, + shape=self.checkpoint_shape, + dtype=self._main_program.global_block().var(varname).dtype, + persistable=False, + stop_gradient=False, + ) + + return pinned_var_name, fetched_var_name + + def _append_fill_constant_ops(self, startup_program): + """ + add fill_constant_ops to the end of the prog + + we should fill the pinned vars before runing the main_prog + to instantiate 
their tensor hold_, which could tell us whether + the host memory could hold all the checkpoints from all the + GPU devices in this node. + """ + op_role = 0 + block = startup_program.global_block() + fill_constant_vars = self.checkpoint_name2pinned_name.values() + OP_ROLE_KEY = core.op_proto_and_checker_maker.kOpRoleAttrName() + for varname in fill_constant_vars: + var = self._main_program.global_block().var(varname) + # NOTE (JZ-LIANG) to pre-allocate the CUDAPinned MEM + pinned_var = block.create_var( + name=varname, + shape=self.checkpoint_shape, + dtype=self._main_program.global_block().var(var.name).dtype, + persistable=False, + stop_gradient=True, + ) + block.append_op( + type='fill_constant', + outputs={'Out': varname}, + attrs={ + "shape": var.shape, + "dtype": var.dtype, + "value": 0.0, + "place_type": 2, + OP_ROLE_KEY: op_role, + }, + ) + + return + + def _insert_async_memcpy_op( + self, insert_idx, src_varname, dst_varname, op_role, dst_place_type + ): + OP_ROLE_KEY = core.op_proto_and_checker_maker.kOpRoleAttrName() + self.block._insert_op_without_sync( + insert_idx, + type='memcpy', + inputs={'X': [self._main_program.global_block().var(src_varname)]}, + outputs={ + 'Out': [self._main_program.global_block().var(dst_varname)] + }, + attrs={"dst_place_type": int(dst_place_type), OP_ROLE_KEY: op_role}, + ) + + def _insert_fetch_op(self, idx, varname): + assert ( + varname in self.checkpoint_name2pinned_name + ), "Try to fetch {} from Pinned Memory, but it is NOT a checkpoint".format( + varname + ) + + pinned_varname = self.checkpoint_name2pinned_name[varname] + fetch_varname = self.checkpoint_name2fetch_name[varname] + self._insert_async_memcpy_op(idx, pinned_varname, fetch_varname, 1, 1) + + def _insert_offload_op(self, idx, varname): + assert ( + varname in self.checkpoint_name2pinned_name + ), "Try to offload {} to Pinned Memory, but it is NOT a checkpoint".format( + varname + ) + pinned_varname = self.checkpoint_name2pinned_name[varname] + self._insert_async_memcpy_op(idx, varname, pinned_varname, 0, 2) + + def _insert_sync_op(self, op_idx, checkpoint_name): + # single stream offload no need sync + pass + + def _record_fetch_op(self, idx): + assert ( + len(self.un_fetch_checkpoint_names) > 0 + ), "Could NOT found checkpoint to fetch" + checkpoint_name = self.un_fetch_checkpoint_names.pop(-1) + logging.debug(f"Record fetch [{checkpoint_name}]") + self.idx2insertions[idx] = ("fetch", checkpoint_name) + + return checkpoint_name + + def _record_offload_op(self, idx, checkpoint_name): + expected_checkpoint_name = self.un_offload_checkpoint_names.pop(0) + assert ( + checkpoint_name == expected_checkpoint_name + ), "expected to offload [{}] but got [{}]".format( + expected_checkpoint_name, checkpoint_name + ) + logging.debug(f"Record offload [{checkpoint_name}]") + self.idx2insertions[idx] = ("offload", checkpoint_name) + + def _record_sync_op(self, idx, checkpoint_name): + assert ( + checkpoint_name not in self.synced_checkpoints + ), f"Try to sync the checkpoint [{checkpoint_name}] twice" + self.synced_checkpoints.add(checkpoint_name) + logging.debug(f"Record offload sync [{checkpoint_name}]") + self.idx2insertions[idx] = ("sync", checkpoint_name) + + def _parse_backward(self): + self.idx2insertions = {} + # don't offload the last checkpoints, to favor throughput + self.un_fetch_checkpoint_names = self.sorted_checkpoint_names[:] + self.un_fetch_checkpoint_names.pop(-1) + need_fetch_checkpoint_names = self.un_fetch_checkpoint_names[:] + self.checkpoint_usage_count = {} + for 
checkpoint_name in self.un_fetch_checkpoint_names:
+ self.checkpoint_usage_count[checkpoint_name] = 0
+
+ self.bw_strart_op_idx = len(self.block.ops)
+ for idx, op in enumerate(self.block.ops):
+ if int(op.desc.attr("op_role")) == 1:
+ self.bw_strart_op_idx = idx
+ break
+
+ assert self.bw_strart_op_idx < len(
+ self.block.ops
+ ), "Could NOT find backward op in prog"
+
+ # fetch the second-to-last checkpoint at the beginning of BW
+ fetched_checkpoint_varname = self._record_fetch_op(
+ self.bw_strart_op_idx
+ )
+ last_last_fetch_checkpoint = None
+
+ for i, op in enumerate(self.block.ops[self.bw_strart_op_idx :]):
+ idx = self.bw_strart_op_idx + i
+ input_vars = op.desc.input_arg_names()
+
+ for input_var in input_vars:
+ if input_var in need_fetch_checkpoint_names:
+ if input_var not in self.un_fetch_checkpoint_names:
+ # fetch the offloaded checkpoint at the first usage of its previous one
+ if self.checkpoint_usage_count[input_var] == 0:
+ # TODO (JZ-LIANG) sync memcpy_stream if extra stream for memcpy
+ second_to_last_fetch_checkpoint = (
+ fetched_checkpoint_varname
+ )
+ # there is NO fetch ahead of the first checkpoint
+ if input_var != self.sorted_checkpoint_names[0]:
+ fetched_checkpoint_varname = (
+ self._record_fetch_op(idx)
+ )
+
+ # check that the currently used checkpoint is the last fetched one
+ assert (
+ second_to_last_fetch_checkpoint == input_var
+ ), "Current recompute segment should use [{}] BUT got [{}]".format(
+ second_to_last_fetch_checkpoint, input_var
+ )
+ # rename
+ self.block.ops[idx]._rename_input(
+ input_var,
+ self.checkpoint_name2fetch_name[input_var],
+ )
+ self.checkpoint_usage_count[input_var] += 1
+ else:
+ raise ValueError(
+ "use checkpoint [{}] before fetch in BW".format(
+ input_var
+ )
+ )
+
+ assert (
+ len(self.un_fetch_checkpoint_names) == 0
+ ), "{} checkpoints have NOT been Recorded".format(
+ self.un_fetch_checkpoint_names
+ )
+
+ def _update_backward(self):
+ if len(self.idx2insertions) == 0:
+ return
+ total_op = len(self.block.ops)
+ for op_idx in reversed(range(self.bw_strart_op_idx, total_op)):
+ if op_idx in self.idx2insertions:
+ operation, checkpoint_name = self.idx2insertions[op_idx]
+ if operation == "fetch":
+ self._insert_fetch_op(op_idx, checkpoint_name)
+ logging.debug(f"Insert [{checkpoint_name}] fetch op.")
+ del self.idx2insertions[op_idx]
+ elif operation == "sync":
+ self._insert_sync_op(op_idx, checkpoint_name)
+ logging.debug(f"Sync [{checkpoint_name}] fetch op.")
+ self.block._sync_with_cpp()
+ assert (
+ len(self.idx2insertions) == 0
+ ), "{} checkpoints left un-Fetched".format(
+ [ele[1] for ele in self.idx2insertions.values()]
+ )
+
+ def _parse_forward(self):
+ self.idx2insertions = {}
+ # don't offload the last checkpoint: faster, but less memory saving
+ self.un_offload_checkpoint_names = self.sorted_checkpoint_names[:]
+ last_checkpoint = self.un_offload_checkpoint_names.pop(-1)
+ need_offload_checkpoint_names = self.un_offload_checkpoint_names[:]
+ self.checkpoint_usage_count_and_idx = {}
+ for checkpoint_name in self.un_offload_checkpoint_names:
+ self.checkpoint_usage_count_and_idx[checkpoint_name] = {
+ 'count': 0,
+ 'idx': -1,
+ }
+ self.synced_checkpoints = set()
+ self.fw_strart_op_idx = len(self.block.ops)
+ for idx, op in enumerate(self.block.ops):
+ if int(op.desc.attr("op_role")) == 0:
+ self.fw_strart_op_idx = idx
+ break
+
+ assert self.fw_strart_op_idx < len(
+ self.block.ops
+ ), "Could NOT find Forward op in prog"
+ last_offload_checkpoint = None
+
+ for i, op in enumerate(
self.block.ops[self.fw_strart_op_idx : self.bw_strart_op_idx] + ): + idx = self.fw_strart_op_idx + i + output_vars = op.desc.output_arg_names() + input_vars = op.desc.input_arg_names() + + for output_var in output_vars: + if output_var in need_offload_checkpoint_names: + assert ( + len(output_vars) == 1 + ), "chekpoint should be the only Output of a certain op, but [{}] is from [{}]".format( + output_var, op + ) + + if output_var in self.un_offload_checkpoint_names: + # insert sync op if last checkpoint has not been sync + if last_offload_checkpoint is not None: + if ( + self.checkpoint_usage_count_and_idx[ + last_offload_checkpoint + ]['count'] + == 0 + ): + self._record_sync_op( + idx, last_offload_checkpoint + ) + else: + last_usage_idx = ( + self.checkpoint_usage_count_and_idx[ + last_offload_checkpoint + ]['idx'] + ) + assert ( + last_usage_idx > 0 + ), "last_usage_idx of checkpoint [{}] should large than 0".format( + last_offload_checkpoint + ) + self._record_sync_op( + last_usage_idx + 1, last_offload_checkpoint + ) + # insert offload op after the checkpoint's generation op + self._record_offload_op(idx + 1, output_var) + last_offload_checkpoint = output_var + else: + raise ValueError( + "There should be just ONE op that output checkpoint [{}]".format( + output_var + ) + ) + # need to sync the last need to offload checkpoint before the last checkpoint as output op + if output_var == last_checkpoint: + assert ( + len(output_vars) == 1 + ), "chekpoint should be the only Output of a certain op, but [{}] is from [{}]".format( + output_var, op + ) + assert ( + last_offload_checkpoint + == self.sorted_checkpoint_names[-2] + ), "the last offload chekpoint before [{}] is suppose to be [{}], but got [{}]".format( + last_checkpoint, + self.sorted_checkpoint_names[-2], + last_offload_checkpoint, + ) + # sync if last checkpoint has not been sync + if ( + self.checkpoint_usage_count_and_idx[ + last_offload_checkpoint + ]['idx'] + == 0 + ): + self._record_sync_op(idx, last_offload_checkpoint) + else: + last_usage_idx = self.checkpoint_usage_count_and_idx[ + last_offload_checkpoint + ]['idx'] + assert ( + last_usage_idx > 0 + ), "last_usage_idx of checkpoint [{}] should large than 0".format( + last_offload_checkpoint + ) + self._record_sync_op( + last_usage_idx + 1, last_offload_checkpoint + ) + # record checkpoint usage + for input_var in input_vars: + if input_var in need_offload_checkpoint_names: + assert ( + input_var not in self.synced_checkpoints + ), f"checkpoint [{input_var}] used after sync" + self.checkpoint_usage_count_and_idx[input_var]['count'] += 1 + self.checkpoint_usage_count_and_idx[input_var]['idx'] = idx + + assert ( + len(self.un_offload_checkpoint_names) == 0 + ), "{} checkpoints have NOT been Recorded".format( + self.un_fetch_checkpoint_names + ) + assert len(self.synced_checkpoints) == len( + need_offload_checkpoint_names + ), "{} checkpoints have NOT been Recorded".format( + set(need_offload_checkpoint_names) - set(self.synced_checkpoints) + ) + + def _update_forward(self): + if len(self.idx2insertions) == 0: + return + for op_idx in reversed( + range(self.fw_strart_op_idx, self.bw_strart_op_idx) + ): + if op_idx in self.idx2insertions: + operation, checkpoint_name = self.idx2insertions[op_idx] + if operation == "offload": + self._insert_offload_op(op_idx, checkpoint_name) + logging.debug(f"Insert [{checkpoint_name}] offload op.") + del self.idx2insertions[op_idx] + elif operation == "sync": + self._insert_sync_op(op_idx, checkpoint_name) + logging.debug( + f"Insert 
[{checkpoint_name}] offload_sync op."
+ )
+ del self.idx2insertions[op_idx]
+
+ self.block._sync_with_cpp()
+ assert (
+ len(self.idx2insertions) == 0
+ ), "{} checkpoints left un-Offloaded".format(
+ [ele[1] for ele in self.idx2insertions.values()]
+ )
+
+ def _check_offload_fetch(self):
+ # TODO(JZ-LIANG) the single stream offload needs no sync
+ pass
+
+ def _offload(self, loss, startup_program=None):
+ """
+ core steps for recompute offload
+ 1. create pinned vars and temp vars
+ 2. parse & update the backward pass: rename, fetch, sync
+ 3. parse & update the forward pass: offload, sync
+ 4. verify the correctness
+ """
+ self._main_program = loss.block.program
+ self.block = loss.block
+ if startup_program is None:
+ startup_program = paddle.static.default_startup_program()
+
+ with program_guard(self._main_program, startup_program):
+ assert (
+ len(self.checkpoint_shape) > 0
+ ), "checkpoints shape {} should be a non-empty list like: [12, 512, 1024]".format(
+ self.checkpoint_shape
+ )
+ assert all(
+ ele > 0 for ele in self.checkpoint_shape
+ ), "all elements in checkpoints shape {} should be integers larger than 0".format(
+ self.checkpoint_shape
+ )
+ self.checkpoint_name2pinned_name = {}
+ self.checkpoint_name2fetch_name = {}
+ for checkpoint_varname in self.sorted_checkpoint_names:
+ pinned_var_name, fetch_var_name = self._creat_vars(
+ checkpoint_varname
+ )
+ self.checkpoint_name2pinned_name[
+ checkpoint_varname
+ ] = pinned_var_name
+ self.checkpoint_name2fetch_name[
+ checkpoint_varname
+ ] = fetch_var_name
+ self._append_fill_constant_ops(startup_program)
+ # TODO (JZ-LIANG) to provide two offload strategies in future
+ # step 2. parse & update BW: rename, fetch, sync
+ self._parse_backward()
+ self._update_backward()
+ # step 3. parse & update FW: offload, sync
+ self._parse_forward()
+ self._update_forward()
+ # step 4. verify the correctness
+ self._check_offload_fetch()
+
+ return
+
+ def backward(
+ self,
+ loss,
+ startup_program=None,
+ parameter_list=None,
+ no_grad_set=None,
+ callbacks=None,
+ ):
+ """
+ call append_backward with checkpoints.
+
+ Args:
+ loss (Variable): loss variable to run optimizations.
+ startup_program (Program): startup_program for initializing parameters
+ in `parameter_list`.
+ parameter_list (list): list of Variables or Variable.names to update.
+ no_grad_set (set|None): set of Variables or Variables.names that should be ignored.
+ callbacks (list|None): list of callables to run when appending backward
+ operator for one parameter.
+ checkpoints (list): list of Variables as checkpoints
+
+ Examples:
+ ..
code-block:: python + + import paddle + import paddle.fluid as fluid + + paddle.enable_static() + + def mlp(input_x, input_y, hid_dim=128, label_dim=2): + fc_1 = paddle.static.nn.fc(x=input_x, size=hid_dim) + prediction = paddle.static.nn.fc(x=[fc_1], size=label_dim, activation='softmax') + cost = paddle.nn.functional.cross_entropy( + input=prediction, label=input_y, + reduction='none', use_softmax=False + ) + sum_cost = paddle.mean(cost) + return sum_cost, fc_1, prediction + + + input_x = paddle.static.data(name="x", shape=[-1,32], dtype='float32') + input_y = paddle.static.data(name="y", shape=[-1,1], dtype='int64') + cost, fc_1, pred = mlp(input_x, input_y) + print("Finished FF") + + sgd = paddle.optimizer.Adam(learning_rate=0.01) + sgd = paddle.incubate.optimizer.RecomputeOptimizer(sgd) + sgd._set_checkpoints([fc_1, pred]) + params_grads = sgd.backward( + cost, + startup_program=None, + parameter_list=None, + no_grad_set=None) + print("Finished backward") + """ + assert ( + self._checkpoints is not None + ), "You should call _set_checkpoints first" + + if in_dygraph_mode(): + raise NotImplementedError( + "DyGraph current does not support recompute" + ) + + self._dtype = loss.dtype + program = loss.block.program + with program_guard(program, startup_program): + checkpoint_vars = [] + for ckpt in self._checkpoints: + if isinstance(ckpt, Variable): + checkpoint_vars.append(ckpt) + else: + checkpoint_vars.append(loss.block.var(ckpt)) + + # allow return to non-recompute when checkpoints is empty + if len(checkpoint_vars) > 0: + params_grads, sorted_checkpoint_names = append_backward( + loss, + parameter_list, + no_grad_set, + checkpoints=checkpoint_vars, + ) + else: + params_grads = append_backward( + loss, + parameter_list, + no_grad_set, + checkpoints=checkpoint_vars, + ) + + if self.enable_offload: + self.sorted_checkpoint_names = sorted_checkpoint_names + self._offload(loss, startup_program=startup_program) + + return params_grads + + def apply_optimize(self, loss, startup_program, params_grads): + """ + call the apply_optimize function of self._optimizer + Args: + loss (Variable): loss variable to run optimizations. + startup_program (Program): startup_program for initializing parameters + in `parameter_list`. + params_grads (list): list of (param, grad) pair to do optimization. + Examples: + .. 
code-block:: python + import paddle + import paddle.fluid as fluid + + paddle.enable_static() + + def mlp(input_x, input_y, hid_dim=128, label_dim=2): + fc_1 = paddle.static.nn.fc(x=input_x, size=hid_dim) + prediction = paddle.static.nn.fc(x=[fc_1], size=label_dim, activation='softmax') + cost = paddle.nn.functional.cross_entropy( + input=prediction, label=input_y, + reduction='none', use_softmax=False + ) + sum_cost = paddle.mean(cost) + return sum_cost, fc_1, prediction + + input_x = paddle.static.data(name="x", shape=[-1,32], dtype='float32') + input_y = paddle.static.data(name="y", shape=[-1,1], dtype='int64') + cost, fc_1, pred = mlp(input_x, input_y) + print("Finished FF") + + sgd = paddle.optimizer.Adam(learning_rate=0.01) + sgd = paddle.incubate.optimizer.RecomputeOptimizer(sgd) + sgd._set_checkpoints([fc_1, pred]) + params_grads = sgd.backward( + cost, + startup_program=None, + parameter_list=None, + no_grad_set=None) + + optimize_ops = sgd.apply_optimize( + cost, startup_program=None, params_grads=params_grads) + + print("Finished apply_optimize") + """ + + func = ( + self._optimizer.apply_optimize + if hasattr(self._optimizer, 'apply_optimize') + else self._optimizer._apply_optimize + ) + return func( + loss, startup_program=startup_program, params_grads=params_grads + ) + + def minimize( + self, loss, startup_program=None, parameter_list=None, no_grad_set=None + ): + assert isinstance(loss, Variable), "The loss should be an Variable." + assert ( + self._checkpoints is not None + ), "You should call _set_checkpoints first" + if in_dygraph_mode(): + raise NotImplementedError( + "DyGraph current does not support recompute" + ) + params_grads = self.backward( + loss, + startup_program=startup_program, + parameter_list=parameter_list, + no_grad_set=no_grad_set, + ) + + optimize_ops = self.apply_optimize( + loss, startup_program=startup_program, params_grads=params_grads + ) + + return optimize_ops, params_grads diff --git a/python/paddle/optimizer/adam.py b/python/paddle/optimizer/adam.py index f3e0ef1fba81a734e678a5b7eb1b93bec464e14b..c1a4019b76e3bdd5e302c9d582e77aaf70d220df 100644 --- a/python/paddle/optimizer/adam.py +++ b/python/paddle/optimizer/adam.py @@ -350,6 +350,13 @@ class Adam(Optimizer): "Beta1Pow": [beta1_pow_acc], "Beta2Pow": [beta2_pow_acc], } + + # Pass found_inf to adam, to skip update for not only param, but also momentum and beta_pow + found_inf = self._get_auxiliary_var('found_inf') + + if found_inf: + inputs['SkipUpdate'] = found_inf + outputs = { "ParamOut": [param_and_grad[0]], "Moment1Out": [moment1], diff --git a/python/paddle/static/__init__.py b/python/paddle/static/__init__.py index c66de06a009024b26202d2929ed9611b44764bb2..1a55745a81b5254fe57644e25ae653174dd60642 100644 --- a/python/paddle/static/__init__.py +++ b/python/paddle/static/__init__.py @@ -19,6 +19,7 @@ from . import amp # noqa: F401 from . 
import nn # noqa: F401 from .nn.common import py_func # noqa: F401 +from .nn.common import ExponentialMovingAverage # noqa: F401 from .io import save_inference_model # noqa: F401 from .io import load_inference_model # noqa: F401 @@ -70,8 +71,6 @@ from ..fluid.framework import set_ipu_shard # noqa: F401 from .nn.control_flow import Print # noqa: F401 from ..fluid.param_attr import WeightNormParamAttr # noqa: F401 from ..fluid.optimizer import Optimizer # noqa: F401 -from ..fluid.optimizer import Adam # noqa: F401 -from ..fluid.optimizer import ExponentialMovingAverage # noqa: F401 from ..fluid.layers import exponential_decay # noqa: F401 from ..fluid.layers import learning_rate_scheduler # noqa: F401 diff --git a/python/paddle/static/amp/decorator.py b/python/paddle/static/amp/decorator.py index 75e7f28955ed7686e7f18c07d11b65b511d25fad..9dfa612600958854942ac1dddb3f8055f3795edc 100644 --- a/python/paddle/static/amp/decorator.py +++ b/python/paddle/static/amp/decorator.py @@ -483,7 +483,7 @@ class OptimizerWithMixedPrecision: real_optimizer = real_optimizer.inner_opt if isinstance( real_optimizer, - (paddle.fluid.optimizer.Adam, paddle.optimizer.AdamW), + (paddle.optimizer.Adam, paddle.optimizer.AdamW), ): # NOTE(zhiqiu): Since found_inf needs to be on cpu in adam op, we # copy it in advance to avoid multiple time copies. diff --git a/python/paddle/static/nn/common.py b/python/paddle/static/nn/common.py index f06bfeed9e87e98b36d5eecec1198c815573350c..f04dc277e4e0da5a8f328d2a84df97c02bfd1731 100644 --- a/python/paddle/static/nn/common.py +++ b/python/paddle/static/nn/common.py @@ -24,11 +24,20 @@ from paddle.common_ops_import import ( check_type, check_variable_and_dtype, ) -from paddle.fluid import core +from paddle.fluid import core, layers, unique_name from paddle.fluid.data_feeder import check_dtype -from paddle.fluid.framework import Variable, in_dygraph_mode, static_only +from paddle.fluid.framework import ( + Program, + Variable, + default_main_program, + in_dygraph_mode, + name_scope, + program_guard, + static_only, +) from paddle.fluid.layers.layer_function_generator import templatedoc from paddle.fluid.param_attr import ParamAttr +from paddle.fluid.wrapped_decorator import signature_safe_contextmanager from paddle.nn.initializer import Constant, Normal __all__ = [] @@ -3999,3 +4008,259 @@ def sparse_embedding( }, ) return tmp + + +class ExponentialMovingAverage: + r""" + + Compute the moving average of parameters with exponential decay. + Given a parameter :math:`\\theta`, its exponential moving average (EMA) + will be + + .. math:: + + \text{EMA}_0 & = 0 + + \text{EMA}_t & = \text{decay} * \text{EMA}_{t-1} + (1 - \text{decay}) * \theta_t + + The average results calculated by **update()** method will be saved in + temporary variables which are created and maintained by the object, and can + be applied to parameters of current model by calling **apply()** method. And + the **restore()** method is used to restore the parameters. + + **Bias correction**. All EMAs are initialized to :math:`0` and hence they will be + zero biased, which can be corrected by divided by a factor + :math:`(1 - \text{decay}^t)` , i.e., the actual EMAs applied to parameters + when calling **apply()** method would be + + .. math:: + + \widehat{\text{EMA}}_t = \frac{\text{EMA}_t}{1 - \text{decay}^t} + + **Decay rate scheduling**. A large decay rate very close to 1 would result + in that the averages move very slowly. And a better strategy is to set a + relative smaller decay rate in the very beginning. 
The argument **thres_steps** + allows users to pass a Variable to schedule the decay rate, in this case, + the actual decay rate becomes + + .. math:: + + \min(\text{decay}, \frac{1 + \text{thres_steps}}{10 + \text{thres_steps}}) + + Usually **thres_steps** can be the global training steps. + + + Args: + decay (float, optional): The exponential decay rate, usually close to 1, such as 0.999, 0.9999, ... . Default 0.999. + thres_steps (Variable|None, optional): If not `None`, schedule the decay rate. Default None. + name (str|None, optional): For detailed information, please refer to :ref:`api_guide_Name`. Usually name is no need to set and None by default. + + + Examples: + + .. code-block:: python + + import numpy + import paddle + import paddle.static as static + from paddle.static import ExponentialMovingAverage + + paddle.enable_static() + + data = static.data(name='x', shape=[-1, 5], dtype='float32') + hidden = static.nn.fc(x=data, size=10) + cost = paddle.mean(hidden) + + test_program = static.default_main_program().clone(for_test=True) + optimizer = paddle.optimizer.Adam(learning_rate=0.001) + optimizer.minimize(cost) + + ema = ExponentialMovingAverage(0.999) + ema.update() + + place = paddle.CPUPlace() + exe = static.Executor(place) + exe.run(static.default_startup_program()) + + for pass_id in range(3): + for batch_id in range(6): + data = numpy.random.random(size=(10, 5)).astype('float32') + exe.run(program=static.default_main_program(), + feed={'x': data}, + fetch_list=[cost.name]) + + # usage 1 + with ema.apply(exe): + data = numpy.random.random(size=(10, 5)).astype('float32') + exe.run(program=test_program, + feed={'x': data}, + fetch_list=[hidden.name]) + + # usage 2 + with ema.apply(exe, need_restore=False): + data = numpy.random.random(size=(10, 5)).astype('float32') + exe.run(program=test_program, + feed={'x': data}, + fetch_list=[hidden.name]) + ema.restore(exe) + + """ + + def __init__(self, decay=0.999, thres_steps=None, name=None): + if in_dygraph_mode(): + raise Exception( + "In dygraph, don't support ExponentialMovingAverage." 
+ ) + self._decay = decay + self._thres_steps = thres_steps + self._name = name if name is not None else '' + self._decay_var = self._get_ema_decay() + + self._step_counter_name = "@EMA_STEP_COUNTER@" + self._params_tmps = [] + for param in default_main_program().global_block().all_parameters(): + if param.do_model_average: + tmp = param.block.create_var( + name=unique_name.generate( + ".".join([self._name + param.name, 'ema_tmp']) + ), + dtype=param.dtype, + persistable=False, + stop_gradient=True, + ) + self._params_tmps.append((param, tmp)) + + self._ema_vars = {} + for param, tmp in self._params_tmps: + with param.block.program._optimized_guard([param, tmp]), name_scope( + 'moving_average' + ): + self._ema_vars[param.name] = self._create_ema_vars(param) + + self.apply_program = Program() + block = self.apply_program.global_block() + with program_guard(main_program=self.apply_program): + decay_pow, global_step = self._get_decay_pow(block) + for param, tmp in self._params_tmps: + param = block._clone_variable(param) + tmp = block._clone_variable(tmp) + ema = block._clone_variable(self._ema_vars[param.name]) + paddle.assign(param, output=tmp) + # bias correction + param_val = paddle.static.nn.cond( + global_step > 0, + lambda: ema / (1.0 - decay_pow), + lambda: ema, + ) + paddle.assign(param_val, output=param) + self.restore_program = Program() + block = self.restore_program.global_block() + with program_guard(main_program=self.restore_program): + for param, tmp in self._params_tmps: + tmp = block._clone_variable(tmp) + param = block._clone_variable(param) + paddle.assign(tmp, output=param) + + def _get_ema_decay(self): + with default_main_program()._lr_schedule_guard(): + decay_var = paddle.static.create_global_var( + shape=[1], + value=self._decay, + dtype='float32', + persistable=True, + name="scheduled_ema_decay_rate", + ) + + if self._thres_steps is not None: + decay_t = (self._thres_steps + 1.0) / (self._thres_steps + 10.0) + decay_val = paddle.static.nn.cond( + decay_t < self._decay, + lambda: decay_t, + lambda: np.array([self._decay], dtype=np.float32), + ) + paddle.assign(decay_val, decay_var) + return decay_var + + def _get_decay_pow(self, block): + global_step = paddle.static.create_global_var( + name=self._step_counter_name, + shape=[1], + value=0, + dtype='int64', + persistable=True, + ) + global_step = paddle.cast(global_step, "float32") + decay_var = block._clone_variable(self._decay_var) + decay_pow_acc = paddle.pow(decay_var, global_step) + return decay_pow_acc, global_step + + def _create_ema_vars(self, param): + param_ema = paddle.static.create_global_var( + name=unique_name.generate(self._name + param.name + '_ema'), + shape=param.shape, + value=0.0, + dtype=param.dtype, + persistable=True, + ) + + return param_ema + + def update(self): + """ + Update Exponential Moving Average. Should only call this method in + train program. 
+ """ + global_step = layers.autoincreased_step_counter( + counter_name=self._step_counter_name + ) + param_master_emas = [] + for param, tmp in self._params_tmps: + with param.block.program._optimized_guard([param, tmp]), name_scope( + 'moving_average' + ): + param_ema = self._ema_vars[param.name] + if param.name + '.master' in self._ema_vars: + master_ema = self._ema_vars[param.name + '.master'] + param_master_emas.append([param_ema, master_ema]) + else: + ema_t = param_ema * self._decay_var + param * ( + 1 - self._decay_var + ) + paddle.assign(ema_t, output=param_ema) + + # for fp16 params + for param_ema, master_ema in param_master_emas: + default_main_program().global_block().append_op( + type="cast", + inputs={"X": master_ema}, + outputs={"Out": param_ema}, + attrs={ + "in_dtype": master_ema.dtype, + "out_dtype": param_ema.dtype, + }, + ) + + @signature_safe_contextmanager + def apply(self, executor, need_restore=True): + """ + Apply moving average to parameters for evaluation. + + Args: + executor (Executor): The Executor to execute applying. + need_restore (bool, optional): Whether to restore parameters after + applying. Default True. + """ + executor.run(self.apply_program) + try: + yield + finally: + if need_restore: + self.restore(executor) + + def restore(self, executor): + """Restore parameters. + + Args: + executor (Executor): The Executor to execute restoring. + """ + executor.run(self.restore_program) diff --git a/test/asp/asp_pruning_base.py b/test/asp/asp_pruning_base.py index 6b3e92ad80fd6bf82d532d70c7dd938aff467db3..c6d7ff440cc118feb606428032beaf3c9ad0a2c1 100644 --- a/test/asp/asp_pruning_base.py +++ b/test/asp/asp_pruning_base.py @@ -72,7 +72,7 @@ class TestASPHelperPruningBase(unittest.TestCase): ) ) optimizer = paddle.incubate.asp.decorate( - fluid.optimizer.SGD(learning_rate=0.01) + paddle.optimizer.SGD(learning_rate=0.01) ) optimizer.minimize(loss, self.startup_program) diff --git a/test/asp/test_asp_customized_pruning.py b/test/asp/test_asp_customized_pruning.py index 846a17bb2c647b25dff17092f2accbc269860689..c63d12449c03d62cbc60a48900f776d6112c00a8 100644 --- a/test/asp/test_asp_customized_pruning.py +++ b/test/asp/test_asp_customized_pruning.py @@ -275,7 +275,7 @@ class TestASPStaticCustomerizedPruneFunc(unittest.TestCase): ) ) optimizer = sparsity.decorate( - fluid.optimizer.SGD(learning_rate=0.01) + paddle.optimizer.SGD(learning_rate=0.01) ) optimizer.minimize(loss, self.startup_program) diff --git a/test/asp/test_asp_optimize_static.py b/test/asp/test_asp_optimize_static.py index 767c505bd883eed96414398b5260e4cd9d675925..863bbe4f0c04ed369026daef1416f8e27e4ca9b0 100644 --- a/test/asp/test_asp_optimize_static.py +++ b/test/asp/test_asp_optimize_static.py @@ -56,7 +56,7 @@ class TestASPStaticOptimize(unittest.TestCase): use_softmax=False, ) ) - self.optimizer = fluid.optimizer.SGD(learning_rate=0.01) + self.optimizer = paddle.optimizer.SGD(learning_rate=0.01) def test_get_not_ASP_relevant_vars(self): def check_params(params, params_from_asp): diff --git a/test/asp/test_asp_pruning_static.py b/test/asp/test_asp_pruning_static.py index f7c3ecab190a7a0bafaa52d3d136f784e2148692..3f87a2174eba85465072a155a7fda9be5810c348 100644 --- a/test/asp/test_asp_pruning_static.py +++ b/test/asp/test_asp_pruning_static.py @@ -77,7 +77,7 @@ class TestASPStaticPruningBase(unittest.TestCase): ) ) optimizer = paddle.incubate.asp.decorate( - fluid.optimizer.SGD(learning_rate=0.01) + paddle.optimizer.SGD(learning_rate=0.01) ) optimizer.minimize(loss, self.startup_program) diff --git 
a/test/asp/test_asp_save_load.py b/test/asp/test_asp_save_load.py index 523579d78bbfcfc318a4d8b4a684368c95aae84c..698aacbd9b40181f02e1a1b9fafc725d25f7a93a 100644 --- a/test/asp/test_asp_save_load.py +++ b/test/asp/test_asp_save_load.py @@ -153,7 +153,7 @@ class TestASPStaticOptimize(unittest.TestCase): use_softmax=False, ) ) - self.optimizer = fluid.optimizer.SGD(learning_rate=0.01) + self.optimizer = paddle.optimizer.SGD(learning_rate=0.01) self.optimizer = paddle.incubate.asp.decorate(self.optimizer) self.optimizer.minimize(self.loss, self.startup_program) diff --git a/test/asp/test_fleet_with_asp_sharding.py b/test/asp/test_fleet_with_asp_sharding.py index 8f7aaa3d4181f5643dc52f7fa03947a7de72eb5a..4dabb3549df3370508384a345690cf0d6ac48ef4 100644 --- a/test/asp/test_fleet_with_asp_sharding.py +++ b/test/asp/test_fleet_with_asp_sharding.py @@ -92,7 +92,7 @@ class TestFleetWithASPSharding(unittest.TestCase): ) with fluid.program_guard(train_prog, startup_prog): - optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) + optimizer = paddle.optimizer.SGD(learning_rate=0.01) optimizer = fleet.distributed_optimizer( optimizer, strategy=strategy ) diff --git a/test/asp/test_fleet_with_asp_static.py b/test/asp/test_fleet_with_asp_static.py index cfe0948c6ecbfcbeaadb7157ca3015d86636b595..c0763f309e7a43686b34b6cbede823a028ac9fb9 100644 --- a/test/asp/test_fleet_with_asp_static.py +++ b/test/asp/test_fleet_with_asp_static.py @@ -71,7 +71,7 @@ class TestFleetWithASPStatic(unittest.TestCase): ) with fluid.program_guard(train_prog, startup_prog): - optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) + optimizer = paddle.optimizer.SGD(learning_rate=0.01) optimizer = fleet.distributed_optimizer( optimizer, strategy=strategy ) diff --git a/test/auto_parallel/auto_parallel_relaunch_model.py b/test/auto_parallel/auto_parallel_relaunch_model.py index 6fa3bc9eaa1ff8c9de483471ef4ec35905ef1127..65fc730a1dab7697f614952ebbf8a8ee9650c150 100644 --- a/test/auto_parallel/auto_parallel_relaunch_model.py +++ b/test/auto_parallel/auto_parallel_relaunch_model.py @@ -132,7 +132,7 @@ def train(): train_program, start_program ) - optimizer = paddle.fluid.optimizer.AdamOptimizer( + optimizer = paddle.optimizer.Adam( learning_rate=0.00001, beta1=0.9, beta2=0.999, diff --git a/test/auto_parallel/auto_parallel_relaunch_with_gpt_planner.py b/test/auto_parallel/auto_parallel_relaunch_with_gpt_planner.py index 6f61cafbcd883041533daf916a15c65d1c5cdf7f..286a492884f699f8020156024d47752dbbae7416 100644 --- a/test/auto_parallel/auto_parallel_relaunch_with_gpt_planner.py +++ b/test/auto_parallel/auto_parallel_relaunch_with_gpt_planner.py @@ -118,7 +118,7 @@ def train(): vocab_size, ) - optimizer = paddle.fluid.optimizer.AdamOptimizer( + optimizer = paddle.optimizer.Adam( learning_rate=0.00001, beta1=0.9, beta2=0.999, diff --git a/test/auto_parallel/auto_parallel_relaunch_with_planner.py b/test/auto_parallel/auto_parallel_relaunch_with_planner.py index 4ad1dfb196581d29778b0e1259c8f65b661cbf9b..ad2cd04b695f7b2d9cb1525ccb9355e30ba3ee8d 100644 --- a/test/auto_parallel/auto_parallel_relaunch_with_planner.py +++ b/test/auto_parallel/auto_parallel_relaunch_with_planner.py @@ -38,7 +38,7 @@ def train(): train_program, start_program ) - optimizer = paddle.fluid.optimizer.AdamOptimizer( + optimizer = paddle.optimizer.Adam( learning_rate=0.00001, beta1=0.9, beta2=0.999, diff --git a/test/auto_parallel/engine_api_dp.py b/test/auto_parallel/engine_api_dp.py index 
053d9966e70a1c2fcb39b788944e0483eb2a9e5d..fd2dbef7560567c9416ac7e737bd95b2684249f9 100644 --- a/test/auto_parallel/engine_api_dp.py +++ b/test/auto_parallel/engine_api_dp.py @@ -94,7 +94,7 @@ def train(fetch): initializer_range=0.02, ) loss = paddle.nn.CrossEntropyLoss() - optimizer = paddle.fluid.optimizer.AdamOptimizer( + optimizer = paddle.optimizer.Adam( learning_rate=0.00001, beta1=0.9, beta2=0.999, diff --git a/test/auto_parallel/optimization_tuner_api.py b/test/auto_parallel/optimization_tuner_api.py index e103020612266b36bab66920120fa23d134e5337..2cf3326e9b220b3329d8105fb276bdf3ffb523db 100644 --- a/test/auto_parallel/optimization_tuner_api.py +++ b/test/auto_parallel/optimization_tuner_api.py @@ -75,7 +75,7 @@ def train(fetch): initializer_range=0.02, ) loss = paddle.nn.CrossEntropyLoss() - optimizer = paddle.fluid.optimizer.AdamOptimizer( + optimizer = paddle.optimizer.Adam( learning_rate=0.00001, beta1=0.9, beta2=0.999, diff --git a/test/auto_parallel/test_base_cost.py b/test/auto_parallel/test_base_cost.py index 62c695b9e1d2eecb6ffe30b6cc22fb3bd7155451..a3f9349e1c2a78ba5a89009e8da979da4c37d861 100644 --- a/test/auto_parallel/test_base_cost.py +++ b/test/auto_parallel/test_base_cost.py @@ -131,7 +131,7 @@ def get_prog(train_program, startup_program, dist_context, rank_id): ) fleet._user_defined_strategy = fleet.DistributedStrategy() - fleet.user_defined_optimizer = paddle.fluid.optimizer.AdamOptimizer() + fleet.user_defined_optimizer = paddle.optimizer.Adam() parallelizer = AutoParallelizer(fleet) parallelizer._dist_context = dist_context diff --git a/test/auto_parallel/test_parallel_tuner.py b/test/auto_parallel/test_parallel_tuner.py index 1c1ab3400970ad6868b87774a196d3c7b3992655..22f9b9e77117259f9c03a5f4ff2e70eed9edb500 100644 --- a/test/auto_parallel/test_parallel_tuner.py +++ b/test/auto_parallel/test_parallel_tuner.py @@ -111,7 +111,7 @@ def get_program_v3(): criterion = GPTPretrainingCriterion() loss = criterion(preds, labels, loss_mask) - optimizer = paddle.fluid.optimizer.AdamOptimizer( + optimizer = paddle.optimizer.Adam( learning_rate=0.00001, beta1=0.9, beta2=0.999, diff --git a/test/auto_parallel/test_parallel_tuner_full.py b/test/auto_parallel/test_parallel_tuner_full.py index 7065b61abf4dd104777805cf1eaa084e5f500582..47ac9304f98e932cabfdaaf32be6e02b7875d6d8 100644 --- a/test/auto_parallel/test_parallel_tuner_full.py +++ b/test/auto_parallel/test_parallel_tuner_full.py @@ -113,7 +113,7 @@ def get_program_v3(): criterion = GPTPretrainingCriterion() loss = criterion(preds, labels, loss_mask) - optimizer = paddle.fluid.optimizer.AdamOptimizer( + optimizer = paddle.optimizer.Adam( learning_rate=0.00001, beta1=0.9, beta2=0.999, diff --git a/test/auto_parallel/test_parallel_tuner_predict.py b/test/auto_parallel/test_parallel_tuner_predict.py index 9ea9f152814782f3cbf7b146f3bdcf43c42d074e..a26782c9f0f72f8959a2ab486cd5721a012c38bb 100644 --- a/test/auto_parallel/test_parallel_tuner_predict.py +++ b/test/auto_parallel/test_parallel_tuner_predict.py @@ -111,7 +111,7 @@ def get_program_v3(): criterion = GPTPretrainingCriterion() loss = criterion(preds, labels, loss_mask) - optimizer = paddle.fluid.optimizer.AdamOptimizer( + optimizer = paddle.optimizer.Adam( learning_rate=0.00001, beta1=0.9, beta2=0.999, diff --git a/test/book/test_fit_a_line.py b/test/book/test_fit_a_line.py index ae1a549f0d538e4189ad313d5f31b49c050572ed..0d1e77ce7fcb14872acfa3e881fbae22249fde25 100644 --- a/test/book/test_fit_a_line.py +++ b/test/book/test_fit_a_line.py @@ -75,7 +75,7 @@ def train(use_cuda, 
save_dirname, is_local, use_bf16, pure_bf16): avg_cost = paddle.mean(cost) lr = 5e-3 if use_bf16 else 1e-3 - sgd_optimizer = fluid.optimizer.SGD(learning_rate=lr) + sgd_optimizer = paddle.optimizer.SGD(learning_rate=lr) if use_bf16: sgd_optimizer = amp.bf16.decorate_bf16( diff --git a/test/book/test_image_classification.py b/test/book/test_image_classification.py index 18a250ae53c69a29fedcd0f57935f194dc537698..9396e15fa03a479db1d7ad43cbc2ca1323019404 100644 --- a/test/book/test_image_classification.py +++ b/test/book/test_image_classification.py @@ -132,7 +132,7 @@ def train(net_type, use_cuda, save_dirname, is_local): # Test program test_program = fluid.default_main_program().clone(for_test=True) - optimizer = fluid.optimizer.Adam(learning_rate=0.001) + optimizer = paddle.optimizer.Adam(learning_rate=0.001) optimizer.minimize(avg_cost) BATCH_SIZE = 128 diff --git a/test/book/test_recognize_digits.py b/test/book/test_recognize_digits.py index b1d99b3a28fe67cb32b63521616a9d7e630fa4c3..5a64bcd3ed5720bdf0cd226340df5504f03b9c48 100644 --- a/test/book/test_recognize_digits.py +++ b/test/book/test_recognize_digits.py @@ -96,7 +96,7 @@ def train( test_program = fluid.default_main_program().clone(for_test=True) - optimizer = fluid.optimizer.Adam(learning_rate=0.001) + optimizer = paddle.optimizer.Adam(learning_rate=0.001) optimizer.minimize(avg_loss) place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() diff --git a/test/book/test_recommender_system.py b/test/book/test_recommender_system.py index f6605a13149d7d1bd20630b337bf827629e89094..5087e8a0b370f10a7c6bb7f36920195c07edab68 100644 --- a/test/book/test_recommender_system.py +++ b/test/book/test_recommender_system.py @@ -27,7 +27,7 @@ import paddle from paddle import fluid from paddle.fluid import framework from paddle.fluid.executor import Executor -from paddle.fluid.optimizer import SGDOptimizer +from paddle.optimizer import SGD paddle.enable_static() @@ -188,7 +188,7 @@ def train(use_cuda, save_dirname, is_local=True): # test program test_program = fluid.default_main_program().clone(for_test=True) - sgd_optimizer = SGDOptimizer(learning_rate=0.2) + sgd_optimizer = SGD(learning_rate=0.2) sgd_optimizer.minimize(avg_cost) place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() diff --git a/test/book/test_word2vec_book.py b/test/book/test_word2vec_book.py index 0c59f005a228737a8fee05c1128de56cde60d6a0..2a511191743d56f339e58beb79f9fdd1899a3a92 100644 --- a/test/book/test_word2vec_book.py +++ b/test/book/test_word2vec_book.py @@ -123,7 +123,7 @@ def train( else: raise NotImplementedError() - sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001) + sgd_optimizer = paddle.optimizer.SGD(learning_rate=0.001) if use_bf16: sgd_optimizer = paddle.static.amp.bf16.decorate_bf16( sgd_optimizer, diff --git a/test/collective/fleet/auto_parallel_parallelizer.py b/test/collective/fleet/auto_parallel_parallelizer.py index 5767d237a3f3c35bd473670074e9cf53b103599f..acf090033301198a8bc6704c40a288e95b8a1f4a 100755 --- a/test/collective/fleet/auto_parallel_parallelizer.py +++ b/test/collective/fleet/auto_parallel_parallelizer.py @@ -119,7 +119,7 @@ class TestMLPAutoParallelizer(unittest.TestCase): train_program, start_program ) - optimizer = paddle.fluid.optimizer.AdamOptimizer( + optimizer = paddle.optimizer.Adam( learning_rate=0.00001, beta1=0.9, beta2=0.999, diff --git a/test/collective/fleet/dist_mnist_gradient_merge.py b/test/collective/fleet/dist_mnist_gradient_merge.py index 
e1115bf6d7e34b88a0595b3f1508818cca73215d..deeb3b32de857377b5d5a39127285b0d01564024 100644 --- a/test/collective/fleet/dist_mnist_gradient_merge.py +++ b/test/collective/fleet/dist_mnist_gradient_merge.py @@ -53,10 +53,8 @@ class TestDistMnist2x2(TestDistRunnerBase): inference_program = fluid.default_main_program().clone() # Optimization - opt = fluid.optimizer.MomentumOptimizer( - learning_rate=0.001, momentum=0.9 - ) - opt = fluid.optimizer.GradientMergeOptimizer(opt, 2) + opt = paddle.optimizer.Momentum(learning_rate=0.001, momentum=0.9) + opt = paddle.incubate.optimizer.GradientMergeOptimizer(opt, 2) if single_device: opt.minimize(avg_cost) else: diff --git a/test/collective/fleet/dist_mnist_gradient_merge_raw_optimizer.py b/test/collective/fleet/dist_mnist_gradient_merge_raw_optimizer.py index 0a03aa90ae7289347d6df51a2ccba95d90187598..369e163c3528c217231c469c2ff39307ff104588 100644 --- a/test/collective/fleet/dist_mnist_gradient_merge_raw_optimizer.py +++ b/test/collective/fleet/dist_mnist_gradient_merge_raw_optimizer.py @@ -68,7 +68,7 @@ class TestDistMnistGradientMergeRawOptimizer(TestDistRunnerBase): test_program = paddle.static.default_main_program().clone(for_test=True) optimizer = paddle.optimizer.Adam(learning_rate=1e-3) if single_device: - optimizer = fluid.optimizer.GradientMergeOptimizer( + optimizer = paddle.incubate.optimizer.GradientMergeOptimizer( optimizer, k_steps=strategy.gradient_merge_configs["k_steps"], avg=strategy.gradient_merge_configs["avg"], diff --git a/test/collective/fleet/fused_attention_pass_with_mp.py b/test/collective/fleet/fused_attention_pass_with_mp.py index 2f1e657cfc8be464c2755deeae326f5cfb866973..71ca3ef2527114ea63d70ac4917c0397f9efd2bc 100644 --- a/test/collective/fleet/fused_attention_pass_with_mp.py +++ b/test/collective/fleet/fused_attention_pass_with_mp.py @@ -175,7 +175,7 @@ class TestFusedAttentionPassWithMP(unittest.TestCase): out = multi_head_attn(attn_input, attn_mask) loss = paddle.mean(out) - sgd_optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.001) + sgd_optimizer = paddle.optimizer.SGD(learning_rate=0.001) sgd_optimizer.minimize(loss) startup_block = startup_prog.global_block() diff --git a/test/collective/fleet/parallel_dygraph_se_resnext.py b/test/collective/fleet/parallel_dygraph_se_resnext.py index ae688f1a32d7244893b424dc74523a584a64b12b..05e9088c9c98042ada3d88b3e90f192190849c36 100644 --- a/test/collective/fleet/parallel_dygraph_se_resnext.py +++ b/test/collective/fleet/parallel_dygraph_se_resnext.py @@ -55,22 +55,23 @@ def optimizer_setting(params, parameter_list=None): bd = [step * e for e in ls["epochs"]] lr = params["lr"] num_epochs = params["num_epochs"] + if fluid.in_dygraph_mode(): - optimizer = fluid.optimizer.Momentum( + optimizer = paddle.optimizer.Momentum( learning_rate=fluid.layers.cosine_decay( learning_rate=lr, step_each_epoch=step, epochs=num_epochs ), momentum=momentum_rate, - regularization=paddle.regularizer.L2Decay(l2_decay), + weight_decay=paddle.regularizer.L2Decay(l2_decay), - parameter_list=parameter_list, + parameters=parameter_list, ) else: - optimizer = fluid.optimizer.Momentum( + optimizer = paddle.optimizer.Momentum( learning_rate=fluid.layers.cosine_decay( learning_rate=lr, step_each_epoch=step, epochs=num_epochs ), momentum=momentum_rate, - regularization=paddle.regularizer.L2Decay(l2_decay), + weight_decay=paddle.regularizer.L2Decay(l2_decay), ) return optimizer diff --git a/test/collective/fleet/parallel_dygraph_sync_batch_norm.py b/test/collective/fleet/parallel_dygraph_sync_batch_norm.py index
3d0010629bbc0bb99b4ddce80b993fcc212a1b4a..df501c337aae1e265f8e078c564553d01e42a39a 100644 --- a/test/collective/fleet/parallel_dygraph_sync_batch_norm.py +++ b/test/collective/fleet/parallel_dygraph_sync_batch_norm.py @@ -19,7 +19,6 @@ from legacy_test.test_dist_base import ( ) import paddle -from paddle import fluid from paddle.fluid.dygraph.base import to_variable from paddle.nn import Conv2D, SyncBatchNorm @@ -79,8 +78,8 @@ class TestSyncBatchNorm(TestParallelDyGraphRunnerBase): batch_size=32, drop_last=True, ) - opt = fluid.optimizer.Adam( - learning_rate=1e-3, parameter_list=model.parameters() + opt = paddle.optimizer.Adam( + learning_rate=1e-3, parameters=model.parameters() ) return model, train_reader, opt diff --git a/test/collective/fleet/parallel_dygraph_transformer.py b/test/collective/fleet/parallel_dygraph_transformer.py index a79e1d65d67bf7fb560fbbf876e4002681e68a51..b92c14a11d40f150753533f7f1c78bf78c7780da 100644 --- a/test/collective/fleet/parallel_dygraph_transformer.py +++ b/test/collective/fleet/parallel_dygraph_transformer.py @@ -1089,11 +1089,11 @@ class TestTransformer(TestParallelDyGraphRunnerBase): fake_data_reader(), TrainTaskConfig.batch_size ) if naive_optimize: - optimizer = fluid.optimizer.SGD( - learning_rate=0.001, parameter_list=model.parameters() + optimizer = paddle.optimizer.SGD( + learning_rate=0.001, parameters=model.parameters() ) else: - optimizer = fluid.optimizer.Adam( + optimizer = paddle.optimizer.Adam( learning_rate=NoamDecay( ModelHyperParams.d_model, TrainTaskConfig.warmup_steps, @@ -1102,7 +1102,7 @@ class TestTransformer(TestParallelDyGraphRunnerBase): beta1=TrainTaskConfig.beta1, beta2=TrainTaskConfig.beta2, epsilon=TrainTaskConfig.eps, - parameter_list=model.parameters(), + parameters=model.parameters(), ) return model, train_reader, optimizer diff --git a/test/collective/fleet/pipeline_mnist_multi_device.py b/test/collective/fleet/pipeline_mnist_multi_device.py index c0796e6fcf5e761d35c5a9c262fd7d1cb63341a6..6baac996f8833be47444292fd9bc505e6a702828 100644 --- a/test/collective/fleet/pipeline_mnist_multi_device.py +++ b/test/collective/fleet/pipeline_mnist_multi_device.py @@ -121,8 +121,8 @@ class TestDistMnist2x2(TestDistRunnerBase): steps_per_pass = 10 bd = [steps_per_pass * p for p in passes] lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)] - lr_val = fluid.layers.piecewise_decay(boundaries=bd, values=lr) - opt = fluid.optimizer.Momentum( + lr_val = paddle.optimizer.lr.PiecewiseDecay(boundaries=bd, values=lr) + opt = paddle.optimizer.Momentum( learning_rate=lr_val, momentum=0.9, grad_clip=paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0), diff --git a/test/collective/fleet/pipeline_mnist_one_device.py b/test/collective/fleet/pipeline_mnist_one_device.py index ed4b85c54891d4c866a8a1e53e48e3cb6a76c73e..3d844148295805b01d155a8062fb25046358f37a 100644 --- a/test/collective/fleet/pipeline_mnist_one_device.py +++ b/test/collective/fleet/pipeline_mnist_one_device.py @@ -113,8 +113,8 @@ class TestDistMnist2x2(TestDistRunnerBase): steps_per_pass = 10 bd = [steps_per_pass * p for p in passes] lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)] - lr_val = fluid.layers.piecewise_decay(boundaries=bd, values=lr) - opt = fluid.optimizer.Momentum(learning_rate=lr_val, momentum=0.9) + lr_val = paddle.optimizer.lr.PiecewiseDecay(boundaries=bd, values=lr) + opt = paddle.optimizer.Momentum(learning_rate=lr_val, momentum=0.9) # Reader train_reader = paddle.batch( diff --git a/test/collective/fleet/static_model_parallel_by_col.py 
b/test/collective/fleet/static_model_parallel_by_col.py index 88bbccdac7e2ec53eb4094a24bd9c54604bac79b..f481e72f0b8e9a85cb9182bd36d30c524c10852d 100644 --- a/test/collective/fleet/static_model_parallel_by_col.py +++ b/test/collective/fleet/static_model_parallel_by_col.py @@ -91,7 +91,7 @@ class TestModelParallel(TestDistRunnerBase): rank = fleet.worker_index() if dist_strategy else None avg_cost = create_model(data_in, rank) - opt = fluid.optimizer.SGD(0.1) + opt = paddle.optimizer.SGD(0.1) if dist_strategy: dist_opt = fleet.distributed_optimizer( diff --git a/test/collective/fleet/static_model_parallel_by_row.py b/test/collective/fleet/static_model_parallel_by_row.py index 649f1efc45fbc095609b6d566c8cd341f99f070a..93c76ea71afb4c415091a1a7fd6bbfef8b073bdd 100644 --- a/test/collective/fleet/static_model_parallel_by_row.py +++ b/test/collective/fleet/static_model_parallel_by_row.py @@ -95,7 +95,7 @@ class TestModelParallel(TestDistRunnerBase): rank = fleet.worker_index() if dist_strategy else None avg_cost = create_model(data_in, rank) - opt = fluid.optimizer.SGD(0.1) + opt = paddle.optimizer.SGD(0.1) if dist_strategy: dist_opt = fleet.distributed_optimizer( diff --git a/test/collective/fleet/static_model_parallel_embedding.py b/test/collective/fleet/static_model_parallel_embedding.py index 21bbb31275aaa11e8c72516aa41dc6e8b692f20c..c762b1c960740a380ecab438b26085582ca433ce 100644 --- a/test/collective/fleet/static_model_parallel_embedding.py +++ b/test/collective/fleet/static_model_parallel_embedding.py @@ -85,7 +85,7 @@ class TestModelParallel(TestDistRunnerBase): rank = fleet.worker_index() if dist_strategy else None avg_cost = create_model(data_in, rank) - opt = fluid.optimizer.SGD(0.1) + opt = paddle.optimizer.SGD(0.1) if dist_strategy: dist_opt = fleet.distributed_optimizer( diff --git a/test/collective/fleet/test_communicator_half_async.py b/test/collective/fleet/test_communicator_half_async.py index 1f917f937c0eb4595bd4027e31bbc35cef2b5a3d..ae6d02a24d5891787dfa9876be8066fe5356d6c8 100644 --- a/test/collective/fleet/test_communicator_half_async.py +++ b/test/collective/fleet/test_communicator_half_async.py @@ -49,7 +49,7 @@ class TestCommunicatorHalfAsyncEnd2End(unittest.TestCase): def run_pserver(self, role, strategy): fleet.init(role) avg_cost, x, y = self.net() - optimizer = fluid.optimizer.SGD(0.01) + optimizer = paddle.optimizer.SGD(0.01) optimizer = fleet.distributed_optimizer(optimizer, strategy) optimizer.minimize(avg_cost) @@ -62,7 +62,7 @@ class TestCommunicatorHalfAsyncEnd2End(unittest.TestCase): fleet.init(role) avg_cost, x, y = self.net() - optimizer = fluid.optimizer.SGD(0.01) + optimizer = paddle.optimizer.SGD(0.01) optimizer = fleet.distributed_optimizer(optimizer, strategy) optimizer.minimize(avg_cost) diff --git a/test/collective/fleet/test_communicator_sync.py b/test/collective/fleet/test_communicator_sync.py index 8ec2b84af29eaddfec6cf8277b8c47c43d6bbcbd..5cb37d7ee547f55a74144e355c485495b23597ac 100644 --- a/test/collective/fleet/test_communicator_sync.py +++ b/test/collective/fleet/test_communicator_sync.py @@ -20,7 +20,6 @@ import paddle paddle.enable_static() -from paddle import fluid from paddle.distributed import fleet from paddle.distributed.fleet.base import role_maker @@ -48,7 +47,7 @@ class TestCommunicator(unittest.TestCase): fleet.init(role_maker.PaddleCloudRoleMaker()) avg_cost = self.net() - optimizer = fluid.optimizer.SGD(0.01) + optimizer = paddle.optimizer.SGD(0.01) strategy = paddle.distributed.fleet.DistributedStrategy() strategy.a_sync = False diff 
--git a/test/collective/fleet/test_dgc_optimizer.py b/test/collective/fleet/test_dgc_optimizer.py index 3101b21257e3b0c62744e448e32648f55e1fd45e..e356f842c9ea6bdd5f6c7f3f6a1f1008c93be811 100644 --- a/test/collective/fleet/test_dgc_optimizer.py +++ b/test/collective/fleet/test_dgc_optimizer.py @@ -16,7 +16,7 @@ import unittest import paddle from paddle import regularizer -from paddle.fluid import framework, optimizer +from paddle.fluid import framework from paddle.nn import clip paddle.enable_static() @@ -79,8 +79,10 @@ class TestDGCMomentumOptimizer(unittest.TestCase): ) if use_recompute: - dgc_momentum_optimizer = optimizer.RecomputeOptimizer( - dgc_momentum_optimizer + dgc_momentum_optimizer = ( + paddle.incubate.optimizer.RecomputeOptimizer( + dgc_momentum_optimizer + ) ) dgc_momentum_optimizer._set_checkpoints([]) dgc_momentum_optimizer.get_accumulators = ( diff --git a/test/collective/fleet/test_distributed_strategy.py b/test/collective/fleet/test_distributed_strategy.py index 67bd12092cfbcf0a29054b97a684c8ae4f89d0c0..4d76d7d3ad00eb513446dc0f2898638a04ba201a 100644 --- a/test/collective/fleet/test_distributed_strategy.py +++ b/test/collective/fleet/test_distributed_strategy.py @@ -242,7 +242,7 @@ class TestCreateDefaultStrategy(unittest.TestCase): fleet.init(role) def type_error_optimizer(): - optimizer = fluid.optimizer.SGD(0.0001) + optimizer = paddle.optimizer.SGD(0.0001) optimizer = fleet.distributed_optimizer(optimizer) self.assertRaises(TypeError, type_error_optimizer) @@ -264,7 +264,7 @@ class TestHalfAsyncStrategy(unittest.TestCase): half_async_config.geo_sgd_mode = False half_async_config.runtime_split_send_recv = False - optimizer = fluid.optimizer.SGD(0.0001) + optimizer = paddle.optimizer.SGD(0.0001) optimizer = fleet.distributed_optimizer(optimizer, half_async_config) @@ -284,7 +284,7 @@ class TestDebugInfo(unittest.TestCase): ) fleet.init(role) - optimizer = fluid.optimizer.SGD(0.0001) + optimizer = paddle.optimizer.SGD(0.0001) strategy = StrategyFactory.create_sync_strategy() strategy.set_debug_opt( { diff --git a/test/collective/fleet/test_fleet_amp_meta_optimizer.py b/test/collective/fleet/test_fleet_amp_meta_optimizer.py index 36458028d12b26afb73637aff12a595a9eeeb8bd..75a7f445eea2b7ddaab6380a146a2891697fe956 100644 --- a/test/collective/fleet/test_fleet_amp_meta_optimizer.py +++ b/test/collective/fleet/test_fleet_amp_meta_optimizer.py @@ -31,9 +31,7 @@ class TestFleetAMPOptimizer(TestFleetMetaOptimizer): train_prog, startup_prog = fluid.Program(), fluid.Program() avg_cost, strategy = self.net(train_prog, startup_prog) - opt = fluid.optimizer.MomentumOptimizer( - learning_rate=0.001, momentum=0.9 - ) + opt = paddle.optimizer.Momentum(learning_rate=0.001, momentum=0.9) opt = AMPOptimizer(opt) self.set_strategy(strategy, 'amp') @@ -50,9 +48,7 @@ class TestFleetAMPOptimizer(TestFleetMetaOptimizer): train_prog, startup_prog = fluid.Program(), fluid.Program() avg_cost, strategy = self.net(train_prog, startup_prog) - opt = fluid.optimizer.MomentumOptimizer( - learning_rate=0.001, momentum=0.9 - ) + opt = paddle.optimizer.Momentum(learning_rate=0.001, momentum=0.9) opt = AMPOptimizer(opt) self.set_strategy(strategy, 'amp') @@ -71,9 +67,7 @@ class TestFleetAMPOptimizer(TestFleetMetaOptimizer): train_prog, startup_prog = fluid.Program(), fluid.Program() avg_cost, strategy = self.net(train_prog, startup_prog) - opt = fluid.optimizer.MomentumOptimizer( - learning_rate=0.001, momentum=0.9 - ) + opt = paddle.optimizer.Momentum(learning_rate=0.001, momentum=0.9) opt = 
AMPOptimizer(opt) self.set_strategy(strategy, 'amp') diff --git a/test/collective/fleet/test_fleet_checkpoint.py b/test/collective/fleet/test_fleet_checkpoint.py index 9296282c9ba545f967803193dcd85c5210d52fe4..4b86e6d57fd7a47e074447353ec3141f5f384a86 100644 --- a/test/collective/fleet/test_fleet_checkpoint.py +++ b/test/collective/fleet/test_fleet_checkpoint.py @@ -47,7 +47,7 @@ class FleetTest(unittest.TestCase): input=predict, label=label, reduction='none', use_softmax=False ) avg_loss = paddle.mean(loss) - optimizer = fluid.optimizer.AdamOptimizer(learning_rate=0.001) + optimizer = paddle.optimizer.Adam(learning_rate=0.001) dist_optimizer = fleet.distributed_optimizer(optimizer) dist_optimizer.minimize(avg_loss) diff --git a/test/collective/fleet/test_fleet_dgc_meta_optimizer.py b/test/collective/fleet/test_fleet_dgc_meta_optimizer.py index d0732d72f090307613af21caca1f1dc35417bb4a..057e0b0d6c509463646a41c80fc4aefd100bfa45 100755 --- a/test/collective/fleet/test_fleet_dgc_meta_optimizer.py +++ b/test/collective/fleet/test_fleet_dgc_meta_optimizer.py @@ -33,9 +33,7 @@ class TestFleetDGCOptimizer(TestFleetMetaOptimizer): avg_cost, strategy = self.net(train_prog, startup_prog) self.set_strategy(strategy, 'dgc') - opt = fluid.optimizer.MomentumOptimizer( - learning_rate=0.001, momentum=0.9 - ) + opt = paddle.optimizer.Momentum(learning_rate=0.001, momentum=0.9) dgc_opt = DGCOptimizer(opt) role = role_maker.PaddleCloudRoleMaker(is_collective=True) dgc_opt._set_basic_info(avg_cost, role, opt, strategy) @@ -50,9 +48,7 @@ class TestFleetDGCOptimizer(TestFleetMetaOptimizer): avg_cost, strategy = self.net(train_prog, startup_prog) self.set_strategy(strategy, 'dgc') - opt = fluid.optimizer.MomentumOptimizer( - learning_rate=0.001, momentum=0.9 - ) + opt = paddle.optimizer.Momentum(learning_rate=0.001, momentum=0.9) dgc_opt = DGCOptimizer(opt) role = role_maker.PaddleCloudRoleMaker(is_collective=True) dgc_opt._set_basic_info(avg_cost, role, opt, strategy) @@ -70,9 +66,7 @@ class TestFleetDGCOptimizer(TestFleetMetaOptimizer): avg_cost, strategy = self.net(train_prog, startup_prog) self.set_strategy(strategy, 'dgc') - opt = fluid.optimizer.MomentumOptimizer( - learning_rate=0.001, momentum=0.9 - ) + opt = paddle.optimizer.Momentum(learning_rate=0.001, momentum=0.9) dgc_opt = DGCOptimizer(opt) role = role_maker.PaddleCloudRoleMaker(is_collective=True) dgc_opt._set_basic_info(avg_cost, role, opt, strategy) diff --git a/test/collective/fleet/test_fleet_fp16_allreduce_meta_optimizer.py b/test/collective/fleet/test_fleet_fp16_allreduce_meta_optimizer.py index a49d1721e607c5269e475cad2cd5188e7b65414d..f78f8319691f0d25189f9d35758b9ee6a4ece57f 100644 --- a/test/collective/fleet/test_fleet_fp16_allreduce_meta_optimizer.py +++ b/test/collective/fleet/test_fleet_fp16_allreduce_meta_optimizer.py @@ -56,7 +56,7 @@ class TestFleetFP16CompressOptimizer(unittest.TestCase): train_prog, startup_prog = fluid.Program(), fluid.Program() avg_cost, strategy = self.net(train_prog, startup_prog) - optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) + optimizer = paddle.optimizer.SGD(learning_rate=0.01) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(avg_cost) @@ -83,7 +83,7 @@ class TestFleetFP16CompressOptimizer(unittest.TestCase): train_prog, startup_prog = fluid.Program(), fluid.Program() avg_cost, strategy = self.net(train_prog, startup_prog, dtype='float16') - optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) + optimizer = paddle.optimizer.SGD(learning_rate=0.01) 
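The fleet meta-optimizer hunks above and below all make the same substitution: the inner optimizer handed to fleet.distributed_optimizer changes from a paddle.fluid.optimizer class to its paddle.optimizer counterpart, while the wrapping itself is untouched. A minimal static-graph sketch of the new spelling, using a stand-in fc network rather than the tests' own models:

import paddle

paddle.enable_static()

# Stand-in program; any static-graph loss works the same way.
x = paddle.static.data(name='x', shape=[-1, 32], dtype='float32')
loss = paddle.mean(paddle.static.nn.fc(x, size=1))

# Old spelling removed by this PR: paddle.fluid.optimizer.SGD(learning_rate=0.01)
optimizer = paddle.optimizer.SGD(learning_rate=0.01)
optimizer.minimize(loss)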
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(avg_cost) diff --git a/test/collective/fleet/test_fleet_lamb_meta_optimizer.py b/test/collective/fleet/test_fleet_lamb_meta_optimizer.py index c32135bafc1922c15f3cde7cad759415d8939996..5708578c951ddba678d0dfa0cd9daa57181b7e66 100755 --- a/test/collective/fleet/test_fleet_lamb_meta_optimizer.py +++ b/test/collective/fleet/test_fleet_lamb_meta_optimizer.py @@ -70,7 +70,7 @@ class TestFleetLambMetaOptimizer(unittest.TestCase): startup_prog = fluid.Program() train_prog = fluid.Program() avg_cost, strategy = self.net(train_prog, startup_prog) - optimizer = paddle.fluid.optimizer.Adam(learning_rate=0.01) + optimizer = paddle.optimizer.Adam(learning_rate=0.01) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(avg_cost) @@ -83,9 +83,7 @@ class TestFleetLambMetaOptimizer(unittest.TestCase): startup_prog = fluid.Program() train_prog = fluid.Program() avg_cost, strategy = self.net(train_prog, startup_prog) - optimizer = paddle.fluid.optimizer.Momentum( - learning_rate=0.1, momentum=0.9 - ) + optimizer = paddle.optimizer.Momentum(learning_rate=0.1, momentum=0.9) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(avg_cost) @@ -98,7 +96,7 @@ class TestFleetLambMetaOptimizer(unittest.TestCase): startup_prog = fluid.Program() train_prog = fluid.Program() avg_cost, strategy = self.net(train_prog, startup_prog) - optimizer = paddle.fluid.optimizer.Adam(learning_rate=0.01) + optimizer = paddle.optimizer.Adam(learning_rate=0.01) strategy.lamb_configs = { 'lamb_weight_decay': 0.01, 'exclude_from_weight_decay': ['.b_0'], @@ -146,7 +144,7 @@ class TestFleetLambMetaOptimizer(unittest.TestCase): 'exclude_from_weight_decay': [], } - optimizer = paddle.fluid.optimizer.Adam(learning_rate=0.01) + optimizer = paddle.optimizer.Adam(learning_rate=0.01) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(avg_cost) diff --git a/test/collective/fleet/test_fleet_lars_meta_optimizer.py b/test/collective/fleet/test_fleet_lars_meta_optimizer.py index 41a3e2bc27a6dc42cb4158094b568c4d8ea0b0c7..8f6e3307191758c1bc963b3604c224f5556131b1 100755 --- a/test/collective/fleet/test_fleet_lars_meta_optimizer.py +++ b/test/collective/fleet/test_fleet_lars_meta_optimizer.py @@ -72,9 +72,7 @@ class TestFleetLarsMetaOptimizer(unittest.TestCase): startup_prog = fluid.Program() train_prog = fluid.Program() avg_cost, strategy = self.net(train_prog, startup_prog) - optimizer = paddle.fluid.optimizer.Momentum( - learning_rate=0.01, momentum=0.9 - ) + optimizer = paddle.optimizer.Momentum(learning_rate=0.01, momentum=0.9) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(avg_cost) @@ -87,7 +85,7 @@ class TestFleetLarsMetaOptimizer(unittest.TestCase): startup_prog = fluid.Program() train_prog = fluid.Program() avg_cost, strategy = self.net(train_prog, startup_prog) - optimizer = paddle.fluid.optimizer.Adam(learning_rate=0.01) + optimizer = paddle.optimizer.Adam(learning_rate=0.01) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(avg_cost) @@ -100,9 +98,7 @@ class TestFleetLarsMetaOptimizer(unittest.TestCase): startup_prog = fluid.Program() train_prog = fluid.Program() avg_cost, strategy = self.net(train_prog, startup_prog) - optimizer = paddle.fluid.optimizer.Momentum( - learning_rate=0.01, momentum=0.9 - ) + optimizer = paddle.optimizer.Momentum(learning_rate=0.01, 
momentum=0.9) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(avg_cost) @@ -153,9 +149,7 @@ class TestFleetLarsMetaOptimizer(unittest.TestCase): "exclude_from_weight_decay": ["batch_norm", ".b"], } - optimizer = paddle.fluid.optimizer.Momentum( - learning_rate=0.01, momentum=0.9 - ) + optimizer = paddle.optimizer.Momentum(learning_rate=0.01, momentum=0.9) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(avg_cost) diff --git a/test/collective/fleet/test_fleet_meta_optimizer_base.py b/test/collective/fleet/test_fleet_meta_optimizer_base.py index 3ce7b9e27aa6c4b71d7a82fdb6941b289dd3eaf8..7f8db79edf22b1b2ec214da7fddf7a677a9f8d9f 100755 --- a/test/collective/fleet/test_fleet_meta_optimizer_base.py +++ b/test/collective/fleet/test_fleet_meta_optimizer_base.py @@ -51,7 +51,7 @@ class TestFleetMetaOptimizerBase(unittest.TestCase): ) avg_cost = paddle.mean(x=cost) - optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) + optimizer = paddle.optimizer.SGD(learning_rate=0.01) opt = MetaOptimizerBase(optimizer) opt_ops, params_grads = opt.minimize(avg_cost) opt.apply_optimize( diff --git a/test/collective/fleet/test_fleet_pipeline_meta_optimizer.py b/test/collective/fleet/test_fleet_pipeline_meta_optimizer.py index b75f146fe26b4f712bcd9d7a38d9a4b5a8689e27..2780c51e26d6d3d6f8949b548e8149e1fa1b5bae 100644 --- a/test/collective/fleet/test_fleet_pipeline_meta_optimizer.py +++ b/test/collective/fleet/test_fleet_pipeline_meta_optimizer.py @@ -78,7 +78,7 @@ class TestFleetMetaOptimizer(unittest.TestCase): with fluid.unique_name.guard(): avg_cost = self.net() - optimizer = paddle.fluid.optimizer.Adam(0.01) + optimizer = paddle.optimizer.Adam(0.01) optimizer = fleet.distributed_optimizer( optimizer, strategy=strategy ) @@ -102,7 +102,7 @@ class TestFleetMetaOptimizer(unittest.TestCase): with fluid.unique_name.guard(): avg_cost = self.net() - optimizer = paddle.fluid.optimizer.Adam(0.01) + optimizer = paddle.optimizer.Adam(0.01) optimizer = fleet.distributed_optimizer( optimizer, strategy=strategy ) diff --git a/test/collective/fleet/test_fleet_pipeline_meta_optimizer_with_recompute.py b/test/collective/fleet/test_fleet_pipeline_meta_optimizer_with_recompute.py index a4ca770d1db14a0d36de6c2294df85954282503a..a0f3b1d9bb35f02bc84d93c0d3147ce781cf2bd0 100644 --- a/test/collective/fleet/test_fleet_pipeline_meta_optimizer_with_recompute.py +++ b/test/collective/fleet/test_fleet_pipeline_meta_optimizer_with_recompute.py @@ -74,7 +74,7 @@ class TestFleetMetaOptimizer(unittest.TestCase): "checkpoint_shape": [], } - optimizer = paddle.fluid.optimizer.Adam(0.01) + optimizer = paddle.optimizer.Adam(0.01) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(avg_cost) diff --git a/test/collective/fleet/test_fleet_qat_meta_optimizer.py b/test/collective/fleet/test_fleet_qat_meta_optimizer.py index 2e1208d0c6f3541d9fd86316c88f78b12d6efbdf..7559bd01973d347b52b99414db1c70f905198d62 100644 --- a/test/collective/fleet/test_fleet_qat_meta_optimizer.py +++ b/test/collective/fleet/test_fleet_qat_meta_optimizer.py @@ -64,7 +64,7 @@ class TestFleetWithQAT(unittest.TestCase): mse = paddle.nn.MSELoss() out = model(input_x) loss = mse(out, input_y) - optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) + optimizer = paddle.optimizer.SGD(learning_rate=0.01) optimizer = fleet.distributed_optimizer( optimizer, strategy=strategy ) diff --git a/test/collective/fleet/test_fleet_raw_program_meta_optimizer.py 
b/test/collective/fleet/test_fleet_raw_program_meta_optimizer.py index 97402658759c5465d68d631fefed3cf1f03683bd..bd119b6a5f506ead5f00a841448a604f524fdaac 100644 --- a/test/collective/fleet/test_fleet_raw_program_meta_optimizer.py +++ b/test/collective/fleet/test_fleet_raw_program_meta_optimizer.py @@ -47,7 +47,7 @@ class TestFleetMetaOptimizer(unittest.TestCase): strategy = paddle.distributed.fleet.DistributedStrategy() strategy.without_graph_optimization = True - optimizer = paddle.fluid.optimizer.Adam(0.01) + optimizer = paddle.optimizer.Adam(0.01) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(avg_cost) diff --git a/test/collective/fleet/test_fleet_recompute_meta_optimizer.py b/test/collective/fleet/test_fleet_recompute_meta_optimizer.py index bf97a442c6ff040cd81ed5167ca0fe4204e0b18a..e0315fbf407ffcc88bca557c97a655fb17d1ea69 100644 --- a/test/collective/fleet/test_fleet_recompute_meta_optimizer.py +++ b/test/collective/fleet/test_fleet_recompute_meta_optimizer.py @@ -30,9 +30,7 @@ class TestFleetRecomputeMetaOptimizer(TestFleetMetaOptimizer): avg_cost, strategy = self.net(train_prog, startup_prog) self.set_strategy(strategy, 'recompute') - opt = fluid.optimizer.MomentumOptimizer( - learning_rate=0.001, momentum=0.9 - ) + opt = paddle.optimizer.Momentum(learning_rate=0.001, momentum=0.9) opt = RecomputeOptimizer(opt) opt.user_defined_strategy = strategy params_grads = opt.backward(avg_cost, startup_prog) @@ -48,9 +46,7 @@ class TestFleetRecomputeMetaOptimizer(TestFleetMetaOptimizer): avg_cost, strategy = self.net(train_prog, startup_prog) self.set_strategy(strategy, 'recompute') - opt = fluid.optimizer.MomentumOptimizer( - learning_rate=0.001, momentum=0.9 - ) + opt = paddle.optimizer.Momentum(learning_rate=0.001, momentum=0.9) opt = RecomputeOptimizer(opt) opt.user_defined_strategy = strategy params_grads = opt.backward(avg_cost, startup_prog) @@ -68,9 +64,7 @@ class TestFleetRecomputeMetaOptimizer(TestFleetMetaOptimizer): avg_cost, strategy = self.net(train_prog, startup_prog) self.set_strategy(strategy, 'recompute') - opt = fluid.optimizer.MomentumOptimizer( - learning_rate=0.001, momentum=0.9 - ) + opt = paddle.optimizer.Momentum(learning_rate=0.001, momentum=0.9) opt = RecomputeOptimizer(opt) opt.user_defined_strategy = strategy params_grads = opt.backward(avg_cost, startup_prog) diff --git a/test/collective/fleet/test_fleet_sharding_meta_optimizer.py b/test/collective/fleet/test_fleet_sharding_meta_optimizer.py index e42dad2b6c7106c752f4b13d2deaac7b44a4d07d..9df8b670c7e997da81dbbfeda69dd703d9e59172 100755 --- a/test/collective/fleet/test_fleet_sharding_meta_optimizer.py +++ b/test/collective/fleet/test_fleet_sharding_meta_optimizer.py @@ -609,12 +609,6 @@ class TestFleetShardingMetaOptimizer(TestFleetMetaOptimizer): 'c_reduce_sum', 'c_reduce_sum', 'c_sync_comm_stream', - 'scale', - 'sum', - 'scale', - 'sum', - 'scale', - 'sum', 'momentum', 'momentum', 'momentum', diff --git a/test/collective/fleet/test_fleet_tensor_parallel_extra_sync.py b/test/collective/fleet/test_fleet_tensor_parallel_extra_sync.py index d77930cc992405f7033b88c52e8398ed54d97f8c..eba8f6926e0b4112f050afe23e05768b043c2f8b 100644 --- a/test/collective/fleet/test_fleet_tensor_parallel_extra_sync.py +++ b/test/collective/fleet/test_fleet_tensor_parallel_extra_sync.py @@ -106,7 +106,7 @@ class TestFleetMetaOptimizer(unittest.TestCase): y = model_a(input_x) loss = paddle.mean(y) - optimizer = paddle.fluid.optimizer.Adam(0.01) + optimizer = paddle.optimizer.Adam(0.01) optimizer = 
fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(loss) ref_ops = [ diff --git a/test/collective/fleet/test_imperative_auto_mixed_precision_for_eager.py b/test/collective/fleet/test_imperative_auto_mixed_precision_for_eager.py index 6f71afb296efbd48c0cca7fd1389a9f59c75000e..05bd76127542c28ca62680b3aa9f0328f83d839a 100644 --- a/test/collective/fleet/test_imperative_auto_mixed_precision_for_eager.py +++ b/test/collective/fleet/test_imperative_auto_mixed_precision_for_eager.py @@ -222,8 +222,8 @@ class TestAmpScaler(unittest.TestCase): stride=2, act='relu', ) - optimizer = fluid.optimizer.SGDOptimizer( - learning_rate=0.01, parameter_list=model.parameters() + optimizer = paddle.optimizer.SGD( + learning_rate=0.01, parameters=model.parameters() ) scaler = paddle.amp.AmpScaler(init_loss_scaling=1024) data = fluid.dygraph.to_variable(inp_np) @@ -331,8 +331,8 @@ class TestAmpScaler(unittest.TestCase): params_init = {} for param in model.parameters(): params_init[param.name] = param.numpy() - optimizer = fluid.optimizer.SGDOptimizer( - learning_rate=0.01, parameter_list=model.parameters() + optimizer = paddle.optimizer.SGD( + learning_rate=0.01, parameters=model.parameters() ) scaler = paddle.amp.AmpScaler(init_loss_scaling=1024) data = fluid.dygraph.to_variable(inp_np) diff --git a/test/collective/fleet/test_mixed_precision.py b/test/collective/fleet/test_mixed_precision.py index b2368f0e01e909856d3101505362b8adcd78ef46..1ab3f2023ba9a3fb81a895cd0165621bd1a9ca56 100644 --- a/test/collective/fleet/test_mixed_precision.py +++ b/test/collective/fleet/test_mixed_precision.py @@ -58,8 +58,8 @@ class AMPTest(unittest.TestCase): out = model(x) loss = mse(out, label) - opt = paddle.fluid.optimizer.Adam( - learning_rate=0.0001, parameter_list=model.parameters() + opt = paddle.optimizer.Adam( + learning_rate=0.0001, parameters=model.parameters() ) # 定义优化器 opt = paddle.static.amp.decorate( opt, init_loss_scaling=128.0, use_dynamic_loss_scaling=True diff --git a/test/collective/test_collective_optimizer.py b/test/collective/test_collective_optimizer.py index 08d84bd2bf11fc59226333db9cdf939de76d23fa..bea4a683a9e8ba290348e7cf6677346484c47583 100644 --- a/test/collective/test_collective_optimizer.py +++ b/test/collective/test_collective_optimizer.py @@ -27,7 +27,6 @@ import unittest import paddle -from paddle import fluid from paddle.incubate.distributed.fleet.collective import ( CollectiveOptimizer, DistributedStrategy, @@ -36,11 +35,11 @@ from paddle.incubate.distributed.fleet.collective import ( class CollectiveOptimizerTest(unittest.TestCase): def test_ds_as_None(self): - optimizer = fluid.optimizer.AdamOptimizer() + optimizer = paddle.optimizer.Adam() dist_optimizer = CollectiveOptimizer(optimizer, strategy=None) def test_recompute_checkpoints(self): - optimizer = fluid.optimizer.AdamOptimizer() + optimizer = paddle.optimizer.Adam() dist_strategy = DistributedStrategy() dist_strategy.forward_recompute = True dist_strategy.recompute_checkpoints = "NoneListTest" @@ -52,8 +51,8 @@ class CollectiveOptimizerTest(unittest.TestCase): self.assertRaises(ValueError, dist_optimizer.minimize, None) def test_recompute_strategy(self): - optimizer = fluid.optimizer.AdamOptimizer() - optimizer = fluid.optimizer.RecomputeOptimizer(optimizer) + optimizer = paddle.optimizer.Adam() + optimizer = paddle.incubate.optimizer.RecomputeOptimizer(optimizer) dist_strategy = DistributedStrategy() dist_strategy.forward_recompute = True dist_strategy.recompute_checkpoints = ["Test"] @@ -61,7 +60,7 @@ class 
CollectiveOptimizerTest(unittest.TestCase): self.assertRaises(ValueError, dist_optimizer.minimize, None) def test_amp_strategy(self): - optimizer = fluid.optimizer.AdamOptimizer() + optimizer = paddle.optimizer.Adam() optimizer = paddle.static.amp.decorate( optimizer, init_loss_scaling=1.0, use_dynamic_loss_scaling=True ) diff --git a/test/contrib/test_correlation.py b/test/contrib/test_correlation.py index a12c2a08e3928ced00fed59f77296de553610aa1..39f9c20da42ead13c6a6b77cab8cc3c88de516e2 100644 --- a/test/contrib/test_correlation.py +++ b/test/contrib/test_correlation.py @@ -125,7 +125,7 @@ class TestCorrelationOp(unittest.TestCase): ) loss = paddle.mean(out) - optimizer = fluid.optimizer.Momentum(0.0001, 0.9) + optimizer = paddle.optimizer.Momentum(0.0001, 0.9) optimizer.minimize(loss) place = fluid.CUDAPlace(0) diff --git a/test/contrib/test_multi_precision_fp16_train.py b/test/contrib/test_multi_precision_fp16_train.py index bb1783a74a654cb4d49c59616cc9cc835e603146..0b5f58e550462d8cbfdfc0388122a753cfe9556c 100644 --- a/test/contrib/test_multi_precision_fp16_train.py +++ b/test/contrib/test_multi_precision_fp16_train.py @@ -126,7 +126,7 @@ def train(use_pure_fp16=True, use_nesterov=False, optimizer=""): multi_precision=True, ) elif optimizer == "Lars": - optimizer = paddle.fluid.optimizer.LarsMomentumOptimizer( + optimizer = paddle.incubate.optimizer.LarsMomentumOptimizer( learning_rate=0.001, momentum=0.9, multi_precision=use_pure_fp16 ) else: diff --git a/test/distributed_passes/auto_parallel_pass_test_base.py b/test/distributed_passes/auto_parallel_pass_test_base.py index 7dd70a0f28afb3e29a51d07554c8d1ed55728fe7..11dbd3f361d34bb64cc69e312066f7fbd9707f57 100644 --- a/test/distributed_passes/auto_parallel_pass_test_base.py +++ b/test/distributed_passes/auto_parallel_pass_test_base.py @@ -222,7 +222,7 @@ class AutoPallelPassTestBase(DistPassTestBase): clip = paddle.nn.ClipGradByNorm(clip_norm=1.0) if kwargs.get('optimizer', None) == "LarsMomentum": - optimizer = paddle.fluid.optimizer.LarsMomentumOptimizer( + optimizer = paddle.incubate.optimizer.LarsMomentumOptimizer( learning_rate=0.001, momentum=0.9 ) else: diff --git a/test/distributed_passes/test_auto_parallel_gradient_merge_pass.py b/test/distributed_passes/test_auto_parallel_gradient_merge_pass.py index c34cf1ac8428b43b609c58bdef284b4c7a52008e..9e90288912b2dbfe301cd661bad0d97c3ef5572e 100644 --- a/test/distributed_passes/test_auto_parallel_gradient_merge_pass.py +++ b/test/distributed_passes/test_auto_parallel_gradient_merge_pass.py @@ -183,7 +183,7 @@ class TestGradientMergePass(AutoPallelPassTestBase): loss = mlp_forward(input, label, hidden_size) - optimizer = paddle.fluid.optimizer.AdamOptimizer(learning_rate=0.01) + optimizer = paddle.optimizer.Adam(learning_rate=0.01) optimizer = fleet.distributed_optimizer(optimizer) ( _, diff --git a/test/dygraph_to_static/test_bert.py b/test/dygraph_to_static/test_bert.py index 3b70956cb77d05b61d3aef535c8f1a15bad13c96..8d26ee57a441dd15a7a97edf13ec3c5905f1c5a5 100644 --- a/test/dygraph_to_static/test_bert.py +++ b/test/dygraph_to_static/test_bert.py @@ -106,7 +106,7 @@ class TestBert(unittest.TestCase): config=bert_config, weight_sharing=False, use_fp16=False ) - optimizer = fluid.optimizer.Adam(parameter_list=bert.parameters()) + optimizer = paddle.optimizer.Adam(parameters=bert.parameters()) step_idx = 0 speed_list = [] for input_data in data_loader(): diff --git a/test/dygraph_to_static/test_bmn.py b/test/dygraph_to_static/test_bmn.py index 
4ed7556f0d1d236908ef486792be36dc68f45c1a..8203bcec450a809e6103bf5a2a01a121c4d806e6 100644 --- a/test/dygraph_to_static/test_bmn.py +++ b/test/dygraph_to_static/test_bmn.py @@ -448,10 +448,10 @@ def optimizer(cfg, parameter_list): lr_decay = cfg.learning_rate_decay l2_weight_decay = cfg.l2_weight_decay lr = [base_lr, base_lr * lr_decay] - optimizer = fluid.optimizer.Adam( - fluid.layers.piecewise_decay(boundaries=bd, values=lr), - parameter_list=parameter_list, - regularization=paddle.regularizer.L2Decay(coeff=l2_weight_decay), + optimizer = paddle.optimizer.Adam( + paddle.optimizer.lr.PiecewiseDecay(boundaries=bd, values=lr), + parameters=parameter_list, + weight_decay=paddle.regularizer.L2Decay(coeff=l2_weight_decay), ) return optimizer diff --git a/test/dygraph_to_static/test_cache_program.py b/test/dygraph_to_static/test_cache_program.py index 750de4efda40e409292486b851cc148706f3ea1c..eaf45b49a61e2df98a3ef6768e4aab3e94150b68 100644 --- a/test/dygraph_to_static/test_cache_program.py +++ b/test/dygraph_to_static/test_cache_program.py @@ -96,8 +96,8 @@ class TestCacheProgramWithOptimizer(unittest.TestCase): with fluid.dygraph.guard(fluid.CPUPlace()): dygraph_net = self.dygraph_class() - adam = fluid.optimizer.AdamOptimizer( - learning_rate=0.001, parameter_list=dygraph_net.parameters() + adam = paddle.optimizer.Adam( + learning_rate=0.001, parameters=dygraph_net.parameters() ) loss_data = [] for batch_id in range(self.batch_num): diff --git a/test/dygraph_to_static/test_cycle_gan.py b/test/dygraph_to_static/test_cycle_gan.py index bf8a30ab685ae702e9fbf14a73f333c9118e22b1..dbe3d07dfd2649237081928000d2af6a9e61b415 100644 --- a/test/dygraph_to_static/test_cycle_gan.py +++ b/test/dygraph_to_static/test_cycle_gan.py @@ -531,8 +531,8 @@ class Args: def optimizer_setting(parameters): lr = 0.0002 - optimizer = fluid.optimizer.Adam( - learning_rate=fluid.layers.piecewise_decay( + optimizer = paddle.optimizer.Adam( + learning_rate=paddle.optimizer.lr.PiecewiseDecay( boundaries=[ 100 * step_per_epoch, 120 * step_per_epoch, @@ -542,7 +542,7 @@ def optimizer_setting(parameters): ], values=[lr, lr * 0.8, lr * 0.6, lr * 0.4, lr * 0.2, lr * 0.1], ), - parameter_list=parameters, + parameters=parameters, beta1=0.5, ) return optimizer diff --git a/test/dygraph_to_static/test_lac.py b/test/dygraph_to_static/test_lac.py index ead65802136293ae404f6e6ce1aeb68f47ff51c9..720727e4a4fe420160d4fd4b62d1afe477d5743b 100644 --- a/test/dygraph_to_static/test_lac.py +++ b/test/dygraph_to_static/test_lac.py @@ -545,9 +545,9 @@ class TestLACModel(unittest.TestCase): train_loader = create_dataloader(reader, place) model = LexNet(args) - optimizer = fluid.optimizer.AdamOptimizer( + optimizer = paddle.optimizer.Adam( learning_rate=args.base_learning_rate, - parameter_list=model.parameters(), + parameters=model.parameters(), ) chunk_eval = ChunkEval( int(math.ceil((args.num_labels - 1) / 2.0)), "IOB" diff --git a/test/dygraph_to_static/test_mnist.py b/test/dygraph_to_static/test_mnist.py index 23e2dbb56efe79caf1df0b7893871e00009ad4b1..2cb556680427be208f640c1eae11697465f33bcc 100644 --- a/test/dygraph_to_static/test_mnist.py +++ b/test/dygraph_to_static/test_mnist.py @@ -25,9 +25,9 @@ import paddle from paddle import fluid from paddle.fluid.dygraph import to_variable from paddle.fluid.dygraph.base import switch_to_static_graph -from paddle.fluid.optimizer import AdamOptimizer from paddle.jit.translated_layer import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX from paddle.nn import Linear +from paddle.optimizer import Adam SEED = 2020 
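The dygraph-to-static tests below rename the constructor keywords as well as the class: parameter_list becomes parameters and regularization becomes weight_decay. A minimal imperative sketch of the new call, assuming a throwaway linear layer rather than the MNIST and ResNet models used in these tests:

import paddle

# Throwaway model; only the optimizer keywords illustrate the rename.
model = paddle.nn.Linear(10, 1)
adam = paddle.optimizer.Adam(
    learning_rate=0.001,
    parameters=model.parameters(),  # was parameter_list=...
    weight_decay=paddle.regularizer.L2Decay(1e-4),  # was regularization=...
)

loss = paddle.mean(model(paddle.rand([4, 10])))
loss.backward()
adam.step()
adam.clear_grad()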
@@ -196,9 +196,7 @@ class TestMNISTWithToStatic(TestMNIST): mnist = MNIST() if to_static: mnist = paddle.jit.to_static(mnist) - adam = AdamOptimizer( - learning_rate=0.001, parameter_list=mnist.parameters() - ) + adam = Adam(learning_rate=0.001, parameters=mnist.parameters()) for epoch in range(self.epoch_num): start = time() diff --git a/test/dygraph_to_static/test_mnist_amp.py b/test/dygraph_to_static/test_mnist_amp.py index 88684865367eb554c640ec2a41f70c8342afa948..667181adbbe860c25df0ed44f626f61f055ab4e6 100644 --- a/test/dygraph_to_static/test_mnist_amp.py +++ b/test/dygraph_to_static/test_mnist_amp.py @@ -20,7 +20,7 @@ from dygraph_to_static_util import test_and_compare_with_new_ir from test_mnist import MNIST, SEED, TestMNIST import paddle -from paddle.fluid.optimizer import AdamOptimizer +from paddle.optimizer import Adam if paddle.fluid.is_compiled_with_cuda(): paddle.fluid.set_flags({'FLAGS_cudnn_deterministic': True}) @@ -58,9 +58,7 @@ class TestAMP(TestMNIST): print("Successfully to apply @to_static.") mnist = paddle.jit.to_static(mnist) - adam = AdamOptimizer( - learning_rate=0.001, parameter_list=mnist.parameters() - ) + adam = Adam(learning_rate=0.001, parameters=mnist.parameters()) scaler = paddle.amp.GradScaler(init_loss_scaling=1024) diff --git a/test/dygraph_to_static/test_mobile_net.py b/test/dygraph_to_static/test_mobile_net.py index c2fd027ea88091953c9e8d5a95381b42c91d7b88..db8d420094e1575da3e6d4c85bc3267e83dfda44 100644 --- a/test/dygraph_to_static/test_mobile_net.py +++ b/test/dygraph_to_static/test_mobile_net.py @@ -446,11 +446,11 @@ class MobileNetV2(paddle.nn.Layer): def create_optimizer(args, parameter_list): - optimizer = fluid.optimizer.Momentum( + optimizer = paddle.optimizer.Momentum( learning_rate=args.lr, momentum=args.momentum_rate, - regularization=paddle.regularizer.L2Decay(args.l2_decay), - parameter_list=parameter_list, + weight_decay=paddle.regularizer.L2Decay(args.l2_decay), + parameters=parameter_list, ) return optimizer diff --git a/test/dygraph_to_static/test_ptb_lm.py b/test/dygraph_to_static/test_ptb_lm.py index b2b02489db55cf1e37049e9dfbe970f1399bbc2f..540586c4ee97238f1fbc1f90434ef3e71c0d2e91 100644 --- a/test/dygraph_to_static/test_ptb_lm.py +++ b/test/dygraph_to_static/test_ptb_lm.py @@ -21,8 +21,8 @@ import numpy as np import paddle from paddle import fluid from paddle.fluid.dygraph.base import to_variable -from paddle.fluid.optimizer import SGDOptimizer from paddle.jit.api import to_static +from paddle.optimizer import SGD PRINT_STEP = 20 SEED = 2020 @@ -247,9 +247,7 @@ def train(place): dropout=dropout, ) - sgd = SGDOptimizer( - learning_rate=1e-3, parameter_list=ptb_model.parameters() - ) + sgd = SGD(learning_rate=1e-3, parameters=ptb_model.parameters()) for epoch_id in range(max_epoch): total_loss = 0.0 diff --git a/test/dygraph_to_static/test_resnet.py b/test/dygraph_to_static/test_resnet.py index bcdc5008d253e6e951a8a285dc15d58e43b06939..9299922f91dc0ab36b45354ef3d8b26370951c99 100644 --- a/test/dygraph_to_static/test_resnet.py +++ b/test/dygraph_to_static/test_resnet.py @@ -46,11 +46,11 @@ if fluid.is_compiled_with_cuda(): def optimizer_setting(parameter_list=None): - optimizer = fluid.optimizer.Momentum( + optimizer = paddle.optimizer.Momentum( learning_rate=base_lr, momentum=momentum_rate, - regularization=paddle.regularizer.L2Decay(l2_decay), - parameter_list=parameter_list, + weight_decay=paddle.regularizer.L2Decay(l2_decay), + parameters=parameter_list, ) return optimizer diff --git 
a/test/dygraph_to_static/test_save_inference_model.py b/test/dygraph_to_static/test_save_inference_model.py index 84e3280a8a4bced83d9abd0097cd8882e8caf421..231dd38bccdfbca2dfcd86192322a8298849ef77 100644 --- a/test/dygraph_to_static/test_save_inference_model.py +++ b/test/dygraph_to_static/test_save_inference_model.py @@ -64,8 +64,8 @@ class TestDyToStaticSaveInferenceModel(unittest.TestCase): x = fluid.dygraph.to_variable(x_data) layer = SimpleFcLayer(fc_size) - adam = fluid.optimizer.SGD( - learning_rate=0.1, parameter_list=layer.parameters() + adam = paddle.optimizer.SGD( + learning_rate=0.1, parameters=layer.parameters() ) for i in range(5): diff --git a/test/dygraph_to_static/test_save_load.py b/test/dygraph_to_static/test_save_load.py index b2dc916ddbabc062dcba19043429ce66e07a3809..d817f77913fd8cea50b8546b1c24698e72f301fe 100644 --- a/test/dygraph_to_static/test_save_load.py +++ b/test/dygraph_to_static/test_save_load.py @@ -24,8 +24,8 @@ import paddle import paddle.nn.functional as F from paddle import fluid, nn from paddle.fluid import core -from paddle.fluid.optimizer import AdamOptimizer from paddle.nn import BatchNorm +from paddle.optimizer import Adam np.random.seed(2020) @@ -75,9 +75,7 @@ class TestDyToStaticSaveLoad(unittest.TestCase): paddle.jit.enable_to_static(True) x = fluid.dygraph.to_variable(x_data) net = Linear(32, 64) - adam = AdamOptimizer( - learning_rate=0.1, parameter_list=net.parameters() - ) + adam = Adam(learning_rate=0.1, parameters=net.parameters()) for i in range(batch_num): static_out, static_loss = net(x) diff --git a/test/dygraph_to_static/test_se_resnet.py b/test/dygraph_to_static/test_se_resnet.py index a06a603715075ea3555e214406854364609b9e77..5e0eb039188d9159ec97323c7fca25070ad31cab 100644 --- a/test/dygraph_to_static/test_se_resnet.py +++ b/test/dygraph_to_static/test_se_resnet.py @@ -78,13 +78,13 @@ def optimizer_setting(params, parameter_list): bd = [step * e for e in ls["epochs"]] lr = params["lr"] num_epochs = params["num_epochs"] - optimizer = fluid.optimizer.Momentum( - learning_rate=fluid.layers.cosine_decay( - learning_rate=lr, step_each_epoch=step, epochs=num_epochs + optimizer = paddle.optimizer.Momentum( + learning_rate=paddle.optimizer.lr.CosineAnnealingDecay( + learning_rate=lr, T_max=num_epochs ), momentum=momentum_rate, - regularization=paddle.regularizer.L2Decay(l2_decay), - parameter_list=parameter_list, + weight_decay=paddle.regularizer.L2Decay(l2_decay), + parameters=parameter_list, ) return optimizer diff --git a/test/dygraph_to_static/test_seq2seq.py b/test/dygraph_to_static/test_seq2seq.py index ec8df28d8bb2018629680ae1cd9ceed22d6b3bec..3478bf47efed3e73aaae6accbfbc1fba4e774fa7 100644 --- a/test/dygraph_to_static/test_seq2seq.py +++ b/test/dygraph_to_static/test_seq2seq.py @@ -70,11 +70,11 @@ def train(args, attn_model=False): dropout=args.dropout, ) - global_norm_clip = ClipGradByGlobalNorm(args.max_grad_norm) - optimizer = fluid.optimizer.SGD( + global_norm_clip = ClipGradByGlobalNorm(args.max_grad_norm) + optimizer = paddle.optimizer.SGD( args.learning_rate, - parameter_list=model.parameters(), - grad_clip=global_norm_clip, + parameters=model.parameters(), + grad_clip=global_norm_clip, ) model.train() diff --git a/test/dygraph_to_static/test_simnet.py b/test/dygraph_to_static/test_simnet.py index fc966092ae9665695c1d384863a5b73563d5c312..2c69cf2072cf92658455594e23b5b109d7a6339a 100644 --- a/test/dygraph_to_static/test_simnet.py +++ b/test/dygraph_to_static/test_simnet.py @@ -141,12 +141,12 @@ def train(conf_dict, to_static):
net = BOW(conf_dict) loss = HingeLoss(conf_dict) - optimizer = fluid.optimizer.AdamOptimizer( + optimizer = paddle.optimizer.Adam( learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-08, - parameter_list=net.parameters(), + parameters=net.parameters(), ) metric = paddle.metric.Auc(name="auc") diff --git a/test/dygraph_to_static/test_tsm.py b/test/dygraph_to_static/test_tsm.py index 1f5c01b017b85b50e98778781e669e4ef717e42b..3febac5d22799e06f1cccc4dbf4da6be07f6b299 100644 --- a/test/dygraph_to_static/test_tsm.py +++ b/test/dygraph_to_static/test_tsm.py @@ -281,11 +281,13 @@ def create_optimizer(cfg, params): l2_weight_decay = cfg.l2_weight_decay momentum = cfg.momentum - optimizer = fluid.optimizer.Momentum( - learning_rate=fluid.layers.piecewise_decay(boundaries=bd, values=lr), + optimizer = paddle.optimizer.Momentum( + learning_rate=paddle.optimizer.lr.PiecewiseDecay( + boundaries=bd, values=lr + ), momentum=momentum, - regularization=paddle.regularizer.L2Decay(l2_weight_decay), - parameter_list=params, + weight_decay=paddle.regularizer.L2Decay(l2_weight_decay), + parameters=params, ) return optimizer diff --git a/test/dygraph_to_static/test_word2vec.py b/test/dygraph_to_static/test_word2vec.py index 72b4af985c28762159907c0585ab88be1fa3e6a8..424d1f3a7ef83e1e0d39a834a67da748a47c5e45 100644 --- a/test/dygraph_to_static/test_word2vec.py +++ b/test/dygraph_to_static/test_word2vec.py @@ -292,9 +292,9 @@ def train(to_static): skip_gram_model = SkipGram( "skip_gram_model", vocab_size, embedding_size ) - adam = fluid.optimizer.AdamOptimizer( + adam = paddle.optimizer.Adam( learning_rate=learning_rate, - parameter_list=skip_gram_model.parameters(), + parameters=skip_gram_model.parameters(), ) step = 0 diff --git a/test/dygraph_to_static/test_yolov3.py b/test/dygraph_to_static/test_yolov3.py index 0d4ed8901d5c291a2a8102471180438e0bac3b3d..4b649107bc303a69662d0824bb9586b900938bbf 100644 --- a/test/dygraph_to_static/test_yolov3.py +++ b/test/dygraph_to_static/test_yolov3.py @@ -105,12 +105,11 @@ def train(to_static): start_lr=0.0, end_lr=cfg.learning_rate, ) - - optimizer = fluid.optimizer.Momentum( + optimizer = paddle.optimizer.Momentum( learning_rate=lr, - regularization=paddle.regularizer.L2Decay(cfg.weight_decay), + weight_decay=paddle.regularizer.L2Decay(cfg.weight_decay), momentum=cfg.momentum, - parameter_list=model.parameters(), + parameters=model.parameters(), ) start_time = time.time() diff --git a/test/ir/inference/test_trt_conv_quant_dequant_pass.py b/test/ir/inference/test_trt_conv_quant_dequant_pass.py index a258381a34a130bcaa28fc0823f2023ba424e565..c7f7d03cf3ca65c1032ee74fb17907d69bce4ee9 100644 --- a/test/ir/inference/test_trt_conv_quant_dequant_pass.py +++ b/test/ir/inference/test_trt_conv_quant_dequant_pass.py @@ -72,7 +72,7 @@ class QuantDequantTensorRTSubgraphPassConvTest(QuantDequantTest): with fluid.unique_name.guard(): with fluid.program_guard(self.main_program, self.startup_program): self.loss, result = network() - opt = fluid.optimizer.Adam(learning_rate=0.0001) + opt = paddle.optimizer.Adam(learning_rate=0.0001) opt.minimize(self.loss) with fluid.unique_name.guard(): with fluid.program_guard( @@ -182,7 +182,7 @@ class DynamicShapeQuantDequantTensorRTSubgraphPassConvTest(QuantDequantTest): with fluid.unique_name.guard(): with fluid.program_guard(self.main_program, self.startup_program): self.loss, result = network() - opt = fluid.optimizer.Adam(learning_rate=0.0001) + opt = paddle.optimizer.Adam(learning_rate=0.0001) opt.minimize(self.loss) with fluid.unique_name.guard(): 
with fluid.program_guard( @@ -290,7 +290,7 @@ class QuantDequantTensorRTSubgraphPassConvTransposeTest(QuantDequantTest): with fluid.unique_name.guard(): with fluid.program_guard(self.main_program, self.startup_program): self.loss, result = network() - opt = fluid.optimizer.Adam(learning_rate=0.0001) + opt = paddle.optimizer.Adam(learning_rate=0.0001) opt.minimize(self.loss) with fluid.unique_name.guard(): with fluid.program_guard( diff --git a/test/ir/inference/test_trt_fc_fuse_quant_dequant_pass.py b/test/ir/inference/test_trt_fc_fuse_quant_dequant_pass.py index bf98055c3781e401ecf060861c16453b0fa722ba..5e93b651f24fcafa24525def2f168e050b2eee2b 100644 --- a/test/ir/inference/test_trt_fc_fuse_quant_dequant_pass.py +++ b/test/ir/inference/test_trt_fc_fuse_quant_dequant_pass.py @@ -57,7 +57,7 @@ class FCQuantDequantFusePassTRTDims3Cols1Test(QuantDequantTest): with fluid.unique_name.guard(): with fluid.program_guard(self.main_program, self.startup_program): self.loss, result = network() - opt = fluid.optimizer.Adam(learning_rate=0.0001) + opt = paddle.optimizer.Adam(learning_rate=0.0001) opt.minimize(self.loss) with fluid.unique_name.guard(): with fluid.program_guard( @@ -131,7 +131,7 @@ class FCQuantDequantFusePassTRTDims3Cols2Test(QuantDequantTest): with fluid.unique_name.guard(): with fluid.program_guard(self.main_program, self.startup_program): self.loss, result = network() - opt = fluid.optimizer.Adam(learning_rate=0.0001) + opt = paddle.optimizer.Adam(learning_rate=0.0001) opt.minimize(self.loss) with fluid.unique_name.guard(): with fluid.program_guard( @@ -207,7 +207,7 @@ class FCQuantDequantFusePassTRTDims3Cols3Test(QuantDequantTest): with fluid.unique_name.guard(): with fluid.program_guard(self.main_program, self.startup_program): self.loss, result = network() - opt = fluid.optimizer.Adam(learning_rate=0.0001) + opt = paddle.optimizer.Adam(learning_rate=0.0001) opt.minimize(self.loss) with fluid.unique_name.guard(): with fluid.program_guard( diff --git a/test/ir/inference/test_trt_matmul_quant_dequant.py b/test/ir/inference/test_trt_matmul_quant_dequant.py index 7c37837c40646b793a3912fd8eca1b0ebf317baa..808df546b624938a13b916168e87758f56099d57 100644 --- a/test/ir/inference/test_trt_matmul_quant_dequant.py +++ b/test/ir/inference/test_trt_matmul_quant_dequant.py @@ -66,7 +66,7 @@ class TensorRTMatMulQuantDequantDims3Test(QuantDequantTest): with fluid.unique_name.guard(): with fluid.program_guard(self.main_program, self.startup_program): self.loss, result = network() - opt = fluid.optimizer.Adam(learning_rate=0.0001) + opt = paddle.optimizer.Adam(learning_rate=0.0001) opt.minimize(self.loss) with fluid.unique_name.guard(): with fluid.program_guard( @@ -178,7 +178,7 @@ class TensorRTMatMulQuantDequantDims4Test(QuantDequantTest): with fluid.unique_name.guard(): with fluid.program_guard(self.main_program, self.startup_program): self.loss, result = network() - opt = fluid.optimizer.Adam(learning_rate=0.0001) + opt = paddle.optimizer.Adam(learning_rate=0.0001) opt.minimize(self.loss) with fluid.unique_name.guard(): with fluid.program_guard( @@ -289,7 +289,7 @@ class TensorRTMatMulQuantDequantDims3DynamicTest(QuantDequantTest): with fluid.unique_name.guard(): with fluid.program_guard(self.main_program, self.startup_program): self.loss, result = network() - opt = fluid.optimizer.Adam(learning_rate=0.0001) + opt = paddle.optimizer.Adam(learning_rate=0.0001) opt.minimize(self.loss) with fluid.unique_name.guard(): with fluid.program_guard( diff --git a/test/legacy_test/auto_checkpoint_utils.py 
b/test/legacy_test/auto_checkpoint_utils.py index 558b1fe7d86b13cab8a7aa80170854c3f8b7b3df..6f8a58a267655a9a06d0529fc9247c4a9457db76 100644 --- a/test/legacy_test/auto_checkpoint_utils.py +++ b/test/legacy_test/auto_checkpoint_utils.py @@ -77,7 +77,7 @@ class AutoCheckpointBase(unittest.TestCase): fc_tmp, label ) loss = paddle.mean(cross_entropy) - sgd = fluid.optimizer.SGD(learning_rate=1e-3) + sgd = paddle.optimizer.SGD(learning_rate=1e-3) if minimize: sgd.minimize(loss) return sgd, loss, image, label diff --git a/test/legacy_test/auto_parallel_autoconvert.py b/test/legacy_test/auto_parallel_autoconvert.py index 2a947adc030200fa9e18a7151793e7d652e47a3b..5e7b501ed623b9172153d2431bf7748621a1e511 100644 --- a/test/legacy_test/auto_parallel_autoconvert.py +++ b/test/legacy_test/auto_parallel_autoconvert.py @@ -134,7 +134,7 @@ def get_distributed_program(): loss, train_program, startup_program = mlp_forward( train_program, startup_program ) - optimizer = paddle.fluid.optimizer.SGDOptimizer(learning_rate=0.01) + optimizer = paddle.optimizer.SGD(learning_rate=0.01) optimizer = fleet.distributed_optimizer(optimizer) _, _, dist_startup_prog, dist_main_prog = optimizer.minimize( loss, startup_program diff --git a/test/legacy_test/auto_parallel_data_unshard.py b/test/legacy_test/auto_parallel_data_unshard.py index 3a2fc901c7174b6551a6c8a37a5ebf67e08a3d64..2d8552e9d24c83acac22d3c14a274f70e8ff9305 100644 --- a/test/legacy_test/auto_parallel_data_unshard.py +++ b/test/legacy_test/auto_parallel_data_unshard.py @@ -65,7 +65,7 @@ class TestDataUnshard(unittest.TestCase): dist_strategy = fleet.DistributedStrategy() dist_strategy.semi_auto = True fleet.init(is_collective=True, strategy=dist_strategy) - optimizer = paddle.fluid.optimizer.AdamOptimizer( + optimizer = paddle.optimizer.Adam( learning_rate=0.00001, beta1=0.9, beta2=0.999, @@ -145,7 +145,7 @@ class TestDataUnshard(unittest.TestCase): dist_strategy = fleet.DistributedStrategy() dist_strategy.semi_auto = True fleet.init(is_collective=True, strategy=dist_strategy) - optimizer = paddle.fluid.optimizer.AdamOptimizer( + optimizer = paddle.optimizer.Adam( learning_rate=0.00001, beta1=0.9, beta2=0.999, diff --git a/test/legacy_test/auto_parallel_save_load.py b/test/legacy_test/auto_parallel_save_load.py index 2bb88d9c16ae9b62f41870ece939aece3c0e9e31..8cbe8ffacf199231abfe523d4781e3d1e4d2da6f 100644 --- a/test/legacy_test/auto_parallel_save_load.py +++ b/test/legacy_test/auto_parallel_save_load.py @@ -131,7 +131,7 @@ def get_distributed_program(): train_program, startup_program ) - optimizer = paddle.fluid.optimizer.SGDOptimizer(learning_rate=0.01) + optimizer = paddle.optimizer.SGD(learning_rate=0.01) optimizer = fleet.distributed_optimizer(optimizer) _, _, dist_startup_prog, dist_main_prog = optimizer.minimize( loss, startup_program diff --git a/test/legacy_test/check_nan_inf_base.py b/test/legacy_test/check_nan_inf_base.py index 8db773ef27c28f71c9538f45a7f13b8d4b6dd156..fad2ebaa752e51dc04f1b7d4eb0e5b219b0abdbb 100644 --- a/test/legacy_test/check_nan_inf_base.py +++ b/test/legacy_test/check_nan_inf_base.py @@ -67,7 +67,7 @@ def net(): acc_top1 = paddle.static.accuracy(input=y_predict, label=y, k=1) avg_cost = paddle.mean(cost) - sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.05) + sgd_optimizer = paddle.optimizer.SGD(learning_rate=0.05) sgd_optimizer.minimize(avg_cost) return y_predict, avg_cost, acc_top1 diff --git a/test/legacy_test/dist_allreduce_op.py b/test/legacy_test/dist_allreduce_op.py index 
96f6b03fa041d10fc0f781999d549d033d8e5d50..2a35dbdf18143272ef2634a641099f2f37f7d49c 100644 --- a/test/legacy_test/dist_allreduce_op.py +++ b/test/legacy_test/dist_allreduce_op.py @@ -104,9 +104,9 @@ class TestDistMnist2x2(TestDistRunnerBase): # Optimization # TODO(typhoonzero): fix distributed adam optimizer - # opt = fluid.optimizer.AdamOptimizer( + # opt = paddle.optimizer.Adam( # learning_rate=0.001, beta1=0.9, beta2=0.999) - opt = fluid.optimizer.Momentum(learning_rate=self.lr, momentum=0.9) + opt = paddle.optimizer.Momentum(learning_rate=self.lr, momentum=0.9) if single_device: opt.minimize(avg_cost) else: diff --git a/test/legacy_test/dist_ctr.py b/test/legacy_test/dist_ctr.py index 148203d61ec68b45b6f5427a0414fd27e71c4607..3edee71832412b5e4ac260771269936e24106933 100644 --- a/test/legacy_test/dist_ctr.py +++ b/test/legacy_test/dist_ctr.py @@ -122,8 +122,8 @@ class TestDistCTR2x2(TestDistRunnerBase): gamma=0.999, ) - sgd_optimizer = fluid.optimizer.SGD( - learning_rate=lr, regularization=regularization + sgd_optimizer = paddle.optimizer.SGD( + learning_rate=lr, weight_decay=regularization ) sgd_optimizer.minimize(avg_cost) diff --git a/test/legacy_test/dist_fleet_raw_program_optimizer.py b/test/legacy_test/dist_fleet_raw_program_optimizer.py index 8532b09da91f6370feb67e210f8d64641a14d0f6..69b5bf88702f4229968616eb3999f05bcd97e925 100644 --- a/test/legacy_test/dist_fleet_raw_program_optimizer.py +++ b/test/legacy_test/dist_fleet_raw_program_optimizer.py @@ -103,7 +103,7 @@ class TestFleetMetaOptimizerPrecision(TestDistRunnerBase): paddle.dataset.mnist.test(), batch_size=batch_size ) - optimizer = paddle.fluid.optimizer.Adam(0.01) + optimizer = paddle.optimizer.Adam(0.01) if single_device: optimizer.minimize(avg_cost) else: diff --git a/test/legacy_test/dist_fleet_raw_program_optimizer_fuse_allreduce.py b/test/legacy_test/dist_fleet_raw_program_optimizer_fuse_allreduce.py index 5a4ca8efa61d247e7b50e10a588bf138edd418a1..f19bb7424c6ab58df561b2efe152817ff87ccde9 100644 --- a/test/legacy_test/dist_fleet_raw_program_optimizer_fuse_allreduce.py +++ b/test/legacy_test/dist_fleet_raw_program_optimizer_fuse_allreduce.py @@ -103,7 +103,7 @@ class TestFleetMetaOptimizerFuseAllReducePrecision(TestDistRunnerBase): paddle.dataset.mnist.test(), batch_size=batch_size ) - optimizer = paddle.fluid.optimizer.Adam(0.01) + optimizer = paddle.optimizer.Adam(0.01) if single_device: optimizer.minimize(avg_cost) else: diff --git a/test/legacy_test/dist_fleet_sync_batch_norm.py b/test/legacy_test/dist_fleet_sync_batch_norm.py index fff2f5bdea44b8dcc116717319e1352baaded964..09e11bda2d67134359f43ca406e00228b22e11a8 100644 --- a/test/legacy_test/dist_fleet_sync_batch_norm.py +++ b/test/legacy_test/dist_fleet_sync_batch_norm.py @@ -62,7 +62,7 @@ def get_program(args): sigmoid = paddle.nn.functional.sigmoid(bn) out = paddle.sum(sigmoid) if not args.only_forward: - sgd_opt = fluid.optimizer.SGD(learning_rate=0.0) + sgd_opt = paddle.optimizer.SGD(learning_rate=0.0) opt = fleet.distributed_optimizer(sgd_opt) opt.minimize(out) return main, startup, [out, conv, bn] diff --git a/test/legacy_test/dist_hapi_mnist_dynamic.py b/test/legacy_test/dist_hapi_mnist_dynamic.py index 66b5f66119b4f8e5eea03477aaba55134a66b13f..64983a50548537c7e5172f308a174dab353fecf2 100644 --- a/test/legacy_test/dist_hapi_mnist_dynamic.py +++ b/test/legacy_test/dist_hapi_mnist_dynamic.py @@ -63,8 +63,8 @@ class TestDistTraining(unittest.TestCase): labels = [Input([None, 1], 'int64', 'label')] model = Model(LeNet(), inputs, labels) - optim = 
fluid.optimizer.Momentum( - learning_rate=0.001, momentum=0.9, parameter_list=model.parameters() + optim = paddle.optimizer.Momentum( + learning_rate=0.001, momentum=0.9, parameters=model.parameters() ) model.prepare(optim, CrossEntropyLoss(), Accuracy()) diff --git a/test/legacy_test/dist_hapi_mnist_static.py b/test/legacy_test/dist_hapi_mnist_static.py index c465ef7fe85a368e4226bdb920ade3b950fb4f3a..9229e34529ba5bf99da875a8eab6364a5197505b 100644 --- a/test/legacy_test/dist_hapi_mnist_static.py +++ b/test/legacy_test/dist_hapi_mnist_static.py @@ -64,8 +64,8 @@ class TestDistTraining(unittest.TestCase): labels = [Input([None, 1], 'int64', 'label')] model = Model(LeNet(), inputs, labels) - optim = fluid.optimizer.Momentum( - learning_rate=0.001, momentum=0.9, parameter_list=model.parameters() + optim = paddle.optimizer.Momentum( + learning_rate=0.001, momentum=0.9, parameters=model.parameters() ) model.prepare(optim, CrossEntropyLoss(), Accuracy()) diff --git a/test/legacy_test/dist_mnist.py b/test/legacy_test/dist_mnist.py index 31d38716e18d56d9ae2324fb55062c89fc5d734b..925a6f4f6739e037ff3cbff948a4e7bff3ddef60 100644 --- a/test/legacy_test/dist_mnist.py +++ b/test/legacy_test/dist_mnist.py @@ -95,10 +95,10 @@ class TestDistMnist2x2(TestDistRunnerBase): inference_program = fluid.default_main_program().clone() # Optimization # TODO(typhoonzero): fix distributed adam optimizer - # opt = fluid.optimizer.AdamOptimizer( + # opt = paddle.optimizer.Adam( # learning_rate=0.001, beta1=0.9, beta2=0.999) if not use_dgc: - opt = fluid.optimizer.Momentum(learning_rate=self.lr, momentum=0.9) + opt = paddle.optimizer.Momentum(learning_rate=self.lr, momentum=0.9) else: opt = paddle.distributed.fleet.meta_optimizers.DGCMomentumOptimizer( learning_rate=self.lr, momentum=0.9, rampup_begin_step=2 diff --git a/test/legacy_test/dist_mnist_batch_merge.py b/test/legacy_test/dist_mnist_batch_merge.py index 41ac77c265ef3cad2ac7fe7cdcf2829a6cc0199c..e6a5070a53c99103b19b275e389b78dafdb90e23 100644 --- a/test/legacy_test/dist_mnist_batch_merge.py +++ b/test/legacy_test/dist_mnist_batch_merge.py @@ -58,7 +58,7 @@ class TestDistMnist2x2(TestDistRunnerBase): inference_program = fluid.default_main_program().clone() # Optimization - opt = fluid.optimizer.Momentum(learning_rate=0.001, momentum=0.9) + opt = paddle.optimizer.Momentum(learning_rate=0.001, momentum=0.9) # Reader train_reader = paddle.batch(test_merge_reader, batch_size=batch_size) diff --git a/test/legacy_test/dist_mnist_dgc.py b/test/legacy_test/dist_mnist_dgc.py index 6919c7b8ed2129b384126bf1681cf1b793d4d549..6f9d892cbd5702cf59609b0c2cd9a1c940787b2e 100644 --- a/test/legacy_test/dist_mnist_dgc.py +++ b/test/legacy_test/dist_mnist_dgc.py @@ -97,7 +97,7 @@ class TestDistMnistDGC(TestDistRunnerBase): inference_program = fluid.default_main_program().clone() if not use_dgc: - opt = fluid.optimizer.Momentum(learning_rate=self.lr, momentum=0.9) + opt = paddle.optimizer.Momentum(learning_rate=self.lr, momentum=0.9) else: opt = paddle.distributed.fleet.meta_optimizers.DGCMomentumOptimizer( learning_rate=self.lr, diff --git a/test/legacy_test/dist_mnist_fp16_allreduce.py b/test/legacy_test/dist_mnist_fp16_allreduce.py index 0e225dec1dbf49b5fbf30bbf4b45bdcdb5f57206..44626be4f01dad7e6db34950b5d4a0ea967d1dd9 100644 --- a/test/legacy_test/dist_mnist_fp16_allreduce.py +++ b/test/legacy_test/dist_mnist_fp16_allreduce.py @@ -53,9 +53,7 @@ class TestDistMnist2x2(TestDistRunnerBase): inference_program = fluid.default_main_program().clone() # Optimization - opt = 
fluid.optimizer.MomentumOptimizer( - learning_rate=0.001, momentum=0.9 - ) + opt = paddle.optimizer.Momentum(learning_rate=0.001, momentum=0.9) opt = FP16AllReduce(opt) diff --git a/test/legacy_test/dist_mnist_lars.py b/test/legacy_test/dist_mnist_lars.py index ae82b2fcdf2d6a9bf85fc8ed3f7b6aae1e3d9725..9c963bc4cbb39398c4b8ee02b58eb316754e76a8 100644 --- a/test/legacy_test/dist_mnist_lars.py +++ b/test/legacy_test/dist_mnist_lars.py @@ -49,7 +49,7 @@ class TestDistMnist2x2(TestDistRunnerBase): inference_program = fluid.default_main_program().clone() # Optimization - opt = fluid.optimizer.LarsMomentumOptimizer( + opt = paddle.incubate.optimizer.LarsMomentumOptimizer( learning_rate=0.001, momentum=0.9 ) diff --git a/test/legacy_test/dist_se_resnext.py b/test/legacy_test/dist_se_resnext.py index a3d525a18a84fa6427b81c6f50bf5bfd994608d8..4609be2a5e290ef34a76e44f96e15e60563ef017 100644 --- a/test/legacy_test/dist_se_resnext.py +++ b/test/legacy_test/dist_se_resnext.py @@ -238,12 +238,12 @@ class DistSeResneXt2x2(TestDistRunnerBase): lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)] if not use_dgc: - optimizer = fluid.optimizer.Momentum( - learning_rate=fluid.layers.piecewise_decay( + optimizer = paddle.optimizer.Momentum( + learning_rate=paddle.optimizer.lr.PiecewiseDecay( boundaries=bd, values=lr ), momentum=0.9, - regularization=paddle.regularizer.L2Decay(1e-4), + weight_decay=paddle.regularizer.L2Decay(1e-4), ) else: optimizer = ( diff --git a/test/legacy_test/dist_sharding_save.py b/test/legacy_test/dist_sharding_save.py index a49b2d06d03e18c9a66a8a23dab22155c9a2533e..7483e02b48c5d5b805e8587afb1a2350ecaa9c4b 100755 --- a/test/legacy_test/dist_sharding_save.py +++ b/test/legacy_test/dist_sharding_save.py @@ -66,7 +66,7 @@ def runtime_main(): "sharding_degree": 2, } - optimizer = paddle.fluid.optimizer.Momentum( + optimizer = paddle.optimizer.Momentum( learning_rate=0.01, momentum=0.9 ) optimizer = fleet.distributed_optimizer( diff --git a/test/legacy_test/dist_text_classification.py b/test/legacy_test/dist_text_classification.py index 0736fb6a385505b6aa59f62fb62853fc295d2934..3a6bb6c152c9651aee2d54b3c71d7abc4b2bc4b5 100644 --- a/test/legacy_test/dist_text_classification.py +++ b/test/legacy_test/dist_text_classification.py @@ -114,7 +114,7 @@ def get_reader(word_dict, batch_size): def get_optimizer(learning_rate): - optimizer = fluid.optimizer.SGD(learning_rate=learning_rate) + optimizer = paddle.optimizer.SGD(learning_rate=learning_rate) return optimizer diff --git a/test/legacy_test/dist_word2vec.py b/test/legacy_test/dist_word2vec.py index 3764fd5c5dcbad69062d67a2190a013363337ba6..d6fcf02a43bdd8ff4f870077f0894fe31665c5c5 100644 --- a/test/legacy_test/dist_word2vec.py +++ b/test/legacy_test/dist_word2vec.py @@ -128,7 +128,7 @@ class TestDistWord2vec2x2(TestDistRunnerBase): inference_program = paddle.fluid.default_main_program().clone() - sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001) + sgd_optimizer = paddle.optimizer.SGD(learning_rate=0.001) sgd_optimizer.minimize(avg_cost) train_reader = paddle.batch( diff --git a/test/legacy_test/distributed_fused_lamb_test_base.py b/test/legacy_test/distributed_fused_lamb_test_base.py index 8ba552c845eefa0c64d6b1a7a37d4ddf799dd10c..b0926c93c20825333d7890f3a6f8ac79a747eede 100644 --- a/test/legacy_test/distributed_fused_lamb_test_base.py +++ b/test/legacy_test/distributed_fused_lamb_test_base.py @@ -197,7 +197,7 @@ def run_model(use_distributed_lamb, use_fp16, use_master_param_norm, **kwargs): amp_init = None if gm_steps > 1 and not 
use_distributed_lamb: - optimizer = paddle.fluid.optimizer.GradientMergeOptimizer( + optimizer = paddle.incubate.optimizer.GradientMergeOptimizer( optimizer, k_steps=gm_steps, avg=False ) diff --git a/test/legacy_test/fleet_heter_ps_training.py b/test/legacy_test/fleet_heter_ps_training.py index 4871506e58aaaec30a6b42fd008e7ee2e0119409..e2555e0c58a6914871778a51e65ed27c4b0364d2 100644 --- a/test/legacy_test/fleet_heter_ps_training.py +++ b/test/legacy_test/fleet_heter_ps_training.py @@ -121,7 +121,7 @@ def net(batch_size=4, lr=0.01): ''' -optimizer = fluid.optimizer.Adam(learning_rate=0.01) +optimizer = paddle.optimizer.Adam(learning_rate=0.01) role = role_maker.PaddleCloudRoleMaker() fleet.init(role) diff --git a/test/legacy_test/fleet_meta_optimizer_base.py b/test/legacy_test/fleet_meta_optimizer_base.py index d77bb07cbb83e985a66747882f9d230c0d52fece..acd7bb0a2f7d8c4ad46a9804744f123fcecae84f 100755 --- a/test/legacy_test/fleet_meta_optimizer_base.py +++ b/test/legacy_test/fleet_meta_optimizer_base.py @@ -145,16 +145,16 @@ class TestFleetMetaOptimizer(unittest.TestCase): with fluid.program_guard(train_prog, startup_prog): with fluid.unique_name.guard(): if name == 'momentum': - optimizer = paddle.fluid.optimizer.Momentum( + optimizer = paddle.optimizer.Momentum( learning_rate=0.01, momentum=0.9, - regularization=regularization, + weight_decay=regularization, grad_clip=grad_clip, ) elif name == 'adam': - optimizer = paddle.fluid.optimizer.Adam( + optimizer = paddle.optimizer.Adam( learning_rate=0.01, - regularization=regularization, + weight_decay=regularization, grad_clip=grad_clip, ) elif name == 'adamw': diff --git a/test/legacy_test/ir_memory_optimize_net_base.py b/test/legacy_test/ir_memory_optimize_net_base.py index ccaffe74d984ab9ce6b4567abc8426e698627154..795852e7b03410ab960647be15d63ea73a2ddfd9 100644 --- a/test/legacy_test/ir_memory_optimize_net_base.py +++ b/test/legacy_test/ir_memory_optimize_net_base.py @@ -66,7 +66,7 @@ class BuildIrMemOptBase(unittest.TestCase): label = paddle.static.data(name="label", shape=[-1, 1], dtype="int64") cost = network(data, label, len(self.word_dict)) - optimizer = fluid.optimizer.Adam(learning_rate=0.001) + optimizer = paddle.optimizer.Adam(learning_rate=0.001) optimizer.minimize(cost) build_strategy = fluid.BuildStrategy() build_strategy.enable_inplace = enable_inplace diff --git a/test/legacy_test/parallel_dygraph_sparse_embedding_over_height.py b/test/legacy_test/parallel_dygraph_sparse_embedding_over_height.py index bb18f304a694d2040b3c5fdf72cd4c4c0b537f46..f53fc1b72fc63f312d53a961853092097dd3015f 100644 --- a/test/legacy_test/parallel_dygraph_sparse_embedding_over_height.py +++ b/test/legacy_test/parallel_dygraph_sparse_embedding_over_height.py @@ -24,7 +24,6 @@ from legacy_test.parallel_dygraph_sparse_embedding import ( from legacy_test.test_dist_base import runtime_main import paddle -from paddle import fluid # global configs # using small `vocab_size` to test rows number over height @@ -50,8 +49,8 @@ class TestSparseEmbeddingOverHeight(TestSparseEmbedding): fake_sample_reader(), batch_size=batch_size, drop_last=True ) - optimizer = fluid.optimizer.SGD( - learning_rate=0.001, parameter_list=model.parameters() + optimizer = paddle.optimizer.SGD( + learning_rate=0.001, parameters=model.parameters() ) return model, train_reader, optimizer diff --git a/test/legacy_test/parallel_executor_test_base.py b/test/legacy_test/parallel_executor_test_base.py index f02b2322a9188344b2828d45477ed242d616704d..65034bdb1dd6996a3a87e43538551ad30a5a1e6b 100644 
--- a/test/legacy_test/parallel_executor_test_base.py +++ b/test/legacy_test/parallel_executor_test_base.py @@ -49,7 +49,7 @@ class TestParallelExecutorBase(unittest.TestCase): fuse_all_optimizer_ops=False, fuse_all_reduce_ops=False, fuse_relu_depthwise_conv=False, - optimizer=fluid.optimizer.Adam, + optimizer=paddle.optimizer.Adam, use_fast_executor=False, enable_sequential_execution=False, ): @@ -167,7 +167,7 @@ class TestParallelExecutorBase(unittest.TestCase): fuse_all_optimizer_ops=False, fuse_all_reduce_ops=False, fuse_relu_depthwise_conv=False, - optimizer=fluid.optimizer.Adam, + optimizer=paddle.optimizer.Adam, use_fast_executor=True, enable_sequential_execution=False, ): diff --git a/test/legacy_test/seresnext_net.py b/test/legacy_test/seresnext_net.py index e08a5ea8fc1b553e1290559f4dd0febee351a3b2..5ef504bc4049142864baeedaf8ae7ab6dcb44a7c 100644 --- a/test/legacy_test/seresnext_net.py +++ b/test/legacy_test/seresnext_net.py @@ -22,7 +22,6 @@ from seresnext_test_base import DeviceType from simple_nets import init_data import paddle -from paddle.fluid.layers.learning_rate_scheduler import cosine_decay os.environ['CPU_NUM'] = str(4) os.environ['FLAGS_cudnn_deterministic'] = str(1) @@ -179,12 +178,12 @@ def SE_ResNeXt50Small(use_feed): def optimizer(learning_rate=0.01): - optimizer = fluid.optimizer.Momentum( - learning_rate=cosine_decay( - learning_rate=learning_rate, step_each_epoch=2, epochs=1 + optimizer = paddle.optimizer.Momentum( + learning_rate=paddle.optimizer.lr.CosineAnnealingDecay( + learning_rate=learning_rate, T_max=1 ), momentum=0.9, - regularization=paddle.regularizer.L2Decay(1e-4), + weight_decay=paddle.regularizer.L2Decay(1e-4), ) return optimizer diff --git a/test/legacy_test/static_model_parallel_fused_attention.py b/test/legacy_test/static_model_parallel_fused_attention.py index 5ba641bb40693cd23385d8b490b828b8a0dbb83d..5110b0aa255c9c6727bad012308b274ae52eca54 100644 --- a/test/legacy_test/static_model_parallel_fused_attention.py +++ b/test/legacy_test/static_model_parallel_fused_attention.py @@ -131,7 +131,7 @@ class TestModelParallel(TestDistRunnerBase): rank = fleet.worker_index() if dist_strategy else None avg_cost = create_model(data_in, rank) - opt = fluid.optimizer.SGD(0.1) + opt = paddle.optimizer.SGD(0.1) if dist_strategy: dist_opt = fleet.distributed_optimizer( diff --git a/test/legacy_test/static_model_parallel_fused_feedforward.py b/test/legacy_test/static_model_parallel_fused_feedforward.py index 455748ade8771090d189d0848ac375c1d6fc508d..dcd502e58f89d28cfb5ca08415facdea46d312de 100644 --- a/test/legacy_test/static_model_parallel_fused_feedforward.py +++ b/test/legacy_test/static_model_parallel_fused_feedforward.py @@ -122,7 +122,7 @@ class TestModelParallel(TestDistRunnerBase): rank = fleet.worker_index() if dist_strategy else None avg_cost = create_model(data_in, rank) - opt = fluid.optimizer.SGD(0.1) + opt = paddle.optimizer.SGD(0.1) if dist_strategy: dist_opt = fleet.distributed_optimizer( diff --git a/test/legacy_test/static_model_parallel_fused_multi_transformer.py b/test/legacy_test/static_model_parallel_fused_multi_transformer.py index 08743a634af1d48e731642cecf3dd989efdc337a..4979fee8f9b99e3795dfccb4438eb32f3b931efd 100644 --- a/test/legacy_test/static_model_parallel_fused_multi_transformer.py +++ b/test/legacy_test/static_model_parallel_fused_multi_transformer.py @@ -164,7 +164,7 @@ class TestModelParallel(TestDistRunnerBase): rank = fleet.worker_index() if dist_strategy else None avg_cost = create_model(data_in, rank) - opt = 
fluid.optimizer.SGD(0.1) + opt = paddle.optimizer.SGD(0.1) if dist_strategy: dist_opt = fleet.distributed_optimizer( diff --git a/test/legacy_test/test_adam_op.py b/test/legacy_test/test_adam_op.py index ee8016954cbae36571edb50d4ef10afc6632c090..0c5707c8e59fadaab680b18ff40e7ab1e77630a9 100644 --- a/test/legacy_test/test_adam_op.py +++ b/test/legacy_test/test_adam_op.py @@ -793,305 +793,6 @@ class TestAdamOpV2(unittest.TestCase): paddle.enable_static() -class TestAdamOptimizer(unittest.TestCase): - def _test( - self, - place, - use_tensor=True, - use_fluid_api=True, - use_global_beta_pow=False, - flatten_param_grads=False, - ): - paddle.enable_static() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - SEED = 2021 - paddle.seed(SEED) - np.random.seed(SEED) - - a_np = np.random.random(size=(2, 2)).astype('float32') - b_np = np.random.random(size=(2, 2)).astype('float32') - label_np = np.random.randint(2, size=(2, 1)).astype('int64') - weight_attr1 = paddle.ParamAttr( - name="weight1", - initializer=paddle.nn.initializer.Constant(value=1.0), - trainable=True, - ) - weight_attr2 = paddle.ParamAttr( - name="weight2", - initializer=paddle.nn.initializer.Constant(value=2.0), - trainable=True, - ) - clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0) - - with paddle.static.program_guard(main_prog, startup_prog): - with paddle.utils.unique_name.guard(): - a = paddle.static.data(name="a", shape=[2, 2], dtype='float32') - b = paddle.static.data(name="b", shape=[2, 2], dtype='float32') - label = paddle.static.data( - name="label", shape=[2, 1], dtype='int64' - ) - - sum = paddle.add(a, b) - z = paddle.pow(sum, 2.0) - - fc_1 = paddle.static.nn.fc( - x=z, size=2, weight_attr=weight_attr1 - ) - prediction = paddle.static.nn.fc( - x=fc_1, - size=2, - weight_attr=weight_attr2, - activation='softmax', - ) - - cost = paddle.nn.functional.cross_entropy( - input=prediction, - label=label, - reduction='none', - use_softmax=False, - ) - loss = paddle.mean(cost) - beta1_init = 0.9 - beta2_init = 0.999 - epsilon_init = 1e-8 - if use_tensor: - beta1 = paddle.static.create_global_var( - shape=[1], - value=float(beta1_init), - dtype='float32', - persistable=True, - name="beta1", - ) - beta2 = paddle.static.create_global_var( - shape=[1], - value=float(beta2_init), - dtype='float32', - persistable=True, - name="beta2", - ) - epsilon = paddle.static.create_global_var( - shape=[1], - value=float(epsilon_init), - dtype='float32', - persistable=True, - name="epsilon", - ) - if use_fluid_api: - adam = fluid.optimizer.Adam( - learning_rate=0.01, - beta1=beta1, - beta2=beta2, - epsilon=epsilon, - use_global_beta_pow=use_global_beta_pow, - flatten_param_grads=flatten_param_grads, - align_size=256, - grad_clip=clip, - ) - else: - adam = paddle.optimizer.Adam( - learning_rate=0.01, - beta1=beta1, - beta2=beta2, - epsilon=epsilon, - grad_clip=clip, - ) - else: - if use_fluid_api: - adam = fluid.optimizer.Adam( - learning_rate=0.01, - beta1=beta1_init, - beta2=beta2_init, - epsilon=epsilon_init, - use_global_beta_pow=use_global_beta_pow, - flatten_param_grads=flatten_param_grads, - align_size=256, - grad_clip=clip, - ) - else: - adam = fluid.optimizer.Adam( - learning_rate=0.01, - beta1=beta1_init, - beta2=beta2_init, - epsilon=epsilon_init, - grad_clip=clip, - ) - - adam.minimize(loss) - - scope = fluid.Scope() - with fluid.scope_guard(scope): - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - print(f"Start run on {place}") - for epoch in range(10): - pred_res, loss_res = exe.run( - 
main_prog, - feed={"a": a_np, "b": b_np, "label": label_np}, - fetch_list=[prediction, loss], - ) - print( - "Epoch {} | Prediction[0]: {}, Loss: {}".format( - epoch, pred_res[0], loss_res - ) - ) - paddle.disable_static() - return pred_res, loss_res - - def _test_with_place(self, place): - preds = [] - losses = [] - - for use_tensor in [True, False]: - for use_fluid_api in [True, False]: - for use_global_beta_pow in [True, False]: - for flatten_param_grads in [True, False]: - pred, loss = self._test( - place, - use_tensor, - use_fluid_api, - use_global_beta_pow, - flatten_param_grads, - ) - preds.append(pred) - losses.append(loss) - for pred in preds: - np.testing.assert_allclose(pred, preds[0], rtol=1e-05) - for loss in losses: - np.testing.assert_allclose(loss, losses[0], rtol=1e-05) - - def test_adam_api(self): - # NOTE(zhiqiu): cpu and gpu has different seed, so should compare separatly. - self._test_with_place(paddle.CPUPlace()) - if core.is_compiled_with_cuda(): - self._test_with_place(paddle.CUDAPlace(0)) - - def test_adam_flatten_param_grads_with_regularizer(self): - # flatten_param_grads + regularizer is not supported yet. - paddle.enable_static() - main = fluid.Program() - weight_attr = paddle.ParamAttr( - name="weight1", - initializer=paddle.nn.initializer.Constant(value=1.0), - regularizer=paddle.regularizer.L1Decay(coeff=0.1), - trainable=True, - ) - with fluid.program_guard(main): - x = paddle.static.data(name='x', shape=[None, 13], dtype='float32') - y = paddle.static.data(name='y', shape=[None, 1], dtype='float32') - y_predict = paddle.static.nn.fc(x, size=1, weight_attr=weight_attr) - cost = paddle.nn.functional.square_error_cost( - input=y_predict, label=y - ) - avg_cost = paddle.mean(cost) - - adam = fluid.optimizer.AdamOptimizer( - 0.01, flatten_param_grads=True, align_size=256 - ) - adam.minimize(avg_cost) - paddle.disable_static() - - self.assertEqual(adam._flatten_param_grads, False) - - def test_adam_exception(self): - paddle.enable_static() - a = paddle.static.data(name="a", shape=[32, 32], dtype='float32') - b = paddle.static.data(name="b", shape=[32, 32], dtype='float32') - label = paddle.static.data(name="label", shape=[32, 1], dtype='int64') - - sum = paddle.add(a, b) - z = paddle.pow(sum, 2.0) - - fc_1 = paddle.static.nn.fc(x=z, size=128) - prediction = paddle.static.nn.fc(x=fc_1, size=2, activation='softmax') - - cost = paddle.nn.functional.cross_entropy( - input=prediction, label=label, reduction='none', use_softmax=False - ) - loss = paddle.mean(cost) - adam = fluid.optimizer.Adam(use_global_beta_pow=True) - adam.minimize(loss) - self.assertRaises(Exception, adam._get_global_accumulator, 'tmp') - adam._add_global_accumulator( - 'tmp', type=core.VarDesc.VarType.LOD_TENSOR - ) - adam._get_global_accumulator('tmp') - self.assertRaises( - Exception, - adam._add_global_accumulator, - adam._beta1_pow_acc_str, - type=core.VarDesc.VarType.LOD_TENSOR, - ) - paddle.disable_static() - - def test_adam_save_load(self): - paddle.disable_static() - a = paddle.rand([4, 10]) - linear = paddle.nn.Linear(10, 10) - b = linear(a) - state_dict = linear.state_dict() - paddle.save(state_dict, "paddle_dy.pdparams") - - scheduler = paddle.optimizer.lr.NoamDecay( - d_model=0.01, warmup_steps=100, verbose=True - ) - adam = paddle.fluid.optimizer.Adam( - learning_rate=scheduler, - parameter_list=linear.parameters(), - use_global_beta_pow=True, - ) - adam.minimize(b) - state_dict = adam.state_dict() - paddle.save(state_dict, "paddle_dy.pdopt") - para_state_dict = 
paddle.load("paddle_dy.pdparams") - opt_state_dict = paddle.load("paddle_dy.pdopt") - adam.set_state_dict(opt_state_dict) - - paddle.enable_static() - - def test_adam_save_load_error(self): - paddle.disable_static() - - def get_opt(dtype, shape): - with paddle.utils.unique_name.guard(): - paddle.set_default_dtype(dtype) - a = paddle.rand([4, 10]) - linear = paddle.nn.Linear(10, 10) - b = linear(a) - state_dict = linear.state_dict() - paddle.save(state_dict, "paddle_dy.pdparams") - - scheduler = paddle.optimizer.lr.NoamDecay( - d_model=0.01, warmup_steps=100, verbose=True - ) - adam = paddle.fluid.optimizer.Adam( - learning_rate=scheduler, - parameter_list=linear.parameters(), - use_global_beta_pow=True, - ) - adam.minimize(b) - return adam - - adam = get_opt('float32', [10, 10]) - - state_dict = adam.state_dict() - paddle.save(state_dict, "paddle_dy.pdopt") - para_state_dict = paddle.load("paddle_dy.pdparams") - opt_state_dict = paddle.load("paddle_dy.pdopt") - adam.set_state_dict(opt_state_dict) - - adam2 = get_opt('float64', [10, 10]) # dtype not match - self.assertRaises(AssertionError, adam2.set_state_dict, opt_state_dict) - - adam3 = get_opt('float32', [10, 10]) # shape not match - opt_state_dict['beta1_pow_acc_0'] = np.array( - [0.9, 0.9], dtype='float32' - ) - self.assertRaises(AssertionError, adam3.set_state_dict, opt_state_dict) - paddle.enable_static() - - class TestAdamOpV2Group(TestAdamOpV2): def test_adam_op(self): paddle.disable_static() diff --git a/test/legacy_test/test_adam_optimizer_fp32_fp64.py b/test/legacy_test/test_adam_optimizer_fp32_fp64.py index cecfb6dff79ed12ce09b9c5327b73ad703476172..b5a65b693890629a2f11044f02697f0e0b097b7b 100644 --- a/test/legacy_test/test_adam_optimizer_fp32_fp64.py +++ b/test/legacy_test/test_adam_optimizer_fp32_fp64.py @@ -38,7 +38,7 @@ def main_test_func(place, dtype): ) avg_cost = paddle.mean(cost) - adam_optimizer = fluid.optimizer.AdamOptimizer(0.01) + adam_optimizer = paddle.optimizer.Adam(0.01) adam_optimizer.minimize(avg_cost) fetch_list = [avg_cost] diff --git a/test/legacy_test/test_auto_parallel_cost_model.py b/test/legacy_test/test_auto_parallel_cost_model.py index 7cf8b2d399f1288a56b72982c200fa9e65332a34..9c32caf214e445780dd8cfe94fee07fb5a209512 100644 --- a/test/legacy_test/test_auto_parallel_cost_model.py +++ b/test/legacy_test/test_auto_parallel_cost_model.py @@ -150,7 +150,7 @@ def get_dist_prog(train_program, startup_program, dist_context, rank_id): ) fleet._user_defined_strategy = fleet.DistributedStrategy() - fleet.user_defined_optimizer = paddle.fluid.optimizer.AdamOptimizer() + fleet.user_defined_optimizer = paddle.optimizer.Adam() parallelizer = AutoParallelizer(fleet) parallelizer._dist_context = dist_context diff --git a/test/legacy_test/test_auto_parallel_dist_tensor.py b/test/legacy_test/test_auto_parallel_dist_tensor.py index 420e8b7f526e8223f15d6445b76bf854d599bb77..259739aaa0a1ec9f454dcd08dbe77525620607cf 100644 --- a/test/legacy_test/test_auto_parallel_dist_tensor.py +++ b/test/legacy_test/test_auto_parallel_dist_tensor.py @@ -49,7 +49,7 @@ def get_dist_prog( ) fleet._user_defined_strategy = fleet.DistributedStrategy() - fleet.user_defined_optimizer = paddle.fluid.optimizer.AdamOptimizer() + fleet.user_defined_optimizer = paddle.optimizer.Adam() parallelizer = AutoParallelizer(fleet) parallelizer._dist_context = dist_context diff --git a/test/legacy_test/test_auto_parallel_mapper.py b/test/legacy_test/test_auto_parallel_mapper.py index 
cae7c24a1614b15f868768bb2bf48ca8a8636936..bcf791dd9711f210751f5cf4b195299b5202e876 100644 --- a/test/legacy_test/test_auto_parallel_mapper.py +++ b/test/legacy_test/test_auto_parallel_mapper.py @@ -463,7 +463,7 @@ def get_dist_prog(train_program, startup_program, dist_context, rank_id): ) fleet._user_defined_strategy = fleet.DistributedStrategy() - fleet.user_defined_optimizer = paddle.fluid.optimizer.AdamOptimizer() + fleet.user_defined_optimizer = paddle.optimizer.Adam() parallelizer = AutoParallelizer(fleet) parallelizer._dist_context = dist_context diff --git a/test/legacy_test/test_auto_parallel_reshard.py b/test/legacy_test/test_auto_parallel_reshard.py index 4af3fc831abe451d778bd0f573192ab8127f81f9..0c1e1d0d576807f89e4b627a41dd22702addc58c 100644 --- a/test/legacy_test/test_auto_parallel_reshard.py +++ b/test/legacy_test/test_auto_parallel_reshard.py @@ -130,7 +130,7 @@ def get_dist_prog( ) fleet._user_defined_strategy = fleet.DistributedStrategy() - fleet.user_defined_optimizer = paddle.fluid.optimizer.AdamOptimizer() + fleet.user_defined_optimizer = paddle.optimizer.Adam() parallelizer = AutoParallelizer(fleet) parallelizer._dist_context = dist_context diff --git a/test/legacy_test/test_auto_parallel_reshard_dpmppp.py b/test/legacy_test/test_auto_parallel_reshard_dpmppp.py index b8afece8001cb52936e4d920fc5f2247962439cf..55778a582ba4e889b10b67c7674ddddaed69521d 100644 --- a/test/legacy_test/test_auto_parallel_reshard_dpmppp.py +++ b/test/legacy_test/test_auto_parallel_reshard_dpmppp.py @@ -114,7 +114,7 @@ def get_dist_prog(train_program, startup_program, dist_context, rank_id): ) fleet._user_defined_strategy = fleet.DistributedStrategy() - fleet.user_defined_optimizer = paddle.fluid.optimizer.AdamOptimizer() + fleet.user_defined_optimizer = paddle.optimizer.Adam() parallelizer = AutoParallelizer(fleet) parallelizer._dist_context = dist_context diff --git a/test/legacy_test/test_auto_parallel_reshard_mppp.py b/test/legacy_test/test_auto_parallel_reshard_mppp.py index ebc7b95290e691d55f2d98d618dc0ceab7e66e6e..c98f96fc30c6efd8023719413e5c952882d65171 100644 --- a/test/legacy_test/test_auto_parallel_reshard_mppp.py +++ b/test/legacy_test/test_auto_parallel_reshard_mppp.py @@ -128,7 +128,7 @@ def get_dist_prog(train_program, startup_program, dist_context, rank_id): ) fleet._user_defined_strategy = fleet.DistributedStrategy() - fleet.user_defined_optimizer = paddle.fluid.optimizer.AdamOptimizer() + fleet.user_defined_optimizer = paddle.optimizer.Adam() parallelizer = AutoParallelizer(fleet) parallelizer._dist_context = dist_context diff --git a/test/legacy_test/test_auto_parallel_reshard_serial.py b/test/legacy_test/test_auto_parallel_reshard_serial.py index 2ff75315725793a2be9190d5df1f4a7d85e074d9..4434a7817304aeabd8e57e5ec11b59fc1ec304d7 100644 --- a/test/legacy_test/test_auto_parallel_reshard_serial.py +++ b/test/legacy_test/test_auto_parallel_reshard_serial.py @@ -138,7 +138,7 @@ def get_dist_prog_with_parallelizer( print("mlp_forward after", flush=True) - optimizer = paddle.fluid.optimizer.AdamOptimizer( + optimizer = paddle.optimizer.Adam( learning_rate=0.00001, beta1=0.9, beta2=0.999, diff --git a/test/legacy_test/test_backward.py b/test/legacy_test/test_backward.py index 031664e4cb35d6d4339d967cdb94ca9afe16fa14..55fd9b85227bc005a67ade0f9c9b3d26155da2cb 100644 --- a/test/legacy_test/test_backward.py +++ b/test/legacy_test/test_backward.py @@ -72,7 +72,7 @@ class TestBackward(unittest.TestCase): loss = net.build_model() self._check_backward(loss, main) - optimizer = 
fluid.optimizer.SGD(learning_rate=0.1) + optimizer = paddle.optimizer.SGD(learning_rate=0.1) optimizer.minimize(loss) exe.run(startup) exe.run(feed=net.init_data()) @@ -161,7 +161,7 @@ class TestBackward(unittest.TestCase): with fluid.program_guard(main, startup): loss = net.build_model() - optimizer = fluid.optimizer.SGD(learning_rate=0.1) + optimizer = paddle.optimizer.SGD(learning_rate=0.1) optimizer.minimize(loss, parameter_list=parameter_list) exe.run(startup) exe.run(feed=net.init_data()) @@ -179,7 +179,7 @@ class TestBackward(unittest.TestCase): with fluid.program_guard(main, startup): loss = net.build_model() - optimizer = fluid.optimizer.SGD(learning_rate=0.1) + optimizer = paddle.optimizer.SGD(learning_rate=0.1) optimizer.minimize(loss, no_grad_set=no_grad_set) exe.run(startup) exe.run(feed=net.init_data()) diff --git a/test/legacy_test/test_buffer_shared_memory_reuse_pass.py b/test/legacy_test/test_buffer_shared_memory_reuse_pass.py index c12fa77f8c1b5af804cead97042e1cdcadd8cf7c..930b85a8201da2c70b3d8665627e8e8afb415945 100644 --- a/test/legacy_test/test_buffer_shared_memory_reuse_pass.py +++ b/test/legacy_test/test_buffer_shared_memory_reuse_pass.py @@ -56,7 +56,7 @@ class InplaceTestBase(unittest.TestCase): with fluid.program_guard(main_program, startup_program): with fluid.unique_name.guard(): loss = simple_fc_net() - adam = fluid.optimizer.Adam(learning_rate=1e-3) + adam = paddle.optimizer.Adam(learning_rate=1e-3) adam.minimize(loss) with fluid.scope_guard(scope): diff --git a/test/legacy_test/test_case.py b/test/legacy_test/test_case.py index e6323f68312b6a43ebd11a8bd1e9616977e11b05..48cca3b63ec6d8ccd9fd67b4647503615a3d8e6b 100644 --- a/test/legacy_test/test_case.py +++ b/test/legacy_test/test_case.py @@ -19,7 +19,7 @@ import numpy as np import paddle from paddle import fluid -from paddle.fluid import core, optimizer +from paddle.fluid import core from paddle.fluid.backward import append_backward from paddle.fluid.framework import Program, program_guard @@ -613,7 +613,7 @@ class TestMutiTask(unittest.TestCase): ) one = paddle.tensor.fill_constant(shape=[1], dtype='int32', value=1) - adam = optimizer.Adam(learning_rate=0.001) + adam = paddle.optimizer.Adam(learning_rate=0.001) adagrad = paddle.optimizer.Adagrad(learning_rate=0.001) def fn_1(): diff --git a/test/legacy_test/test_communicator_async.py b/test/legacy_test/test_communicator_async.py index 70408682192323c271047558b241625b7de261d3..fddbed8d7b638aaaf2287bc7aedb7cfc3412ed51 100644 --- a/test/legacy_test/test_communicator_async.py +++ b/test/legacy_test/test_communicator_async.py @@ -20,7 +20,6 @@ import paddle paddle.enable_static() -from paddle import fluid from paddle.distributed import fleet from paddle.distributed.fleet.base import role_maker @@ -45,7 +44,7 @@ class TestCommunicator(unittest.TestCase): fleet.init(role) avg_cost = self.net() - optimizer = fluid.optimizer.SGD(0.01) + optimizer = paddle.optimizer.SGD(0.01) strategy = paddle.distributed.fleet.DistributedStrategy() strategy.a_sync = True diff --git a/test/legacy_test/test_communicator_geo.py b/test/legacy_test/test_communicator_geo.py index 64a207160243d0ad047c71f4dfa015f2bf33e75f..1b1713c0601eb9243a12378ef1bd01202ffab9df 100644 --- a/test/legacy_test/test_communicator_geo.py +++ b/test/legacy_test/test_communicator_geo.py @@ -70,7 +70,7 @@ class TestCommunicatorGeoEnd2End(unittest.TestCase): def run_pserver(self, role, strategy): fleet.init(role) avg_cost, x, z, y = self.net() - optimizer = fluid.optimizer.SGD(0.01) + optimizer = 
paddle.optimizer.SGD(0.01) optimizer = fleet.distributed_optimizer(optimizer, strategy) optimizer.minimize(avg_cost) @@ -83,7 +83,7 @@ class TestCommunicatorGeoEnd2End(unittest.TestCase): fleet.init(role) avg_cost, x, z, y = self.net() - optimizer = fluid.optimizer.SGD(0.01) + optimizer = paddle.optimizer.SGD(0.01) optimizer = fleet.distributed_optimizer(optimizer, strategy) optimizer.minimize(avg_cost) diff --git a/test/legacy_test/test_communicator_ps_gpu.py b/test/legacy_test/test_communicator_ps_gpu.py index fb1bc87045c2d01b038f3be4cc61ea7499e8b8e5..f487366e516931eefcca3f7ee4c525c361ffcda5 100644 --- a/test/legacy_test/test_communicator_ps_gpu.py +++ b/test/legacy_test/test_communicator_ps_gpu.py @@ -58,7 +58,7 @@ class TestCommunicator(unittest.TestCase): cost = paddle.nn.functional.square_error_cost(input=x, label=y) avg_cost = paddle.mean(cost) - optimizer = fluid.optimizer.Adam(0.01) + optimizer = paddle.optimizer.Adam(0.01) strategy = paddle.distributed.fleet.DistributedStrategy() strategy.a_sync = True diff --git a/test/legacy_test/test_cond.py b/test/legacy_test/test_cond.py index 0f4d7daf606b4070bd5e01b8cbc98551946619b1..037ec5c5cce5cd8c3ab01e15a0fb658b87bc4b5b 100644 --- a/test/legacy_test/test_cond.py +++ b/test/legacy_test/test_cond.py @@ -695,7 +695,7 @@ class TestCondBackward(unittest.TestCase): ) i = paddle.static.data(name="i", shape=[1], dtype='int32') loss = cond_func(i, img, label) - optimizer = fluid.optimizer.SGD(learning_rate=0.1) + optimizer = paddle.optimizer.SGD(learning_rate=0.1) optimizer.minimize(loss) place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() diff --git a/test/legacy_test/test_dataloader_early_reset.py b/test/legacy_test/test_dataloader_early_reset.py index 2be75e5ee8cf0e98fe9114dabced23ff8391e3c4..1367faa4d94cf39728696cec98b2e32b33b919c9 100644 --- a/test/legacy_test/test_dataloader_early_reset.py +++ b/test/legacy_test/test_dataloader_early_reset.py @@ -36,7 +36,7 @@ class TestDataLoaderEarlyReset(unittest.TestCase): y = paddle.static.nn.fc(self.x, size=10) loss = paddle.mean(y) - optimizer = fluid.optimizer.SGD(learning_rate=1e-3) + optimizer = paddle.optimizer.SGD(learning_rate=1e-3) optimizer.minimize(loss) def get_place(self): diff --git a/test/legacy_test/test_dataset.py b/test/legacy_test/test_dataset.py index f673d232d0341bfceb93d8d656d4b6be9ccd3092..04e38849da2819f863db338e4e8ae088d7c0fa50 100644 --- a/test/legacy_test/test_dataset.py +++ b/test/legacy_test/test_dataset.py @@ -1134,7 +1134,7 @@ class TestDataset2(unittest.TestCase): fleet.init() except ImportError as e: print("warning: no mpi4py") - adam = fluid.optimizer.Adam(learning_rate=0.000005) + adam = paddle.optimizer.Adam(learning_rate=0.000005) try: adam = fleet.distributed_optimizer(adam) adam.minimize([fake_cost], [scope]) @@ -1206,7 +1206,7 @@ class TestDataset2(unittest.TestCase): fleet.init() except ImportError as e: print("warning: no mpi4py") - adam = fluid.optimizer.Adam(learning_rate=0.000005) + adam = paddle.optimizer.Adam(learning_rate=0.000005) try: adam = fleet.distributed_optimizer( adam, @@ -1339,7 +1339,7 @@ class TestDataset2(unittest.TestCase): fleet.init() except ImportError as e: print("warning: no mpi4py") - adam = fluid.optimizer.Adam(learning_rate=0.000005) + adam = paddle.optimizer.Adam(learning_rate=0.000005) try: adam = fleet.distributed_optimizer( adam, diff --git a/test/legacy_test/test_decoupled_py_reader.py b/test/legacy_test/test_decoupled_py_reader.py index ad7df44dcece051302a63bb68bbbb5b2dd9f822d..38ef00b1ddcf1ebd3e68c12753282feff81ea6e4 
100644 --- a/test/legacy_test/test_decoupled_py_reader.py +++ b/test/legacy_test/test_decoupled_py_reader.py @@ -77,7 +77,7 @@ def simple_fc_net(places, use_legacy_py_reader, use_double_buffer): ) ) - optimizer = fluid.optimizer.Adam() + optimizer = paddle.optimizer.Adam() optimizer.minimize(loss) return startup_prog, main_prog, py_reader, loss diff --git a/test/legacy_test/test_deprecated_memory_optimize_interfaces.py b/test/legacy_test/test_deprecated_memory_optimize_interfaces.py index abfafbb889d15304cb0343a6c4f497e1f41771b9..bcb17baaee7647865abaded3c7d5f27bae58517d 100644 --- a/test/legacy_test/test_deprecated_memory_optimize_interfaces.py +++ b/test/legacy_test/test_deprecated_memory_optimize_interfaces.py @@ -16,6 +16,7 @@ import unittest from simple_nets import simple_fc_net +import paddle from paddle import fluid from paddle.distributed import transpiler @@ -30,7 +31,7 @@ class DeprecatedMemoryOptimizationInterfaceTest(unittest.TestCase): with fluid.program_guard(main_prog, startup_prog): with fluid.unique_name.guard(): loss = simple_fc_net() - opt = fluid.optimizer.Adam(learning_rate=1e-3) + opt = paddle.optimizer.Adam(learning_rate=1e-3) opt.minimize(loss) if call_interface: diff --git a/test/legacy_test/test_desc_clone.py b/test/legacy_test/test_desc_clone.py index 831d0caf245143e8c6e1d0382e67742caef332e2..10f0134757a7974c63fdbced9707f7974ba5825c 100644 --- a/test/legacy_test/test_desc_clone.py +++ b/test/legacy_test/test_desc_clone.py @@ -89,9 +89,7 @@ def get_model(batch_size): inference_program = fluid.default_main_program().clone() # Optimization - opt = fluid.optimizer.AdamOptimizer( - learning_rate=0.001, beta1=0.9, beta2=0.999 - ) + opt = paddle.optimizer.Adam(learning_rate=0.001, beta1=0.9, beta2=0.999) # Reader train_reader = paddle.batch( diff --git a/test/legacy_test/test_dist_fleet_a_sync_optimizer_async.py b/test/legacy_test/test_dist_fleet_a_sync_optimizer_async.py index 2ab31734c4232bb8c2534551a114ca925097c555..94daccf0c59915d371608bc11794de0571094e44 100644 --- a/test/legacy_test/test_dist_fleet_a_sync_optimizer_async.py +++ b/test/legacy_test/test_dist_fleet_a_sync_optimizer_async.py @@ -54,7 +54,7 @@ class TestFleetGradientMergeMetaOptimizer(unittest.TestCase): strategy.a_sync = False strategy.a_sync_configs = {"launch_barrier": False} - optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) + optimizer = paddle.optimizer.SGD(learning_rate=0.01) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(avg_cost) @@ -91,7 +91,7 @@ class TestFleetGradientMergeMetaOptimizer(unittest.TestCase): strategy = paddle.distributed.fleet.DistributedStrategy() strategy.a_sync = True strategy.a_sync_configs = {"launch_barrier": False} - optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) + optimizer = paddle.optimizer.SGD(learning_rate=0.01) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(avg_cost) diff --git a/test/legacy_test/test_dist_fleet_a_sync_optimizer_auto.py b/test/legacy_test/test_dist_fleet_a_sync_optimizer_auto.py index 54675d48805b2c8af0a4b8d3a0a9cc8661d23a81..54a8e7c1de27d05c4cfa9b47361d7834c9ccbfb0 100644 --- a/test/legacy_test/test_dist_fleet_a_sync_optimizer_auto.py +++ b/test/legacy_test/test_dist_fleet_a_sync_optimizer_auto.py @@ -58,7 +58,7 @@ class TestFleetGradientMergeMetaOptimizer(unittest.TestCase): os.environ["FLAGS_LAUNCH_BARRIER"] = "0" strategy = paddle.distributed.fleet.DistributedStrategy() strategy.auto = True - optimizer = 
paddle.fluid.optimizer.Adam(learning_rate=0.01) + optimizer = paddle.optimizer.Adam(learning_rate=0.01) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(avg_cost) diff --git a/test/legacy_test/test_dist_fleet_a_sync_optimizer_auto_async.py b/test/legacy_test/test_dist_fleet_a_sync_optimizer_auto_async.py index b7386500c4313395e594b409c76c712074001d4d..a5a95370168ac62b6506214225e008ef23469a24 100644 --- a/test/legacy_test/test_dist_fleet_a_sync_optimizer_auto_async.py +++ b/test/legacy_test/test_dist_fleet_a_sync_optimizer_auto_async.py @@ -75,7 +75,7 @@ class TestFleetGradientMergeMetaOptimizer(unittest.TestCase): os.environ["FLAGS_LAUNCH_BARRIER"] = "0" strategy = paddle.distributed.fleet.DistributedStrategy() strategy.auto = True - optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) + optimizer = paddle.optimizer.SGD(learning_rate=0.01) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(avg_cost) diff --git a/test/legacy_test/test_dist_fleet_a_sync_optimizer_auto_geo.py b/test/legacy_test/test_dist_fleet_a_sync_optimizer_auto_geo.py index bde7a3d1820be2b3e25da0088c1d6b7b99343475..7bf6733c7c77b0150b03fc39309b8de3ed25feea 100644 --- a/test/legacy_test/test_dist_fleet_a_sync_optimizer_auto_geo.py +++ b/test/legacy_test/test_dist_fleet_a_sync_optimizer_auto_geo.py @@ -63,7 +63,7 @@ class TestFleetGradientMergeMetaOptimizer(unittest.TestCase): os.environ["FLAGS_LAUNCH_BARRIER"] = "0" strategy = paddle.distributed.fleet.DistributedStrategy() strategy.auto = True - optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) + optimizer = paddle.optimizer.SGD(learning_rate=0.01) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(avg_cost) diff --git a/test/legacy_test/test_dist_fleet_a_sync_optimizer_geo.py b/test/legacy_test/test_dist_fleet_a_sync_optimizer_geo.py index d16b53641919b73ca83025b987b9585c7b9ce108..7b565264f995b10b1ef5482dcc3e4ee182db97c2 100755 --- a/test/legacy_test/test_dist_fleet_a_sync_optimizer_geo.py +++ b/test/legacy_test/test_dist_fleet_a_sync_optimizer_geo.py @@ -58,7 +58,7 @@ class TestFleetGradientMergeMetaOptimizer(unittest.TestCase): strategy.a_sync = True strategy.a_sync_configs = {"k_steps": 100, "launch_barrier": False} - optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) + optimizer = paddle.optimizer.SGD(learning_rate=0.01) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(avg_cost) diff --git a/test/legacy_test/test_dist_fleet_a_sync_optimizer_sync.py b/test/legacy_test/test_dist_fleet_a_sync_optimizer_sync.py index 30139dd6d053fdb009fe521209c5ae9b30a91168..69e751a04419ea721078158de30d3c259677630a 100644 --- a/test/legacy_test/test_dist_fleet_a_sync_optimizer_sync.py +++ b/test/legacy_test/test_dist_fleet_a_sync_optimizer_sync.py @@ -46,7 +46,7 @@ class TestFleetGradientMergeMetaOptimizer(unittest.TestCase): strategy = paddle.distributed.fleet.DistributedStrategy() strategy.a_sync = False strategy.a_sync_configs = {"launch_barrier": False} - optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) + optimizer = paddle.optimizer.SGD(learning_rate=0.01) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(avg_cost) diff --git a/test/legacy_test/test_dist_fleet_base.py b/test/legacy_test/test_dist_fleet_base.py index 6e3b8854c34db49ba2280845449bfa73d73457e6..45657b71d99101cbaf8ca84d012c4aecb68edab2 100644 --- a/test/legacy_test/test_dist_fleet_base.py +++ 
b/test/legacy_test/test_dist_fleet_base.py @@ -126,7 +126,7 @@ class FleetDistRunnerBase: scheduler = paddle.optimizer.lr.ExponentialDecay( learning_rate=LEARNING_RATE, gamma=0.999, verbose=True ) - optimizer = fluid.optimizer.SGD(scheduler, grad_clip=grad_clip) + optimizer = paddle.optimizer.SGD(scheduler, grad_clip=grad_clip) """ # learning rate decay method before 2.0 optimizer = fluid.optimizer.SGD( @@ -137,7 +137,7 @@ class FleetDistRunnerBase: staircase=True)) """ else: - optimizer = fluid.optimizer.SGD(LEARNING_RATE, grad_clip=grad_clip) + optimizer = paddle.optimizer.SGD(LEARNING_RATE, grad_clip=grad_clip) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(avg_cost) diff --git a/test/legacy_test/test_dist_fleet_decay.py b/test/legacy_test/test_dist_fleet_decay.py index ba872213c349776949bcd58be3fde56a9a7cc9d7..fcf97e0bd507b5a471b0c7107bbb2fca916d93ee 100644 --- a/test/legacy_test/test_dist_fleet_decay.py +++ b/test/legacy_test/test_dist_fleet_decay.py @@ -15,7 +15,6 @@ import unittest import paddle -from paddle import fluid from paddle.distributed import fleet from paddle.distributed.fleet.base import role_maker @@ -74,7 +73,7 @@ class TestNoamDecay(unittest.TestCase): scheduler = paddle.optimizer.lr.NoamDecay( d_model=0.01, warmup_steps=100, verbose=True ) - optimizer = fluid.optimizer.Adam(scheduler) + optimizer = paddle.optimizer.Adam(scheduler) strategy = paddle.distributed.fleet.DistributedStrategy() strategy.a_sync = True diff --git a/test/legacy_test/test_dist_fleet_geo.py b/test/legacy_test/test_dist_fleet_geo.py index 95d67e95d5d909d350987da9097f718a4c06819c..1ae08214297882f4684fd2dc08dfd6de93f8040d 100644 --- a/test/legacy_test/test_dist_fleet_geo.py +++ b/test/legacy_test/test_dist_fleet_geo.py @@ -21,7 +21,6 @@ from dist_fleet_simnet_bow import train_network from test_dist_fleet_base import TestFleetBase import paddle -from paddle import fluid from paddle.distributed import fleet from paddle.distributed.fleet.base import role_maker @@ -82,7 +81,7 @@ class TestGeoSgdTranspiler(unittest.TestCase): avg_cost, _, _, _ = train_network(batch_size, is_distribute, is_sparse) - optimizer = fluid.optimizer.SGD(0.1) + optimizer = paddle.optimizer.SGD(0.1) optimizer = fleet.distributed_optimizer(optimizer, strategy) optimizer.minimize(avg_cost) diff --git a/test/legacy_test/test_dist_fleet_heter_base.py b/test/legacy_test/test_dist_fleet_heter_base.py index 484631156050906adad467a0c634f3415986f569..3f75352a03e566c54160af60c7a9a58737f6254a 100644 --- a/test/legacy_test/test_dist_fleet_heter_base.py +++ b/test/legacy_test/test_dist_fleet_heter_base.py @@ -27,7 +27,6 @@ import unittest from contextlib import closing import paddle -from paddle import fluid from paddle.distributed import fleet from paddle.distributed.fleet.base import role_maker @@ -150,7 +149,7 @@ class FleetDistHeterRunnerBase: return self.strategy def build_optimizer(self, avg_cost, strategy): - optimizer = fluid.optimizer.SGD(LEARNING_RATE) + optimizer = paddle.optimizer.SGD(LEARNING_RATE) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(avg_cost) diff --git a/test/legacy_test/test_dist_fleet_heter_program.py b/test/legacy_test/test_dist_fleet_heter_program.py index aad71627a9929bdb54da31cd53a1292e6c1fa6e7..eeefe24d051914db0a3ecd50477f1a3b02ef62ee 100644 --- a/test/legacy_test/test_dist_fleet_heter_program.py +++ b/test/legacy_test/test_dist_fleet_heter_program.py @@ -157,7 +157,7 @@ class TestDistFleetHeterProgram(unittest.TestCase): 
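# The fleet tests above now build their optimizers from paddle.optimizer and pass a
# paddle.optimizer.lr scheduler (and, where used, a grad_clip) straight to the constructor.
# A minimal, self-contained sketch of that pattern, assuming an illustrative linear model
# rather than the tests' own networks:
import paddle

model = paddle.nn.Linear(10, 1)
scheduler = paddle.optimizer.lr.ExponentialDecay(learning_rate=0.01, gamma=0.999)
opt = paddle.optimizer.SGD(
    learning_rate=scheduler,                        # a scheduler object is accepted directly
    grad_clip=paddle.nn.ClipGradByGlobalNorm(1.0),  # grad_clip goes to the constructor, as above
    parameters=model.parameters(),
)
loss = model(paddle.randn([4, 10])).mean()
loss.backward()
opt.step()
opt.clear_grad()
scheduler.step()  # advance the learning-rate schedule once per iteration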
return avg_cost def build_optimizer(self, avg_cost, strategy): - optimizer = fluid.optimizer.SGD(1e-2) + optimizer = paddle.optimizer.SGD(1e-2) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(avg_cost) diff --git a/test/legacy_test/test_dist_fleet_minimize.py b/test/legacy_test/test_dist_fleet_minimize.py index 59751a9604156b279ba9906064364ffc5e8b242f..cc317cd353eadf7da068f489ddd55926d6a93a56 100644 --- a/test/legacy_test/test_dist_fleet_minimize.py +++ b/test/legacy_test/test_dist_fleet_minimize.py @@ -254,7 +254,7 @@ class TestPSMinimize(unittest.TestCase): sparse_config['embedding'] = self.gen_sparse_config() strategy.fleet_desc_configs = sparse_config - optimizer = paddle.fluid.optimizer.Adam(learning_rate=0.01) + optimizer = paddle.optimizer.Adam(learning_rate=0.01) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(loss) diff --git a/test/legacy_test/test_dist_fleet_ps.py b/test/legacy_test/test_dist_fleet_ps.py index 03ee5bb67fcc5fb496c45b67713e2ada70ada81a..1b6becb18f5aeb128c2a3ea1fec46a3dde66e9b0 100644 --- a/test/legacy_test/test_dist_fleet_ps.py +++ b/test/legacy_test/test_dist_fleet_ps.py @@ -195,7 +195,7 @@ class TestPSPassWithBow(unittest.TestCase): fleet.init(role) loss, acc, _ = self.net() - optimizer = fluid.optimizer.SGD(base_lr) + optimizer = paddle.optimizer.SGD(base_lr) strategy = paddle.distributed.fleet.DistributedStrategy() strategy.a_sync = True diff --git a/test/legacy_test/test_dist_fleet_ps10.py b/test/legacy_test/test_dist_fleet_ps10.py index 8961f50d3028f8fc99fd90f6ef38233906488606..53da0fee0433eb688e0d856a4f420f698136a19b 100644 --- a/test/legacy_test/test_dist_fleet_ps10.py +++ b/test/legacy_test/test_dist_fleet_ps10.py @@ -18,7 +18,6 @@ os.environ["WITH_DISTRIBUTE"] = "ON" import unittest import paddle -from paddle import fluid from paddle.distributed import fleet from paddle.distributed.fleet.base import role_maker @@ -77,7 +76,7 @@ class TestExponentialDecay(unittest.TestCase): scheduler = paddle.optimizer.lr.InverseTimeDecay( learning_rate=base_lr, gamma=0.999, verbose=True ) - optimizer = fluid.optimizer.Adam(scheduler) + optimizer = paddle.optimizer.Adam(scheduler) strategy = paddle.distributed.fleet.DistributedStrategy() strategy.a_sync = True diff --git a/test/legacy_test/test_dist_fleet_ps11.py b/test/legacy_test/test_dist_fleet_ps11.py index d5a4c64423f7398de65d28cd97b7f9f5bdbe8f0c..44ad3514f64d82e35e39bd78f95a7c3d87d0abd0 100755 --- a/test/legacy_test/test_dist_fleet_ps11.py +++ b/test/legacy_test/test_dist_fleet_ps11.py @@ -195,7 +195,7 @@ class TestPSPassWithBow(unittest.TestCase): configs = {"use_ps_gpu": 1, "launch_barrier": False} strategy.a_sync_configs = configs strategy.a_sync = True - optimizer = paddle.fluid.optimizer.Adam(learning_rate=0.01) + optimizer = paddle.optimizer.Adam(learning_rate=0.01) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(loss) diff --git a/test/legacy_test/test_dist_fleet_ps12.py b/test/legacy_test/test_dist_fleet_ps12.py index dc0d0325a5a525be76a6a1ea0bad32a3e1c60213..7bc2fdcd479ae5691003187359d548f167fb977f 100644 --- a/test/legacy_test/test_dist_fleet_ps12.py +++ b/test/legacy_test/test_dist_fleet_ps12.py @@ -195,7 +195,7 @@ class TestPSPassWithBow(unittest.TestCase): configs = {"use_ps_gpu": 1} strategy.a_sync_configs = configs strategy.a_sync = True - optimizer = paddle.fluid.optimizer.Adam(learning_rate=0.01) + optimizer = paddle.optimizer.Adam(learning_rate=0.01) optimizer = 
fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(loss) diff --git a/test/legacy_test/test_dist_fleet_ps13.py b/test/legacy_test/test_dist_fleet_ps13.py index 2fbdbeba47f77eb1e24cef7f7802bfde37baa5ee..edced3b5c8ec95538edb036c4e39639bfad16709 100644 --- a/test/legacy_test/test_dist_fleet_ps13.py +++ b/test/legacy_test/test_dist_fleet_ps13.py @@ -201,7 +201,7 @@ class TestPSPassWithBow(unittest.TestCase): "table_parameters.__emb__.accessor.embedx_sgd_param.name": "SparseSharedAdamSGDRule", } strategy.sparse_table_configs = configs - optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) + optimizer = paddle.optimizer.SGD(learning_rate=0.01) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(loss) diff --git a/test/legacy_test/test_dist_fleet_ps2.py b/test/legacy_test/test_dist_fleet_ps2.py index f27e4172d12945d5cd84371b9d961aded124dfa8..d134a02d1fecb9093dec1ce218dc9307e11ee3f7 100644 --- a/test/legacy_test/test_dist_fleet_ps2.py +++ b/test/legacy_test/test_dist_fleet_ps2.py @@ -203,7 +203,7 @@ class TestPSPassWithBow(unittest.TestCase): "table_parameters.__emb__.accessor.embedx_sgd_param.name": "SparseAdamSGDRule", } strategy.sparse_table_configs = configs - optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) + optimizer = paddle.optimizer.SGD(learning_rate=0.01) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(loss) diff --git a/test/legacy_test/test_dist_fleet_ps3.py b/test/legacy_test/test_dist_fleet_ps3.py index 59ca7c7dc61884f078d9617c9c438102918f681d..286edccfebda29b2359ec58f01c0ffa8b047b1bb 100644 --- a/test/legacy_test/test_dist_fleet_ps3.py +++ b/test/legacy_test/test_dist_fleet_ps3.py @@ -195,7 +195,7 @@ class TestPSPassWithBow(unittest.TestCase): fleet.init(role) loss, acc, _ = self.net() - optimizer = fluid.optimizer.SGD(base_lr) + optimizer = paddle.optimizer.SGD(base_lr) strategy = paddle.distributed.fleet.DistributedStrategy() strategy.a_sync = True diff --git a/test/legacy_test/test_dist_fleet_ps4.py b/test/legacy_test/test_dist_fleet_ps4.py index 3d401885815ddc8ef565ab87feb6e5d420358aa3..1f749707077237cdeaa8342664bf4bacbed58168 100644 --- a/test/legacy_test/test_dist_fleet_ps4.py +++ b/test/legacy_test/test_dist_fleet_ps4.py @@ -189,7 +189,7 @@ class TestPSPassWithBow(unittest.TestCase): fleet.init(role) loss, acc, _ = self.net() - optimizer = fluid.optimizer.Adam(base_lr) + optimizer = paddle.optimizer.Adam(base_lr) strategy = paddle.distributed.fleet.DistributedStrategy() strategy.a_sync = True diff --git a/test/legacy_test/test_dist_fleet_ps5.py b/test/legacy_test/test_dist_fleet_ps5.py index a7c363bd8287aac168b6944f18bff518d321a40c..63290ee6f4d798ccb1245f6ba7dcdb8be416d9b8 100644 --- a/test/legacy_test/test_dist_fleet_ps5.py +++ b/test/legacy_test/test_dist_fleet_ps5.py @@ -196,7 +196,7 @@ class TestPSPassWithBow(unittest.TestCase): fleet.init(role) loss, acc, _ = self.net() - optimizer = fluid.optimizer.Adam( + optimizer = paddle.optimizer.Adam( learning_rate=paddle.optimizer.lr.ExponentialDecay( learning_rate=base_lr, gamma=0.969, diff --git a/test/legacy_test/test_dist_fleet_ps6.py b/test/legacy_test/test_dist_fleet_ps6.py index c4be4348c0c7e1422b95a367a7524c14f6321e32..8b96ed41a31fd4d48266f60eb72aefc218d06754 100644 --- a/test/legacy_test/test_dist_fleet_ps6.py +++ b/test/legacy_test/test_dist_fleet_ps6.py @@ -189,7 +189,7 @@ class TestPSPassWithBow(unittest.TestCase): fleet.init(role) loss, acc, _ = self.net() - optimizer = 
fluid.optimizer.Adam(base_lr) + optimizer = paddle.optimizer.Adam(base_lr) strategy = paddle.distributed.fleet.DistributedStrategy() strategy.a_sync = True diff --git a/test/legacy_test/test_dist_fleet_ps7.py b/test/legacy_test/test_dist_fleet_ps7.py index 63544ffeea9273d41c925c8ee7f4bfcef924f7fd..8e8070898d65cf7dab1360ffa5b8feae77cfab84 100644 --- a/test/legacy_test/test_dist_fleet_ps7.py +++ b/test/legacy_test/test_dist_fleet_ps7.py @@ -19,7 +19,6 @@ os.environ["WITH_DISTRIBUTE"] = "ON" import unittest import paddle -from paddle import fluid from paddle.distributed import fleet from paddle.distributed.fleet.base import role_maker @@ -78,7 +77,7 @@ class TestNaturalExpDecay(unittest.TestCase): scheduler = paddle.optimizer.lr.NaturalExpDecay( learning_rate=base_lr, gamma=0.999, verbose=True ) - optimizer = fluid.optimizer.Adam(scheduler) + optimizer = paddle.optimizer.Adam(scheduler) strategy = paddle.distributed.fleet.DistributedStrategy() strategy.a_sync = True diff --git a/test/legacy_test/test_dist_fleet_ps8.py b/test/legacy_test/test_dist_fleet_ps8.py index ed0a46ba940e54792f97242d560614245b6687c0..fc34adff69b9cc99909cff07baa93957dcc6248e 100644 --- a/test/legacy_test/test_dist_fleet_ps8.py +++ b/test/legacy_test/test_dist_fleet_ps8.py @@ -18,7 +18,6 @@ os.environ["WITH_DISTRIBUTE"] = "ON" import unittest import paddle -from paddle import fluid from paddle.distributed import fleet from paddle.distributed.fleet.base import role_maker @@ -77,7 +76,7 @@ class TestNoamDecay(unittest.TestCase): scheduler = paddle.optimizer.lr.NoamDecay( d_model=0.01, warmup_steps=100, verbose=True ) - optimizer = fluid.optimizer.Adam(scheduler) + optimizer = paddle.optimizer.Adam(scheduler) strategy = paddle.distributed.fleet.DistributedStrategy() strategy.a_sync = True diff --git a/test/legacy_test/test_dist_fleet_ps9.py b/test/legacy_test/test_dist_fleet_ps9.py index d56686c1576c6e99c0ae87af878031e5f783847c..2028358b36318f5aee721a96e69c5b28d2896182 100644 --- a/test/legacy_test/test_dist_fleet_ps9.py +++ b/test/legacy_test/test_dist_fleet_ps9.py @@ -18,7 +18,6 @@ os.environ["WITH_DISTRIBUTE"] = "ON" import unittest import paddle -from paddle import fluid from paddle.distributed import fleet from paddle.distributed.fleet.base import role_maker @@ -77,7 +76,7 @@ class TestExponentialDecay(unittest.TestCase): scheduler = paddle.optimizer.lr.ExponentialDecay( learning_rate=base_lr, gamma=0.999, verbose=True ) - optimizer = fluid.optimizer.Adam(scheduler) + optimizer = paddle.optimizer.Adam(scheduler) strategy = paddle.distributed.fleet.DistributedStrategy() strategy.a_sync = True diff --git a/test/legacy_test/test_dist_fleet_spmt.py b/test/legacy_test/test_dist_fleet_spmt.py index 17e6c03693d4aaa385cefd456f39311453349505..d9b5ad66b4fa3b8f0741067f569719cc98a0a69e 100644 --- a/test/legacy_test/test_dist_fleet_spmt.py +++ b/test/legacy_test/test_dist_fleet_spmt.py @@ -193,7 +193,7 @@ class TestSPMT(unittest.TestCase): # configs = {"use_ps_gpu": 1, "launch_barrier": False} # strategy.a_sync_configs = configs # strategy.a_sync = True - # optimizer = paddle.fluid.optimizer.Adam(learning_rate=0.01) + # optimizer = paddle.optimizer.Adam(learning_rate=0.01) # optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) # optimizer.minimize(loss) @@ -240,7 +240,7 @@ class TestSPMT(unittest.TestCase): with fluid.program_guard(main_program, startup_program): with fluid.unique_name.guard(): loss, acc, _ = self.net() - optimizer = paddle.fluid.optimizer.Adam(learning_rate=0.01) + optimizer = 
paddle.optimizer.Adam(learning_rate=0.01) optimizer.minimize(loss) print("===main_program====") print(main_program) diff --git a/test/legacy_test/test_dist_fleet_trainer_desc_config.py b/test/legacy_test/test_dist_fleet_trainer_desc_config.py index 9a9acb8541065b84fd4ae36d5b82777b87227fd4..96ef9a92cd1292adba0e5da7c57bd93177426f51 100644 --- a/test/legacy_test/test_dist_fleet_trainer_desc_config.py +++ b/test/legacy_test/test_dist_fleet_trainer_desc_config.py @@ -54,7 +54,7 @@ class TestDistStrategyTrainerDescConfig(unittest.TestCase): } strategy.trainer_desc_configs = config - optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) + optimizer = paddle.optimizer.SGD(learning_rate=0.01) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(avg_cost) @@ -67,7 +67,7 @@ class TestDistStrategyTrainerDescConfig(unittest.TestCase): int(os.environ["PADDLE_TRAINERS_NUM"]), ) - optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) + optimizer = paddle.optimizer.SGD(learning_rate=0.01) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize([avg_cost]) diff --git a/test/legacy_test/test_dist_mnist_fleetapi.py b/test/legacy_test/test_dist_mnist_fleetapi.py index bffa9c9b37a26b37f1390dddb8252317b873db37..96435a816377826d0867a8f0face6582d8990757 100644 --- a/test/legacy_test/test_dist_mnist_fleetapi.py +++ b/test/legacy_test/test_dist_mnist_fleetapi.py @@ -59,7 +59,7 @@ class FleetCollectiveTest(unittest.TestCase): hidden = paddle.static.nn.fc(x=data, size=10) loss = paddle.mean(hidden) - optimizer = fluid.optimizer.AdamOptimizer() + optimizer = paddle.optimizer.Adam() role = role_maker.UserDefinedCollectiveRoleMaker(0, ['127.0.0.1:6170']) fleet.init(role) diff --git a/test/legacy_test/test_dist_sparse_load_ps0.py b/test/legacy_test/test_dist_sparse_load_ps0.py index bd1ebef36f25edb326b1f9334ad677db4ad5bb36..985aa0d9337713e9470ba9393f4a12982e617e49 100644 --- a/test/legacy_test/test_dist_sparse_load_ps0.py +++ b/test/legacy_test/test_dist_sparse_load_ps0.py @@ -62,7 +62,7 @@ class SparseLoadOp(unittest.TestCase): with fluid.framework.program_guard(test_program, startup_program): with fluid.unique_name.guard(): loss = self.net(emb_array, fc_array) - optimizer = fluid.optimizer.Adam(1e-3) + optimizer = paddle.optimizer.Adam(1e-3) optimizer.minimize(loss) exe = fluid.Executor(fluid.CPUPlace()) @@ -107,7 +107,7 @@ class TestSparseLoadOpCase1(SparseLoadOp): loss = self.net(emb_array, fc_array) strategy = paddle.distributed.fleet.DistributedStrategy() strategy.a_sync = True - optimizer = fluid.optimizer.Adam(1e-3) + optimizer = paddle.optimizer.Adam(1e-3) optimizer = fleet.distributed_optimizer(optimizer, strategy) optimizer.minimize(loss) fleet.init_server(model_path) diff --git a/test/legacy_test/test_dist_sparse_load_ps1.py b/test/legacy_test/test_dist_sparse_load_ps1.py index c540250d508f0872ca4855a8d17939cd6b2fabfe..c5bb9b47c98b9adcbb43c0ab291d2640d44f6986 100644 --- a/test/legacy_test/test_dist_sparse_load_ps1.py +++ b/test/legacy_test/test_dist_sparse_load_ps1.py @@ -60,7 +60,7 @@ class TestSparseLoadOpCase2(SparseLoadOp): loss = self.net(emb_array, fc_array) strategy = paddle.distributed.fleet.DistributedStrategy() strategy.a_sync = True - optimizer = fluid.optimizer.Adam(1e-3) + optimizer = paddle.optimizer.Adam(1e-3) optimizer = fleet.distributed_optimizer(optimizer, strategy) optimizer.minimize(loss) fleet.init_server(model_path) diff --git a/test/legacy_test/test_dist_sparse_tensor_load_adagrad.py 
b/test/legacy_test/test_dist_sparse_tensor_load_adagrad.py index cb9e0e5f43075d19672e521bf2f9879431be10cb..88354c6d797fb30d06217ae8574d1617246a33c0 100644 --- a/test/legacy_test/test_dist_sparse_tensor_load_adagrad.py +++ b/test/legacy_test/test_dist_sparse_tensor_load_adagrad.py @@ -30,7 +30,7 @@ class TestSparseLoadProgramAdagrad(TestSparseLoadProgram): scope, train_program, startup_program, loss = self.net() with fluid.scope_guard(scope): with fluid.program_guard(train_program, startup_program): - optimizer = fluid.optimizer.Adam(1e-3) + optimizer = paddle.optimizer.Adam(1e-3) optimizer = fleet.distributed_optimizer( optimizer, self.strategy ) diff --git a/test/legacy_test/test_dist_sparse_tensor_load_adam.py b/test/legacy_test/test_dist_sparse_tensor_load_adam.py index 0075573dcd483ac93787d7fbe424ac0afc2f4c91..1a2c60657c1a9f62cda94fd236a8b7b9a5dd860b 100644 --- a/test/legacy_test/test_dist_sparse_tensor_load_adam.py +++ b/test/legacy_test/test_dist_sparse_tensor_load_adam.py @@ -30,7 +30,7 @@ class TestSparseLoadProgramAdam(TestSparseLoadProgram): scope, train_program, startup_program, loss = self.net() with fluid.scope_guard(scope): with fluid.program_guard(train_program, startup_program): - optimizer = fluid.optimizer.Adam(1e-3) + optimizer = paddle.optimizer.Adam(1e-3) optimizer = fleet.distributed_optimizer( optimizer, self.strategy ) diff --git a/test/legacy_test/test_dist_sparse_tensor_load_ftrl.py b/test/legacy_test/test_dist_sparse_tensor_load_ftrl.py index 0b70e53a70eae7971d829e3629daac88fe72a2a0..b473a6ac089224b52d7e3a215c19c11953efcb7a 100644 --- a/test/legacy_test/test_dist_sparse_tensor_load_ftrl.py +++ b/test/legacy_test/test_dist_sparse_tensor_load_ftrl.py @@ -30,7 +30,7 @@ class TestSparseLoadProgramFtrl(TestSparseLoadProgram): scope, train_program, startup_program, loss = self.net() with fluid.scope_guard(scope): with fluid.program_guard(train_program, startup_program): - optimizer = fluid.optimizer.SGD(1e-3) + optimizer = paddle.optimizer.SGD(1e-3) optimizer = fleet.distributed_optimizer( optimizer, self.strategy ) diff --git a/test/legacy_test/test_dist_sparse_tensor_load_momentum.py b/test/legacy_test/test_dist_sparse_tensor_load_momentum.py index 6b17047e87398dbb140590efaca3556c9b88566f..6b45d5b72cc2167c90f1286d6b55cdabea1f341d 100644 --- a/test/legacy_test/test_dist_sparse_tensor_load_momentum.py +++ b/test/legacy_test/test_dist_sparse_tensor_load_momentum.py @@ -30,7 +30,7 @@ class TestSparseLoadProgramMomentum(TestSparseLoadProgram): scope, train_program, startup_program, loss = self.net() with fluid.scope_guard(scope): with fluid.program_guard(train_program, startup_program): - optimizer = fluid.optimizer.SGD(1e-3) + optimizer = paddle.optimizer.SGD(1e-3) optimizer = fleet.distributed_optimizer( optimizer, self.strategy ) diff --git a/test/legacy_test/test_dist_sparse_tensor_load_rmsprop.py b/test/legacy_test/test_dist_sparse_tensor_load_rmsprop.py index 7defcc4fd21afee8d01e71c7e53d9051a42f2d85..999b9315c83ebc0bb13792d8676758bca97b550d 100644 --- a/test/legacy_test/test_dist_sparse_tensor_load_rmsprop.py +++ b/test/legacy_test/test_dist_sparse_tensor_load_rmsprop.py @@ -30,7 +30,7 @@ class TestSparseLoadProgramRmsprop(TestSparseLoadProgram): scope, train_program, startup_program, loss = self.net() with fluid.scope_guard(scope): with fluid.program_guard(train_program, startup_program): - optimizer = fluid.optimizer.SGD(1e-3) + optimizer = paddle.optimizer.SGD(1e-3) optimizer = fleet.distributed_optimizer( optimizer, self.strategy ) diff --git 
a/test/legacy_test/test_dist_sparse_tensor_load_sgd.py b/test/legacy_test/test_dist_sparse_tensor_load_sgd.py index 63f39626488fec6d8d658e1024cd5bf1726c2cf8..5444ebdcae4c9f035061ea236ea83decb83b101a 100644 --- a/test/legacy_test/test_dist_sparse_tensor_load_sgd.py +++ b/test/legacy_test/test_dist_sparse_tensor_load_sgd.py @@ -65,7 +65,7 @@ class TestSparseLoadProgramSGD(TestSparseLoadProgram): scope, train_program, startup_program, loss = self.net() with fluid.scope_guard(scope): with fluid.program_guard(train_program, startup_program): - optimizer = fluid.optimizer.SGD(1e-3) + optimizer = paddle.optimizer.SGD(1e-3) optimizer = fleet.distributed_optimizer( optimizer, self.strategy ) diff --git a/test/legacy_test/test_dist_transpiler.py b/test/legacy_test/test_dist_transpiler.py index 14a3baf95d7a53cba9507f64ff77971669196803..8f5565ee7b73db2666dc5e1c7d4477f9aabff4e9 100644 --- a/test/legacy_test/test_dist_transpiler.py +++ b/test/legacy_test/test_dist_transpiler.py @@ -48,7 +48,7 @@ class TranspilerTest(unittest.TestCase): y = paddle.static.data(name='y', shape=[-1, 1], dtype='float32') cost = paddle.nn.functional.square_error_cost(input=y_predict, label=y) avg_cost = paddle.mean(cost) - sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.1) + sgd_optimizer = paddle.optimizer.SGD(learning_rate=0.1) sgd_optimizer.minimize(avg_cost) def get_main_program(self): @@ -306,7 +306,7 @@ class TestLRDecay(TranspilerTest): y = paddle.static.data(name='y', shape=[-1, 1], dtype='float32') cost = paddle.nn.functional.square_error_cost(input=y_predict, label=y) avg_cost = paddle.mean(cost) - sgd_optimizer = fluid.optimizer.SGD( + sgd_optimizer = paddle.optimizer.SGD( learning_rate=paddle.optimizer.lr.ExponentialDecay( learning_rate=1.0, gamma=0.1, @@ -445,7 +445,7 @@ class TestFakeInit(TranspilerTest): ) avg_cost = paddle.mean(cost) - sgd_optimizer = fluid.optimizer.SGD( + sgd_optimizer = paddle.optimizer.SGD( learning_rate=paddle.optimizer.lr.ExponentialDecay( learning_rate=1.0, gamma=0.1, @@ -464,46 +464,6 @@ class TestFakeInit(TranspilerTest): self.assertEqual(len(fake_init_ops), 3) -class TestDecayedAdagrad(TranspilerTest): - def net_conf(self): - x = paddle.static.data(name='x', shape=[-1, 1000], dtype='float32') - y_predict = paddle.static.nn.fc( - x, - size=1000, - weight_attr=fluid.ParamAttr(name='fc_w'), - bias_attr=fluid.ParamAttr(name='fc_b'), - ) - y = paddle.static.data(name='y', shape=[-1, 1], dtype='float32') - cost = paddle.nn.functional.square_error_cost(input=y_predict, label=y) - avg_cost = paddle.mean(cost) - opt = fluid.optimizer.DecayedAdagrad(learning_rate=0.1) - opt.minimize(avg_cost) - - def transpiler_test_impl(self): - pserver, startup = self.get_pserver(self.pserver1_ep) - trainer, _ = self.get_trainer() - - -class TestFtrl(TranspilerTest): - def net_conf(self): - x = paddle.static.data(name='x', shape=[-1, 1000], dtype='float32') - y_predict = paddle.static.nn.fc( - x, - size=1000, - weight_attr=fluid.ParamAttr(name='fc_w'), - bias_attr=fluid.ParamAttr(name='fc_b'), - ) - y = paddle.static.data(name='y', shape=[-1, 1], dtype='float32') - cost = paddle.nn.functional.square_error_cost(input=y_predict, label=y) - avg_cost = paddle.mean(cost) - opt = fluid.optimizer.Ftrl(learning_rate=0.1) - opt.minimize(avg_cost) - - def transpiler_test_impl(self): - pserver, startup = self.get_pserver(self.pserver1_ep) - trainer, _ = self.get_trainer() - - class TestLRDecayConditional(TranspilerTest): def net_conf(self): x = paddle.static.data(name='x', shape=[-1, 1000], dtype='float32') @@ 
-516,7 +476,7 @@ class TestLRDecayConditional(TranspilerTest): y = paddle.static.data(name='y', shape=[-1, 1], dtype='float32') cost = paddle.nn.functional.square_error_cost(input=y_predict, label=y) avg_cost = paddle.mean(cost) - sgd_optimizer = fluid.optimizer.SGD( + sgd_optimizer = paddle.optimizer.SGD( learning_rate=fluid.layers.piecewise_decay( [10000, 20000], [1.0, 0.5, 1.0] ) @@ -581,7 +541,7 @@ class TestL2Decay(TranspilerTest): y = paddle.static.data(name='y', shape=[-1, 1], dtype='float32') cost = paddle.nn.functional.square_error_cost(input=y_predict, label=y) avg_cost = paddle.mean(cost) - sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.1) + sgd_optimizer = paddle.optimizer.SGD(learning_rate=0.1) def filter(param): return param.name == "fc_w" @@ -620,12 +580,12 @@ class TestL2DecayWithPiecewise(TranspilerTest): base_lr = 1.0 bd = [1, 10, 20, 30] lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)] - sgd_optimizer = fluid.optimizer.Momentum( + sgd_optimizer = paddle.optimizer.Momentum( learning_rate=fluid.layers.piecewise_decay( boundaries=bd, values=lr ), momentum=0.9, - regularization=paddle.regularizer.L2Decay(1e-4), + weight_decay=paddle.regularizer.L2Decay(1e-4), ) sgd_optimizer.minimize(avg_cost) @@ -692,7 +652,7 @@ class TestEmptyPserverOptimizeBlocks(TranspilerTest): y = paddle.static.data(name='y', shape=[-1, 1], dtype='float32') cost = paddle.nn.functional.square_error_cost(input=y_predict, label=y) avg_cost = paddle.mean(cost) - sgd_optimizer = fluid.optimizer.SGD(learning_rate=1.0) + sgd_optimizer = paddle.optimizer.SGD(learning_rate=1.0) sgd_optimizer.minimize(avg_cost) def transpiler_test_impl(self): @@ -750,7 +710,7 @@ class TestDistLookupTableBase(TranspilerTest): input=predict, label=label, reduction='none', use_softmax=False ) avg_cost = paddle.mean(cost) - optimizer = fluid.optimizer.Adam(learning_rate=0.003) + optimizer = paddle.optimizer.Adam(learning_rate=0.003) optimizer.minimize(avg_cost) @@ -1134,7 +1094,7 @@ class TestRMSPropOptimizer(TranspilerTest): y = paddle.static.data(name='y', shape=[-1, 1], dtype='float32') cost = paddle.nn.functional.square_error_cost(input=y_predict, label=y) avg_cost = paddle.mean(cost) - optimizer = fluid.optimizer.RMSProp(learning_rate=0.1) + optimizer = paddle.optimizer.RMSProp(learning_rate=0.1) optimizer.minimize(avg_cost) def transpiler_test_impl(self): @@ -1365,7 +1325,7 @@ class TestRemoteNce(TestDistLookupTableBase): ) avg_cost = paddle.mean(cost) # optimizer - optimizer = fluid.optimizer.Adam(learning_rate=0.003) + optimizer = paddle.optimizer.Adam(learning_rate=0.003) optimizer.minimize(avg_cost) def net_conf(self): @@ -1454,7 +1414,7 @@ class TestRemoteHsigmoid(TestDistLookupTableBase): avg_cost = paddle.mean(cost) # optimizer - optimizer = fluid.optimizer.SGD(learning_rate=0.003) + optimizer = paddle.optimizer.SGD(learning_rate=0.003) optimizer.minimize(avg_cost) def net_conf(self): diff --git a/test/legacy_test/test_dygraph_multi_forward.py b/test/legacy_test/test_dygraph_multi_forward.py index c0f493dfabbc2dc3e663f444ff28fab65686e99c..afcfe12c30f5323627f00b47e65efe232737b249 100644 --- a/test/legacy_test/test_dygraph_multi_forward.py +++ b/test/legacy_test/test_dygraph_multi_forward.py @@ -21,7 +21,6 @@ import paddle from paddle import fluid from paddle.fluid import core from paddle.fluid.dygraph.base import to_variable -from paddle.fluid.optimizer import SGDOptimizer from paddle.nn import Linear SEED = 123123111 @@ -113,8 +112,8 @@ class TestDygraphMultiForward(unittest.TestCase): paddle.seed(SEED) 
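# Several hunks around here also rename constructor keywords as part of the move to
# paddle.optimizer: parameter_list becomes parameters and regularization becomes
# weight_decay. A minimal sketch with an illustrative layer (not code from the tests):
import paddle

layer = paddle.nn.Linear(10, 1)
opt = paddle.optimizer.Momentum(
    learning_rate=0.01,
    momentum=0.9,
    parameters=layer.parameters(),                  # formerly parameter_list=...
    weight_decay=paddle.regularizer.L2Decay(1e-4),  # formerly regularization=...
)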
paddle.framework.random._manual_program_seed(SEED) mnist = MNIST() - sgd = SGDOptimizer( - learning_rate=1e-3, parameter_list=mnist.parameters() + sgd = paddle.optimizer.SGD( + learning_rate=1e-3, parameters=mnist.parameters() ) train_reader = paddle.batch( paddle.dataset.mnist.train(), batch_size=128, drop_last=True @@ -159,7 +158,7 @@ class TestDygraphMultiForward(unittest.TestCase): ) mnist = MNIST() - sgd = SGDOptimizer(learning_rate=1e-3) + sgd = paddle.optimizer.SGD(learning_rate=1e-3) train_reader = paddle.batch( paddle.dataset.mnist.train(), batch_size=128, drop_last=True ) diff --git a/test/legacy_test/test_dynamic_rnn_stop_gradient.py b/test/legacy_test/test_dynamic_rnn_stop_gradient.py index 6321f0f5d9039388ee370bb1fc456c40ed6a97b6..4f5048fd483917dddf1a2b024acfa1336acffa71 100644 --- a/test/legacy_test/test_dynamic_rnn_stop_gradient.py +++ b/test/legacy_test/test_dynamic_rnn_stop_gradient.py @@ -62,7 +62,7 @@ def build_and_run_program(place, batch_size, beam_size, stop_gradient=False): out = tensor_array_to_tensor(scores, axis=0, use_stack=True)[0] loss = paddle.mean(out) - opt = fluid.optimizer.Adam(0.01) + opt = paddle.optimizer.Adam(0.01) opt.minimize(loss) exe = fluid.Executor(place) data = np.random.random_integers( diff --git a/test/legacy_test/test_eager_deletion_delete_vars.py b/test/legacy_test/test_eager_deletion_delete_vars.py index a727fba1192deb685df55369668f87157b79477d..065de1feb04556557271640bb12efb17cf643e17 100644 --- a/test/legacy_test/test_eager_deletion_delete_vars.py +++ b/test/legacy_test/test_eager_deletion_delete_vars.py @@ -48,7 +48,7 @@ def simple_fc_net(): input=prediction, label=label, reduction='none', use_softmax=False ) loss = paddle.mean(loss) - optimizer = fluid.optimizer.Adam(learning_rate=1e-3) + optimizer = paddle.optimizer.Adam(learning_rate=1e-3) optimizer.minimize(loss) return image, label, loss diff --git a/test/legacy_test/test_eager_deletion_padding_rnn.py b/test/legacy_test/test_eager_deletion_padding_rnn.py index bb00a8a4e20ff6c94ce962e623311c238ca1f88f..3faa050fcea9710135793e26afe4f1e98c9a2b04 100644 --- a/test/legacy_test/test_eager_deletion_padding_rnn.py +++ b/test/legacy_test/test_eager_deletion_padding_rnn.py @@ -374,17 +374,7 @@ class PaddingRNNTestBase(unittest.TestCase): ) ) - self.learning_rate = paddle.static.create_global_var( - name="learning_rate", - shape=[1], - value=1.0, - dtype='float32', - persistable=True, - ) - - optimizer = fluid.optimizer.SGD( - learning_rate=self.learning_rate - ) + optimizer = paddle.optimizer.SGD(learning_rate=1.0) optimizer.minimize(self.loss) self.exe.run(self.startup_program) @@ -465,7 +455,6 @@ class PaddingRNNTestBase(unittest.TestCase): feed=input_data_feed, fetch_list=[ self.loss.name, - "learning_rate", self.last_hidden.name, self.last_cell.name, ], @@ -473,9 +462,8 @@ class PaddingRNNTestBase(unittest.TestCase): ) cost_train = np.array(fetch_outs[0]) - lr = np.array(fetch_outs[1]) - init_hidden = np.array(fetch_outs[2]) - init_cell = np.array(fetch_outs[3]) + init_hidden = np.array(fetch_outs[1]) + init_cell = np.array(fetch_outs[2]) total_loss += cost_train iters += self.config.num_steps diff --git a/test/legacy_test/test_eager_deletion_while_op.py b/test/legacy_test/test_eager_deletion_while_op.py index d9f05241976fd4eaffd9da0abfc58634d8170194..55d0e4ae15039d010eb2a51c66bd1c8a0193bd04 100644 --- a/test/legacy_test/test_eager_deletion_while_op.py +++ b/test/legacy_test/test_eager_deletion_while_op.py @@ -117,7 +117,7 @@ class TestEagerDeletionWhileOpBase(unittest.TestCase): fc = 
paddle.static.nn.fc(tmp, size=256) loss = paddle.mean(sum_result) - optim = fluid.optimizer.Adam(learning_rate=1e-3) + optim = paddle.optimizer.Adam(learning_rate=1e-3) optim.minimize(loss) gc_vars = core._get_eager_deletion_vars( diff --git a/test/legacy_test/test_ema.py b/test/legacy_test/test_ema.py index 0b12280edccf8b3bd94fdc18d540d86c6686fdd7..54625de715e459163a666d1334a6bf2fd74ad616 100644 --- a/test/legacy_test/test_ema.py +++ b/test/legacy_test/test_ema.py @@ -44,10 +44,10 @@ class TestExponentialMovingAverage(unittest.TestCase): for_test=True ) - optimizer = fluid.optimizer.Adam(learning_rate=0.001) + optimizer = paddle.optimizer.Adam(learning_rate=0.001) optimizer.minimize(cost) - self._ema = fluid.optimizer.ExponentialMovingAverage( + self._ema = paddle.static.ExponentialMovingAverage( self._ema_decay ) self._ema.update() diff --git a/test/legacy_test/test_embedding_id_stop_gradient.py b/test/legacy_test/test_embedding_id_stop_gradient.py index 2252ae325d331038d9e3b67cd7b0f04ce83e15ba..eef33dd44ec5f55962525a4abb99a6e3b0b60af4 100644 --- a/test/legacy_test/test_embedding_id_stop_gradient.py +++ b/test/legacy_test/test_embedding_id_stop_gradient.py @@ -62,7 +62,7 @@ class TestEmbeddingIdStopGradientBase(unittest.TestCase): x, size=[10, 32], dtype='float32' ) avg_cost = paddle.mean(emb, name='mean_loss') - optim = fluid.optimizer.SGD(learning_rate=0.001) + optim = paddle.optimizer.SGD(learning_rate=0.001) optim.minimize(avg_cost) exe = fluid.Executor(place) diff --git a/test/legacy_test/test_exception.py b/test/legacy_test/test_exception.py index 5194acb98903fec5f25375a59ab6cc9439a7b8da..1c8e40cf94ed78a832aa75ccae05b5273be34fd7 100644 --- a/test/legacy_test/test_exception.py +++ b/test/legacy_test/test_exception.py @@ -63,7 +63,7 @@ class TestExceptionNoCStack(unittest.TestCase): loss = paddle.nn.functional.square_error_cost(input=predict, label=y) avg_loss = paddle.mean(loss) - fluid.optimizer.SGD(learning_rate=0.01).minimize(avg_loss) + paddle.optimizer.SGD(learning_rate=0.01).minimize(avg_loss) place = fluid.CPUPlace() exe = fluid.Executor(place) diff --git a/test/legacy_test/test_executor_check_feed.py b/test/legacy_test/test_executor_check_feed.py index 4faaabf2eeaa8af4edec08f0e9f6755db17ae018..02529bce96479237df9148139ee4d9b6f42a2d62 100644 --- a/test/legacy_test/test_executor_check_feed.py +++ b/test/legacy_test/test_executor_check_feed.py @@ -20,7 +20,7 @@ from paddle import fluid class TestExecutor(unittest.TestCase): def net(self): - lr = paddle.static.data(name="lr", shape=[1], dtype='float32') + lr = 0.0 x = paddle.static.data(name="x", shape=[None, 1], dtype='float32') y = paddle.static.data(name="y", shape=[None, 1], dtype='float32') y_predict = paddle.static.nn.fc(x, size=1) @@ -28,10 +28,10 @@ class TestExecutor(unittest.TestCase): cost = paddle.nn.functional.square_error_cost(input=y_predict, label=y) avg_cost = paddle.mean(cost) - opt = fluid.optimizer.Adam(learning_rate=lr) + opt = paddle.optimizer.Adam(learning_rate=lr) opt.minimize(avg_cost) - return lr, avg_cost + return paddle.to_tensor(lr), avg_cost def test_program_check_feed(self): main_program = fluid.Program() diff --git a/test/legacy_test/test_executor_feed_non_tensor.py b/test/legacy_test/test_executor_feed_non_tensor.py index b7e1f02beb4b7512d45fb152d4ee8fe542d75cef..d92b7370482c0a565d661e438c148831f1d8bba1 100644 --- a/test/legacy_test/test_executor_feed_non_tensor.py +++ b/test/legacy_test/test_executor_feed_non_tensor.py @@ -22,7 +22,7 @@ from paddle import fluid class 
TestExecutor(unittest.TestCase): def net(self): - lr = paddle.static.data(name="lr", shape=[], dtype='float32') + lr = 0.0 x = paddle.static.data(name="x", shape=[None, 1], dtype='float32') y = paddle.static.data(name="y", shape=[None, 1], dtype='float32') y_predict = paddle.static.nn.fc(x, size=1) @@ -30,10 +30,10 @@ class TestExecutor(unittest.TestCase): cost = paddle.nn.functional.square_error_cost(input=y_predict, label=y) avg_cost = paddle.mean(cost) - opt = fluid.optimizer.Adam(learning_rate=lr) + opt = paddle.optimizer.Adam(learning_rate=lr) opt.minimize(avg_cost) - return lr, avg_cost + return paddle.to_tensor(lr), avg_cost def test_program_feed_float(self): main_program = fluid.Program() diff --git a/test/legacy_test/test_feed_data_check_shape_type.py b/test/legacy_test/test_feed_data_check_shape_type.py index 83726b2f82a9e1295f9a3f4c08f946ad479d1273..c8a85e0556f1ae980c7e0e49f2b8121954147adf 100644 --- a/test/legacy_test/test_feed_data_check_shape_type.py +++ b/test/legacy_test/test_feed_data_check_shape_type.py @@ -76,7 +76,7 @@ class TestFeedData(unittest.TestCase): ) ) - optimizer = fluid.optimizer.Adam() + optimizer = paddle.optimizer.Adam() optimizer.minimize(loss) return in_data, label, loss diff --git a/test/legacy_test/test_fetch_lod_tensor_array.py b/test/legacy_test/test_fetch_lod_tensor_array.py index bf0f894683d5edf532405a205e6dbdf08d5efaed..8a8288476dd433f8c23c9ea63ddb40b89722fb93 100644 --- a/test/legacy_test/test_fetch_lod_tensor_array.py +++ b/test/legacy_test/test_fetch_lod_tensor_array.py @@ -34,7 +34,7 @@ class TestFetchLoDTensorArray(unittest.TestCase): ) loss = simple_fc_net_with_inputs(img, label, class_num=10) loss = simple_fc_net() - opt = fluid.optimizer.SGD(learning_rate=0.001) + opt = paddle.optimizer.SGD(learning_rate=0.001) opt.minimize(loss) array = paddle.tensor.array_write(x=img, i=i) diff --git a/test/legacy_test/test_fleet.py b/test/legacy_test/test_fleet.py index 245fa15ec83bfcdd41250b6f9a50286db097d1ce..52d6df39b92eea50d6406d3edca40844f3f3b932 100644 --- a/test/legacy_test/test_fleet.py +++ b/test/legacy_test/test_fleet.py @@ -85,7 +85,7 @@ class TestFleet1(unittest.TestCase): label_cast = paddle.cast(label, dtype='float32') cost = paddle.nn.functional.log_loss(fc, label_cast) try: - adam = fluid.optimizer.Adam(learning_rate=0.000005) + adam = paddle.optimizer.Adam(learning_rate=0.000005) adam = fleet.distributed_optimizer( adam, strategy={ diff --git a/test/legacy_test/test_fleet_auto.py b/test/legacy_test/test_fleet_auto.py index 2c9c6ccccc18d5c2f1867ff8c77c9d259cb8e977..c29b729136e4e0f8f61b2996932fef46d4ba4950 100644 --- a/test/legacy_test/test_fleet_auto.py +++ b/test/legacy_test/test_fleet_auto.py @@ -45,7 +45,7 @@ class TestDistributedStrategyAuto(unittest.TestCase): strategy = paddle.distributed.fleet.DistributedStrategy() strategy.auto = True - optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) + optimizer = paddle.optimizer.SGD(learning_rate=0.01) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(avg_cost) diff --git a/test/legacy_test/test_fleet_base_3.py b/test/legacy_test/test_fleet_base_3.py index 48124eb5a17ccccd3b261ffe9585c804128e53a7..bbc0eabcbb306c6d637cd79ae59b592bdf3f83d3 100644 --- a/test/legacy_test/test_fleet_base_3.py +++ b/test/legacy_test/test_fleet_base_3.py @@ -46,7 +46,7 @@ class TestFleetBase_1(unittest.TestCase): role = role_maker.PaddleCloudRoleMaker(is_collective=True) fleet.init(role) strategy = fleet.DistributedStrategy() - optimizer = 
paddle.fluid.optimizer.SGD(learning_rate=0.001) + optimizer = paddle.optimizer.SGD(learning_rate=0.001) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(avg_cost) @@ -81,7 +81,7 @@ class TestFleetBase(unittest.TestCase): self.assertEqual(len(graph_list), 0) strategy = fleet.DistributedStrategy() - optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.001) + optimizer = paddle.optimizer.SGD(learning_rate=0.001) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(avg_cost) diff --git a/test/legacy_test/test_fleet_base_single.py b/test/legacy_test/test_fleet_base_single.py index 55cb0486e4ca6a70c7c2e98971e88cee3180e37f..352b64b19155ffb192336e5b664c8a2da7f1ef66 100644 --- a/test/legacy_test/test_fleet_base_single.py +++ b/test/legacy_test/test_fleet_base_single.py @@ -90,7 +90,7 @@ class TestFleetBaseSingleRunCollective(unittest.TestCase): avg_cost = paddle.mean(x=cost) fleet.init(is_collective=True) - optimizer = fluid.optimizer.SGD(learning_rate=0.001) + optimizer = paddle.optimizer.SGD(learning_rate=0.001) optimizer = fleet.distributed_optimizer(optimizer) optimizer.minimize(avg_cost) @@ -132,7 +132,7 @@ class TestFleetBaseSingleRunPS(unittest.TestCase): fleet.init() strategy = paddle.distributed.fleet.DistributedStrategy() - optimizer = fluid.optimizer.SGD(learning_rate=0.01) + optimizer = paddle.optimizer.SGD(learning_rate=0.01) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(avg_cost) if fleet.is_server(): diff --git a/test/legacy_test/test_fleet_gradient_scale.py b/test/legacy_test/test_fleet_gradient_scale.py index 3991f4b6904c157b62a0a8f0dbd3fe471b668d57..ef7741883c9d192839022a24c7517632d70c91db 100644 --- a/test/legacy_test/test_fleet_gradient_scale.py +++ b/test/legacy_test/test_fleet_gradient_scale.py @@ -63,7 +63,7 @@ class TestGradientScale(unittest.TestCase): cost = self.mlp(input_x=input_x, input_y=input_y) output_name = cost.name optimizer = fleet.distributed_optimizer( - fluid.optimizer.Adam(), strategy + paddle.optimizer.Adam(), strategy ) optimizer.minimize(cost) diff --git a/test/legacy_test/test_fleet_nocvm_1.py b/test/legacy_test/test_fleet_nocvm_1.py index 26c94cbb5422811e6dd774fd5705cd6aacc36a6e..f4014613525f544957f8451f03a3140744d65362 100644 --- a/test/legacy_test/test_fleet_nocvm_1.py +++ b/test/legacy_test/test_fleet_nocvm_1.py @@ -79,7 +79,7 @@ class TestFleet1(unittest.TestCase): label_cast = paddle.cast(label, dtype='float32') cost = paddle.nn.functional.log_loss(fc, label_cast) try: - adam = fluid.optimizer.Adam(learning_rate=0.000005) + adam = paddle.optimizer.Adam(learning_rate=0.000005) adam = fleet.distributed_optimizer( adam, strategy={ diff --git a/test/legacy_test/test_fleet_pyramid_hash.py b/test/legacy_test/test_fleet_pyramid_hash.py index 6efb8a6e84efd0d350b0a4c9ae52360704421650..9c86ea8f770b9a77a106318f1a8264146d8da066 100644 --- a/test/legacy_test/test_fleet_pyramid_hash.py +++ b/test/legacy_test/test_fleet_pyramid_hash.py @@ -72,7 +72,7 @@ class TestPyramidHashOpApi(unittest.TestCase): fleet.init(role) strategy = StrategyFactory.create_geo_strategy(5) - optimizer = fluid.optimizer.SGD(0.1) + optimizer = paddle.optimizer.SGD(0.1) optimizer = fleet.distributed_optimizer(optimizer, strategy) optimizer.minimize(cost) diff --git a/test/legacy_test/test_fleet_rolemaker.py b/test/legacy_test/test_fleet_rolemaker.py index 5a5ac156bc5b06fcad4801c23bbd9fc8eb6026d0..b9af57199fc363913f6ff21518d4b476f9a42e3d 100644 --- 
a/test/legacy_test/test_fleet_rolemaker.py +++ b/test/legacy_test/test_fleet_rolemaker.py @@ -96,7 +96,7 @@ class TestCloudRoleMaker(unittest.TestCase): label_cast = paddle.cast(label, dtype='float32') cost = paddle.nn.functional.log_loss(fc, label_cast) try: - adam = fluid.optimizer.Adam(learning_rate=0.000005) + adam = paddle.optimizer.Adam(learning_rate=0.000005) adam = fleet.distributed_optimizer(adam) adam.minimize([cost], [scope]) fleet.run_server() diff --git a/test/legacy_test/test_fleet_rolemaker_2.py b/test/legacy_test/test_fleet_rolemaker_2.py index d2ba47d580936e3ff1999379f290d85461fdb829..a43cebd4c66c3434f7665a0e0d5b1e381386af2a 100644 --- a/test/legacy_test/test_fleet_rolemaker_2.py +++ b/test/legacy_test/test_fleet_rolemaker_2.py @@ -73,7 +73,7 @@ class TestCloudRoleMaker2(unittest.TestCase): label_cast = paddle.cast(label, dtype='float32') cost = paddle.nn.functional.log_loss(fc, label_cast) try: - adam = fluid.optimizer.Adam(learning_rate=0.000005) + adam = paddle.optimizer.Adam(learning_rate=0.000005) adam = fleet.distributed_optimizer(adam) adam.minimize([cost], [scope]) fleet.run_server() diff --git a/test/legacy_test/test_fleet_rolemaker_3.py b/test/legacy_test/test_fleet_rolemaker_3.py index 64e508122c5155198b1af7ac85026268991ac936..88541d58b24c3bdbe7310babf711b50e10e08ddd 100644 --- a/test/legacy_test/test_fleet_rolemaker_3.py +++ b/test/legacy_test/test_fleet_rolemaker_3.py @@ -70,7 +70,7 @@ class TestCloudRoleMaker(unittest.TestCase): label_cast = paddle.cast(label, dtype='float32') cost = paddle.nn.functional.log_loss(fc, label_cast) try: - adam = fluid.optimizer.Adam(learning_rate=0.000005) + adam = paddle.optimizer.Adam(learning_rate=0.000005) adam = fleet.distributed_optimizer(adam) adam.minimize([cost], [scope]) fleet.run_server() diff --git a/test/legacy_test/test_fleet_unitaccessor.py b/test/legacy_test/test_fleet_unitaccessor.py index 2228a8f6863f846225d014afff1c3c6cf64f1b5f..8146bf90efd253c3c211d0c542a4c0cb27cadbfd 100644 --- a/test/legacy_test/test_fleet_unitaccessor.py +++ b/test/legacy_test/test_fleet_unitaccessor.py @@ -78,17 +78,17 @@ class TestFleet1(unittest.TestCase): strategy["embedding"]["sparse_accessor_class"] = "DownpourUnitAccessor" strategy["embedding"]["embed_sparse_optimizer"] = "naive" try: - adam1 = fluid.optimizer.Adam(learning_rate=0.000005) + adam1 = paddle.optimizer.Adam(learning_rate=0.000005) adam1 = fleet.distributed_optimizer(adam1, strategy=strategy) adam1.minimize([cost], [scope]) strategy["embedding"]["embed_sparse_optimizer"] = "adagrad" - adam2 = fluid.optimizer.Adam(learning_rate=0.000005) + adam2 = paddle.optimizer.Adam(learning_rate=0.000005) adam2 = fleet.distributed_optimizer(adam2, strategy=strategy) adam2.minimize([cost], [scope]) strategy["embedding"]["embed_sparse_optimizer"] = "adam" - adam3 = fluid.optimizer.Adam(learning_rate=0.000005) + adam3 = paddle.optimizer.Adam(learning_rate=0.000005) adam3 = fleet.distributed_optimizer(adam3, strategy=strategy) adam3.minimize([cost], [scope]) except: diff --git a/test/legacy_test/test_fuse_all_reduce_pass.py b/test/legacy_test/test_fuse_all_reduce_pass.py index 837742853d184e095238007ac557552992a14f9f..b7ea594522b9f372320b06f4c84fc584a9477e04 100644 --- a/test/legacy_test/test_fuse_all_reduce_pass.py +++ b/test/legacy_test/test_fuse_all_reduce_pass.py @@ -86,9 +86,9 @@ class TestFuseAllReduceOpsBase(TestParallelExecutorBase): ) def optimizer(self, learning_rate=1e-3): - optimizer = fluid.optimizer.SGD( + optimizer = paddle.optimizer.SGD( learning_rate=learning_rate, - 
regularization=paddle.regularizer.L2Decay(1e-3), + weight_decay=paddle.regularizer.L2Decay(1e-3), ) return optimizer diff --git a/test/legacy_test/test_fuse_bn_act_pass.py b/test/legacy_test/test_fuse_bn_act_pass.py index 6df1a3209ad12cc98e87f44462da1bde4b2d729c..711b4061506d51711ea967bd5d2fe6b509390bd2 100644 --- a/test/legacy_test/test_fuse_bn_act_pass.py +++ b/test/legacy_test/test_fuse_bn_act_pass.py @@ -61,7 +61,7 @@ class TestFuseBatchNormActPass(unittest.TestCase): input=prediction, label=y, reduction='none', use_softmax=False ) loss = paddle.mean(loss) - sgd = fluid.optimizer.SGD(learning_rate=0.001) + sgd = paddle.optimizer.SGD(learning_rate=0.001) if use_cuda: sgd = paddle.static.amp.decorate( sgd, use_dynamic_loss_scaling=True, init_loss_scaling=128.0 diff --git a/test/legacy_test/test_fuse_bn_add_act_pass.py b/test/legacy_test/test_fuse_bn_add_act_pass.py index 63796599278ced3430e88569dd41b34ec44162b4..f0b11655916deec12e6d6c2cfdfb22c3f6d072bf 100644 --- a/test/legacy_test/test_fuse_bn_add_act_pass.py +++ b/test/legacy_test/test_fuse_bn_add_act_pass.py @@ -113,7 +113,7 @@ class TestFusedBnAddActAPI(unittest.TestCase): input=prediction, label=y, reduction='none', use_softmax=False ) loss = paddle.mean(loss) - sgd = fluid.optimizer.SGD(learning_rate=0.001) + sgd = paddle.optimizer.SGD(learning_rate=0.001) sgd = paddle.static.amp.decorate( sgd, use_dynamic_loss_scaling=True, init_loss_scaling=128.0 ) @@ -176,7 +176,7 @@ class TestFusedBnAddActAPI(unittest.TestCase): input=prediction, label=y, reduction='none', use_softmax=False ) loss = paddle.mean(loss) - sgd = fluid.optimizer.SGD(learning_rate=0.001) + sgd = paddle.optimizer.SGD(learning_rate=0.001) sgd = paddle.static.amp.decorate( sgd, use_dynamic_loss_scaling=True, init_loss_scaling=128.0 ) diff --git a/test/legacy_test/test_fuse_elewise_add_act_pass.py b/test/legacy_test/test_fuse_elewise_add_act_pass.py index b2590d974712f4eac8189047798ccb97a902bbd0..02348fae60a0ef300d0a136a42ee4b0592edd675 100644 --- a/test/legacy_test/test_fuse_elewise_add_act_pass.py +++ b/test/legacy_test/test_fuse_elewise_add_act_pass.py @@ -36,9 +36,9 @@ class TestMNIST(TestParallelExecutorBase): img, label = init_data() def _optimizer(learning_rate=1e-6): - optimizer = fluid.optimizer.SGD( + optimizer = paddle.optimizer.SGD( learning_rate=learning_rate, - regularization=paddle.regularizer.L2Decay(1e-6), + weight_decay=paddle.regularizer.L2Decay(1e-6), ) return optimizer @@ -103,7 +103,7 @@ class TestFuseActElewiseAddInplaceGradPass(unittest.TestCase): Out2 = F.relu(Out1) prediction = paddle.tensor.math._add_with_axis(Y, Out2, axis=1) loss = paddle.mean(prediction) - sgd = fluid.optimizer.SGD(learning_rate=0.001) + sgd = paddle.optimizer.SGD(learning_rate=0.001) sgd.minimize(loss) return X, Y, loss diff --git a/test/legacy_test/test_fuse_optimizer_pass.py b/test/legacy_test/test_fuse_optimizer_pass.py index 4e7f1901d72e41714f9471bd41d2704419c9e36c..d85355306dfc37a228de72e2cdd0a5d58e9c5342 100644 --- a/test/legacy_test/test_fuse_optimizer_pass.py +++ b/test/legacy_test/test_fuse_optimizer_pass.py @@ -40,7 +40,7 @@ class TestFuseOptimizationOps(TestParallelExecutorBase): use_device, feed_dict=None, get_data_from_feeder=None, - optimizer=fluid.optimizer.Adam, + optimizer=paddle.optimizer.Adam, ): if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda(): return @@ -90,7 +90,7 @@ class TestFuseOptimizationOps(TestParallelExecutorBase): class TestFuseAdamOps(TestFuseOptimizationOps): def optimizer(self, learning_rate=1e-4): - return 
fluid.optimizer.Adam(learning_rate=learning_rate) + return paddle.optimizer.Adam(learning_rate=learning_rate) def test_batchnorm_fc_with_fuse_op(self): self._decorate_compare_fused_optimizer_ops( @@ -103,12 +103,12 @@ class TestFuseAdamOps(TestFuseOptimizationOps): class TestFuseSGDOps(TestFuseAdamOps): def optimizer(self, learning_rate=1e-3): - return fluid.optimizer.SGD(learning_rate=learning_rate) + return paddle.optimizer.SGD(learning_rate=learning_rate) class TestFuseMomentumOps(TestFuseAdamOps): def optimizer(self, learning_rate=1e-3): - return fluid.optimizer.Momentum( + return paddle.optimizer.Momentum( learning_rate=learning_rate, momentum=0.1 ) @@ -139,7 +139,7 @@ class TestSpareFuseAdamOps(TestFuseOptimizationOps): ) def optimizer(self, learning_rate=1e-4): - return fluid.optimizer.Adam(learning_rate=learning_rate) + return paddle.optimizer.Adam(learning_rate=learning_rate) def test_simple_bow_net_with_fuse_op(self): model = partial(bow_net, dict_dim=self.word_dict_len, is_sparse=True) @@ -153,12 +153,12 @@ class TestSpareFuseAdamOps(TestFuseOptimizationOps): class TestSpareFuseSGDOps(TestSpareFuseAdamOps): def optimizer(self, learning_rate=1e-3): - return fluid.optimizer.SGD(learning_rate=learning_rate) + return paddle.optimizer.SGD(learning_rate=learning_rate) class TestSpareFuseMomentumOps(TestSpareFuseAdamOps): def optimizer(self, learning_rate=1e-3): - return fluid.optimizer.Momentum( + return paddle.optimizer.Momentum( learning_rate=learning_rate, momentum=0.1 ) @@ -170,7 +170,7 @@ class TestPassConflictBase(TestFuseAdamOps): use_device, feed_dict=None, get_data_from_feeder=None, - optimizer=fluid.optimizer.Adam, + optimizer=paddle.optimizer.Adam, ): if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda(): return @@ -188,7 +188,7 @@ class TestPassConflictBase(TestFuseAdamOps): class TestFuseAdamOpsPassConflict(TestPassConflictBase): def optimizer(self, learning_rate=1e-4): - return fluid.optimizer.Adam(learning_rate=learning_rate) + return paddle.optimizer.Adam(learning_rate=learning_rate) def test_batchnorm_fc_with_fuse_op(self): self._decorate_compare_fused_optimizer_ops( @@ -201,12 +201,12 @@ class TestFuseAdamOpsPassConflict(TestPassConflictBase): class TestFuseSGDOpsPassConflict(TestFuseAdamOpsPassConflict): def optimizer(self, learning_rate=1e-3): - return fluid.optimizer.SGD(learning_rate=learning_rate) + return paddle.optimizer.SGD(learning_rate=learning_rate) class TestFuseMomentumOpsPassConflict(TestFuseAdamOpsPassConflict): def optimizer(self, learning_rate=1e-3): - return fluid.optimizer.Momentum( + return paddle.optimizer.Momentum( learning_rate=learning_rate, momentum=0.1 ) diff --git a/test/legacy_test/test_fuse_relu_depthwise_conv_pass.py b/test/legacy_test/test_fuse_relu_depthwise_conv_pass.py index 84772d0691042c032bb856a61d0f76df16de8a1e..70487e22448f4ac08e789c43cda3f597b86b4db1 100644 --- a/test/legacy_test/test_fuse_relu_depthwise_conv_pass.py +++ b/test/legacy_test/test_fuse_relu_depthwise_conv_pass.py @@ -19,7 +19,6 @@ from parallel_executor_test_base import DeviceType, TestParallelExecutorBase import paddle import paddle.nn.functional as F -from paddle import fluid from paddle.fluid import core @@ -85,9 +84,9 @@ class TestMNIST(TestParallelExecutorBase): img, label = self._init_data(random_data) def _optimizer(learning_rate=1e-6): - optimizer = fluid.optimizer.SGD( + optimizer = paddle.optimizer.SGD( learning_rate=learning_rate, - regularization=paddle.regularizer.L2Decay(1e-6), + weight_decay=paddle.regularizer.L2Decay(1e-6), ) return 
optimizer diff --git a/test/legacy_test/test_fused_attention_pass.py b/test/legacy_test/test_fused_attention_pass.py index 263ff746c710f48b913141bb3a454c3cf89969c0..3387662d75827777a50c068b9ffa257d25598764 100644 --- a/test/legacy_test/test_fused_attention_pass.py +++ b/test/legacy_test/test_fused_attention_pass.py @@ -154,7 +154,7 @@ class TestFusedAttentionPass(unittest.TestCase): out = multi_head_attn(attn_input, attn_mask) loss = paddle.mean(out) - sgd_optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.001) + sgd_optimizer = paddle.optimizer.SGD(learning_rate=0.001) sgd_optimizer.minimize(loss) if use_pass: diff --git a/test/legacy_test/test_fused_feedforward_pass.py b/test/legacy_test/test_fused_feedforward_pass.py index 1abbfec2201bdbf2da2b115a8e2b5d259bbc598b..107e3f319d3d7a3ba260b84a8429affeca432871 100644 --- a/test/legacy_test/test_fused_feedforward_pass.py +++ b/test/legacy_test/test_fused_feedforward_pass.py @@ -129,7 +129,7 @@ class TestFusedFeedforwadPass(unittest.TestCase): out = feed_forward(data) loss = paddle.mean(out) - sgd_optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.001) + sgd_optimizer = paddle.optimizer.SGD(learning_rate=0.001) sgd_optimizer.minimize(loss) if use_pass: diff --git a/test/legacy_test/test_generator_dataloader.py b/test/legacy_test/test_generator_dataloader.py index ed9fdf004aedb5f40fc58f03cde7fe2dace32b08..9d895538521900a115eb35913157cc69cafc96c7 100644 --- a/test/legacy_test/test_generator_dataloader.py +++ b/test/legacy_test/test_generator_dataloader.py @@ -78,7 +78,7 @@ def simple_fc_net(places, use_legacy_py_reader, use_double_buffer): ) ) - optimizer = fluid.optimizer.Adam() + optimizer = paddle.optimizer.Adam() optimizer.minimize(loss) return startup_prog, main_prog, py_reader, loss diff --git a/test/legacy_test/test_gradient_clip.py b/test/legacy_test/test_gradient_clip.py index cc91c85bee0d1e77951f2670303693d53eec2c40..dae8b7b47d88498635513482aed9c39f9c151df6 100644 --- a/test/legacy_test/test_gradient_clip.py +++ b/test/legacy_test/test_gradient_clip.py @@ -217,7 +217,7 @@ class TestGradientClipByGlobalNorm(TestGradientClip): def backward_func(cost): clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=5.0) paddle.nn.clip.set_gradient_clip(clip) - sgd_optimizer = fluid.optimizer.SGD( + sgd_optimizer = paddle.optimizer.SGD( learning_rate=0.01, grad_clip=clip ) # if 'set_gradient_clip' and 'optimize(grad_clip)' together, 'set_gradient_clip' will be ineffective @@ -233,7 +233,7 @@ class TestGradientClipByGlobalNorm(TestGradientClip): def test_tpyeError(self): # the type of optimizer(grad_clip=) must be an instance of GradientClipBase's derived class with self.assertRaises(TypeError): - sgd_optimizer = fluid.optimizer.SGD( + sgd_optimizer = paddle.optimizer.SGD( learning_rate=0.1, grad_clip="test" ) @@ -428,9 +428,9 @@ class TestDygraphGradientClip(unittest.TestCase): out = linear(fluid.dygraph.to_variable(inputs)) loss = paddle.mean(out) loss.backward() - sgd_optimizer = fluid.optimizer.SGD( + sgd_optimizer = paddle.optimizer.SGD( learning_rate=0.0, - parameter_list=linear.parameters(), + parameters=linear.parameters(), grad_clip=paddle.nn.ClipGradByGlobalNorm(0.1), ) self.check_clip_result(loss, sgd_optimizer) diff --git a/test/legacy_test/test_hsigmoid_op.py b/test/legacy_test/test_hsigmoid_op.py index 5e566a75d04bc3d1836617943a6c4c7047e71ab0..b9c4338fa4c059f279ccc98544e4f08e64864462 100644 --- a/test/legacy_test/test_hsigmoid_op.py +++ b/test/legacy_test/test_hsigmoid_op.py @@ -332,7 +332,7 @@ class 
TestHSigmoidOpWithSparseGrad(unittest.TestCase): label = np.array([1, 4]).astype('int64') loss, data_list = self.hs_net_conf(is_sparse) - optimizer = fluid.optimizer.SGD(learning_rate=1e-3) + optimizer = paddle.optimizer.SGD(learning_rate=1e-3) optimizer.minimize(loss) main_program = fluid.default_main_program() diff --git a/test/legacy_test/test_imperative_auto_prune.py b/test/legacy_test/test_imperative_auto_prune.py index 337c0641b761f7b30f0a4d7e922826ffeefcb8d1..4491bf705a3d593e71a124d3b5627674da5649f6 100644 --- a/test/legacy_test/test_imperative_auto_prune.py +++ b/test/legacy_test/test_imperative_auto_prune.py @@ -283,9 +283,9 @@ class TestImperativeAutoPrune(unittest.TestCase): linear2_origin = linear2.weight.numpy() linear2.weight.stop_gradient = True out2.backward() - optimizer = fluid.optimizer.SGD( + optimizer = paddle.optimizer.SGD( learning_rate=0.003, - parameter_list=(linear.parameters() + linear2.parameters()), + parameters=(linear.parameters() + linear2.parameters()), ) optimizer.minimize(out2) np.testing.assert_array_equal( @@ -311,9 +311,9 @@ class TestImperativeAutoPrune(unittest.TestCase): linear2_origin = linear2.weight.numpy() out2.stop_gradient = True out2.backward() - optimizer = fluid.optimizer.SGD( + optimizer = paddle.optimizer.SGD( learning_rate=0.003, - parameter_list=(linear.parameters() + linear2.parameters()), + parameters=(linear.parameters() + linear2.parameters()), ) optimizer.minimize(out2) np.testing.assert_array_equal( @@ -359,8 +359,8 @@ class TestImperativeAutoPrune(unittest.TestCase): with fluid.dygraph.guard(place): model = MyLayer(size, vocab_size, size) grad_clip = paddle.nn.ClipGradByGlobalNorm(0.001) - optimizer = fluid.optimizer.AdamOptimizer( - 0.001, parameter_list=model.parameters(), grad_clip=grad_clip + optimizer = paddle.optimizer.Adam( + 0.001, parameters=model.parameters(), grad_clip=grad_clip ) indices = fluid.dygraph.to_variable(indices) embed = fluid.dygraph.to_variable(embed) @@ -378,8 +378,8 @@ class TestImperativeAutoPrune(unittest.TestCase): with fluid.dygraph.guard(place): model = MyLayer2(size, vocab_size, size) grad_clip = paddle.nn.ClipGradByGlobalNorm(0.001) - optimizer = fluid.optimizer.AdamOptimizer( - 0.001, parameter_list=model.parameters(), grad_clip=grad_clip + optimizer = paddle.optimizer.Adam( + 0.001, parameters=model.parameters(), grad_clip=grad_clip ) indices = fluid.dygraph.to_variable(indices) diff --git a/test/legacy_test/test_imperative_deepcf.py b/test/legacy_test/test_imperative_deepcf.py index 7b7fd5dadf4da059103ea1c54e5dd4bbdbce1c01..7827e4097465822e39b7e0b27744679ee5caab6b 100644 --- a/test/legacy_test/test_imperative_deepcf.py +++ b/test/legacy_test/test_imperative_deepcf.py @@ -269,7 +269,7 @@ class TestDygraphDeepCF(unittest.TestCase): deepcf = DeepCF(num_users, num_items, matrix) prediction = deepcf(users, items) loss = paddle.sum(paddle.nn.functional.log_loss(prediction, labels)) - adam = fluid.optimizer.AdamOptimizer(0.01) + adam = paddle.optimizer.Adam(0.01) adam.minimize(loss) exe = fluid.Executor( @@ -307,9 +307,7 @@ class TestDygraphDeepCF(unittest.TestCase): paddle.framework.random._manual_program_seed(seed) deepcf = DeepCF(num_users, num_items, matrix) - adam = fluid.optimizer.AdamOptimizer( - 0.01, parameter_list=deepcf.parameters() - ) + adam = paddle.optimizer.Adam(0.01, parameters=deepcf.parameters()) for e in range(self.num_epoches): sys.stderr.write('epoch %d\n' % e) for slice in range( @@ -340,9 +338,7 @@ class TestDygraphDeepCF(unittest.TestCase): 
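# The dygraph tests in this stretch swap fluid.optimizer.AdamOptimizer(parameter_list=...)
# for paddle.optimizer.Adam(parameters=...). A minimal sketch of one update step with the
# new API, using an illustrative layer rather than the tests' DeepCF/GNN models:
import paddle
from paddle.optimizer import Adam

layer = paddle.nn.Linear(8, 8)
adam = Adam(learning_rate=1e-3, parameters=layer.parameters())
loss = layer(paddle.ones([2, 8])).sum()
loss.backward()
adam.step()
adam.clear_grad()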
paddle.framework.random._manual_program_seed(seed) deepcf2 = DeepCF(num_users, num_items, matrix) - adam2 = fluid.optimizer.AdamOptimizer( - 0.01, parameter_list=deepcf2.parameters() - ) + adam2 = paddle.optimizer.Adam(0.01, parameters=deepcf2.parameters()) fluid.set_flags({'FLAGS_sort_sum_gradient': True}) for e in range(self.num_epoches): sys.stderr.write('epoch %d\n' % e) @@ -376,9 +372,7 @@ class TestDygraphDeepCF(unittest.TestCase): fluid.default_main_program().random_seed = seed deepcf = DeepCF(num_users, num_items, matrix) - adam = fluid.optimizer.AdamOptimizer( - 0.01, parameter_list=deepcf.parameters() - ) + adam = paddle.optimizer.Adam(0.01, parameters=deepcf.parameters()) for e in range(self.num_epoches): sys.stderr.write('epoch %d\n' % e) diff --git a/test/legacy_test/test_imperative_gan.py b/test/legacy_test/test_imperative_gan.py index bb03c794de9011ede6b3ffa69e816d39d1fe23be..524ed9ec35f08c58708dcfd94235e66c1dc8b166 100644 --- a/test/legacy_test/test_imperative_gan.py +++ b/test/legacy_test/test_imperative_gan.py @@ -21,7 +21,6 @@ import paddle from paddle import fluid from paddle.fluid import core from paddle.fluid.dygraph.base import to_variable -from paddle.fluid.optimizer import SGDOptimizer from paddle.nn import Linear @@ -95,7 +94,7 @@ class TestDygraphGAN(unittest.TestCase): d_loss = d_loss_real + d_loss_fake - sgd = SGDOptimizer(learning_rate=1e-3) + sgd = paddle.optimizer.SGD(learning_rate=1e-3) sgd.minimize(d_loss) with new_program_scope(main=generate_p, startup=startup, scope=scope): @@ -114,7 +113,7 @@ class TestDygraphGAN(unittest.TestCase): ) ) - sgd = SGDOptimizer(learning_rate=1e-3) + sgd = paddle.optimizer.SGD(learning_rate=1e-3) sgd.minimize(g_loss) exe = fluid.Executor( @@ -149,9 +148,9 @@ class TestDygraphGAN(unittest.TestCase): discriminator = Discriminator() generator = Generator() - sgd = SGDOptimizer( + sgd = paddle.optimizer.SGD( learning_rate=1e-3, - parameter_list=( + parameters=( discriminator.parameters() + generator.parameters() ), ) @@ -204,9 +203,9 @@ class TestDygraphGAN(unittest.TestCase): paddle.framework.random._manual_program_seed(1) discriminator2 = Discriminator() generator2 = Generator() - sgd2 = SGDOptimizer( + sgd2 = paddle.optimizer.SGD( learning_rate=1e-3, - parameter_list=( + parameters=( discriminator2.parameters() + generator2.parameters() ), ) diff --git a/test/legacy_test/test_imperative_gnn.py b/test/legacy_test/test_imperative_gnn.py index 93c2aee3a9d5cea79155d29d50eb9db570c3f7ee..225dbe83b1df21c9a186956f15a1daac8ba19e1b 100644 --- a/test/legacy_test/test_imperative_gnn.py +++ b/test/legacy_test/test_imperative_gnn.py @@ -23,7 +23,7 @@ import paddle.nn.functional as F from paddle import fluid from paddle.fluid import core from paddle.fluid.dygraph.base import to_variable -from paddle.fluid.optimizer import AdamOptimizer +from paddle.optimizer import Adam def gen_data(): @@ -92,7 +92,7 @@ class TestDygraphGNN(unittest.TestCase): ) loss = paddle.sum(loss) - adam = AdamOptimizer(learning_rate=1e-3) + adam = Adam(learning_rate=1e-3) adam.minimize(loss) exe = fluid.Executor( fluid.CPUPlace() @@ -132,9 +132,7 @@ class TestDygraphGNN(unittest.TestCase): ) loss = paddle.sum(loss) loss.backward() - adam = AdamOptimizer( - learning_rate=1e-3, parameter_list=model.parameters() - ) + adam = Adam(learning_rate=1e-3, parameters=model.parameters()) adam.minimize(loss) model.clear_gradients() @@ -160,9 +158,7 @@ class TestDygraphGNN(unittest.TestCase): ) loss2 = paddle.sum(loss2) loss2.backward() - adam2 = AdamOptimizer( - 
learning_rate=1e-3, parameter_list=model2.parameters() - ) + adam2 = Adam(learning_rate=1e-3, parameters=model2.parameters()) adam2.minimize(loss2) model2.clear_gradients() loss2_value = loss2.numpy() diff --git a/test/legacy_test/test_imperative_lod_tensor_to_selected_rows.py b/test/legacy_test/test_imperative_lod_tensor_to_selected_rows.py index 0d072e14bbe5e1ad67bf2bb96c0fcc37dd29d3cb..52e378265547ee0cc01d3c3eb41c9d07ca676ba7 100644 --- a/test/legacy_test/test_imperative_lod_tensor_to_selected_rows.py +++ b/test/legacy_test/test_imperative_lod_tensor_to_selected_rows.py @@ -22,7 +22,6 @@ import paddle from paddle import fluid from paddle.fluid import core from paddle.fluid.dygraph.base import to_variable -from paddle.fluid.optimizer import SGDOptimizer class SimpleNet(paddle.nn.Layer): @@ -115,9 +114,9 @@ class TestDygraphSimpleNet(unittest.TestCase): dtype=dtype, ) - sgd = SGDOptimizer( + sgd = paddle.optimizer.SGD( learning_rate=1e-3, - parameter_list=simple_net.parameters(), + parameters=simple_net.parameters(), ) dy_param_updated = {} dy_param_init = {} @@ -162,7 +161,7 @@ class TestDygraphSimpleNet(unittest.TestCase): ) exe = fluid.Executor(place) - sgd = SGDOptimizer(learning_rate=1e-3) + sgd = paddle.optimizer.SGD(learning_rate=1e-3) x = paddle.static.data( name="x", shape=[-1, num_steps], dtype='int64' ) diff --git a/test/legacy_test/test_imperative_mnist.py b/test/legacy_test/test_imperative_mnist.py index fd8383e686d13034ace4f866d32766b232da0784..ff436db8d80846a685e7030c1e68b7be87e2e7a4 100644 --- a/test/legacy_test/test_imperative_mnist.py +++ b/test/legacy_test/test_imperative_mnist.py @@ -21,7 +21,6 @@ from utils import DyGraphProgramDescTracerTestHelper import paddle from paddle import fluid from paddle.fluid import core -from paddle.fluid.optimizer import SGDOptimizer from paddle.nn import Linear @@ -125,8 +124,8 @@ class TestImperativeMnist(unittest.TestCase): fluid.default_main_program().random_seed = seed mnist = MNIST() - sgd = SGDOptimizer( - learning_rate=1e-3, parameter_list=mnist.parameters() + sgd = paddle.optimizer.SGD( + learning_rate=1e-3, parameters=mnist.parameters() ) batch_py_reader = fluid.io.PyReader(capacity=1) @@ -188,7 +187,7 @@ class TestImperativeMnist(unittest.TestCase): ) mnist = MNIST() - sgd = SGDOptimizer(learning_rate=1e-3) + sgd = paddle.optimizer.SGD(learning_rate=1e-3) train_reader = paddle.batch( paddle.dataset.mnist.train(), batch_size=batch_size, diff --git a/test/legacy_test/test_imperative_mnist_sorted_gradient.py b/test/legacy_test/test_imperative_mnist_sorted_gradient.py index 47a92931b407eb297519bf8f9bfbc5b88e35e9f7..a85bdae98cdeb2980f0d2d9c9b12bd74ac42fa8a 100644 --- a/test/legacy_test/test_imperative_mnist_sorted_gradient.py +++ b/test/legacy_test/test_imperative_mnist_sorted_gradient.py @@ -22,7 +22,6 @@ import paddle from paddle import fluid from paddle.fluid import core from paddle.fluid.dygraph.base import to_variable -from paddle.fluid.optimizer import SGDOptimizer class TestImperativeMnistSortGradient(unittest.TestCase): @@ -36,8 +35,8 @@ class TestImperativeMnistSortGradient(unittest.TestCase): fluid.set_flags({'FLAGS_sort_sum_gradient': True}) mnist2 = MNIST() - sgd2 = SGDOptimizer( - learning_rate=1e-3, parameter_list=mnist2.parameters() + sgd2 = paddle.optimizer.SGD( + learning_rate=1e-3, parameters=mnist2.parameters() ) train_reader2 = paddle.batch( paddle.dataset.mnist.train(), batch_size=128, drop_last=True @@ -93,7 +92,7 @@ class TestImperativeMnistSortGradient(unittest.TestCase): ) mnist = MNIST() - sgd = 
SGDOptimizer(learning_rate=1e-3) + sgd = paddle.optimizer.SGD(learning_rate=1e-3) train_reader = paddle.batch( paddle.dataset.mnist.train(), batch_size=128, drop_last=True ) diff --git a/test/legacy_test/test_imperative_ocr_attention_model.py b/test/legacy_test/test_imperative_ocr_attention_model.py index e01ac78dcef4bcf79af23cb2f8fcaa68bcf8f63d..30c00600ae7723153b77b477324b4a2afb070bcd 100644 --- a/test/legacy_test/test_imperative_ocr_attention_model.py +++ b/test/legacy_test/test_imperative_ocr_attention_model.py @@ -456,8 +456,8 @@ class TestDygraphOCRAttention(unittest.TestCase): ) else: learning_rate = Config.LR - optimizer = fluid.optimizer.SGD( - learning_rate=0.001, parameter_list=ocr_attention.parameters() + optimizer = paddle.optimizer.SGD( + learning_rate=0.001, parameters=ocr_attention.parameters() ) dy_param_init_value = {} for param in ocr_attention.parameters(): @@ -533,7 +533,7 @@ class TestDygraphOCRAttention(unittest.TestCase): else: learning_rate = Config.LR - optimizer = fluid.optimizer.SGD(learning_rate=0.001) + optimizer = paddle.optimizer.SGD(learning_rate=0.001) images = paddle.static.data( name='pixel', shape=[-1] + Config.DATA_SHAPE, dtype='float32' diff --git a/test/legacy_test/test_imperative_optimizer.py b/test/legacy_test/test_imperative_optimizer.py index fec48938b3606798eef15b23565e408d155c0517..170a3b7d1858506860fd2596c6dc18f83d1f3332 100644 --- a/test/legacy_test/test_imperative_optimizer.py +++ b/test/legacy_test/test_imperative_optimizer.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import itertools import unittest import numpy as np @@ -22,20 +21,6 @@ import paddle from paddle import fluid from paddle.distributed.fleet.meta_optimizers import DGCMomentumOptimizer from paddle.fluid import core -from paddle.fluid.optimizer import ( - Adam, - DecayedAdagradOptimizer, - DpsgdOptimizer, - ExponentialMovingAverage, - FtrlOptimizer, - LarsMomentumOptimizer, - LookaheadOptimizer, - ModelAverage, - MomentumOptimizer, - PipelineOptimizer, - RecomputeOptimizer, - SGDOptimizer, -) # Note(wangzhongpu) # In dygraph, don't support ModelAverage, DGCMomentumOptimizer, ExponentialMovingAverage, PipelineOptimizer, LookaheadOptimizer, RecomputeOptimizer. 
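For reference, the dygraph migration pattern applied throughout these test updates is sketched below. The snippet is illustrative only and is not part of the patch; the small Linear model, input shape, and learning rate are placeholder choices, and it assumes Paddle's default 2.x dynamic-graph mode.
import numpy as np
import paddle
model = paddle.nn.Linear(10, 10)
# Old dygraph API being removed:
#   sgd = fluid.optimizer.SGDOptimizer(
#       learning_rate=1e-3, parameter_list=model.parameters()
#   )
# New 2.x API used by the updated tests:
sgd = paddle.optimizer.SGD(learning_rate=1e-3, parameters=model.parameters())
x = paddle.to_tensor(np.random.uniform(-1, 1, [4, 10]).astype('float32'))
loss = paddle.mean(model(x))
loss.backward()
sgd.minimize(loss)  # same backward() + minimize() pattern as the tests above
sgd.clear_grad()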
@@ -242,187 +227,6 @@ class TestImperativeOptimizerBase(unittest.TestCase): ) -class TestImperativeOptimizerPiecewiseDecay(TestImperativeOptimizerBase): - def get_optimizer_dygraph(self, parameter_list): - bd = [3, 6, 9] - optimizer = SGDOptimizer( - learning_rate=fluid.layers.piecewise_decay( - boundaries=bd, - values=[0.1 * (0.1**i) for i in range(len(bd) + 1)], - ), - parameter_list=parameter_list, - ) - return optimizer - - def get_optimizer(self): - bd = [3, 6, 9] - optimizer = SGDOptimizer( - learning_rate=paddle.optimizer.lr.PiecewiseDecay( - boundaries=bd, - values=[0.1 * (0.1**i) for i in range(len(bd) + 1)], - ) - ) - return optimizer - - def test_sgd(self): - self._check_mlp() - - -class TestImperativeOptimizerNaturalExpDecay(TestImperativeOptimizerBase): - def get_optimizer_dygraph(self, parameter_list): - optimizer = SGDOptimizer( - learning_rate=fluid.layers.natural_exp_decay( - learning_rate=0.1, - decay_steps=10000, - decay_rate=0.5, - staircase=True, - ), - parameter_list=parameter_list, - ) - return optimizer - - def get_optimizer(self): - optimizer = SGDOptimizer( - learning_rate=fluid.layers.natural_exp_decay( - learning_rate=0.1, - decay_steps=10000, - decay_rate=0.5, - staircase=True, - ) - ) - return optimizer - - def test_sgd(self): - self._check_mlp() - - -class TestImperativeOptimizerExponentialDecay(TestImperativeOptimizerBase): - def get_optimizer_dygraph(self, parameter_list): - optimizer = SGDOptimizer( - learning_rate=fluid.layers.exponential_decay( - learning_rate=0.1, - decay_steps=10000, - decay_rate=0.5, - staircase=True, - ), - parameter_list=parameter_list, - ) - return optimizer - - def get_optimizer(self): - optimizer = SGDOptimizer( - learning_rate=fluid.layers.exponential_decay( - learning_rate=0.1, - decay_steps=10000, - decay_rate=0.5, - staircase=True, - ) - ) - return optimizer - - def test_sgd(self): - self._check_mlp() - - -class TestImperativeOptimizerInverseTimeDecay(TestImperativeOptimizerBase): - def get_optimizer_dygraph(self, parameter_list): - optimizer = Adam( - learning_rate=fluid.layers.inverse_time_decay( - learning_rate=0.1, - decay_steps=10000, - decay_rate=0.5, - staircase=True, - ), - parameter_list=parameter_list, - ) - return optimizer - - def get_optimizer(self): - optimizer = Adam( - learning_rate=fluid.layers.inverse_time_decay( - learning_rate=0.1, - decay_steps=10000, - decay_rate=0.5, - staircase=True, - ) - ) - return optimizer - - def test_adam(self): - self._check_mlp() - - -class TestImperativeOptimizerPolynomialDecay(TestImperativeOptimizerBase): - def get_optimizer_dygraph(self, parameter_list): - optimizer = paddle.optimizer.SGD( - learning_rate=paddle.optimizer.lr.PolynomialDecay( - learning_rate=0.1, decay_steps=5, cycle=self.cycle - ), - parameters=parameter_list, - ) - return optimizer - - def get_optimizer(self): - optimizer = paddle.optimizer.SGD( - learning_rate=paddle.optimizer.lr.PolynomialDecay( - learning_rate=0.1, decay_steps=5, cycle=self.cycle - ) - ) - return optimizer - - def test_sgd_cycle(self): - self.cycle = True - self._check_mlp() - - def test_sgd(self): - self.cycle = False - self._check_mlp() - - -class TestImperativeOptimizerCosineDecay(TestImperativeOptimizerBase): - def get_optimizer_dygraph(self, parameter_list): - optimizer = SGDOptimizer( - learning_rate=fluid.layers.cosine_decay( - learning_rate=0.1, step_each_epoch=10000, epochs=120 - ), - parameter_list=parameter_list, - ) - return optimizer - - def get_optimizer(self): - optimizer = SGDOptimizer( - 
learning_rate=fluid.layers.cosine_decay( - learning_rate=0.1, step_each_epoch=10000, epochs=120 - ) - ) - return optimizer - - def test_sgd(self): - self._check_mlp() - - -class TestImperativeOptimizerNoamDecay(TestImperativeOptimizerBase): - def get_optimizer_dygraph(self, parameter_list): - optimizer = paddle.optimizer.SGD( - learning_rate=paddle.optimizer.lr.NoamDecay( - d_model=512, warmup_steps=8000 - ), - parameters=parameter_list, - ) - return optimizer - - def get_optimizer(self): - optimizer = paddle.optimizer.SGD( - learning_rate=paddle.optimizer.lr.NoamDecay( - d_model=512, warmup_steps=8000 - ) - ) - return optimizer - - def test_sgd(self): - self._check_mlp() - - class TestOptimizerLearningRate(unittest.TestCase): def test_constant_lr(self): with fluid.dygraph.guard(): @@ -436,17 +240,15 @@ class TestOptimizerLearningRate(unittest.TestCase): loss = paddle.mean(b) - adam = fluid.optimizer.Adam( - 0.001, parameter_list=linear.parameters() - ) + adam = paddle.optimizer.Adam(0.001, parameters=linear.parameters()) np.testing.assert_allclose( - adam.current_step_lr(), 0.001, rtol=1e-06, atol=0.0 + adam.get_lr(), 0.001, rtol=1e-06, atol=0.0 ) for i in range(10): adam.minimize(loss) - lr = adam.current_step_lr() + lr = adam.get_lr() np.testing.assert_allclose(lr, 0.001, rtol=1e-06, atol=0.0) @@ -528,23 +330,15 @@ class TestOptimizerLearningRate(unittest.TestCase): loss = paddle.mean(b) - adam = fluid.optimizer.Adam(0.1, parameter_list=linear.parameters()) + adam = paddle.optimizer.Adam(0.1, parameters=linear.parameters()) lr_list = [0.2, 0.3, 0.4, 0.5, 0.6] for i in range(5): adam.set_lr(lr_list[i]) adam.minimize(loss) - lr = adam.current_step_lr() + lr = adam.get_lr() np.testing.assert_allclose(lr, lr_list[i], rtol=1e-06, atol=0.0) - lr_var = paddle.static.create_global_var( - shape=[1], value=0.7, dtype='float32' - ) - adam.set_lr(lr_var) - adam.minimize(loss) - lr = adam.current_step_lr() - np.testing.assert_allclose(lr, 0.7, rtol=1e-06, atol=0.0) - with self.assertRaises(RuntimeError): adam = paddle.optimizer.Adam( paddle.optimizer.lr.NaturalExpDecay( @@ -556,190 +350,10 @@ class TestOptimizerLearningRate(unittest.TestCase): adam.set_lr(0.01) -class TestImperativeMomentumOptimizer(TestImperativeOptimizerBase): - def get_optimizer_dygraph(self, parameter_list): - optimizer = MomentumOptimizer( - learning_rate=0.001, momentum=0.9, parameter_list=parameter_list - ) - return optimizer - - def get_optimizer(self): - optimizer = MomentumOptimizer(learning_rate=0.001, momentum=0.9) - return optimizer - - def test_momentum(self): - self._check_mlp() - - -class TestImperativeLarsMomentumOptimizer(TestImperativeOptimizerBase): - def get_optimizer_dygraph(self, parameter_list): - optimizer = LarsMomentumOptimizer( - learning_rate=0.001, momentum=0.9, parameter_list=parameter_list - ) - return optimizer - - def get_optimizer(self): - optimizer = LarsMomentumOptimizer(learning_rate=0.001, momentum=0.9) - return optimizer - - def test_larsmomentum(self): - self._check_mlp() - - -class TestImperativeAdagradOptimizer(TestImperativeOptimizerBase): - def get_optimizer_dygraph(self, parameter_list): - optimizer = paddle.optimizer.Adagrad( - learning_rate=0.2, parameters=parameter_list - ) - return optimizer - - def get_optimizer(self): - optimizer = paddle.optimizer.Adagrad(learning_rate=0.2) - return optimizer - - def test_adagrad(self): - self._check_mlp() - - -class TestImperativeAdamaxOptimizer(TestImperativeOptimizerBase): - def get_optimizer_dygraph(self, parameter_list): - optimizer = 
paddle.optimizer.Adamax( - learning_rate=0.2, parameters=parameter_list - ) - return optimizer - - def get_optimizer(self): - optimizer = paddle.optimizer.Adamax(learning_rate=0.2) - return optimizer - - def test_adamax(self): - self._check_mlp() - - -class TestImperativeDpsgdOptimizer(TestImperativeOptimizerBase): - def get_optimizer_dygraph(self, parameter_list): - optimizer = DpsgdOptimizer( - learning_rate=0.01, - clip=10.0, - batch_size=16.0, - sigma=1.0, - parameter_list=parameter_list, - ) - optimizer._seed = 100 - return optimizer - - def get_optimizer(self): - optimizer = DpsgdOptimizer( - learning_rate=0.01, clip=10.0, batch_size=16.0, sigma=1.0 - ) - optimizer._seed = 100 - return optimizer - - def test_dpsgd(self): - self._check_mlp(place=fluid.CPUPlace()) - - -class TestImperativeDecayedAdagradOptimizer(TestImperativeOptimizerBase): - def get_optimizer_dygraph(self, parameter_list): - optimizer = DecayedAdagradOptimizer( - learning_rate=0.2, parameter_list=parameter_list - ) - return optimizer - - def get_optimizer(self): - optimizer = DecayedAdagradOptimizer(learning_rate=0.2) - return optimizer - - def test_decayadagrad(self): - self._check_mlp() - - -class TestImperativeAdadeltaOptimizer(TestImperativeOptimizerBase): - def get_optimizer_dygraph(self, parameter_list): - optimizer = paddle.optimizer.Adadelta( - learning_rate=0.0003, - epsilon=1.0e-6, - rho=0.95, - parameters=parameter_list, - ) - return optimizer - - def get_optimizer(self): - optimizer = paddle.optimizer.Adadelta( - learning_rate=0.0003, epsilon=1.0e-6, rho=0.95 - ) - return optimizer - - def test_adadelta(self): - self._check_mlp() - - -class TestImperativeRMSPropOptimizer(TestImperativeOptimizerBase): - def get_optimizer_dygraph(self, parameter_list): - optimizer = paddle.optimizer.RMSProp( - learning_rate=0.1, parameters=parameter_list - ) - return optimizer - - def get_optimizer(self): - optimizer = paddle.optimizer.RMSProp(learning_rate=0.1) - return optimizer - - def test_rmsprop(self): - self._check_mlp() - - -class TestImperativeFtrlOptimizer(TestImperativeOptimizerBase): - def get_optimizer_dygraph(self, parameter_list): - optimizer = FtrlOptimizer( - learning_rate=0.1, parameter_list=parameter_list - ) - return optimizer - - def get_optimizer(self): - optimizer = FtrlOptimizer(learning_rate=0.1) - return optimizer - - def test_ftrl(self): - self._check_mlp() - - def exclude_fn(param): return param.name.endswith('.b_0') -class TestImperativeLambOptimizer(TestImperativeOptimizerBase): - def get_optimizer_dygraph(self, parameter_list): - optimizer = paddle.optimizer.Lamb( - learning_rate=0.002, - exclude_from_weight_decay_fn=exclude_fn, - parameters=parameter_list, - ) - return optimizer - - def get_optimizer(self): - optimizer = paddle.optimizer.Lamb( - learning_rate=0.002, exclude_from_weight_decay_fn=exclude_fn - ) - return optimizer - - # should fix: may fail in CI-windows - def _test_lamb(self): - self._check_mlp() - - -class TestImperativeModelAverage(TestImperativeOptimizerBase): - def get_optimizer_dygraph(self, parameter_list): - optimizer = ModelAverage( - 0.15, min_average_window=10000, max_average_window=12500 - ) - return optimizer - - def test_modelaverage(self): - exception_message = "In dygraph, don't support ModelAverage." 
- self._check_exception(exception_message) - - class TestImperativeDGCMomentumOptimizer(TestImperativeOptimizerBase): def get_optimizer_dygraph(self, parameter_list): optimizer = DGCMomentumOptimizer( @@ -756,85 +370,6 @@ class TestImperativeDGCMomentumOptimizer(TestImperativeOptimizerBase): self._check_exception(exception_message) -class TestImperativeExponentialMovingAverage(TestImperativeOptimizerBase): - def get_optimizer_dygraph(self, parameter_list): - optimizer = ExponentialMovingAverage(0.999) - return optimizer - - def test_exponentialmoving(self): - exception_message = ( - "In dygraph, don't support ExponentialMovingAverage." - ) - self._check_exception(exception_message) - - -class TestImperativePipelineOptimizer(TestImperativeOptimizerBase): - def get_optimizer_dygraph(self, parameter_list): - optimizer = fluid.optimizer.SGD( - learning_rate=0.5, parameter_list=parameter_list - ) - optimizer = PipelineOptimizer(optimizer) - return optimizer - - def test_pipline(self): - exception_message = "In dygraph, don't support PipelineOptimizer." - self._check_exception(exception_message) - - -class TestImperativeLookaheadOptimizer(TestImperativeOptimizerBase): - def get_optimizer_dygraph(self, parameter_list): - optimizer = fluid.optimizer.SGD( - learning_rate=0.5, parameter_list=parameter_list - ) - optimizer = LookaheadOptimizer(optimizer, alpha=0.5, k=5) - return optimizer - - def test_lookahead(self): - exception_message = "In dygraph, don't support LookaheadOptimizer." - self._check_exception(exception_message) - - -class TestImperativeRecomputeOptimizer(TestImperativeOptimizerBase): - def get_optimizer_dygraph(self, parameter_list): - optimizer = fluid.optimizer.SGD( - learning_rate=0.5, parameter_list=parameter_list - ) - optimizer = RecomputeOptimizer(optimizer) - return optimizer - - def test_recompute(self): - exception_message = "In dygraph, don't support RecomputeOptimizer." 
- self._check_exception(exception_message) - - -class TestImperativeOptimizerList(unittest.TestCase): - def test_parameter_list(self): - with fluid.dygraph.guard(): - linear_1 = paddle.nn.Linear(10, 10) - linear_2 = paddle.nn.Linear(10, 10) - - sgd = SGDOptimizer( - 1.0, - parameter_list=itertools.chain( - linear_1.parameters(), linear_2.parameters() - ), - ) - - in_np = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32") - in_data = fluid.dygraph.to_variable(in_np) - - y = linear_1(in_data) - y = linear_2(y) - loss = paddle.mean(y) - loss.backward() - sgd.minimize(loss) - - self.assertTrue( - len(sgd._parameter_list) - == len(linear_1.parameters() + linear_2.parameters()) - ) - - if __name__ == '__main__': paddle.enable_static() unittest.main() diff --git a/test/legacy_test/test_imperative_optimizer_v2.py b/test/legacy_test/test_imperative_optimizer_v2.py index b9cfe1e0132a303db4f7b6672a7549dc98eb61bc..e7e0f1c4a4782e8df3e5325c048c8d75ae672464 100644 --- a/test/legacy_test/test_imperative_optimizer_v2.py +++ b/test/legacy_test/test_imperative_optimizer_v2.py @@ -22,18 +22,6 @@ import paddle from paddle import fluid from paddle.distributed.fleet.meta_optimizers import DGCMomentumOptimizer from paddle.fluid import core -from paddle.fluid.optimizer import ( - DecayedAdagradOptimizer, - DpsgdOptimizer, - ExponentialMovingAverage, - FtrlOptimizer, - LarsMomentumOptimizer, - LookaheadOptimizer, - ModelAverage, - MomentumOptimizer, - PipelineOptimizer, - RecomputeOptimizer, -) # Note(wangzhongpu) # In dygraph, don't support ModelAverage, DGCMomentumOptimizer, ExponentialMovingAverage, PipelineOptimizer, LookaheadOptimizer, RecomputeOptimizer. @@ -687,13 +675,13 @@ class TestOptimizerLearningRate(unittest.TestCase): class TestImperativeMomentumOptimizer(TestImperativeOptimizerBase): def get_optimizer_dygraph(self, parameter_list): - optimizer = MomentumOptimizer( - learning_rate=0.001, momentum=0.9, parameter_list=parameter_list + optimizer = paddle.optimizer.Momentum( + learning_rate=0.001, momentum=0.9, parameters=parameter_list ) return optimizer def get_optimizer(self): - optimizer = MomentumOptimizer(learning_rate=0.001, momentum=0.9) + optimizer = paddle.optimizer.Momentum(learning_rate=0.001, momentum=0.9) return optimizer def test_momentum(self): @@ -702,13 +690,15 @@ class TestImperativeMomentumOptimizer(TestImperativeOptimizerBase): class TestImperativeLarsMomentumOptimizer(TestImperativeOptimizerBase): def get_optimizer_dygraph(self, parameter_list): - optimizer = LarsMomentumOptimizer( + optimizer = paddle.incubate.optimizer.LarsMomentumOptimizer( learning_rate=0.001, momentum=0.9, parameter_list=parameter_list ) return optimizer def get_optimizer(self): - optimizer = LarsMomentumOptimizer(learning_rate=0.001, momentum=0.9) + optimizer = paddle.incubate.optimizer.LarsMomentumOptimizer( + learning_rate=0.001, momentum=0.9 + ) return optimizer def test_larsmomentum(self): @@ -745,44 +735,6 @@ class TestImperativeAdamaxOptimizer(TestImperativeOptimizerBase): self._check_mlp() -class TestImperativeDpsgdOptimizer(TestImperativeOptimizerBase): - def get_optimizer_dygraph(self, parameter_list): - optimizer = DpsgdOptimizer( - learning_rate=0.01, - clip=10.0, - batch_size=16.0, - sigma=1.0, - parameter_list=parameter_list, - ) - optimizer._seed = 100 - return optimizer - - def get_optimizer(self): - optimizer = DpsgdOptimizer( - learning_rate=0.01, clip=10.0, batch_size=16.0, sigma=1.0 - ) - optimizer._seed = 100 - return optimizer - - def test_dpsgd(self): - 
self._check_mlp(place=fluid.CPUPlace()) - - -class TestImperativeDecayedAdagradOptimizer(TestImperativeOptimizerBase): - def get_optimizer_dygraph(self, parameter_list): - optimizer = DecayedAdagradOptimizer( - learning_rate=0.2, parameter_list=parameter_list - ) - return optimizer - - def get_optimizer(self): - optimizer = DecayedAdagradOptimizer(learning_rate=0.2) - return optimizer - - def test_decayadagrad(self): - self._check_mlp() - - class TestImperativeAdadeltaOptimizer(TestImperativeOptimizerBase): def get_optimizer_dygraph(self, parameter_list): optimizer = paddle.optimizer.Adadelta( @@ -818,21 +770,6 @@ class TestImperativeRMSPropOptimizer(TestImperativeOptimizerBase): self._check_mlp() -class TestImperativeFtrlOptimizer(TestImperativeOptimizerBase): - def get_optimizer_dygraph(self, parameter_list): - optimizer = FtrlOptimizer( - learning_rate=0.1, parameter_list=parameter_list - ) - return optimizer - - def get_optimizer(self): - optimizer = FtrlOptimizer(learning_rate=0.1) - return optimizer - - def test_ftrl(self): - self._check_mlp() - - def exclude_fn(param): return param.name.endswith('.b_0') @@ -857,18 +794,6 @@ class TestImperativeLambOptimizer(TestImperativeOptimizerBase): self._check_mlp() -class TestImperativeModelAverage(TestImperativeOptimizerBase): - def get_optimizer_dygraph(self, parameter_list): - optimizer = ModelAverage( - 0.15, min_average_window=10000, max_average_window=12500 - ) - return optimizer - - def test_modelaverage(self): - exception_message = "In dygraph, don't support ModelAverage." - self._check_exception(exception_message) - - class TestImperativeDGCMomentumOptimizer(TestImperativeOptimizerBase): def get_optimizer_dygraph(self, parameter_list): optimizer = DGCMomentumOptimizer( @@ -887,7 +812,7 @@ class TestImperativeDGCMomentumOptimizer(TestImperativeOptimizerBase): class TestImperativeExponentialMovingAverage(TestImperativeOptimizerBase): def get_optimizer_dygraph(self, parameter_list): - optimizer = ExponentialMovingAverage(0.999) + optimizer = paddle.static.ExponentialMovingAverage(0.999) return optimizer def test_exponentialmoving(self): @@ -902,7 +827,7 @@ class TestImperativePipelineOptimizer(TestImperativeOptimizerBase): optimizer = paddle.optimizer.SGD( learning_rate=0.5, parameters=parameter_list ) - optimizer = PipelineOptimizer(optimizer) + optimizer = paddle.incubate.optimizer.PipelineOptimizer(optimizer) return optimizer def test_pipline(self): @@ -915,7 +840,9 @@ class TestImperativeLookaheadOptimizer(TestImperativeOptimizerBase): optimizer = paddle.optimizer.SGD( learning_rate=0.5, parameters=parameter_list ) - optimizer = LookaheadOptimizer(optimizer, alpha=0.5, k=5) + optimizer = paddle.incubate.optimizer.LookAhead( + optimizer, alpha=0.5, k=5 + ) return optimizer def test_lookahead(self): @@ -928,7 +855,7 @@ class TestImperativeRecomputeOptimizer(TestImperativeOptimizerBase): optimizer = paddle.optimizer.SGD( learning_rate=0.5, parameters=parameter_list ) - optimizer = RecomputeOptimizer(optimizer) + optimizer = paddle.incubate.optimizer.RecomputeOptimizer(optimizer) return optimizer def test_recompute(self): diff --git a/test/legacy_test/test_imperative_partitial_backward.py b/test/legacy_test/test_imperative_partitial_backward.py index de2dcdc9255a17a971e750d155c9edc8779ce191..77d531574a471713c508e3838ddfd96dca3d13d8 100644 --- a/test/legacy_test/test_imperative_partitial_backward.py +++ b/test/legacy_test/test_imperative_partitial_backward.py @@ -39,8 +39,8 @@ class TestImperativePartitialBackward(unittest.TestCase): for 
param in linear2.parameters(): self.assertIsNone(param._grad_ivar()) - optimizer = fluid.optimizer.AdamOptimizer( - parameter_list=(linear1.parameters() + linear2.parameters()) + optimizer = paddle.optimizer.Adam( + parameters=(linear1.parameters() + linear2.parameters()) ) _, params_grads = optimizer.minimize(loss) diff --git a/test/legacy_test/test_imperative_ptb_rnn.py b/test/legacy_test/test_imperative_ptb_rnn.py index 422a95e820f1cbf080cc772b330d6e616d98a012..e3790fa2b1931cf562ae3e1615037ce00849fae9 100644 --- a/test/legacy_test/test_imperative_ptb_rnn.py +++ b/test/legacy_test/test_imperative_ptb_rnn.py @@ -22,7 +22,6 @@ import paddle from paddle import fluid from paddle.fluid import core, framework from paddle.fluid.dygraph.base import to_variable -from paddle.fluid.optimizer import SGDOptimizer from paddle.nn import Embedding @@ -264,8 +263,8 @@ class TestDygraphPtbRnn(unittest.TestCase): is_sparse=is_sparse, ) - sgd = SGDOptimizer( - learning_rate=1e-3, parameter_list=ptb_model.parameters() + sgd = paddle.optimizer.SGD( + learning_rate=1e-3, parameters=ptb_model.parameters() ) dy_param_updated = {} dy_param_init = {} @@ -326,7 +325,7 @@ class TestDygraphPtbRnn(unittest.TestCase): if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0) ) - sgd = SGDOptimizer(learning_rate=1e-3) + sgd = paddle.optimizer.SGD(learning_rate=1e-3) x = paddle.static.data( name="x", shape=[-1, num_steps], dtype='int64' ) diff --git a/test/legacy_test/test_imperative_ptb_rnn_sorted_gradient.py b/test/legacy_test/test_imperative_ptb_rnn_sorted_gradient.py index 6164ecaeedf8d2350443ad94ea5be8e3c6aff339..972c38f4b6fa59185c8e0b91e1699334d2ae20d6 100644 --- a/test/legacy_test/test_imperative_ptb_rnn_sorted_gradient.py +++ b/test/legacy_test/test_imperative_ptb_rnn_sorted_gradient.py @@ -22,7 +22,6 @@ import paddle from paddle import fluid from paddle.fluid import core, framework from paddle.fluid.dygraph.base import to_variable -from paddle.fluid.optimizer import SGDOptimizer class TestDygraphPtbRnnSortGradient(unittest.TestCase): @@ -55,8 +54,8 @@ class TestDygraphPtbRnnSortGradient(unittest.TestCase): is_sparse=is_sparse, ) - sgd = SGDOptimizer( - learning_rate=1e-3, parameter_list=ptb_model.parameters() + sgd = paddle.optimizer.SGD( + learning_rate=1e-3, parameters=ptb_model.parameters() ) dy_param_updated = {} dy_param_init = {} @@ -114,7 +113,7 @@ class TestDygraphPtbRnnSortGradient(unittest.TestCase): if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0) ) - sgd = SGDOptimizer(learning_rate=1e-3) + sgd = paddle.optimizer.SGD(learning_rate=1e-3) x = paddle.static.data( name="x", shape=[-1, num_steps, 1], dtype='int64' ) diff --git a/test/legacy_test/test_imperative_reinforcement.py b/test/legacy_test/test_imperative_reinforcement.py index fe35c1d33f63f5e289288fbfae45b158fced6c13..481b78d613491b5620769a86465f3bcbbe7efe31 100644 --- a/test/legacy_test/test_imperative_reinforcement.py +++ b/test/legacy_test/test_imperative_reinforcement.py @@ -21,7 +21,6 @@ import paddle import paddle.nn.functional as F from paddle import fluid from paddle.fluid import core -from paddle.fluid.optimizer import SGDOptimizer class Policy(paddle.nn.Layer): @@ -81,8 +80,8 @@ class TestImperativeMnist(unittest.TestCase): loss_probs = paddle.multiply(dy_reward, loss_probs) loss = paddle.sum(loss_probs) - sgd = SGDOptimizer( - learning_rate=1e-3, parameter_list=policy.parameters() + sgd = paddle.optimizer.SGD( + learning_rate=1e-3, parameters=policy.parameters() ) dy_param_init_value = {} @@ -124,7 +123,7 @@ class 
TestImperativeMnist(unittest.TestCase): policy = Policy(input_size=4) - st_sgd = SGDOptimizer(learning_rate=1e-3) + st_sgd = paddle.optimizer.SGD(learning_rate=1e-3) st_state = paddle.static.data( name='st_state', shape=[-1, 4], dtype='float32' diff --git a/test/legacy_test/test_imperative_resnet.py b/test/legacy_test/test_imperative_resnet.py index d130f97686a1ec2ce4d7fbe5a4a7e10712c7bc18..4bb78f64d31258e0396d94c2a9328c6c139591d6 100644 --- a/test/legacy_test/test_imperative_resnet.py +++ b/test/legacy_test/test_imperative_resnet.py @@ -59,11 +59,11 @@ def optimizer_setting(params, parameter_list=None): lr = [] lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)] if fluid.in_dygraph_mode(): - optimizer = fluid.optimizer.SGD( - learning_rate=0.01, parameter_list=parameter_list + optimizer = paddle.optimizer.SGD( + learning_rate=0.01, parameters=parameter_list ) else: - optimizer = fluid.optimizer.SGD(learning_rate=0.01) + optimizer = paddle.optimizer.SGD(learning_rate=0.01) # TODO(minqiyang): Add learning rate scheduler support to dygraph mode # optimizer = fluid.optimizer.Momentum( # learning_rate=params["lr"], diff --git a/test/legacy_test/test_imperative_resnet_sorted_gradient.py b/test/legacy_test/test_imperative_resnet_sorted_gradient.py index c537da047b83ded8c36bb4837630844e8682fa1a..98bdd0c8ccb075692380f100428a91951e6efb5e 100644 --- a/test/legacy_test/test_imperative_resnet_sorted_gradient.py +++ b/test/legacy_test/test_imperative_resnet_sorted_gradient.py @@ -55,11 +55,11 @@ def optimizer_setting(params, parameter_list=None): lr = [] lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)] if fluid.in_dygraph_mode(): - optimizer = fluid.optimizer.SGD( - learning_rate=0.01, parameter_list=parameter_list + optimizer = paddle.optimizer.SGD( + learning_rate=0.01, parameters=parameter_list ) else: - optimizer = fluid.optimizer.SGD(learning_rate=0.01) + optimizer = paddle.optimizer.SGD(learning_rate=0.01) # TODO(minqiyang): Add learning rate scheduler support to dygraph mode # optimizer = fluid.optimizer.Momentum( # learning_rate=params["lr"], diff --git a/test/legacy_test/test_imperative_se_resnext.py b/test/legacy_test/test_imperative_se_resnext.py index 0675119126f5be9a081725a5cb4ba751be5e21bd..09feff3444edd3b32c5969f63ba9ec05b68c6f1c 100644 --- a/test/legacy_test/test_imperative_se_resnext.py +++ b/test/legacy_test/test_imperative_se_resnext.py @@ -55,11 +55,11 @@ def optimizer_setting(params, parameter_list=None): # base_lr = params["lr"] # lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)] if fluid.in_dygraph_mode(): - optimizer = fluid.optimizer.SGD( - learning_rate=0.01, parameter_list=parameter_list + optimizer = paddle.optimizer.SGD( + learning_rate=0.01, parameters=parameter_list ) else: - optimizer = fluid.optimizer.SGD(learning_rate=0.01) + optimizer = paddle.optimizer.SGD(learning_rate=0.01) return optimizer diff --git a/test/legacy_test/test_imperative_selected_rows.py b/test/legacy_test/test_imperative_selected_rows.py index 386217f64d0a212d3ea4125d048eebf0da86c505..5f04e8d28a19f4613c74c6afa632b1c1b7558915 100644 --- a/test/legacy_test/test_imperative_selected_rows.py +++ b/test/legacy_test/test_imperative_selected_rows.py @@ -20,7 +20,6 @@ import paddle from paddle import fluid from paddle.fluid import core from paddle.fluid.dygraph.base import to_variable -from paddle.fluid.optimizer import SGDOptimizer class SimpleNet(paddle.nn.Layer): @@ -57,9 +56,9 @@ class TestSimpleNet(unittest.TestCase): input = paddle.to_tensor(input_word) simplenet = SimpleNet(20, 32, dtype) - 
adam = SGDOptimizer( + adam = paddle.optimizer.SGD( learning_rate=0.001, - parameter_list=simplenet.parameters(), + parameters=simplenet.parameters(), ) # grad_clip=grad_clip input_emb, emb = simplenet(input) input_emb.retain_grads() @@ -95,9 +94,9 @@ class TestSimpleNet(unittest.TestCase): input = to_variable(input_word) simplenet = SimpleNet(20, 32, "float32") - adam = SGDOptimizer( + adam = paddle.optimizer.SGD( learning_rate=0.001, - parameter_list=simplenet.parameters(), + parameters=simplenet.parameters(), grad_clip=grad_clip, ) input_emb, emb = simplenet(input) diff --git a/test/legacy_test/test_imperative_selected_rows_to_lod_tensor.py b/test/legacy_test/test_imperative_selected_rows_to_lod_tensor.py index 844b4081ebce0619b7fa43cd2c7eb8af5e12fd5a..4f7e4780e933681b33da37132bf0aea00210dd43 100644 --- a/test/legacy_test/test_imperative_selected_rows_to_lod_tensor.py +++ b/test/legacy_test/test_imperative_selected_rows_to_lod_tensor.py @@ -21,7 +21,6 @@ import paddle from paddle import fluid from paddle.fluid import core, framework from paddle.fluid.dygraph.base import to_variable -from paddle.fluid.optimizer import SGDOptimizer from paddle.nn import Embedding @@ -125,9 +124,9 @@ class TestDygraphSimpleNet(unittest.TestCase): dtype=dtype, ) - sgd = SGDOptimizer( + sgd = paddle.optimizer.SGD( learning_rate=1e-3, - parameter_list=simple_net.parameters(), + parameters=simple_net.parameters(), ) dy_param_updated = {} dy_param_init = {} @@ -172,7 +171,7 @@ class TestDygraphSimpleNet(unittest.TestCase): ) exe = fluid.Executor(place) - sgd = SGDOptimizer(learning_rate=1e-3) + sgd = paddle.optimizer.SGD(learning_rate=1e-3) x = paddle.static.data( name="x", shape=[-1, num_steps], dtype='int64' ) diff --git a/test/legacy_test/test_imperative_star_gan_with_gradient_penalty.py b/test/legacy_test/test_imperative_star_gan_with_gradient_penalty.py index 0eda5bcc3493392965365acbf94dab2f97407c02..62476046e8ab2a19a031153bbf19413af6304284 100644 --- a/test/legacy_test/test_imperative_star_gan_with_gradient_penalty.py +++ b/test/legacy_test/test_imperative_star_gan_with_gradient_penalty.py @@ -482,14 +482,14 @@ def build_optimizer(layer, cfg, loss=None): beta1 = 0.5 beta2 = 0.999 if fluid.in_dygraph_mode(): - return fluid.optimizer.Adam( + return paddle.optimizer.Adam( learning_rate=learning_rate, beta1=beta1, beta2=beta2, - parameter_list=layer.parameters(), + parameters=layer.parameters(), ) else: - optimizer = fluid.optimizer.Adam( + optimizer = paddle.optimizer.Adam( learning_rate=learning_rate, beta1=beta1, beta2=beta2 ) diff --git a/test/legacy_test/test_imperative_trace_non_persistable_inputs.py b/test/legacy_test/test_imperative_trace_non_persistable_inputs.py index cf8f1c59e829edb027d302962d1254596c390133..b90b5d47b899463d7599c525f083c44c17423a5e 100644 --- a/test/legacy_test/test_imperative_trace_non_persistable_inputs.py +++ b/test/legacy_test/test_imperative_trace_non_persistable_inputs.py @@ -44,8 +44,8 @@ class TestTracedLayerRecordNonPersistableInput(unittest.TestCase): batch_size = 4 fc_size = 2 layer = SimpleFCLayer(feature_size, batch_size, fc_size) - optimizer = fluid.optimizer.SGD( - learning_rate=1e-3, parameter_list=layer.parameters() + optimizer = paddle.optimizer.SGD( + learning_rate=1e-3, parameters=layer.parameters() ) expected_persistable_vars = { diff --git a/test/legacy_test/test_imperative_transformer_sorted_gradient.py b/test/legacy_test/test_imperative_transformer_sorted_gradient.py index e430e95b43d5ae7708933d881ad343e1f34798bc..2d724c080cb7ebbd841d2ce700b7234227d4a358 
100644 --- a/test/legacy_test/test_imperative_transformer_sorted_gradient.py +++ b/test/legacy_test/test_imperative_transformer_sorted_gradient.py @@ -1142,16 +1142,16 @@ class TestDygraphTransformerSortGradient(unittest.TestCase): ) with fluid.default_main_program()._lr_schedule_guard(): learning_rate = lr_decay * TrainTaskConfig.learning_rate - optimizer = fluid.optimizer.Adam( + optimizer = paddle.optimizer.Adam( learning_rate=learning_rate, beta1=TrainTaskConfig.beta1, beta2=TrainTaskConfig.beta2, epsilon=TrainTaskConfig.eps, - parameter_list=transformer.parameters(), + parameters=transformer.parameters(), ) else: - optimizer = fluid.optimizer.SGD( - learning_rate=0.003, parameter_list=transformer.parameters() + optimizer = paddle.optimizer.SGD( + learning_rate=0.003, parameters=transformer.parameters() ) dy_param_init = {} dy_param_updated = {} @@ -1220,7 +1220,7 @@ class TestDygraphTransformerSortGradient(unittest.TestCase): if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0) ) - optimizer = fluid.optimizer.SGD(learning_rate=0.003) + optimizer = paddle.optimizer.SGD(learning_rate=0.003) data_input_names = ( encoder_data_input_fields diff --git a/test/legacy_test/test_infer_no_need_buffer_slots.py b/test/legacy_test/test_infer_no_need_buffer_slots.py index 3f49a0a5b9d7332676ece9d7ba7edeb9a4f127d4..53f9212c7db72227db57eebf4695f3184b010e73 100644 --- a/test/legacy_test/test_infer_no_need_buffer_slots.py +++ b/test/legacy_test/test_infer_no_need_buffer_slots.py @@ -39,7 +39,7 @@ class TestInferNoNeedBufferSlots(unittest.TestCase): startup_program = framework.Program() with fluid.program_guard(program, startup_program): loss = self.net() - sgd = fluid.optimizer.SGD(learning_rate=0.01) + sgd = paddle.optimizer.SGD(learning_rate=0.01) sgd.minimize(loss) block = program.global_block() diff --git a/test/legacy_test/test_inference_model_io.py b/test/legacy_test/test_inference_model_io.py index b14f633086d700f5c071832c7b6d8a89c8aad102..f2e86a351d6fcced4b5cabe184da8645c642add1 100644 --- a/test/legacy_test/test_inference_model_io.py +++ b/test/legacy_test/test_inference_model_io.py @@ -26,7 +26,7 @@ from paddle.distributed.io import ( load_inference_model_distributed, save_persistables, ) -from paddle.fluid import core, executor, optimizer +from paddle.fluid import core, executor from paddle.fluid.compiler import CompiledProgram from paddle.fluid.framework import Program, program_guard from paddle.fluid.io import load_inference_model, save_inference_model @@ -61,7 +61,7 @@ class TestBook(unittest.TestCase): ) avg_cost = paddle.mean(cost) - sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.001) + sgd_optimizer = paddle.optimizer.SGD(learning_rate=0.001) sgd_optimizer.minimize(avg_cost, init_program) place = core.CPUPlace() @@ -265,7 +265,7 @@ class TestSaveInferenceModelNew(unittest.TestCase): ) avg_cost = paddle.mean(cost) - sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.001) + sgd_optimizer = paddle.optimizer.SGD(learning_rate=0.001) sgd_optimizer.minimize(avg_cost, init_program) place = core.CPUPlace() @@ -444,7 +444,7 @@ class TestSaveInferenceModelNew(unittest.TestCase): ) avg_cost = paddle.mean(cost) - sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.001) + sgd_optimizer = paddle.optimizer.SGD(learning_rate=0.001) sgd_optimizer.minimize(avg_cost, init_program) place = core.CPUPlace() @@ -493,7 +493,7 @@ class TestSaveInferenceModelNew(unittest.TestCase): ) avg_cost = paddle.mean(cost) - sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.001) + sgd_optimizer = 
paddle.optimizer.SGD(learning_rate=0.001) sgd_optimizer.minimize(avg_cost, init_program) place = core.CPUPlace() diff --git a/test/legacy_test/test_inplace_abn_op.py b/test/legacy_test/test_inplace_abn_op.py index 0e4344a16ee86d2a78ca0595bf20abc556e864a5..94c04ba41e4e180987b4e8a53aae38cbf8a13b04 100644 --- a/test/legacy_test/test_inplace_abn_op.py +++ b/test/legacy_test/test_inplace_abn_op.py @@ -78,7 +78,7 @@ class TestInplaceANBOpTraining(unittest.TestCase): sigmoid = paddle.nn.functional.sigmoid(bn) out = paddle.sum(sigmoid) if not only_forward: - sgd_opt = fluid.optimizer.SGD(learning_rate=0.0) + sgd_opt = paddle.optimizer.SGD(learning_rate=0.0) sgd_opt.backward(out) return main, startup, [out, bn] diff --git a/test/legacy_test/test_inplace_addto_strategy.py b/test/legacy_test/test_inplace_addto_strategy.py index 34385b6814f4185742dca60f3e116879e75d0458..225b460f1ae4aaed8a6eb641251bb6c41f693167 100644 --- a/test/legacy_test/test_inplace_addto_strategy.py +++ b/test/legacy_test/test_inplace_addto_strategy.py @@ -72,7 +72,7 @@ def create_program(data_format="NCHW"): loss = paddle.sum(y) - sgd = fluid.optimizer.SGD(learning_rate=0.01) + sgd = paddle.optimizer.SGD(learning_rate=0.01) sgd.minimize(loss) return loss, main, startup, conv._conv.weight diff --git a/test/legacy_test/test_jit_save_load.py b/test/legacy_test/test_jit_save_load.py index 7f58638e7ac7ad178372817cd669397568a0e4c8..5fce01a87ef509b304eccce5f5e40c66793195b0 100644 --- a/test/legacy_test/test_jit_save_load.py +++ b/test/legacy_test/test_jit_save_load.py @@ -300,7 +300,7 @@ class LinearNetWithMultiStaticFunc(paddle.nn.Layer): def train(layer, input_size=784, label_size=1): # create optimizer - sgd = fluid.optimizer.SGDOptimizer( - learning_rate=0.01, parameter_list=layer.parameters() + sgd = paddle.optimizer.SGD( + learning_rate=0.01, parameters=layer.parameters() ) # create data loader @@ -328,8 +328,8 @@ def train(layer, input_size=784, label_size=1): def train_with_label(layer, input_size=784, label_size=1): # create optimizer - sgd = fluid.optimizer.SGDOptimizer( - learning_rate=0.01, parameter_list=layer.parameters() + sgd = paddle.optimizer.SGD( + learning_rate=0.01, parameters=layer.parameters() ) # create data loader train_loader = fluid.io.DataLoader.from_generator(capacity=5) @@ -671,8 +671,8 @@ class TestJitSaveLoadConfig(unittest.TestCase): def test_output_spec(self): train_layer = LinearNetReturnLoss(8, 8) - adam = fluid.optimizer.AdamOptimizer( - learning_rate=0.1, parameter_list=train_layer.parameters() + adam = paddle.optimizer.Adam( + learning_rate=0.1, parameters=train_layer.parameters() ) x = fluid.dygraph.to_variable( np.random.random((4, 8)).astype('float32') ) @@ -779,8 +779,8 @@ class TestJitPruneModelAndLoad(unittest.TestCase): def train_and_save(self): train_layer = LinearNetReturnHidden(8, 8) - adam = fluid.optimizer.AdamOptimizer( - learning_rate=0.1, parameter_list=train_layer.parameters() + adam = paddle.optimizer.Adam( + learning_rate=0.1, parameters=train_layer.parameters() ) x = fluid.dygraph.to_variable( np.random.random((4, 8)).astype('float32') ) diff --git a/test/legacy_test/test_learning_rate_scheduler.py b/test/legacy_test/test_learning_rate_scheduler.py index e58478d1ee64e3120a11c73f67bf182a6a27e957..0ff27eec5ad2067e601095cc93ce6fdc62dccff1 100644 --- a/test/legacy_test/test_learning_rate_scheduler.py +++ b/test/legacy_test/test_learning_rate_scheduler.py @@ -132,16 +132,16 @@ class TestLearningRateDecayDygraph(unittest.TestCase): learning_rate=1.0, factor=0.5, patience=5, cooldown=3 ) - adam1 = fluid.optimizer.Adam( + adam1 = 
paddle.optimizer.Adam( learning_rate=Exponential_scheduler, - parameter_list=linear.parameters(), + parameters=linear.parameters(), ) - adam2 = fluid.optimizer.Adam( - learning_rate=Step_scheduler, parameter_list=linear.parameters() + adam2 = paddle.optimizer.Adam( + learning_rate=Step_scheduler, parameters=linear.parameters() ) - adam3 = fluid.optimizer.Adam( + adam3 = paddle.optimizer.Adam( learning_rate=Reducelr_scheduler, - parameter_list=linear.parameters(), + parameters=linear.parameters(), ) print(adam3.state_dict()) @@ -172,66 +172,66 @@ class TestLearningRateDecayDygraph(unittest.TestCase): paddle.save(adam1.state_dict(), "save_path.pdopt") opt_state = paddle.load("save_path.pdopt") - adam_test = fluid.optimizer.Adam( + adam_test = paddle.optimizer.Adam( learning_rate=Exponential_scheduler_test, - parameter_list=linear.parameters(), + parameters=linear.parameters(), ) - adam_test.set_dict(opt_state) + adam_test.set_state_dict(opt_state) self.assertEqual( adam_test._learning_rate.last_epoch, adam1._learning_rate.last_epoch, - "last_epoch is different before and after set_dict", + "last_epoch is different before and after set_state_dict", ) paddle.save(adam2.state_dict(), "save_path.pdopt") opt_state = paddle.load("save_path.pdopt") - adam_test = fluid.optimizer.Adam( + adam_test = paddle.optimizer.Adam( learning_rate=Step_scheduler_test, - parameter_list=linear.parameters(), + parameters=linear.parameters(), ) - adam_test.set_dict(opt_state) + adam_test.set_state_dict(opt_state) self.assertEqual( adam_test._learning_rate.last_epoch, adam2._learning_rate.last_epoch, - "epoch_num is different before and after set_dict", + "epoch_num is different before and after set_state_dict", ) self.assertEqual( adam_test._learning_rate(), adam2._learning_rate(), - "current learning rate is different before and after set_dict", + "current learning rate is different before and after set_state_dict", ) paddle.save(adam3.state_dict(), "save_path.pdopt") opt_state = paddle.load("save_path.pdopt") - adam_test = fluid.optimizer.Adam( + adam_test = paddle.optimizer.Adam( learning_rate=Reducelr_scheduler_test, - parameter_list=linear.parameters(), + parameters=linear.parameters(), ) - adam_test.set_dict(opt_state) + adam_test.set_state_dict(opt_state) self.assertEqual( adam_test._learning_rate.best, adam3._learning_rate.best, - "best_loss is different before and after set_dict", + "best_loss is different before and after set_state_dict", ) self.assertEqual( adam_test._learning_rate.cooldown_counter, adam3._learning_rate.cooldown_counter, - "cooldown_counter is different before and after set_dict", + "cooldown_counter is different before and after set_state_dict", ) self.assertEqual( adam_test._learning_rate.num_bad_epochs, adam3._learning_rate.num_bad_epochs, - "num_bad_epochs is different before and after set_dict", + "num_bad_epochs is different before and after set_state_dict", ) self.assertEqual( adam_test._learning_rate.last_epoch, adam3._learning_rate.last_epoch, - "epoch is different before and after set_dict", + "epoch is different before and after set_state_dict", ) self.assertEqual( adam_test._learning_rate(), adam3._learning_rate(), - "current learning rate is different before and after set_dict", + "current learning rate is different before and after set_state_dict", ) def test_NoamDecay(self): @@ -368,8 +368,8 @@ class TestLearningRateDecayDygraph(unittest.TestCase): ) linear = paddle.nn.Linear(10, 10) - adam = fluid.optimizer.Adam( - scheduler, parameter_list=linear.parameters() + adam = 
paddle.optimizer.Adam( + scheduler, parameters=linear.parameters() ) for epoch in range(30): diff --git a/test/legacy_test/test_listen_and_serv_op.py b/test/legacy_test/test_listen_and_serv_op.py index 97b0eb435c92fcf10d6a9a595c8fa8c22745ac33..2ff2bddfcf2bf49592dcec12cea30b6156d1ed43 100644 --- a/test/legacy_test/test_listen_and_serv_op.py +++ b/test/legacy_test/test_listen_and_serv_op.py @@ -40,7 +40,7 @@ def run_pserver(use_cuda, sync_mode, ip, port, trainers, trainer_id): avg_cost = paddle.mean(cost) # optimizer - sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001) + sgd_optimizer = paddle.optimizer.SGD(learning_rate=0.001) sgd_optimizer.minimize(avg_cost) place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() @@ -77,7 +77,7 @@ def run_pserver_with_empty_block( avg_cost = paddle.mean(cost) # optimizer - sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001) + sgd_optimizer = paddle.optimizer.SGD(learning_rate=0.001) sgd_optimizer.minimize(avg_cost) place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() diff --git a/test/legacy_test/test_load_state_dict_from_old_format.py b/test/legacy_test/test_load_state_dict_from_old_format.py index 5a261f81cb281ac7790f6fae4b8f068a8e4ce78f..4cb19ae90e01b070144eb38ad01393e28f18268a 100644 --- a/test/legacy_test/test_load_state_dict_from_old_format.py +++ b/test/legacy_test/test_load_state_dict_from_old_format.py @@ -57,7 +57,7 @@ def static_train_net(img, label): ) avg_loss = paddle.mean(loss) - optimizer = fluid.optimizer.SGD(learning_rate=0.001) + optimizer = paddle.optimizer.SGD(learning_rate=0.001) optimizer.minimize(avg_loss) return prediction, avg_loss diff --git a/test/legacy_test/test_lookup_table_v2_op.py b/test/legacy_test/test_lookup_table_v2_op.py index 586e347cf00b847e287e9be1d8adb71bd432e675..c02ce3f95be8457f2dfac6cc4df70d9054169b7d 100644 --- a/test/legacy_test/test_lookup_table_v2_op.py +++ b/test/legacy_test/test_lookup_table_v2_op.py @@ -223,7 +223,7 @@ class TestLookupTableIsSparse(unittest.TestCase): loss = paddle.nn.functional.square_error_cost(input=y, label=y_) loss = paddle.mean(loss) - sgd_optimizer = fluid.optimizer.SGD(learning_rate=1e-4) + sgd_optimizer = paddle.optimizer.SGD(learning_rate=1e-4) sgd_optimizer.minimize(loss) place = fluid.CPUPlace() diff --git a/test/legacy_test/test_mix_precision_all_reduce_fuse.py b/test/legacy_test/test_mix_precision_all_reduce_fuse.py index cf860365724a3debd887a78e26bb9914c166aae6..ef81e5cc3bf71e62d7b94e38ff58f3999427c5e8 100644 --- a/test/legacy_test/test_mix_precision_all_reduce_fuse.py +++ b/test/legacy_test/test_mix_precision_all_reduce_fuse.py @@ -20,7 +20,6 @@ from parallel_executor_test_base import DeviceType, TestParallelExecutorBase from simple_nets import init_data import paddle -from paddle import fluid from paddle.fluid import core batch_size = 12 @@ -66,7 +65,7 @@ def conv_net(use_feed): def _optimizer(learning_rate=1e-6): - optimizer = fluid.optimizer.SGD(learning_rate=learning_rate) + optimizer = paddle.optimizer.SGD(learning_rate=learning_rate) return optimizer diff --git a/test/legacy_test/test_model.py b/test/legacy_test/test_model.py index 2a1fafec22665d707eeeb7391323c7698278ad49..35649cce6bc545b7c8350207137d6c1792a5034e 100644 --- a/test/legacy_test/test_model.py +++ b/test/legacy_test/test_model.py @@ -149,8 +149,8 @@ def compute_acc(pred, label): def dynamic_train(model, dataloader): - optim = fluid.optimizer.Adam( - learning_rate=0.001, parameter_list=model.parameters() + optim = paddle.optimizer.Adam( + learning_rate=0.001, 
parameters=model.parameters() ) model.train() for inputs, labels in dataloader: @@ -280,8 +280,8 @@ class TestModel(unittest.TestCase): paddle.framework.random._manual_program_seed(seed) net = LeNet() - optim_new = fluid.optimizer.Adam( - learning_rate=0.001, parameter_list=net.parameters() + optim_new = paddle.optimizer.Adam( + learning_rate=0.001, parameters=net.parameters() ) model = Model(net, inputs=self.inputs, labels=self.labels) model.prepare( @@ -346,8 +346,8 @@ class TestModel(unittest.TestCase): paddle.framework.random._manual_program_seed(seed) net = LeNet() - optim_new = fluid.optimizer.Adam( - learning_rate=0.001, parameter_list=net.parameters() + optim_new = paddle.optimizer.Adam( + learning_rate=0.001, parameters=net.parameters() ) model = Model(net, inputs=tuple(self.inputs), labels=tuple(self.labels)) model.prepare( @@ -497,8 +497,8 @@ class TestModelFunction(unittest.TestCase): fluid.enable_dygraph(fluid.CPUPlace()) self.set_seed() m = MyModel() - optim = fluid.optimizer.SGD( - learning_rate=0.001, parameter_list=m.parameters() + optim = paddle.optimizer.SGD( + learning_rate=0.001, parameters=m.parameters() ) m.train() output = m(to_tensor(data)) @@ -517,8 +517,8 @@ class TestModelFunction(unittest.TestCase): self.set_seed() net = MyModel() - optim2 = fluid.optimizer.SGD( - learning_rate=0.001, parameter_list=net.parameters() + optim2 = paddle.optimizer.SGD( + learning_rate=0.001, parameters=net.parameters() ) inputs = [InputSpec([None, dim], 'float32', 'x')] @@ -566,8 +566,8 @@ class TestModelFunction(unittest.TestCase): net = MyModel() inputs = [InputSpec([None, 20], 'float32', 'x')] labels = [InputSpec([None, 1], 'int64', 'label')] - optim = fluid.optimizer.SGD( - learning_rate=0.001, parameter_list=net.parameters() + optim = paddle.optimizer.SGD( + learning_rate=0.001, parameters=net.parameters() ) model = Model(net, inputs, labels) model.prepare( @@ -595,8 +595,8 @@ class TestModelFunction(unittest.TestCase): learning_rate=0.001, parameters=net.parameters() ) else: - optim = fluid.optimizer.Adam( - learning_rate=0.001, parameter_list=net.parameters() + optim = paddle.optimizer.Adam( + learning_rate=0.001, parameters=net.parameters() ) model = Model(net, inputs, labels) model.prepare( @@ -618,8 +618,8 @@ class TestModelFunction(unittest.TestCase): device = paddle.set_device('cpu') fluid.enable_dygraph(device) model = Model(MyModel()) - optim = fluid.optimizer.SGD( - learning_rate=0.001, parameter_list=model.parameters() + optim = paddle.optimizer.SGD( + learning_rate=0.001, parameters=model.parameters() ) model.prepare(optimizer=optim, loss=CrossEntropyLoss(reduction="sum")) model.save(path) @@ -628,8 +628,8 @@ class TestModelFunction(unittest.TestCase): inputs = [InputSpec([None, 20], 'float32', 'x')] labels = [InputSpec([None, 1], 'int64', 'label')] model = Model(MyModel(), inputs, labels) - optim = fluid.optimizer.SGD( - learning_rate=0.001, parameter_list=model.parameters() + optim = paddle.optimizer.SGD( + learning_rate=0.001, parameters=model.parameters() ) model.prepare(optimizer=optim, loss=CrossEntropyLoss(reduction="sum")) model.load(path) @@ -644,8 +644,8 @@ class TestModelFunction(unittest.TestCase): net = MyModel() inputs = [InputSpec([None, 20], 'float32', 'x')] labels = [InputSpec([None, 1], 'int64', 'label')] - optim = fluid.optimizer.SGD( - learning_rate=0.001, parameter_list=net.parameters() + optim = paddle.optimizer.SGD( + learning_rate=0.001, parameters=net.parameters() ) model = Model(net, inputs, labels) model.prepare(optimizer=optim, 
loss=CrossEntropyLoss(reduction="sum")) @@ -657,8 +657,8 @@ class TestModelFunction(unittest.TestCase): net = MyModel() inputs = [InputSpec([None, 20], 'float32', 'x')] labels = [InputSpec([None, 1], 'int64', 'label')] - optim = fluid.optimizer.SGD( - learning_rate=0.001, parameter_list=net.parameters() + optim = paddle.optimizer.SGD( + learning_rate=0.001, parameters=net.parameters() ) model = Model(net, inputs, labels) model.prepare(optimizer=optim, loss=CrossEntropyLoss(reduction="sum")) @@ -881,8 +881,8 @@ class TestModelFunction(unittest.TestCase): for initial in ["fit", "train_batch", "eval_batch", "predict_batch"]: net = LeNet() model = Model(net) - optim = fluid.optimizer.Adam( - learning_rate=0.001, parameter_list=model.parameters() + optim = paddle.optimizer.Adam( + learning_rate=0.001, parameters=model.parameters() ) model.prepare( optimizer=optim, loss=CrossEntropyLoss(reduction="sum") @@ -912,8 +912,8 @@ class TestModelFunction(unittest.TestCase): net = LeNet() inputs = InputSpec([None, 1, 28, 28], 'float32', 'x') model = Model(net, inputs) - optim = fluid.optimizer.Adam( - learning_rate=0.001, parameter_list=model.parameters() + optim = paddle.optimizer.Adam( + learning_rate=0.001, parameters=model.parameters() ) model.prepare(optimizer=optim, loss=CrossEntropyLoss(reduction="sum")) model.save(save_dir, training=False) @@ -926,8 +926,8 @@ class TestModelFunction(unittest.TestCase): data = np.random.random(size=(4, dim)).astype(np.float32) label = np.random.randint(0, 10, size=(4, 1)).astype(np.int64) net = MyModel() - optim = fluid.optimizer.SGD( - learning_rate=0.001, parameter_list=net.parameters() + optim = paddle.optimizer.SGD( + learning_rate=0.001, parameters=net.parameters() ) inputs = [InputSpec([None, dim], 'float32', 'x')] labels = [InputSpec([None, 1], 'int64', 'label')] diff --git a/test/legacy_test/test_momentum_op.py b/test/legacy_test/test_momentum_op.py index d59cfab7d1793550eb45b07f340c555c1e211bf5..23dbab84e78ed195b052b0b09b316237ff63b85a 100644 --- a/test/legacy_test/test_momentum_op.py +++ b/test/legacy_test/test_momentum_op.py @@ -819,11 +819,11 @@ class TestMomentumOpVsMomentumOpWithDecayAPI(unittest.TestCase): weight_attr=paddle.nn.initializer.Constant(value=2.0), bias_attr=paddle.nn.initializer.Constant(value=2.0), ) - momentum_old = paddle.fluid.optimizer.Momentum( + momentum_old = paddle.optimizer.Momentum( learning_rate=0.01, momentum=0.9, - parameter_list=linear_old.parameters(), - regularization=paddle.regularizer.L2Decay(coeff=0.1), + parameters=linear_old.parameters(), + weight_decay=paddle.regularizer.L2Decay(coeff=0.1), ) self.__update_params(momentum=momentum_old, linear=linear_old) diff --git a/test/legacy_test/test_multiprocess_dataloader_dynamic.py b/test/legacy_test/test_multiprocess_dataloader_dynamic.py index 4cf7801bfa48b3110b51f024294ba12c2d68ade3..debc1cbf2063065f4cd66168de7884ae220c2d09 100644 --- a/test/legacy_test/test_multiprocess_dataloader_dynamic.py +++ b/test/legacy_test/test_multiprocess_dataloader_dynamic.py @@ -80,7 +80,7 @@ class TestDygraphDataLoader(unittest.TestCase): fluid.default_main_program().random_seed = 1 with fluid.dygraph.guard(places[0]): fc_net = SimpleFCNet() - optimizer = fluid.optimizer.Adam(parameter_list=fc_net.parameters()) + optimizer = paddle.optimizer.Adam(parameters=fc_net.parameters()) dataset = RandomDataset(SAMPLE_NUM, CLASS_NUM) dataloader = DataLoader( @@ -151,7 +151,7 @@ class TestDygraphDataLoaderWithBatchedDataset(TestDygraphDataLoader): fluid.default_main_program().random_seed = 1 with 
fluid.dygraph.guard(places[0]): fc_net = SimpleFCNet() - optimizer = fluid.optimizer.Adam(parameter_list=fc_net.parameters()) + optimizer = paddle.optimizer.Adam(parameters=fc_net.parameters()) dataset = RandomBatchedDataset(SAMPLE_NUM, CLASS_NUM) dataloader = DataLoader( diff --git a/test/legacy_test/test_multiprocess_dataloader_iterable_dataset_dynamic.py b/test/legacy_test/test_multiprocess_dataloader_iterable_dataset_dynamic.py index 9981fd7b9f461fd83f7c5a8f7b348dde31f3b638..5ee06b58674c3d29293e9b94391516c25faa2aa2 100644 --- a/test/legacy_test/test_multiprocess_dataloader_iterable_dataset_dynamic.py +++ b/test/legacy_test/test_multiprocess_dataloader_iterable_dataset_dynamic.py @@ -81,7 +81,7 @@ class TestDygraphDataLoader(unittest.TestCase): fluid.default_main_program().random_seed = 1 with fluid.dygraph.guard(places[0]): fc_net = SimpleFCNet() - optimizer = fluid.optimizer.Adam(parameter_list=fc_net.parameters()) + optimizer = paddle.optimizer.Adam(parameters=fc_net.parameters()) dataset = RandomDataset(SAMPLE_NUM, CLASS_NUM) dataloader = DataLoader( @@ -150,7 +150,7 @@ class TestDygraphDataLoaderWithBatchedDataset(TestDygraphDataLoader): fluid.default_main_program().random_seed = 1 with fluid.dygraph.guard(places[0]): fc_net = SimpleFCNet() - optimizer = fluid.optimizer.Adam(parameter_list=fc_net.parameters()) + optimizer = paddle.optimizer.Adam(parameters=fc_net.parameters()) dataset = RandomBatchedDataset(SAMPLE_NUM, CLASS_NUM) dataloader = DataLoader( diff --git a/test/legacy_test/test_multiprocess_dataloader_iterable_dataset_static.py b/test/legacy_test/test_multiprocess_dataloader_iterable_dataset_static.py index 8a948d493cc8902e7d2374399298827ee6005460..a2c0ec547fbcb76058ac70a316dee9b51521e042 100644 --- a/test/legacy_test/test_multiprocess_dataloader_iterable_dataset_static.py +++ b/test/legacy_test/test_multiprocess_dataloader_iterable_dataset_static.py @@ -90,7 +90,7 @@ def simple_fc_net_static(): ) ) - optimizer = fluid.optimizer.Adam() + optimizer = paddle.optimizer.Adam() optimizer.minimize(loss) return startup_prog, main_prog, image, label, loss diff --git a/test/legacy_test/test_multiprocess_dataloader_static.py b/test/legacy_test/test_multiprocess_dataloader_static.py index 9cf4912936cdd82fce174f2c7af75cf444597f2c..478aec47f7f2522e851886c526385c05a29072f1 100644 --- a/test/legacy_test/test_multiprocess_dataloader_static.py +++ b/test/legacy_test/test_multiprocess_dataloader_static.py @@ -90,7 +90,7 @@ def simple_fc_net_static(): ) ) - optimizer = fluid.optimizer.Adam() + optimizer = paddle.optimizer.Adam() optimizer.minimize(loss) return startup_prog, main_prog, image, label, loss diff --git a/test/legacy_test/test_nce.py b/test/legacy_test/test_nce.py index 93f54396ba085ac0bb7b2ddd23d7831bd5a019f4..f91abbff552487d9a4b3887b893867e503899425 100644 --- a/test/legacy_test/test_nce.py +++ b/test/legacy_test/test_nce.py @@ -175,7 +175,7 @@ class TestNCECase1SelectedRows(unittest.TestCase): def get_optimizer(self): # SGD optimizer - optimizer = fluid.optimizer.SGD(learning_rate=self.base_lr) + optimizer = paddle.optimizer.SGD(learning_rate=self.base_lr) return optimizer def train_network( diff --git a/test/legacy_test/test_network_with_dtype.py b/test/legacy_test/test_network_with_dtype.py index 7c37c9e52aaf30c930176f113e4cc0b2581863c3..07f38a1e80d9cb820204b83c07346d38101fa147 100644 --- a/test/legacy_test/test_network_with_dtype.py +++ b/test/legacy_test/test_network_with_dtype.py @@ -37,7 +37,7 @@ class TestNetWithDtype(unittest.TestCase): input=y_predict, label=y ) 
avg_cost = paddle.mean(cost) - sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001) + sgd_optimizer = paddle.optimizer.SGD(learning_rate=0.001) sgd_optimizer.minimize(avg_cost) fetch_list = [avg_cost] diff --git a/test/legacy_test/test_optimizer.py b/test/legacy_test/test_optimizer.py index 7a93fe8f33a01c6ff9da02769bde04230edd206e..568d50eab22bde2a413777e41e345ccb41c43258 100644 --- a/test/legacy_test/test_optimizer.py +++ b/test/legacy_test/test_optimizer.py @@ -21,7 +21,7 @@ import numpy as np import paddle from paddle import fluid -from paddle.fluid import core, framework, optimizer +from paddle.fluid import core, framework from paddle.fluid.backward import append_backward from paddle.fluid.framework import ( Program, @@ -62,7 +62,7 @@ class TestOptimizer(unittest.TestCase): block.append_op( type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out} ) - sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.01) + sgd_optimizer = paddle.optimizer.SGD(learning_rate=0.01) opts, _ = sgd_optimizer.minimize(mean_out, init_program) return opts @@ -106,7 +106,7 @@ class TestOptimizerBackwardApplygrad(unittest.TestCase): block.append_op( type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out} ) - sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.01) + sgd_optimizer = paddle.optimizer.SGD(learning_rate=0.01) with framework.program_guard(program, init_program): p_g = sgd_optimizer.backward(mean_out) opts = sgd_optimizer.apply_gradients(p_g) @@ -122,7 +122,7 @@ class TestOptimizerBackwardApplygrad(unittest.TestCase): class TestMomentumOptimizer(unittest.TestCase): - class MockMomentum(optimizer.MomentumOptimizer): + class MockMomentum(paddle.optimizer.Momentum): def get_accumulators(self): return self._accumulators @@ -184,9 +184,9 @@ class TestMomentumOptimizer(unittest.TestCase): init_ops = init_program.global_block().ops self.assertEqual(len(init_ops), 2) self.assertEqual(init_ops[1].type, "fill_constant") - self.assertAlmostEqual(init_ops[1].attr('value'), learning_rate) + self.assertAlmostEqual(init_ops[0].attr('value'), learning_rate) self.assertEqual(init_ops[0].type, "fill_constant") - self.assertAlmostEqual(init_ops[0].attr('value'), 0.0) + self.assertAlmostEqual(init_ops[1].attr('value'), 0.0) def test_nesterov_momentum_optimizer(self): init_program = framework.Program() @@ -243,13 +243,13 @@ class TestMomentumOptimizer(unittest.TestCase): init_ops = init_program.global_block().ops self.assertEqual(len(init_ops), 2) self.assertEqual(init_ops[1].type, "fill_constant") - self.assertAlmostEqual(init_ops[1].attr('value'), learning_rate) + self.assertAlmostEqual(init_ops[0].attr('value'), learning_rate) self.assertEqual(init_ops[0].type, "fill_constant") - self.assertAlmostEqual(init_ops[0].attr('value'), 0.0) + self.assertAlmostEqual(init_ops[1].attr('value'), 0.0) class TestAdamOptimizer(unittest.TestCase): - class MockAdam(optimizer.AdamOptimizer): + class MockAdam(paddle.optimizer.Adam): def get_accumulators(self): return self._accumulators @@ -316,239 +316,7 @@ class TestAdamOptimizer(unittest.TestCase): init_ops = init_program.global_block().ops self.assertEqual(len(init_ops), 5) self.assertEqual(init_ops[-1].type, "fill_constant") - self.assertAlmostEqual(init_ops[-1].attr('value'), learning_rate) - - -class TestDpsgdOptimizer(unittest.TestCase): - def test_dpsgd_optimizer(self): - def check_dpsgd_optimizer(optimizer_attr): - init_program = framework.Program() - program = framework.Program() - block = program.global_block() - mul_x = block.create_parameter( - 
dtype="float32", - shape=[5, 10], - lod_level=0, - name="mul.x", - optimize_attr=optimizer_attr, - ) - mul_y = block.create_var( - dtype="float32", shape=[10, 8], lod_level=0, name="mul.y" - ) - mul_out = block.create_var( - dtype="float32", shape=[5, 8], lod_level=0, name="mul.out" - ) - block.append_op( - type="mul", - inputs={"X": mul_x, "Y": mul_y}, - outputs={"Out": mul_out}, - attrs={"x_num_col_dims": 1}, - ) - mean_out = block.create_var( - dtype="float32", shape=[1], lod_level=0, name="mean.out" - ) - block.append_op( - type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out} - ) - dpsgd_optimizer = optimizer.DpsgdOptimizer( - learning_rate=0.01, clip=100.0, batch_size=16.0, sigma=0.0 - ) - opts, _ = dpsgd_optimizer.minimize(mean_out, init_program) - return opts - - opts = check_dpsgd_optimizer( - { - 'learning_rate': 1.1, - 'clip': 100.0, - 'batch_size': 16.0, - 'sigma': 4.0, - } - ) - self.assertEqual(len(opts), 2) - self.assertEqual([op.type for op in opts], ["scale", "dpsgd"]) - - -class TestDecayedAdagradOptimizer(unittest.TestCase): - class MockDecayedAdagrad(optimizer.DecayedAdagradOptimizer): - def get_accumulators(self): - return self._accumulators - - def get_moment_str(self): - return self._moment_acc_str - - def test_decayed_adagrad_optimizer(self): - init_program = framework.Program() - program = framework.Program() - block = program.global_block() - mul_x = block.create_parameter( - dtype="float32", - shape=[5, 10], - lod_level=0, - name="mul.x", - optimize_attr={'learning_rate': 1.1}, - ) - mul_y = block.create_var( - dtype="float32", shape=[10, 8], lod_level=0, name="mul.y" - ) - mul_out = block.create_var( - dtype="float32", shape=[5, 8], lod_level=0, name="mul.out" - ) - block.append_op( - type="mul", - inputs={"X": mul_x, "Y": mul_y}, - outputs={"Out": mul_out}, - attrs={"x_num_col_dims": 1}, - ) - mean_out = block.create_var( - dtype="float32", shape=[1], lod_level=0, name="mean.out" - ) - block.append_op( - type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out} - ) - learning_rate = 0.01 - decayed_adagrad_optimizer = self.MockDecayedAdagrad( - learning_rate=learning_rate, decay=0.95, epsilon=1.0e-6 - ) - params_grads = append_backward(mean_out) - self.assertEqual(len(params_grads), 1) - self.assertEqual(len(decayed_adagrad_optimizer.get_accumulators()), 0) - with framework.program_guard(program, init_program): - opts = decayed_adagrad_optimizer.apply_gradients(params_grads) - self.assertEqual(len(opts), 2) - self.assertEqual([op.type for op in opts], ["scale", "decayed_adagrad"]) - - # Check accumulators - accumulators = decayed_adagrad_optimizer.get_accumulators() - self.assertEqual(len(accumulators), 1) - self.assertTrue( - decayed_adagrad_optimizer.get_moment_str() in accumulators - ) - moment_acc = accumulators[decayed_adagrad_optimizer.get_moment_str()] - self.assertEqual(len(moment_acc), 1) - self.assertTrue(mul_x.name in moment_acc) - - # Check init_program - init_ops = init_program.global_block().ops - self.assertEqual(len(init_ops), 2) - self.assertEqual(init_ops[1].type, "fill_constant") - self.assertAlmostEqual(init_ops[1].attr('value'), learning_rate) - self.assertEqual(init_ops[0].type, "fill_constant") - self.assertAlmostEqual(init_ops[0].attr('value'), 0.0) - - -class TestFtrlOptimizer(unittest.TestCase): - class MockFtrl(optimizer.FtrlOptimizer): - def get_accumulators(self): - return self._accumulators - - def get_squared_str(self): - return self._squared_acc_str - - def get_linear_str(self): - return self._linear_acc_str - - def 
test_ftrl_optimizer(self): - init_program = framework.Program() - program = framework.Program() - block = program.global_block() - mul_x = block.create_parameter( - dtype="float32", - shape=[5, 10], - lod_level=0, - name="mul.x", - optimize_attr={'learning_rate': 1.1}, - ) - mul_y = block.create_var( - dtype="float32", shape=[10, 8], lod_level=0, name="mul.y" - ) - mul_out = block.create_var( - dtype="float32", shape=[5, 8], lod_level=0, name="mul.out" - ) - block.append_op( - type="mul", - inputs={"X": mul_x, "Y": mul_y}, - outputs={"Out": mul_out}, - attrs={"x_num_col_dims": 1}, - ) - mean_out = block.create_var( - dtype="float32", shape=[1], lod_level=0, name="mean.out" - ) - block.append_op( - type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out} - ) - learning_rate = 0.01 - ftrl_optimizer = self.MockFtrl( - learning_rate=learning_rate, l1=0.0, l2=0.0, lr_power=-0.5 - ) - params_grads = append_backward(mean_out) - self.assertEqual(len(params_grads), 1) - self.assertEqual(len(ftrl_optimizer.get_accumulators()), 0) - with framework.program_guard(program, init_program): - opts = ftrl_optimizer.apply_gradients(params_grads) - self.assertEqual(len(opts), 2) - self.assertEqual([op.type for op in opts], ["scale", "ftrl"]) - - # Check accumulators - accumulators = ftrl_optimizer.get_accumulators() - self.assertEqual(len(accumulators), 2) - self.assertTrue(ftrl_optimizer.get_squared_str() in accumulators) - self.assertTrue(ftrl_optimizer.get_linear_str() in accumulators) - squared_acc = accumulators[ftrl_optimizer.get_squared_str()] - linear_acc = accumulators[ftrl_optimizer.get_linear_str()] - self.assertEqual(len(squared_acc), 1) - self.assertEqual(len(linear_acc), 1) - self.assertTrue(mul_x.name in squared_acc) - self.assertTrue(mul_x.name in linear_acc) - - # Check init_program - init_ops = init_program.global_block().ops - self.assertEqual(len(init_ops), 3) - self.assertEqual(init_ops[-1].type, "fill_constant") - self.assertAlmostEqual(init_ops[-1].attr('value'), learning_rate) - - -class TestLookaheadOptimizer(unittest.TestCase): - def test_lookahead_optimizer(self): - init_program = framework.Program() - program = framework.Program() - block = program.global_block() - init_block = init_program.global_block() - mul_x = block.create_parameter( - dtype="float32", - shape=[5, 10], - lod_level=0, - name="mul.x", - optimize_attr={'learning_rate': 1.1}, - ) - init_mul_x = init_block.create_parameter( - dtype="float32", shape=[5, 10], lod_level=0, name="mul.x" - ) - mul_y = block.create_var( - dtype="float32", shape=[10, 8], lod_level=0, name="mul.y" - ) - mul_out = block.create_var( - dtype="float32", shape=[5, 8], lod_level=0, name="mul.out" - ) - mean_out = block.create_var( - dtype="float32", shape=[1], lod_level=0, name="mean.out" - ) - - block.append_op( - type="mul", - inputs={"X": mul_x, "Y": mul_y}, - outputs={"Out": mul_out}, - attrs={"x_num_col_dims": 1}, - ) - block.append_op( - type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out} - ) - - sgd = optimizer.SGD(learning_rate=0.01) - lookahead = optimizer.LookaheadOptimizer(sgd, alpha=0.5, k=5) - with framework.program_guard(program, init_program): - opts, _ = lookahead.minimize(mean_out) - self.assertEqual(len(opts), 2) - self.assertEqual([op.type for op in opts], ["scale", "sgd"]) + self.assertAlmostEqual(init_ops[0].attr('value'), learning_rate) class TestRecomputeOptimizer(unittest.TestCase): @@ -656,8 +424,10 @@ class TestRecomputeOptimizer(unittest.TestCase): [op.type for op in mean_out.block.ops], ["mul", 
"elementwise_add", "elementwise_add", "mean"], ) - sgd_optimizer = optimizer.SGD(learning_rate=1.0) - recompute_optimizer = optimizer.RecomputeOptimizer(sgd_optimizer) + sgd_optimizer = paddle.optimizer.SGD(learning_rate=1.0) + recompute_optimizer = paddle.incubate.optimizer.RecomputeOptimizer( + sgd_optimizer + ) recompute_optimizer._set_checkpoints([]) opts, params_grads = recompute_optimizer.minimize(mean_out) @@ -687,8 +457,10 @@ class TestRecomputeOptimizer(unittest.TestCase): [op.type for op in mean_out.block.ops], ["mul", "elementwise_add", "elementwise_add", "mean"], ) - sgd_optimizer = optimizer.SGD(learning_rate=1.0) - recompute_optimizer = optimizer.RecomputeOptimizer(sgd_optimizer) + sgd_optimizer = paddle.optimizer.SGD(learning_rate=1.0) + recompute_optimizer = paddle.incubate.optimizer.RecomputeOptimizer( + sgd_optimizer + ) recompute_optimizer._set_checkpoints([b1_out]) opts, params_grads = recompute_optimizer.minimize(mean_out) @@ -719,8 +491,10 @@ class TestRecomputeOptimizer(unittest.TestCase): [op.type for op in mean_out.block.ops], ["mul", "elementwise_add", "elementwise_add", "mean"], ) - sgd_optimizer = optimizer.SGD(learning_rate=1.0) - recompute_optimizer = optimizer.RecomputeOptimizer(sgd_optimizer) + sgd_optimizer = paddle.optimizer.SGD(learning_rate=1.0) + recompute_optimizer = paddle.incubate.optimizer.RecomputeOptimizer( + sgd_optimizer + ) recompute_optimizer._set_checkpoints([b1_out.name]) opts, params_grads = recompute_optimizer.minimize(mean_out) @@ -751,8 +525,10 @@ class TestRecomputeOptimizer(unittest.TestCase): [op.type for op in mean_out.block.ops], ["mul", "elementwise_add", "elementwise_add", "mean"], ) - sgd_optimizer = optimizer.SGD(learning_rate=1.0) - recompute_optimizer = optimizer.RecomputeOptimizer(sgd_optimizer) + sgd_optimizer = paddle.optimizer.SGD(learning_rate=1.0) + recompute_optimizer = paddle.incubate.optimizer.RecomputeOptimizer( + sgd_optimizer + ) recompute_optimizer._set_checkpoints([mul_out, b2_out]) opts, params_grads = recompute_optimizer.minimize(mean_out) @@ -783,8 +559,10 @@ class TestRecomputeOptimizer(unittest.TestCase): [op.type for op in mean_out.block.ops], ["mul", "elementwise_add", "elementwise_add", "mean"], ) - sgd_optimizer = optimizer.SGD(learning_rate=1.0) - recompute_optimizer = optimizer.RecomputeOptimizer(sgd_optimizer) + sgd_optimizer = paddle.optimizer.SGD(learning_rate=1.0) + recompute_optimizer = paddle.incubate.optimizer.RecomputeOptimizer( + sgd_optimizer + ) recompute_optimizer._set_checkpoints([mul_out, b1_out]) opts, params_grads = recompute_optimizer.minimize(mean_out) @@ -814,8 +592,10 @@ class TestRecomputeOptimizer(unittest.TestCase): [op.type for op in mean_out.block.ops], ["mul", "elementwise_add", "elementwise_add", "mean"], ) - sgd_optimizer = optimizer.SGD(learning_rate=1.0) - recompute_optimizer = optimizer.RecomputeOptimizer(sgd_optimizer) + sgd_optimizer = paddle.optimizer.SGD(learning_rate=1.0) + recompute_optimizer = paddle.incubate.optimizer.RecomputeOptimizer( + sgd_optimizer + ) recompute_optimizer._set_checkpoints([b2_out, mul_out]) opts, params_grads = recompute_optimizer.minimize(mean_out) @@ -846,8 +626,10 @@ class TestRecomputeOptimizer(unittest.TestCase): [op.type for op in mean_out.block.ops], ["mul", "elementwise_add", "elementwise_add", "mean"], ) - sgd_optimizer = optimizer.SGD(learning_rate=1.0) - recompute_optimizer = optimizer.RecomputeOptimizer(sgd_optimizer) + sgd_optimizer = paddle.optimizer.SGD(learning_rate=1.0) + recompute_optimizer = 
paddle.incubate.optimizer.RecomputeOptimizer( + sgd_optimizer + ) recompute_optimizer._set_checkpoints([mul_x, b2_out]) opts, params_grads = recompute_optimizer.minimize(mean_out) @@ -874,8 +656,10 @@ class TestRecomputeOptimizer(unittest.TestCase): def test_apply_gradients(self): mul_out, b1_out, b2_out, mean_out = self.net() - sgd_optimizer = optimizer.SGD(learning_rate=1.0) - recompute_optimizer = optimizer.RecomputeOptimizer(sgd_optimizer) + sgd_optimizer = paddle.optimizer.SGD(learning_rate=1.0) + recompute_optimizer = paddle.incubate.optimizer.RecomputeOptimizer( + sgd_optimizer + ) recompute_optimizer._set_checkpoints([b1_out]) # apply backward params_grads = recompute_optimizer.backward( @@ -912,8 +696,10 @@ class TestRecomputeOptimizer(unittest.TestCase): def test_load(self): mul_out, b1_out, b2_out, mean_out = self.net() - sgd_optimizer = optimizer.SGD(learning_rate=1.0) - recompute_optimizer = optimizer.RecomputeOptimizer(sgd_optimizer) + sgd_optimizer = paddle.optimizer.SGD(learning_rate=1.0) + recompute_optimizer = paddle.incubate.optimizer.RecomputeOptimizer( + sgd_optimizer + ) recompute_optimizer._set_checkpoints([b1_out]) try: state_dict = {} @@ -935,8 +721,10 @@ class TestRecomputeOptimizer(unittest.TestCase): [op.type for op in mean_out.block.ops], ["mul", "dropout", "elementwise_add", "elementwise_add", "mean"], ) - sgd_optimizer = optimizer.SGD(learning_rate=1.0) - recompute_optimizer = optimizer.RecomputeOptimizer(sgd_optimizer) + sgd_optimizer = paddle.optimizer.SGD(learning_rate=1.0) + recompute_optimizer = paddle.incubate.optimizer.RecomputeOptimizer( + sgd_optimizer + ) recompute_optimizer._set_checkpoints([b1_out]) opts, params_grads = recompute_optimizer.minimize(mean_out) @@ -980,8 +768,10 @@ class TestRecomputeOptimizer(unittest.TestCase): "mean", ], ) - sgd_optimizer = optimizer.SGD(learning_rate=1.0) - recompute_optimizer = optimizer.RecomputeOptimizer(sgd_optimizer) + sgd_optimizer = paddle.optimizer.SGD(learning_rate=1.0) + recompute_optimizer = paddle.incubate.optimizer.RecomputeOptimizer( + sgd_optimizer + ) recompute_optimizer._set_checkpoints([b1_out]) opts, params_grads = recompute_optimizer.minimize(mean_out) @@ -1050,8 +840,8 @@ class TestRecomputeOptimizer(unittest.TestCase): name="y", shape=[-1, 1], dtype='int64' ) drop_res, prediction, cost = mlp(input_x, input_y) - sgd = fluid.optimizer.Adam(learning_rate=0.01) - sgd = fluid.optimizer.RecomputeOptimizer(sgd) + sgd = paddle.optimizer.Adam(learning_rate=0.01) + sgd = paddle.incubate.optimizer.RecomputeOptimizer(sgd) sgd._set_checkpoints([prediction]) sgd.minimize(cost) @@ -1115,8 +905,8 @@ class TestRecomputeOptimizerCUDA(unittest.TestCase): name="y", shape=[-1, 1], dtype='int64' ) drop_res, prediction, cost = mlp(input_x, input_y) - sgd = fluid.optimizer.Adam(learning_rate=0.01) - sgd = fluid.optimizer.RecomputeOptimizer(sgd) + sgd = paddle.optimizer.Adam(learning_rate=0.01) + sgd = paddle.incubate.optimizer.RecomputeOptimizer(sgd) sgd._set_checkpoints([prediction]) sgd.minimize(cost) @@ -1186,8 +976,8 @@ class TestGradientMergeOptimizer(unittest.TestCase): ["mul", "elementwise_add", "mean"], ) - opt = optimizer.SGD(learning_rate=1.0) - opt = optimizer.GradientMergeOptimizer(opt, k_steps=4) + opt = paddle.optimizer.SGD(learning_rate=1.0) + opt = paddle.incubate.optimizer.GradientMergeOptimizer(opt, k_steps=4) with framework.program_guard(main_program, init_program): ops, params_grads = opt.minimize(cost) diff --git a/test/legacy_test/test_optimizer_grad.py 
b/test/legacy_test/test_optimizer_grad.py index 6be98e05359f7e65b5c7d620d59f18364934cc52..842c5d0766326437a9cf121d57ed10423e663cf3 100644 --- a/test/legacy_test/test_optimizer_grad.py +++ b/test/legacy_test/test_optimizer_grad.py @@ -19,7 +19,6 @@ import numpy as np import paddle from paddle import fluid -from paddle.fluid import optimizer from paddle.fluid.backward import _append_grad_suffix_ paddle.enable_static() @@ -151,7 +150,7 @@ class TestOptimizer(unittest.TestCase): def setUp(self): self._init_config() - self.optimizer = optimizer.SGDOptimizer(learning_rate=0.001) + self.optimizer = paddle.optimizer.SGD(learning_rate=0.001) self.attr = {} def _init_config(self): @@ -256,56 +255,5 @@ class TestSGDOptimizer(TestOptimizer): self._check_grads(use_bf16=True) -class TestAdamOptimizer(TestOptimizer): - """ - inherit TestOptimizer and shall override two functions as follows: - setUp(): to set config info of optimizer, including Optimizer and its hyper-parameter. - _apply_gradient(): to implement the way of updating grad. - """ - - def setUp(self): - self._init_config() - beta1, beta2, epsilon = 0.9, 0.999, 1e-8 - self.optimizer = optimizer.AdamOptimizer( - learning_rate=0.01, beta1=beta1, beta2=beta2, epsilon=epsilon - ) - self.attr = { - "beta1": beta1, - "beta2": beta2, - "beta1_pow": beta1, - "beta2_pow": beta2, - "moment1": np.zeros(SHAPE).astype("float32"), - "moment2": np.zeros(SHAPE).astype("float32"), - "epsilon": epsilon, - } - - def _apply_gradient(self, param, grad, name): - """ - The way of updating grad in AdamOptimizer - """ - attr = self.param_attr[name] - beta1, beta2 = attr["beta1"], attr["beta2"] - moment1, moment2 = attr['moment1'], attr['moment2'] - beta1_pow, beta2_pow = attr['beta1_pow'], attr['beta2_pow'] - epsilon = attr['epsilon'] - - moment1_out = beta1 * moment1 + (1.0 - beta1) * grad - moment2_out = beta2 * moment2 + (1.0 - beta2) * np.square(grad) - - lr = attr['lr'] * np.sqrt(1.0 - beta2_pow) / (1.0 - beta1_pow) - param_out = param - lr * ( - moment1_out - / (np.sqrt(moment2_out) + epsilon * np.sqrt(1 - beta2_pow)) - ) - - # update hyper-parameter of optimizer - self.param_attr[name]['beta1_pow'] = beta1_pow * beta1 - self.param_attr[name]['beta2_pow'] = beta2_pow * beta2 - self.param_attr[name]['moment1'] = moment1_out - self.param_attr[name]['moment2'] = moment2_out - - return param_out - - if __name__ == '__main__': unittest.main() diff --git a/test/legacy_test/test_optimizer_in_control_flow.py b/test/legacy_test/test_optimizer_in_control_flow.py index f9751f7b49444f75916c34d08f8bf915fe8183a3..5e1fca418b7fe780b6e26b6d3c4fca4dfc2735e2 100644 --- a/test/legacy_test/test_optimizer_in_control_flow.py +++ b/test/legacy_test/test_optimizer_in_control_flow.py @@ -18,7 +18,7 @@ import numpy as np import paddle from paddle import fluid -from paddle.fluid import core, optimizer +from paddle.fluid import core from paddle.fluid.framework import Program, program_guard BATCH_SIZE = 1 @@ -92,8 +92,8 @@ def static( label = paddle.static.data('label', [BATCH_SIZE, 1], 'int64') hidden, prediction = double_fc_net(image) - adam = optimizer.Adam(learning_rate=LR) - sgd = optimizer.SGD(learning_rate=LR) + adam = paddle.optimizer.Adam(learning_rate=LR) + sgd = paddle.optimizer.SGD(learning_rate=LR) id = paddle.static.data('id', [1], 'int32') two = paddle.tensor.fill_constant([1], 'int32', 2) @@ -178,11 +178,11 @@ def dynamic(train_data, use_cuda=False, use_parallel_exe=False): fluid.default_startup_program().random_seed = SEED fluid.default_main_program().random_seed = SEED 
dy_layer = DygraphLayer() - adam = fluid.optimizer.Adam( - learning_rate=LR, parameter_list=dy_layer.parameters() + adam = paddle.optimizer.Adam( + learning_rate=LR, parameters=dy_layer.parameters() ) - sgd = fluid.optimizer.SGD( - learning_rate=LR, parameter_list=dy_layer.parameters() + sgd = paddle.optimizer.SGD( + learning_rate=LR, parameters=dy_layer.parameters() ) for epoch in range(EPOCH_NUM): diff --git a/test/legacy_test/test_paddle_fluid_modelaverage.py b/test/legacy_test/test_paddle_fluid_modelaverage.py deleted file mode 100644 index 9b886a3b53f9f1736cf8d9d9064bdd55fe7393a1..0000000000000000000000000000000000000000 --- a/test/legacy_test/test_paddle_fluid_modelaverage.py +++ /dev/null @@ -1,113 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest - -import numpy as np - -import paddle -from paddle import fluid - - -class TestModelAverage(unittest.TestCase): - def test_model_average_static(self): - paddle.enable_static() - place = fluid.CPUPlace() - shape = [2, 3, 8, 8] - exe = fluid.Executor(place) - train_program = fluid.Program() - startup = fluid.Program() - test_program = fluid.Program() - with fluid.program_guard(train_program, startup): - with fluid.unique_name.guard(): - data = paddle.static.data( - name='X', shape=[None, 1], dtype='float32' - ) - hidden = paddle.static.nn.fc(x=data, size=10) - loss = paddle.mean(hidden) - test_program = train_program.clone() - optimizer = paddle.optimizer.Momentum( - learning_rate=0.2, momentum=0.1 - ) - - optimizer.minimize(loss) - # build ModelAverage optimizer - model_average = paddle.fluid.optimizer.ModelAverage( - 0.15, min_average_window=2, max_average_window=10 - ) - - exe.run(startup) - for i in range(10): - x = np.random.random(size=(10, 1)).astype('float32') - ( - latest_b, - sum_1, - sum_2, - sum_3, - num_accumulates, - old_num_accumulates, - num_updates, - ) = exe.run( - program=train_program, - feed={'X': x}, - fetch_list=[ - 'fc_0.b_0', - 'fc_0.b_0_sum_1_0', - 'fc_0.b_0_sum_2_0', - 'fc_0.b_0_sum_3_0', - 'fc_0.b_0_num_accumulates_0', - 'fc_0.b_0_old_num_accumulates_0', - 'fc_0.b_0_num_updates_0', - ], - ) - self.assertTrue( - np.equal(sum_1, np.zeros(shape=[10], dtype='float32')).all() - ) - self.assertTrue( - np.equal(sum_2, np.zeros(shape=[10], dtype='float32')).all() - ) - self.assertTrue( - np.equal(num_accumulates, np.array([0], dtype='int64')).all() - ) - self.assertTrue( - np.equal(old_num_accumulates, np.array([2], dtype='int64')).all() - ) - self.assertTrue( - np.equal(num_updates, np.array([10], dtype='int64')).all() - ) - - average_b = (sum_1 + sum_2 + sum_3) / ( - num_accumulates + old_num_accumulates - ) - # apply ModelAverage - with model_average.apply(exe): - x = np.random.random(size=(10, 1)).astype('float32') - outs, b = exe.run( - program=test_program, - feed={'X': x}, - fetch_list=[loss.name, 'fc_0.b_0'], - ) - self.assertAlmostEqual(np.mean(average_b), np.mean(b)) - - x = np.random.random(size=(10, 1)).astype('float32') - 
outs, b = exe.run( - program=test_program, - feed={'X': x}, - fetch_list=[loss.name, 'fc_0.b_0'], - ) - self.assertAlmostEqual(np.mean(latest_b), np.mean(b)) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/legacy_test/test_paddle_save_load.py b/test/legacy_test/test_paddle_save_load.py index f9d1e0d70a721470ac91767c16ae27ccc1058ee5..be7eb0febeb624ec3106fbf4d3055cabdb8404e1 100644 --- a/test/legacy_test/test_paddle_save_load.py +++ b/test/legacy_test/test_paddle_save_load.py @@ -24,7 +24,7 @@ import paddle import paddle.optimizer as opt from paddle import fluid, nn from paddle.fluid import framework -from paddle.fluid.optimizer import Adam +from paddle.optimizer import Adam from paddle.optimizer.lr import LRScheduler BATCH_SIZE = 16 diff --git a/test/legacy_test/test_pass_builder.py b/test/legacy_test/test_pass_builder.py index 1d97744410edb9e413dbcfdd11d7888bc8bf107c..19912b6df065fad58ca4d53baf26bd7b565845c3 100644 --- a/test/legacy_test/test_pass_builder.py +++ b/test/legacy_test/test_pass_builder.py @@ -21,6 +21,7 @@ import unittest import numpy as np from simple_nets import simple_fc_net +import paddle from paddle import fluid from paddle.fluid import compiler, core @@ -34,7 +35,7 @@ class TestPassBuilder(unittest.TestCase): loss = simple_fc_net() test_program = main.clone(for_test=True) - opt = fluid.optimizer.SGD(learning_rate=0.001) + opt = paddle.optimizer.SGD(learning_rate=0.001) opt.minimize(loss) batch_size = 32 diff --git a/test/legacy_test/test_program.py b/test/legacy_test/test_program.py index 34cbbbf74a4032c040a2ef2efb343bf99d037c4f..a1b5b51a886e35e47d097b4cbb9043fc3e0f98fb 100644 --- a/test/legacy_test/test_program.py +++ b/test/legacy_test/test_program.py @@ -109,7 +109,7 @@ class TestProgram(unittest.TestCase): data = paddle.static.data(name='x', shape=[None, 13], dtype='float32') hidden = paddle.static.nn.fc(x=data, size=10) loss = paddle.mean(hidden) - fluid.optimizer.SGD(learning_rate=0.01).minimize(loss) + paddle.optimizer.SGD(learning_rate=0.01).minimize(loss) # NOTE: here the parameters are fc_0.w_0 and fc_0.b_0 param_list = program.all_parameters() diff --git a/test/legacy_test/test_program_prune_backward.py b/test/legacy_test/test_program_prune_backward.py index 72f739eb67971a9a1353e662060d362dc97e79d3..c304777e64570c2b4ef8062921ed009e16723834 100755 --- a/test/legacy_test/test_program_prune_backward.py +++ b/test/legacy_test/test_program_prune_backward.py @@ -105,7 +105,7 @@ def optimization_in_cond_net(with_optimize=False): opt.minimize(avg_loss) return avg_loss - sgd = fluid.optimizer.SGD(learning_rate=0.1) + sgd = paddle.optimizer.SGD(learning_rate=0.1) two = paddle.tensor.fill_constant([1], 'int32', 2) pred = two == 0 avg_loss = paddle.static.nn.case( @@ -165,9 +165,9 @@ class TestProgramPruneBackward(unittest.TestCase): def test_simple_fc_net(self): def optimizer(): - optimizer = fluid.optimizer.SGD( + optimizer = paddle.optimizer.SGD( learning_rate=0.001, - regularization=paddle.regularizer.L2Decay(1e-4), + weight_decay=paddle.regularizer.L2Decay(1e-4), ) return optimizer @@ -181,9 +181,9 @@ class TestProgramPruneBackward(unittest.TestCase): def test_simple_fc_net_with_accuracy(self): def optimizer(): - optimizer = fluid.optimizer.SGD( + optimizer = paddle.optimizer.SGD( learning_rate=0.001, - regularization=paddle.regularizer.L2Decay(1e-4), + weight_decay=paddle.regularizer.L2Decay(1e-4), ) return optimizer @@ -197,9 +197,9 @@ class TestProgramPruneBackward(unittest.TestCase): def test_batchnorm_fc(self): def optimizer(): - optimizer = 
fluid.optimizer.SGD( + optimizer = paddle.optimizer.SGD( learning_rate=0.001, - regularization=paddle.regularizer.L2Decay(1e-4), + weight_decay=paddle.regularizer.L2Decay(1e-4), ) return optimizer @@ -221,9 +221,9 @@ class TestProgramPruneBackward(unittest.TestCase): def test_transformer(self): def optimizer(): - optimizer = fluid.optimizer.Adam( + optimizer = paddle.optimizer.Adam( learning_rate=0.001, - regularization=paddle.regularizer.L2Decay(1e-4), + weight_decay=paddle.regularizer.L2Decay(1e-4), ) return optimizer @@ -238,7 +238,7 @@ class TestProgramPruneBackward(unittest.TestCase): def test_cond(self): def optimizer(): - optimizer = fluid.optimizer.SGD(learning_rate=0.01) + optimizer = paddle.optimizer.SGD(learning_rate=0.01) return optimizer with self.program_scope_guard(): diff --git a/test/legacy_test/test_prune.py b/test/legacy_test/test_prune.py index abf3482446826831ed0ade1c9082990ecb971bc3..45f40d358901d07f0b1485fc2306ccf68917da3d 100644 --- a/test/legacy_test/test_prune.py +++ b/test/legacy_test/test_prune.py @@ -298,7 +298,7 @@ class TestExecutorRunAutoPrune(unittest.TestCase): with fluid.scope_guard(scope): with fluid.program_guard(program, startup_program): (x, y, label, loss1, loss2, w_param_attrs) = self.net1() - sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.5) + sgd_optimizer = paddle.optimizer.SGD(learning_rate=0.5) sgd_optimizer.minimize(loss1) exe = fluid.Executor(fluid.CPUPlace()) exe.run(startup_program) @@ -329,7 +329,7 @@ class TestExecutorRunAutoPrune(unittest.TestCase): with fluid.scope_guard(scope): with fluid.program_guard(program, startup_program): (x, y, label, loss1, loss2, w_param_attrs) = self.net1() - sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.5) + sgd_optimizer = paddle.optimizer.SGD(learning_rate=0.5) sgd_optimizer.minimize(loss1) exe = fluid.Executor(fluid.CPUPlace()) exe.run(startup_program) @@ -390,7 +390,7 @@ class TestExecutorRunAutoPrune(unittest.TestCase): with fluid.scope_guard(scope): with fluid.program_guard(program, startup_program): (x, y, label, loss1, loss2, w_param_attrs) = self.net1() - sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.5) + sgd_optimizer = paddle.optimizer.SGD(learning_rate=0.5) sgd_optimizer.minimize(loss1) exe = fluid.Executor(fluid.CPUPlace()) exe.run(startup_program) @@ -425,7 +425,7 @@ class TestExecutorRunAutoPrune(unittest.TestCase): with fluid.scope_guard(scope): with fluid.program_guard(program, startup_program): (x, y, label, loss1, loss2, w_param_attrs) = self.net1() - sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.5) + sgd_optimizer = paddle.optimizer.SGD(learning_rate=0.5) sgd_optimizer.minimize(loss1) exe.run(startup_program) x_np = np.random.random(size=(10, 2)).astype('float32') @@ -469,13 +469,9 @@ class TestExecutorRunAutoPrune(unittest.TestCase): w1_param_attrs, w2_param_attrs, ) = self.net2() - adam_optimizer1 = fluid.optimizer.AdamOptimizer( - learning_rate=0.5 - ) + adam_optimizer1 = paddle.optimizer.Adam(learning_rate=0.5) train1 = adam_optimizer1.minimize(loss1) - adam_optimizer2 = fluid.optimizer.AdamOptimizer( - learning_rate=0.5 - ) + adam_optimizer2 = paddle.optimizer.Adam(learning_rate=0.5) train2 = adam_optimizer2.minimize(loss2) exe.run(startup_program) x_np = np.random.random(size=(10, 2)).astype('float32') @@ -531,7 +527,7 @@ class TestExecutorRunAutoPrune(unittest.TestCase): with fluid.scope_guard(scope): with fluid.program_guard(program, startup_program): (x, y, label, loss1, loss2, w_param_attrs) = self.net1() - sgd_optimizer = 
fluid.optimizer.SGD(learning_rate=0.5) + sgd_optimizer = paddle.optimizer.SGD(learning_rate=0.5) sgd_optimizer.minimize(loss1) exe.run(startup_program) x_np = np.random.random(size=(10, 2)).astype('float32') @@ -564,7 +560,7 @@ class TestExecutorRunAutoPrune(unittest.TestCase): with fluid.scope_guard(scope): with fluid.program_guard(program, startup_program): (x, y, label, loss1, loss2, w_param_attrs) = self.net1() - sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.5) + sgd_optimizer = paddle.optimizer.SGD(learning_rate=0.5) train1, _ = sgd_optimizer.minimize(loss1) cloned_program = program.clone() train2, _ = sgd_optimizer.minimize(loss2) @@ -625,7 +621,7 @@ class TestExecutorRunAutoPrune(unittest.TestCase): with fluid.scope_guard(scope): with fluid.program_guard(program, startup_program): (x, y, label, loss1, loss2, w_param_attrs) = self.net1() - sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.5) + sgd_optimizer = paddle.optimizer.SGD(learning_rate=0.5) train1 = sgd_optimizer.minimize(loss1) cloned_program = program.clone() @@ -699,9 +695,9 @@ class TestExecutorRunAutoPrune(unittest.TestCase): ) = self.net2() loss1.persistable = True loss2.persistable = True - sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.5) + sgd_optimizer = paddle.optimizer.SGD(learning_rate=0.5) train1 = sgd_optimizer.minimize(loss1) - sgd_optimizer1 = fluid.optimizer.SGD(learning_rate=0.5) + sgd_optimizer1 = paddle.optimizer.SGD(learning_rate=0.5) train2 = sgd_optimizer1.minimize(loss2) exe = fluid.Executor(fluid.CPUPlace()) exe.run(startup_program) @@ -749,7 +745,7 @@ class TestExecutorRunAutoPrune(unittest.TestCase): with fluid.scope_guard(scope): with fluid.program_guard(program, startup_program): (x, y, label, loss1, loss2, w_param_attrs) = self.net1() - sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.5) + sgd_optimizer = paddle.optimizer.SGD(learning_rate=0.5) train1, _ = sgd_optimizer.minimize(loss1) cloned_program = program.clone() train2, _ = sgd_optimizer.minimize(loss2) diff --git a/test/legacy_test/test_pull_gpups_sparse_op.py b/test/legacy_test/test_pull_gpups_sparse_op.py index 37dd6753cca7a2c238ba758bde945ec86bfb0ef4..054cc9f495dbf5cac508af2792ac69606b88d459 100644 --- a/test/legacy_test/test_pull_gpups_sparse_op.py +++ b/test/legacy_test/test_pull_gpups_sparse_op.py @@ -39,7 +39,7 @@ class TestPullGpupsSparse(unittest.TestCase): slots, size=[11], is_distributed=True, is_sparse=True ) cost = paddle.mean(output) - sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001) + sgd_optimizer = paddle.optimizer.SGD(learning_rate=0.001) sgd_optimizer.minimize(cost, train_program) block = train_program.global_block() place = fluid.CPUPlace() diff --git a/test/legacy_test/test_py_func_op.py b/test/legacy_test/test_py_func_op.py index 64b0b044ef6bf578bd4a2e241a5a9c4d5143c404..2719fff9235d398c8d6469d551df2aff0ef08e39 100644 --- a/test/legacy_test/test_py_func_op.py +++ b/test/legacy_test/test_py_func_op.py @@ -186,7 +186,7 @@ def test_main(use_cuda, use_py_func_op, use_parallel_executor): name='label', shape=[-1, 1], dtype='int64' ) loss = simple_fc_net(img, label, use_py_func_op) - optimizer = fluid.optimizer.SGD(learning_rate=1e-3) + optimizer = paddle.optimizer.SGD(learning_rate=1e-3) optimizer.minimize(loss) place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() diff --git a/test/legacy_test/test_raw_program_optimizer.py b/test/legacy_test/test_raw_program_optimizer.py index 22b9766ef732f1facd7201bcd31d5a5dbb059bfa..906b67d61d8c83b4c5da320b842ea8a956909059 100644 --- 
a/test/legacy_test/test_raw_program_optimizer.py +++ b/test/legacy_test/test_raw_program_optimizer.py @@ -63,7 +63,7 @@ class TestRawProgramOptimizer(unittest.TestCase): cost = self.mlp(input_x=input_x, input_y=input_y) output_name = cost.name optimizer = fleet.distributed_optimizer( - fluid.optimizer.Adam(), strategy + paddle.optimizer.Adam(), strategy ) optimizer.minimize(cost) diff --git a/test/legacy_test/test_regularizer.py b/test/legacy_test/test_regularizer.py index d8add3c3760d1d65e2ad2e0a99aabd0d15f200de..da0fde3558a5a41e3fb96be7ca9f2ac53f88697e 100644 --- a/test/legacy_test/test_regularizer.py +++ b/test/legacy_test/test_regularizer.py @@ -271,7 +271,7 @@ class TestRegularizer(unittest.TestCase): x = paddle.uniform([2, 2, 3]) out = paddle.static.nn.fc(x, 5, weight_attr=fc_param_attr) loss = paddle.sum(out) - sgd = fluid.optimizer.SGD(learning_rate=0.1, regularization=l2) + sgd = paddle.optimizer.SGD(learning_rate=0.1, weight_decay=l2) sgd.minimize(loss) with fluid.dygraph.guard(): input = fluid.dygraph.to_variable( @@ -291,16 +291,16 @@ class TestRegularizer(unittest.TestCase): loss1.backward() # set l2 regularizer in optimizer, but l1 in fluid.ParamAttr - fluid.optimizer.SGD( - parameter_list=linear1.parameters(), + paddle.optimizer.SGD( + parameters=linear1.parameters(), learning_rate=1e-2, - regularization=l2, + weight_decay=l2, ).minimize(loss1) # only set l1 in fluid.ParamAttr loss2 = linear2(input) loss2.backward() - fluid.optimizer.SGD( - parameter_list=linear2.parameters(), learning_rate=1e-2 + paddle.optimizer.SGD( + parameters=linear2.parameters(), learning_rate=1e-2 ).minimize(loss2) # they should both be applied by l1, and keep the same np.testing.assert_allclose( diff --git a/test/legacy_test/test_regularizer_api.py b/test/legacy_test/test_regularizer_api.py index a00dc07022c49769ee8f7c950f76b2e37cc1dcd0..5a30dcb63649ea522033ab44aec07827d5877726 100644 --- a/test/legacy_test/test_regularizer_api.py +++ b/test/legacy_test/test_regularizer_api.py @@ -186,7 +186,7 @@ class TestRegularizer(unittest.TestCase): x = paddle.uniform([2, 2, 3]) out = paddle.static.nn.fc(x, 5, weight_attr=fc_param_attr) loss = paddle.sum(out) - sgd = fluid.optimizer.SGD(learning_rate=0.1, regularization=l2) + sgd = paddle.optimizer.SGD(learning_rate=0.1, weight_decay=l2) sgd.minimize(loss) with fluid.dygraph.guard(): input = fluid.dygraph.to_variable( @@ -206,16 +206,16 @@ class TestRegularizer(unittest.TestCase): loss1.backward() # set l2 regularizer in optimizer, but l1 in fluid.ParamAttr - fluid.optimizer.SGD( - parameter_list=linear1.parameters(), + paddle.optimizer.SGD( + parameters=linear1.parameters(), learning_rate=1e-2, - regularization=l2, + weight_decay=l2, ).minimize(loss1) # only set l1 in fluid.ParamAttr loss2 = linear2(input) loss2.backward() - fluid.optimizer.SGD( - parameter_list=linear2.parameters(), learning_rate=1e-2 + paddle.optimizer.SGD( + parameters=linear2.parameters(), learning_rate=1e-2 ).minimize(loss2) # they should both be applied by l1, and keep the same np.testing.assert_allclose( diff --git a/test/legacy_test/test_rnn_decode_api.py b/test/legacy_test/test_rnn_decode_api.py index 47612c1913b767f2235e69e015e588e8928ae62a..efb7e30592da34a01a540eea72001bfafb7b4781 100644 --- a/test/legacy_test/test_rnn_decode_api.py +++ b/test/legacy_test/test_rnn_decode_api.py @@ -59,7 +59,7 @@ class PolicyGradient: if length is not None else paddle.mean(cost) ) - optimizer = fluid.optimizer.Adam(self.lr) + optimizer = paddle.optimizer.Adam(self.lr) optimizer.minimize(cost) return 
cost @@ -153,7 +153,7 @@ class MLE: loss = loss * mask loss = paddle.mean(loss, axis=[0]) loss = paddle.sum(loss) - optimizer = fluid.optimizer.Adam(self.lr) + optimizer = paddle.optimizer.Adam(self.lr) optimizer.minimize(loss) return loss diff --git a/test/legacy_test/test_sgd_op.py b/test/legacy_test/test_sgd_op.py index 86337e8a0f2f50a0da02013e19bdf4f255876fb7..e6239c8d8f112b69b82e7f8a231aff14a6e20954 100644 --- a/test/legacy_test/test_sgd_op.py +++ b/test/legacy_test/test_sgd_op.py @@ -217,7 +217,7 @@ class TestSGDOpWithLargeInput(unittest.TestCase): cost = paddle.nn.functional.square_error_cost(input=out, label=label) avg_cost = paddle.mean(cost) - sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001) + sgd_optimizer = paddle.optimizer.SGD(learning_rate=0.001) sgd_optimizer.minimize(avg_cost) place = fluid.CPUPlace() @@ -427,117 +427,5 @@ class TestSGDMultiPrecision2_0(unittest.TestCase): ) -class TestSGDMultiPrecision1_0(unittest.TestCase): - def dygraph_sgd_mp(self, mp): - paddle.disable_static() - paddle.seed(10) - paddle.set_device('gpu') - input = paddle.randn((2, 2)) - model = paddle.nn.Linear(2, 2) - optimizer = paddle.fluid.optimizer.SGD( - learning_rate=0.001, - parameter_list=model.parameters(), - multi_precision=mp, - ) - if mp: - model = paddle.amp.decorate(models=model, level='O2') - scaler = paddle.amp.GradScaler(init_loss_scaling=1024) - - for idx in range(5): - if mp: - with paddle.amp.auto_cast(level='O2'): - output = model(input) - loss = paddle.mean(output) - scaled = scaler.scale(loss) - scaled.backward() - scaler.minimize(optimizer, scaled) - optimizer.clear_gradients() - else: - output = model(input) - loss = paddle.mean(output) - optimizer.minimize(loss) - optimizer.clear_gradients() - - return output, model.parameters() - - def static_sgd_mp(self, mp): - paddle.enable_static() - paddle.seed(10) - np.random.seed(10) - exe = paddle.static.Executor('gpu') - train_program = paddle.static.Program() - startup_program = paddle.static.Program() - optimizer = paddle.fluid.optimizer.SGD( - learning_rate=0.001, multi_precision=mp - ) - - if mp: - optimizer = paddle.static.amp.decorate( - optimizer, - init_loss_scaling=128.0, - use_dynamic_loss_scaling=True, - use_pure_fp16=True, - use_fp16_guard=False, - ) - with paddle.static.program_guard(train_program, startup_program): - if mp: - data = paddle.static.data( - shape=[2, 2], name='X', dtype='float16' - ) - else: - data = paddle.static.data( - shape=[2, 2], name='X', dtype='float32' - ) - hidden = paddle.static.nn.fc(x=data, size=10) - loss = paddle.mean(hidden) - optimizer.minimize(loss) - exe.run(startup_program) - - if mp: - optimizer.amp_init( - place=paddle.CUDAPlace(0), scope=paddle.static.global_scope() - ) - x = np.random.random(size=(2, 2)).astype('float16') - else: - x = np.random.random(size=(2, 2)).astype('float32') - out = [] - for idx in range(5): - (loss_data,) = exe.run( - train_program, feed={"X": x}, fetch_list=[loss.name] - ) - out.append(loss_data) - return out - - def test_main(self): - if not paddle.is_compiled_with_cuda(): - return - "Test dygraph mode" - output1_dy, params1_dy = self.dygraph_sgd_mp(mp=True) - output2_dy, params2_dy = self.dygraph_sgd_mp(mp=False) - np.testing.assert_allclose( - output1_dy.astype('float32').numpy(), - output2_dy.astype('float32').numpy(), - rtol=1e-05, - atol=0.1, - ) - for idx in range(len(params1_dy)): - np.testing.assert_allclose( - params1_dy[idx].astype('float32').numpy(), - params2_dy[idx].astype('float32').numpy(), - rtol=1e-05, - atol=0.1, - ) - "Test 
static graph mode" - output1_st = self.static_sgd_mp(mp=True) - output2_st = self.static_sgd_mp(mp=False) - for idx in range(len(output1_st)): - np.testing.assert_allclose( - output1_st[idx].astype('float32'), - output2_st[idx].astype('float32'), - rtol=1e-05, - atol=0.1, - ) - - if __name__ == "__main__": unittest.main() diff --git a/test/legacy_test/test_static_save_load.py b/test/legacy_test/test_static_save_load.py index 655ed28ddc73977d29ded796d3a05633766e2beb..9078de654d606ce20510cb0d66aec94220058ab9 100644 --- a/test/legacy_test/test_static_save_load.py +++ b/test/legacy_test/test_static_save_load.py @@ -25,7 +25,7 @@ from test_imperative_base import new_program_scope import paddle from paddle import fluid from paddle.fluid import core, framework -from paddle.fluid.optimizer import Adam +from paddle.optimizer import Adam paddle.enable_static() diff --git a/test/legacy_test/test_static_save_load_bf16.py b/test/legacy_test/test_static_save_load_bf16.py index b83a3ac6e55d543883f93cd5161d2d3e35da64b6..10b374689d9607cb50341fdbb6d7d4cf38ee0c5e 100644 --- a/test/legacy_test/test_static_save_load_bf16.py +++ b/test/legacy_test/test_static_save_load_bf16.py @@ -23,7 +23,6 @@ from test_static_save_load import PtbModel import paddle from paddle import fluid from paddle.fluid import core, framework -from paddle.fluid.optimizer import SGDOptimizer @unittest.skipIf( @@ -63,7 +62,7 @@ class TestSaveLoadBF16(unittest.TestCase): place = self.set_place() exe = fluid.Executor(place) - sgd = SGDOptimizer(learning_rate=1e-3) + sgd = paddle.optimizer.SGD(learning_rate=1e-3) x = paddle.static.data( name="x", shape=[-1, num_steps], dtype='int64' ) diff --git a/test/legacy_test/test_sync_batch_norm_op.py b/test/legacy_test/test_sync_batch_norm_op.py index bbcbac41b9e8b6441ad58736c709d3fa390d5247..5fa9bce916b527058158b46b9cc49ccc81690163 100644 --- a/test/legacy_test/test_sync_batch_norm_op.py +++ b/test/legacy_test/test_sync_batch_norm_op.py @@ -188,7 +188,7 @@ class TestSyncBatchNormOpTraining(unittest.TestCase): if not sync_bn: out = out / core.get_cuda_device_count() if not only_forward: - sgd_opt = fluid.optimizer.SGD(learning_rate=0.0) + sgd_opt = paddle.optimizer.SGD(learning_rate=0.0) sgd_opt.backward(out) return main, startup, [out, conv, bn] diff --git a/test/legacy_test/test_traced_layer_err_msg.py b/test/legacy_test/test_traced_layer_err_msg.py index a65945b330d1bd190a860397ffe8779617ad08a8..7365f29975cb5f5b9d14c0e09e818b8cf6a052b4 100644 --- a/test/legacy_test/test_traced_layer_err_msg.py +++ b/test/legacy_test/test_traced_layer_err_msg.py @@ -211,8 +211,8 @@ class TestTracedLayerErrMsg(unittest.TestCase): layer = SimpleFCLayer( self.feature_size, self.batch_size, self.fc_size ) - optimizer = fluid.optimizer.SGD( - learning_rate=1e-3, parameter_list=layer.parameters() + optimizer = paddle.optimizer.SGD( + learning_rate=1e-3, parameters=layer.parameters() ) for i in range(5): diff --git a/test/legacy_test/test_trainable.py b/test/legacy_test/test_trainable.py index 9e1818c0d2d71a33c0fbbe4e49169610b7719015..5f6968af51096fa5f8780d35ea1b89052d622339 100644 --- a/test/legacy_test/test_trainable.py +++ b/test/legacy_test/test_trainable.py @@ -36,7 +36,7 @@ def test_trainable(): class TestTrainable(unittest.TestCase): def check_trainable( - self, model, feed_dict, op_count, optimizer=fluid.optimizer.Adam() + self, model, feed_dict, op_count, optimizer=paddle.optimizer.Adam() ): place = fluid.CPUPlace() exe = fluid.Executor(place) @@ -68,7 +68,7 @@ class TestTrainable(unittest.TestCase): 
self.check_trainable( test_trainable, feed_dict, - op_count={'adam': 1, 'scale': 0, 'mul_grad': 0}, + op_count={'adam': 1, 'scale': 0, 'mul_grad': 1}, ) self.check_trainable( test_trainable, diff --git a/test/prim/model/test_bert_cinn.py b/test/prim/model/test_bert_cinn.py index c647f72fdd68ca452ebc27700be799940ad4cacd..9651859a48b49f5c4cd19568e9baa1f9a1894b0f 100644 --- a/test/prim/model/test_bert_cinn.py +++ b/test/prim/model/test_bert_cinn.py @@ -74,7 +74,7 @@ def train(to_static, enable_prim, enable_cinn): bert = Bert(to_static, enable_cinn) criterion = BertPretrainingCriterion() - optimizer = fluid.optimizer.Adam(parameter_list=bert.parameters()) + optimizer = paddle.optimizer.Adam(parameters=bert.parameters()) losses = [] for step, batch in enumerate(train_data_loader): diff --git a/test/prim/model/test_bert_prim.py b/test/prim/model/test_bert_prim.py index 4044620f7e8fc11413dee98025a1decbaf03d4a0..2c55eeba8c9ffc730ad9bdcfa034a45ba50b7aae 100644 --- a/test/prim/model/test_bert_prim.py +++ b/test/prim/model/test_bert_prim.py @@ -73,7 +73,7 @@ def train(to_static, enable_prim, enable_cinn): bert = Bert(to_static, enable_cinn) criterion = BertPretrainingCriterion() - optimizer = fluid.optimizer.Adam(parameter_list=bert.parameters()) + optimizer = paddle.optimizer.Adam(parameters=bert.parameters()) losses = [] for step, batch in enumerate(train_data_loader): diff --git a/test/prim/model/test_bert_prim_cinn.py b/test/prim/model/test_bert_prim_cinn.py index ea475730d1d151f43a2d856118cb327e616bee6b..3a8634fbdc0994674c2a24bbf09fe307268f4c18 100644 --- a/test/prim/model/test_bert_prim_cinn.py +++ b/test/prim/model/test_bert_prim_cinn.py @@ -74,7 +74,7 @@ def train(to_static, enable_prim, enable_cinn): bert = Bert(to_static, enable_cinn) criterion = BertPretrainingCriterion() - optimizer = fluid.optimizer.Adam(parameter_list=bert.parameters()) + optimizer = paddle.optimizer.Adam(parameters=bert.parameters()) losses = [] for step, batch in enumerate(train_data_loader): diff --git a/test/prim/model/test_resnet_cinn.py b/test/prim/model/test_resnet_cinn.py index 31877e78632ebcb7b0085ebb56266aeafc1f8999..41636462dda9e4b6de80c00e5f9f6672b50497c1 100644 --- a/test/prim/model/test_resnet_cinn.py +++ b/test/prim/model/test_resnet_cinn.py @@ -42,19 +42,19 @@ epoch_num = 1 # 8.438933372497559, # 10.305074691772461, - +# note: Version 2.0 momentum is fused to OP when L2Decay is available, and the results are different from the fluid version. 
# The results in ci as as follows: DY2ST_CINN_GT = [ 5.828789710998535, 8.340764999389648, - 4.998944282531738, - 8.474305152893066, - 8.09157943725586, - 7.440057754516602, - 9.907357215881348, - 8.304681777954102, - 8.383116722106934, - 10.120304107666016, + 4.998946666717529, + 8.4613676071167, + 8.033733367919922, + 7.352842807769775, + 9.8336181640625, + 8.22379207611084, + 8.195695877075195, + 10.508796691894531, ] if core.is_compiled_with_cuda(): @@ -93,11 +93,11 @@ class TransedFlowerDataSet(paddle.io.Dataset): def optimizer_setting(parameter_list=None): - optimizer = fluid.optimizer.Momentum( + optimizer = paddle.optimizer.Momentum( learning_rate=base_lr, momentum=momentum_rate, - regularization=paddle.regularizer.L2Decay(l2_decay), - parameter_list=parameter_list, + weight_decay=paddle.regularizer.L2Decay(l2_decay), + parameters=parameter_list, ) return optimizer diff --git a/test/prim/model/test_resnet_prim.py b/test/prim/model/test_resnet_prim.py index 8406bbd298f988a1378fe28d6a22bea20d610884..6ede8ca3adb050b5fb4bc6124c10138a2946ad28 100644 --- a/test/prim/model/test_resnet_prim.py +++ b/test/prim/model/test_resnet_prim.py @@ -43,18 +43,19 @@ epoch_num = 1 # 10.256929397583008, # ] +# note: Version 2.0 momentum is fused to OP when L2Decay is available, and the results are different from the fluid version. # The results in ci as as follows: DY2ST_PRIM_GT = [ 5.82879114151001, 8.33370590209961, - 5.091761589050293, - 8.776082992553711, - 8.274380683898926, - 7.546653747558594, - 9.607137680053711, - 8.27371597290039, - 8.429732322692871, - 10.362630844116211, + 5.104889392852783, + 8.546337127685547, + 8.263965606689453, + 7.413934230804443, + 9.569124221801758, + 8.251557350158691, + 8.513609886169434, + 10.603094100952148, ] if core.is_compiled_with_cuda(): @@ -93,11 +94,11 @@ class TransedFlowerDataSet(paddle.io.Dataset): def optimizer_setting(parameter_list=None): - optimizer = fluid.optimizer.Momentum( + optimizer = paddle.optimizer.Momentum( learning_rate=base_lr, momentum=momentum_rate, - regularization=paddle.regularizer.L2Decay(l2_decay), - parameter_list=parameter_list, + weight_decay=paddle.regularizer.L2Decay(l2_decay), + parameters=parameter_list, ) return optimizer diff --git a/test/prim/model/test_resnet_prim_cinn.py b/test/prim/model/test_resnet_prim_cinn.py index 5e20a983cc58b8da81e06521b35c18fda726685a..9804d4dd6b0b8c38915fc31f7aa803da830edae0 100644 --- a/test/prim/model/test_resnet_prim_cinn.py +++ b/test/prim/model/test_resnet_prim_cinn.py @@ -43,18 +43,19 @@ epoch_num = 1 # 9.919631958007812, # ] +# note: Version 2.0 momentum is fused to OP when L2Decay is available, and the results are different from the fluid version. 
# The results in ci as as follows: DY2ST_PRIM_CINN_GT = [ 5.828786849975586, 8.332863807678223, - 5.0373005867004395, - 8.464998245239258, - 8.20099925994873, - 7.576723098754883, - 9.679173469543457, - 8.381753921508789, - 8.10612678527832, - 10.124727249145508, + 5.041562080383301, + 8.514982223510742, + 7.9860992431640625, + 7.491837501525879, + 9.559739112854004, + 8.430597305297852, + 8.109201431274414, + 10.224763870239258, ] if core.is_compiled_with_cuda(): paddle.set_flags({'FLAGS_cudnn_deterministic': True}) @@ -92,11 +93,11 @@ class TransedFlowerDataSet(paddle.io.Dataset): def optimizer_setting(parameter_list=None): - optimizer = fluid.optimizer.Momentum( + optimizer = paddle.optimizer.Momentum( learning_rate=base_lr, momentum=momentum_rate, - regularization=paddle.regularizer.L2Decay(l2_decay), - parameter_list=parameter_list, + weight_decay=paddle.regularizer.L2Decay(l2_decay), + parameters=parameter_list, ) return optimizer diff --git a/test/xpu/test_sgd_op_xpu.py b/test/xpu/test_sgd_op_xpu.py index 6b9efdb8ab2ca91b2fdb5135a0b89045cb657d40..7720b992060bcb2b7c8dfdc9faca5ee4bd0d7e20 100644 --- a/test/xpu/test_sgd_op_xpu.py +++ b/test/xpu/test_sgd_op_xpu.py @@ -76,7 +76,7 @@ class TestSGDOpWithLargeInput(unittest.TestCase): cost = paddle.nn.functional.square_error_cost(input=out, label=label) avg_cost = paddle.mean(cost) - sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001) + sgd_optimizer = paddle.optimizer.SGD(learning_rate=0.001) sgd_optimizer.minimize(avg_cost) place = paddle.XPUPlace(0)
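For reference, the argument renames applied throughout these hunks follow one pattern: parameter_list becomes parameters and regularization becomes weight_decay on the paddle.optimizer classes. A minimal dygraph sketch of that pattern, mirroring the optimizer_setting helpers above (the layer shape, learning rate, momentum, and decay coefficient here are illustrative, not taken from any test in this patch):

import paddle

linear = paddle.nn.Linear(10, 1)
opt = paddle.optimizer.Momentum(
    learning_rate=0.1,
    momentum=0.9,
    parameters=linear.parameters(),                 # 1.x API: parameter_list=...
    weight_decay=paddle.regularizer.L2Decay(1e-4),  # 1.x API: regularization=...
)

x = paddle.randn([4, 10])
loss = paddle.mean(linear(x))
loss.backward()
opt.step()          # opt.minimize(loss) also works, as the dygraph tests above use
opt.clear_grad()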
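Similarly, the recompute and gradient-merge wrappers move from paddle.fluid.optimizer to paddle.incubate.optimizer. A minimal static-graph sketch of the wrapping pattern used in the test_optimizer.py hunks above (the network, sizes, and checkpoint choice below are assumptions for illustration):

import paddle

paddle.enable_static()
main = paddle.static.Program()
startup = paddle.static.Program()
with paddle.static.program_guard(main, startup):
    x = paddle.static.data(name='x', shape=[None, 16], dtype='float32')
    hidden = paddle.static.nn.fc(x, size=8)
    loss = paddle.mean(hidden)
    sgd = paddle.optimizer.SGD(learning_rate=1.0)
    recompute = paddle.incubate.optimizer.RecomputeOptimizer(sgd)
    recompute._set_checkpoints([hidden])   # checkpoint chosen only for illustration
    recompute.minimize(loss)

The same wrapping applies to paddle.incubate.optimizer.GradientMergeOptimizer(opt, k_steps=4), as in the last test_optimizer.py hunk.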