diff --git a/benchmark/fluid/args.py b/benchmark/fluid/args.py
index 0d5c9652de6b814627e54018366137e214726619..9540900b112f54594bbfdbc8d7cd3b6e1f5269dd 100644
--- a/benchmark/fluid/args.py
+++ b/benchmark/fluid/args.py
@@ -136,10 +136,6 @@ def parse_args():
         '--no_random',
         action='store_true',
         help='If set, keep the random seed and do not shuffle the data.')
-    parser.add_argument(
-        '--use_lars',
-        action='store_true',
-        help='If set, use lars for optimizers, ONLY support resnet module.')
     parser.add_argument(
         '--reduce_strategy',
         type=str,
diff --git a/benchmark/fluid/models/resnet.py b/benchmark/fluid/models/resnet.py
index 1b3bfe659c7d97b58dc4121387d4db22266381c5..f692e7722a1c9a54a4509ce7c78cc68e1f28da74 100644
--- a/benchmark/fluid/models/resnet.py
+++ b/benchmark/fluid/models/resnet.py
@@ -200,11 +200,6 @@ def get_model(args, is_train, main_prog, startup_prog):
     # configure optimize
     optimizer = None
     if is_train:
-        if args.use_lars:
-            lars_decay = 1.0
-        else:
-            lars_decay = 0.0
-
         total_images = 1281167 / trainer_count
 
         step = int(total_images / (args.batch_size * args.gpus) + 1)
diff --git a/benchmark/fluid/models/resnet_with_preprocess.py b/benchmark/fluid/models/resnet_with_preprocess.py
index e8d661d847516a15e4e28796960815935b82ae6f..e996c9a704531757891354c7c75a9d7915195ee0 100644
--- a/benchmark/fluid/models/resnet_with_preprocess.py
+++ b/benchmark/fluid/models/resnet_with_preprocess.py
@@ -224,11 +224,6 @@ def get_model(args, is_train, main_prog, startup_prog):
     # configure optimize
     optimizer = None
     if is_train:
-        if args.use_lars:
-            lars_decay = 1.0
-        else:
-            lars_decay = 0.0
-
         total_images = 1281167 / trainer_count
 
         step = int(total_images / args.batch_size + 1)
diff --git a/benchmark/fluid/models/se_resnext.py b/benchmark/fluid/models/se_resnext.py
index 9f887fb324dc86a30b708b9ef04068282a3e6c3e..7fbb83c2ec1bab29731ae4e432dda202007b2e2c 100644
--- a/benchmark/fluid/models/se_resnext.py
+++ b/benchmark/fluid/models/se_resnext.py
@@ -244,11 +244,6 @@ def get_model(args, is_train, main_prog, startup_prog):
 
     optimizer = None
     if is_train:
-        if args.use_lars:
-            lars_decay = 1.0
-        else:
-            lars_decay = 0.0
-
         total_images = 1281167 / trainer_count
 
         step = int(total_images / args.batch_size + 1)
@@ -262,8 +257,7 @@
             learning_rate=fluid.layers.piecewise_decay(
                 boundaries=bd, values=lr),
             momentum=0.9,
-            regularization=fluid.regularizer.L2Decay(1e-4),
-            LARS_weight_decay=lars_decay)
+            regularization=fluid.regularizer.L2Decay(1e-4))
         optimizer.minimize(avg_cost)
 
     if args.memory_optimize:
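Note: with `--use_lars` removed, the benchmark models no longer compute a `lars_decay` value and build their optimizer with no LARS argument at all. A minimal sketch of the resulting configuration, mirroring the se_resnext hunk above (`bd`, `lr`, and `avg_cost` stand for values computed earlier in `get_model`):

    import paddle.fluid as fluid

    optimizer = fluid.optimizer.Momentum(
        learning_rate=fluid.layers.piecewise_decay(boundaries=bd, values=lr),
        momentum=0.9,
        regularization=fluid.regularizer.L2Decay(1e-4))
    optimizer.minimize(avg_cost)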
diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index 0f1127f1fafea73da0e58ac4a090a70d479594b5..89331ad15617d6f9daa6de49afc461d1c7749603 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -351,25 +351,25 @@ paddle.fluid.nets.sequence_conv_pool ArgSpec(args=['input', 'num_filters', 'filt
 paddle.fluid.nets.glu ArgSpec(args=['input', 'dim'], varargs=None, keywords=None, defaults=(-1,))
 paddle.fluid.nets.scaled_dot_product_attention ArgSpec(args=['queries', 'keys', 'values', 'num_heads', 'dropout_rate'], varargs=None, keywords=None, defaults=(1, 0.0))
 paddle.fluid.nets.img_conv_group ArgSpec(args=['input', 'conv_num_filter', 'pool_size', 'conv_padding', 'conv_filter_size', 'conv_act', 'param_attr', 'conv_with_batchnorm', 'conv_batchnorm_drop_rate', 'pool_stride', 'pool_type', 'use_cudnn', 'use_mkldnn'], varargs=None, keywords=None, defaults=(1, 3, None, None, False, 0.0, 1, 'max', True, False))
-paddle.fluid.optimizer.SGDOptimizer.__init__ ArgSpec(args=['self', 'learning_rate'], varargs=None, keywords='kwargs', defaults=None)
+paddle.fluid.optimizer.SGDOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'regularization', 'name'], varargs=None, keywords=None, defaults=(None, None))
 paddle.fluid.optimizer.SGDOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
-paddle.fluid.optimizer.MomentumOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'momentum', 'use_nesterov'], varargs=None, keywords='kwargs', defaults=(False,))
+paddle.fluid.optimizer.MomentumOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'momentum', 'use_nesterov', 'regularization', 'name'], varargs=None, keywords=None, defaults=(False, None, None))
 paddle.fluid.optimizer.MomentumOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
-paddle.fluid.optimizer.AdagradOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'epsilon'], varargs=None, keywords='kwargs', defaults=(1e-06,))
+paddle.fluid.optimizer.AdagradOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(1e-06, None, None))
 paddle.fluid.optimizer.AdagradOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
-paddle.fluid.optimizer.AdamOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon'], varargs=None, keywords='kwargs', defaults=(0.001, 0.9, 0.999, 1e-08))
+paddle.fluid.optimizer.AdamOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None))
 paddle.fluid.optimizer.AdamOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
-paddle.fluid.optimizer.AdamaxOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon'], varargs=None, keywords='kwargs', defaults=(0.001, 0.9, 0.999, 1e-08))
+paddle.fluid.optimizer.AdamaxOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None))
 paddle.fluid.optimizer.AdamaxOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
-paddle.fluid.optimizer.DecayedAdagradOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'decay', 'epsilon'], varargs=None, keywords='kwargs', defaults=(0.95, 1e-06))
+paddle.fluid.optimizer.DecayedAdagradOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'decay', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.95, 1e-06, None, None))
 paddle.fluid.optimizer.DecayedAdagradOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
-paddle.fluid.optimizer.FtrlOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'l1', 'l2', 'lr_power'], varargs=None, keywords='kwargs', defaults=(0.0, 0.0, -0.5))
+paddle.fluid.optimizer.FtrlOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'l1', 'l2', 'lr_power', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.0, 0.0, -0.5, None, None))
 paddle.fluid.optimizer.FtrlOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
-paddle.fluid.optimizer.RMSPropOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'rho', 'epsilon', 'momentum', 'centered'], varargs=None, keywords='kwargs', defaults=(0.95, 1e-06, 0.0, False))
+paddle.fluid.optimizer.RMSPropOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'rho', 'epsilon', 'momentum', 'centered', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.95, 1e-06, 0.0, False, None, None))
 paddle.fluid.optimizer.RMSPropOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
-paddle.fluid.optimizer.AdadeltaOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'epsilon', 'rho'], varargs=None, keywords='kwargs', defaults=(1e-06, 0.95))
+paddle.fluid.optimizer.AdadeltaOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'epsilon', 'rho', 'regularization', 'name'], varargs=None, keywords=None, defaults=(1e-06, 0.95, None, None))
 paddle.fluid.optimizer.AdadeltaOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
-paddle.fluid.optimizer.ModelAverage.__init__ ArgSpec(args=['self', 'average_window_rate', 'min_average_window', 'max_average_window'], varargs=None, keywords='kwargs', defaults=(10000, 10000))
+paddle.fluid.optimizer.ModelAverage.__init__ ArgSpec(args=['self', 'average_window_rate', 'min_average_window', 'max_average_window', 'regularization', 'name'], varargs=None, keywords=None, defaults=(10000, 10000, None, None))
 paddle.fluid.optimizer.ModelAverage.apply ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
 paddle.fluid.optimizer.ModelAverage.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.optimizer.ModelAverage.restore ArgSpec(args=['self', 'executor'], varargs=None, keywords=None, defaults=None)
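Note: every API.spec entry above records the Python signature of a public API, and the change from `keywords='kwargs'` to explicit `regularization` and `name` arguments is exactly what the optimizer.py diff below implements. Assuming the spec is produced from `inspect.getargspec` (which the `ArgSpec` notation suggests, on the Python 2 toolchain this codebase targets), an updated entry can be sanity-checked like this:

    import inspect
    import paddle.fluid as fluid

    spec = inspect.getargspec(fluid.optimizer.AdamOptimizer.__init__)
    # Matches the updated spec line: no **kwargs catch-all, and
    # regularization/name are ordinary keyword arguments.
    assert spec.keywords is None
    assert spec.args[-2:] == ['regularization', 'name']
    assert spec.defaults == (0.001, 0.9, 0.999, 1e-08, None, None)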
diff --git a/paddle/fluid/operators/adam_op.h b/paddle/fluid/operators/adam_op.h
index 84a584f424823a450effd4c36e9da600f5851da2..5b27068c9e805146b8bce03f4f676ef0d4d16c53 100644
--- a/paddle/fluid/operators/adam_op.h
+++ b/paddle/fluid/operators/adam_op.h
@@ -174,12 +174,13 @@ struct SparseAdamFunctor {
 
   const int64_t* rows_;
   int64_t row_numel_;
+  int64_t row_count_;
 
   SparseAdamFunctor(T beta1, T beta2, T epsilon, const T* beta1_pow,
                     const T* beta2_pow, const T* mom1, T* mom1_out,
                     const T* mom2, T* mom2_out, const T* lr, const T* grad,
                     const T* param, T* param_out, const int64_t* rows,
-                    int64_t row_numel)
+                    int64_t row_numel, int64_t row_count)
       : beta1_(beta1),
         beta2_(beta2),
         epsilon_(epsilon),
@@ -194,28 +195,47 @@ struct SparseAdamFunctor {
         param_(param),
         param_out_(param_out),
         rows_(rows),
-        row_numel_(row_numel) {}
+        row_numel_(row_numel),
+        row_count_(row_count) {}
+
+  inline HOSTDEVICE int64_t BinarySearchInRows(int64_t row) const {
+    int64_t beg = 0, end = row_count_ - 1;
+    while (beg <= end) {
+      auto mid = ((beg + end) >> 1);
+      if (rows_[mid] == row)
+        return mid;
+      else if (rows_[mid] < row)
+        beg = mid + 1;
+      else
+        end = mid - 1;
+    }
+    return -1;
+  }
 
   inline HOSTDEVICE void operator()(size_t i) const {
+    int64_t row = i / row_numel_;
+    auto row_idx = BinarySearchInRows(row);
+    T g = row_idx >= 0 ? grad_[row_idx * row_numel_ + i % row_numel_] : 0;
+
+    // The following code is the same as dense
+    T mom1 = moment1_[i];
+    T mom2 = moment2_[i];
+    T lr = *lr_;
     T beta1_pow = *beta1_pow_;
     T beta2_pow = *beta2_pow_;
-    for (int64_t j = 0; j < row_numel_; ++j) {
-      T g = grad_[i * row_numel_ + j];
-      T mom1 = moment1_[rows_[i] * row_numel_ + j];
-      T mom2 = moment2_[rows_[i] * row_numel_ + j];
-      T lr = *lr_;
-      T p = param_[rows_[i] * row_numel_ + j];
-
-      lr *= sqrt(1 - beta2_pow) / (1 - beta1_pow);
-
-      mom1 = beta1_ * mom1 + (1 - beta1_) * g;
-      mom2 = beta2_ * mom2 + (1 - beta2_) * g * g;
-      p -= lr * (mom1 / (sqrt(mom2) + epsilon_));
-
-      moment1_out_[rows_[i] * row_numel_ + j] = mom1;
-      moment2_out_[rows_[i] * row_numel_ + j] = mom2;
-      param_out_[rows_[i] * row_numel_ + j] = p;
-    }  // for col id
+    T p = param_[i];
+
+    // Calculation
+    lr *= sqrt(1 - beta2_pow) / (1 - beta1_pow);
+
+    mom1 = beta1_ * mom1 + (1 - beta1_) * g;
+    mom2 = beta2_ * mom2 + (1 - beta2_) * g * g;
+    p -= lr * (mom1 / (sqrt(mom2) + epsilon_));
+
+    // Write back to global memory
+    moment1_out_[i] = mom1;
+    moment2_out_[i] = mom2;
+    param_out_[i] = p;
   }
 };
 
@@ -287,9 +307,14 @@ class AdamOpKernel : public framework::OpKernel<T> {
         return;
       }
       // merge duplicated rows if any.
+      // The rows of grad_merge have been sorted inside MergeAdd functor
       scatter::MergeAdd<DeviceContext, T> merge_func;
-      auto grad_merge =
-          merge_func(ctx.template device_context<DeviceContext>(), grad);
+      auto& grad_merge = *(ctx.scope()
+                               .NewScope()
+                               .Var("sparse_adam_grad_merge")
+                               ->GetMutable<framework::SelectedRows>());
+      merge_func(ctx.template device_context<DeviceContext>(), grad,
+                 &grad_merge);
       auto& grad_tensor = grad_merge.value();
       const T* grad_data = grad_tensor.template data<T>();
       int64_t* rows = nullptr;
@@ -314,10 +339,11 @@
           mom2.template data<T>(),
           mom2_out.template mutable_data<T>(ctx.GetPlace()),
           lr.template data<T>(), grad_data, param.template data<T>(),
-          param_out.template mutable_data<T>(ctx.GetPlace()), rows, row_numel);
+          param_out.template mutable_data<T>(ctx.GetPlace()), rows, row_numel,
+          grad_merge.rows().size());
       platform::ForRange<DeviceContext> for_range(
           static_cast<const DeviceContext&>(ctx.device_context()),
-          grad_merge.rows().size());
+          param.numel());
       for_range(functor);
     } else {
       PADDLE_THROW("Variable type not supported by adam_op");
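Note: the rewritten SparseAdamFunctor is launched once per parameter element (`param.numel()`) instead of once per merged gradient row. Each element derives its row index, binary-searches the sorted merged rows, and treats a missing row as a zero gradient, so rows untouched by the sparse gradient still have their moments decayed. A Python sketch of that indexing logic (names are illustrative, not part of the patch):

    import bisect

    def sparse_grad_at(i, rows, grad, row_numel):
        # rows must be ascending; MergeAdd guarantees this after the change
        row, col = divmod(i, row_numel)
        j = bisect.bisect_left(rows, row)
        if j < len(rows) and rows[j] == row:
            return grad[j * row_numel + col]
        return 0.0  # row absent from the sparse gradient: g = 0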
diff --git a/paddle/fluid/operators/clip_op.h b/paddle/fluid/operators/clip_op.h
index 85607a6b0e4239b063ba75888e6859a5d85eafc9..daf06f370ffb591e25ad846b94c8284aad19a8dd 100644
--- a/paddle/fluid/operators/clip_op.h
+++ b/paddle/fluid/operators/clip_op.h
@@ -16,6 +16,7 @@ limitations under the License. */
 
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/selected_rows_functor.h"
 #include "paddle/fluid/platform/transform.h"
 
 namespace paddle {
@@ -61,14 +62,32 @@ class ClipKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext& context) const override {
     auto max = context.Attr<T>("max");
     auto min = context.Attr<T>("min");
-    auto* x = context.Input<Tensor>("X");
-    auto* out = context.Output<Tensor>("Out");
-    T* out_data = out->mutable_data<T>(context.GetPlace());
-    const T* x_data = x->data<T>();
-    int64_t numel = x->numel();
-    Transform<DeviceContext> trans;
-    trans(context.template device_context<DeviceContext>(), x_data,
-          x_data + numel, out_data, ClipFunctor<T>(min, max));
+    auto* x_var = context.InputVar("X");
+    if (x_var->IsType<framework::LoDTensor>()) {
+      auto* x = context.Input<framework::LoDTensor>("X");
+      auto* out = context.Output<framework::LoDTensor>("Out");
+      T* out_data = out->mutable_data<T>(context.GetPlace());
+      const T* x_data = x->data<T>();
+      int64_t numel = x->numel();
+      Transform<DeviceContext> trans;
+      trans(context.template device_context<DeviceContext>(), x_data,
+            x_data + numel, out_data, ClipFunctor<T>(min, max));
+    } else if (x_var->IsType<framework::SelectedRows>()) {
+      auto* x = context.Input<framework::SelectedRows>("X");
+      auto* out = context.Output<framework::SelectedRows>("Out");
+      PADDLE_ENFORCE_NE(x, out,
+                        "Inplace clip is not allowed when x is SelectedRows");
+      math::scatter::MergeAdd<DeviceContext, T> merge_func;
+      merge_func(context.template device_context<DeviceContext>(), *x, out);
+      auto* out_tensor = out->mutable_value();
+      auto* out_data = out_tensor->data<T>();
+      int64_t numel = out_tensor->numel();
+      Transform<DeviceContext> trans;
+      trans(context.template device_context<DeviceContext>(), out_data,
+            out_data + numel, out_data, ClipFunctor<T>(min, max));
+    } else {
+      PADDLE_THROW("ClipOp only supports LoDTensor and SelectedRows");
+    }
   }
 };
 
@@ -78,10 +97,12 @@ class ClipGradKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext& context) const override {
     auto max = context.Attr<T>("max");
     auto min = context.Attr<T>("min");
-    auto* d_out = context.Input<Tensor>(framework::GradVarName("Out"));
-    auto* d_x = context.Output<Tensor>(framework::GradVarName("X"));
+    auto* d_out =
+        context.Input<framework::LoDTensor>(framework::GradVarName("Out"));
+    auto* d_x =
+        context.Output<framework::LoDTensor>(framework::GradVarName("X"));
     if (d_x != nullptr) {
-      auto* x = context.Input<Tensor>("X");
+      auto* x = context.Input<framework::LoDTensor>("X");
       int64_t numel = d_out->numel();
       auto* d_x_data = d_x->mutable_data<T>(context.GetPlace());
       const T* d_out_data = d_out->data<T>();
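Note: for a SelectedRows input, the clip kernel first merges duplicate rows into the output and only then clips the merged values; `x == out` is rejected because merging rewrites the row index, so true in-place clipping is not well defined. A rough Python model of the semantics (a hypothetical helper, not the C++ API):

    def clip_selected_rows(rows, values, min_val, max_val):
        # MergeAdd step: sum value rows that share a row index
        merged = {}
        for r, v in zip(rows, values):
            acc = merged.setdefault(r, [0.0] * len(v))
            for j, e in enumerate(v):
                acc[j] += e
        # Clip step: applied elementwise to the merged buffer
        out_rows = sorted(merged)
        out_values = [[min(max(e, min_val), max_val) for e in merged[r]]
                      for r in out_rows]
        return out_rows, out_values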
diff --git a/paddle/fluid/operators/math/selected_rows_functor.cc b/paddle/fluid/operators/math/selected_rows_functor.cc
index a830dc5250a6aea7e622da4046b512d0c7c5d6f9..8e8baf49b2330e95ff1a868b0b0a03bc10d84484 100644
--- a/paddle/fluid/operators/math/selected_rows_functor.cc
+++ b/paddle/fluid/operators/math/selected_rows_functor.cc
@@ -199,6 +199,14 @@ struct MergeAdd<platform::CPUDeviceContext, T> {
   framework::SelectedRows operator()(const platform::CPUDeviceContext& context,
                                      const framework::SelectedRows& input) {
     framework::SelectedRows out;
+    (*this)(context, input, &out);
+    return out;
+  }
+
+  void operator()(const platform::CPUDeviceContext& context,
+                  const framework::SelectedRows& input,
+                  framework::SelectedRows* output) {
+    framework::SelectedRows& out = *output;
     auto input_rows = input.rows();
     std::set<int64_t> row_set(input_rows.begin(), input_rows.end());
     std::vector<int64_t> merge_rows(row_set.begin(), row_set.end());
@@ -223,7 +231,6 @@
         out_data[out_i * input_width + j] += input_data[i * input_width + j];
       }
     }
-    return out;
   }
 };
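Note: the CPU MergeAdd gains an output-parameter overload, and the legacy value-returning overload now delegates to it. Because the unique row set is built through a `std::set`, the merged rows come out in ascending order, which is the property the sparse Adam binary search above relies on. A small Python reference of the merge itself (illustrative only):

    def merge_add(rows, values, row_width):
        out_rows = sorted(set(rows))  # std::set iterates in ascending order
        index = dict((r, i) for i, r in enumerate(out_rows))
        out_values = [[0.0] * row_width for _ in out_rows]
        for r, v in zip(rows, values):
            dst = out_values[index[r]]
            for j in range(row_width):
                dst[j] += v[j]
        return out_rows, out_values

    # merge_add([3, 1, 3], [[1.0], [2.0], [0.5]], 1) -> ([1, 3], [[2.0], [1.5]])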
diff --git a/paddle/fluid/operators/math/selected_rows_functor.cu b/paddle/fluid/operators/math/selected_rows_functor.cu
index d559aaa7210eca1f169585760b73e7d95b71d281..ae51a53a7197950338ef773d63103fa13bf0a5f5 100644
--- a/paddle/fluid/operators/math/selected_rows_functor.cu
+++ b/paddle/fluid/operators/math/selected_rows_functor.cu
@@ -234,7 +234,7 @@ template <typename T, int block_size>
 __global__ void MergeAddKernel(const T* input, const int64_t* input_rows,
                                T* out, const int64_t* out_rows,
                                size_t out_rows_size, int64_t row_numel) {
-  const int ty = blockIdx.y;
+  const int ty = blockIdx.x;
   int tid = threadIdx.x;
   __shared__ size_t out_idx;
 
@@ -260,6 +260,14 @@ struct MergeAdd<platform::CUDADeviceContext, T> {
   framework::SelectedRows operator()(const platform::CUDADeviceContext& context,
                                      const framework::SelectedRows& input) {
     framework::SelectedRows out;
+    (*this)(context, input, &out);
+    return out;
+  }
+
+  void operator()(const platform::CUDADeviceContext& context,
+                  const framework::SelectedRows& input,
+                  framework::SelectedRows* output) {
+    framework::SelectedRows& out = *output;
     framework::Vector<int64_t> input_rows(input.rows());
     std::set<int64_t> row_set(input_rows.begin(), input_rows.end());
     std::vector<int64_t> merge_rows(row_set.begin(), row_set.end());
@@ -281,16 +289,12 @@
     const int block_size = 256;
     dim3 threads(block_size, 1);
-    dim3 grid1(1, input_rows.size());
+    dim3 grid1(input_rows.size(), 1);
 
-    MergeAddKernel<
-        T, 256><<<grid1, threads, 0, reinterpret_cast<const platform::CUDADeviceContext&>(context)
-                                         .stream()>>>(
+    MergeAddKernel<T, 256><<<grid1, threads, 0, context.stream()>>>(
         input_data, input_rows.CUDAData(context.GetPlace()), out_data,
         out.mutable_rows()->CUDAMutableData(context.GetPlace()),
         out.rows().size(), input_width);
-    return out;
   }
 };
diff --git a/paddle/fluid/operators/math/selected_rows_functor.h b/paddle/fluid/operators/math/selected_rows_functor.h
index 18304f83f8706f822ce628e2374b00a71f1cc171..aa419f74fcd2a53cdd734ec270bc154b78c9f2ff 100644
--- a/paddle/fluid/operators/math/selected_rows_functor.h
+++ b/paddle/fluid/operators/math/selected_rows_functor.h
@@ -65,6 +65,9 @@ struct MergeAdd {
   // the input SelectedRows object.
   framework::SelectedRows operator()(const DeviceContext& context,
                                      const framework::SelectedRows& input);
+  void operator()(const DeviceContext& context,
+                  const framework::SelectedRows& input,
+                  framework::SelectedRows* output);
 };
 
 template <typename DeviceContext, typename T>
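Note: the header now declares both call shapes: the old form constructs and returns a fresh SelectedRows, while the new form writes into caller-owned storage (as the Adam kernel above does with its scope-held `grad_merge`). A toy Python analogue of the two overloads, reusing the `merge_add` sketch from the previous note (all names hypothetical):

    class SelectedRowsStub(object):
        """Stand-in for framework::SelectedRows: a row list plus values."""
        def __init__(self):
            self.rows, self.values = [], []

    def merge_add_into(rows, values, row_width, output):
        # new overload: fill a caller-provided output, no temporary object
        output.rows, output.values = merge_add(rows, values, row_width)

    def merge_add_value(rows, values, row_width):
        # legacy overload: kept for existing callers, delegates to the above
        out = SelectedRowsStub()
        merge_add_into(rows, values, row_width, out)
        return out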
""" - def __init__(self, - learning_rate, - regularization=None, - LARS_weight_decay=0.0, - name=None): + def __init__(self, learning_rate, regularization=None, name=None): if not isinstance(learning_rate, float) and \ not isinstance(learning_rate, framework.Variable): raise TypeError("learning rate should be float or Variable") @@ -68,7 +64,6 @@ class Optimizer(object): # {accum_name : { paramter_name : accumulator_for_parameter, ...}, ...} self._accumulators = defaultdict(lambda: dict()) self.helper = None - self._LARS_weight_decay = LARS_weight_decay def _create_global_learning_rate(self): lr = self._global_learning_rate() @@ -109,7 +104,6 @@ class Optimizer(object): param = param_and_grad[0] param_lr = param.optimize_attr['learning_rate'] if type(param_lr) == Variable: - # param learning rate has been updated (LARS) print("returns updated param lr ", param_lr) return param_lr else: @@ -227,10 +221,6 @@ class Optimizer(object): self._create_accumulators(loss.block, [p[0] for p in parameters_and_grads]) self._create_global_learning_rate() - if self._LARS_weight_decay > 0.0: - layers.append_LARS(parameters_and_grads, - self._global_learning_rate(), - self._LARS_weight_decay) optimize_ops = [] for param_and_grad in parameters_and_grads: @@ -287,6 +277,9 @@ class SGDOptimizer(Optimizer): Args: learning_rate (float|Variable): the learning rate used to update parameters. \ Can be a float value or a Variable with one float value as data element. + regularization: A Regularizer, such as + fluid.regularizer.L2DecayRegularizer. + name: A optional name prefix. Examples: .. code-block:: python @@ -295,10 +288,12 @@ class SGDOptimizer(Optimizer): sgd_optimizer.minimize(cost) """ - def __init__(self, learning_rate, **kwargs): + def __init__(self, learning_rate, regularization=None, name=None): assert learning_rate is not None super(SGDOptimizer, self).__init__( - learning_rate=learning_rate, **kwargs) + learning_rate=learning_rate, + regularization=regularization, + name=name) self.type = "sgd" def _append_optimize_op(self, block, param_and_grad): @@ -343,6 +338,9 @@ class MomentumOptimizer(Optimizer): Can be a float value or a Variable with one float value as data element. momentum (float): momentum factor use_nesterov (bool): enables Nesterov momentum + regularization: A Regularizer, such as + fluid.regularizer.L2DecayRegularizer. + name: A optional name prefix. Examples: .. code-block:: python @@ -352,11 +350,18 @@ class MomentumOptimizer(Optimizer): """ _velocity_acc_str = "velocity" - def __init__(self, learning_rate, momentum, use_nesterov=False, **kwargs): + def __init__(self, + learning_rate, + momentum, + use_nesterov=False, + regularization=None, + name=None): assert learning_rate is not None assert momentum is not None super(MomentumOptimizer, self).__init__( - learning_rate=learning_rate, **kwargs) + learning_rate=learning_rate, + regularization=regularization, + name=name) self.type = "momentum" self._momentum = momentum self._use_nesterov = bool(use_nesterov) @@ -412,6 +417,9 @@ class AdagradOptimizer(Optimizer): learning_rate (float|Variable): the learning rate used to update parameters. \ Can be a float value or a Variable with one float value as data element. epsilon (float): a small float value for numerical stability. + regularization: A Regularizer, such as + fluid.regularizer.L2DecayRegularizer. + name: A optional name prefix. Examples: .. 
@@ -412,6 +417,9 @@ class AdagradOptimizer(Optimizer):
        learning_rate (float|Variable): the learning rate used to update parameters. \
        Can be a float value or a Variable with one float value as data element.
        epsilon (float): a small float value for numerical stability.
+        regularization: A Regularizer, such as
+            fluid.regularizer.L2DecayRegularizer.
+        name: An optional name prefix.
 
     Examples:
         .. code-block:: python
@@ -421,11 +429,17 @@
     """
     _moment_acc_str = "moment"
 
-    def __init__(self, learning_rate, epsilon=1.0e-6, **kwargs):
+    def __init__(self,
+                 learning_rate,
+                 epsilon=1.0e-6,
+                 regularization=None,
+                 name=None):
         assert learning_rate is not None
         assert epsilon is not None
         super(AdagradOptimizer, self).__init__(
-            learning_rate=learning_rate, **kwargs)
+            learning_rate=learning_rate,
+            regularization=regularization,
+            name=name)
         self.type = "adagrad"
         self._epsilon = epsilon
@@ -485,6 +499,9 @@ class AdamOptimizer(Optimizer):
        beta1 (float): The exponential decay rate for the 1st moment estimates.
        beta2 (float): The exponential decay rate for the 2nd moment estimates.
        epsilon (float): a small float value for numerical stability.
+        regularization: A Regularizer, such as
+            fluid.regularizer.L2DecayRegularizer.
+        name: An optional name prefix.
 
     Examples:
         .. code-block:: python
@@ -503,13 +520,16 @@
                  beta1=0.9,
                  beta2=0.999,
                  epsilon=1e-8,
-                 **kwargs):
+                 regularization=None,
+                 name=None):
         assert learning_rate is not None
         assert beta1 is not None
         assert beta2 is not None
         assert epsilon is not None
         super(AdamOptimizer, self).__init__(
-            learning_rate=learning_rate, **kwargs)
+            learning_rate=learning_rate,
+            regularization=regularization,
+            name=name)
         self.type = "adam"
         self._beta1 = beta1
         self._beta2 = beta2
@@ -629,6 +649,9 @@ class AdamaxOptimizer(Optimizer):
        beta1 (float): The exponential decay rate for the 1st moment estimates.
        beta2 (float): The exponential decay rate for the 2nd moment estimates.
        epsilon (float): a small float value for numerical stability.
+        regularization: A Regularizer, such as
+            fluid.regularizer.L2DecayRegularizer.
+        name: An optional name prefix.
 
     Examples:
         .. code-block:: python
@@ -645,13 +668,16 @@
                  beta1=0.9,
                  beta2=0.999,
                  epsilon=1e-8,
-                 **kwargs):
+                 regularization=None,
+                 name=None):
         assert learning_rate is not None
         assert beta1 is not None
         assert beta2 is not None
         assert epsilon is not None
         super(AdamaxOptimizer, self).__init__(
-            learning_rate=learning_rate, **kwargs)
+            learning_rate=learning_rate,
+            regularization=regularization,
+            name=name)
         self.type = "adamax"
         self._beta1 = beta1
         self._beta2 = beta2
@@ -742,6 +768,9 @@ class DecayedAdagradOptimizer(Optimizer):
        Can be a float value or a Variable with one float value as data element.
        decay (float): decay rate.
        epsilon (float): a small float value for numerical stability.
+        regularization: A Regularizer, such as
+            fluid.regularizer.L2DecayRegularizer.
+        name: An optional name prefix.
 
     Examples:
         .. code-block:: python
@@ -751,13 +780,20 @@
     """
     _moment_acc_str = "moment"
 
-    def __init__(self, learning_rate, decay=0.95, epsilon=1.0e-6, **kwargs):
+    def __init__(self,
+                 learning_rate,
+                 decay=0.95,
+                 epsilon=1.0e-6,
+                 regularization=None,
+                 name=None):
         assert learning_rate is not None
         assert decay is not None
         assert epsilon is not None
         super(DecayedAdagradOptimizer, self).__init__(
-            learning_rate=learning_rate, **kwargs)
+            learning_rate=learning_rate,
+            regularization=regularization,
+            name=name)
         self.type = "decayed_adagrad"
         self._decay = decay
         self._epsilon = epsilon
@@ -811,6 +847,9 @@ class AdadeltaOptimizer(Optimizer):
        learning_rate(float): global learning rate
        rho(float): rho in equation
        epsilon(float): epsilon in equation
+        regularization: A Regularizer, such as
+            fluid.regularizer.L2DecayRegularizer.
+        name: An optional name prefix.
 
     Examples:
         .. code-block:: python
@@ -823,7 +862,12 @@
     _avg_squared_grad_acc_str = "_avg_squared_grad"
     _avg_squared_update_acc_str = "_avg_squared_update"
 
-    def __init__(self, learning_rate, epsilon=1.0e-6, rho=0.95, **kwargs):
+    def __init__(self,
+                 learning_rate,
+                 epsilon=1.0e-6,
+                 rho=0.95,
+                 regularization=None,
+                 name=None):
         if learning_rate is None:
             raise ValueError("learning_rate is not set.")
         if epsilon is None:
@@ -831,7 +875,9 @@
         if rho is None:
             raise ValueError("rho is not set.")
         super(AdadeltaOptimizer, self).__init__(
-            learning_rate=learning_rate, **kwargs)
+            learning_rate=learning_rate,
+            regularization=regularization,
+            name=name)
         self.type = "adadelta"
         self._epsilon = epsilon
         self._rho = rho
@@ -932,6 +978,9 @@ class RMSPropOptimizer(Optimizer):
        the gradient; if False, by the uncentered second moment. Setting this to
        True may help with training, but is slightly more expensive in terms of
        computation and memory. Defaults to False.
+        regularization: A Regularizer, such as
+            fluid.regularizer.L2DecayRegularizer.
+        name: An optional name prefix.
 
     Raises:
         ValueError: If learning_rate, rho, epsilon, momentum are None.
@@ -953,9 +1002,12 @@
                  epsilon=1.0e-6,
                  momentum=0.0,
                  centered=False,
-                 **kwargs):
+                 regularization=None,
+                 name=None):
         super(RMSPropOptimizer, self).__init__(
-            learning_rate=learning_rate, **kwargs)
+            learning_rate=learning_rate,
+            regularization=regularization,
+            name=name)
         if learning_rate is None:
             raise ValueError("learning_rate is not set.")
         if rho is None:
@@ -1061,6 +1113,9 @@ class FtrlOptimizer(Optimizer):
        l1 (float):
        l2 (float):
        lr_power (float):
+        regularization: A Regularizer, such as
+            fluid.regularizer.L2DecayRegularizer.
+        name: An optional name prefix.
 
     Raises:
         ValueError: If learning_rate, rho, epsilon, momentum are None.
@@ -1075,9 +1130,17 @@
     _squared_acc_str = "squared"
     _linear_acc_str = "linear"
 
-    def __init__(self, learning_rate, l1=0.0, l2=0.0, lr_power=-0.5, **kwargs):
+    def __init__(self,
+                 learning_rate,
+                 l1=0.0,
+                 l2=0.0,
+                 lr_power=-0.5,
+                 regularization=None,
+                 name=None):
         super(FtrlOptimizer, self).__init__(
-            learning_rate=learning_rate, **kwargs)
+            learning_rate=learning_rate,
+            regularization=regularization,
+            name=name)
 
         if learning_rate is None:
             raise ValueError("learning_rate is not set.")
@@ -1155,7 +1218,9 @@ class ModelAverage(Optimizer):
         average_window_rate: The rate of average window.
         min_average_window: The minimum size of average window.
         max_average_window: The maximum size of average window.
-
+        regularization: A Regularizer, such as
+            fluid.regularizer.L2DecayRegularizer.
+        name: An optional name prefix.
     Examples:
         .. code-block:: python
@@ -1178,8 +1243,10 @@
                  average_window_rate,
                  min_average_window=10000,
                  max_average_window=10000,
-                 **kwargs):
-        super(ModelAverage, self).__init__(0.0, **kwargs)
+                 regularization=None,
+                 name=None):
+        super(ModelAverage, self).__init__(
+            0.0, regularization=regularization, name=name)
         self.average_window = average_window_rate
         self.min_average_window = min_average_window
         self.max_average_window = max_average_window
diff --git a/python/paddle/fluid/regularizer.py b/python/paddle/fluid/regularizer.py
index 8f4678649f2146c84150ad2659e497bbf0365d03..a4336e955f21b0b09bf3dadbd437855c06745860 100644
--- a/python/paddle/fluid/regularizer.py
+++ b/python/paddle/fluid/regularizer.py
@@ -190,14 +190,11 @@ class L1DecayRegularizer(WeightDecayRegularizer):
     Examples:
         .. code-block:: python
 
-            program = fluid.framework.Program()
-            block = program.global_block()
-            mul_x = block.create_parameter(
-                dtype="float32",
-                shape=[5, 10],
-                lod_level=0,
-                name="mul.x",
-                regularizer=fluid.regularizer.L1DecayRegularizer(0.5))
+            optimizer = fluid.optimizer.Adagrad(
+                learning_rate=1e-4,
+                regularization=fluid.regularizer.L1DecayRegularizer(
+                    regularization_coeff=0.1))
+            optimizer.minimize(avg_cost)
     """
 
     def __init__(self, regularization_coeff=0.0):
diff --git a/python/paddle/fluid/tests/book/test_recognize_digits.py b/python/paddle/fluid/tests/book/test_recognize_digits.py
index 135f11d24c8fde35995cfe577859874ca8428cd4..4b4f3e403776625fb5ca2f9b03d14ee7efe23d53 100644
--- a/python/paddle/fluid/tests/book/test_recognize_digits.py
+++ b/python/paddle/fluid/tests/book/test_recognize_digits.py
@@ -99,7 +99,7 @@ def train(nn_type,
 
     test_program = fluid.default_main_program().clone(for_test=True)
 
-    optimizer = fluid.optimizer.Adam(learning_rate=0.001, LARS_weight_decay=0.3)
+    optimizer = fluid.optimizer.Adam(learning_rate=0.001)
     optimizer.minimize(avg_loss)
 
     place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index 88d36fe639c7437b478efb2ee292d792677403d9..f53fe6d69d0855c8ba88eac8059708b690d2475b 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -34,12 +34,13 @@ if(APPLE)
         list(REMOVE_ITEM TEST_OPS test_desc_clone)
         list(REMOVE_ITEM TEST_OPS test_program_code)
     endif(NOT WITH_DISTRIBUTE)
-    message(WARNING "These tests has been disabled in OSX before being fixed: \n test_detection_map_op \n test_dist_se_resnext")
+    message(WARNING "These tests has been disabled in OSX before being fixed: \n test_fuse_elewise_add_act_pass \n test_detection_map_op \n test_dist_se_resnext")
     # this op is not support on mac
     list(REMOVE_ITEM TEST_OPS test_fusion_seqexpand_concat_fc_op)
     # TODO: add the unitest back when it fixed
     list(REMOVE_ITEM TEST_OPS test_detection_map_op)
     list(REMOVE_ITEM TEST_OPS test_dist_se_resnext)
+    list(REMOVE_ITEM TEST_OPS test_fuse_elewise_add_act_pass)
 endif()
 
 function(py_test_modules TARGET_NAME)
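Closing note on the regularizer.py docstring change above: it steers users toward optimizer-level regularization rather than attaching a regularizer when a parameter is created. To my understanding, a regularizer set per parameter through `ParamAttr` still takes precedence over the optimizer-wide setting for that parameter; that precedence claim is an assumption, not stated in this patch, and `data` below is a placeholder input variable:

    import paddle.fluid as fluid

    # Optimizer-level regularization applies to every trainable parameter...
    optimizer = fluid.optimizer.Adagrad(
        learning_rate=1e-4,
        regularization=fluid.regularizer.L1DecayRegularizer(
            regularization_coeff=0.1))
    # ...except, by assumption, parameters carrying their own regularizer:
    fc = fluid.layers.fc(
        input=data,
        size=10,
        param_attr=fluid.ParamAttr(
            regularizer=fluid.regularizer.L2Decay(1e-4)))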