diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index aec60166a10b700601f12292561554f8b6b440d7..50ffef72baa1c5f210fd6e92de05d24a39ac86b4 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -418,7 +418,7 @@ paddle.fluid.optimizer.AdagradOptimizer.__init__ ArgSpec(args=['self', 'learning
 paddle.fluid.optimizer.AdagradOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.optimizer.AdagradOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None))
 paddle.fluid.optimizer.AdagradOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
-paddle.fluid.optimizer.AdamOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name', 'lazy_mode', 'min_row_size_to_use_multithread'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None, False, 0))
+paddle.fluid.optimizer.AdamOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name', 'lazy_mode'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None, False))
 paddle.fluid.optimizer.AdamOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.optimizer.AdamOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None))
 paddle.fluid.optimizer.AdamOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
diff --git a/paddle/fluid/operators/optimizers/adam_op.cc b/paddle/fluid/operators/optimizers/adam_op.cc
index 955f9f455f0b6be3883118ec9df9a125cb13e3ff..54e0f5146dab3e19713d19e15c6c81868179b319 100644
--- a/paddle/fluid/operators/optimizers/adam_op.cc
+++ b/paddle/fluid/operators/optimizers/adam_op.cc
@@ -120,7 +120,7 @@ class AdamOpMaker : public framework::OpProtoAndCheckerMaker {
                      "min_row_size_to_use_multithread and "
                      "inner_op_parallelism is larger then 0, sparse update "
                      "will run in multithread mode")
-        .SetDefault(0);
+        .SetDefault(1000);
 
   AddComment(R"DOC(
 Adam Optimizer.
diff --git a/paddle/fluid/operators/optimizers/adam_op.h b/paddle/fluid/operators/optimizers/adam_op.h
index f3c9be63d1cc98945a309d0e56578dcc2721bc25..db44cd6ec989d27f2994625e1641b5ac60880a8a 100644
--- a/paddle/fluid/operators/optimizers/adam_op.h
+++ b/paddle/fluid/operators/optimizers/adam_op.h
@@ -494,16 +494,16 @@ class AdamOpKernel : public framework::OpKernel<T> {
                 << " min_row_size_to_use_multithread="
                 << min_row_size_to_use_multithread;
         if (FLAGS_inner_op_parallelism > 10) {
-          LOG(WARNING) << "FLAGS_inner_op_parallelism "
-                       << FLAGS_inner_op_parallelism << " is two large!";
+          VLOG(1) << "FLAGS_inner_op_parallelism "
+                  << FLAGS_inner_op_parallelism << " is too large!";
         }
         auto& grad_rows = grad_merge.rows();
         std::unordered_map<size_t, size_t> row_id_to_grad_row_offset;
         size_t param_row_count = param.numel() / row_numel;
         if (param_row_count < 1000) {
-          LOG(WARNING) << "param_row_count should be larger then 1000 to use "
-                          "multi thread, currently "
-                       << param_row_count;
+          VLOG(1) << "param_row_count should be larger than 1000 to use "
+                     "multi thread, currently "
+                  << param_row_count;
         }
         for (size_t i = 0; i < grad_rows.size(); ++i) {
           row_id_to_grad_row_offset[grad_rows[i]] = i;
diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
index 906d64ffdd3a2329685115366115e89cc9ff4eaf..f01a0eda9a711abb3265fe5bb86ecb702a6ac6aa 100644
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -734,8 +734,6 @@ class AdamOptimizer(Optimizer):
             may be very slow. The lazy mode only update the element that has gradient is the current
            mini-batch, so it will be much more faster. But this mode has different semantics with the
            original Adam algorithm and may lead to different result.
-        min_row_size_to_use_multithread: if adam use sparse update and the param rows is very large,
-            you can use FLAGS_inner_op_parallelism and this flag to enable multi thread optimize.
 
     Examples:
         .. code-block:: python
@@ -756,8 +754,7 @@
                  epsilon=1e-8,
                  regularization=None,
                  name=None,
-                 lazy_mode=False,
-                 min_row_size_to_use_multithread=0):
+                 lazy_mode=False):
         assert learning_rate is not None
         assert beta1 is not None
         assert beta2 is not None
@@ -771,7 +768,6 @@
         self._beta2 = beta2
         self._epsilon = epsilon
         self._lazy_mode = lazy_mode
-        self._min_row_size_to_use_multithread = min_row_size_to_use_multithread
 
     def _create_accumulators(self, block, parameters):
         assert isinstance(block, framework.Block)
@@ -826,9 +822,7 @@
                 "beta1": self._beta1,
                 "beta2": self._beta2,
                 "epsilon": self._epsilon,
-                "lazy_mode": self._lazy_mode,
-                "min_row_size_to_use_multithread":
-                self._min_row_size_to_use_multithread
+                "lazy_mode": self._lazy_mode
             },
             stop_gradient=True)
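
Usage note (not part of the patch): after this change, min_row_size_to_use_multithread is no longer an argument of paddle.fluid.optimizer.AdamOptimizer.__init__; it stays an attribute of the C++ adam op with a default of 1000, and multithreaded sparse updates are still gated by the FLAGS_inner_op_parallelism flag. The sketch below is a minimal, hypothetical example of the post-change Python API; it assumes the paddle.fluid 1.x layers API and that FLAGS_* gflags can be supplied through environment variables before paddle is imported.

    import os
    # Assumption: Paddle reads gflags such as FLAGS_inner_op_parallelism from the
    # environment when it initializes, so set it before importing paddle.
    os.environ['FLAGS_inner_op_parallelism'] = '4'

    import paddle.fluid as fluid

    # A tiny regression network, only so that there is a loss to minimize.
    x = fluid.layers.data(name='x', shape=[13], dtype='float32')
    y = fluid.layers.data(name='y', shape=[1], dtype='float32')
    pred = fluid.layers.fc(input=x, size=1)
    loss = fluid.layers.mean(fluid.layers.square_error_cost(input=pred, label=y))

    # lazy_mode is still exposed on the Python side; min_row_size_to_use_multithread
    # is not (it is now an op attribute defaulting to 1000).
    optimizer = fluid.optimizer.AdamOptimizer(learning_rate=0.001, lazy_mode=True)
    optimizer.minimize(loss)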