From 8c516a24e5d670dea5982bdfb6a07a79c03cd31d Mon Sep 17 00:00:00 2001
From: Qiao Longfei
Date: Tue, 15 Jan 2019 09:56:40 +0800
Subject: [PATCH] remove min_row_size_to_use_multithread in adam interface

test=develop
---
 paddle/fluid/API.spec                        |  2 +-
 paddle/fluid/operators/optimizers/adam_op.cc |  2 +-
 paddle/fluid/operators/optimizers/adam_op.h  | 10 +++++-----
 python/paddle/fluid/optimizer.py             | 10 ++--------
 4 files changed, 9 insertions(+), 15 deletions(-)

diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index aec60166a10..50ffef72baa 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -418,7 +418,7 @@ paddle.fluid.optimizer.AdagradOptimizer.__init__ ArgSpec(args=['self', 'learning
 paddle.fluid.optimizer.AdagradOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.optimizer.AdagradOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None))
 paddle.fluid.optimizer.AdagradOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
-paddle.fluid.optimizer.AdamOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name', 'lazy_mode', 'min_row_size_to_use_multithread'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None, False, 0))
+paddle.fluid.optimizer.AdamOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name', 'lazy_mode'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None, False))
 paddle.fluid.optimizer.AdamOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.optimizer.AdamOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None))
 paddle.fluid.optimizer.AdamOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
diff --git a/paddle/fluid/operators/optimizers/adam_op.cc b/paddle/fluid/operators/optimizers/adam_op.cc
index 955f9f455f0..54e0f5146da 100644
--- a/paddle/fluid/operators/optimizers/adam_op.cc
+++ b/paddle/fluid/operators/optimizers/adam_op.cc
@@ -120,7 +120,7 @@ class AdamOpMaker : public framework::OpProtoAndCheckerMaker {
                      "min_row_size_to_use_multithread and "
                      "inner_op_parallelism is larger then 0, sparse update "
                      "will run in multithread mode")
-        .SetDefault(0);
+        .SetDefault(1000);
     AddComment(R"DOC(
 Adam Optimizer.
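[Editorial note, not part of the patch] With the op attribute's default raised from 0 to 1000, multithreaded sparse updates are no longer toggled through an optimizer argument; they depend only on this default together with the FLAGS_inner_op_parallelism gflag referenced above. A minimal sketch of how a user might enable them, assuming the flag is picked up from the environment when paddle.fluid is imported (the bootstrap mechanism is an assumption here, not confirmed by the patch):

    import os

    # Assumed mechanism: gflags-style FLAGS_* variables are read from the
    # environment at import time, so the value must be set before the import.
    os.environ["FLAGS_inner_op_parallelism"] = "4"

    import paddle.fluid as fluid  # noqa: E402  (imported after setting the flag)
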
diff --git a/paddle/fluid/operators/optimizers/adam_op.h b/paddle/fluid/operators/optimizers/adam_op.h
index f3c9be63d1c..db44cd6ec98 100644
--- a/paddle/fluid/operators/optimizers/adam_op.h
+++ b/paddle/fluid/operators/optimizers/adam_op.h
@@ -494,16 +494,16 @@ class AdamOpKernel : public framework::OpKernel {
               << " min_row_size_to_use_multithread="
               << min_row_size_to_use_multithread;
       if (FLAGS_inner_op_parallelism > 10) {
-        LOG(WARNING) << "FLAGS_inner_op_parallelism "
-                     << FLAGS_inner_op_parallelism << " is two large!";
+        VLOG(1) << "FLAGS_inner_op_parallelism "
+                << FLAGS_inner_op_parallelism << " is two large!";
       }
       auto& grad_rows = grad_merge.rows();
       std::unordered_map row_id_to_grad_row_offset;
       size_t param_row_count = param.numel() / row_numel;
       if (param_row_count < 1000) {
-        LOG(WARNING) << "param_row_count should be larger then 1000 to use "
-                        "multi thread, currently "
-                     << param_row_count;
+        VLOG(1) << "param_row_count should be larger then 1000 to use "
+                   "multi thread, currently "
+                << param_row_count;
       }
       for (size_t i = 0; i < grad_rows.size(); ++i) {
         row_id_to_grad_row_offset[grad_rows[i]] = i;
diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
index 906d64ffdd3..f01a0eda9a7 100644
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -734,8 +734,6 @@ class AdamOptimizer(Optimizer):
             may be very slow. The lazy mode only update the element that has
             gradient is the current mini-batch, so it will be much more faster.
             But this mode has different semantics with the original Adam algorithm and may lead to different result.
-        min_row_size_to_use_multithread: if adam use sparse update and the param rows is very large,
-            you can use FLAGS_inner_op_parallelism and this flag to enable multi thread optimize.
 
     Examples:
         .. code-block:: python
@@ -756,8 +754,7 @@ class AdamOptimizer(Optimizer):
                  epsilon=1e-8,
                  regularization=None,
                  name=None,
-                 lazy_mode=False,
-                 min_row_size_to_use_multithread=0):
+                 lazy_mode=False):
         assert learning_rate is not None
         assert beta1 is not None
         assert beta2 is not None
@@ -771,7 +768,6 @@ class AdamOptimizer(Optimizer):
         self._beta2 = beta2
         self._epsilon = epsilon
         self._lazy_mode = lazy_mode
-        self._min_row_size_to_use_multithread = min_row_size_to_use_multithread
 
     def _create_accumulators(self, block, parameters):
         assert isinstance(block, framework.Block)
@@ -826,9 +822,7 @@ class AdamOptimizer(Optimizer):
                 "beta1": self._beta1,
                 "beta2": self._beta2,
                 "epsilon": self._epsilon,
-                "lazy_mode": self._lazy_mode,
-                "min_row_size_to_use_multithread":
-                self._min_row_size_to_use_multithread
+                "lazy_mode": self._lazy_mode
             },
             stop_gradient=True)
--
GitLab
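
[Editorial note, not part of the patch] For reference, a minimal usage sketch against the updated AdamOptimizer signature recorded in API.spec above; the tiny network (one fc layer, mean of a square error cost) and variable names are placeholders for illustration, not taken from the patch:

    import paddle.fluid as fluid

    x = fluid.layers.data(name="x", shape=[13], dtype="float32")
    y = fluid.layers.data(name="y", shape=[1], dtype="float32")
    y_pred = fluid.layers.fc(input=x, size=1)
    avg_cost = fluid.layers.mean(
        fluid.layers.square_error_cost(input=y_pred, label=y))

    # min_row_size_to_use_multithread is no longer a constructor argument;
    # only the parameters kept in the new ArgSpec are passed here.
    adam = fluid.optimizer.AdamOptimizer(
        learning_rate=0.001,
        beta1=0.9,
        beta2=0.999,
        epsilon=1e-8,
        lazy_mode=False)
    adam.minimize(avg_cost)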