Commit 8c516a24 authored by Qiao Longfei

remove min_row_size_to_use_multithread in adam interface test=develop

Parent 7fd15ce5
@@ -418,7 +418,7 @@ paddle.fluid.optimizer.AdagradOptimizer.__init__ ArgSpec(args=['self', 'learning
 paddle.fluid.optimizer.AdagradOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.optimizer.AdagradOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None))
 paddle.fluid.optimizer.AdagradOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
-paddle.fluid.optimizer.AdamOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name', 'lazy_mode', 'min_row_size_to_use_multithread'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None, False, 0))
+paddle.fluid.optimizer.AdamOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name', 'lazy_mode'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None, False))
 paddle.fluid.optimizer.AdamOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.optimizer.AdamOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None))
 paddle.fluid.optimizer.AdamOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
...
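For reference, a minimal sketch of constructing the optimizer with the signature that remains after this change. The argument names and defaults are taken from the updated ArgSpec above; the import path is the fluid API of this era.

    import paddle.fluid as fluid

    # Sketch based on the updated ArgSpec: min_row_size_to_use_multithread is
    # gone from the Python interface; the remaining arguments keep their
    # previous defaults.
    adam = fluid.optimizer.AdamOptimizer(
        learning_rate=0.001,
        beta1=0.9,
        beta2=0.999,
        epsilon=1e-8,
        regularization=None,
        name=None,
        lazy_mode=False)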
@@ -120,7 +120,7 @@ class AdamOpMaker : public framework::OpProtoAndCheckerMaker {
                  "min_row_size_to_use_multithread and "
                  "inner_op_parallelism is larger then 0, sparse update "
                  "will run in multithread mode")
-        .SetDefault(0);
+        .SetDefault(1000);
     AddComment(R"DOC(
 Adam Optimizer.
...
@@ -494,16 +494,16 @@ class AdamOpKernel : public framework::OpKernel<T> {
                 << " min_row_size_to_use_multithread="
                 << min_row_size_to_use_multithread;
         if (FLAGS_inner_op_parallelism > 10) {
-          LOG(WARNING) << "FLAGS_inner_op_parallelism "
-                       << FLAGS_inner_op_parallelism << " is two large!";
+          VLOG(1) << "FLAGS_inner_op_parallelism "
+                  << FLAGS_inner_op_parallelism << " is two large!";
         }
         auto& grad_rows = grad_merge.rows();
         std::unordered_map<size_t, int> row_id_to_grad_row_offset;
         size_t param_row_count = param.numel() / row_numel;
         if (param_row_count < 1000) {
-          LOG(WARNING) << "param_row_count should be larger then 1000 to use "
-                          "multi thread, currently "
-                       << param_row_count;
+          VLOG(1) << "param_row_count should be larger then 1000 to use "
+                     "multi thread, currently "
+                  << param_row_count;
         }
         for (size_t i = 0; i < grad_rows.size(); ++i) {
           row_id_to_grad_row_offset[grad_rows[i]] = i;
...
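Read together with the op-maker hunk, the kernel still decides per parameter whether the sparse Adam update runs multithreaded; only the Python-level knob is removed. Below is a hedged sketch of that gating in Python. The variable names follow the kernel excerpt above, but the exact condition in the real kernel may differ.

    def should_use_multithread(inner_op_parallelism,
                               min_row_size_to_use_multithread,
                               param_row_count):
        # FLAGS_inner_op_parallelism must enable parallelism, and the parameter
        # must have enough rows (the threshold now comes from the op attribute,
        # whose default this commit raises to 1000) before the sparse Adam
        # update runs in multithread mode. This is a sketch, not the kernel's
        # literal condition.
        return (inner_op_parallelism > 1 and
                min_row_size_to_use_multithread > 0 and
                param_row_count >= min_row_size_to_use_multithread)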
@@ -734,8 +734,6 @@ class AdamOptimizer(Optimizer):
             may be very slow. The lazy mode only update the element that has gradient is the current
             mini-batch, so it will be much more faster. But this mode has different semantics with the
             original Adam algorithm and may lead to different result.
-        min_row_size_to_use_multithread: if adam use sparse update and the param rows is very large,
-            you can use FLAGS_inner_op_parallelism and this flag to enable multi thread optimize.

     Examples:
         .. code-block:: python
@@ -756,8 +754,7 @@ class AdamOptimizer(Optimizer):
                  epsilon=1e-8,
                  regularization=None,
                  name=None,
-                 lazy_mode=False,
-                 min_row_size_to_use_multithread=0):
+                 lazy_mode=False):
         assert learning_rate is not None
         assert beta1 is not None
         assert beta2 is not None
@@ -771,7 +768,6 @@ class AdamOptimizer(Optimizer):
         self._beta2 = beta2
         self._epsilon = epsilon
         self._lazy_mode = lazy_mode
-        self._min_row_size_to_use_multithread = min_row_size_to_use_multithread

     def _create_accumulators(self, block, parameters):
         assert isinstance(block, framework.Block)
@@ -826,9 +822,7 @@ class AdamOptimizer(Optimizer):
                 "beta1": self._beta1,
                 "beta2": self._beta2,
                 "epsilon": self._epsilon,
-                "lazy_mode": self._lazy_mode,
-                "min_row_size_to_use_multithread":
-                self._min_row_size_to_use_multithread
+                "lazy_mode": self._lazy_mode
             },
             stop_gradient=True)
...
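One practical consequence for call sites written against the old interface, shown as an illustrative snippet (the threshold value below is hypothetical): because the keyword is removed from __init__ and the ArgSpec has no varargs or **kwargs, passing it now raises a TypeError.

    import paddle.fluid as fluid

    try:
        # Old-style call site: the keyword no longer exists in __init__.
        opt = fluid.optimizer.AdamOptimizer(
            learning_rate=0.001,
            lazy_mode=True,
            min_row_size_to_use_multithread=100000)  # removed argument
    except TypeError:
        # Drop the argument; the threshold is now the adam op's attribute
        # default (1000), set in C++ rather than from Python.
        opt = fluid.optimizer.AdamOptimizer(learning_rate=0.001, lazy_mode=True)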