diff --git a/python/paddle/optimizer/adadelta.py b/python/paddle/optimizer/adadelta.py
index 0edebb176255da995333cfcabf50c548595f9b94..e4826997761c1d7ab80ffd1aa1e2bee9868b3e4e 100644
--- a/python/paddle/optimizer/adadelta.py
+++ b/python/paddle/optimizer/adadelta.py
@@ -70,39 +70,39 @@ class Adadelta(Optimizer):
     Examples:
         .. code-block:: python

-            import paddle
-
-            inp = paddle.uniform([10, 10], dtype="float32", min=-0.1, max=0.1)
-            linear = paddle.nn.Linear(10, 10)
-            out = linear(inp)
-            loss = paddle.mean(out)
-            beta1 = paddle.to_tensor([0.9], dtype="float32")
-            beta2 = paddle.to_tensor([0.99], dtype="float32")
-            adadelta = paddle.optimizer.Adadelta(learning_rate=0.1, parameters=linear.parameters(), weight_decay=0.01)
-            back = out.backward()
-            adadelta.step()
-            adadelta.clear_grad()
-
-            #Note that the learning_rate of linear_2 is 0.01.
-            linear_1 = paddle.nn.Linear(10, 10)
-            linear_2 = paddle.nn.Linear(10, 10)
-            inp = paddle.uniform(shape=[10, 10], min=-0.1, max=0.1)
-            out = linear_1(inp)
-            out = linear_2(out)
-            loss = paddle.mean(out)
-            adadelta = paddle.optimizer.Adadelta(
-                learning_rate=0.1,
-                parameters=[{
-                    'params': linear_1.parameters()
-                }, {
-                    'params': linear_2.parameters(),
-                    'weight_decay': 0.001,
-                    'learning_rate': 0.1,
-                }],
-                weight_decay=0.01)
-            out.backward()
-            adadelta.step()
-            adadelta.clear_grad()
+            >>> import paddle
+
+            >>> inp = paddle.uniform([10, 10], dtype="float32", min=-0.1, max=0.1)
+            >>> linear = paddle.nn.Linear(10, 10)
+            >>> out = linear(inp)
+            >>> loss = paddle.mean(out)
+            >>> beta1 = paddle.to_tensor([0.9], dtype="float32")
+            >>> beta2 = paddle.to_tensor([0.99], dtype="float32")
+            >>> adadelta = paddle.optimizer.Adadelta(learning_rate=0.1, parameters=linear.parameters(), weight_decay=0.01)
+            >>> back = out.backward()
+            >>> adadelta.step()
+            >>> adadelta.clear_grad()
+
+            >>> # Note that the learning_rate of linear_2 is 0.01.
+            >>> linear_1 = paddle.nn.Linear(10, 10)
+            >>> linear_2 = paddle.nn.Linear(10, 10)
+            >>> inp = paddle.uniform(shape=[10, 10], min=-0.1, max=0.1)
+            >>> out = linear_1(inp)
+            >>> out = linear_2(out)
+            >>> loss = paddle.mean(out)
+            >>> adadelta = paddle.optimizer.Adadelta(
+            ...     learning_rate=0.1,
+            ...     parameters=[{
+            ...         'params': linear_1.parameters()
+            ...     }, {
+            ...         'params': linear_2.parameters(),
+            ...         'weight_decay': 0.001,
+            ...         'learning_rate': 0.1,
+            ...     }],
+            ...     weight_decay=0.01)
+            >>> out.backward()
+            >>> adadelta.step()
+            >>> adadelta.clear_grad()

     """

diff --git a/python/paddle/optimizer/adagrad.py b/python/paddle/optimizer/adagrad.py
index c19b3116de3fcff66fd058f2633a5ef71b673210..3373866ba048a405b5603bf64ff13bdf7d825f4a 100644
--- a/python/paddle/optimizer/adagrad.py
+++ b/python/paddle/optimizer/adagrad.py
@@ -70,38 +70,38 @@ class Adagrad(Optimizer):
     Examples:
         .. code-block:: python

-            import paddle
-
-            inp = paddle.rand(shape=[10, 10])
-            linear = paddle.nn.Linear(10, 10)
-            out = linear(inp)
-            loss = paddle.mean(out)
-            adagrad = paddle.optimizer.Adagrad(learning_rate=0.1,
-                    parameters=linear.parameters())
-            out.backward()
-            adagrad.step()
-            adagrad.clear_grad()
-
-            #Note that the learning_rate of linear_2 is 0.01.
-            linear_1 = paddle.nn.Linear(10, 10)
-            linear_2 = paddle.nn.Linear(10, 10)
-            inp = paddle.uniform(shape=[10, 10], min=-0.1, max=0.1)
-            out = linear_1(inp)
-            out = linear_2(out)
-            loss = paddle.mean(out)
-            adagrad = paddle.optimizer.Adagrad(
-                learning_rate=0.1,
-                parameters=[{
-                    'params': linear_1.parameters()
-                }, {
-                    'params': linear_2.parameters(),
-                    'weight_decay': 0.001,
-                    'learning_rate': 0.1,
-                }],
-                weight_decay=0.01)
-            out.backward()
-            adagrad.step()
-            adagrad.clear_grad()
+            >>> import paddle
+
+            >>> inp = paddle.rand(shape=[10, 10])
+            >>> linear = paddle.nn.Linear(10, 10)
+            >>> out = linear(inp)
+            >>> loss = paddle.mean(out)
+            >>> adagrad = paddle.optimizer.Adagrad(learning_rate=0.1,
+            ...                                    parameters=linear.parameters())
+            >>> out.backward()
+            >>> adagrad.step()
+            >>> adagrad.clear_grad()
+
+            >>> # Note that the learning_rate of linear_2 is 0.01.
+            >>> linear_1 = paddle.nn.Linear(10, 10)
+            >>> linear_2 = paddle.nn.Linear(10, 10)
+            >>> inp = paddle.uniform(shape=[10, 10], min=-0.1, max=0.1)
+            >>> out = linear_1(inp)
+            >>> out = linear_2(out)
+            >>> loss = paddle.mean(out)
+            >>> adagrad = paddle.optimizer.Adagrad(
+            ...     learning_rate=0.1,
+            ...     parameters=[{
+            ...         'params': linear_1.parameters()
+            ...     }, {
+            ...         'params': linear_2.parameters(),
+            ...         'weight_decay': 0.001,
+            ...         'learning_rate': 0.1,
+            ...     }],
+            ...     weight_decay=0.01)
+            >>> out.backward()
+            >>> adagrad.step()
+            >>> adagrad.clear_grad()

     """
     _moment_acc_str = "moment"
diff --git a/python/paddle/optimizer/adam.py b/python/paddle/optimizer/adam.py
index c1a4019b76e3bdd5e302c9d582e77aaf70d220df..f58b82a13188c8ef849e7240209eb9c6b73a9c32 100644
--- a/python/paddle/optimizer/adam.py
+++ b/python/paddle/optimizer/adam.py
@@ -98,63 +98,61 @@ class Adam(Optimizer):
         .. code-block:: python
             :name: code-example1

-            import paddle
-
-            linear = paddle.nn.Linear(10, 10)
-            inp = paddle.rand([10,10], dtype="float32")
-            out = linear(inp)
-            loss = paddle.mean(out)
-            adam = paddle.optimizer.Adam(learning_rate=0.1,
-                    parameters=linear.parameters())
-            loss.backward()
-            adam.step()
-            adam.clear_grad()
+            >>> import paddle
+
+            >>> linear = paddle.nn.Linear(10, 10)
+            >>> inp = paddle.rand([10,10], dtype="float32")
+            >>> out = linear(inp)
+            >>> loss = paddle.mean(out)
+            >>> adam = paddle.optimizer.Adam(learning_rate=0.1,
+            ...                              parameters=linear.parameters())
+            >>> loss.backward()
+            >>> adam.step()
+            >>> adam.clear_grad()

         .. code-block:: python
             :name: code-example2

-            # Adam with beta1/beta2 as Tensor and weight_decay as float
-            import paddle
-
-            linear = paddle.nn.Linear(10, 10)
-            inp = paddle.rand([10,10], dtype="float32")
-            out = linear(inp)
-            loss = paddle.mean(out)
-
-            beta1 = paddle.to_tensor([0.9], dtype="float32")
-            beta2 = paddle.to_tensor([0.99], dtype="float32")
-
-            adam = paddle.optimizer.Adam(learning_rate=0.1,
-                    parameters=linear.parameters(),
-                    beta1=beta1,
-                    beta2=beta2,
-                    weight_decay=0.01)
-            loss.backward()
-            adam.step()
-            adam.clear_grad()
-
-            #Note that the learning_rate of linear_2 is 0.01.
-            linear_1 = paddle.nn.Linear(10, 10)
-            linear_2 = paddle.nn.Linear(10, 10)
-            inp = paddle.uniform(shape=[10, 10], min=-0.1, max=0.1)
-            out = linear_1(inp)
-            out = linear_2(out)
-            loss = paddle.mean(out)
-            adam = paddle.optimizer.Adam(
-                learning_rate=0.1,
-                parameters=[{
-                    'params': linear_1.parameters()
-                }, {
-                    'params': linear_2.parameters(),
-                    'weight_decay': 0.001,
-                    'learning_rate': 0.1,
-                    'beta1': 0.8
-                }],
-                weight_decay=0.01,
-                beta1=0.9)
-            loss.backward()
-            adam.step()
-            adam.clear_grad()
+            >>> # Adam with beta1/beta2 as Tensor and weight_decay as float
+            >>> import paddle
+
+            >>> linear = paddle.nn.Linear(10, 10)
+            >>> inp = paddle.rand([10,10], dtype="float32")
+            >>> out = linear(inp)
+            >>> loss = paddle.mean(out)
+            >>> beta1 = paddle.to_tensor([0.9], dtype="float32")
+            >>> beta2 = paddle.to_tensor([0.99], dtype="float32")
+            >>> adam = paddle.optimizer.Adam(learning_rate=0.1,
+            ...                              parameters=linear.parameters(),
+            ...                              beta1=beta1,
+            ...                              beta2=beta2,
+            ...                              weight_decay=0.01)
+            >>> loss.backward()
+            >>> adam.step()
+            >>> adam.clear_grad()
+
+            >>> # Note that the learning_rate of linear_2 is 0.01.
+            >>> linear_1 = paddle.nn.Linear(10, 10)
+            >>> linear_2 = paddle.nn.Linear(10, 10)
+            >>> inp = paddle.uniform(shape=[10, 10], min=-0.1, max=0.1)
+            >>> out = linear_1(inp)
+            >>> out = linear_2(out)
+            >>> loss = paddle.mean(out)
+            >>> adam = paddle.optimizer.Adam(
+            ...     learning_rate=0.1,
+            ...     parameters=[{
+            ...         'params': linear_1.parameters()
+            ...     }, {
+            ...         'params': linear_2.parameters(),
+            ...         'weight_decay': 0.001,
+            ...         'learning_rate': 0.1,
+            ...         'beta1': 0.8
+            ...     }],
+            ...     weight_decay=0.01,
+            ...     beta1=0.9)
+            >>> loss.backward()
+            >>> adam.step()
+            >>> adam.clear_grad()

     """
     _moment1_acc_str = "moment1"
@@ -409,17 +407,17 @@ class Adam(Optimizer):
         Examples:
             .. code-block:: python

-                import paddle
-
-                a = paddle.rand([2,13], dtype="float32")
-                linear = paddle.nn.Linear(13, 5)
-                # This can be any optimizer supported by dygraph.
-                adam = paddle.optimizer.Adam(learning_rate = 0.01,
-                                            parameters = linear.parameters())
-                out = linear(a)
-                out.backward()
-                adam.step()
-                adam.clear_grad()
+                >>> import paddle
+
+                >>> a = paddle.rand([2,13], dtype="float32")
+                >>> linear = paddle.nn.Linear(13, 5)
+                >>> # This can be any optimizer supported by dygraph.
+                >>> adam = paddle.optimizer.Adam(learning_rate = 0.01,
+                ...                              parameters = linear.parameters())
+                >>> out = linear(a)
+                >>> out.backward()
+                >>> adam.step()
+                >>> adam.clear_grad()
         """
         if paddle.fluid.dygraph.base.in_declarative_mode():
             self._declarative_step()
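
Because the converted examples now use standard `>>>` / `...` doctest prompts, they can be exercised by a doctest-style runner. The sketch below is only an illustration of a local smoke test, assuming a working `paddle` installation; Paddle's CI sample-code checker may use a different runner (for example xdoctest), so this is not the project's official tooling.

```python
# Minimal sketch: execute the ">>>"-style docstring examples in one of the
# touched modules with the standard library's doctest module.
# Assumption: paddle is installed and importable; this is not Paddle's own
# CI checker, just a quick local sanity check of the converted examples.
import doctest

import paddle.optimizer.adam as adam_module

# Collect every ">>>" example in the module's docstrings and run it.
results = doctest.testmod(adam_module, verbose=False)
print(f"attempted: {results.attempted}, failed: {results.failed}")
```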