Unverified commit fe0dc40d
Authored by 骑马小猫 on Jan 03, 2023, committed via GitHub on Jan 03, 2023
[FluidAPI]remove clip api (#48946)
Parent: 822ea0f9

Showing 43 changed files with 1,174 additions and 1,279 deletions (+1174 -1279).
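To make the per-file diffs below easier to follow, here is a minimal before/after sketch of the migration this commit applies (dygraph mode; tensor shapes and optimizer settings are illustrative only, not taken from the changed files):

import paddle

# Before this commit (fluid API, removed here):
#   clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0)
#   y = fluid.layers.clip(x, min=-1.0, max=1.0)

# After this commit:
x = paddle.uniform([10, 10], min=-1.0, max=1.0, dtype='float32')
y = paddle.clip(x, min=-1.0, max=1.0)                # replaces fluid.layers.clip

linear = paddle.nn.Linear(10, 10)
loss = paddle.mean(linear(x))
loss.backward()

# replaces fluid.clip.GradientClipByGlobalNorm
clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)
sgd = paddle.optimizer.SGD(
    learning_rate=0.1, parameters=linear.parameters(), grad_clip=clip
)
sgd.step()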
python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py (+3 -3)
python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py (+3 -4)
python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_optimizer_stage2.py (+1 -1)
python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage3.py (+1 -1)
python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py (+3 -2)
python/paddle/distributed/fleet/metrics/metric.py (+1 -1)
python/paddle/fluid/__init__.py (+0 -2)
python/paddle/fluid/clip.py (+0 -944)
python/paddle/fluid/incubate/fleet/utils/fleet_util.py (+3 -3)
python/paddle/fluid/layers/nn.py (+0 -200)
python/paddle/fluid/optimizer.py (+4 -11)
python/paddle/fluid/tests/test_error_clip.py (+2 -2)
python/paddle/fluid/tests/unittests/collective/fleet/pipeline_mnist.py (+1 -1)
python/paddle/fluid/tests/unittests/collective/fleet/pipeline_mnist_multi_device.py (+1 -1)
python/paddle/fluid/tests/unittests/collective/fleet/test_dgc_optimizer.py (+4 -4)
python/paddle/fluid/tests/unittests/collective/fleet/test_fleet_hybrid_meta_optimizer.py (+4 -4)
python/paddle/fluid/tests/unittests/collective/fleet/test_fleet_sharding_meta_optimizer.py (+3 -3)
python/paddle/fluid/tests/unittests/distributed_fused_lamb_test_base.py (+1 -1)
python/paddle/fluid/tests/unittests/dygraph_to_static/test_len.py (+3 -2)
python/paddle/fluid/tests/unittests/dygraph_to_static/test_seq2seq.py (+2 -2)
python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_elt_act_fuse_pass.py (+3 -3)
python/paddle/fluid/tests/unittests/ir/inference/test_trt_activation_pass.py (+1 -1)
python/paddle/fluid/tests/unittests/npu/test_clip_op_npu.py (+2 -2)
python/paddle/fluid/tests/unittests/test_adam_op.py (+1 -1)
python/paddle/fluid/tests/unittests/test_clip_by_norm_op.py (+2 -1)
python/paddle/fluid/tests/unittests/test_clip_op.py (+1 -7)
python/paddle/fluid/tests/unittests/test_dist_transpiler.py (+1 -1)
python/paddle/fluid/tests/unittests/test_eager_deletion_padding_rnn.py (+2 -2)
python/paddle/fluid/tests/unittests/test_fleet_executor.py (+1 -1)
python/paddle/fluid/tests/unittests/test_fleet_executor_origin_scheduler.py (+1 -1)
python/paddle/fluid/tests/unittests/test_fleet_executor_with_task_nodes.py (+1 -1)
python/paddle/fluid/tests/unittests/test_get_tensor_from_selected_rows_op.py (+3 -2)
python/paddle/fluid/tests/unittests/test_grad_clip_minimize.py (+4 -10)
python/paddle/fluid/tests/unittests/test_gradient_clip.py (+21 -25)
python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py (+2 -2)
python/paddle/fluid/tests/unittests/test_imperative_selected_rows.py (+2 -2)
python/paddle/fluid/tests/unittests/xpu/test_clip_op_xpu.py (+2 -2)
python/paddle/hapi/model.py (+1 -1)
python/paddle/incubate/distributed/models/moe/grad_clip.py (+5 -4)
python/paddle/incubate/optimizer/distributed_fused_lamb.py (+1 -1)
python/paddle/nn/clip.py (+1069 -4)
python/paddle/optimizer/adamw.py (+1 -1)
python/paddle/optimizer/optimizer.py (+7 -12)
python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py
View file @ fe0dc40d

@@ -20,11 +20,11 @@ __all__ = []
 import paddle
 from paddle.common_ops_import import LayerHelper
-from paddle.fluid.clip import GradientClipByNorm, append_gradient_clip_ops
 from paddle.fluid.dygraph import base as imperative_base
 from paddle.fluid.framework import in_dygraph_mode
 from paddle.fluid.optimizer import Momentum, Optimizer
 from paddle.framework import core
+from paddle.nn.clip import ClipGradByNorm, append_gradient_clip_ops
 from paddle.static import create_global_var
@@ -76,9 +76,9 @@ class DGCMomentumOptimizer(Optimizer):
         self._dgc_clip_norm = None
         if grad_clip is not None:
-            if not isinstance(grad_clip, GradientClipByNorm):
+            if not isinstance(grad_clip, ClipGradByNorm):
                 raise TypeError(
-                    "The type of grad_clip should be 'GradientClipByNorm', because DGCMomentumOptimizer only support GradientClipByNorm"
+                    "The type of grad_clip should be 'ClipGradByNorm', because DGCMomentumOptimizer only support ClipGradByNorm"
                 )
             assert isinstance(num_trainers, int), (
                 "The type of num_trainers should be 'int', but received %s"
python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py
View file @ fe0dc40d

@@ -15,9 +15,8 @@
 import paddle
 from paddle import framework
 from paddle.autograd import no_grad
-from paddle.fluid import layers
-from paddle.fluid.clip import ClipGradByGlobalNorm
 from paddle.framework import core
+from paddle.nn import ClipGradByGlobalNorm, clip
 from ...base.topology import ParallelMode
 from ...utils.hybrid_parallel_util import (
@@ -62,8 +61,8 @@ class HybridParallelClipGrad:
                 continue
             merge_grad = g
             if g.type == core.VarDesc.VarType.SELECTED_ROWS:
-                merge_grad = layers.merge_selected_rows(g)
-                merge_grad = layers.get_tensor_from_selected_rows(merge_grad)
+                merge_grad = clip.merge_selected_rows(g)
+                merge_grad = clip.get_tensor_from_selected_rows(merge_grad)
             square = paddle.square(merge_grad)
             sum_square = paddle.sum(square)
python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_optimizer_stage2.py
View file @ fe0dc40d

@@ -30,7 +30,7 @@ import paddle
 import paddle.distributed as dist
 from paddle.distributed import ParallelMode, fleet
 from paddle.fluid import core
-from paddle.fluid.clip import ClipGradByGlobalNorm
+from paddle.nn import ClipGradByGlobalNorm
 from paddle.optimizer import Optimizer

 HybridParallelClipGrad = (
python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage3.py
View file @ fe0dc40d

@@ -25,8 +25,8 @@ import paddle.fluid.framework as framework
 from paddle import nn
 from paddle.autograd import PyLayer
 from paddle.distributed import collective
-from paddle.fluid.clip import ClipGradByGlobalNorm
 from paddle.fluid.framework import EagerParamBase
+from paddle.nn import ClipGradByGlobalNorm

 from .group_sharded_storage import GradStorage
 from .group_sharded_utils import GroupShardedClipGrad, Type, device_guard
python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py
View file @ fe0dc40d

@@ -23,6 +23,7 @@ from paddle import _legacy_C_ops
 from paddle.fluid import core, layers
 from paddle.fluid.dygraph import to_variable
 from paddle.fluid.framework import dygraph_only
+from paddle.nn import clip


 class Taskflow:
@@ -65,8 +66,8 @@ class GroupShardedClipGrad:
             merge_grad = g
             if g.type == core.VarDesc.VarType.SELECTED_ROWS:
-                merge_grad = layers.get_tensor_from_selected_rows(
-                    layers.merge_selected_rows(g)
+                merge_grad = clip.get_tensor_from_selected_rows(
+                    clip.merge_selected_rows(g)
                 )
             square = paddle.square(merge_grad)
             sum_square = paddle.sum(square)
python/paddle/distributed/fleet/metrics/metric.py
View file @ fe0dc40d

@@ -159,7 +159,7 @@ def auc(stat_pos, stat_neg, scope=None, util=None):
           .. code-block:: python

             # in model.py
-            similarity_norm = fluid.layers.sigmoid(fluid.layers.clip(output, min=-15.0, max=15.0))
+            similarity_norm = fluid.layers.sigmoid(paddle.clip(output, min=-15.0, max=15.0))
             binary_predict = fluid.layers.concat(
                 input=[paddle.subtract(fluid.layers.ceil(similarity_norm), similarity_norm), similarity_norm], axis=1)
             self.auc, batch_auc, [batch_stat_pos, batch_stat_neg, stat_pos, stat_neg] =
python/paddle/fluid/__init__.py
View file @ fe0dc40d

@@ -90,7 +90,6 @@ from .transpiler import (
     DistributeTranspilerConfig,
 )
 from .lod_tensor import create_lod_tensor, create_random_int_lodtensor
-from . import clip
 from . import profiler
 from . import unique_name
 from . import parallel_executor
@@ -164,7 +163,6 @@ __all__ = (
     'ParamAttr',
     'WeightNormParamAttr',
     'DataFeeder',
-    'clip',
     'profiler',
     'unique_name',
     'Scope',
python/paddle/fluid/clip.py
Deleted (file mode 100644 → 0), view file @ 822ea0f9

The whole module (944 lines) is removed; throughout the rest of this commit the same names are imported from paddle.nn.clip instead. In outline, the deleted file carried the standard Apache-2.0 license header (2018 PaddlePaddle Authors) followed by:

    import copy
    import warnings
    import functools

    import paddle
    from . import layers
    from . import framework
    from . import core
    from . import name_scope
    from .dygraph import base as imperative_base
    from .data_feeder import check_variable_and_dtype
    from .framework import in_dygraph_mode
    from .layer_helper import LayerHelper
    from .framework import default_main_program
    from paddle import _C_ops, _legacy_C_ops

    __all__ = [
        'set_gradient_clip',
        'ErrorClipByValue',
        'ClipGradByValue',
        'ClipGradByNorm',
        'ClipGradByGlobalNorm',
    ]

    _clip_by_global_norm_using_mp_type(*args)   # getter/setter for a module-level flag
    _cast_to_mp_type_if_enabled(x)              # casts FP16/BF16 tensors to FP32 when the flag is set
    _squared_l2_norm(x)                         # squared L2 norm, via paddle ops or the squared_l2_norm op

    class BaseErrorClipAttr                     # abstract error-clip attribute
    class ErrorClipByValue(BaseErrorClipAttr)   # clips tensor values to [min, max] in place
    error_clip_callback(block, context)         # appends error-clip ops for gradient variables

    class ClipGradBase                          # base class of the gradient-clip strategies
    class ClipGradByValue(ClipGradBase)         # clips each gradient to [min, max]
    class ClipGradByNorm(ClipGradBase)          # rescales a gradient whose L2 norm exceeds clip_norm
    _allow_pure_fp16_global_norm_clip(*args)    # getter/setter for a module-level flag
    class ClipGradByGlobalNorm(ClipGradBase)    # rescales all gradients by their global norm
    set_gradient_clip(clip, param_list=None, program=None)   # deprecated static-graph helper
    append_gradient_clip_ops(param_grads)
    _correct_clip_op_role_var(params_grads, param_new_grad_name_dict)

    # Backward-compatible aliases
    GradientClipBase = ClipGradBase
    GradientClipByValue = ClipGradByValue
    GradientClipByNorm = ClipGradByNorm
    GradientClipByGlobalNorm = ClipGradByGlobalNorm
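For reference, the clipping rules documented on the removed ClipGradByNorm and ClipGradByGlobalNorm classes (whose docstrings are condensed above) read, rewritten as plain LaTeX:

% ClipGradByNorm: rescale a single gradient X whose L2 norm exceeds clip_norm
Out =
\left\{
\begin{array}{ll}
X & \text{if } \lVert X \rVert_2 \le clip\_norm \\
\dfrac{clip\_norm \cdot X}{\lVert X \rVert_2} & \text{if } \lVert X \rVert_2 > clip\_norm
\end{array}
\right.
\qquad
\lVert X \rVert_2 = \Big( \sum_{i=1}^{n} |x_i|^2 \Big)^{1/2}

% ClipGradByGlobalNorm: rescale every gradient in t_list by the global norm of the whole list
t\_list[i] \leftarrow t\_list[i] \cdot \frac{clip\_norm}{\max(global\_norm,\ clip\_norm)},
\qquad
global\_norm = \sqrt{\sum_{i=0}^{N-1} \lVert t\_list[i] \rVert_2^{\,2}}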
python/paddle/fluid/incubate/fleet/utils/fleet_util.py
View file @ fe0dc40d

@@ -185,7 +185,7 @@ class FleetUtil:
               # below is part of model
               emb = my_slot_net(slots, label) # emb can be fc layer of size 1
-              similarity_norm = fluid.layers.sigmoid(fluid.layers.clip(\
+              similarity_norm = fluid.layers.sigmoid(paddle.clip(\
                   emb, min=-15.0, max=15.0), name="similarity_norm")\
               binary_predict = fluid.layers.concat(input=[\
                   paddle.subtract(\
@@ -1374,7 +1374,7 @@ class FleetUtil:
               label = fluid.layers.data(name="click", shape=[-1, 1],\
                   dtype="int64", lod_level=0, append_batch_size=False)
               emb = my_slot_net(slots, label) # emb can be fc layer of size 1
-              similarity_norm = fluid.layers.sigmoid(fluid.layers.clip(\
+              similarity_norm = fluid.layers.sigmoid(paddle.clip(\
                   emb, min=-15.0, max=15.0), name="similarity_norm")\
               binary_predict = fluid.layers.concat(input=[\
                   paddle.subtract(\
@@ -1574,7 +1574,7 @@ class FleetUtil:
               label = fluid.layers.data(name="click", shape=[-1, 1],\
                   dtype="int64", lod_level=0, append_batch_size=False)
               emb = my_slot_net(slots, label) # emb can be fc layer of size 1
-              similarity_norm = fluid.layers.sigmoid(fluid.layers.clip(\
+              similarity_norm = fluid.layers.sigmoid(paddle.clip(\
                   emb, min=-15.0, max=15.0), name="similarity_norm")\
               binary_predict = fluid.layers.concat(input=[\
                   paddle.subtract(\
python/paddle/fluid/layers/nn.py
View file @ fe0dc40d

@@ -63,10 +63,6 @@ __all__ = [
     'fc',
     'embedding',
     'autoincreased_step_counter',
-    'clip',
-    'clip_by_norm',
-    'merge_selected_rows',
-    'get_tensor_from_selected_rows',
 ]

 OP_NAMEMAPPING = {
@@ -997,199 +993,3 @@ def _logical_op(op_name, x, y, out=None, name=None, binary_op=True):
     )
     return out

The remaining deleted lines removed four @templatedoc() layer functions, each with its docstring, input checks, and LayerHelper-based op construction:

-def clip(x, min, max, name=None):                     # appended a "clip" op
-def clip_by_norm(x, max_norm, name=None):             # "clip_by_norm" op; eager path via _C_ops.clip_by_norm
-def merge_selected_rows(x, name=None):                # "merge_selected_rows" op; eager path via _C_ops.merge_selected_rows
-def get_tensor_from_selected_rows(x, name=None):      # converts a SelectedRows input into a LoDTensor
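A short sketch, grounded in the tests changed by this commit, of the replacement calls for the removed layer functions (tensor values are illustrative; the expected clip_by_norm result is taken from the removed docstring):

import paddle
from paddle.nn import clip

x = paddle.to_tensor([[2.0, 2.0], [2.0, 2.0]], dtype='float32')

y = paddle.clip(x, min=-1.0, max=1.0)        # was fluid.layers.clip(x, min, max)
z = clip.clip_by_norm(x, max_norm=1.0)       # was fluid.layers.clip_by_norm
# z == [[0.5, 0.5], [0.5, 0.5]]: the tensor is rescaled so its L2 norm equals 1.0

# clip.merge_selected_rows and clip.get_tensor_from_selected_rows replace the
# corresponding fluid.layers helpers for SelectedRows gradients.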
python/paddle/fluid/optimizer.py
View file @ fe0dc40d

@@ -38,13 +38,6 @@ from .backward import (
     _append_grad_suffix_,
     _get_no_grad_set_name,
 )
-from .clip import (
-    GradientClipBase,
-    GradientClipByNorm,
-    error_clip_callback,
-    append_gradient_clip_ops,
-    ClipGradByGlobalNorm,
-)
 from .framework import program_guard
 from .initializer import Constant
 from .layer_helper import LayerHelper
@@ -160,7 +153,7 @@ class Optimizer:
             )
         if grad_clip is not None:
-            if not isinstance(grad_clip, GradientClipBase):
+            if not isinstance(grad_clip, paddle.nn.clip.GradientClipBase):
                 raise TypeError(
                     "'grad_clip' should be an instance of GradientClipBase's derived class"
                 )
@@ -1030,7 +1023,7 @@ class Optimizer:
                 params_grads.append((param, grad_var))
         else:
             if callbacks is None:
-                callbacks = [error_clip_callback]
+                callbacks = [paddle.nn.clip.error_clip_callback]
             else:
                 assert isinstance(callbacks, list)
             program = loss.block.program
@@ -1260,7 +1253,7 @@ class Optimizer:
         # NOTE(zhiqiu): currently, only support ClipGradByGlobalNorm and without regularization.
         if self._flatten_param_grads and self.regularization is None:
             if self._grad_clip is None or isinstance(
-                self._grad_clip, ClipGradByGlobalNorm
+                self._grad_clip, paddle.nn.ClipGradByGlobalNorm
             ):
                 params_grads = self.flatten_param_grads(params_grads)
@@ -1268,7 +1261,7 @@ class Optimizer:
         if self._grad_clip is not None:
             params_grads = self._grad_clip(params_grads)
         else:
-            params_grads = append_gradient_clip_ops(params_grads)
+            params_grads = paddle.nn.clip.append_gradient_clip_ops(params_grads)

         # Add regularization if any
         params_grads = self.append_regularization_ops(
python/paddle/fluid/tests/test_error_clip.py
View file @ fe0dc40d

@@ -38,13 +38,13 @@ with fluid.program_guard(main_program=prog):
 prog_clip = prog.clone()
 prog_clip.block(0).var(hidden1.name)._set_error_clip(
-    fluid.clip.ErrorClipByValue(max=CLIP_MAX, min=CLIP_MIN)
+    paddle.nn.clip.ErrorClipByValue(max=CLIP_MAX, min=CLIP_MIN)
 )

 avg_cost_clip = prog_clip.block(0).var(avg_cost.name)
 fluid.backward.append_backward(loss=avg_cost)
 fluid.backward.append_backward(
-    loss=avg_cost_clip, callbacks=[fluid.clip.error_clip_callback]
+    loss=avg_cost_clip, callbacks=[paddle.nn.clip.error_clip_callback]
 )

 hidden1_grad = prog.block(0).var(hidden1.name + "@GRAD")
python/paddle/fluid/tests/unittests/collective/fleet/pipeline_mnist.py
View file @ fe0dc40d

@@ -122,7 +122,7 @@ class TestDistMnist2x2(TestDistRunnerBase):
         opt = paddle.optimizer.AdamW(
             learning_rate=lr_val,
-            grad_clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0),
+            grad_clip=paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0),
         )

         acc_steps = 2  # accumulated steps for pipeline
python/paddle/fluid/tests/unittests/collective/fleet/pipeline_mnist_multi_device.py
View file @ fe0dc40d

@@ -122,7 +122,7 @@ class TestDistMnist2x2(TestDistRunnerBase):
         opt = fluid.optimizer.Momentum(
             learning_rate=lr_val,
             momentum=0.9,
-            grad_clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0),
+            grad_clip=paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0),
         )

         acc_steps = 2  # accumulated steps for pipeline
python/paddle/fluid/tests/unittests/collective/fleet/test_dgc_optimizer.py
View file @ fe0dc40d

@@ -15,10 +15,10 @@
 import unittest

 import paddle
-import paddle.fluid.clip as clip
 import paddle.fluid.framework as framework
 import paddle.fluid.optimizer as optimizer
 import paddle.fluid.regularizer as regularizer
+import paddle.nn.clip as clip

 paddle.enable_static()
@@ -76,7 +76,7 @@ class TestDGCMomentumOptimizer(unittest.TestCase):
             rampup_begin_step=0,
             num_trainers=2,
             regularization=regularization,
-            grad_clip=clip.GradientClipByNorm(1.0),
+            grad_clip=clip.ClipGradByNorm(1.0),
         )
         if use_recompute:
@@ -144,14 +144,14 @@ class TestDGCMomentumOptimizer(unittest.TestCase):
         print("dgc regular_coeff=" + str(coeff))

     def test_tpyeError(self):
-        # the type of DGCMomentumOptimizer(grad_clip=) must be 'GradientClipByNorm'
+        # the type of DGCMomentumOptimizer(grad_clip=) must be 'ClipGradByNorm'
         with self.assertRaises(TypeError):
             dgc_momentum_optimizer = self.MockDGCMomentum(
                 learning_rate=0.01,
                 momentum=0.2,
                 rampup_begin_step=0,
                 num_trainers=2,
-                grad_clip=clip.GradientClipByGlobalNorm(1.0),
+                grad_clip=clip.ClipGradByGlobalNorm(1.0),
             )

     def test_momentum_without_dgc(self):
python/paddle/fluid/tests/unittests/collective/fleet/test_fleet_hybrid_meta_optimizer.py
View file @ fe0dc40d

@@ -354,7 +354,7 @@ class TestFleetHybridOptimizer(TestFleetMetaOptimizer):
         }
         strategy.fuse_all_reduce_ops = True
         strategy.fuse_grad_size_in_MB = 32
-        clip = paddle.fluid.clip.GradientClipByGlobalNorm(1.0)
+        clip = paddle.nn.ClipGradByGlobalNorm(1.0)
         self.optimizer(
             avg_cost, strategy, train_prog, startup_prog, grad_clip=clip
@@ -552,7 +552,7 @@ class TestFleetHybridOptimizer(TestFleetMetaOptimizer):
         strategy.fuse_all_reduce_ops = True
         strategy.fuse_grad_size_in_MB = 32
         strategy.fuse_grad_merge = True
-        clip = paddle.fluid.clip.GradientClipByGlobalNorm(1.0)
+        clip = paddle.nn.ClipGradByGlobalNorm(1.0)
         self.optimizer(
             avg_cost, strategy, train_prog, startup_prog, grad_clip=clip
@@ -940,7 +940,7 @@ class TestFleetHybridOptimizerBoundary(TestFleetMetaOptimizer):
         }
         strategy.fuse_all_reduce_ops = True
         strategy.fuse_grad_size_in_MB = 32
-        clip = paddle.fluid.clip.GradientClipByGlobalNorm(1.0)
+        clip = paddle.nn.ClipGradByGlobalNorm(1.0)
         self.optimizer(
             avg_cost, strategy, train_prog, startup_prog, grad_clip=clip
@@ -1044,7 +1044,7 @@ class TestFleetHybridOptimizerBoundary(TestFleetMetaOptimizer):
         }
         strategy.fuse_all_reduce_ops = True
         strategy.fuse_grad_size_in_MB = 32
-        clip = paddle.fluid.clip.GradientClipByGlobalNorm(1.0)
+        clip = paddle.nn.ClipGradByGlobalNorm(1.0)
         self.optimizer(
             avg_cost, strategy, train_prog, startup_prog, grad_clip=clip
python/paddle/fluid/tests/unittests/collective/fleet/test_fleet_sharding_meta_optimizer.py
View file @ fe0dc40d

@@ -640,7 +640,7 @@ class TestFleetShardingMetaOptimizer(TestFleetMetaOptimizer):
         )
         avg_cost, strategy = self.net(train_prog, startup_prog)
         self.set_strategy(strategy, 'sharding')
-        clip = paddle.fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0)
+        clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)
         self.optimizer(
             avg_cost, strategy, train_prog, startup_prog, grad_clip=clip
         )
@@ -1309,7 +1309,7 @@ class TestFleetShardingHybridOptimizer(TestFleetMetaOptimizer):
             "micro_batch_size": 2,
             "accumulate_steps": 4,
         }
-        clip = paddle.fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0)
+        clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)
         self.optimizer(
             avg_cost, strategy, train_prog, startup_prog, grad_clip=clip
         )
@@ -1547,7 +1547,7 @@ class TestFleetShardingHybridOptimizer(TestFleetMetaOptimizer):
             "micro_batch_size": 2,
             "accumulate_steps": 4,
         }
-        clip = paddle.fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0)
+        clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)
         self.optimizer(
             avg_cost,
             strategy,
python/paddle/fluid/tests/unittests/distributed_fused_lamb_test_base.py
View file @ fe0dc40d

@@ -22,8 +22,8 @@ import paddle
 import paddle.distributed.fleet as fleet
 import paddle.fluid.core as core
 from paddle.distributed.fleet.meta_optimizers.common import CollectiveHelper
-from paddle.fluid.clip import ClipGradBase, _clip_by_global_norm_using_mp_type
 from paddle.incubate import DistributedFusedLamb
+from paddle.nn.clip import ClipGradBase, _clip_by_global_norm_using_mp_type
 from paddle.vision.models import resnet18 as resnet
python/paddle/fluid/tests/unittests/dygraph_to_static/test_len.py
View file @ fe0dc40d

@@ -19,6 +19,7 @@ import numpy as np
 import paddle
 import paddle.fluid as fluid
 from paddle.jit.dy2static import Call
+from paddle.nn import clip

 SEED = 2020
 np.random.seed(SEED)
@@ -89,11 +90,11 @@ def len_with_selected_rows(place):
         type=fluid.core.VarDesc.VarType.SELECTED_ROWS,
     )
     # y is Variable(SelectedRows)
-    y = fluid.layers.merge_selected_rows(var)
+    y = clip.merge_selected_rows(var)
     y_len = Call(len)(y)

     # z is inner tensor with shape [4, 2]
-    z = fluid.layers.get_tensor_from_selected_rows(y)
+    z = clip.get_tensor_from_selected_rows(y)
     z_len = Call(len)(z)

     # set data for selected_rows
python/paddle/fluid/tests/unittests/dygraph_to_static/test_seq2seq.py
View file @ fe0dc40d

@@ -22,8 +22,8 @@ from seq2seq_dygraph_model import AttentionModel, BaseModel
 from seq2seq_utils import Seq2SeqModelHyperParams, get_data_iter

 import paddle.fluid as fluid
-from paddle.fluid.clip import GradientClipByGlobalNorm
 from paddle.jit import ProgramTranslator
+from paddle.nn import ClipGradByGlobalNorm

 place = (
     fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda() else fluid.CPUPlace()
@@ -71,7 +71,7 @@ def train(args, attn_model=False):
         dropout=args.dropout,
     )

-    gloabl_norm_clip = GradientClipByGlobalNorm(args.max_grad_norm)
+    gloabl_norm_clip = ClipGradByGlobalNorm(args.max_grad_norm)
     optimizer = fluid.optimizer.SGD(
         args.learning_rate,
         parameter_list=model.parameters(),
python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_elt_act_fuse_pass.py
View file @ fe0dc40d

@@ -127,7 +127,7 @@ class ElementwiseActivationMkldnnFusePassTest_Add_Clip(
 ):
     def set_params(self):
         self.operand = paddle.add
-        self.act = fluid.layers.clip
+        self.act = paddle.clip
         self.act_alpha = 0.0
         self.act_beta = 10.0
@@ -219,7 +219,7 @@ class ElementwiseActivationMkldnnFusePassTest_Sub_Clip(
 ):
     def set_params(self):
         self.operand = paddle.subtract
-        self.act = fluid.layers.clip
+        self.act = paddle.clip
         self.act_alpha = 0.0
         self.act_beta = 10.0
@@ -319,7 +319,7 @@ class ElementwiseActivationMkldnnFusePassTest_Mul_Clip(
 ):
     def set_params(self):
         self.operand = paddle.multiply
-        self.act = fluid.layers.clip
+        self.act = paddle.clip
         self.act_alpha = 0.0
         self.act_beta = 10.0
python/paddle/fluid/tests/unittests/ir/inference/test_trt_activation_pass.py
View file @ fe0dc40d

@@ -106,7 +106,7 @@ class TensorRTSubgraphPassHardSwishPluginTest(

 class TensorRTSubgraphPassClipTest(TensorRTSubgraphPassActivationTest):
     def append_act(self, x):
-        return fluid.layers.clip(x, 0, 1)
+        return paddle.clip(x, 0, 1)


 class TensorRTSubgraphPassTanhTest(TensorRTSubgraphPassActivationTest):
python/paddle/fluid/tests/unittests/npu/test_clip_op_npu.py
@@ -117,13 +117,13 @@ class TestClipOpError(unittest.TestCase):
             input_data = np.random.random((2, 4)).astype("float32")

             def test_Variable():
-                fluid.layers.clip(x=input_data, min=-1.0, max=1.0)
+                paddle.clip(x=input_data, min=-1.0, max=1.0)

             self.assertRaises(TypeError, test_Variable)

             def test_dtype():
                 x2 = fluid.layers.data(name='x2', shape=[1], dtype='int32')
-                fluid.layers.clip(x=x2, min=-1.0, max=1.0)
+                paddle.clip(x=x2, min=-1.0, max=1.0)

             self.assertRaises(TypeError, test_dtype)

         paddle.disable_static()
...
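A tiny dygraph sketch of the substitution these error tests exercise: paddle.clip takes over the role of the removed fluid.layers.clip; the input values below are illustrative.

    import paddle

    x = paddle.to_tensor([[-2.0, 0.5], [3.0, -0.1]])
    y = paddle.clip(x, min=-1.0, max=1.0)   # values limited to the range [-1.0, 1.0]
    print(y.numpy())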
python/paddle/fluid/tests/unittests/test_adam_op.py
@@ -686,7 +686,7 @@ class TestAdamOpV2(unittest.TestCase):
         value = np.arange(26).reshape(2, 13).astype("float32")
         a = fluid.dygraph.to_variable(value)
         linear = paddle.nn.Linear(13, 5)
-        clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0)
+        clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)
         adam = paddle.optimizer.Adam(
             0.1, parameters=linear.parameters(), grad_clip=clip
         )
...
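For reference, a minimal self-contained sketch of the pattern this test sets up: the clip object is handed to the optimizer through grad_clip rather than going through the fluid.clip helpers. The random input below is an assumption for illustration.

    import paddle

    linear = paddle.nn.Linear(13, 5)
    clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)
    adam = paddle.optimizer.Adam(0.1, parameters=linear.parameters(), grad_clip=clip)

    out = linear(paddle.rand([2, 13]))
    out.mean().backward()
    adam.step()        # gradients are clipped to a global norm of 1.0 before the update
    adam.clear_grad()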
python/paddle/fluid/tests/unittests/test_clip_by_norm_op.py
@@ -20,12 +20,13 @@ from op_test import OpTest
 import paddle
 import paddle.fluid as fluid
 import paddle.fluid.core as core
+from paddle.nn import clip


 class TestClipByNormOp(OpTest):
     def setUp(self):
         self.max_relative_error = 0.006
-        self.python_api = fluid.layers.clip_by_norm
+        self.python_api = clip.clip_by_norm
         self.init_dtype()
         self.initTestCase()
         input = np.random.random(self.shape).astype(self.dtype)
...
python/paddle/fluid/tests/unittests/test_clip_op.py
@@ -128,15 +128,9 @@ class TestClipOpError(unittest.TestCase):
             input_data = np.random.random((2, 4)).astype("float32")

             def test_Variable():
-                fluid.layers.clip(x=input_data, min=-1.0, max=1.0)
+                paddle.clip(x=input_data, min=-1.0, max=1.0)

             self.assertRaises(TypeError, test_Variable)

-            def test_dtype():
-                x2 = fluid.layers.data(name='x2', shape=[1], dtype='int32')
-                fluid.layers.clip(x=x2, min=-1.0, max=1.0)
-
-            self.assertRaises(TypeError, test_dtype)
-
         paddle.disable_static()
...
python/paddle/fluid/tests/unittests/test_dist_transpiler.py
@@ -584,7 +584,7 @@ class TestL2Decay(TranspilerTest):
         def filter(param):
             return param.name == "fc_w"

-        clip = fluid.clip.GradientClipByValue(0.1, need_clip=filter)
+        clip = paddle.nn.ClipGradByValue(0.1, need_clip=filter)
         sgd_optimizer.minimize(avg_cost, grad_clip=clip)

     def transpiler_test_impl(self):
...
python/paddle/fluid/tests/unittests/test_eager_deletion_padding_rnn.py
@@ -504,8 +504,8 @@ class PaddingRNNTestBase(unittest.TestCase):
             self.feed_order,
         ) = res_vars

-        fluid.clip.set_gradient_clip(
-            clip=fluid.clip.GradientClipByGlobalNorm(
+        paddle.nn.clip.set_gradient_clip(
+            clip=paddle.nn.ClipGradByGlobalNorm(
                 clip_norm=config.max_grad_norm
             )
         )
...
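A static-graph sketch of the legacy path this test keeps exercising: set_gradient_clip now lives in paddle.nn.clip and must be called before minimize(). It mirrors the docstring example added in python/paddle/nn/clip.py, and assumes the fluid.data/fluid.layers.fc helpers used there are still available at this commit.

    import paddle
    import paddle.fluid as fluid

    paddle.enable_static()
    with fluid.program_guard(fluid.Program(), fluid.Program()):
        x = fluid.data(name='x', shape=[None, 28], dtype='float32')
        loss = paddle.mean(fluid.layers.fc(x, size=10))
        # register the clip strategy, then build the optimizer ops
        paddle.nn.clip.set_gradient_clip(paddle.nn.ClipGradByGlobalNorm(clip_norm=2.0))
        fluid.optimizer.SGD(learning_rate=1e-3).minimize(loss)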
python/paddle/fluid/tests/unittests/test_fleet_executor.py
@@ -64,7 +64,7 @@ class TestFleetExecutor(unittest.TestCase):
         )
         opt = paddle.optimizer.AdamW(
             learning_rate=lr_val,
-            grad_clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0),
+            grad_clip=paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0),
         )
         opt.minimize(loss)
         # TODO: section_program will be removed in the future
...
python/paddle/fluid/tests/unittests/test_fleet_executor_origin_scheduler.py
@@ -64,7 +64,7 @@ class TestFleetExecutor(unittest.TestCase):
         )
         opt = paddle.optimizer.AdamW(
             learning_rate=lr_val,
-            grad_clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0),
+            grad_clip=paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0),
         )
         opt.minimize(loss)
         # TODO: section_program will be removed in the future
...
python/paddle/fluid/tests/unittests/test_fleet_executor_with_task_nodes.py
@@ -47,7 +47,7 @@ class TestFleetExecutor(unittest.TestCase):
         )
         opt = paddle.optimizer.AdamW(
             learning_rate=lr_val,
-            grad_clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0),
+            grad_clip=paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0),
         )
         opt.minimize(loss)
         # TODO: section_program will be removed in the future
...
python/paddle/fluid/tests/unittests/test_get_tensor_from_selected_rows_op.py
@@ -20,6 +20,7 @@ import paddle.fluid as fluid
 import paddle.fluid.core as core
 from paddle.fluid import Program, program_guard
 from paddle.fluid.op import Operator
+from paddle.nn import clip


 class TestGetTensorFromSelectedRowsError(unittest.TestCase):
...
@@ -31,12 +32,12 @@ class TestGetTensorFromSelectedRowsError(unittest.TestCase):
         x_data = np.random.random((2, 4)).astype("float32")

         def test_Variable():
-            fluid.layers.get_tensor_from_selected_rows(x=x_data)
+            clip.get_tensor_from_selected_rows(x=x_data)

         self.assertRaises(TypeError, test_Variable)

         def test_SELECTED_ROWS():
-            fluid.layers.get_tensor_from_selected_rows(x=x_var)
+            clip.get_tensor_from_selected_rows(x=x_var)

         self.assertRaises(TypeError, test_SELECTED_ROWS)
...
python/paddle/fluid/tests/unittests/test_grad_clip_minimize.py
@@ -17,12 +17,8 @@ import unittest
 import numpy as np

 import paddle.fluid as fluid
-from paddle.fluid.clip import (
-    GradientClipByGlobalNorm,
-    GradientClipByNorm,
-    GradientClipByValue,
-)
 from paddle.fluid.dygraph.base import to_variable
+from paddle.nn import ClipGradByGlobalNorm, ClipGradByNorm, ClipGradByValue


 class TestGradClipByGlobalNorm(unittest.TestCase):
...
@@ -67,7 +63,7 @@ class TestGradClipByGlobalNorm(unittest.TestCase):
     def get_dygrap_global_norm_result(self):
         with fluid.dygraph.guard():
-            gloabl_norm_clip = GradientClipByGlobalNorm(self.max_global_norm)
+            gloabl_norm_clip = ClipGradByGlobalNorm(self.max_global_norm)
             p_g_var = []
             for p, g in self.para_and_grad:
                 new_p = to_variable(p)
...
@@ -142,7 +138,7 @@ class TestGradClipByNorm(unittest.TestCase):
     def get_dygrap_norm_result(self):
         with fluid.dygraph.guard():
-            norm_clip = GradientClipByNorm(self.max_norm)
+            norm_clip = ClipGradByNorm(self.max_norm)
             p_g_var = []
             for p, g in self.para_and_grad:
                 new_p = to_variable(p)
...
@@ -212,9 +208,7 @@ class TestGradClipByValue(unittest.TestCase):
     def get_dygrap_clip_result(self):
         with fluid.dygraph.guard():
-            value_clip = GradientClipByValue(
-                max=self.max_value, min=self.min_value
-            )
+            value_clip = ClipGradByValue(max=self.max_value, min=self.min_value)
             p_g_var = []
             for p, g in self.para_and_grad:
                 new_p = to_variable(p)
...
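What these tests rely on, sketched in isolation: a ClipGrad* object is directly callable on a list of (param, grad) tensor pairs and returns the clipped pairs. The random tensors below stand in for real parameters and gradients and are assumptions for illustration only.

    import numpy as np
    import paddle
    from paddle.nn import ClipGradByGlobalNorm

    params_grads = [
        (paddle.to_tensor(np.random.rand(3, 3).astype('float32')),
         paddle.to_tensor(np.random.rand(3, 3).astype('float32')))
        for _ in range(2)
    ]
    clipped = ClipGradByGlobalNorm(clip_norm=1.0)(params_grads)  # list of (p, clipped_g)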
python/paddle/fluid/tests/unittests/test_gradient_clip.py
@@ -20,7 +20,7 @@ from fake_reader import fake_imdb_reader
 import paddle
 import paddle.fluid as fluid
 import paddle.fluid.core as core
-from paddle.fluid.clip import _allow_pure_fp16_global_norm_clip
+from paddle.nn.clip import _allow_pure_fp16_global_norm_clip

 paddle.enable_static()
...
@@ -173,9 +173,9 @@ class TestGradientClipByGlobalNorm(TestGradientClip):
     # test whether the output is right when use 'set_gradient_clip'
     def test_old_gradient_clip(self):
         def func(params_grads):
-            clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=self.clip_norm)
-            fluid.clip.set_gradient_clip(clip)
-            return fluid.clip.append_gradient_clip_ops(params_grads)
+            clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=self.clip_norm)
+            paddle.nn.clip.set_gradient_clip(clip)
+            return paddle.nn.clip.append_gradient_clip_ops(params_grads)

         self.clip_gradient = func
         self.check_gradient_clip(fluid.CPUPlace())
...
@@ -183,7 +183,7 @@ class TestGradientClipByGlobalNorm(TestGradientClip):
     # test whether the output is right when use grad_clip
     def test_new_gradient_clip(self):
         def func(params_grads):
-            clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=self.clip_norm)
+            clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=self.clip_norm)
             return clip(params_grads)

         self.clip_gradient = func
...
@@ -192,7 +192,7 @@ class TestGradientClipByGlobalNorm(TestGradientClip):
     # test whether the output is right when use grad_clip under float64
     def test_new_gradient_clip_fp64(self):
         def func(params_grads):
-            clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=self.clip_norm)
+            clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=self.clip_norm)
             return clip(params_grads)

         self.clip_gradient = func
...
@@ -201,15 +201,15 @@ class TestGradientClipByGlobalNorm(TestGradientClip):
     # invoke 'set_gradient_clip' in a wrong order
     def test_wrong_API_order(self):
         def backward_func(cost):
-            clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=5.0)
-            fluid.clip.set_gradient_clip(clip)
+            clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=5.0)
+            paddle.nn.clip.set_gradient_clip(clip)
             sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.01, grad_clip=clip)
             # if 'set_gradient_clip' and 'optimize(grad_clip)' together, 'set_gradient_clip' will be ineffective
             sgd_optimizer.minimize(cost)
             # 'set_gradient_clip' must before 'minimize', otherwise, 'set_gradient_clip' will be ineffective
-            fluid.clip.set_gradient_clip(clip)
+            paddle.nn.clip.set_gradient_clip(clip)

         self.backward_and_optimize = backward_func

         for place in self.get_places():
...
@@ -269,7 +269,7 @@ class TestGradientClipByGlobalNorm(TestGradientClip):
         with fluid.program_guard(
             main_program=prog, startup_program=startup_program
         ):
-            clip = fluid.clip.GradientClipByGlobalNorm(self.clip_norm)
+            clip = paddle.nn.ClipGradByGlobalNorm(self.clip_norm)
             x = (
                 fluid.default_main_program()
                 .global_block()
...
@@ -313,7 +313,7 @@ class TestGradientClipByNorm(TestGradientClip):
     # test whether the output is right when use grad_clip
     def test_gradient_clip(self):
         def func(params_grads):
-            clip = fluid.clip.GradientClipByNorm(clip_norm=self.clip_norm)
+            clip = paddle.nn.ClipGradByNorm(clip_norm=self.clip_norm)
             return clip(params_grads)

         self.clip_gradient = func
...
@@ -321,7 +321,7 @@ class TestGradientClipByNorm(TestGradientClip):
     # if grad is None or not need clip
     def test_none_grad(self):
-        clip = fluid.clip.GradientClipByNorm(self.clip_norm)
+        clip = paddle.nn.ClipGradByNorm(self.clip_norm)
         x = (
             fluid.default_main_program()
             .global_block()
...
@@ -371,7 +371,7 @@ class TestGradientClipByValue(TestGradientClip):
     # test whether the output is right when use grad_clip
     def test_gradient_clip(self):
         def func(params_grads):
-            clip = fluid.clip.GradientClipByValue(max=self.max, min=self.min)
+            clip = paddle.nn.ClipGradByValue(max=self.max, min=self.min)
             return clip(params_grads)

         self.clip_gradient = func
...
@@ -379,7 +379,7 @@ class TestGradientClipByValue(TestGradientClip):
     # if grad is None or not need clip
     def test_none_grad(self):
-        clip = fluid.clip.GradientClipByValue(self.max, self.min)
+        clip = paddle.nn.ClipGradByValue(self.max, self.min)
         x = (
             fluid.default_main_program()
             .global_block()
...
@@ -419,7 +419,7 @@ class TestDygraphGradientClip(unittest.TestCase):
             sgd_optimizer = fluid.optimizer.SGD(
                 learning_rate=0.0,
                 parameter_list=linear.parameters(),
-                grad_clip=fluid.clip.GradientClipByGlobalNorm(0.1),
+                grad_clip=paddle.nn.ClipGradByGlobalNorm(0.1),
             )
             self.check_clip_result(loss, sgd_optimizer)
...
@@ -430,12 +430,8 @@ class TestDygraphGradientClipByGlobalNorm(TestDygraphGradientClip):
 class TestDygraphGradientClipByGlobalNorm(TestDygraphGradientClip):
     def setUp(self):
         self.clip_norm = 0.8
-        self.clip1 = fluid.clip.GradientClipByGlobalNorm(
-            clip_norm=self.clip_norm
-        )
-        self.clip2 = fluid.clip.GradientClipByGlobalNorm(
-            clip_norm=self.clip_norm
-        )
+        self.clip1 = paddle.nn.ClipGradByGlobalNorm(clip_norm=self.clip_norm)
+        self.clip2 = paddle.nn.ClipGradByGlobalNorm(clip_norm=self.clip_norm)

     def check_clip_result(self, loss, optimizer):
         # if grad is None
...
@@ -476,7 +472,7 @@ class TestDygraphGradientClipByGlobalNorm(TestDygraphGradientClip):
 class TestDygraphGradientClipByNorm(TestDygraphGradientClip):
     def setUp(self):
         self.clip_norm = 0.8
-        self.clip = fluid.clip.GradientClipByNorm(clip_norm=self.clip_norm)
+        self.clip = paddle.nn.ClipGradByNorm(clip_norm=self.clip_norm)

     def check_clip_result(self, loss, optimizer):
         # if grad is None
...
@@ -506,7 +502,7 @@ class TestDygraphGradientClipByValue(TestDygraphGradientClip):
     def setUp(self):
         self.max = 0.2
         self.min = 0.1
-        self.clip = fluid.clip.GradientClipByValue(max=self.max, min=self.min)
+        self.clip = paddle.nn.ClipGradByValue(max=self.max, min=self.min)

     def check_clip_result(self, loss, optimizer):
         # if grad is None
...
@@ -572,7 +568,7 @@ class TestDygraphGradientClipFP16(unittest.TestCase):
                     params_grads.append((param, param._grad_ivar()))
                 _, grads = zip(*params_grads)
                 # clip grads
-                clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=0.8)
+                clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=0.8)
                 params_grads = clip(params_grads)
                 _, grads_clip = zip(*params_grads)
                 # param update
...
@@ -616,7 +612,7 @@ class TestDygraphGradientClipFP64(unittest.TestCase):
                     params_grads.append((param, param._grad_ivar()))
                 _, grads = zip(*params_grads)
                 # clip grads
-                clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=0.1)
+                clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=0.1)
                 params_grads = clip(params_grads)
                 _, grads_clip = zip(*params_grads)
...
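The only symbol this file now imports from paddle.nn.clip instead of paddle.fluid.clip is the private toggle _allow_pure_fp16_global_norm_clip. Based on its definition later in this PR, a call with a bool sets the flag and returns the previous value, and a call with no arguments reads it; a small sketch:

    from paddle.nn.clip import _allow_pure_fp16_global_norm_clip

    old = _allow_pure_fp16_global_norm_clip(True)    # enable, remember old setting
    assert _allow_pure_fp16_global_norm_clip() is True
    _allow_pure_fp16_global_norm_clip(old)           # restore the previous value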
python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py
@@ -361,7 +361,7 @@ class TestImperativeAutoPrune(unittest.TestCase):
         place = fluid.CPUPlace()
         with fluid.dygraph.guard(place):
             model = MyLayer(size, vocab_size, size)
-            grad_clip = fluid.clip.GradientClipByGlobalNorm(0.001)
+            grad_clip = paddle.nn.ClipGradByGlobalNorm(0.001)
             optimizer = fluid.optimizer.AdamOptimizer(
                 0.001, parameter_list=model.parameters(), grad_clip=grad_clip
             )
...
@@ -380,7 +380,7 @@ class TestImperativeAutoPrune(unittest.TestCase):
         with fluid.dygraph.guard(place):
             model = MyLayer2(size, vocab_size, size)
-            grad_clip = fluid.clip.GradientClipByGlobalNorm(0.001)
+            grad_clip = paddle.nn.ClipGradByGlobalNorm(0.001)
             optimizer = fluid.optimizer.AdamOptimizer(
                 0.001, parameter_list=model.parameters(), grad_clip=grad_clip
             )
...
python/paddle/fluid/tests/unittests/test_imperative_selected_rows.py
@@ -52,7 +52,7 @@ class TestSimpleNet(unittest.TestCase):
                 fluid.set_flags(
                     {'FLAGS_sort_sum_gradient': sort_sum_gradient}
                 )
-                # grad_clip = fluid.clip.GradientClipByGlobalNorm(5.0)
+                # grad_clip = paddle.nn.ClipGradByGlobalNorm(5.0)

                 input_word = np.array([[1, 2], [2, 1]]).astype('int64')
                 input = paddle.to_tensor(input_word)
...
@@ -91,7 +91,7 @@ class TestSimpleNet(unittest.TestCase):
                 fluid.set_flags(
                     {'FLAGS_sort_sum_gradient': sort_sum_gradient}
                 )
-                grad_clip = fluid.clip.GradientClipByGlobalNorm(5.0)
+                grad_clip = paddle.nn.ClipGradByGlobalNorm(5.0)

                 input_word = np.array([[1, 2], [2, 1]]).astype('int64')
                 input = to_variable(input_word)
...
python/paddle/fluid/tests/unittests/xpu/test_clip_op_xpu.py
@@ -131,13 +131,13 @@ class TestClipOpError(unittest.TestCase):
             input_data = np.random.random((2, 4)).astype("float32")

             def test_Variable():
-                fluid.layers.clip(x=input_data, min=-1.0, max=1.0)
+                paddle.clip(x=input_data, min=-1.0, max=1.0)

             self.assertRaises(TypeError, test_Variable)

             def test_dtype():
                 x2 = fluid.layers.data(name='x2', shape=[1], dtype='int32')
-                fluid.layers.clip(x=x2, min=-1.0, max=1.0)
+                paddle.clip(x=x2, min=-1.0, max=1.0)

             self.assertRaises(TypeError, test_dtype)

         paddle.disable_static()
...
python/paddle/hapi/model.py
@@ -1535,7 +1535,7 @@ class Model:
             assert isinstance(
                 self._optimizer._grad_clip,
                 (paddle.nn.ClipGradByGlobalNorm, paddle.nn.ClipGradByNorm),
-            ), "Only GradientClipByNorm and GradientClipByGlobalNorm are supported in amp training with level=O2 currently."
+            ), "Only ClipGradByNorm and ClipGradByGlobalNorm are supported in amp training with level=O2 currently."

         self._adapter._amp_custom_lists = {}
         self._adapter._amp_configs = {}
...
python/paddle/incubate/distributed/models/moe/grad_clip.py
@@ -15,13 +15,14 @@
 import paddle
 import paddle.distributed as dist
 from paddle.fluid import core, layers
-from paddle.fluid.clip import ClipGradBase, _squared_l2_norm
 from paddle.fluid.dygraph import base as imperative_base
+from paddle.nn import clip
+from paddle.nn.clip import ClipGradBase, _squared_l2_norm


 class ClipGradForMOEByGlobalNorm(ClipGradBase):
     r"""
-    The Algrithm is the same as paddle.fluid.clip.ClipGradByGlobalNorm
+    The Algrithm is the same as paddle.nn.ClipGradByGlobalNorm
     Given a list of Tensor :math:`t\_list` , calculate the global norm for the elements of all tensors in
     :math:`t\_list` , and limit it to ``clip_norm`` .
...
@@ -113,8 +114,8 @@ class ClipGradForMOEByGlobalNorm(ClipGradBase):
                 continue
             merge_grad = g
             if g.type == core.VarDesc.VarType.SELECTED_ROWS:
-                merge_grad = layers.merge_selected_rows(g)
-                merge_grad = layers.get_tensor_from_selected_rows(merge_grad)
+                merge_grad = clip.merge_selected_rows(g)
+                merge_grad = clip.get_tensor_from_selected_rows(merge_grad)
             sum_square = _squared_l2_norm(merge_grad)
             if sum_square.dtype == core.VarDesc.VarType.FP16:
                 sum_square_list_fp16.append(sum_square)
...
python/paddle/incubate/optimizer/distributed_fused_lamb.py
@@ -16,11 +16,11 @@ import os

 import paddle
 from paddle.fluid import core, framework, unique_name
-from paddle.fluid.clip import ClipGradByGlobalNorm
 from paddle.fluid.executor import global_scope
 from paddle.fluid.framework import Variable, name_scope
 from paddle.fluid.layer_helper import LayerHelper
 from paddle.fluid.optimizer import Optimizer
+from paddle.nn import ClipGradByGlobalNorm


 def init_communicator(block, rank, ranks, ring_id):
...
python/paddle/nn/clip.py
@@ -12,9 +12,1074 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

 # TODO: define the functions to clip gradient of parameter
-from ..fluid.clip import ClipGradByGlobalNorm  # noqa: F401
-from ..fluid.clip import ClipGradByNorm  # noqa: F401
-from ..fluid.clip import ClipGradByValue  # noqa: F401

import copy
import warnings

import paddle
import paddle.autograd as imperative_base
from paddle import _C_ops, _legacy_C_ops
from paddle.common_ops_import import Variable, check_type, default_main_program
from paddle.fluid import core, framework, layers, unique_name
from paddle.fluid.data_feeder import check_variable_and_dtype
from paddle.framework import LayerHelper, _non_static_mode, in_dygraph_mode
from paddle.tensor.layer_function_generator import templatedoc

__all__ = []


@templatedoc()
def clip_by_norm(x, max_norm, name=None):
    """
    ${comment}

    Args:
        x(${x_type}): ${x_comment}
        max_norm(${max_norm_type}): ${max_norm_comment}
        name(str, optional): For detailed information, please refer
            to :ref:`api_guide_Name`. Usually name is no need to set and
            None by default.

    Returns:
        Tensor:

        out(${out_type}): ${out_comment}

    Examples:
        .. code-block:: python

            import paddle
            from paddle.nn import clip

            input = paddle.to_tensor([[2.0, 2.0], [2.0, 2.0]], dtype='float32')
            reward = clip.clip_by_norm(x=input, max_norm=1.0)
            # [[0.5, 0.5], [0.5, 0.5]]
    """

    if in_dygraph_mode():
        return _C_ops.clip_by_norm(x, max_norm)
    if _non_static_mode():
        return _legacy_C_ops.clip_by_norm(x, 'max_norm', max_norm)

    helper = LayerHelper("clip_by_norm", **locals())
    check_variable_and_dtype(x, 'X', ['float32', 'float16'], 'clip_by_norm')
    check_type(max_norm, 'max_norm', (float), 'clip_by_norm')

    if name is None:
        name = unique_name.generate_with_ignorable_key(
            ".".join([helper.name, 'tmp'])
        )

    out = helper.create_variable(
        type=x.type, name=name, dtype=x.dtype, persistable=False
    )

    helper.append_op(
        type="clip_by_norm",
        inputs={"X": x},
        attrs={"max_norm": max_norm},
        outputs={"Out": out},
    )

    return out
@templatedoc()
def merge_selected_rows(x, name=None):
    """
    ${comment}

    Args:
        x(${x_type}): ${x_comment}
        name(basestring|None): Name of the output.

    Returns:
        out(${out_type}): ${out_comment}

    Examples:
        .. code-block:: python

            import paddle.fluid as fluid
            b = fluid.default_main_program().global_block()
            var = b.create_var(
                name="X", dtype="float32", persistable=True,
                type=fluid.core.VarDesc.VarType.SELECTED_ROWS)
            y = nn.merge_selected_rows(var)
    """
    if in_dygraph_mode():
        return _C_ops.merge_selected_rows(x)

    if _non_static_mode():
        return _legacy_C_ops.merge_selected_rows(x)

    helper = LayerHelper("merge_selected_rows", **locals())
    out = helper.create_variable_for_type_inference(dtype=x.dtype)
    helper.append_op(
        type="merge_selected_rows",
        inputs={"X": x},
        attrs={},
        outputs={"Out": out},
    )
    return out


@templatedoc()
def get_tensor_from_selected_rows(x, name=None):
    """
    Get tensor data from input with SelectedRows type, and outputs a Tensor.

    .. code-block:: text

        input x is SelectedRows:
           x.rows = [0, 5, 5, 4, 19]
           x.height = 20
           x.value = [[1, 1] [2, 2] [2, 2] [3, 3] [6, 6]]

        Output is LoDTensor:
           out.shape = [5, 2]
           out.data = [[1, 1],
                       [2, 2],
                       [2, 2],
                       [3, 3],
                       [6, 6]]

    Args:
        x(SelectedRows): Input with SelectedRows type. The data type is float32, float64, int32 or int64.
        name(str, optional): The default value is None. Normally there is no need for user to set this property.
            For more information, please refer to :ref:`api_guide_Name` .

    Returns:
        Variable: LoDTensor transformed from SelectedRows. The data type is same with input.

    Examples:
        .. code-block:: python

            from paddle import nnp.py
            b = fluid.default_main_program().global_block()
            input = b.create_var(name="X", dtype="float32", persistable=True, type=fluid.core.VarDesc.VarType.SELECTED_ROWS)
            out = nn.get_tensor_from_selected_rows(input)
    """

    check_type(x, 'x', Variable, 'get_tensor_from_selected_rows')
    if x.type != core.VarDesc.VarType.SELECTED_ROWS:
        raise TypeError(
            "The type of 'x' in get_tensor_from_selected_rows must be SELECTED_ROWS."
        )
    helper = LayerHelper('get_tensor_from_selected_rows', **locals())
    out = helper.create_variable_for_type_inference(dtype=x.dtype)
    helper.append_op(
        type='get_tensor_from_selected_rows',
        inputs={'X': x},
        outputs={'Out': out},
        attrs={},
    )
    return out
_clip_by_global_norm_using_mp_type_flag = False


def _clip_by_global_norm_using_mp_type(*args):
    global _clip_by_global_norm_using_mp_type_flag
    assert len(args) <= 1
    if len(args) == 1:
        assert isinstance(args[0], bool)
        old_value = _clip_by_global_norm_using_mp_type_flag
        _clip_by_global_norm_using_mp_type_flag = args[0]
        return old_value
    else:
        return _clip_by_global_norm_using_mp_type_flag


def _cast_to_mp_type_if_enabled(x):
    if (
        x.dtype == core.VarDesc.VarType.FP16
        or x.dtype == core.VarDesc.VarType.BF16
    ) and _clip_by_global_norm_using_mp_type():
        return x.astype(core.VarDesc.VarType.FP32)
    else:
        return x


def _squared_l2_norm(x):
    r"""
    Return the squared L2 norm of a tensor.
    """

    x = _cast_to_mp_type_if_enabled(x)
    if (
        core.is_compiled_with_xpu()
        or x.dtype == core.VarDesc.VarType.FP16
        or x.dtype == core.VarDesc.VarType.BF16
    ):
        square = paddle.square(x)
        sum_square = paddle.sum(square)
        return sum_square

    if in_dygraph_mode():
        return _C_ops.squared_l2_norm(x)

    op_type = 'squared_l2_norm'
    check_variable_and_dtype(x, 'x', ['float32', 'float64'], op_type)
    helper = LayerHelper(op_type, **locals())
    out = helper.create_variable_for_type_inference(x.dtype)

    inputs = {"X": x}
    outputs = {'Out': out}
    helper.append_op(type=op_type, inputs=inputs, outputs=outputs)
    return out
class BaseErrorClipAttr:
    def __str__(self):
        raise NotImplementedError()

    def _append_clip_op(self, block, grad_name):
        raise NotImplementedError()


class ErrorClipByValue(BaseErrorClipAttr):
    r"""
    Clip tensor values to the range [min, max].

    Given a tensor ``t`` (see Examples below), this operation clips its value \
    to ``min`` and ``max`` inplace.

    - Any values less than min are set to min.
    - Any values greater than max are set to max.

    Args:
        max (float): The maximum value to clip by.
        min (float, optional): The minimum value to clip by. if not set by user, \
        will be set to ``-max`` by framework.

    Examples:
        .. code-block:: python

            import paddle.fluid as fluid
            import paddle
            paddle.enable_static()
            BATCH_SIZE = 128
            CLIP_MAX = 2e-6
            CLIP_MIN = -1e-6
            prog = fluid.framework.Program()
            with fluid.program_guard(main_program=prog):
                image = fluid.layers.data(
                    name='x', shape=[784], dtype='float32')
                hidden1 = fluid.layers.fc(input=image, size=128, act='relu')
                hidden2 = fluid.layers.fc(input=hidden1, size=64, act='relu')
                predict = fluid.layers.fc(
                    input=hidden2, size=10, act='softmax')
                label = fluid.layers.data(name='y', shape=[1], dtype='int64')
                cost = paddle.nn.functional.cross_entropy(input=predict, label=label)
                avg_cost = paddle.mean(cost)
            prog_clip = prog.clone()
            prog_clip.block(0).var(hidden1.name)._set_error_clip(
                paddle.nn.clip.ErrorClipByValue(
                    max=CLIP_MAX, min=CLIP_MIN)
                )
    """

    def __init__(self, max, min=None):
        max = float(max)
        if min is None:
            min = -max
        else:
            min = float(min)
        self.max = max
        self.min = min

    def __str__(self):
        return "ByValue, min=%f, max=%f" % (self.min, self.max)

    def _append_clip_op(self, block, grad_name):
        clip_op_desc = block.desc.append_op()
        clip_op_desc.set_type("clip")
        clip_op_desc.set_input("X", [grad_name])
        clip_op_desc.set_output("Out", [grad_name])
        clip_op_desc._set_attr("min", self.min)
        clip_op_desc._set_attr("max", self.max)


def error_clip_callback(block, context):
    # the context is a grad_to_var map
    grad_to_var = context
    op_desc = block.desc.op(block.desc.op_size() - 1)
    for grad_n in [
        n for n in op_desc.output_arg_names() if n in grad_to_var
    ]:
        fwd_var = block._var_recursive(grad_to_var[grad_n])
        error_clip = getattr(fwd_var, "error_clip", None)
        if not (
            error_clip is None or isinstance(error_clip, BaseErrorClipAttr)
        ):
            raise TypeError(
                "Variable's error_clip should be an instance of BaseErrorClipAttr or None."
            )
        if error_clip is not None:
            error_clip._append_clip_op(block, grad_n)
class ClipGradBase:
    def __init__(self):
        super().__init__()

    def __str__(self):
        raise NotImplementedError()

    @imperative_base.no_grad()
    def _dygraph_clip(self, params_grads):
        raise NotImplementedError

    def _static_clip(self, params_grads):
        raise NotImplementedError

    def __call__(self, params_grads):
        if _non_static_mode():
            return self._dygraph_clip(params_grads)
        else:
            for p, g in params_grads:
                if getattr(p, 'gradient_clip_attr', None) is not None:
                    warnings.warn(
                        "'set_gradient_clip' will be ineffective, because you have "
                        "set 'need_clip' in 'ParamAttr'. So, 'set_gradient_clip' "
                        "is redundant and you can remove it."
                    )
                    break
            return self._static_clip(params_grads)

    def _process_context(self, context, param, grad):
        raise NotImplementedError()

    def _create_operators(self, param, grad):
        raise NotImplementedError()
class ClipGradByValue(ClipGradBase):
    """
    Limit the value of multi-dimensional Tensor :math:`X` to the range [min, max].

    - Any values less than min are set to ``min``.

    - Any values greater than max are set to ``max``.

    The multi-dimensional Tensor :math:`X` is not passed from this class, but the gradients of all parameters set in ``optimizer``.
    If ``need_clip`` of specific param is ``False`` in its ``ParamAttr``, then the gradients of this param will not be clipped.

    Gradient clip will takes effect after being set in ``optimizer`` , see the document ``optimizer``
    (for example: :ref:`api_paddle_optimizer_SGD`).

    Note:
        ``need_clip`` of ``ClipGradByValue`` HAS BEEN DEPRECATED since 2.0.
        Please use ``need_clip`` in ``ParamAttr`` to speficiy the clip scope.

    Args:
        max (float): The maximum value to clip by.
        min (float, optional): The minimum value to clip by. if not set by user, it will be set to ``-max``
            automatically. In this case, ``max`` must be greater than 0.

    Examples:
        .. code-block:: python

            import paddle

            x = paddle.uniform([10, 10], min=-1.0, max=1.0, dtype='float32')
            linear = paddle.nn.Linear(in_features=10, out_features=10,
                                      weight_attr=paddle.ParamAttr(need_clip=True),
                                      bias_attr=paddle.ParamAttr(need_clip=False))
            out = linear(x)
            loss = paddle.mean(out)
            loss.backward()

            clip = paddle.nn.ClipGradByValue(min=-1, max=1)
            sdg = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters(), grad_clip=clip)
            sdg.step()
    """

    def __init__(self, max, min=None):
        super().__init__()
        if min is None:
            assert max > 0.0
            min = -max
        self.max = float(max)
        self.min = float(min)

    def __str__(self):
        return "Clip Gradient By Value, min = %f, max=%f" % (self.min, self.max)

    @imperative_base.no_grad()
    def _dygraph_clip(self, params_grads):
        params_and_grads = []
        for p, g in params_grads:
            if g is None:
                continue
            if getattr(p, 'need_clip', True) is False:
                params_and_grads.append((p, g))
                continue
            new_grad = paddle.clip(x=g, min=self.min, max=self.max)
            params_and_grads.append((p, new_grad))
        return params_and_grads

    def _static_clip(self, params_grads):
        params_and_grads = []
        param_new_grad_name_dict = dict()
        with framework.name_scope('gradient_clip'):
            for p, g in params_grads:
                if g is None:
                    continue
                if getattr(p, 'need_clip', True) is False:
                    params_and_grads.append((p, g))
                    continue

                with p.block.program._optimized_guard([p, g]):
                    new_grad = paddle.clip(x=g, min=self.min, max=self.max)
                params_and_grads.append((p, new_grad))
                param_new_grad_name_dict[p.name] = new_grad.name
        _correct_clip_op_role_var(params_and_grads, param_new_grad_name_dict)
        return params_and_grads

    def _process_context(self, context, param, grad):
        pass

    def _create_operators(self, param, grad):
        new_grad = paddle.clip(x=grad, min=self.min, max=self.max)
        return param, new_grad
class ClipGradByNorm(ClipGradBase):
    r"""
    Limit the l2 norm of multi-dimensional Tensor :math:`X` to ``clip_norm`` .

    - If the l2 norm of :math:`X` is greater than ``clip_norm`` , :math:`X` will be compressed by a ratio.

    - If the l2 norm of :math:`X` is less than or equal to ``clip_norm`` , nothing will be done.

    The multidimensional Tensor :math:`X` is not passed from this class, but the gradients of all parameters set in ``optimizer``.
    If ``need_clip`` of specific param is ``False`` in its ``ParamAttr``, then the gradients of this param will not be clipped.

    Gradient clip will takes effect after being set in ``optimizer`` , see the document ``optimizer``
    (for example: :ref:`api_paddle_optimizer_SGD`).

    The clipping formula is:

    .. math::
        Out =
        \left\{
            \begin{array}{ccl}
                X & & if (norm(X) \leq clip\_norm) \\
                \frac{clip\_norm*X}{norm(X)} & & if (norm(X) > clip\_norm) \\
        \end{array}
        \right.

    where :math:`norm(X)` represents the L2 norm of :math:`X`.

    .. math::
        norm(X) = ( \sum_{i=1}^{n}|x\_i|^2)^{ \frac{1}{2}}

    Note:
        ``need_clip`` of ``ClipGradByNorm`` HAS BEEN DEPRECATED since 2.0.
        Please use ``need_clip`` in ``ParamAttr`` to speficiy the clip scope.

    Args:
        clip_norm(float): The maximum norm value.

    Examples:
        .. code-block:: python

            import paddle

            x = paddle.uniform([10, 10], min=-1.0, max=1.0, dtype='float32')
            linear = paddle.nn.Linear(in_features=10, out_features=10,
                                      weight_attr=paddle.ParamAttr(need_clip=True),
                                      bias_attr=paddle.ParamAttr(need_clip=False))
            out = linear(x)
            loss = paddle.mean(out)
            loss.backward()

            clip = paddle.nn.ClipGradByNorm(clip_norm=1.0)
            sdg = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters(), grad_clip=clip)
            sdg.step()
    """

    def __init__(self, clip_norm):
        super().__init__()
        self.clip_norm = float(clip_norm)

    def __str__(self):
        return "Gradient Clip By Norm, clip_norm=%f" % self.clip_norm

    @imperative_base.no_grad()
    def _dygraph_clip(self, params_grads):
        params_and_grads = []
        for p, g in params_grads:
            if g is None:
                continue
            if getattr(p, 'need_clip', True) is False:
                params_and_grads.append((p, g))
                continue
            new_grad = clip_by_norm(x=g, max_norm=self.clip_norm)
            params_and_grads.append((p, new_grad))
        return params_and_grads

    def _static_clip(self, params_grads):
        params_and_grads = []
        with framework.name_scope('gradient_clip'):
            param_new_grad_name_dict = dict()
            for p, g in params_grads:
                if g is None:
                    continue
                if getattr(p, 'need_clip', True) is False:
                    params_and_grads.append((p, g))
                    continue

                with p.block.program._optimized_guard([p, g]):
                    new_grad = clip_by_norm(x=g, max_norm=self.clip_norm)
                param_new_grad_name_dict[p.name] = new_grad.name
                params_and_grads.append((p, new_grad))
        _correct_clip_op_role_var(params_and_grads, param_new_grad_name_dict)
        return params_and_grads

    def _process_context(self, context, param, grad):
        pass

    def _create_operators(self, param, grad):
        new_grad = clip_by_norm(x=grad, max_norm=self.clip_norm)
        return param, new_grad


_allow_pure_fp16_global_norm_clip_flag = False


def _allow_pure_fp16_global_norm_clip(*args):
    global _allow_pure_fp16_global_norm_clip_flag
    if len(args) == 0:
        return _allow_pure_fp16_global_norm_clip_flag
    else:
        assert len(args) == 1 and isinstance(args[0], bool)
        old_value = _allow_pure_fp16_global_norm_clip_flag
        _allow_pure_fp16_global_norm_clip_flag = args[0]
        return old_value
class ClipGradByGlobalNorm(ClipGradBase):
    r"""
    Given a list of Tensor :math:`t\_list` , calculate the global norm for the elements of all tensors in
    :math:`t\_list` , and limit it to ``clip_norm`` .

    - If the global norm is greater than ``clip_norm`` , all elements of :math:`t\_list` will be compressed by a ratio.

    - If the global norm is less than or equal to ``clip_norm`` , nothing will be done.

    The list of Tensor :math:`t\_list` is not passed from this class, but the gradients of all parameters set in ``optimizer``.
    If ``need_clip`` of specific param is ``False`` in its ``ParamAttr``, then the gradients of this param will not be clipped.

    Gradient clip will takes effect after being set in ``optimizer`` , see the document ``optimizer``
    (for example: :ref:`api_paddle_optimizer_SGD`).

    The clipping formula is:

    .. math::

        t\_list[i] = t\_list[i] * \frac{clip\_norm}{\max(global\_norm, clip\_norm)}

    where:

    .. math::

        global\_norm = \sqrt{\sum_{i=0}^{N-1}(l2norm(t\_list[i]))^2}

    Note:
        ``need_clip`` of ``ClipGradyGlobalNorm`` HAS BEEN DEPRECATED since 2.0.
        Please use ``need_clip`` in ``ParamAttr`` to speficiy the clip scope.

    Args:
        clip_norm (float): The maximum norm value.
        group_name (str, optional): The group name for this clip. Default value is ``default_group``.
        auto_skip_clip (bool, optional): skip clipping gradient. Default value is ``False``.

    Examples:
        .. code-block:: python

            import paddle

            x = paddle.uniform([10, 10], min=-1.0, max=1.0, dtype='float32')
            linear = paddle.nn.Linear(in_features=10, out_features=10,
                                      weight_attr=paddle.ParamAttr(need_clip=True),
                                      bias_attr=paddle.ParamAttr(need_clip=False))
            out = linear(x)
            loss = paddle.mean(out)
            loss.backward()

            clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)
            sdg = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters(), grad_clip=clip)
            sdg.step()
    """

    def __init__(
        self, clip_norm, group_name="default_group", auto_skip_clip=False
    ):
        super().__init__()
        self.clip_norm = float(clip_norm)
        self.group_name = group_name
        assert isinstance(auto_skip_clip, bool)
        self.auto_skip_clip = auto_skip_clip

    def __str__(self):
        return "Gradient Clip By GlobalNorm, global_norm=%f" % (self.clip_norm)

    @imperative_base.no_grad()
    def _dygraph_clip(self, params_grads):
        params_and_grads = []
        sum_square_list = []
        sum_square_list_fp16 = []
        sum_square_list_fp32 = []
        for p, g in params_grads:
            if g is None:
                continue
            if getattr(p, 'need_clip', True) is False:
                continue
            merge_grad = g

            if in_dygraph_mode() and g.is_selected_rows():
                merge_grad = merge_selected_rows(g)
                merge_grad = merge_grad._get_tensor_from_selected_rows()

            elif g.type == core.VarDesc.VarType.SELECTED_ROWS:
                merge_grad = merge_selected_rows(g)
                merge_grad = get_tensor_from_selected_rows(merge_grad)

            sum_square = _squared_l2_norm(merge_grad)
            if (
                sum_square.dtype == core.VarDesc.VarType.FP16
                or sum_square.dtype == core.VarDesc.VarType.BF16
            ):
                sum_square_list_fp16.append(sum_square)
            elif sum_square.dtype == core.VarDesc.VarType.FP32:
                sum_square_list_fp32.append(sum_square)
            else:
                sum_square_list.append(sum_square)

        # all parameters have been filterd out
        if (
            len(sum_square_list)
            + len(sum_square_list_fp16)
            + len(sum_square_list_fp32)
            == 0
        ):
            return params_grads

        sum_dtype = 'float64' if len(sum_square_list) > 0 else "float32"
        global_norm_var = []
        if len(sum_square_list_fp16) > 0:
            global_norm_var_fp16 = paddle.add_n(sum_square_list_fp16)
            global_norm_var.append(global_norm_var_fp16.astype(sum_dtype))
        if len(sum_square_list_fp32) > 0:
            global_norm_var_fp32 = paddle.add_n(sum_square_list_fp32)
            if sum_dtype == 'float32':
                global_norm_var.append(global_norm_var_fp32)
            else:
                global_norm_var.append(global_norm_var_fp32.astype(sum_dtype))
        if len(sum_square_list) > 0:
            global_norm_var_fp64 = paddle.add_n(sum_square_list)
            global_norm_var.append(global_norm_var_fp64)
        global_norm_var = paddle.add_n(global_norm_var)
        global_norm_var = paddle.sqrt(global_norm_var)
        max_global_norm = paddle.full(
            shape=[1], dtype=global_norm_var.dtype, fill_value=self.clip_norm
        )

        need_clip = False
        if not self.auto_skip_clip:  # always apply clip
            need_clip = True
            clip_var = paddle.divide(
                x=max_global_norm,
                y=paddle.maximum(x=global_norm_var, y=max_global_norm),
            )
        elif global_norm_var > max_global_norm:
            # only when global_norm_var > max_global_norm, grad need clip
            need_clip = True
            clip_var = paddle.divide(x=max_global_norm, y=global_norm_var)

        for p, g in params_grads:
            if g is None:
                continue
            if getattr(p, 'need_clip', True) is False:
                params_and_grads.append((p, g))
                continue
            # TODO(wangxi): use inplace elementwise_mul
            if need_clip:
                clip_input = (
                    clip_var.astype(g.dtype)
                    if clip_var.dtype != g.dtype
                    else clip_var
                )
                new_grad = paddle.multiply(g, clip_input)
                params_and_grads.append((p, new_grad))
            else:
                params_and_grads.append((p, g))

        return params_and_grads
    def _static_clip(self, params_grads):
        params_and_grads = []
        sum_square_list = []
        sum_square_list_fp16 = []
        sum_square_list_fp32 = []
        with framework.name_scope('gradient_clip'):
            for p, g in params_grads:
                if g is None:
                    continue
                if getattr(p, 'need_clip', True) is False:
                    continue
                merge_grad = g
                with p.block.program._optimized_guard([p, g]):
                    if g.type == core.VarDesc.VarType.SELECTED_ROWS:
                        merge_grad = merge_selected_rows(g)
                        merge_grad = get_tensor_from_selected_rows(merge_grad)

                    sum_square = _squared_l2_norm(merge_grad)
                    if sum_square.dtype == core.VarDesc.VarType.FP16:
                        sum_square_list_fp16.append(sum_square)
                    elif sum_square.dtype == core.VarDesc.VarType.FP32:
                        sum_square_list_fp32.append(sum_square)
                    else:
                        sum_square_list.append(sum_square)

            # all parameters have been filterd out
            if (
                len(sum_square_list)
                + len(sum_square_list_fp16)
                + len(sum_square_list_fp32)
                == 0
            ):
                return params_grads

            with p.block.program._optimized_guard([p, g]):
                sum_dtype = 'float64' if len(sum_square_list) > 0 else "float32"

                global_norm_var = []
                if len(sum_square_list_fp16) > 0:
                    global_norm_var_fp16 = layers.sums(sum_square_list_fp16)
                    if (
                        sum_square_list_fp32
                        or sum_square_list
                        or not _allow_pure_fp16_global_norm_clip()
                    ):
                        global_norm_var.append(
                            global_norm_var_fp16.astype(sum_dtype)
                        )
                    else:
                        global_norm_var.append(global_norm_var_fp16)
                if len(sum_square_list_fp32) > 0:
                    global_norm_var_fp32 = layers.sums(sum_square_list_fp32)
                    if sum_dtype == 'float32':
                        global_norm_var.append(global_norm_var_fp32)
                    else:
                        global_norm_var.append(
                            global_norm_var_fp32.astype(sum_dtype)
                        )
                if len(sum_square_list) > 0:
                    # fp64
                    global_norm_var_other_dtype = layers.sums(sum_square_list)
                    global_norm_var.append(global_norm_var_other_dtype)

                global_norm_var = (
                    layers.sums(global_norm_var)
                    if len(global_norm_var) > 1
                    else global_norm_var[0]
                )
                global_norm_var = paddle.sqrt(x=global_norm_var)
                max_global_norm = paddle.full(
                    shape=[1],
                    dtype=global_norm_var.dtype,
                    fill_value=self.clip_norm,
                )
                scale_var = paddle.divide(
                    x=max_global_norm,
                    y=paddle.maximum(x=max_global_norm, y=global_norm_var),
                )
            param_new_grad_name_dict = dict()
            for p, g in params_grads:
                if g is None:
                    continue
                if getattr(p, 'need_clip', True) is False:
                    params_and_grads.append((p, g))
                    continue

                with p.block.program._optimized_guard([p, g]):
                    new_g = _cast_to_mp_type_if_enabled(g)
                    # inplace
                    scale_input = (
                        scale_var.astype('float16')
                        if new_g.dtype == core.VarDesc.VarType.FP16
                        and scale_var.dtype != core.VarDesc.VarType.FP16
                        else scale_var
                    )
                    # NOTE(Yuang Liu): For pure dp with gradient merge, the p and g
                    # will be in different blocks with the gradient clip related ops.
                    # We need to handle the correct block, otherwise will encounter
                    # a 'NotFoundError' during compile time.
                    block = default_main_program().current_block()
                    block.append_op(
                        type='elementwise_mul',
                        inputs={'X': new_g, 'Y': scale_input},
                        outputs={'Out': new_g},
                    )
                    if new_g is not g:
                        block.append_op(
                            type='cast',
                            inputs={'X': new_g},
                            outputs={'Out': g},
                            attrs={
                                'in_dtype': new_g.dtype,
                                'out_dtype': g.dtype,
                            },
                        )

                param_new_grad_name_dict[p.name] = g.name
                params_and_grads.append((p, g))

        _correct_clip_op_role_var(params_and_grads, param_new_grad_name_dict)
        return params_and_grads
    def _process_context(self, context, param, grad):
        if self.group_name not in context:
            context[self.group_name] = []
            context[self.group_name + "_clip_value"] = self.clip_norm
            context[self.group_name + "_clip"] = paddle.full(
                shape=[1], dtype=grad.dtype, fill_value=self.clip_norm
            )
        else:
            if not self.clip_norm == context[self.group_name + "_clip_value"]:
                raise ValueError(
                    "All parameters' 'clip_norm' of a same group should be the same"
                )

        merge_grad = grad
        if grad.type == core.VarDesc.VarType.SELECTED_ROWS:
            merge_grad = merge_selected_rows(grad)
            merge_grad = get_tensor_from_selected_rows(merge_grad)

        local_norm_var = _squared_l2_norm(merge_grad)
        context[self.group_name].append(local_norm_var)

        self.context = context

    def _create_operators(self, param, grad):
        group_scale_name = self.group_name + "_scale"
        if group_scale_name not in self.context:
            group_norm_var = layers.sums(input=self.context[self.group_name])
            group_norm_var = paddle.sqrt(x=group_norm_var)
            clip_var = self.context[self.group_name + "_clip"]
            group_scale_var = paddle.divide(
                x=clip_var,
                y=paddle.maximum(x=clip_var, y=group_norm_var),
            )
            assert group_scale_var.shape == (1,)
            self.context[group_scale_name] = group_scale_var

        # inplace
        param.block.append_op(
            type='elementwise_mul',
            inputs={'X': grad, 'Y': self.context[group_scale_name]},
            outputs={'Out': grad},
        )

        return param, grad
@framework.dygraph_not_support
def set_gradient_clip(clip, param_list=None, program=None):
    """
    Warning:

        This API must be used after building network, and before ``minimize`` ,
        and it may be removed in future releases, so it is not recommended.
        It is recommended to set ``grad_clip`` when initializing the ``optimizer`` ,
        this is a better method to clip gradient. There are three clipping strategies:
         :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
         :ref:`api_fluid_clip_GradientClipByValue` .

    To specify parameters that require gradient clip.

    Args:
        grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of
            some derived class of ``GradientClipBase`` . There are three cliping strategies
            ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
            :ref:`api_fluid_clip_GradientClipByValue` ). Default value: None, and there is no
            gradient clipping.
        param_list (list(Variable), optional): Parameters that require gradient clip.
                It can be a list of parameter or a list of parameter's name.
                Default None, meaning that all parameters in the program will be included.
        program (Program, optional): The program where parameters are located.
                Default None, meaning that using :ref:`api_fluid_default_main_program` .

    Returns:
        None

    Examples:
        .. code-block:: python

            import paddle
            import paddle.fluid as fluid

            paddle.enable_static()

            def network():
                image = fluid.data(name='image', shape=[
                                   None, 28], dtype='float32')
                param_attr1 = fluid.ParamAttr("fc1_param")
                fc1 = fluid.layers.fc(image, size=10, param_attr=param_attr1)
                param_attr2 = fluid.ParamAttr("fc2_param")
                fc2 = fluid.layers.fc(fc1, size=10, param_attr=param_attr2)
                loss = paddle.mean(fc2)
                return loss

            # network 1: clip all parameter gradient
            with fluid.program_guard(fluid.Program(), fluid.Program()):
                loss = network()
                paddle.nn.clip.set_gradient_clip(
                    paddle.nn.ClipGradByGlobalNorm(clip_norm=2.0))
                sgd = fluid.optimizer.SGD(learning_rate=1e-3)
                sgd.minimize(loss)

            # network 2: clip parameter gradient by name
            with fluid.program_guard(fluid.Program(), fluid.Program()):
                loss = network()
                paddle.nn.clip.set_gradient_clip(
                    paddle.nn.ClipGradByValue(min=-1.0, max=1.0),
                    param_list=["fc1_param", "fc2_param"])
                sgd = fluid.optimizer.SGD(learning_rate=1e-3)
                sgd.minimize(loss)

            # network 3: clip parameter gradient by value
            with fluid.program_guard(fluid.Program(), fluid.Program()):
                loss = network()
                param_var1 = fluid.default_main_program().global_block().var("fc1_param")
                param_var2 = fluid.default_main_program().global_block().var("fc2_param")
                paddle.nn.clip.set_gradient_clip(
                    paddle.nn.ClipGradByValue(min=-1.0, max=1.0),
                    param_list=[param_var1, param_var2])
                sgd = fluid.optimizer.SGD(learning_rate=1e-3)
                sgd.minimize(loss)

            # network 4: use 'set_gradient_clip' and 'optimize(grad_clip=clip)' together
            with fluid.program_guard(fluid.Program(), fluid.Program()):
                loss = network()
                clip1 = paddle.nn.ClipGradByValue(min=-1.0, max=1.0)
                clip2 = paddle.nn.ClipGradByNorm(clip_norm=1.0)

                # Set the gradient clipping strategy: clip1
                paddle.nn.clip.set_gradient_clip(clip1)
                # Set the gradient clipping strategy: clip2
                sgd = fluid.optimizer.SGD(learning_rate=1e-3, grad_clip=clip2)
                sgd.minimize(loss)
                # 'set_gradient_clip' will not take effect when setting has a conflict,
                # and the gradient clipping strategy will be 'clip2'
    """
    warnings.warn(
        "Caution! 'set_gradient_clip' is not recommended "
        "and may be deprecated in future! "
        "We recommend a new strategy: set 'grad_clip' "
        "when initializing the 'optimizer'. "
        "This method can reduce the mistakes, please "
        "refer to documention of 'optimizer'."
    )

    if not isinstance(clip, ClipGradBase):
        raise TypeError(
            "'clip' should be an instance of ClipGradBase's derived class"
        )
    if program is None:
        program = framework.default_main_program()

    for op in program.block(0).ops:
        if 'op_namescope' in op.all_attrs() and "optimizer" in op.attr(
            "op_namescope"
        ):
            warnings.warn(
                "'minimize' has been invoked before, this will make 'set_gradient_clip' "
                "be ineffective! Please invoke 'set_gradient_clip' before 'minimize'."
            )
            break

    if param_list is None:
        param_list = program.block(0).all_parameters()
    if all(isinstance(elem, str) for elem in param_list):
        param_list = [program.block(0).var(elem) for elem in param_list]
    if not all(isinstance(elem, framework.Parameter) for elem in param_list):
        raise TypeError(
            "'param_list' should be a list of Parameter or basestring(parameter's name)."
        )

    for param in param_list:
        param.gradient_clip_attr = copy.deepcopy(clip)
def append_gradient_clip_ops(param_grads):
    context = dict()
    for p, g in param_grads:
        if g is None:
            continue
        with p.block.program._optimized_guard([p, g]), framework.name_scope(
            'gradient_clip'
        ):
            clip_attr = getattr(p, 'gradient_clip_attr', None)
            if clip_attr is None:
                return param_grads
            if not isinstance(clip_attr, ClipGradBase):
                raise TypeError(
                    "clip attribute should be an instance of GradientClipBase"
                )

            clip_attr._process_context(context=context, param=p, grad=g)

    res = []
    param_new_grad_name_dict = dict()
    for p, g in param_grads:
        if g is None:
            continue
        with p.block.program._optimized_guard([p, g]), framework.name_scope(
            'gradient_clip'
        ):
            param, new_grad = clip_attr._create_operators(param=p, grad=g)
            param_new_grad_name_dict[param.name] = new_grad.name
            res.append([param, new_grad])

    _correct_clip_op_role_var(res, param_new_grad_name_dict)
    return res


# change wrong mapping relation between param & grad in clip op
# Note: This function is sensitive to the time cost of the network with gradient clipping
# and should not be changed easily. If you must change, please test the time cost.
def _correct_clip_op_role_var(params_grads, param_new_grad_name_dict):
    block_id_list = []
    if len(param_new_grad_name_dict) == 0:
        return
    for param, grad in params_grads:
        if grad is None:
            continue
        block_id = param.block.idx
        if block_id in block_id_list:
            continue
        block_id_list.append(block_id)
        for op in param.block.program.global_block().ops:
            if (
                op.has_attr("op_namescope")
                and "gradient_clip" in op.attr("op_namescope")
                and op.attr('op_role_var')
            ):
                param_name = op.attr('op_role_var')[0]
                if param_name in param_new_grad_name_dict:
                    correct_p_g = [
                        param_name,
                        param_new_grad_name_dict[param_name],
                    ]
                    op._set_attr('op_role_var', correct_p_g)


GradientClipBase = ClipGradBase
GradientClipByValue = ClipGradByValue
GradientClipByNorm = ClipGradByNorm
GradientClipByGlobalNorm = ClipGradByGlobalNorm
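The aliases at the end of the new module keep the old fluid-era names importable from paddle.nn.clip, so downstream code that still spells GradientClip* keeps working. A quick sanity-check sketch:

    import paddle

    # old spelling resolves to the same class as the new one
    assert paddle.nn.clip.GradientClipByGlobalNorm is paddle.nn.ClipGradByGlobalNorm
    clip = paddle.nn.clip.GradientClipByGlobalNorm(clip_norm=1.0)  # legacy name, new class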
python/paddle/optimizer/adamw.py
@@ -20,10 +20,10 @@ import paddle

 from .. import _C_ops
 from ..fluid import core, framework, unique_name
-from ..fluid.clip import GradientClipBase
 from ..fluid.dygraph import base as imperative_base
 from ..fluid.framework import Parameter, Variable
 from ..fluid.layer_helper import LayerHelper
+from ..nn.clip import GradientClipBase
 from .lr import LRScheduler
 from .optimizer import Optimizer
...
python/paddle/optimizer/optimizer.py
@@ -18,6 +18,7 @@ from collections import defaultdict
 import numpy as np

 import paddle
+import paddle.autograd as imperative_base
 from paddle import _C_ops
 from paddle.fluid import core
 from paddle.fluid.framework import (
...
@@ -32,12 +33,6 @@ from paddle.fluid.framework import (
 from ..fluid import framework, unique_name
 from ..fluid.backward import _get_no_grad_set_name, append_backward
-from ..fluid.clip import (
-    GradientClipBase,
-    append_gradient_clip_ops,
-    error_clip_callback,
-)
-from ..fluid.dygraph import base as imperative_base
 from ..fluid.framework import Parameter, program_guard
 from ..fluid.initializer import Constant
 from ..fluid.layer_helper import LayerHelper
...
@@ -168,7 +163,7 @@ class Optimizer:
     """

-    @imperative_base.no_grad
+    @imperative_base.no_grad()
     def __init__(
         self,
         learning_rate,
...
@@ -225,7 +220,7 @@ class Optimizer:
                 % type(learning_rate)
             )
         if grad_clip is not None:
-            if not isinstance(grad_clip, GradientClipBase):
+            if not isinstance(grad_clip, paddle.nn.clip.GradientClipBase):
                 raise TypeError(
                     "'grad_clip' should be an instance of GradientClipBase's derived class"
                 )
...
@@ -1042,7 +1037,7 @@ class Optimizer:
                     params_grads.append((parameter_list[index], grad))
         else:
             if callbacks is None:
-                callbacks = [error_clip_callback]
+                callbacks = [paddle.nn.clip.error_clip_callback]
             else:
                 assert isinstance(callbacks, list)
             program = loss.block.program
...
@@ -1103,7 +1098,7 @@ class Optimizer:
             params_grads = self._grad_clip(params_grads)
         else:

-            params_grads = append_gradient_clip_ops(params_grads)
+            params_grads = paddle.nn.clip.append_gradient_clip_ops(params_grads)

         # Add regularization if any
         params_grads = self.append_regularization_ops(
...
@@ -1317,7 +1312,7 @@ class Optimizer:
         else:
             core.clear_gradients(param_list, set_to_zero)

-    @imperative_base.no_grad
+    @imperative_base.no_grad()
     def minimize(
         self, loss, startup_program=None, parameters=None, no_grad_set=None
     ):
...
@@ -1380,7 +1375,7 @@ class Optimizer:
         return optimize_ops, params_grads

-    @imperative_base.no_grad
+    @imperative_base.no_grad()
     @framework.dygraph_only
     def step(self):
         """
...