Unverified commit fe0dc40d, authored by 骑马小猫, committed by GitHub

[FluidAPI]remove clip api (#48946)

Parent 822ea0f9
@@ -20,11 +20,11 @@ __all__ = []
import paddle
from paddle.common_ops_import import LayerHelper
-from paddle.fluid.clip import GradientClipByNorm, append_gradient_clip_ops
from paddle.fluid.dygraph import base as imperative_base
from paddle.fluid.framework import in_dygraph_mode
from paddle.fluid.optimizer import Momentum, Optimizer
from paddle.framework import core
+from paddle.nn.clip import ClipGradByNorm, append_gradient_clip_ops
from paddle.static import create_global_var
@@ -76,9 +76,9 @@ class DGCMomentumOptimizer(Optimizer):
        self._dgc_clip_norm = None
        if grad_clip is not None:
-            if not isinstance(grad_clip, GradientClipByNorm):
+            if not isinstance(grad_clip, ClipGradByNorm):
                raise TypeError(
-                    "The type of grad_clip should be 'GradientClipByNorm', because DGCMomentumOptimizer only support GradientClipByNorm"
+                    "The type of grad_clip should be 'ClipGradByNorm', because DGCMomentumOptimizer only support ClipGradByNorm"
                )
        assert isinstance(num_trainers, int), (
            "The type of num_trainers should be 'int', but received %s"
......
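For reference, a minimal dygraph sketch of the renamed clip class attached to a regular optimizer (the layer and data here are made up; DGC itself is not involved):

import paddle

linear = paddle.nn.Linear(10, 1)                 # toy layer, illustration only
clip = paddle.nn.ClipGradByNorm(clip_norm=1.0)   # replaces fluid.clip.GradientClipByNorm
opt = paddle.optimizer.Momentum(
    learning_rate=0.01,
    momentum=0.9,
    parameters=linear.parameters(),
    grad_clip=clip,
)
loss = paddle.mean(linear(paddle.rand([4, 10])))
loss.backward()
opt.step()
opt.clear_grad()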
@@ -15,9 +15,8 @@
import paddle
from paddle import framework
from paddle.autograd import no_grad
-from paddle.fluid import layers
-from paddle.fluid.clip import ClipGradByGlobalNorm
from paddle.framework import core
+from paddle.nn import ClipGradByGlobalNorm, clip
from ...base.topology import ParallelMode
from ...utils.hybrid_parallel_util import (
@@ -62,8 +61,8 @@ class HybridParallelClipGrad:
                continue
            merge_grad = g
            if g.type == core.VarDesc.VarType.SELECTED_ROWS:
-                merge_grad = layers.merge_selected_rows(g)
-                merge_grad = layers.get_tensor_from_selected_rows(merge_grad)
+                merge_grad = clip.merge_selected_rows(g)
+                merge_grad = clip.get_tensor_from_selected_rows(merge_grad)
            square = paddle.square(merge_grad)
            sum_square = paddle.sum(square)
......
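The loop above accumulates a per-gradient sum of squares that is later folded into a global norm; a stand-alone sketch of that accumulation with made-up dense gradients (the SELECTED_ROWS branch is skipped):

import paddle

grads = [paddle.rand([3, 4]), paddle.rand([5])]            # assumed dense gradients
sum_square_list = [paddle.sum(paddle.square(g)) for g in grads]
global_norm = paddle.sqrt(paddle.add_n(sum_square_list))   # sqrt of the summed squares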
@@ -30,7 +30,7 @@ import paddle
import paddle.distributed as dist
from paddle.distributed import ParallelMode, fleet
from paddle.fluid import core
-from paddle.fluid.clip import ClipGradByGlobalNorm
+from paddle.nn import ClipGradByGlobalNorm
from paddle.optimizer import Optimizer
HybridParallelClipGrad = (
......
@@ -25,8 +25,8 @@ import paddle.fluid.framework as framework
from paddle import nn
from paddle.autograd import PyLayer
from paddle.distributed import collective
-from paddle.fluid.clip import ClipGradByGlobalNorm
from paddle.fluid.framework import EagerParamBase
+from paddle.nn import ClipGradByGlobalNorm
from .group_sharded_storage import GradStorage
from .group_sharded_utils import GroupShardedClipGrad, Type, device_guard
......
@@ -23,6 +23,7 @@ from paddle import _legacy_C_ops
from paddle.fluid import core, layers
from paddle.fluid.dygraph import to_variable
from paddle.fluid.framework import dygraph_only
+from paddle.nn import clip
class Taskflow:
@@ -65,8 +66,8 @@ class GroupShardedClipGrad:
            merge_grad = g
            if g.type == core.VarDesc.VarType.SELECTED_ROWS:
-                merge_grad = layers.get_tensor_from_selected_rows(
-                    layers.merge_selected_rows(g)
+                merge_grad = clip.get_tensor_from_selected_rows(
+                    clip.merge_selected_rows(g)
                )
            square = paddle.square(merge_grad)
            sum_square = paddle.sum(square)
......
@@ -159,7 +159,7 @@ def auc(stat_pos, stat_neg, scope=None, util=None):
        .. code-block:: python
            # in model.py
-            similarity_norm = fluid.layers.sigmoid(fluid.layers.clip(output, min=-15.0, max=15.0))
+            similarity_norm = fluid.layers.sigmoid(paddle.clip(output, min=-15.0, max=15.0))
            binary_predict = fluid.layers.concat(
                input=[paddle.subtract(fluid.layers.ceil(similarity_norm), similarity_norm), similarity_norm], axis=1)
            self.auc, batch_auc, [batch_stat_pos, batch_stat_neg, stat_pos, stat_neg] =
......
@@ -90,7 +90,6 @@ from .transpiler import (
    DistributeTranspilerConfig,
)
from .lod_tensor import create_lod_tensor, create_random_int_lodtensor
-from . import clip
from . import profiler
from . import unique_name
from . import parallel_executor
@@ -164,7 +163,6 @@ __all__ = (
    'ParamAttr',
    'WeightNormParamAttr',
    'DataFeeder',
-    'clip',
    'profiler',
    'unique_name',
    'Scope',
......
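With the clip re-export dropped from paddle.fluid, callers are expected to import from paddle.nn instead; a minimal sketch of the migrated imports (old paths kept as comments):

# before: from paddle.fluid.clip import GradientClipByGlobalNorm
from paddle.nn import ClipGradByGlobalNorm

# before: from paddle.fluid import clip
from paddle.nn import clip  # now hosts merge_selected_rows, get_tensor_from_selected_rows, ...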
This diff has been collapsed.
@@ -185,7 +185,7 @@ class FleetUtil:
              # below is part of model
              emb = my_slot_net(slots, label) # emb can be fc layer of size 1
-              similarity_norm = fluid.layers.sigmoid(fluid.layers.clip(\
+              similarity_norm = fluid.layers.sigmoid(paddle.clip(\
                                 emb, min=-15.0, max=15.0), name="similarity_norm")\
              binary_predict = fluid.layers.concat(input=[\
                  paddle.subtract(\
@@ -1374,7 +1374,7 @@ class FleetUtil:
              label = fluid.layers.data(name="click", shape=[-1, 1],\
                  dtype="int64", lod_level=0, append_batch_size=False)
              emb = my_slot_net(slots, label) # emb can be fc layer of size 1
-              similarity_norm = fluid.layers.sigmoid(fluid.layers.clip(\
+              similarity_norm = fluid.layers.sigmoid(paddle.clip(\
                                 emb, min=-15.0, max=15.0), name="similarity_norm")\
              binary_predict = fluid.layers.concat(input=[\
                  paddle.subtract(\
@@ -1574,7 +1574,7 @@ class FleetUtil:
              label = fluid.layers.data(name="click", shape=[-1, 1],\
                  dtype="int64", lod_level=0, append_batch_size=False)
              emb = my_slot_net(slots, label) # emb can be fc layer of size 1
-              similarity_norm = fluid.layers.sigmoid(fluid.layers.clip(\
+              similarity_norm = fluid.layers.sigmoid(paddle.clip(\
                                 emb, min=-15.0, max=15.0), name="similarity_norm")\
              binary_predict = fluid.layers.concat(input=[\
                  paddle.subtract(\
......
@@ -63,10 +63,6 @@ __all__ = [
    'fc',
    'embedding',
    'autoincreased_step_counter',
-    'clip',
-    'clip_by_norm',
-    'merge_selected_rows',
-    'get_tensor_from_selected_rows',
]
OP_NAMEMAPPING = {
@@ -997,199 +993,3 @@ def _logical_op(op_name, x, y, out=None, name=None, binary_op=True):
    )
    return out
-@templatedoc()
-def clip(x, min, max, name=None):
-    """
-    :old_api: paddle.fluid.layers.clip
-    ${comment}
-    Args:
-        x(${x_type}): ${x_comment}
-        min(float): ${min_comment}
-        max(float): ${max_comment}
-        name(str, optional): The default value is None.
-            Normally there is no need for user to set this property.
-            For more information, please refer to :ref:`api_guide_Name`
-    Returns:
-        ${out_comment}
-    Return Type:
-        ${out_type}
-    Examples:
-        .. code-block:: python
-            import paddle.fluid as fluid
-            input = fluid.data(
-                name='data', shape=[1], dtype='float32')
-            reward = fluid.layers.clip(x=input, min=-1.0, max=1.0)
-    """
-    helper = LayerHelper("clip", **locals())
-    check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'clip')
-    if name is None:
-        name = unique_name.generate_with_ignorable_key(
-            ".".join([helper.name, 'tmp'])
-        )
-    out = helper.create_variable(
-        type=x.type, name=name, dtype=x.dtype, persistable=False
-    )
-    helper.append_op(
-        type="clip",
-        inputs={"X": x},
-        attrs={"min": min, "max": max},
-        outputs={"Out": out},
-    )
-    return out
-@templatedoc()
-def clip_by_norm(x, max_norm, name=None):
-    """
-    ${comment}
-    Args:
-        x(${x_type}): ${x_comment}
-        max_norm(${max_norm_type}): ${max_norm_comment}
-        name(str, optional): For detailed information, please refer
-            to :ref:`api_guide_Name`. Usually name is no need to set and
-            None by default.
-    Returns:
-        Tensor:
-        out(${out_type}): ${out_comment}
-    Examples:
-        .. code-block:: python
-            import paddle
-            import paddle.fluid as fluid
-            input = paddle.to_tensor([[2.0, 2.0], [2.0, 2.0]], dtype='float32')
-            reward = fluid.layers.clip_by_norm(x=input, max_norm=1.0)
-            # [[0.5, 0.5], [0.5, 0.5]]
-    """
-    if in_dygraph_mode():
-        return _C_ops.clip_by_norm(x, max_norm)
-    else:
-        helper = LayerHelper("clip_by_norm", **locals())
-        check_variable_and_dtype(x, 'X', ['float32', 'float16'], 'clip_by_norm')
-        check_type(max_norm, 'max_norm', (float), 'clip_by_norm')
-        if name is None:
-            name = unique_name.generate_with_ignorable_key(
-                ".".join([helper.name, 'tmp'])
-            )
-        out = helper.create_variable(
-            type=x.type, name=name, dtype=x.dtype, persistable=False
-        )
-        helper.append_op(
-            type="clip_by_norm",
-            inputs={"X": x},
-            attrs={"max_norm": max_norm},
-            outputs={"Out": out},
-        )
-        return out
-@templatedoc()
-def merge_selected_rows(x, name=None):
-    """
-    ${comment}
-    Args:
-        x(${x_type}): ${x_comment}
-        name(basestring|None): Name of the output.
-    Returns:
-        out(${out_type}): ${out_comment}
-    Examples:
-        .. code-block:: python
-            import paddle.fluid as fluid
-            b = fluid.default_main_program().global_block()
-            var = b.create_var(
-                name="X", dtype="float32", persistable=True,
-                type=fluid.core.VarDesc.VarType.SELECTED_ROWS)
-            y = fluid.layers.merge_selected_rows(var)
-    """
-    if in_dygraph_mode():
-        return _C_ops.merge_selected_rows(x)
-    else:
-        helper = LayerHelper("merge_selected_rows", **locals())
-        out = helper.create_variable_for_type_inference(dtype=x.dtype)
-        helper.append_op(
-            type="merge_selected_rows",
-            inputs={"X": x},
-            attrs={},
-            outputs={"Out": out},
-        )
-        return out
-@templatedoc()
-def get_tensor_from_selected_rows(x, name=None):
-    """
-    This operator gets tensor data from input with SelectedRows type, and outputs a LoDTensor.
-    .. code-block:: text
-        input x is SelectedRows:
-            x.rows = [0, 5, 5, 4, 19]
-            x.height = 20
-            x.value = [[1, 1] [2, 2] [2, 2] [3, 3] [6, 6]]
-        Output is LoDTensor:
-            out.shape = [5, 2]
-            out.data = [[1, 1],
-                        [2, 2],
-                        [2, 2],
-                        [3, 3],
-                        [6, 6]]
-    Args:
-        x(SelectedRows): Input with SelectedRows type. The data type is float32, float64, int32 or int64.
-        name(str, optional): The default value is None. Normally there is no need for user to set this property.
-            For more information, please refer to :ref:`api_guide_Name` .
-    Returns:
-        Variable: LoDTensor transformed from SelectedRows. The data type is same with input.
-    Examples:
-        .. code-block:: python
-            import paddle.fluid as fluid
-            b = fluid.default_main_program().global_block()
-            input = b.create_var(name="X", dtype="float32", persistable=True, type=fluid.core.VarDesc.VarType.SELECTED_ROWS)
-            out = fluid.layers.get_tensor_from_selected_rows(input)
-    """
-    check_type(x, 'x', Variable, 'get_tensor_from_selected_rows')
-    if x.type != core.VarDesc.VarType.SELECTED_ROWS:
-        raise TypeError(
-            "The type of 'x' in get_tensor_from_selected_rows must be SELECTED_ROWS."
-        )
-    helper = LayerHelper('get_tensor_from_selected_rows', **locals())
-    out = helper.create_variable_for_type_inference(dtype=x.dtype)
-    helper.append_op(
-        type='get_tensor_from_selected_rows',
-        inputs={'X': x},
-        outputs={'Out': out},
-        attrs={},
-    )
-    return out
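A hedged migration sketch for the removed layers, using the replacements this patch switches callers to (paddle.clip and paddle.nn.clip.clip_by_norm; the tensor values mirror the old docstring example):

import paddle
from paddle.nn import clip

x = paddle.to_tensor([[2.0, 2.0], [2.0, 2.0]], dtype='float32')
clipped = paddle.clip(x, min=-1.0, max=1.0)     # replaces fluid.layers.clip
by_norm = clip.clip_by_norm(x, max_norm=1.0)    # replaces fluid.layers.clip_by_norm
# by_norm is expected to be [[0.5, 0.5], [0.5, 0.5]], as in the removed docstring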
@@ -38,13 +38,6 @@ from .backward import (
    _append_grad_suffix_,
    _get_no_grad_set_name,
)
-from .clip import (
-    GradientClipBase,
-    GradientClipByNorm,
-    error_clip_callback,
-    append_gradient_clip_ops,
-    ClipGradByGlobalNorm,
-)
from .framework import program_guard
from .initializer import Constant
from .layer_helper import LayerHelper
@@ -160,7 +153,7 @@ class Optimizer:
        )
        if grad_clip is not None:
-            if not isinstance(grad_clip, GradientClipBase):
+            if not isinstance(grad_clip, paddle.nn.clip.GradientClipBase):
                raise TypeError(
                    "'grad_clip' should be an instance of GradientClipBase's derived class"
                )
@@ -1030,7 +1023,7 @@ class Optimizer:
                params_grads.append((param, grad_var))
        else:
            if callbacks is None:
-                callbacks = [error_clip_callback]
+                callbacks = [paddle.nn.clip.error_clip_callback]
            else:
                assert isinstance(callbacks, list)
            program = loss.block.program
@@ -1260,7 +1253,7 @@ class Optimizer:
        # NOTE(zhiqiu): currently, only support ClipGradByGlobalNorm and without regularization.
        if self._flatten_param_grads and self.regularization is None:
            if self._grad_clip is None or isinstance(
-                self._grad_clip, ClipGradByGlobalNorm
+                self._grad_clip, paddle.nn.ClipGradByGlobalNorm
            ):
                params_grads = self.flatten_param_grads(params_grads)
@@ -1268,7 +1261,7 @@ class Optimizer:
        if self._grad_clip is not None:
            params_grads = self._grad_clip(params_grads)
        else:
-            params_grads = append_gradient_clip_ops(params_grads)
+            params_grads = paddle.nn.clip.append_gradient_clip_ops(params_grads)
        # Add regularization if any
        params_grads = self.append_regularization_ops(
......
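Since the optimizer now checks against paddle.nn.clip.GradientClipBase, any of the paddle.nn clip classes can be passed as grad_clip; a minimal dygraph sketch with an illustrative layer:

import paddle

linear = paddle.nn.Linear(8, 1)                       # toy layer, illustration only
clip = paddle.nn.ClipGradByValue(max=1.0, min=-1.0)   # any GradientClipBase subclass works
sgd = paddle.optimizer.SGD(
    learning_rate=0.1, parameters=linear.parameters(), grad_clip=clip
)
loss = paddle.mean(linear(paddle.rand([2, 8])))
loss.backward()
sgd.step()
sgd.clear_grad()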
@@ -38,13 +38,13 @@ with fluid.program_guard(main_program=prog):
    prog_clip = prog.clone()
    prog_clip.block(0).var(hidden1.name)._set_error_clip(
-        fluid.clip.ErrorClipByValue(max=CLIP_MAX, min=CLIP_MIN)
+        paddle.nn.clip.ErrorClipByValue(max=CLIP_MAX, min=CLIP_MIN)
    )
    avg_cost_clip = prog_clip.block(0).var(avg_cost.name)
    fluid.backward.append_backward(loss=avg_cost)
    fluid.backward.append_backward(
-        loss=avg_cost_clip, callbacks=[fluid.clip.error_clip_callback]
+        loss=avg_cost_clip, callbacks=[paddle.nn.clip.error_clip_callback]
    )
    hidden1_grad = prog.block(0).var(hidden1.name + "@GRAD")
......
@@ -122,7 +122,7 @@ class TestDistMnist2x2(TestDistRunnerBase):
        opt = paddle.optimizer.AdamW(
            learning_rate=lr_val,
-            grad_clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0),
+            grad_clip=paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0),
        )
        acc_steps = 2  # accumulated steps for pipeline
......
@@ -122,7 +122,7 @@ class TestDistMnist2x2(TestDistRunnerBase):
        opt = fluid.optimizer.Momentum(
            learning_rate=lr_val,
            momentum=0.9,
-            grad_clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0),
+            grad_clip=paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0),
        )
        acc_steps = 2  # accumulated steps for pipeline
......
@@ -15,10 +15,10 @@
import unittest
import paddle
-import paddle.fluid.clip as clip
import paddle.fluid.framework as framework
import paddle.fluid.optimizer as optimizer
import paddle.fluid.regularizer as regularizer
+import paddle.nn.clip as clip
paddle.enable_static()
@@ -76,7 +76,7 @@ class TestDGCMomentumOptimizer(unittest.TestCase):
            rampup_begin_step=0,
            num_trainers=2,
            regularization=regularization,
-            grad_clip=clip.GradientClipByNorm(1.0),
+            grad_clip=clip.ClipGradByNorm(1.0),
        )
        if use_recompute:
@@ -144,14 +144,14 @@ class TestDGCMomentumOptimizer(unittest.TestCase):
        print("dgc regular_coeff=" + str(coeff))
    def test_tpyeError(self):
-        # the type of DGCMomentumOptimizer(grad_clip=) must be 'GradientClipByNorm'
+        # the type of DGCMomentumOptimizer(grad_clip=) must be 'ClipGradByNorm'
        with self.assertRaises(TypeError):
            dgc_momentum_optimizer = self.MockDGCMomentum(
                learning_rate=0.01,
                momentum=0.2,
                rampup_begin_step=0,
                num_trainers=2,
-                grad_clip=clip.GradientClipByGlobalNorm(1.0),
+                grad_clip=clip.ClipGradByGlobalNorm(1.0),
            )
    def test_momentum_without_dgc(self):
......
@@ -354,7 +354,7 @@ class TestFleetHybridOptimizer(TestFleetMetaOptimizer):
        }
        strategy.fuse_all_reduce_ops = True
        strategy.fuse_grad_size_in_MB = 32
-        clip = paddle.fluid.clip.GradientClipByGlobalNorm(1.0)
+        clip = paddle.nn.ClipGradByGlobalNorm(1.0)
        self.optimizer(
            avg_cost, strategy, train_prog, startup_prog, grad_clip=clip
@@ -552,7 +552,7 @@ class TestFleetHybridOptimizer(TestFleetMetaOptimizer):
        strategy.fuse_all_reduce_ops = True
        strategy.fuse_grad_size_in_MB = 32
        strategy.fuse_grad_merge = True
-        clip = paddle.fluid.clip.GradientClipByGlobalNorm(1.0)
+        clip = paddle.nn.ClipGradByGlobalNorm(1.0)
        self.optimizer(
            avg_cost, strategy, train_prog, startup_prog, grad_clip=clip
@@ -940,7 +940,7 @@ class TestFleetHybridOptimizerBoundary(TestFleetMetaOptimizer):
        }
        strategy.fuse_all_reduce_ops = True
        strategy.fuse_grad_size_in_MB = 32
-        clip = paddle.fluid.clip.GradientClipByGlobalNorm(1.0)
+        clip = paddle.nn.ClipGradByGlobalNorm(1.0)
        self.optimizer(
            avg_cost, strategy, train_prog, startup_prog, grad_clip=clip
@@ -1044,7 +1044,7 @@ class TestFleetHybridOptimizerBoundary(TestFleetMetaOptimizer):
        }
        strategy.fuse_all_reduce_ops = True
        strategy.fuse_grad_size_in_MB = 32
-        clip = paddle.fluid.clip.GradientClipByGlobalNorm(1.0)
+        clip = paddle.nn.ClipGradByGlobalNorm(1.0)
        self.optimizer(
            avg_cost, strategy, train_prog, startup_prog, grad_clip=clip
......
@@ -640,7 +640,7 @@ class TestFleetShardingMetaOptimizer(TestFleetMetaOptimizer):
        )
        avg_cost, strategy = self.net(train_prog, startup_prog)
        self.set_strategy(strategy, 'sharding')
-        clip = paddle.fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0)
+        clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)
        self.optimizer(
            avg_cost, strategy, train_prog, startup_prog, grad_clip=clip
        )
@@ -1309,7 +1309,7 @@ class TestFleetShardingHybridOptimizer(TestFleetMetaOptimizer):
            "micro_batch_size": 2,
            "accumulate_steps": 4,
        }
-        clip = paddle.fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0)
+        clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)
        self.optimizer(
            avg_cost, strategy, train_prog, startup_prog, grad_clip=clip
        )
@@ -1547,7 +1547,7 @@ class TestFleetShardingHybridOptimizer(TestFleetMetaOptimizer):
            "micro_batch_size": 2,
            "accumulate_steps": 4,
        }
-        clip = paddle.fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0)
+        clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)
        self.optimizer(
            avg_cost,
            strategy,
......
@@ -22,8 +22,8 @@ import paddle
import paddle.distributed.fleet as fleet
import paddle.fluid.core as core
from paddle.distributed.fleet.meta_optimizers.common import CollectiveHelper
-from paddle.fluid.clip import ClipGradBase, _clip_by_global_norm_using_mp_type
from paddle.incubate import DistributedFusedLamb
+from paddle.nn.clip import ClipGradBase, _clip_by_global_norm_using_mp_type
from paddle.vision.models import resnet18 as resnet
......
@@ -19,6 +19,7 @@ import numpy as np
import paddle
import paddle.fluid as fluid
from paddle.jit.dy2static import Call
+from paddle.nn import clip
SEED = 2020
np.random.seed(SEED)
@@ -89,11 +90,11 @@ def len_with_selected_rows(place):
            type=fluid.core.VarDesc.VarType.SELECTED_ROWS,
        )
        # y is Variable(SelectedRows)
-        y = fluid.layers.merge_selected_rows(var)
+        y = clip.merge_selected_rows(var)
        y_len = Call(len)(y)
        # z is inner tensor with shape [4, 2]
-        z = fluid.layers.get_tensor_from_selected_rows(y)
+        z = clip.get_tensor_from_selected_rows(y)
        z_len = Call(len)(z)
        # set data for selected_rows
......
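The SelectedRows helpers now live under paddle.nn.clip; a static-graph sketch of the new call sites, modelled on the removed docstring example (variable names are illustrative):

import paddle
from paddle.nn import clip

paddle.enable_static()
block = paddle.static.default_main_program().global_block()
var = block.create_var(
    name="X", dtype="float32", persistable=True,
    type=paddle.framework.core.VarDesc.VarType.SELECTED_ROWS,
)
merged = clip.merge_selected_rows(var)               # was fluid.layers.merge_selected_rows
dense = clip.get_tensor_from_selected_rows(merged)   # was fluid.layers.get_tensor_from_selected_rows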
@@ -22,8 +22,8 @@ from seq2seq_dygraph_model import AttentionModel, BaseModel
from seq2seq_utils import Seq2SeqModelHyperParams, get_data_iter
import paddle.fluid as fluid
-from paddle.fluid.clip import GradientClipByGlobalNorm
from paddle.jit import ProgramTranslator
+from paddle.nn import ClipGradByGlobalNorm
place = (
    fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda() else fluid.CPUPlace()
@@ -71,7 +71,7 @@ def train(args, attn_model=False):
        dropout=args.dropout,
    )
-    gloabl_norm_clip = GradientClipByGlobalNorm(args.max_grad_norm)
+    gloabl_norm_clip = ClipGradByGlobalNorm(args.max_grad_norm)
    optimizer = fluid.optimizer.SGD(
        args.learning_rate,
        parameter_list=model.parameters(),
......
@@ -127,7 +127,7 @@ class ElementwiseActivationMkldnnFusePassTest_Add_Clip(
):
    def set_params(self):
        self.operand = paddle.add
-        self.act = fluid.layers.clip
+        self.act = paddle.clip
        self.act_alpha = 0.0
        self.act_beta = 10.0
@@ -219,7 +219,7 @@ class ElementwiseActivationMkldnnFusePassTest_Sub_Clip(
):
    def set_params(self):
        self.operand = paddle.subtract
-        self.act = fluid.layers.clip
+        self.act = paddle.clip
        self.act_alpha = 0.0
        self.act_beta = 10.0
@@ -319,7 +319,7 @@ class ElementwiseActivationMkldnnFusePassTest_Mul_Clip(
):
    def set_params(self):
        self.operand = paddle.multiply
-        self.act = fluid.layers.clip
+        self.act = paddle.clip
        self.act_alpha = 0.0
        self.act_beta = 10.0
......
@@ -106,7 +106,7 @@ class TensorRTSubgraphPassHardSwishPluginTest(
class TensorRTSubgraphPassClipTest(TensorRTSubgraphPassActivationTest):
    def append_act(self, x):
-        return fluid.layers.clip(x, 0, 1)
+        return paddle.clip(x, 0, 1)
class TensorRTSubgraphPassTanhTest(TensorRTSubgraphPassActivationTest):
......
@@ -117,13 +117,13 @@ class TestClipOpError(unittest.TestCase):
            input_data = np.random.random((2, 4)).astype("float32")
            def test_Variable():
-                fluid.layers.clip(x=input_data, min=-1.0, max=1.0)
+                paddle.clip(x=input_data, min=-1.0, max=1.0)
            self.assertRaises(TypeError, test_Variable)
            def test_dtype():
                x2 = fluid.layers.data(name='x2', shape=[1], dtype='int32')
-                fluid.layers.clip(x=x2, min=-1.0, max=1.0)
+                paddle.clip(x=x2, min=-1.0, max=1.0)
            self.assertRaises(TypeError, test_dtype)
        paddle.disable_static()
......
@@ -686,7 +686,7 @@ class TestAdamOpV2(unittest.TestCase):
        value = np.arange(26).reshape(2, 13).astype("float32")
        a = fluid.dygraph.to_variable(value)
        linear = paddle.nn.Linear(13, 5)
-        clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0)
+        clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)
        adam = paddle.optimizer.Adam(
            0.1, parameters=linear.parameters(), grad_clip=clip
        )
......
@@ -20,12 +20,13 @@ from op_test import OpTest
import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
+from paddle.nn import clip
class TestClipByNormOp(OpTest):
    def setUp(self):
        self.max_relative_error = 0.006
-        self.python_api = fluid.layers.clip_by_norm
+        self.python_api = clip.clip_by_norm
        self.init_dtype()
        self.initTestCase()
        input = np.random.random(self.shape).astype(self.dtype)
......
@@ -128,15 +128,9 @@ class TestClipOpError(unittest.TestCase):
            input_data = np.random.random((2, 4)).astype("float32")
            def test_Variable():
-                fluid.layers.clip(x=input_data, min=-1.0, max=1.0)
+                paddle.clip(x=input_data, min=-1.0, max=1.0)
            self.assertRaises(TypeError, test_Variable)
-            def test_dtype():
-                x2 = fluid.layers.data(name='x2', shape=[1], dtype='int32')
-                fluid.layers.clip(x=x2, min=-1.0, max=1.0)
-            self.assertRaises(TypeError, test_dtype)
        paddle.disable_static()
......
@@ -584,7 +584,7 @@ class TestL2Decay(TranspilerTest):
        def filter(param):
            return param.name == "fc_w"
-        clip = fluid.clip.GradientClipByValue(0.1, need_clip=filter)
+        clip = paddle.nn.ClipGradByValue(0.1, need_clip=filter)
        sgd_optimizer.minimize(avg_cost, grad_clip=clip)
    def transpiler_test_impl(self):
......
@@ -504,8 +504,8 @@ class PaddingRNNTestBase(unittest.TestCase):
            self.feed_order,
        ) = res_vars
-        fluid.clip.set_gradient_clip(
-            clip=fluid.clip.GradientClipByGlobalNorm(
+        paddle.nn.clip.set_gradient_clip(
+            clip=paddle.nn.ClipGradByGlobalNorm(
                clip_norm=config.max_grad_norm
            )
        )
......
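set_gradient_clip is likewise reached through paddle.nn.clip now; a minimal static-graph sketch (per the gradient-clip test later in this patch, it must be called before minimize to take effect):

import paddle

paddle.enable_static()
paddle.nn.clip.set_gradient_clip(
    clip=paddle.nn.ClipGradByGlobalNorm(clip_norm=5.0)
)
# ... build the network and call optimizer.minimize(loss) after this point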
@@ -64,7 +64,7 @@ class TestFleetExecutor(unittest.TestCase):
        )
        opt = paddle.optimizer.AdamW(
            learning_rate=lr_val,
-            grad_clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0),
+            grad_clip=paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0),
        )
        opt.minimize(loss)
        # TODO: section_program will be removed in the future
......
@@ -64,7 +64,7 @@ class TestFleetExecutor(unittest.TestCase):
        )
        opt = paddle.optimizer.AdamW(
            learning_rate=lr_val,
-            grad_clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0),
+            grad_clip=paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0),
        )
        opt.minimize(loss)
        # TODO: section_program will be removed in the future
......
@@ -47,7 +47,7 @@ class TestFleetExecutor(unittest.TestCase):
        )
        opt = paddle.optimizer.AdamW(
            learning_rate=lr_val,
-            grad_clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0),
+            grad_clip=paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0),
        )
        opt.minimize(loss)
        # TODO: section_program will be removed in the future
......
@@ -20,6 +20,7 @@ import paddle.fluid as fluid
import paddle.fluid.core as core
from paddle.fluid import Program, program_guard
from paddle.fluid.op import Operator
+from paddle.nn import clip
class TestGetTensorFromSelectedRowsError(unittest.TestCase):
@@ -31,12 +32,12 @@ class TestGetTensorFromSelectedRowsError(unittest.TestCase):
        x_data = np.random.random((2, 4)).astype("float32")
        def test_Variable():
-            fluid.layers.get_tensor_from_selected_rows(x=x_data)
+            clip.get_tensor_from_selected_rows(x=x_data)
        self.assertRaises(TypeError, test_Variable)
        def test_SELECTED_ROWS():
-            fluid.layers.get_tensor_from_selected_rows(x=x_var)
+            clip.get_tensor_from_selected_rows(x=x_var)
        self.assertRaises(TypeError, test_SELECTED_ROWS)
......
@@ -17,12 +17,8 @@ import unittest
import numpy as np
import paddle.fluid as fluid
-from paddle.fluid.clip import (
-    GradientClipByGlobalNorm,
-    GradientClipByNorm,
-    GradientClipByValue,
-)
from paddle.fluid.dygraph.base import to_variable
+from paddle.nn import ClipGradByGlobalNorm, ClipGradByNorm, ClipGradByValue
class TestGradClipByGlobalNorm(unittest.TestCase):
@@ -67,7 +63,7 @@ class TestGradClipByGlobalNorm(unittest.TestCase):
    def get_dygrap_global_norm_result(self):
        with fluid.dygraph.guard():
-            gloabl_norm_clip = GradientClipByGlobalNorm(self.max_global_norm)
+            gloabl_norm_clip = ClipGradByGlobalNorm(self.max_global_norm)
            p_g_var = []
            for p, g in self.para_and_grad:
                new_p = to_variable(p)
@@ -142,7 +138,7 @@ class TestGradClipByNorm(unittest.TestCase):
    def get_dygrap_norm_result(self):
        with fluid.dygraph.guard():
-            norm_clip = GradientClipByNorm(self.max_norm)
+            norm_clip = ClipGradByNorm(self.max_norm)
            p_g_var = []
            for p, g in self.para_and_grad:
                new_p = to_variable(p)
@@ -212,9 +208,7 @@ class TestGradClipByValue(unittest.TestCase):
    def get_dygrap_clip_result(self):
        with fluid.dygraph.guard():
-            value_clip = GradientClipByValue(
-                max=self.max_value, min=self.min_value
-            )
+            value_clip = ClipGradByValue(max=self.max_value, min=self.min_value)
            p_g_var = []
            for p, g in self.para_and_grad:
                new_p = to_variable(p)
......
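These tests call the clip object directly on a (param, grad) list, which is also how the classes can be used outside an optimizer; a small sketch with made-up tensors:

import paddle

p = paddle.zeros([3])                                   # stand-in parameter
g = paddle.to_tensor([0.05, 0.15, 0.30])                # stand-in gradient
value_clip = paddle.nn.ClipGradByValue(max=0.2, min=0.1)
clipped = value_clip([(p, g)])                          # returns a new (param, grad) list
# clipped[0][1] is expected to be [0.10, 0.15, 0.20]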
@@ -20,7 +20,7 @@ from fake_reader import fake_imdb_reader
import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
-from paddle.fluid.clip import _allow_pure_fp16_global_norm_clip
+from paddle.nn.clip import _allow_pure_fp16_global_norm_clip
paddle.enable_static()
@@ -173,9 +173,9 @@ class TestGradientClipByGlobalNorm(TestGradientClip):
    # test whether the output is right when use 'set_gradient_clip'
    def test_old_gradient_clip(self):
        def func(params_grads):
-            clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=self.clip_norm)
-            fluid.clip.set_gradient_clip(clip)
-            return fluid.clip.append_gradient_clip_ops(params_grads)
+            clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=self.clip_norm)
+            paddle.nn.clip.set_gradient_clip(clip)
+            return paddle.nn.clip.append_gradient_clip_ops(params_grads)
        self.clip_gradient = func
        self.check_gradient_clip(fluid.CPUPlace())
@@ -183,7 +183,7 @@ class TestGradientClipByGlobalNorm(TestGradientClip):
    # test whether the output is right when use grad_clip
    def test_new_gradient_clip(self):
        def func(params_grads):
-            clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=self.clip_norm)
+            clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=self.clip_norm)
            return clip(params_grads)
        self.clip_gradient = func
@@ -192,7 +192,7 @@ class TestGradientClipByGlobalNorm(TestGradientClip):
    # test whether the output is right when use grad_clip under float64
    def test_new_gradient_clip_fp64(self):
        def func(params_grads):
-            clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=self.clip_norm)
+            clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=self.clip_norm)
            return clip(params_grads)
        self.clip_gradient = func
@@ -201,15 +201,15 @@ class TestGradientClipByGlobalNorm(TestGradientClip):
    # invoke 'set_gradient_clip' in a wrong order
    def test_wrong_API_order(self):
        def backward_func(cost):
-            clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=5.0)
-            fluid.clip.set_gradient_clip(clip)
+            clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=5.0)
+            paddle.nn.clip.set_gradient_clip(clip)
            sgd_optimizer = fluid.optimizer.SGD(
                learning_rate=0.01, grad_clip=clip
            )
            # if 'set_gradient_clip' and 'optimize(grad_clip)' together, 'set_gradient_clip' will be ineffective
            sgd_optimizer.minimize(cost)
            # 'set_gradient_clip' must before 'minimize', otherwise, 'set_gradient_clip' will be ineffective
-            fluid.clip.set_gradient_clip(clip)
+            paddle.nn.clip.set_gradient_clip(clip)
        self.backward_and_optimize = backward_func
        for place in self.get_places():
@@ -269,7 +269,7 @@ class TestGradientClipByGlobalNorm(TestGradientClip):
        with fluid.program_guard(
            main_program=prog, startup_program=startup_program
        ):
-            clip = fluid.clip.GradientClipByGlobalNorm(self.clip_norm)
+            clip = paddle.nn.ClipGradByGlobalNorm(self.clip_norm)
            x = (
                fluid.default_main_program()
                .global_block()
@@ -313,7 +313,7 @@ class TestGradientClipByNorm(TestGradientClip):
    # test whether the output is right when use grad_clip
    def test_gradient_clip(self):
        def func(params_grads):
-            clip = fluid.clip.GradientClipByNorm(clip_norm=self.clip_norm)
+            clip = paddle.nn.ClipGradByNorm(clip_norm=self.clip_norm)
            return clip(params_grads)
        self.clip_gradient = func
@@ -321,7 +321,7 @@ class TestGradientClipByNorm(TestGradientClip):
    # if grad is None or not need clip
    def test_none_grad(self):
-        clip = fluid.clip.GradientClipByNorm(self.clip_norm)
+        clip = paddle.nn.ClipGradByNorm(self.clip_norm)
        x = (
            fluid.default_main_program()
            .global_block()
@@ -371,7 +371,7 @@ class TestGradientClipByValue(TestGradientClip):
    # test whether the output is right when use grad_clip
    def test_gradient_clip(self):
        def func(params_grads):
-            clip = fluid.clip.GradientClipByValue(max=self.max, min=self.min)
+            clip = paddle.nn.ClipGradByValue(max=self.max, min=self.min)
            return clip(params_grads)
        self.clip_gradient = func
@@ -379,7 +379,7 @@ class TestGradientClipByValue(TestGradientClip):
    # if grad is None or not need clip
    def test_none_grad(self):
-        clip = fluid.clip.GradientClipByValue(self.max, self.min)
+        clip = paddle.nn.ClipGradByValue(self.max, self.min)
        x = (
            fluid.default_main_program()
            .global_block()
@@ -419,7 +419,7 @@ class TestDygraphGradientClip(unittest.TestCase):
            sgd_optimizer = fluid.optimizer.SGD(
                learning_rate=0.0,
                parameter_list=linear.parameters(),
-                grad_clip=fluid.clip.GradientClipByGlobalNorm(0.1),
+                grad_clip=paddle.nn.ClipGradByGlobalNorm(0.1),
            )
            self.check_clip_result(loss, sgd_optimizer)
@@ -430,12 +430,8 @@ class TestDygraphGradientClip(unittest.TestCase):
class TestDygraphGradientClipByGlobalNorm(TestDygraphGradientClip):
    def setUp(self):
        self.clip_norm = 0.8
-        self.clip1 = fluid.clip.GradientClipByGlobalNorm(
-            clip_norm=self.clip_norm
-        )
-        self.clip2 = fluid.clip.GradientClipByGlobalNorm(
-            clip_norm=self.clip_norm
-        )
+        self.clip1 = paddle.nn.ClipGradByGlobalNorm(clip_norm=self.clip_norm)
+        self.clip2 = paddle.nn.ClipGradByGlobalNorm(clip_norm=self.clip_norm)
    def check_clip_result(self, loss, optimizer):
        # if grad is None
@@ -476,7 +472,7 @@ class TestDygraphGradientClipByGlobalNorm(TestDygraphGradientClip):
class TestDygraphGradientClipByNorm(TestDygraphGradientClip):
    def setUp(self):
        self.clip_norm = 0.8
-        self.clip = fluid.clip.GradientClipByNorm(clip_norm=self.clip_norm)
+        self.clip = paddle.nn.ClipGradByNorm(clip_norm=self.clip_norm)
    def check_clip_result(self, loss, optimizer):
        # if grad is None
@@ -506,7 +502,7 @@ class TestDygraphGradientClipByValue(TestDygraphGradientClip):
    def setUp(self):
        self.max = 0.2
        self.min = 0.1
-        self.clip = fluid.clip.GradientClipByValue(max=self.max, min=self.min)
+        self.clip = paddle.nn.ClipGradByValue(max=self.max, min=self.min)
    def check_clip_result(self, loss, optimizer):
        # if grad is None
@@ -572,7 +568,7 @@ class TestDygraphGradientClipFP16(unittest.TestCase):
                    params_grads.append((param, param._grad_ivar()))
                _, grads = zip(*params_grads)
                # clip grads
-                clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=0.8)
+                clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=0.8)
                params_grads = clip(params_grads)
                _, grads_clip = zip(*params_grads)
                # param update
@@ -616,7 +612,7 @@ class TestDygraphGradientClipFP64(unittest.TestCase):
                params_grads.append((param, param._grad_ivar()))
            _, grads = zip(*params_grads)
            # clip grads
-            clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=0.1)
+            clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=0.1)
            params_grads = clip(params_grads)
            _, grads_clip = zip(*params_grads)
......
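The test_old_gradient_clip / test_new_gradient_clip pair above contrasts the program-level set_gradient_clip route with passing the clip object to the optimizer; a short sketch of the newer style under the migrated names:

import paddle

paddle.enable_static()
clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)
sgd = paddle.optimizer.SGD(learning_rate=0.01, grad_clip=clip)
# sgd.minimize(loss) would then insert the clipping ops; no set_gradient_clip call is needed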
@@ -361,7 +361,7 @@ class TestImperativeAutoPrune(unittest.TestCase):
        place = fluid.CPUPlace()
        with fluid.dygraph.guard(place):
            model = MyLayer(size, vocab_size, size)
-            grad_clip = fluid.clip.GradientClipByGlobalNorm(0.001)
+            grad_clip = paddle.nn.ClipGradByGlobalNorm(0.001)
            optimizer = fluid.optimizer.AdamOptimizer(
                0.001, parameter_list=model.parameters(), grad_clip=grad_clip
            )
@@ -380,7 +380,7 @@ class TestImperativeAutoPrune(unittest.TestCase):
        with fluid.dygraph.guard(place):
            model = MyLayer2(size, vocab_size, size)
-            grad_clip = fluid.clip.GradientClipByGlobalNorm(0.001)
+            grad_clip = paddle.nn.ClipGradByGlobalNorm(0.001)
            optimizer = fluid.optimizer.AdamOptimizer(
                0.001, parameter_list=model.parameters(), grad_clip=grad_clip
            )
......
@@ -52,7 +52,7 @@ class TestSimpleNet(unittest.TestCase):
                fluid.set_flags(
                    {'FLAGS_sort_sum_gradient': sort_sum_gradient}
                )
-                # grad_clip = fluid.clip.GradientClipByGlobalNorm(5.0)
+                # grad_clip = paddle.nn.ClipGradByGlobalNorm(5.0)
                input_word = np.array([[1, 2], [2, 1]]).astype('int64')
                input = paddle.to_tensor(input_word)
@@ -91,7 +91,7 @@ class TestSimpleNet(unittest.TestCase):
                fluid.set_flags(
                    {'FLAGS_sort_sum_gradient': sort_sum_gradient}
                )
-                grad_clip = fluid.clip.GradientClipByGlobalNorm(5.0)
+                grad_clip = paddle.nn.ClipGradByGlobalNorm(5.0)
                input_word = np.array([[1, 2], [2, 1]]).astype('int64')
                input = to_variable(input_word)
......
@@ -131,13 +131,13 @@ class TestClipOpError(unittest.TestCase):
            input_data = np.random.random((2, 4)).astype("float32")
            def test_Variable():
-                fluid.layers.clip(x=input_data, min=-1.0, max=1.0)
+                paddle.clip(x=input_data, min=-1.0, max=1.0)
            self.assertRaises(TypeError, test_Variable)
            def test_dtype():
                x2 = fluid.layers.data(name='x2', shape=[1], dtype='int32')
-                fluid.layers.clip(x=x2, min=-1.0, max=1.0)
+                paddle.clip(x=x2, min=-1.0, max=1.0)
            self.assertRaises(TypeError, test_dtype)
        paddle.disable_static()
......
@@ -1535,7 +1535,7 @@ class Model:
                assert isinstance(
                    self._optimizer._grad_clip,
                    (paddle.nn.ClipGradByGlobalNorm, paddle.nn.ClipGradByNorm),
-                ), "Only GradientClipByNorm and GradientClipByGlobalNorm are supported in amp training with level=O2 currently."
+                ), "Only ClipGradByNorm and ClipGradByGlobalNorm are supported in amp training with level=O2 currently."
                self._adapter._amp_custom_lists = {}
                self._adapter._amp_configs = {}
......
@@ -15,13 +15,14 @@
import paddle
import paddle.distributed as dist
from paddle.fluid import core, layers
-from paddle.fluid.clip import ClipGradBase, _squared_l2_norm
from paddle.fluid.dygraph import base as imperative_base
+from paddle.nn import clip
+from paddle.nn.clip import ClipGradBase, _squared_l2_norm
class ClipGradForMOEByGlobalNorm(ClipGradBase):
    r"""
-    The Algrithm is the same as paddle.fluid.clip.ClipGradByGlobalNorm
+    The Algrithm is the same as paddle.nn.ClipGradByGlobalNorm
    Given a list of Tensor :math:`t\_list` , calculate the global norm for the elements of all tensors in
    :math:`t\_list` , and limit it to ``clip_norm`` .
@@ -113,8 +114,8 @@ class ClipGradForMOEByGlobalNorm(ClipGradBase):
                continue
            merge_grad = g
            if g.type == core.VarDesc.VarType.SELECTED_ROWS:
-                merge_grad = layers.merge_selected_rows(g)
-                merge_grad = layers.get_tensor_from_selected_rows(merge_grad)
+                merge_grad = clip.merge_selected_rows(g)
+                merge_grad = clip.get_tensor_from_selected_rows(merge_grad)
            sum_square = _squared_l2_norm(merge_grad)
            if sum_square.dtype == core.VarDesc.VarType.FP16:
                sum_square_list_fp16.append(sum_square)
......
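The docstring above says the rule matches paddle.nn.ClipGradByGlobalNorm; a small numeric sketch of that rule (gradient values chosen for easy arithmetic):

import paddle

grads = [paddle.to_tensor([3.0, 4.0]), paddle.to_tensor([12.0])]
clip_norm = 1.0
global_norm = paddle.sqrt(
    paddle.add_n([paddle.sum(paddle.square(g)) for g in grads])
)  # sqrt(9 + 16 + 144) = 13
scale = clip_norm / paddle.maximum(global_norm, paddle.to_tensor(clip_norm))
clipped = [g * scale for g in grads]   # every gradient is scaled by 1/13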
@@ -16,11 +16,11 @@ import os
import paddle
from paddle.fluid import core, framework, unique_name
-from paddle.fluid.clip import ClipGradByGlobalNorm
from paddle.fluid.executor import global_scope
from paddle.fluid.framework import Variable, name_scope
from paddle.fluid.layer_helper import LayerHelper
from paddle.fluid.optimizer import Optimizer
+from paddle.nn import ClipGradByGlobalNorm
def init_communicator(block, rank, ranks, ring_id):
......
This diff has been collapsed.
@@ -20,10 +20,10 @@ import paddle
from .. import _C_ops
from ..fluid import core, framework, unique_name
-from ..fluid.clip import GradientClipBase
from ..fluid.dygraph import base as imperative_base
from ..fluid.framework import Parameter, Variable
from ..fluid.layer_helper import LayerHelper
+from ..nn.clip import GradientClipBase
from .lr import LRScheduler
from .optimizer import Optimizer
......
@@ -18,6 +18,7 @@ from collections import defaultdict
import numpy as np
import paddle
+import paddle.autograd as imperative_base
from paddle import _C_ops
from paddle.fluid import core
from paddle.fluid.framework import (
@@ -32,12 +33,6 @@ from paddle.fluid.framework import (
from ..fluid import framework, unique_name
from ..fluid.backward import _get_no_grad_set_name, append_backward
-from ..fluid.clip import (
-    GradientClipBase,
-    append_gradient_clip_ops,
-    error_clip_callback,
-)
-from ..fluid.dygraph import base as imperative_base
from ..fluid.framework import Parameter, program_guard
from ..fluid.initializer import Constant
from ..fluid.layer_helper import LayerHelper
@@ -168,7 +163,7 @@ class Optimizer:
    """
-    @imperative_base.no_grad
+    @imperative_base.no_grad()
    def __init__(
        self,
        learning_rate,
@@ -225,7 +220,7 @@ class Optimizer:
                % type(learning_rate)
            )
        if grad_clip is not None:
-            if not isinstance(grad_clip, GradientClipBase):
+            if not isinstance(grad_clip, paddle.nn.clip.GradientClipBase):
                raise TypeError(
                    "'grad_clip' should be an instance of GradientClipBase's derived class"
                )
@@ -1042,7 +1037,7 @@ class Optimizer:
                params_grads.append((parameter_list[index], grad))
        else:
            if callbacks is None:
-                callbacks = [error_clip_callback]
+                callbacks = [paddle.nn.clip.error_clip_callback]
            else:
                assert isinstance(callbacks, list)
            program = loss.block.program
@@ -1103,7 +1098,7 @@ class Optimizer:
            params_grads = self._grad_clip(params_grads)
        else:
-            params_grads = append_gradient_clip_ops(params_grads)
+            params_grads = paddle.nn.clip.append_gradient_clip_ops(params_grads)
        # Add regularization if any
        params_grads = self.append_regularization_ops(
@@ -1317,7 +1312,7 @@ class Optimizer:
        else:
            core.clear_gradients(param_list, set_to_zero)
-    @imperative_base.no_grad
+    @imperative_base.no_grad()
    def minimize(
        self, loss, startup_program=None, parameters=None, no_grad_set=None
    ):
@@ -1380,7 +1375,7 @@ class Optimizer:
        return optimize_ops, params_grads
-    @imperative_base.no_grad
+    @imperative_base.no_grad()
    @framework.dygraph_only
    def step(self):
        """
......
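The decorator change above (@imperative_base.no_grad to @imperative_base.no_grad()) follows from imperative_base now being paddle.autograd, whose no_grad is instantiated before use; a minimal sketch:

import paddle

@paddle.autograd.no_grad()
def eval_step(x):
    # runs without building a backward graph
    return paddle.mean(x * 2.0)

x = paddle.rand([2, 3])
x.stop_gradient = False
out = eval_step(x)   # gradients are not tracked for this call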