Unverified commit fe0dc40d, authored by 骑马小猫, committed by GitHub

[FluidAPI]remove clip api (#48946)

Parent 822ea0f9
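For orientation, a minimal sketch (not part of the diff) of the migration this commit performs: gradient-clip classes move from `paddle.fluid.clip` (`GradientClipBy*`) to `paddle.nn` (`ClipGradBy*`), and the clip helpers in `fluid.layers` are replaced by `paddle.clip` and `paddle.nn.clip`. The model and optimizer below are illustrative assumptions, not code from this PR.

```python
import paddle

# Old (removed in this PR): from paddle.fluid.clip import GradientClipByGlobalNorm
# New location of the same functionality:
clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)

# Hypothetical model/optimizer, only to show where grad_clip is passed.
linear = paddle.nn.Linear(13, 5)
opt = paddle.optimizer.Adam(
    learning_rate=0.1, parameters=linear.parameters(), grad_clip=clip
)
```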
......@@ -20,11 +20,11 @@ __all__ = []
import paddle
from paddle.common_ops_import import LayerHelper
from paddle.fluid.clip import GradientClipByNorm, append_gradient_clip_ops
from paddle.fluid.dygraph import base as imperative_base
from paddle.fluid.framework import in_dygraph_mode
from paddle.fluid.optimizer import Momentum, Optimizer
from paddle.framework import core
from paddle.nn.clip import ClipGradByNorm, append_gradient_clip_ops
from paddle.static import create_global_var
......@@ -76,9 +76,9 @@ class DGCMomentumOptimizer(Optimizer):
self._dgc_clip_norm = None
if grad_clip is not None:
if not isinstance(grad_clip, GradientClipByNorm):
if not isinstance(grad_clip, ClipGradByNorm):
raise TypeError(
"The type of grad_clip should be 'GradientClipByNorm', because DGCMomentumOptimizer only support GradientClipByNorm"
"The type of grad_clip should be 'ClipGradByNorm', because DGCMomentumOptimizer only support ClipGradByNorm"
)
assert isinstance(num_trainers, int), (
"The type of num_trainers should be 'int', but received %s"
......
......@@ -15,9 +15,8 @@
import paddle
from paddle import framework
from paddle.autograd import no_grad
from paddle.fluid import layers
from paddle.fluid.clip import ClipGradByGlobalNorm
from paddle.framework import core
from paddle.nn import ClipGradByGlobalNorm, clip
from ...base.topology import ParallelMode
from ...utils.hybrid_parallel_util import (
......@@ -62,8 +61,8 @@ class HybridParallelClipGrad:
continue
merge_grad = g
if g.type == core.VarDesc.VarType.SELECTED_ROWS:
merge_grad = layers.merge_selected_rows(g)
merge_grad = layers.get_tensor_from_selected_rows(merge_grad)
merge_grad = clip.merge_selected_rows(g)
merge_grad = clip.get_tensor_from_selected_rows(merge_grad)
square = paddle.square(merge_grad)
sum_square = paddle.sum(square)
......
......@@ -30,7 +30,7 @@ import paddle
import paddle.distributed as dist
from paddle.distributed import ParallelMode, fleet
from paddle.fluid import core
from paddle.fluid.clip import ClipGradByGlobalNorm
from paddle.nn import ClipGradByGlobalNorm
from paddle.optimizer import Optimizer
HybridParallelClipGrad = (
......
......@@ -25,8 +25,8 @@ import paddle.fluid.framework as framework
from paddle import nn
from paddle.autograd import PyLayer
from paddle.distributed import collective
from paddle.fluid.clip import ClipGradByGlobalNorm
from paddle.fluid.framework import EagerParamBase
from paddle.nn import ClipGradByGlobalNorm
from .group_sharded_storage import GradStorage
from .group_sharded_utils import GroupShardedClipGrad, Type, device_guard
......
......@@ -23,6 +23,7 @@ from paddle import _legacy_C_ops
from paddle.fluid import core, layers
from paddle.fluid.dygraph import to_variable
from paddle.fluid.framework import dygraph_only
from paddle.nn import clip
class Taskflow:
......@@ -65,8 +66,8 @@ class GroupShardedClipGrad:
merge_grad = g
if g.type == core.VarDesc.VarType.SELECTED_ROWS:
merge_grad = layers.get_tensor_from_selected_rows(
layers.merge_selected_rows(g)
merge_grad = clip.get_tensor_from_selected_rows(
clip.merge_selected_rows(g)
)
square = paddle.square(merge_grad)
sum_square = paddle.sum(square)
......
......@@ -159,7 +159,7 @@ def auc(stat_pos, stat_neg, scope=None, util=None):
.. code-block:: python
# in model.py
similarity_norm = fluid.layers.sigmoid(fluid.layers.clip(output, min=-15.0, max=15.0))
similarity_norm = fluid.layers.sigmoid(paddle.clip(output, min=-15.0, max=15.0))
binary_predict = fluid.layers.concat(
input=[paddle.subtract(fluid.layers.ceil(similarity_norm), similarity_norm), similarity_norm], axis=1)
self.auc, batch_auc, [batch_stat_pos, batch_stat_neg, stat_pos, stat_neg] =
......
......@@ -90,7 +90,6 @@ from .transpiler import (
DistributeTranspilerConfig,
)
from .lod_tensor import create_lod_tensor, create_random_int_lodtensor
from . import clip
from . import profiler
from . import unique_name
from . import parallel_executor
......@@ -164,7 +163,6 @@ __all__ = (
'ParamAttr',
'WeightNormParamAttr',
'DataFeeder',
'clip',
'profiler',
'unique_name',
'Scope',
......
This diff is collapsed.
......@@ -185,7 +185,7 @@ class FleetUtil:
# below is part of model
emb = my_slot_net(slots, label) # emb can be fc layer of size 1
similarity_norm = fluid.layers.sigmoid(fluid.layers.clip(\
similarity_norm = fluid.layers.sigmoid(paddle.clip(\
emb, min=-15.0, max=15.0), name="similarity_norm")\
binary_predict = fluid.layers.concat(input=[\
paddle.subtract(\
......@@ -1374,7 +1374,7 @@ class FleetUtil:
label = fluid.layers.data(name="click", shape=[-1, 1],\
dtype="int64", lod_level=0, append_batch_size=False)
emb = my_slot_net(slots, label) # emb can be fc layer of size 1
similarity_norm = fluid.layers.sigmoid(fluid.layers.clip(\
similarity_norm = fluid.layers.sigmoid(paddle.clip(\
emb, min=-15.0, max=15.0), name="similarity_norm")\
binary_predict = fluid.layers.concat(input=[\
paddle.subtract(\
......@@ -1574,7 +1574,7 @@ class FleetUtil:
label = fluid.layers.data(name="click", shape=[-1, 1],\
dtype="int64", lod_level=0, append_batch_size=False)
emb = my_slot_net(slots, label) # emb can be fc layer of size 1
similarity_norm = fluid.layers.sigmoid(fluid.layers.clip(\
similarity_norm = fluid.layers.sigmoid(paddle.clip(\
emb, min=-15.0, max=15.0), name="similarity_norm")\
binary_predict = fluid.layers.concat(input=[\
paddle.subtract(\
......
......@@ -63,10 +63,6 @@ __all__ = [
'fc',
'embedding',
'autoincreased_step_counter',
'clip',
'clip_by_norm',
'merge_selected_rows',
'get_tensor_from_selected_rows',
]
OP_NAMEMAPPING = {
......@@ -997,199 +993,3 @@ def _logical_op(op_name, x, y, out=None, name=None, binary_op=True):
)
return out
@templatedoc()
def clip(x, min, max, name=None):
"""
:old_api: paddle.fluid.layers.clip
${comment}
Args:
x(${x_type}): ${x_comment}
min(float): ${min_comment}
max(float): ${max_comment}
name(str, optional): The default value is None.
Normally there is no need for user to set this property.
For more information, please refer to :ref:`api_guide_Name`
Returns:
${out_comment}
Return Type:
${out_type}
Examples:
.. code-block:: python
import paddle.fluid as fluid
input = fluid.data(
name='data', shape=[1], dtype='float32')
reward = fluid.layers.clip(x=input, min=-1.0, max=1.0)
"""
helper = LayerHelper("clip", **locals())
check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'clip')
if name is None:
name = unique_name.generate_with_ignorable_key(
".".join([helper.name, 'tmp'])
)
out = helper.create_variable(
type=x.type, name=name, dtype=x.dtype, persistable=False
)
helper.append_op(
type="clip",
inputs={"X": x},
attrs={"min": min, "max": max},
outputs={"Out": out},
)
return out
@templatedoc()
def clip_by_norm(x, max_norm, name=None):
"""
${comment}
Args:
x(${x_type}): ${x_comment}
max_norm(${max_norm_type}): ${max_norm_comment}
name(str, optional): For detailed information, please refer
to :ref:`api_guide_Name`. Usually name is no need to set and
None by default.
Returns:
Tensor:
out(${out_type}): ${out_comment}
Examples:
.. code-block:: python
import paddle
import paddle.fluid as fluid
input = paddle.to_tensor([[2.0, 2.0], [2.0, 2.0]], dtype='float32')
reward = fluid.layers.clip_by_norm(x=input, max_norm=1.0)
# [[0.5, 0.5], [0.5, 0.5]]
"""
if in_dygraph_mode():
return _C_ops.clip_by_norm(x, max_norm)
else:
helper = LayerHelper("clip_by_norm", **locals())
check_variable_and_dtype(x, 'X', ['float32', 'float16'], 'clip_by_norm')
check_type(max_norm, 'max_norm', (float), 'clip_by_norm')
if name is None:
name = unique_name.generate_with_ignorable_key(
".".join([helper.name, 'tmp'])
)
out = helper.create_variable(
type=x.type, name=name, dtype=x.dtype, persistable=False
)
helper.append_op(
type="clip_by_norm",
inputs={"X": x},
attrs={"max_norm": max_norm},
outputs={"Out": out},
)
return out
@templatedoc()
def merge_selected_rows(x, name=None):
"""
${comment}
Args:
x(${x_type}): ${x_comment}
name(basestring|None): Name of the output.
Returns:
out(${out_type}): ${out_comment}
Examples:
.. code-block:: python
import paddle.fluid as fluid
b = fluid.default_main_program().global_block()
var = b.create_var(
name="X", dtype="float32", persistable=True,
type=fluid.core.VarDesc.VarType.SELECTED_ROWS)
y = fluid.layers.merge_selected_rows(var)
"""
if in_dygraph_mode():
return _C_ops.merge_selected_rows(x)
else:
helper = LayerHelper("merge_selected_rows", **locals())
out = helper.create_variable_for_type_inference(dtype=x.dtype)
helper.append_op(
type="merge_selected_rows",
inputs={"X": x},
attrs={},
outputs={"Out": out},
)
return out
@templatedoc()
def get_tensor_from_selected_rows(x, name=None):
"""
This operator gets tensor data from input with SelectedRows type, and outputs a LoDTensor.
.. code-block:: text
input x is SelectedRows:
x.rows = [0, 5, 5, 4, 19]
x.height = 20
x.value = [[1, 1] [2, 2] [2, 2] [3, 3] [6, 6]]
Output is LoDTensor:
out.shape = [5, 2]
out.data = [[1, 1],
[2, 2],
[2, 2],
[3, 3],
[6, 6]]
Args:
x(SelectedRows): Input with SelectedRows type. The data type is float32, float64, int32 or int64.
name(str, optional): The default value is None. Normally there is no need for user to set this property.
For more information, please refer to :ref:`api_guide_Name` .
Returns:
Variable: LoDTensor transformed from SelectedRows. The data type is same with input.
Examples:
.. code-block:: python
import paddle.fluid as fluid
b = fluid.default_main_program().global_block()
input = b.create_var(name="X", dtype="float32", persistable=True, type=fluid.core.VarDesc.VarType.SELECTED_ROWS)
out = fluid.layers.get_tensor_from_selected_rows(input)
"""
check_type(x, 'x', Variable, 'get_tensor_from_selected_rows')
if x.type != core.VarDesc.VarType.SELECTED_ROWS:
raise TypeError(
"The type of 'x' in get_tensor_from_selected_rows must be SELECTED_ROWS."
)
helper = LayerHelper('get_tensor_from_selected_rows', **locals())
out = helper.create_variable_for_type_inference(dtype=x.dtype)
helper.append_op(
type='get_tensor_from_selected_rows',
inputs={'X': x},
outputs={'Out': out},
attrs={},
)
return out
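The hunk above deletes `clip`, `clip_by_norm`, `merge_selected_rows`, and `get_tensor_from_selected_rows` from `fluid.layers`. A minimal sketch of the replacements used by the call sites updated elsewhere in this diff; the input tensor is an illustrative assumption:

```python
import paddle
from paddle.nn import clip

x = paddle.to_tensor([[2.0, 2.0], [2.0, 2.0]], dtype='float32')

# fluid.layers.clip(x, min, max)         -> paddle.clip(x, min=..., max=...)
clipped = paddle.clip(x, min=-1.0, max=1.0)

# fluid.layers.clip_by_norm(x, max_norm) -> paddle.nn.clip.clip_by_norm(x, max_norm)
scaled = clip.clip_by_norm(x, max_norm=1.0)  # [[0.5, 0.5], [0.5, 0.5]]

# For SelectedRows variables in static-graph code:
# fluid.layers.merge_selected_rows(v)            -> clip.merge_selected_rows(v)
# fluid.layers.get_tensor_from_selected_rows(v)  -> clip.get_tensor_from_selected_rows(v)
```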
......@@ -38,13 +38,6 @@ from .backward import (
_append_grad_suffix_,
_get_no_grad_set_name,
)
from .clip import (
GradientClipBase,
GradientClipByNorm,
error_clip_callback,
append_gradient_clip_ops,
ClipGradByGlobalNorm,
)
from .framework import program_guard
from .initializer import Constant
from .layer_helper import LayerHelper
......@@ -160,7 +153,7 @@ class Optimizer:
)
if grad_clip is not None:
if not isinstance(grad_clip, GradientClipBase):
if not isinstance(grad_clip, paddle.nn.clip.GradientClipBase):
raise TypeError(
"'grad_clip' should be an instance of GradientClipBase's derived class"
)
......@@ -1030,7 +1023,7 @@ class Optimizer:
params_grads.append((param, grad_var))
else:
if callbacks is None:
callbacks = [error_clip_callback]
callbacks = [paddle.nn.clip.error_clip_callback]
else:
assert isinstance(callbacks, list)
program = loss.block.program
......@@ -1260,7 +1253,7 @@ class Optimizer:
# NOTE(zhiqiu): currently, only support ClipGradByGlobalNorm and without regularization.
if self._flatten_param_grads and self.regularization is None:
if self._grad_clip is None or isinstance(
self._grad_clip, ClipGradByGlobalNorm
self._grad_clip, paddle.nn.ClipGradByGlobalNorm
):
params_grads = self.flatten_param_grads(params_grads)
......@@ -1268,7 +1261,7 @@ class Optimizer:
if self._grad_clip is not None:
params_grads = self._grad_clip(params_grads)
else:
params_grads = append_gradient_clip_ops(params_grads)
params_grads = paddle.nn.clip.append_gradient_clip_ops(params_grads)
# Add regularization if any
params_grads = self.append_regularization_ops(
......
......@@ -38,13 +38,13 @@ with fluid.program_guard(main_program=prog):
prog_clip = prog.clone()
prog_clip.block(0).var(hidden1.name)._set_error_clip(
fluid.clip.ErrorClipByValue(max=CLIP_MAX, min=CLIP_MIN)
paddle.nn.clip.ErrorClipByValue(max=CLIP_MAX, min=CLIP_MIN)
)
avg_cost_clip = prog_clip.block(0).var(avg_cost.name)
fluid.backward.append_backward(loss=avg_cost)
fluid.backward.append_backward(
loss=avg_cost_clip, callbacks=[fluid.clip.error_clip_callback]
loss=avg_cost_clip, callbacks=[paddle.nn.clip.error_clip_callback]
)
hidden1_grad = prog.block(0).var(hidden1.name + "@GRAD")
......
......@@ -122,7 +122,7 @@ class TestDistMnist2x2(TestDistRunnerBase):
opt = paddle.optimizer.AdamW(
learning_rate=lr_val,
grad_clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0),
grad_clip=paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0),
)
acc_steps = 2 # accumulated steps for pipeline
......
......@@ -122,7 +122,7 @@ class TestDistMnist2x2(TestDistRunnerBase):
opt = fluid.optimizer.Momentum(
learning_rate=lr_val,
momentum=0.9,
grad_clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0),
grad_clip=paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0),
)
acc_steps = 2 # accumulated steps for pipeline
......
......@@ -15,10 +15,10 @@
import unittest
import paddle
import paddle.fluid.clip as clip
import paddle.fluid.framework as framework
import paddle.fluid.optimizer as optimizer
import paddle.fluid.regularizer as regularizer
import paddle.nn.clip as clip
paddle.enable_static()
......@@ -76,7 +76,7 @@ class TestDGCMomentumOptimizer(unittest.TestCase):
rampup_begin_step=0,
num_trainers=2,
regularization=regularization,
grad_clip=clip.GradientClipByNorm(1.0),
grad_clip=clip.ClipGradByNorm(1.0),
)
if use_recompute:
......@@ -144,14 +144,14 @@ class TestDGCMomentumOptimizer(unittest.TestCase):
print("dgc regular_coeff=" + str(coeff))
def test_tpyeError(self):
# the type of DGCMomentumOptimizer(grad_clip=) must be 'GradientClipByNorm'
# the type of DGCMomentumOptimizer(grad_clip=) must be 'ClipGradByNorm'
with self.assertRaises(TypeError):
dgc_momentum_optimizer = self.MockDGCMomentum(
learning_rate=0.01,
momentum=0.2,
rampup_begin_step=0,
num_trainers=2,
grad_clip=clip.GradientClipByGlobalNorm(1.0),
grad_clip=clip.ClipGradByGlobalNorm(1.0),
)
def test_momentum_without_dgc(self):
......
......@@ -354,7 +354,7 @@ class TestFleetHybridOptimizer(TestFleetMetaOptimizer):
}
strategy.fuse_all_reduce_ops = True
strategy.fuse_grad_size_in_MB = 32
clip = paddle.fluid.clip.GradientClipByGlobalNorm(1.0)
clip = paddle.nn.ClipGradByGlobalNorm(1.0)
self.optimizer(
avg_cost, strategy, train_prog, startup_prog, grad_clip=clip
......@@ -552,7 +552,7 @@ class TestFleetHybridOptimizer(TestFleetMetaOptimizer):
strategy.fuse_all_reduce_ops = True
strategy.fuse_grad_size_in_MB = 32
strategy.fuse_grad_merge = True
clip = paddle.fluid.clip.GradientClipByGlobalNorm(1.0)
clip = paddle.nn.ClipGradByGlobalNorm(1.0)
self.optimizer(
avg_cost, strategy, train_prog, startup_prog, grad_clip=clip
......@@ -940,7 +940,7 @@ class TestFleetHybridOptimizerBoundary(TestFleetMetaOptimizer):
}
strategy.fuse_all_reduce_ops = True
strategy.fuse_grad_size_in_MB = 32
clip = paddle.fluid.clip.GradientClipByGlobalNorm(1.0)
clip = paddle.nn.ClipGradByGlobalNorm(1.0)
self.optimizer(
avg_cost, strategy, train_prog, startup_prog, grad_clip=clip
......@@ -1044,7 +1044,7 @@ class TestFleetHybridOptimizerBoundary(TestFleetMetaOptimizer):
}
strategy.fuse_all_reduce_ops = True
strategy.fuse_grad_size_in_MB = 32
clip = paddle.fluid.clip.GradientClipByGlobalNorm(1.0)
clip = paddle.nn.ClipGradByGlobalNorm(1.0)
self.optimizer(
avg_cost, strategy, train_prog, startup_prog, grad_clip=clip
......
......@@ -640,7 +640,7 @@ class TestFleetShardingMetaOptimizer(TestFleetMetaOptimizer):
)
avg_cost, strategy = self.net(train_prog, startup_prog)
self.set_strategy(strategy, 'sharding')
clip = paddle.fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0)
clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)
self.optimizer(
avg_cost, strategy, train_prog, startup_prog, grad_clip=clip
)
......@@ -1309,7 +1309,7 @@ class TestFleetShardingHybridOptimizer(TestFleetMetaOptimizer):
"micro_batch_size": 2,
"accumulate_steps": 4,
}
clip = paddle.fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0)
clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)
self.optimizer(
avg_cost, strategy, train_prog, startup_prog, grad_clip=clip
)
......@@ -1547,7 +1547,7 @@ class TestFleetShardingHybridOptimizer(TestFleetMetaOptimizer):
"micro_batch_size": 2,
"accumulate_steps": 4,
}
clip = paddle.fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0)
clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)
self.optimizer(
avg_cost,
strategy,
......
......@@ -22,8 +22,8 @@ import paddle
import paddle.distributed.fleet as fleet
import paddle.fluid.core as core
from paddle.distributed.fleet.meta_optimizers.common import CollectiveHelper
from paddle.fluid.clip import ClipGradBase, _clip_by_global_norm_using_mp_type
from paddle.incubate import DistributedFusedLamb
from paddle.nn.clip import ClipGradBase, _clip_by_global_norm_using_mp_type
from paddle.vision.models import resnet18 as resnet
......
......@@ -19,6 +19,7 @@ import numpy as np
import paddle
import paddle.fluid as fluid
from paddle.jit.dy2static import Call
from paddle.nn import clip
SEED = 2020
np.random.seed(SEED)
......@@ -89,11 +90,11 @@ def len_with_selected_rows(place):
type=fluid.core.VarDesc.VarType.SELECTED_ROWS,
)
# y is Variable(SelectedRows)
y = fluid.layers.merge_selected_rows(var)
y = clip.merge_selected_rows(var)
y_len = Call(len)(y)
# z is inner tensor with shape [4, 2]
z = fluid.layers.get_tensor_from_selected_rows(y)
z = clip.get_tensor_from_selected_rows(y)
z_len = Call(len)(z)
# set data for selected_rows
......
......@@ -22,8 +22,8 @@ from seq2seq_dygraph_model import AttentionModel, BaseModel
from seq2seq_utils import Seq2SeqModelHyperParams, get_data_iter
import paddle.fluid as fluid
from paddle.fluid.clip import GradientClipByGlobalNorm
from paddle.jit import ProgramTranslator
from paddle.nn import ClipGradByGlobalNorm
place = (
fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda() else fluid.CPUPlace()
......@@ -71,7 +71,7 @@ def train(args, attn_model=False):
dropout=args.dropout,
)
gloabl_norm_clip = GradientClipByGlobalNorm(args.max_grad_norm)
gloabl_norm_clip = ClipGradByGlobalNorm(args.max_grad_norm)
optimizer = fluid.optimizer.SGD(
args.learning_rate,
parameter_list=model.parameters(),
......
......@@ -127,7 +127,7 @@ class ElementwiseActivationMkldnnFusePassTest_Add_Clip(
):
def set_params(self):
self.operand = paddle.add
self.act = fluid.layers.clip
self.act = paddle.clip
self.act_alpha = 0.0
self.act_beta = 10.0
......@@ -219,7 +219,7 @@ class ElementwiseActivationMkldnnFusePassTest_Sub_Clip(
):
def set_params(self):
self.operand = paddle.subtract
self.act = fluid.layers.clip
self.act = paddle.clip
self.act_alpha = 0.0
self.act_beta = 10.0
......@@ -319,7 +319,7 @@ class ElementwiseActivationMkldnnFusePassTest_Mul_Clip(
):
def set_params(self):
self.operand = paddle.multiply
self.act = fluid.layers.clip
self.act = paddle.clip
self.act_alpha = 0.0
self.act_beta = 10.0
......
......@@ -106,7 +106,7 @@ class TensorRTSubgraphPassHardSwishPluginTest(
class TensorRTSubgraphPassClipTest(TensorRTSubgraphPassActivationTest):
def append_act(self, x):
return fluid.layers.clip(x, 0, 1)
return paddle.clip(x, 0, 1)
class TensorRTSubgraphPassTanhTest(TensorRTSubgraphPassActivationTest):
......
......@@ -117,13 +117,13 @@ class TestClipOpError(unittest.TestCase):
input_data = np.random.random((2, 4)).astype("float32")
def test_Variable():
fluid.layers.clip(x=input_data, min=-1.0, max=1.0)
paddle.clip(x=input_data, min=-1.0, max=1.0)
self.assertRaises(TypeError, test_Variable)
def test_dtype():
x2 = fluid.layers.data(name='x2', shape=[1], dtype='int32')
fluid.layers.clip(x=x2, min=-1.0, max=1.0)
paddle.clip(x=x2, min=-1.0, max=1.0)
self.assertRaises(TypeError, test_dtype)
paddle.disable_static()
......
......@@ -686,7 +686,7 @@ class TestAdamOpV2(unittest.TestCase):
value = np.arange(26).reshape(2, 13).astype("float32")
a = fluid.dygraph.to_variable(value)
linear = paddle.nn.Linear(13, 5)
clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0)
clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)
adam = paddle.optimizer.Adam(
0.1, parameters=linear.parameters(), grad_clip=clip
)
......
......@@ -20,12 +20,13 @@ from op_test import OpTest
import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
from paddle.nn import clip
class TestClipByNormOp(OpTest):
def setUp(self):
self.max_relative_error = 0.006
self.python_api = fluid.layers.clip_by_norm
self.python_api = clip.clip_by_norm
self.init_dtype()
self.initTestCase()
input = np.random.random(self.shape).astype(self.dtype)
......
......@@ -128,15 +128,9 @@ class TestClipOpError(unittest.TestCase):
input_data = np.random.random((2, 4)).astype("float32")
def test_Variable():
fluid.layers.clip(x=input_data, min=-1.0, max=1.0)
paddle.clip(x=input_data, min=-1.0, max=1.0)
self.assertRaises(TypeError, test_Variable)
def test_dtype():
x2 = fluid.layers.data(name='x2', shape=[1], dtype='int32')
fluid.layers.clip(x=x2, min=-1.0, max=1.0)
self.assertRaises(TypeError, test_dtype)
paddle.disable_static()
......
......@@ -584,7 +584,7 @@ class TestL2Decay(TranspilerTest):
def filter(param):
return param.name == "fc_w"
clip = fluid.clip.GradientClipByValue(0.1, need_clip=filter)
clip = paddle.nn.ClipGradByValue(0.1, need_clip=filter)
sgd_optimizer.minimize(avg_cost, grad_clip=clip)
def transpiler_test_impl(self):
......
......@@ -504,8 +504,8 @@ class PaddingRNNTestBase(unittest.TestCase):
self.feed_order,
) = res_vars
fluid.clip.set_gradient_clip(
clip=fluid.clip.GradientClipByGlobalNorm(
paddle.nn.clip.set_gradient_clip(
clip=paddle.nn.ClipGradByGlobalNorm(
clip_norm=config.max_grad_norm
)
)
......
......@@ -64,7 +64,7 @@ class TestFleetExecutor(unittest.TestCase):
)
opt = paddle.optimizer.AdamW(
learning_rate=lr_val,
grad_clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0),
grad_clip=paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0),
)
opt.minimize(loss)
# TODO: section_program will be removed in the future
......
......@@ -64,7 +64,7 @@ class TestFleetExecutor(unittest.TestCase):
)
opt = paddle.optimizer.AdamW(
learning_rate=lr_val,
grad_clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0),
grad_clip=paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0),
)
opt.minimize(loss)
# TODO: section_program will be removed in the future
......
......@@ -47,7 +47,7 @@ class TestFleetExecutor(unittest.TestCase):
)
opt = paddle.optimizer.AdamW(
learning_rate=lr_val,
grad_clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0),
grad_clip=paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0),
)
opt.minimize(loss)
# TODO: section_program will be removed in the future
......
......@@ -20,6 +20,7 @@ import paddle.fluid as fluid
import paddle.fluid.core as core
from paddle.fluid import Program, program_guard
from paddle.fluid.op import Operator
from paddle.nn import clip
class TestGetTensorFromSelectedRowsError(unittest.TestCase):
......@@ -31,12 +32,12 @@ class TestGetTensorFromSelectedRowsError(unittest.TestCase):
x_data = np.random.random((2, 4)).astype("float32")
def test_Variable():
fluid.layers.get_tensor_from_selected_rows(x=x_data)
clip.get_tensor_from_selected_rows(x=x_data)
self.assertRaises(TypeError, test_Variable)
def test_SELECTED_ROWS():
fluid.layers.get_tensor_from_selected_rows(x=x_var)
clip.get_tensor_from_selected_rows(x=x_var)
self.assertRaises(TypeError, test_SELECTED_ROWS)
......
......@@ -17,12 +17,8 @@ import unittest
import numpy as np
import paddle.fluid as fluid
from paddle.fluid.clip import (
GradientClipByGlobalNorm,
GradientClipByNorm,
GradientClipByValue,
)
from paddle.fluid.dygraph.base import to_variable
from paddle.nn import ClipGradByGlobalNorm, ClipGradByNorm, ClipGradByValue
class TestGradClipByGlobalNorm(unittest.TestCase):
......@@ -67,7 +63,7 @@ class TestGradClipByGlobalNorm(unittest.TestCase):
def get_dygrap_global_norm_result(self):
with fluid.dygraph.guard():
gloabl_norm_clip = GradientClipByGlobalNorm(self.max_global_norm)
gloabl_norm_clip = ClipGradByGlobalNorm(self.max_global_norm)
p_g_var = []
for p, g in self.para_and_grad:
new_p = to_variable(p)
......@@ -142,7 +138,7 @@ class TestGradClipByNorm(unittest.TestCase):
def get_dygrap_norm_result(self):
with fluid.dygraph.guard():
norm_clip = GradientClipByNorm(self.max_norm)
norm_clip = ClipGradByNorm(self.max_norm)
p_g_var = []
for p, g in self.para_and_grad:
new_p = to_variable(p)
......@@ -212,9 +208,7 @@ class TestGradClipByValue(unittest.TestCase):
def get_dygrap_clip_result(self):
with fluid.dygraph.guard():
value_clip = GradientClipByValue(
max=self.max_value, min=self.min_value
)
value_clip = ClipGradByValue(max=self.max_value, min=self.min_value)
p_g_var = []
for p, g in self.para_and_grad:
new_p = to_variable(p)
......
......@@ -20,7 +20,7 @@ from fake_reader import fake_imdb_reader
import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
from paddle.fluid.clip import _allow_pure_fp16_global_norm_clip
from paddle.nn.clip import _allow_pure_fp16_global_norm_clip
paddle.enable_static()
......@@ -173,9 +173,9 @@ class TestGradientClipByGlobalNorm(TestGradientClip):
# test whether the output is right when use 'set_gradient_clip'
def test_old_gradient_clip(self):
def func(params_grads):
clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=self.clip_norm)
fluid.clip.set_gradient_clip(clip)
return fluid.clip.append_gradient_clip_ops(params_grads)
clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=self.clip_norm)
paddle.nn.clip.set_gradient_clip(clip)
return paddle.nn.clip.append_gradient_clip_ops(params_grads)
self.clip_gradient = func
self.check_gradient_clip(fluid.CPUPlace())
......@@ -183,7 +183,7 @@ class TestGradientClipByGlobalNorm(TestGradientClip):
# test whether the output is right when use grad_clip
def test_new_gradient_clip(self):
def func(params_grads):
clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=self.clip_norm)
clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=self.clip_norm)
return clip(params_grads)
self.clip_gradient = func
......@@ -192,7 +192,7 @@ class TestGradientClipByGlobalNorm(TestGradientClip):
# test whether the output is right when use grad_clip under float64
def test_new_gradient_clip_fp64(self):
def func(params_grads):
clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=self.clip_norm)
clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=self.clip_norm)
return clip(params_grads)
self.clip_gradient = func
......@@ -201,15 +201,15 @@ class TestGradientClipByGlobalNorm(TestGradientClip):
# invoke 'set_gradient_clip' in a wrong order
def test_wrong_API_order(self):
def backward_func(cost):
clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=5.0)
fluid.clip.set_gradient_clip(clip)
clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=5.0)
paddle.nn.clip.set_gradient_clip(clip)
sgd_optimizer = fluid.optimizer.SGD(
learning_rate=0.01, grad_clip=clip
)
# if 'set_gradient_clip' and 'optimize(grad_clip)' together, 'set_gradient_clip' will be ineffective
sgd_optimizer.minimize(cost)
# 'set_gradient_clip' must before 'minimize', otherwise, 'set_gradient_clip' will be ineffective
fluid.clip.set_gradient_clip(clip)
paddle.nn.clip.set_gradient_clip(clip)
self.backward_and_optimize = backward_func
for place in self.get_places():
......@@ -269,7 +269,7 @@ class TestGradientClipByGlobalNorm(TestGradientClip):
with fluid.program_guard(
main_program=prog, startup_program=startup_program
):
clip = fluid.clip.GradientClipByGlobalNorm(self.clip_norm)
clip = paddle.nn.ClipGradByGlobalNorm(self.clip_norm)
x = (
fluid.default_main_program()
.global_block()
......@@ -313,7 +313,7 @@ class TestGradientClipByNorm(TestGradientClip):
# test whether the output is right when use grad_clip
def test_gradient_clip(self):
def func(params_grads):
clip = fluid.clip.GradientClipByNorm(clip_norm=self.clip_norm)
clip = paddle.nn.ClipGradByNorm(clip_norm=self.clip_norm)
return clip(params_grads)
self.clip_gradient = func
......@@ -321,7 +321,7 @@ class TestGradientClipByNorm(TestGradientClip):
# if grad is None or not need clip
def test_none_grad(self):
clip = fluid.clip.GradientClipByNorm(self.clip_norm)
clip = paddle.nn.ClipGradByNorm(self.clip_norm)
x = (
fluid.default_main_program()
.global_block()
......@@ -371,7 +371,7 @@ class TestGradientClipByValue(TestGradientClip):
# test whether the output is right when use grad_clip
def test_gradient_clip(self):
def func(params_grads):
clip = fluid.clip.GradientClipByValue(max=self.max, min=self.min)
clip = paddle.nn.ClipGradByValue(max=self.max, min=self.min)
return clip(params_grads)
self.clip_gradient = func
......@@ -379,7 +379,7 @@ class TestGradientClipByValue(TestGradientClip):
# if grad is None or not need clip
def test_none_grad(self):
clip = fluid.clip.GradientClipByValue(self.max, self.min)
clip = paddle.nn.ClipGradByValue(self.max, self.min)
x = (
fluid.default_main_program()
.global_block()
......@@ -419,7 +419,7 @@ class TestDygraphGradientClip(unittest.TestCase):
sgd_optimizer = fluid.optimizer.SGD(
learning_rate=0.0,
parameter_list=linear.parameters(),
grad_clip=fluid.clip.GradientClipByGlobalNorm(0.1),
grad_clip=paddle.nn.ClipGradByGlobalNorm(0.1),
)
self.check_clip_result(loss, sgd_optimizer)
......@@ -430,12 +430,8 @@ class TestDygraphGradientClip(unittest.TestCase):
class TestDygraphGradientClipByGlobalNorm(TestDygraphGradientClip):
def setUp(self):
self.clip_norm = 0.8
self.clip1 = fluid.clip.GradientClipByGlobalNorm(
clip_norm=self.clip_norm
)
self.clip2 = fluid.clip.GradientClipByGlobalNorm(
clip_norm=self.clip_norm
)
self.clip1 = paddle.nn.ClipGradByGlobalNorm(clip_norm=self.clip_norm)
self.clip2 = paddle.nn.ClipGradByGlobalNorm(clip_norm=self.clip_norm)
def check_clip_result(self, loss, optimizer):
# if grad is None
......@@ -476,7 +472,7 @@ class TestDygraphGradientClipByGlobalNorm(TestDygraphGradientClip):
class TestDygraphGradientClipByNorm(TestDygraphGradientClip):
def setUp(self):
self.clip_norm = 0.8
self.clip = fluid.clip.GradientClipByNorm(clip_norm=self.clip_norm)
self.clip = paddle.nn.ClipGradByNorm(clip_norm=self.clip_norm)
def check_clip_result(self, loss, optimizer):
# if grad is None
......@@ -506,7 +502,7 @@ class TestDygraphGradientClipByValue(TestDygraphGradientClip):
def setUp(self):
self.max = 0.2
self.min = 0.1
self.clip = fluid.clip.GradientClipByValue(max=self.max, min=self.min)
self.clip = paddle.nn.ClipGradByValue(max=self.max, min=self.min)
def check_clip_result(self, loss, optimizer):
# if grad is None
......@@ -572,7 +568,7 @@ class TestDygraphGradientClipFP16(unittest.TestCase):
params_grads.append((param, param._grad_ivar()))
_, grads = zip(*params_grads)
# clip grads
clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=0.8)
clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=0.8)
params_grads = clip(params_grads)
_, grads_clip = zip(*params_grads)
# param update
......@@ -616,7 +612,7 @@ class TestDygraphGradientClipFP64(unittest.TestCase):
params_grads.append((param, param._grad_ivar()))
_, grads = zip(*params_grads)
# clip grads
clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=0.1)
clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=0.1)
params_grads = clip(params_grads)
_, grads_clip = zip(*params_grads)
......
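For readers following the gradient-clip test updates above, a minimal dygraph sketch of the call pattern those tests exercise; the `Linear` model and random input are assumptions for illustration only:

```python
import paddle

linear = paddle.nn.Linear(4, 4)
loss = linear(paddle.rand([2, 4])).sum()
loss.backward()

# A ClipGradBy* instance is callable on a list of (param, grad) pairs.
clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=0.8)
params_grads = [(p, p.grad) for p in linear.parameters() if p.grad is not None]
params_grads = clip(params_grads)
```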
......@@ -361,7 +361,7 @@ class TestImperativeAutoPrune(unittest.TestCase):
place = fluid.CPUPlace()
with fluid.dygraph.guard(place):
model = MyLayer(size, vocab_size, size)
grad_clip = fluid.clip.GradientClipByGlobalNorm(0.001)
grad_clip = paddle.nn.ClipGradByGlobalNorm(0.001)
optimizer = fluid.optimizer.AdamOptimizer(
0.001, parameter_list=model.parameters(), grad_clip=grad_clip
)
......@@ -380,7 +380,7 @@ class TestImperativeAutoPrune(unittest.TestCase):
with fluid.dygraph.guard(place):
model = MyLayer2(size, vocab_size, size)
grad_clip = fluid.clip.GradientClipByGlobalNorm(0.001)
grad_clip = paddle.nn.ClipGradByGlobalNorm(0.001)
optimizer = fluid.optimizer.AdamOptimizer(
0.001, parameter_list=model.parameters(), grad_clip=grad_clip
)
......
......@@ -52,7 +52,7 @@ class TestSimpleNet(unittest.TestCase):
fluid.set_flags(
{'FLAGS_sort_sum_gradient': sort_sum_gradient}
)
# grad_clip = fluid.clip.GradientClipByGlobalNorm(5.0)
# grad_clip = paddle.nn.ClipGradByGlobalNorm(5.0)
input_word = np.array([[1, 2], [2, 1]]).astype('int64')
input = paddle.to_tensor(input_word)
......@@ -91,7 +91,7 @@ class TestSimpleNet(unittest.TestCase):
fluid.set_flags(
{'FLAGS_sort_sum_gradient': sort_sum_gradient}
)
grad_clip = fluid.clip.GradientClipByGlobalNorm(5.0)
grad_clip = paddle.nn.ClipGradByGlobalNorm(5.0)
input_word = np.array([[1, 2], [2, 1]]).astype('int64')
input = to_variable(input_word)
......
......@@ -131,13 +131,13 @@ class TestClipOpError(unittest.TestCase):
input_data = np.random.random((2, 4)).astype("float32")
def test_Variable():
fluid.layers.clip(x=input_data, min=-1.0, max=1.0)
paddle.clip(x=input_data, min=-1.0, max=1.0)
self.assertRaises(TypeError, test_Variable)
def test_dtype():
x2 = fluid.layers.data(name='x2', shape=[1], dtype='int32')
fluid.layers.clip(x=x2, min=-1.0, max=1.0)
paddle.clip(x=x2, min=-1.0, max=1.0)
self.assertRaises(TypeError, test_dtype)
paddle.disable_static()
......
......@@ -1535,7 +1535,7 @@ class Model:
assert isinstance(
self._optimizer._grad_clip,
(paddle.nn.ClipGradByGlobalNorm, paddle.nn.ClipGradByNorm),
), "Only GradientClipByNorm and GradientClipByGlobalNorm are supported in amp training with level=O2 currently."
), "Only ClipGradByNorm and ClipGradByGlobalNorm are supported in amp training with level=O2 currently."
self._adapter._amp_custom_lists = {}
self._adapter._amp_configs = {}
......
......@@ -15,13 +15,14 @@
import paddle
import paddle.distributed as dist
from paddle.fluid import core, layers
from paddle.fluid.clip import ClipGradBase, _squared_l2_norm
from paddle.fluid.dygraph import base as imperative_base
from paddle.nn import clip
from paddle.nn.clip import ClipGradBase, _squared_l2_norm
class ClipGradForMOEByGlobalNorm(ClipGradBase):
r"""
The Algrithm is the same as paddle.fluid.clip.ClipGradByGlobalNorm
The Algrithm is the same as paddle.nn.ClipGradByGlobalNorm
Given a list of Tensor :math:`t\_list` , calculate the global norm for the elements of all tensors in
:math:`t\_list` , and limit it to ``clip_norm`` .
......@@ -113,8 +114,8 @@ class ClipGradForMOEByGlobalNorm(ClipGradBase):
continue
merge_grad = g
if g.type == core.VarDesc.VarType.SELECTED_ROWS:
merge_grad = layers.merge_selected_rows(g)
merge_grad = layers.get_tensor_from_selected_rows(merge_grad)
merge_grad = clip.merge_selected_rows(g)
merge_grad = clip.get_tensor_from_selected_rows(merge_grad)
sum_square = _squared_l2_norm(merge_grad)
if sum_square.dtype == core.VarDesc.VarType.FP16:
sum_square_list_fp16.append(sum_square)
......
......@@ -16,11 +16,11 @@ import os
import paddle
from paddle.fluid import core, framework, unique_name
from paddle.fluid.clip import ClipGradByGlobalNorm
from paddle.fluid.executor import global_scope
from paddle.fluid.framework import Variable, name_scope
from paddle.fluid.layer_helper import LayerHelper
from paddle.fluid.optimizer import Optimizer
from paddle.nn import ClipGradByGlobalNorm
def init_communicator(block, rank, ranks, ring_id):
......
This diff is collapsed.
......@@ -20,10 +20,10 @@ import paddle
from .. import _C_ops
from ..fluid import core, framework, unique_name
from ..fluid.clip import GradientClipBase
from ..fluid.dygraph import base as imperative_base
from ..fluid.framework import Parameter, Variable
from ..fluid.layer_helper import LayerHelper
from ..nn.clip import GradientClipBase
from .lr import LRScheduler
from .optimizer import Optimizer
......
......@@ -18,6 +18,7 @@ from collections import defaultdict
import numpy as np
import paddle
import paddle.autograd as imperative_base
from paddle import _C_ops
from paddle.fluid import core
from paddle.fluid.framework import (
......@@ -32,12 +33,6 @@ from paddle.fluid.framework import (
from ..fluid import framework, unique_name
from ..fluid.backward import _get_no_grad_set_name, append_backward
from ..fluid.clip import (
GradientClipBase,
append_gradient_clip_ops,
error_clip_callback,
)
from ..fluid.dygraph import base as imperative_base
from ..fluid.framework import Parameter, program_guard
from ..fluid.initializer import Constant
from ..fluid.layer_helper import LayerHelper
......@@ -168,7 +163,7 @@ class Optimizer:
"""
@imperative_base.no_grad
@imperative_base.no_grad()
def __init__(
self,
learning_rate,
......@@ -225,7 +220,7 @@ class Optimizer:
% type(learning_rate)
)
if grad_clip is not None:
if not isinstance(grad_clip, GradientClipBase):
if not isinstance(grad_clip, paddle.nn.clip.GradientClipBase):
raise TypeError(
"'grad_clip' should be an instance of GradientClipBase's derived class"
)
......@@ -1042,7 +1037,7 @@ class Optimizer:
params_grads.append((parameter_list[index], grad))
else:
if callbacks is None:
callbacks = [error_clip_callback]
callbacks = [paddle.nn.clip.error_clip_callback]
else:
assert isinstance(callbacks, list)
program = loss.block.program
......@@ -1103,7 +1098,7 @@ class Optimizer:
params_grads = self._grad_clip(params_grads)
else:
params_grads = append_gradient_clip_ops(params_grads)
params_grads = paddle.nn.clip.append_gradient_clip_ops(params_grads)
# Add regularization if any
params_grads = self.append_regularization_ops(
......@@ -1317,7 +1312,7 @@ class Optimizer:
else:
core.clear_gradients(param_list, set_to_zero)
@imperative_base.no_grad
@imperative_base.no_grad()
def minimize(
self, loss, startup_program=None, parameters=None, no_grad_set=None
):
......@@ -1380,7 +1375,7 @@ class Optimizer:
return optimize_ops, params_grads
@imperative_base.no_grad
@imperative_base.no_grad()
@framework.dygraph_only
def step(self):
"""
......