未验证 提交 42c8d51a 编写于 作者: H HongyuJia 提交者: GitHub

Clean up fluid elementwise_max: replace layers.elementwise_max with paddle.maximum (#47866)

上级 2182a4f9
...@@ -160,7 +160,7 @@ class HybridParallelClipGrad: ...@@ -160,7 +160,7 @@ class HybridParallelClipGrad:
) )
clip_var = layers.elementwise_div( clip_var = layers.elementwise_div(
x=max_global_norm, x=max_global_norm,
y=layers.elementwise_max(x=global_norm_var_fp32, y=max_global_norm), y=paddle.maximum(x=global_norm_var_fp32, y=max_global_norm),
) )
clip_var_fp16 = paddle.cast(clip_var, paddle.float16) clip_var_fp16 = paddle.cast(clip_var, paddle.float16)
for p, g in params_grads: for p, g in params_grads:
......
...@@ -468,7 +468,7 @@ class AdaptiveLocalSGDOptimizer(MetaOptimizerBase): ...@@ -468,7 +468,7 @@ class AdaptiveLocalSGDOptimizer(MetaOptimizerBase):
next_local_steps = layers.elementwise_min( next_local_steps = layers.elementwise_min(
next_local_steps, max_local_steps next_local_steps, max_local_steps
) )
next_local_steps = layers.elementwise_max( next_local_steps = paddle.maximum(
next_local_steps, min_local_steps next_local_steps, min_local_steps
) )
layers.assign(next_local_steps, k_steps) layers.assign(next_local_steps, k_steps)
......
...@@ -141,7 +141,7 @@ class GroupShardedClipGrad: ...@@ -141,7 +141,7 @@ class GroupShardedClipGrad:
clip_var = layers.elementwise_div( clip_var = layers.elementwise_div(
x=max_global_norm, x=max_global_norm,
y=layers.elementwise_max(x=global_norm_var, y=max_global_norm), y=paddle.maximum(x=global_norm_var, y=max_global_norm),
) )
clip_var_fp16 = paddle.cast(clip_var, paddle.float16) clip_var_fp16 = paddle.cast(clip_var, paddle.float16)
......
...@@ -138,7 +138,7 @@ class ShardingClipGrad: ...@@ -138,7 +138,7 @@ class ShardingClipGrad:
clip_var = layers.elementwise_div( clip_var = layers.elementwise_div(
x=max_global_norm, x=max_global_norm,
y=layers.elementwise_max(x=global_norm_var, y=max_global_norm), y=paddle.maximum(x=global_norm_var, y=max_global_norm),
) )
clip_var_fp16 = paddle.cast(clip_var, paddle.float16) clip_var_fp16 = paddle.cast(clip_var, paddle.float16)
......
...@@ -550,7 +550,7 @@ class ClipGradByGlobalNorm(ClipGradBase): ...@@ -550,7 +550,7 @@ class ClipGradByGlobalNorm(ClipGradBase):
need_clip = True need_clip = True
clip_var = layers.elementwise_div( clip_var = layers.elementwise_div(
x=max_global_norm, x=max_global_norm,
y=layers.elementwise_max(x=global_norm_var, y=max_global_norm), y=paddle.maximum(x=global_norm_var, y=max_global_norm),
) )
elif global_norm_var > max_global_norm: elif global_norm_var > max_global_norm:
# only when global_norm_var > max_global_norm, grad need clip # only when global_norm_var > max_global_norm, grad need clip
...@@ -654,9 +654,7 @@ class ClipGradByGlobalNorm(ClipGradBase): ...@@ -654,9 +654,7 @@ class ClipGradByGlobalNorm(ClipGradBase):
) )
scale_var = layers.elementwise_div( scale_var = layers.elementwise_div(
x=max_global_norm, x=max_global_norm,
y=layers.elementwise_max( y=paddle.maximum(x=max_global_norm, y=global_norm_var),
x=max_global_norm, y=global_norm_var
),
) )
param_new_grad_name_dict = dict() param_new_grad_name_dict = dict()
for p, g in params_grads: for p, g in params_grads:
...@@ -733,7 +731,7 @@ class ClipGradByGlobalNorm(ClipGradBase): ...@@ -733,7 +731,7 @@ class ClipGradByGlobalNorm(ClipGradBase):
clip_var = self.context[self.group_name + "_clip"] clip_var = self.context[self.group_name + "_clip"]
group_scale_var = layers.elementwise_div( group_scale_var = layers.elementwise_div(
x=clip_var, x=clip_var,
y=layers.elementwise_max(x=clip_var, y=group_norm_var), y=paddle.maximum(x=clip_var, y=group_norm_var),
) )
assert group_scale_var.shape == (1,) assert group_scale_var.shape == (1,)
self.context[group_scale_name] = group_scale_var self.context[group_scale_name] = group_scale_var
......
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
import math import math
import warnings import warnings
import paddle
from .. import unique_name from .. import unique_name
from ..framework import Variable from ..framework import Variable
from ..data_feeder import check_type from ..data_feeder import check_type
...@@ -977,11 +978,9 @@ class ReduceLROnPlateau(LearningRateDecay): ...@@ -977,11 +978,9 @@ class ReduceLROnPlateau(LearningRateDecay):
self.num_bad_epochs += 1 self.num_bad_epochs += 1
if self.num_bad_epochs > self.patience: if self.num_bad_epochs > self.patience:
from .. import layers
self.cooldown_counter = self.cooldown self.cooldown_counter = self.cooldown
self.num_bad_epochs = 0 self.num_bad_epochs = 0
new_lr = layers.elementwise_max( new_lr = paddle.maximum(
self.learning_rate * self.decay_rate, self.min_lr self.learning_rate * self.decay_rate, self.min_lr
) )
if self.learning_rate - new_lr > self.eps: if self.learning_rate - new_lr > self.eps:
......
...@@ -74,7 +74,7 @@ def get_loss(cos_q_pt, cos_q_nt): ...@@ -74,7 +74,7 @@ def get_loss(cos_q_pt, cos_q_nt):
cos_q_pt, cos_q_pt,
) )
loss_op2 = fluid.layers.elementwise_add(loss_op1, cos_q_nt) loss_op2 = fluid.layers.elementwise_add(loss_op1, cos_q_nt)
loss_op3 = fluid.layers.elementwise_max( loss_op3 = paddle.maximum(
fluid.layers.fill_constant_batch_size_like( fluid.layers.fill_constant_batch_size_like(
input=loss_op2, shape=[-1, 1], value=0.0, dtype='float32' input=loss_op2, shape=[-1, 1], value=0.0, dtype='float32'
), ),
......
...@@ -12,6 +12,8 @@ ...@@ -12,6 +12,8 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import paddle
import paddle.fluid as fluid import paddle.fluid as fluid
import paddle.fluid.param_attr as attr import paddle.fluid.param_attr as attr
...@@ -151,7 +153,7 @@ class ElementwiseMaxLayer: ...@@ -151,7 +153,7 @@ class ElementwiseMaxLayer:
""" """
operation operation
""" """
max = fluid.layers.elementwise_max(x, y) max = paddle.maximum(x, y)
return max return max
......
...@@ -56,7 +56,7 @@ class TestPSPassWithBow(unittest.TestCase): ...@@ -56,7 +56,7 @@ class TestPSPassWithBow(unittest.TestCase):
cos_q_pt, cos_q_pt,
) )
loss_op2 = fluid.layers.elementwise_add(loss_op1, cos_q_nt) loss_op2 = fluid.layers.elementwise_add(loss_op1, cos_q_nt)
loss_op3 = fluid.layers.elementwise_max( loss_op3 = paddle.maximum(
fluid.layers.fill_constant_batch_size_like( fluid.layers.fill_constant_batch_size_like(
input=loss_op2, shape=[-1, 1], value=0.0, dtype='float32' input=loss_op2, shape=[-1, 1], value=0.0, dtype='float32'
), ),
......
...@@ -56,7 +56,7 @@ class TestPSPassWithBow(unittest.TestCase): ...@@ -56,7 +56,7 @@ class TestPSPassWithBow(unittest.TestCase):
cos_q_pt, cos_q_pt,
) )
loss_op2 = fluid.layers.elementwise_add(loss_op1, cos_q_nt) loss_op2 = fluid.layers.elementwise_add(loss_op1, cos_q_nt)
loss_op3 = fluid.layers.elementwise_max( loss_op3 = paddle.maximum(
fluid.layers.fill_constant_batch_size_like( fluid.layers.fill_constant_batch_size_like(
input=loss_op2, shape=[-1, 1], value=0.0, dtype='float32' input=loss_op2, shape=[-1, 1], value=0.0, dtype='float32'
), ),
......
...@@ -59,7 +59,7 @@ class TestPSPassWithBow(unittest.TestCase): ...@@ -59,7 +59,7 @@ class TestPSPassWithBow(unittest.TestCase):
cos_q_pt, cos_q_pt,
) )
loss_op2 = fluid.layers.elementwise_add(loss_op1, cos_q_nt) loss_op2 = fluid.layers.elementwise_add(loss_op1, cos_q_nt)
loss_op3 = fluid.layers.elementwise_max( loss_op3 = paddle.maximum(
fluid.layers.fill_constant_batch_size_like( fluid.layers.fill_constant_batch_size_like(
input=loss_op2, shape=[-1, 1], value=0.0, dtype='float32' input=loss_op2, shape=[-1, 1], value=0.0, dtype='float32'
), ),
......
...@@ -60,7 +60,7 @@ class TestPSPassWithBow(unittest.TestCase): ...@@ -60,7 +60,7 @@ class TestPSPassWithBow(unittest.TestCase):
cos_q_pt, cos_q_pt,
) )
loss_op2 = fluid.layers.elementwise_add(loss_op1, cos_q_nt) loss_op2 = fluid.layers.elementwise_add(loss_op1, cos_q_nt)
loss_op3 = fluid.layers.elementwise_max( loss_op3 = paddle.maximum(
fluid.layers.fill_constant_batch_size_like( fluid.layers.fill_constant_batch_size_like(
input=loss_op2, shape=[-1, 1], value=0.0, dtype='float32' input=loss_op2, shape=[-1, 1], value=0.0, dtype='float32'
), ),
......
...@@ -59,7 +59,7 @@ class TestPSPassWithBow(unittest.TestCase): ...@@ -59,7 +59,7 @@ class TestPSPassWithBow(unittest.TestCase):
cos_q_pt, cos_q_pt,
) )
loss_op2 = fluid.layers.elementwise_add(loss_op1, cos_q_nt) loss_op2 = fluid.layers.elementwise_add(loss_op1, cos_q_nt)
loss_op3 = fluid.layers.elementwise_max( loss_op3 = paddle.maximum(
fluid.layers.fill_constant_batch_size_like( fluid.layers.fill_constant_batch_size_like(
input=loss_op2, shape=[-1, 1], value=0.0, dtype='float32' input=loss_op2, shape=[-1, 1], value=0.0, dtype='float32'
), ),
......
...@@ -56,7 +56,7 @@ class TestPSPassWithBow(unittest.TestCase): ...@@ -56,7 +56,7 @@ class TestPSPassWithBow(unittest.TestCase):
cos_q_pt, cos_q_pt,
) )
loss_op2 = fluid.layers.elementwise_add(loss_op1, cos_q_nt) loss_op2 = fluid.layers.elementwise_add(loss_op1, cos_q_nt)
loss_op3 = fluid.layers.elementwise_max( loss_op3 = paddle.maximum(
fluid.layers.fill_constant_batch_size_like( fluid.layers.fill_constant_batch_size_like(
input=loss_op2, shape=[-1, 1], value=0.0, dtype='float32' input=loss_op2, shape=[-1, 1], value=0.0, dtype='float32'
), ),
......
...@@ -56,7 +56,7 @@ class TestPSPassWithBow(unittest.TestCase): ...@@ -56,7 +56,7 @@ class TestPSPassWithBow(unittest.TestCase):
cos_q_pt, cos_q_pt,
) )
loss_op2 = fluid.layers.elementwise_add(loss_op1, cos_q_nt) loss_op2 = fluid.layers.elementwise_add(loss_op1, cos_q_nt)
loss_op3 = fluid.layers.elementwise_max( loss_op3 = paddle.maximum(
fluid.layers.fill_constant_batch_size_like( fluid.layers.fill_constant_batch_size_like(
input=loss_op2, shape=[-1, 1], value=0.0, dtype='float32' input=loss_op2, shape=[-1, 1], value=0.0, dtype='float32'
), ),
......
...@@ -56,7 +56,7 @@ class TestPSPassWithBow(unittest.TestCase): ...@@ -56,7 +56,7 @@ class TestPSPassWithBow(unittest.TestCase):
cos_q_pt, cos_q_pt,
) )
loss_op2 = fluid.layers.elementwise_add(loss_op1, cos_q_nt) loss_op2 = fluid.layers.elementwise_add(loss_op1, cos_q_nt)
loss_op3 = fluid.layers.elementwise_max( loss_op3 = paddle.maximum(
fluid.layers.fill_constant_batch_size_like( fluid.layers.fill_constant_batch_size_like(
input=loss_op2, shape=[-1, 1], value=0.0, dtype='float32' input=loss_op2, shape=[-1, 1], value=0.0, dtype='float32'
), ),
......
...@@ -56,7 +56,7 @@ class TestPSPassWithBow(unittest.TestCase): ...@@ -56,7 +56,7 @@ class TestPSPassWithBow(unittest.TestCase):
cos_q_pt, cos_q_pt,
) )
loss_op2 = fluid.layers.elementwise_add(loss_op1, cos_q_nt) loss_op2 = fluid.layers.elementwise_add(loss_op1, cos_q_nt)
loss_op3 = fluid.layers.elementwise_max( loss_op3 = paddle.maximum(
fluid.layers.fill_constant_batch_size_like( fluid.layers.fill_constant_batch_size_like(
input=loss_op2, shape=[-1, 1], value=0.0, dtype='float32' input=loss_op2, shape=[-1, 1], value=0.0, dtype='float32'
), ),
......
...@@ -652,14 +652,12 @@ class TestLayer(LayerTest): ...@@ -652,14 +652,12 @@ class TestLayer(LayerTest):
min_eager_ret = layers.elementwise_min( min_eager_ret = layers.elementwise_min(
to_variable(n), to_variable(n2) to_variable(n), to_variable(n2)
) )
max_eager_ret = layers.elementwise_max( max_eager_ret = paddle.maximum(to_variable(n), to_variable(n2))
to_variable(n), to_variable(n2)
)
min_eager_ret_value = min_eager_ret.numpy() min_eager_ret_value = min_eager_ret.numpy()
max_eager_ret_value = max_eager_ret.numpy() max_eager_ret_value = max_eager_ret.numpy()
min_ret = layers.elementwise_min(to_variable(n), to_variable(n2)) min_ret = layers.elementwise_min(to_variable(n), to_variable(n2))
max_ret = layers.elementwise_max(to_variable(n), to_variable(n2)) max_ret = paddle.maximum(to_variable(n), to_variable(n2))
min_ret_value = min_ret.numpy() min_ret_value = min_ret.numpy()
max_ret_value = max_ret.numpy() max_ret_value = max_ret.numpy()
......
...@@ -12,6 +12,8 @@ ...@@ -12,6 +12,8 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import paddle
import paddle.distributed as dist import paddle.distributed as dist
from paddle.fluid.clip import ClipGradBase, _squared_l2_norm from paddle.fluid.clip import ClipGradBase, _squared_l2_norm
from paddle.fluid.dygraph import base as imperative_base from paddle.fluid.dygraph import base as imperative_base
...@@ -213,7 +215,7 @@ class ClipGradForMOEByGlobalNorm(ClipGradBase): ...@@ -213,7 +215,7 @@ class ClipGradForMOEByGlobalNorm(ClipGradBase):
) )
clip_var = layers.elementwise_div( clip_var = layers.elementwise_div(
x=max_global_norm, x=max_global_norm,
y=layers.elementwise_max(x=global_norm_var, y=max_global_norm), y=paddle.maximum(x=global_norm_var, y=max_global_norm),
) )
for p, g in params_grads: for p, g in params_grads:
if g is None: if g is None:
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册