Using LayerNorm in Dygraph mode with batch_size = 1 produces a nan loss
Created by: sserdoubleh
- Title: Using LayerNorm in Dygraph mode with batch_size = 1 produces a nan loss
- Version & environment info: 1) PaddlePaddle version: 1.6.0  2) GPU: V100, CUDA 10.0, cuDNN 7.5  3) System environment: not provided
- Training info: 1) Single machine, single GPU  2) GPU memory info  3) Operator info
- Reproduction code:
import numpy as np

import paddle.fluid as fluid
import paddle.fluid.layers as layers
import paddle.fluid.dygraph as dygraph


class Model(dygraph.Layer):

    def __init__(self, name_scope):
        super().__init__(name_scope)
        self.layer_norm = dygraph.LayerNorm(self.full_name())
        return

    def forward(self, input):
        # original
        o = self.layer_norm(input)
        return layers.reduce_sum(o, dim=1)


place = fluid.CUDAPlace(0)
with dygraph.guard(place):
    model = Model("model")

    np.random.seed(13)
    batch_size = 1
    dim = 5
    x = dygraph.to_variable(np.random.randn(batch_size, dim).astype("float32"))
    y = dygraph.to_variable(np.random.randn(batch_size).astype("float32"))
    print("x:", x.numpy().tolist())
    print("y:", y.numpy().tolist())

    optimizer = fluid.optimizer.SGDOptimizer(learning_rate=1e-6)

    for _ in range(10):
        print("==================")
        pred = model(x)
        for param in model.parameters():
            print(param.name, param.numpy().tolist())
        loss = layers.mse_loss(pred, y)
        loss.backward()
        optimizer.minimize(loss)
        model.clear_gradients()
        print("pred:", pred.numpy().tolist())
        print("loss:", loss.numpy().tolist())
        print()
Running this, the loss becomes nan after a few iterations:
model/Model_0/LayerNorm_0.w_0 [0.9999983906745911, 1.0002537965774536, 0.9999850392341614, 1.0001521110534668, 1.0004528760910034]
model/Model_0/LayerNorm_0.b_0 [1.0646755299603683e-06, 1.0646755299603683e-06, 1.0646755299603683e-06, 1.0646755299603683e-06, 1.0646755299603683e-06]
pred: [0.0008184909820556641]
loss: [0.2825128734111786]
==================
model/Model_0/LayerNorm_0.w_0 [0.9999967813491821, 1.0005072355270386, 0.9999700784683228, 1.0003039836883545, 1.0009050369262695]
model/Model_0/LayerNorm_0.b_0 [2.127714424204896e-06, 2.127714424204896e-06, 2.127714424204896e-06, 2.127714424204896e-06, 2.127714424204896e-06]
pred: [0.0016356408596038818]
loss: [0.2816448509693146]
==================
model/Model_0/LayerNorm_0.w_0 [0.9999951720237732, nan, nan, nan, nan]
model/Model_0/LayerNorm_0.b_0 [3.1891188427835004e-06, 3.1891188427835004e-06, 3.1891188427835004e-06, 3.1891188427835004e-06, 3.1891188427835004e-06]
pred: [nan]
loss: [nan]
==================
model/Model_0/LayerNorm_0.w_0 [nan, nan, nan, nan, nan]
model/Model_0/LayerNorm_0.b_0 [nan, nan, nan, nan, nan]
pred: [nan]
loss: [nan]
==================
model/Model_0/LayerNorm_0.w_0 [nan, nan, nan, nan, nan]
model/Model_0/LayerNorm_0.b_0 [nan, nan, nan, nan, nan]
pred: [nan]
loss: [nan]
==================
model/Model_0/LayerNorm_0.w_0 [nan, nan, nan, nan, nan]
model/Model_0/LayerNorm_0.b_0 [nan, nan, nan, nan, nan]
pred: [nan]
loss: [nan]
==================
model/Model_0/LayerNorm_0.w_0 [nan, nan, nan, nan, nan]
model/Model_0/LayerNorm_0.b_0 [nan, nan, nan, nan, nan]
pred: [nan]
loss: [nan]
==================
model/Model_0/LayerNorm_0.w_0 [nan, nan, nan, nan, nan]
model/Model_0/LayerNorm_0.b_0 [nan, nan, nan, nan, nan]
pred: [nan]
loss: [nan]
==================
model/Model_0/LayerNorm_0.w_0 [nan, nan, nan, nan, nan]
model/Model_0/LayerNorm_0.b_0 [nan, nan, nan, nan, nan]
pred: [nan]
loss: [nan]
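
As a cross-check (this is not part of the original report, and the default scale = 1, bias = 0, epsilon = 1e-5 initialisation is an assumption), the LayerNorm forward pass and the hand-derived parameter gradients for the same batch_size = 1 input can be recomputed in plain NumPy:

    import numpy as np

    # Hypothetical NumPy cross-check: recompute what dygraph.LayerNorm should
    # produce for the same seeded batch_size=1 input, assuming the default
    # initialisation (scale = ones, bias = zeros) and epsilon = 1e-5.
    np.random.seed(13)
    x = np.random.randn(1, 5).astype("float32")
    y = np.random.randn(1).astype("float32")

    eps = 1e-5
    mean = x.mean(axis=1, keepdims=True)
    var = x.var(axis=1, keepdims=True)
    x_hat = (x - mean) / np.sqrt(var + eps)   # normalized input
    o = 1.0 * x_hat + 0.0                     # scale * x_hat + bias
    pred = o.sum(axis=1)                      # mirrors layers.reduce_sum(o, dim=1)
    loss = np.mean((pred - y) ** 2)           # mirrors layers.mse_loss

    # With loss = (pred - y)^2 and pred = sum_i (scale_i * x_hat_i + bias_i):
    #   d loss / d scale_i = 2 * (pred - y) * x_hat_i
    #   d loss / d bias_i  = 2 * (pred - y)
    grad_scale = 2.0 * (pred - y) * x_hat
    grad_bias = 2.0 * (pred - y) * np.ones_like(x_hat)

    print("pred:", pred.tolist())
    print("loss:", float(loss))
    print("grad w:", grad_scale.tolist())
    print("grad b:", grad_bias.tolist())

The hand-derived bias gradient is identical for all five elements, which matches the identical b_0 values in the log, and all of these values stay finite. Since w_0 turns nan at the third iteration while b_0 is still finite, the nan may originate in the layer_norm backward kernel when the normalized input consists of a single row, rather than from the computation itself.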