未验证 提交 cbb14419 编写于 作者: R Roc 提交者: GitHub

Support sharding in fp16 on XPU (#48897)

* Support sharding in fp16 on XPU; change reduce_max to reduce_sum for detecting NaN/Inf

* update
上级 e7711592
......@@ -220,7 +220,8 @@ def GroupShardedScaler(scaler):
temp_found_inf_fp16 = to_variable(np.array([0]).astype(np.bool_))
temp_found_inf_fp32 = to_variable(np.array([0]).astype(np.bool_))
device = "cpu" if optimizer.offload else "gpu"
device = paddle.get_device().split(":")[0]
device = "cpu" if optimizer.offload else device
dev_id = (
0 if device == "cpu" else int(paddle.get_device().split(":")[1])
)
......@@ -245,8 +246,9 @@ def GroupShardedScaler(scaler):
is_found_inf = paddle.to_tensor([self._found_inf], dtype="int32")
paddle.distributed.all_reduce(
is_found_inf, op=paddle.distributed.ReduceOp.MAX, group=None
is_found_inf, op=paddle.distributed.ReduceOp.SUM, group=None
)
self._found_inf = is_found_inf.numpy()[0]
scaler._unscale = MethodType(unscale_method, scaler)
......
......@@ -344,7 +344,7 @@ class TestAmpScaler(unittest.TestCase):
scaled_loss = scaler.scale(loss)
scaled_loss.backward()
optimize_ops, params_grads = scaler.minimize(optimizer, scaled_loss)
self.assertEqual(scaler._found_inf.numpy() == 1, True)
self.assertEqual(scaler._found_inf.numpy() >= 1, True)
for param in model.parameters():
# param not update when tensor contains nan or inf
......
......@@ -343,7 +343,7 @@ class TestAmpScaler(unittest.TestCase):
scaled_loss = scaler.scale(loss)
scaled_loss.backward()
optimize_ops, params_grads = scaler.minimize(optimizer, scaled_loss)
self.assertEqual(scaler._found_inf.numpy() == 1, True)
self.assertEqual(scaler._found_inf.numpy() >= 1, True)
for param in model.parameters():
# param not update when tensor contains nan or inf
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册