Unverified  Commit cbb14419 authored by R Roc, committed by GitHub

support sharding in fp16 on xpu, (#48897)

* support sharding in fp16 on xpu, change reduce_max to reduce_sum for found nan or inf

* update
Parent e7711592
@@ -220,7 +220,8 @@ def GroupShardedScaler(scaler):
         temp_found_inf_fp16 = to_variable(np.array([0]).astype(np.bool_))
         temp_found_inf_fp32 = to_variable(np.array([0]).astype(np.bool_))
-        device = "cpu" if optimizer.offload else "gpu"
+        device = paddle.get_device().split(":")[0]
+        device = "cpu" if optimizer.offload else device
         dev_id = (
             0 if device == "cpu" else int(paddle.get_device().split(":")[1])
         )
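For reference, a minimal standalone sketch of what the new device-detection lines compute, assuming only that PaddlePaddle is installed; `paddle.get_device()` returns strings such as "cpu", "gpu:0", or "xpu:0" depending on the build and the current runtime:

```python
import paddle

# paddle.get_device() returns e.g. "cpu", "gpu:0", or "xpu:0".
# Taking the part before ":" gives the device type, so the scaler no longer
# hard-codes "gpu" and also resolves to "xpu" on XPU machines.
dev_str = paddle.get_device()              # e.g. "xpu:0"
device = dev_str.split(":")[0]             # "xpu"
dev_id = 0 if device == "cpu" else int(dev_str.split(":")[1])
print(device, dev_id)
```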
@@ -245,8 +246,9 @@ def GroupShardedScaler(scaler):
         is_found_inf = paddle.to_tensor([self._found_inf], dtype="int32")
         paddle.distributed.all_reduce(
-            is_found_inf, op=paddle.distributed.ReduceOp.MAX, group=None
+            is_found_inf, op=paddle.distributed.ReduceOp.SUM, group=None
         )
         self._found_inf = is_found_inf.numpy()[0]
     scaler._unscale = MethodType(unscale_method, scaler)
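Switching the reduction from `ReduceOp.MAX` to `ReduceOp.SUM` means the aggregated flag becomes the count of ranks that reported nan/inf rather than a 0/1 maximum; the tests below therefore assert `>= 1` instead of `== 1`. The snippet below is a hedged, single-process sketch that only mimics the reduction with plain integers (a real `paddle.distributed.all_reduce` call needs an initialized process group); the rank flags are hypothetical:

```python
import numpy as np

# Hypothetical int32 flags contributed by four ranks:
# 0 means no nan/inf was found on that rank, 1 means it was.
per_rank_flags = np.array([0, 1, 1, 0], dtype=np.int32)

# With ReduceOp.SUM the reduced value counts the ranks that saw nan/inf;
# with ReduceOp.MAX it would be capped at 1. In both cases,
# "some rank found nan/inf" is equivalent to the reduced value being >= 1.
found_inf_sum = int(per_rank_flags.sum())      # 2 here
assert (found_inf_sum >= 1) == bool(per_rank_flags.max())
print("found_inf:", found_inf_sum >= 1)
```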
@@ -344,7 +344,7 @@ class TestAmpScaler(unittest.TestCase):
         scaled_loss = scaler.scale(loss)
         scaled_loss.backward()
         optimize_ops, params_grads = scaler.minimize(optimizer, scaled_loss)
-        self.assertEqual(scaler._found_inf.numpy() == 1, True)
+        self.assertEqual(scaler._found_inf.numpy() >= 1, True)
         for param in model.parameters():
             # param not update when tensor contains nan or inf
@@ -343,7 +343,7 @@ class TestAmpScaler(unittest.TestCase):
         scaled_loss = scaler.scale(loss)
         scaled_loss.backward()
         optimize_ops, params_grads = scaler.minimize(optimizer, scaled_loss)
-        self.assertEqual(scaler._found_inf.numpy() == 1, True)
+        self.assertEqual(scaler._found_inf.numpy() >= 1, True)
         for param in model.parameters():
             # param not update when tensor contains nan or inf