Unverified commit dd827bbe, authored by wuhuachaocoding, committed by GitHub

update fluid api. (#49731)

Parent a1772bb8
@@ -29,7 +29,7 @@ from collections import OrderedDict
 import paddle
 import paddle.distributed as dist
 from paddle.distributed import ParallelMode, fleet
-from paddle.fluid import core
+from paddle.framework import core
 from paddle.nn import ClipGradByGlobalNorm
 from paddle.optimizer import Optimizer
...
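For reference, a minimal smoke test (not part of the diff; assumes a Paddle build recent enough to expose `paddle.framework.core`) showing that the new import path serves the same compiled core module the old `paddle.fluid.core` import did:

```python
# Illustrative only: verify the relocated core module is importable and
# exposes the familiar build-inspection helpers.
from paddle.framework import core

print(core.is_compiled_with_cuda())  # True on a CUDA build, False otherwise
```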
@@ -20,12 +20,11 @@ import numpy as np
 import paddle
 import paddle.distributed as dist
-import paddle.fluid.core as core
-import paddle.fluid.framework as framework
-from paddle import nn
+from paddle import framework, nn
 from paddle.autograd import PyLayer
 from paddle.distributed import collective
 from paddle.fluid.framework import EagerParamBase
+from paddle.framework import core
 from paddle.nn import ClipGradByGlobalNorm
 from .group_sharded_storage import GradStorage
...
@@ -25,7 +25,7 @@
 import numpy as np
 import paddle
-from paddle.fluid import core
+from paddle.framework import core
 from .group_sharded_utils import Type, cvt_to_device, device_guard
...
@@ -20,9 +20,9 @@ import numpy as np
 import paddle
 from paddle import _legacy_C_ops
-from paddle.fluid import core, layers
+from paddle.common_ops_import import dygraph_only
 from paddle.fluid.dygraph import to_variable
-from paddle.fluid.framework import dygraph_only
+from paddle.framework import core
 from paddle.nn import clip
@@ -87,7 +87,7 @@ class GroupShardedClipGrad:
         if len(sum_square_fp16) == 0:
             global_norm_fp16 = paddle.to_tensor([0.0], dtype=paddle.float32)
         else:
-            global_norm_fp16 = layers.concat(sum_square_fp16)
+            global_norm_fp16 = paddle.concat(sum_square_fp16)
             global_norm_fp16 = paddle.sum(global_norm_fp16)
             global_norm_fp16 = paddle.cast(
                 global_norm_fp16, dtype=paddle.float32
@@ -97,7 +97,7 @@ class GroupShardedClipGrad:
         if len(unslice_params_fp16) == 0:
             global_unslice_fp16 = paddle.to_tensor([0.0], dtype=paddle.float32)
         else:
-            global_unslice_fp16 = layers.concat(unslice_params_fp16)
+            global_unslice_fp16 = paddle.concat(unslice_params_fp16)
             global_unslice_fp16 = paddle.sum(global_unslice_fp16)
             global_unslice_fp16 = paddle.cast(
                 global_unslice_fp16, dtype=paddle.float32
@@ -105,7 +105,7 @@
         # global norm of non-distributed FP32 params_and_grads
         global_norm_fp32 = (
-            layers.concat(sum_square_fp32)
+            paddle.concat(sum_square_fp32)
             if len(sum_square_fp32) != 0
             else paddle.to_tensor([0.0], dtype=paddle.float32)
         )
@@ -113,7 +113,7 @@ class GroupShardedClipGrad:
         # global norm of non-distributed FP32 params_and_grads for unslice parameters
         global_unslice_fp32 = (
-            layers.concat(unslice_params_fp32)
+            paddle.concat(unslice_params_fp32)
             if len(unslice_params_fp32) != 0
             else paddle.to_tensor([0.0], dtype=paddle.float32)
         )
@@ -131,8 +131,8 @@ class GroupShardedClipGrad:
         paddle.distributed.all_reduce(global_norm_var, group=self._group)
         global_norm_var = paddle.sqrt(global_norm_var + global_unslice_var)
-        max_global_norm = layers.fill_constant(
-            shape=[1], dtype=global_norm_var.dtype, value=self.clip_norm
+        max_global_norm = paddle.full(
+            shape=[1], dtype=global_norm_var.dtype, fill_value=self.clip_norm
        )
         clip_var = paddle.divide(
...
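The hunks above swap `paddle.fluid.layers` helpers for their `paddle.*` equivalents. A minimal sketch of the same clipping arithmetic with the replacement APIs; the tensor values and the clip bound of 1.0 are made up for illustration:

```python
import paddle

# Stand-ins for the per-parameter squared L2 norms the clipper collects.
sum_square_fp32 = [paddle.to_tensor([1.0]), paddle.to_tensor([4.0])]

# layers.concat -> paddle.concat, layers.fill_constant -> paddle.full.
global_norm_var = paddle.sqrt(paddle.sum(paddle.concat(sum_square_fp32)))
max_global_norm = paddle.full(
    shape=[1], dtype=global_norm_var.dtype, fill_value=1.0
)

# Same scaling rule as before: clip only when the global norm exceeds the bound.
clip_var = paddle.divide(
    x=max_global_norm, y=paddle.maximum(global_norm_var, max_global_norm)
)
print(float(clip_var))  # ~0.447 here, since sqrt(5) > 1.0
```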
@@ -13,8 +13,9 @@
 # limitations under the License.
 import paddle
+from paddle import framework
 from paddle.autograd import PyLayer
-from paddle.fluid import core, framework
+from paddle.framework import core
 from ..meta_parallel.parallel_layers.random import get_rng_state_tracker
 from ..meta_parallel.pp_utils import utils
...
@@ -13,9 +13,8 @@
 # limitations under the License.
 from paddle import _legacy_C_ops
-from paddle.fluid.data_feeder import check_variable_and_dtype
-from paddle.fluid.framework import in_dygraph_mode
-from paddle.fluid.layer_helper import LayerHelper
+from paddle.common_ops_import import check_variable_and_dtype
+from paddle.framework import LayerHelper, in_dygraph_mode
 def _number_count(numbers, upper_range):
...
@@ -66,7 +66,7 @@ def get_cluster_from_args(args, selected_gpus):
 def get_gpus(selected_gpus):
     if selected_gpus is None:
-        from paddle.fluid import core
+        from paddle.framework import core
         gpus_num = core.get_cuda_device_count()
         gpus = [str(x) for x in range(0, gpus_num)]
...
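A small, hedged example of the updated device-count lookup used by `get_gpus` (assumes a local Paddle install; the printed list depends on the build and machine):

```python
# Illustrative only: enumerate visible GPU ids the same way get_gpus does
# when no explicit selection is passed.
from paddle.framework import core

gpus_num = core.get_cuda_device_count()
gpus = [str(x) for x in range(0, gpus_num)]
print(gpus)  # e.g. ['0', '1'] on a two-GPU machine, [] on a CPU-only build
```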
@@ -13,9 +13,8 @@
 # limitations under the License.
 from paddle import _legacy_C_ops
-from paddle.fluid.data_feeder import check_variable_and_dtype
-from paddle.fluid.framework import in_dygraph_mode
-from paddle.fluid.layer_helper import LayerHelper
+from paddle.common_ops_import import check_variable_and_dtype
+from paddle.framework import LayerHelper, in_dygraph_mode
 def global_scatter(
...
@@ -14,8 +14,8 @@
 import paddle
 import paddle.distributed as dist
-from paddle.fluid import core, layers
-from paddle.fluid.dygraph import base as imperative_base
+from paddle.autograd import no_grad
+from paddle.framework import core
 from paddle.nn import clip
 from paddle.nn.clip import ClipGradBase, _squared_l2_norm
@@ -142,25 +142,25 @@ class ClipGradForMOEByGlobalNorm(ClipGradBase):
         global_norm_var = []
         if len(sum_square_list_fp16) > 0:
-            global_norm_var_fp16 = layers.concat(sum_square_list_fp16)
+            global_norm_var_fp16 = paddle.concat(sum_square_list_fp16)
             global_norm_var_fp16 = paddle.sum(global_norm_var_fp16)
             global_norm_var.append(global_norm_var_fp16.astype(sum_dtype))
         if len(sum_square_list_fp32) > 0:
-            global_norm_var_fp32 = layers.concat(sum_square_list_fp32)
+            global_norm_var_fp32 = paddle.concat(sum_square_list_fp32)
             global_norm_var_fp32 = paddle.sum(global_norm_var_fp32)
             if sum_dtype == 'float32':
                 global_norm_var.append(global_norm_var_fp32)
             else:
                 global_norm_var.append(global_norm_var_fp32.astype(sum_dtype))
         if len(sum_square_list) > 0:
-            global_norm_var_fp64 = layers.concat(sum_square_list)
+            global_norm_var_fp64 = paddle.concat(sum_square_list)
             global_norm_var_fp64 = paddle.sum(global_norm_var_fp64)
             global_norm_var.append(global_norm_var_fp64)
-        global_norm_var = layers.concat(global_norm_var)
+        global_norm_var = paddle.concat(global_norm_var)
         global_norm_var = paddle.sum(global_norm_var)
         return global_norm_var, sum_dtype
-    @imperative_base.no_grad
+    @no_grad()
     def _dygraph_clip(self, params_grads):
         normal_params_grads = []
         moe_params_grads = []
@@ -210,8 +210,8 @@ class ClipGradForMOEByGlobalNorm(ClipGradBase):
         params_and_grads = []
         global_norm_var = paddle.sqrt(global_norm_var)
-        max_global_norm = layers.fill_constant(
-            shape=[1], dtype=global_norm_var.dtype, value=self.clip_norm
+        max_global_norm = paddle.full(
+            shape=[1], dtype=global_norm_var.dtype, fill_value=self.clip_norm
         )
         clip_var = paddle.divide(
             x=max_global_norm,
...
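The decorator migration above replaces `@imperative_base.no_grad` with `@no_grad()` from `paddle.autograd`. A hedged sketch of the pattern; `scale_grads` is a hypothetical helper, not code from this PR:

```python
import paddle
from paddle.autograd import no_grad

# Hypothetical helper: scale gradients without recording ops for autograd,
# mirroring how _dygraph_clip is decorated after this change.
@no_grad()
def scale_grads(params_grads, clip_coeff):
    return [(p, g * clip_coeff) for p, g in params_grads]

w = paddle.ones([2], dtype='float32')
g = paddle.full([2], 0.5)
scaled = scale_grads([(w, g)], paddle.to_tensor(0.1))
print(scaled[0][1])  # [0.05, 0.05]
```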
@@ -25,7 +25,7 @@ import paddle
 import paddle.nn as nn
 from paddle.autograd import PyLayer
 from paddle.distributed.utils.moe_utils import global_gather, global_scatter
-from paddle.fluid.framework import in_dygraph_mode
+from paddle.framework import in_dygraph_mode
 from paddle.incubate.distributed.fleet import recompute_hybrid
 from .gate import BaseGate, GShardGate, NaiveGate, SwitchGate
...
@@ -26,7 +26,7 @@ from paddle.distributed.models.moe.utils import (
     _number_count,
     _prune_gate_by_capacity,
 )
-from paddle.fluid.framework import in_dygraph_mode
+from paddle.framework import in_dygraph_mode
 def _alltoall(in_tensor_list, group=None, use_calc_stream=True):
...
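For completeness, an illustrative check (not part of the diff) of the relocated dygraph-mode test that the MoE layers now import from `paddle.framework`:

```python
# Illustrative only: the mode check keeps the same semantics; only the
# import path changes from paddle.fluid.framework to paddle.framework.
from paddle.framework import in_dygraph_mode

print(in_dygraph_mode())  # True by default in Paddle 2.x (eager/dygraph mode)
```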