Unverified commit dd827bbe, authored by: W wuhuachaocoding, committed by: GitHub

update fluid api. (#49731)

Parent: a1772bb8
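
For orientation, here is a hedged sketch (not part of the patch) of the import mapping this commit applies across the touched files; it is inferred from the hunks below and is not exhaustive.

```python
# Sketch of the paddle.fluid -> public API migration applied in this commit.
# Old (removed in this patch):
#   from paddle.fluid import core, layers
#   from paddle.fluid.framework import dygraph_only, in_dygraph_mode
#   from paddle.fluid.data_feeder import check_variable_and_dtype
#   from paddle.fluid.layer_helper import LayerHelper
#   from paddle.fluid.dygraph import base as imperative_base
# New (added in this patch):
from paddle.autograd import no_grad  # replaces imperative_base.no_grad
from paddle.common_ops_import import check_variable_and_dtype, dygraph_only
from paddle.framework import LayerHelper, core, in_dygraph_mode
# fluid.layers helpers are replaced by paddle tensor ops, e.g.
# layers.concat -> paddle.concat, layers.fill_constant -> paddle.full.
```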
@@ -29,7 +29,7 @@ from collections import OrderedDict
 import paddle
 import paddle.distributed as dist
 from paddle.distributed import ParallelMode, fleet
-from paddle.fluid import core
+from paddle.framework import core
 from paddle.nn import ClipGradByGlobalNorm
 from paddle.optimizer import Optimizer
......
@@ -20,12 +20,11 @@ import numpy as np
 import paddle
 import paddle.distributed as dist
-import paddle.fluid.core as core
-import paddle.fluid.framework as framework
-from paddle import nn
+from paddle import framework, nn
 from paddle.autograd import PyLayer
 from paddle.distributed import collective
-from paddle.fluid.framework import EagerParamBase
+from paddle.framework import core
 from paddle.nn import ClipGradByGlobalNorm
 from .group_sharded_storage import GradStorage
......
@@ -25,7 +25,7 @@
 import numpy as np
 import paddle
-from paddle.fluid import core
+from paddle.framework import core
 from .group_sharded_utils import Type, cvt_to_device, device_guard
......
@@ -20,9 +20,9 @@ import numpy as np
 import paddle
 from paddle import _legacy_C_ops
-from paddle.fluid import core, layers
+from paddle.common_ops_import import dygraph_only
-from paddle.fluid.dygraph import to_variable
-from paddle.fluid.framework import dygraph_only
+from paddle.framework import core
 from paddle.nn import clip
@@ -87,7 +87,7 @@ class GroupShardedClipGrad:
         if len(sum_square_fp16) == 0:
             global_norm_fp16 = paddle.to_tensor([0.0], dtype=paddle.float32)
         else:
-            global_norm_fp16 = layers.concat(sum_square_fp16)
+            global_norm_fp16 = paddle.concat(sum_square_fp16)
             global_norm_fp16 = paddle.sum(global_norm_fp16)
             global_norm_fp16 = paddle.cast(
                 global_norm_fp16, dtype=paddle.float32
@@ -97,7 +97,7 @@ class GroupShardedClipGrad:
         if len(unslice_params_fp16) == 0:
             global_unslice_fp16 = paddle.to_tensor([0.0], dtype=paddle.float32)
         else:
-            global_unslice_fp16 = layers.concat(unslice_params_fp16)
+            global_unslice_fp16 = paddle.concat(unslice_params_fp16)
             global_unslice_fp16 = paddle.sum(global_unslice_fp16)
             global_unslice_fp16 = paddle.cast(
                 global_unslice_fp16, dtype=paddle.float32
@@ -105,7 +105,7 @@ class GroupShardedClipGrad:
         # global norm of non-distributed FP32 params_and_grads
         global_norm_fp32 = (
-            layers.concat(sum_square_fp32)
+            paddle.concat(sum_square_fp32)
             if len(sum_square_fp32) != 0
             else paddle.to_tensor([0.0], dtype=paddle.float32)
         )
@@ -113,7 +113,7 @@ class GroupShardedClipGrad:
         # global norm of non-distributed FP32 params_and_grads for unslice parameters
         global_unslice_fp32 = (
-            layers.concat(unslice_params_fp32)
+            paddle.concat(unslice_params_fp32)
             if len(unslice_params_fp32) != 0
             else paddle.to_tensor([0.0], dtype=paddle.float32)
         )
@@ -131,8 +131,8 @@ class GroupShardedClipGrad:
         paddle.distributed.all_reduce(global_norm_var, group=self._group)
         global_norm_var = paddle.sqrt(global_norm_var + global_unslice_var)
-        max_global_norm = layers.fill_constant(
-            shape=[1], dtype=global_norm_var.dtype, value=self.clip_norm
+        max_global_norm = paddle.full(
+            shape=[1], dtype=global_norm_var.dtype, fill_value=self.clip_norm
         )
         clip_var = paddle.divide(
......
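
As a usage note, the sketch below (illustrative names, not code from the patch) shows the clipping arithmetic that the hunks above now express with `paddle.concat`, `paddle.full`, and `paddle.divide` instead of `fluid.layers` helpers; the `paddle.maximum` guard is assumed from the truncated `clip_var` computation.

```python
import paddle

def clip_coefficient(squared_norms, clip_norm):
    # squared_norms: list of per-parameter squared L2 norm tensors.
    # The global norm is the square root of the summed squared norms.
    global_norm = paddle.sqrt(paddle.sum(paddle.concat(squared_norms)))
    # paddle.full replaces layers.fill_constant (value= becomes fill_value=).
    max_norm = paddle.full(
        shape=[1], dtype=global_norm.dtype, fill_value=clip_norm
    )
    # Scale factor min(1, clip_norm / global_norm) applied to every gradient.
    return paddle.divide(x=max_norm, y=paddle.maximum(global_norm, max_norm))
```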
@@ -13,8 +13,9 @@
 # limitations under the License.
 import paddle
+from paddle import framework
 from paddle.autograd import PyLayer
-from paddle.fluid import core, framework
+from paddle.framework import core
 from ..meta_parallel.parallel_layers.random import get_rng_state_tracker
 from ..meta_parallel.pp_utils import utils
......
@@ -13,9 +13,8 @@
 # limitations under the License.
 from paddle import _legacy_C_ops
-from paddle.fluid.data_feeder import check_variable_and_dtype
-from paddle.fluid.framework import in_dygraph_mode
-from paddle.fluid.layer_helper import LayerHelper
+from paddle.common_ops_import import check_variable_and_dtype
+from paddle.framework import LayerHelper, in_dygraph_mode
 def _number_count(numbers, upper_range):
......
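
For reference, a minimal sketch (hypothetical function, not in the patch) of the dynamic/static dispatch these MoE helpers use, with the imports taken from their new locations as shown above:

```python
from paddle.common_ops_import import check_variable_and_dtype
from paddle.framework import in_dygraph_mode

def describe_dispatch(numbers):
    # In dynamic-graph mode the helpers call the _legacy_C_ops kernel directly.
    if in_dygraph_mode():
        return "dygraph: call the _legacy_C_ops kernel"
    # In static-graph mode the input is validated first; the op itself is then
    # appended through LayerHelper (also re-exported from paddle.framework).
    check_variable_and_dtype(
        numbers, 'numbers', ['int32', 'int64'], 'number_count'
    )
    return "static: build the op with LayerHelper"
```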
@@ -66,7 +66,7 @@ def get_cluster_from_args(args, selected_gpus):
 def get_gpus(selected_gpus):
     if selected_gpus is None:
-        from paddle.fluid import core
+        from paddle.framework import core
         gpus_num = core.get_cuda_device_count()
         gpus = [str(x) for x in range(0, gpus_num)]
......
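
As a usage example, a small sketch of the fallback shown in `get_gpus` above, with `core` imported from its new home; `all_gpu_ids` is an illustrative name and the snippet assumes a CUDA build of Paddle.

```python
from paddle.framework import core

def all_gpu_ids():
    # When --gpus is not passed, the launcher enumerates every CUDA device
    # visible to the process via core.get_cuda_device_count().
    return [str(x) for x in range(core.get_cuda_device_count())]
```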
@@ -13,9 +13,8 @@
 # limitations under the License.
 from paddle import _legacy_C_ops
-from paddle.fluid.data_feeder import check_variable_and_dtype
-from paddle.fluid.framework import in_dygraph_mode
-from paddle.fluid.layer_helper import LayerHelper
+from paddle.common_ops_import import check_variable_and_dtype
+from paddle.framework import LayerHelper, in_dygraph_mode
 def global_scatter(
......
@@ -14,8 +14,8 @@
 import paddle
 import paddle.distributed as dist
-from paddle.fluid import core, layers
-from paddle.fluid.dygraph import base as imperative_base
+from paddle.autograd import no_grad
+from paddle.framework import core
 from paddle.nn import clip
 from paddle.nn.clip import ClipGradBase, _squared_l2_norm
@@ -142,25 +142,25 @@ class ClipGradForMOEByGlobalNorm(ClipGradBase):
         global_norm_var = []
         if len(sum_square_list_fp16) > 0:
-            global_norm_var_fp16 = layers.concat(sum_square_list_fp16)
+            global_norm_var_fp16 = paddle.concat(sum_square_list_fp16)
             global_norm_var_fp16 = paddle.sum(global_norm_var_fp16)
             global_norm_var.append(global_norm_var_fp16.astype(sum_dtype))
         if len(sum_square_list_fp32) > 0:
-            global_norm_var_fp32 = layers.concat(sum_square_list_fp32)
+            global_norm_var_fp32 = paddle.concat(sum_square_list_fp32)
             global_norm_var_fp32 = paddle.sum(global_norm_var_fp32)
             if sum_dtype == 'float32':
                 global_norm_var.append(global_norm_var_fp32)
             else:
                 global_norm_var.append(global_norm_var_fp32.astype(sum_dtype))
         if len(sum_square_list) > 0:
-            global_norm_var_fp64 = layers.concat(sum_square_list)
+            global_norm_var_fp64 = paddle.concat(sum_square_list)
             global_norm_var_fp64 = paddle.sum(global_norm_var_fp64)
             global_norm_var.append(global_norm_var_fp64)
-        global_norm_var = layers.concat(global_norm_var)
+        global_norm_var = paddle.concat(global_norm_var)
         global_norm_var = paddle.sum(global_norm_var)
         return global_norm_var, sum_dtype
-    @imperative_base.no_grad
+    @no_grad()
     def _dygraph_clip(self, params_grads):
         normal_params_grads = []
         moe_params_grads = []
@@ -210,8 +210,8 @@ class ClipGradForMOEByGlobalNorm(ClipGradBase):
         params_and_grads = []
         global_norm_var = paddle.sqrt(global_norm_var)
-        max_global_norm = layers.fill_constant(
-            shape=[1], dtype=global_norm_var.dtype, value=self.clip_norm
+        max_global_norm = paddle.full(
+            shape=[1], dtype=global_norm_var.dtype, fill_value=self.clip_norm
         )
         clip_var = paddle.divide(
             x=max_global_norm,
......
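
A note on the decorator swap above: `imperative_base.no_grad` was applied bare, while `paddle.autograd.no_grad` is instantiated, hence `@no_grad()`. A minimal sketch (illustrative function, not from the patch):

```python
import paddle
from paddle.autograd import no_grad

@no_grad()  # replaces @imperative_base.no_grad; note the trailing ()
def rescale_grads(grads, scale):
    # Gradient bookkeeping such as clipping should not itself be traced.
    return [g * scale for g in grads]

# Example: rescale_grads([paddle.ones([2])], 0.5) runs without recording ops.
```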
@@ -25,7 +25,7 @@ import paddle
 import paddle.nn as nn
 from paddle.autograd import PyLayer
 from paddle.distributed.utils.moe_utils import global_gather, global_scatter
-from paddle.fluid.framework import in_dygraph_mode
+from paddle.framework import in_dygraph_mode
 from paddle.incubate.distributed.fleet import recompute_hybrid
 from .gate import BaseGate, GShardGate, NaiveGate, SwitchGate
......
@@ -26,7 +26,7 @@ from paddle.distributed.models.moe.utils import (
     _number_count,
     _prune_gate_by_capacity,
 )
-from paddle.fluid.framework import in_dygraph_mode
+from paddle.framework import in_dygraph_mode
 def _alltoall(in_tensor_list, group=None, use_calc_stream=True):
......