From dd827bbe87a9dd86c14b5ad82540ccb9a6f7f7cb Mon Sep 17 00:00:00 2001 From: wuhuachaocoding <77733235+wuhuachaocoding@users.noreply.github.com> Date: Fri, 13 Jan 2023 15:46:47 +0800 Subject: [PATCH] update fluid api. (#49731) --- .../sharding/group_sharded_optimizer_stage2.py | 2 +- .../sharding/group_sharded_stage3.py | 5 ++--- .../sharding/group_sharded_storage.py | 2 +- .../sharding/group_sharded_utils.py | 16 ++++++++-------- .../fleet/recompute/recompute_hybrid.py | 3 ++- python/paddle/distributed/models/moe/utils.py | 5 ++--- .../paddle/distributed/utils/launch_utils.py | 2 +- python/paddle/distributed/utils/moe_utils.py | 5 ++--- .../distributed/models/moe/grad_clip.py | 18 +++++++++--------- .../distributed/models/moe/moe_layer.py | 2 +- .../incubate/distributed/models/moe/utils.py | 2 +- 11 files changed, 30 insertions(+), 32 deletions(-) diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_optimizer_stage2.py b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_optimizer_stage2.py index f5ca60b100..00ec12a523 100644 --- a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_optimizer_stage2.py +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_optimizer_stage2.py @@ -29,7 +29,7 @@ from collections import OrderedDict import paddle import paddle.distributed as dist from paddle.distributed import ParallelMode, fleet -from paddle.fluid import core +from paddle.framework import core from paddle.nn import ClipGradByGlobalNorm from paddle.optimizer import Optimizer diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage3.py b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage3.py index f792e0a538..768953eed0 100644 --- a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage3.py +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage3.py @@ -20,12 +20,11 @@ import numpy as np import paddle import paddle.distributed as dist -import paddle.fluid.core as core -import paddle.fluid.framework as framework -from paddle import nn +from paddle import framework, nn from paddle.autograd import PyLayer from paddle.distributed import collective from paddle.fluid.framework import EagerParamBase +from paddle.framework import core from paddle.nn import ClipGradByGlobalNorm from .group_sharded_storage import GradStorage diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_storage.py b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_storage.py index abab68a191..c179e1d4dd 100644 --- a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_storage.py +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_storage.py @@ -25,7 +25,7 @@ import numpy as np import paddle -from paddle.fluid import core +from paddle.framework import core from .group_sharded_utils import Type, cvt_to_device, device_guard diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py index f8c86e02b7..b1ab777964 100644 --- a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py @@ -20,9 +20,9 @@ import numpy as np import paddle from paddle import _legacy_C_ops -from paddle.fluid import core, layers +from paddle.common_ops_import import dygraph_only 
from paddle.fluid.dygraph import to_variable -from paddle.fluid.framework import dygraph_only +from paddle.framework import core from paddle.nn import clip @@ -87,7 +87,7 @@ class GroupShardedClipGrad: if len(sum_square_fp16) == 0: global_norm_fp16 = paddle.to_tensor([0.0], dtype=paddle.float32) else: - global_norm_fp16 = layers.concat(sum_square_fp16) + global_norm_fp16 = paddle.concat(sum_square_fp16) global_norm_fp16 = paddle.sum(global_norm_fp16) global_norm_fp16 = paddle.cast( global_norm_fp16, dtype=paddle.float32 @@ -97,7 +97,7 @@ class GroupShardedClipGrad: if len(unslice_params_fp16) == 0: global_unslice_fp16 = paddle.to_tensor([0.0], dtype=paddle.float32) else: - global_unslice_fp16 = layers.concat(unslice_params_fp16) + global_unslice_fp16 = paddle.concat(unslice_params_fp16) global_unslice_fp16 = paddle.sum(global_unslice_fp16) global_unslice_fp16 = paddle.cast( global_unslice_fp16, dtype=paddle.float32 @@ -105,7 +105,7 @@ class GroupShardedClipGrad: # global norm of non-distributed FP32 params_and_grads global_norm_fp32 = ( - layers.concat(sum_square_fp32) + paddle.concat(sum_square_fp32) if len(sum_square_fp32) != 0 else paddle.to_tensor([0.0], dtype=paddle.float32) ) @@ -113,7 +113,7 @@ class GroupShardedClipGrad: # global norm of non-distributed FP32 params_and_grads for unslice parameters global_unslice_fp32 = ( - layers.concat(unslice_params_fp32) + paddle.concat(unslice_params_fp32) if len(unslice_params_fp32) != 0 else paddle.to_tensor([0.0], dtype=paddle.float32) ) @@ -131,8 +131,8 @@ class GroupShardedClipGrad: paddle.distributed.all_reduce(global_norm_var, group=self._group) global_norm_var = paddle.sqrt(global_norm_var + global_unslice_var) - max_global_norm = layers.fill_constant( - shape=[1], dtype=global_norm_var.dtype, value=self.clip_norm + max_global_norm = paddle.full( + shape=[1], dtype=global_norm_var.dtype, fill_value=self.clip_norm ) clip_var = paddle.divide( diff --git a/python/paddle/distributed/fleet/recompute/recompute_hybrid.py b/python/paddle/distributed/fleet/recompute/recompute_hybrid.py index db5166917e..781f44e406 100644 --- a/python/paddle/distributed/fleet/recompute/recompute_hybrid.py +++ b/python/paddle/distributed/fleet/recompute/recompute_hybrid.py @@ -13,8 +13,9 @@ # limitations under the License. import paddle +from paddle import framework from paddle.autograd import PyLayer -from paddle.fluid import core, framework +from paddle.framework import core from ..meta_parallel.parallel_layers.random import get_rng_state_tracker from ..meta_parallel.pp_utils import utils diff --git a/python/paddle/distributed/models/moe/utils.py b/python/paddle/distributed/models/moe/utils.py index 89c6add474..8a9d199cee 100644 --- a/python/paddle/distributed/models/moe/utils.py +++ b/python/paddle/distributed/models/moe/utils.py @@ -13,9 +13,8 @@ # limitations under the License. 
from paddle import _legacy_C_ops -from paddle.fluid.data_feeder import check_variable_and_dtype -from paddle.fluid.framework import in_dygraph_mode -from paddle.fluid.layer_helper import LayerHelper +from paddle.common_ops_import import check_variable_and_dtype +from paddle.framework import LayerHelper, in_dygraph_mode def _number_count(numbers, upper_range): diff --git a/python/paddle/distributed/utils/launch_utils.py b/python/paddle/distributed/utils/launch_utils.py index 76642fe761..c922eef9cc 100644 --- a/python/paddle/distributed/utils/launch_utils.py +++ b/python/paddle/distributed/utils/launch_utils.py @@ -66,7 +66,7 @@ def get_cluster_from_args(args, selected_gpus): def get_gpus(selected_gpus): if selected_gpus is None: - from paddle.fluid import core + from paddle.framework import core gpus_num = core.get_cuda_device_count() gpus = [str(x) for x in range(0, gpus_num)] diff --git a/python/paddle/distributed/utils/moe_utils.py b/python/paddle/distributed/utils/moe_utils.py index 6266537a40..ae18938941 100644 --- a/python/paddle/distributed/utils/moe_utils.py +++ b/python/paddle/distributed/utils/moe_utils.py @@ -13,9 +13,8 @@ # limitations under the License. from paddle import _legacy_C_ops -from paddle.fluid.data_feeder import check_variable_and_dtype -from paddle.fluid.framework import in_dygraph_mode -from paddle.fluid.layer_helper import LayerHelper +from paddle.common_ops_import import check_variable_and_dtype +from paddle.framework import LayerHelper, in_dygraph_mode def global_scatter( diff --git a/python/paddle/incubate/distributed/models/moe/grad_clip.py b/python/paddle/incubate/distributed/models/moe/grad_clip.py index 6bee79b871..a386347ce2 100644 --- a/python/paddle/incubate/distributed/models/moe/grad_clip.py +++ b/python/paddle/incubate/distributed/models/moe/grad_clip.py @@ -14,8 +14,8 @@ import paddle import paddle.distributed as dist -from paddle.fluid import core, layers -from paddle.fluid.dygraph import base as imperative_base +from paddle.autograd import no_grad +from paddle.framework import core from paddle.nn import clip from paddle.nn.clip import ClipGradBase, _squared_l2_norm @@ -142,25 +142,25 @@ class ClipGradForMOEByGlobalNorm(ClipGradBase): global_norm_var = [] if len(sum_square_list_fp16) > 0: - global_norm_var_fp16 = layers.concat(sum_square_list_fp16) + global_norm_var_fp16 = paddle.concat(sum_square_list_fp16) global_norm_var_fp16 = paddle.sum(global_norm_var_fp16) global_norm_var.append(global_norm_var_fp16.astype(sum_dtype)) if len(sum_square_list_fp32) > 0: - global_norm_var_fp32 = layers.concat(sum_square_list_fp32) + global_norm_var_fp32 = paddle.concat(sum_square_list_fp32) global_norm_var_fp32 = paddle.sum(global_norm_var_fp32) if sum_dtype == 'float32': global_norm_var.append(global_norm_var_fp32) else: global_norm_var.append(global_norm_var_fp32.astype(sum_dtype)) if len(sum_square_list) > 0: - global_norm_var_fp64 = layers.concat(sum_square_list) + global_norm_var_fp64 = paddle.concat(sum_square_list) global_norm_var_fp64 = paddle.sum(global_norm_var_fp64) global_norm_var.append(global_norm_var_fp64) - global_norm_var = layers.concat(global_norm_var) + global_norm_var = paddle.concat(global_norm_var) global_norm_var = paddle.sum(global_norm_var) return global_norm_var, sum_dtype - @imperative_base.no_grad + @no_grad() def _dygraph_clip(self, params_grads): normal_params_grads = [] moe_params_grads = [] @@ -210,8 +210,8 @@ class ClipGradForMOEByGlobalNorm(ClipGradBase): params_and_grads = [] global_norm_var = paddle.sqrt(global_norm_var) - 
max_global_norm = layers.fill_constant( - shape=[1], dtype=global_norm_var.dtype, value=self.clip_norm + max_global_norm = paddle.full( + shape=[1], dtype=global_norm_var.dtype, fill_value=self.clip_norm ) clip_var = paddle.divide( x=max_global_norm, diff --git a/python/paddle/incubate/distributed/models/moe/moe_layer.py b/python/paddle/incubate/distributed/models/moe/moe_layer.py index e5456cf378..52951c499c 100644 --- a/python/paddle/incubate/distributed/models/moe/moe_layer.py +++ b/python/paddle/incubate/distributed/models/moe/moe_layer.py @@ -25,7 +25,7 @@ import paddle import paddle.nn as nn from paddle.autograd import PyLayer from paddle.distributed.utils.moe_utils import global_gather, global_scatter -from paddle.fluid.framework import in_dygraph_mode +from paddle.framework import in_dygraph_mode from paddle.incubate.distributed.fleet import recompute_hybrid from .gate import BaseGate, GShardGate, NaiveGate, SwitchGate diff --git a/python/paddle/incubate/distributed/models/moe/utils.py b/python/paddle/incubate/distributed/models/moe/utils.py index aa952f878b..e7c0fa9af5 100644 --- a/python/paddle/incubate/distributed/models/moe/utils.py +++ b/python/paddle/incubate/distributed/models/moe/utils.py @@ -26,7 +26,7 @@ from paddle.distributed.models.moe.utils import ( _number_count, _prune_gate_by_capacity, ) -from paddle.fluid.framework import in_dygraph_mode +from paddle.framework import in_dygraph_mode def _alltoall(in_tensor_list, group=None, use_calc_stream=True): -- GitLab
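
For anyone applying the same migration elsewhere, the substitutions this patch makes are mechanical: `paddle.fluid.core` becomes `paddle.framework.core`, `layers.concat` becomes `paddle.concat`, `layers.fill_constant(..., value=...)` becomes `paddle.full(..., fill_value=...)`, and the legacy `@imperative_base.no_grad` decorator becomes `@no_grad()` from `paddle.autograd`. The sketch below is illustrative only and is not part of the patch; `clip_by_global_norm` is a hypothetical helper, and it assumes a Paddle 2.x install where these public APIs are available.

    # Hypothetical sketch of the post-migration API usage (not part of the patch).
    import paddle
    from paddle.autograd import no_grad  # replaces `from paddle.fluid.dygraph import base as imperative_base`


    @no_grad()  # replaces the legacy `@imperative_base.no_grad`
    def clip_by_global_norm(grads, clip_norm=1.0):
        # paddle.concat replaces layers.concat
        squared = paddle.concat([paddle.sum(paddle.square(g)).reshape([1]) for g in grads])
        global_norm = paddle.sqrt(paddle.sum(squared))
        # paddle.full(fill_value=...) replaces layers.fill_constant(value=...)
        max_norm = paddle.full(shape=[1], dtype=global_norm.dtype, fill_value=clip_norm)
        scale = paddle.divide(x=max_norm, y=paddle.maximum(global_norm, max_norm))
        return [g * scale for g in grads]


    if __name__ == "__main__":
        clipped = clip_by_global_norm([paddle.rand([4, 4]), paddle.rand([8])], clip_norm=1.0)
        print([g.shape for g in clipped])

The same pattern (`paddle.common_ops_import` for `check_variable_and_dtype`/`dygraph_only`, `paddle.framework` for `LayerHelper` and `in_dygraph_mode`) covers the import-only changes in the MoE utilities.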