BaiXuePrincess / Paddle (forked from PaddlePaddle / Paddle), commit dd827bbe
Unverified commit dd827bbe
Authored by wuhuachaocoding on Jan 13, 2023; committed via GitHub on Jan 13, 2023.

update fluid api. (#49731)

Parent: a1772bb8

Showing 11 changed files with 30 additions and 32 deletions (+30, -32)
Changed files:
  +1 -1  python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_optimizer_stage2.py
  +2 -3  python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage3.py
  +1 -1  python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_storage.py
  +8 -8  python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py
  +2 -1  python/paddle/distributed/fleet/recompute/recompute_hybrid.py
  +2 -3  python/paddle/distributed/models/moe/utils.py
  +1 -1  python/paddle/distributed/utils/launch_utils.py
  +2 -3  python/paddle/distributed/utils/moe_utils.py
  +9 -9  python/paddle/incubate/distributed/models/moe/grad_clip.py
  +1 -1  python/paddle/incubate/distributed/models/moe/moe_layer.py
  +1 -1  python/paddle/incubate/distributed/models/moe/utils.py
python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_optimizer_stage2.py

@@ -29,7 +29,7 @@ from collections import OrderedDict
 import paddle
 import paddle.distributed as dist
 from paddle.distributed import ParallelMode, fleet
-from paddle.fluid import core
+from paddle.framework import core
 from paddle.nn import ClipGradByGlobalNorm
 from paddle.optimizer import Optimizer
python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage3.py

@@ -20,12 +20,11 @@ import numpy as np
 import paddle
 import paddle.distributed as dist
-import paddle.fluid.core as core
-import paddle.fluid.framework as framework
-from paddle import nn
+from paddle import framework, nn
 from paddle.autograd import PyLayer
 from paddle.distributed import collective
 from paddle.fluid.framework import EagerParamBase
+from paddle.framework import core
 from paddle.nn import ClipGradByGlobalNorm
 from .group_sharded_storage import GradStorage
python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_storage.py

@@ -25,7 +25,7 @@
 import numpy as np
 import paddle
-from paddle.fluid import core
+from paddle.framework import core
 from .group_sharded_utils import Type, cvt_to_device, device_guard
python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py

@@ -20,9 +20,9 @@ import numpy as np
 import paddle
 from paddle import _legacy_C_ops
-from paddle.fluid import core, layers
+from paddle.common_ops_import import dygraph_only
 from paddle.fluid.dygraph import to_variable
-from paddle.fluid.framework import dygraph_only
+from paddle.framework import core
 from paddle.nn import clip

@@ -87,7 +87,7 @@ class GroupShardedClipGrad:
         if len(sum_square_fp16) == 0:
             global_norm_fp16 = paddle.to_tensor([0.0], dtype=paddle.float32)
         else:
-            global_norm_fp16 = layers.concat(sum_square_fp16)
+            global_norm_fp16 = paddle.concat(sum_square_fp16)
             global_norm_fp16 = paddle.sum(global_norm_fp16)
             global_norm_fp16 = paddle.cast(
                 global_norm_fp16, dtype=paddle.float32

@@ -97,7 +97,7 @@ class GroupShardedClipGrad:
         if len(unslice_params_fp16) == 0:
             global_unslice_fp16 = paddle.to_tensor([0.0], dtype=paddle.float32)
         else:
-            global_unslice_fp16 = layers.concat(unslice_params_fp16)
+            global_unslice_fp16 = paddle.concat(unslice_params_fp16)
             global_unslice_fp16 = paddle.sum(global_unslice_fp16)
             global_unslice_fp16 = paddle.cast(
                 global_unslice_fp16, dtype=paddle.float32

@@ -105,7 +105,7 @@ class GroupShardedClipGrad:
         # global norm of non-distributed FP32 params_and_grads
         global_norm_fp32 = (
-            layers.concat(sum_square_fp32)
+            paddle.concat(sum_square_fp32)
             if len(sum_square_fp32) != 0
             else paddle.to_tensor([0.0], dtype=paddle.float32)
         )

@@ -113,7 +113,7 @@ class GroupShardedClipGrad:
         # global norm of non-distributed FP32 params_and_grads for unslice parameters
         global_unslice_fp32 = (
-            layers.concat(unslice_params_fp32)
+            paddle.concat(unslice_params_fp32)
             if len(unslice_params_fp32) != 0
             else paddle.to_tensor([0.0], dtype=paddle.float32)
         )

@@ -131,8 +131,8 @@ class GroupShardedClipGrad:
         paddle.distributed.all_reduce(global_norm_var, group=self._group)
         global_norm_var = paddle.sqrt(global_norm_var + global_unslice_var)
-        max_global_norm = layers.fill_constant(
-            shape=[1], dtype=global_norm_var.dtype, value=self.clip_norm
+        max_global_norm = paddle.full(
+            shape=[1], dtype=global_norm_var.dtype, fill_value=self.clip_norm
         )
         clip_var = paddle.divide(
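The last hunk above swaps layers.fill_constant for paddle.full; the keyword argument changes from value to fill_value while shape and dtype keep their names. A minimal sketch of the equivalence (the clip_norm value here is illustrative):

    import paddle

    clip_norm = 1.0
    # Removed in this commit:
    #   max_global_norm = layers.fill_constant(shape=[1], dtype='float32', value=clip_norm)
    # Added in this commit:
    max_global_norm = paddle.full(shape=[1], dtype='float32', fill_value=clip_norm)
    print(max_global_norm)  # Tensor of shape [1] holding 1.0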
python/paddle/distributed/fleet/recompute/recompute_hybrid.py

@@ -13,8 +13,9 @@
 # limitations under the License.
 import paddle
+from paddle import framework
 from paddle.autograd import PyLayer
-from paddle.fluid import core, framework
+from paddle.framework import core
 from ..meta_parallel.parallel_layers.random import get_rng_state_tracker
 from ..meta_parallel.pp_utils import utils
python/paddle/distributed/models/moe/utils.py

@@ -13,9 +13,8 @@
 # limitations under the License.
 from paddle import _legacy_C_ops
-from paddle.fluid.data_feeder import check_variable_and_dtype
-from paddle.fluid.framework import in_dygraph_mode
-from paddle.fluid.layer_helper import LayerHelper
+from paddle.common_ops_import import check_variable_and_dtype
+from paddle.framework import LayerHelper, in_dygraph_mode

 def _number_count(numbers, upper_range):
python/paddle/distributed/utils/launch_utils.py

@@ -66,7 +66,7 @@ def get_cluster_from_args(args, selected_gpus):
 def get_gpus(selected_gpus):
     if selected_gpus is None:
-        from paddle.fluid import core
+        from paddle.framework import core
         gpus_num = core.get_cuda_device_count()
         gpus = [str(x) for x in range(0, gpus_num)]
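The deferred import inside get_gpus now resolves core from paddle.framework instead of paddle.fluid; the call it guards is unchanged. A minimal usage sketch (assumes a Paddle build where this import path is available, as in this commit):

    from paddle.framework import core

    # Number of visible CUDA devices; expected to be 0 on CPU-only builds.
    print(core.get_cuda_device_count())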
python/paddle/distributed/utils/moe_utils.py

@@ -13,9 +13,8 @@
 # limitations under the License.
 from paddle import _legacy_C_ops
-from paddle.fluid.data_feeder import check_variable_and_dtype
-from paddle.fluid.framework import in_dygraph_mode
-from paddle.fluid.layer_helper import LayerHelper
+from paddle.common_ops_import import check_variable_and_dtype
+from paddle.framework import LayerHelper, in_dygraph_mode

 def global_scatter(
python/paddle/incubate/distributed/models/moe/grad_clip.py

@@ -14,8 +14,8 @@
 import paddle
 import paddle.distributed as dist
-from paddle.fluid import core, layers
-from paddle.fluid.dygraph import base as imperative_base
+from paddle.autograd import no_grad
+from paddle.framework import core
 from paddle.nn import clip
 from paddle.nn.clip import ClipGradBase, _squared_l2_norm

@@ -142,25 +142,25 @@ class ClipGradForMOEByGlobalNorm(ClipGradBase):
         global_norm_var = []
         if len(sum_square_list_fp16) > 0:
-            global_norm_var_fp16 = layers.concat(sum_square_list_fp16)
+            global_norm_var_fp16 = paddle.concat(sum_square_list_fp16)
             global_norm_var_fp16 = paddle.sum(global_norm_var_fp16)
             global_norm_var.append(global_norm_var_fp16.astype(sum_dtype))
         if len(sum_square_list_fp32) > 0:
-            global_norm_var_fp32 = layers.concat(sum_square_list_fp32)
+            global_norm_var_fp32 = paddle.concat(sum_square_list_fp32)
             global_norm_var_fp32 = paddle.sum(global_norm_var_fp32)
             if sum_dtype == 'float32':
                 global_norm_var.append(global_norm_var_fp32)
             else:
                 global_norm_var.append(global_norm_var_fp32.astype(sum_dtype))
         if len(sum_square_list) > 0:
-            global_norm_var_fp64 = layers.concat(sum_square_list)
+            global_norm_var_fp64 = paddle.concat(sum_square_list)
             global_norm_var_fp64 = paddle.sum(global_norm_var_fp64)
             global_norm_var.append(global_norm_var_fp64)
-        global_norm_var = layers.concat(global_norm_var)
+        global_norm_var = paddle.concat(global_norm_var)
         global_norm_var = paddle.sum(global_norm_var)
         return global_norm_var, sum_dtype

-    @imperative_base.no_grad
+    @no_grad()
     def _dygraph_clip(self, params_grads):
         normal_params_grads = []
         moe_params_grads = []

@@ -210,8 +210,8 @@ class ClipGradForMOEByGlobalNorm(ClipGradBase):
         params_and_grads = []
         global_norm_var = paddle.sqrt(global_norm_var)
-        max_global_norm = layers.fill_constant(
-            shape=[1], dtype=global_norm_var.dtype, value=self.clip_norm
+        max_global_norm = paddle.full(
+            shape=[1], dtype=global_norm_var.dtype, fill_value=self.clip_norm
         )
         clip_var = paddle.divide(
             x=max_global_norm,
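Two patterns recur in grad_clip.py above: layers.concat and layers.fill_constant become paddle.concat and paddle.full, and the imperative_base.no_grad decorator becomes the public paddle.autograd.no_grad applied as a called decorator, @no_grad(). A minimal sketch of the new style, with a hypothetical helper standing in for _dygraph_clip and illustrative tensors (paddle.maximum here replaces whatever elementwise max the real code uses):

    import paddle
    from paddle.autograd import no_grad

    @no_grad()  # called-decorator form used by the updated _dygraph_clip
    def clip_ratio_sketch(squared_norms, clip_norm=1.0):
        # squared_norms: list of shape-[1] tensors, e.g. from _squared_l2_norm
        if len(squared_norms) == 0:
            return paddle.to_tensor([1.0], dtype=paddle.float32)
        global_norm = paddle.sqrt(paddle.sum(paddle.concat(squared_norms)))
        max_norm = paddle.full(shape=[1], dtype=global_norm.dtype, fill_value=clip_norm)
        return paddle.divide(x=max_norm, y=paddle.maximum(global_norm, max_norm))

    # Usage with hypothetical values: sqrt(4 + 9) ≈ 3.61, so the ratio is ≈ 1.0 / 3.61 ≈ 0.28.
    print(clip_ratio_sketch([paddle.to_tensor([4.0]), paddle.to_tensor([9.0])]))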
python/paddle/incubate/distributed/models/moe/moe_layer.py

@@ -25,7 +25,7 @@ import paddle
 import paddle.nn as nn
 from paddle.autograd import PyLayer
 from paddle.distributed.utils.moe_utils import global_gather, global_scatter
-from paddle.fluid.framework import in_dygraph_mode
+from paddle.framework import in_dygraph_mode
 from paddle.incubate.distributed.fleet import recompute_hybrid
 from .gate import BaseGate, GShardGate, NaiveGate, SwitchGate
python/paddle/incubate/distributed/models/moe/utils.py

@@ -26,7 +26,7 @@ from paddle.distributed.models.moe.utils import (
     _number_count,
     _prune_gate_by_capacity,
 )
-from paddle.fluid.framework import in_dygraph_mode
+from paddle.framework import in_dygraph_mode

 def _alltoall(in_tensor_list, group=None, use_calc_stream=True):