Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
BaiXuePrincess
Paddle
提交
983ae1d7
P
Paddle
项目概览
BaiXuePrincess
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
983ae1d7
编写于
12月 25, 2022
作者:
W
wanghuancoder
提交者:
GitHub
12月 25, 2022
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
delete legacy dygraph code in python/paddle/distributed (#49304)
* delete legacy dygraph code in python/paddle/distributed * refine
上级
91cdd295
变更
26
显示空白变更内容
内联
并排
Showing
26 changed file
with
587 addition
and
881 deletion
+587
-881
python/paddle/distributed/auto_parallel/engine.py
python/paddle/distributed/auto_parallel/engine.py
+4
-4
python/paddle/distributed/auto_parallel/process_group.py
python/paddle/distributed/auto_parallel/process_group.py
+2
-2
python/paddle/distributed/collective.py
python/paddle/distributed/collective.py
+2
-2
python/paddle/distributed/communication/all_gather.py
python/paddle/distributed/communication/all_gather.py
+1
-33
python/paddle/distributed/communication/all_reduce.py
python/paddle/distributed/communication/all_reduce.py
+3
-30
python/paddle/distributed/communication/all_to_all.py
python/paddle/distributed/communication/all_to_all.py
+11
-27
python/paddle/distributed/communication/broadcast.py
python/paddle/distributed/communication/broadcast.py
+5
-28
python/paddle/distributed/communication/group.py
python/paddle/distributed/communication/group.py
+32
-31
python/paddle/distributed/communication/recv.py
python/paddle/distributed/communication/recv.py
+2
-25
python/paddle/distributed/communication/reduce.py
python/paddle/distributed/communication/reduce.py
+8
-10
python/paddle/distributed/communication/reduce_scatter.py
python/paddle/distributed/communication/reduce_scatter.py
+16
-19
python/paddle/distributed/communication/scatter.py
python/paddle/distributed/communication/scatter.py
+1
-34
python/paddle/distributed/communication/send.py
python/paddle/distributed/communication/send.py
+2
-23
python/paddle/distributed/fleet/fleet.py
python/paddle/distributed/fleet/fleet.py
+246
-204
python/paddle/distributed/fleet/layers/mpu/mp_ops.py
python/paddle/distributed/fleet/layers/mpu/mp_ops.py
+124
-152
python/paddle/distributed/fleet/layers/mpu/random.py
python/paddle/distributed/fleet/layers/mpu/random.py
+29
-28
python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py
...paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py
+2
-2
python/paddle/distributed/fleet/meta_parallel/pp_utils/p2p_communication.py
...ributed/fleet/meta_parallel/pp_utils/p2p_communication.py
+26
-79
python/paddle/distributed/fleet/optimizer.py
python/paddle/distributed/fleet/optimizer.py
+2
-2
python/paddle/distributed/fleet/utils/hybrid_parallel_inference.py
...ddle/distributed/fleet/utils/hybrid_parallel_inference.py
+3
-2
python/paddle/distributed/fleet/utils/hybrid_parallel_util.py
...on/paddle/distributed/fleet/utils/hybrid_parallel_util.py
+6
-34
python/paddle/distributed/models/moe/utils.py
python/paddle/distributed/models/moe/utils.py
+26
-40
python/paddle/distributed/utils/moe_utils.py
python/paddle/distributed/utils/moe_utils.py
+3
-3
python/paddle/distribution/dirichlet.py
python/paddle/distribution/dirichlet.py
+1
-3
python/paddle/distribution/distribution.py
python/paddle/distribution/distribution.py
+4
-13
python/paddle/distribution/uniform.py
python/paddle/distribution/uniform.py
+26
-51
未找到文件。
python/paddle/distributed/auto_parallel/engine.py
浏览文件 @
983ae1d7
...
...
@@ -31,7 +31,7 @@ from paddle.fluid.dygraph.parallel import ParallelEnv
from
paddle.fluid.executor
import
_to_name_str
,
global_scope
from
paddle.fluid.framework
import
Operator
from
paddle.fluid.framework
import
_current_expected_place
as
_get_device
from
paddle.fluid.framework
import
_non_static
_mode
from
paddle.fluid.framework
import
in_dygraph
_mode
from
paddle.fluid.layers.utils
import
flatten
from
paddle.metric
import
Metric
from
paddle.static
import
InputSpec
...
...
@@ -300,7 +300,7 @@ class Engine:
return
inputs_spec
,
labels_spec
def
_prepare_data_tensor
(
self
,
inputs_spec
,
labels_spec
,
inputs
,
labels
):
if
_non_static
_mode
()
or
self
.
_dygraph_mode
:
if
in_dygraph
_mode
()
or
self
.
_dygraph_mode
:
raise
ValueError
(
"Only support static graph mode."
)
if
inputs_spec
:
...
...
@@ -512,7 +512,7 @@ class Engine:
self
.
_has_prepared
[
mode
]
=
True
def
_build
(
self
,
mode
):
if
_non_static
_mode
()
or
self
.
_dygraph_mode
:
if
in_dygraph
_mode
()
or
self
.
_dygraph_mode
:
paddle
.
disable_static
()
self
.
_dygraph_mode
=
True
self
.
_logger
.
info
(
"Building model with 'to_static' method."
)
...
...
@@ -1713,7 +1713,7 @@ class Engine:
self
.
_build
(
mode
)
self
.
_plan
(
mode
)
else
:
if
_non_static
_mode
()
or
self
.
_dygraph_mode
:
if
in_dygraph
_mode
()
or
self
.
_dygraph_mode
:
raise
ValueError
(
"Please call `prepare()` or `fit()` or `evaluate()` or `predict()` before calling `cost()`."
)
...
...
python/paddle/distributed/auto_parallel/process_group.py
浏览文件 @
983ae1d7
...
...
@@ -17,8 +17,8 @@ from collections import OrderedDict
import
paddle
import
paddle.fluid.core
as
core
from
paddle
import
_legacy_C_ops
from
paddle.fluid.framework
import
in_dygraph_mode
from
...fluid.framework
import
_non_static_mode
from
...fluid.layers.tensor
import
fill_constant
from
..collective
import
_get_global_env
,
_new_ring_id
...
...
@@ -154,7 +154,7 @@ class ProcessGroup:
)
tmp
=
(
paddle
.
to_tensor
([
1
],
dtype
=
"int32"
)
if
_non_static
_mode
()
if
in_dygraph
_mode
()
else
fill_constant
([
0
],
dtype
=
"int32"
,
value
=
"1"
)
)
# use legacy ops
...
...
python/paddle/distributed/collective.py
浏览文件 @
983ae1d7
...
...
@@ -18,7 +18,7 @@ import paddle
# (TODO: GhostScreaming) It will be removed later.
import
paddle.fluid.core
as
core
from
paddle.framework
import
_non_static_mode
,
in_dygraph_mode
from
paddle.framework
import
in_dygraph_mode
from
.communication.group
import
Group
,
_add_new_group
,
is_initialized
from
.fleet.layers.mpu.mp_ops
import
_c_concat
# noqa: F401
...
...
@@ -301,7 +301,7 @@ def new_group(ranks=None, backend=None, timeout=_default_timeout):
# hang caused by cross-creation of new_group
tmp
=
(
paddle
.
to_tensor
([
1
],
dtype
=
"int32"
)
if
_non_static
_mode
()
if
in_dygraph
_mode
()
else
paddle
.
full
([
0
],
1
,
dtype
=
"int32"
)
)
paddle
.
distributed
.
all_reduce
(
tmp
,
sync_op
=
True
)
...
...
python/paddle/distributed/communication/all_gather.py
浏览文件 @
983ae1d7
...
...
@@ -18,7 +18,6 @@ import pickle
import
numpy
as
np
import
paddle
import
paddle.distributed
as
dist
import
paddle.distributed.communication.stream
as
stream
import
paddle.fluid.framework
as
framework
...
...
@@ -64,39 +63,8 @@ def all_gather(tensor_list, tensor, group=None, sync_op=True):
print(tensor_list)
# [[[4, 5, 6], [4, 5, 6]], [[1, 2, 3], [1, 2, 3]]] (2 GPUs)
"""
if
not
framework
.
_in_legacy_dygraph
():
return
stream
.
all_gather
(
tensor_list
,
tensor
,
group
,
sync_op
)
# NOTE: uncomment code below when having fully complex support
# def convert_to_complex(list_of_tensor):
# list_of_complex = []
# for tensor in list_of_tensor:
# list_of_complex.append(paddle.as_complex(tensor))
# return list_of_complex
# is_input_complex = (tensor.dtype == paddle.complex64
# or tensor.dtype == paddle.complex128)
# if is_input_complex:
# tensor = paddle.as_real(tensor)
# code below will be removed after we remove the old dygraph
if
group
is
not
None
and
not
group
.
is_member
():
return
ring_id
=
0
if
group
is
None
else
group
.
id
nranks
=
dist
.
get_world_size
()
out
=
paddle
.
_legacy_C_ops
.
c_allgather
(
tensor
,
'use_calc_stream'
,
sync_op
,
'ring_id'
,
ring_id
,
'nranks'
,
nranks
,
)
tensor_list
.
clear
()
tensor_list
.
extend
(
paddle
.
split
(
out
,
nranks
,
0
))
def
_convert_object_to_tensor
(
obj
):
_pickler
=
pickle
.
Pickler
...
...
python/paddle/distributed/communication/all_reduce.py
浏览文件 @
983ae1d7
...
...
@@ -12,9 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import
paddle
import
paddle.distributed.communication.stream
as
stream
import
paddle.fluid.framework
as
framework
from
paddle.distributed.communication.reduce
import
ReduceOp
...
...
@@ -57,31 +55,6 @@ def all_reduce(tensor, op=ReduceOp.SUM, group=None, sync_op=True):
print(data)
# [[5, 7, 9], [5, 7, 9]] (2 GPUs)
"""
if
not
framework
.
_in_legacy_dygraph
():
return
stream
.
all_reduce
(
tensor
,
op
=
op
,
group
=
group
,
sync_op
=
sync_op
,
use_calc_stream
=
False
)
# code below will be removed after we remove the old dygraph
if
group
is
not
None
and
not
group
.
is_member
():
return
use_calc_stream
=
sync_op
ring_id
=
0
if
group
is
None
else
group
.
id
if
op
==
ReduceOp
.
SUM
:
return
paddle
.
_legacy_C_ops
.
c_allreduce_sum_
(
tensor
,
'use_calc_stream'
,
use_calc_stream
,
'ring_id'
,
ring_id
)
elif
op
==
ReduceOp
.
MAX
:
return
paddle
.
_legacy_C_ops
.
c_allreduce_max_
(
tensor
,
'use_calc_stream'
,
use_calc_stream
,
'ring_id'
,
ring_id
)
elif
op
==
ReduceOp
.
MIN
:
return
paddle
.
_legacy_C_ops
.
c_allreduce_min_
(
tensor
,
'use_calc_stream'
,
use_calc_stream
,
'ring_id'
,
ring_id
)
elif
op
==
ReduceOp
.
PROD
:
return
paddle
.
_legacy_C_ops
.
c_allreduce_prod_
(
tensor
,
'use_calc_stream'
,
use_calc_stream
,
'ring_id'
,
ring_id
)
else
:
raise
ValueError
(
"Unknown parameter: {}."
.
format
(
op
))
python/paddle/distributed/communication/all_to_all.py
浏览文件 @
983ae1d7
...
...
@@ -12,9 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import
paddle
import
paddle.distributed.communication.stream
as
stream
import
paddle.fluid.framework
as
framework
def
alltoall
(
in_tensor_list
,
out_tensor_list
,
group
=
None
,
sync_op
=
True
):
...
...
@@ -59,23 +57,10 @@ def alltoall(in_tensor_list, out_tensor_list, group=None, sync_op=True):
# [[[1, 2, 3], [4, 5, 6]], [[13, 14, 15], [16, 17, 18]]] (2 GPUs, out for rank 0)
# [[[7, 8, 9], [10, 11, 12]], [[19, 20, 21], [22, 23, 24]]] (2 GPUs, out for rank 1)
"""
if
not
framework
.
_in_legacy_dygraph
():
return
stream
.
alltoall
(
out_tensor_list
,
in_tensor_list
,
group
,
sync_op
,
False
)
# code below will be removed after we remove the old dygraph
if
group
is
not
None
and
not
group
.
is_member
():
return
ring_id
=
0
if
group
is
None
else
group
.
id
temp
=
paddle
.
concat
(
in_tensor_list
,
axis
=
0
)
nranks
=
len
(
in_tensor_list
)
use_calc_stream
=
sync_op
out
=
paddle
.
_legacy_C_ops
.
alltoall
(
temp
,
'use_calc_stream'
,
use_calc_stream
,
'ring_id'
,
ring_id
)
out_tensor_list
.
extend
(
paddle
.
split
(
out
,
nranks
,
0
))
def
alltoall_single
(
in_tensor
,
...
...
@@ -149,7 +134,6 @@ def alltoall_single(
# output for rank 1: [[0., 0.], [0., 0.], [1., 1.], [1., 1.]]
"""
if
not
framework
.
_in_legacy_dygraph
():
return
stream
.
alltoall_single
(
out_tensor
,
in_tensor
,
...
...
python/paddle/distributed/communication/broadcast.py
浏览文件 @
983ae1d7
...
...
@@ -12,9 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import
paddle
import
paddle.distributed.communication.stream
as
stream
import
paddle.fluid.framework
as
framework
def
broadcast
(
tensor
,
src
,
group
=
None
,
sync_op
=
True
):
...
...
@@ -55,7 +53,6 @@ def broadcast(tensor, src, group=None, sync_op=True):
print(data)
# [[1, 2, 3], [1, 2, 3]] (2 GPUs)
"""
if
not
framework
.
_in_legacy_dygraph
():
return
stream
.
broadcast
(
tensor
,
src
,
...
...
@@ -63,23 +60,3 @@ def broadcast(tensor, src, group=None, sync_op=True):
sync_op
=
sync_op
,
use_calc_stream
=
False
,
)
# code below will be removed after we remove the old dygraph
if
group
is
not
None
and
not
group
.
is_member
():
return
use_calc_stream
=
sync_op
ring_id
=
0
if
group
is
None
else
group
.
id
gsrc
=
src
if
group
is
None
else
group
.
get_group_rank
(
src
)
assert
gsrc
>=
0
,
"src rank out of group, need global rank"
return
paddle
.
_legacy_C_ops
.
c_broadcast
(
tensor
,
tensor
,
'root'
,
gsrc
,
'use_calc_stream'
,
use_calc_stream
,
'ring_id'
,
ring_id
,
)
python/paddle/distributed/communication/group.py
浏览文件 @
983ae1d7
...
...
@@ -19,6 +19,7 @@ import paddle.distributed as dist
import
paddle.fluid.core
as
core
import
paddle.fluid.framework
as
framework
import
paddle.fluid.layer_helper
as
layer_helper
from
paddle.fluid.framework
import
in_dygraph_mode
class
Group
:
...
...
@@ -235,9 +236,9 @@ def get_group(id=0):
def
_sync_calc_stream
(
tensor
):
if
framework
.
_non_static
_mode
():
if
in_dygraph
_mode
():
return
paddle
.
_legacy_C_ops
.
c_sync_calc_stream
(
tensor
,
tensor
)
else
:
op_type
=
'c_sync_calc_stream'
helper
=
layer_helper
.
LayerHelper
(
op_type
,
**
locals
())
helper
.
append_op
(
...
...
@@ -248,11 +249,11 @@ def _sync_calc_stream(tensor):
def
_sync_comm_stream
(
tensor
,
ring_id
=
0
):
if
framework
.
_non_static
_mode
():
if
in_dygraph
_mode
():
return
paddle
.
_legacy_C_ops
.
c_sync_comm_stream
(
[
tensor
],
[
tensor
],
'ring_id'
,
ring_id
)
else
:
op_type
=
'c_sync_comm_stream'
helper
=
layer_helper
.
LayerHelper
(
op_type
,
**
locals
())
helper
.
append_op
(
...
...
@@ -336,11 +337,11 @@ def barrier(group=None):
ring_id
=
0
if
group
is
None
else
group
.
id
barrier_tensor
=
paddle
.
full
([
1
],
1
,
dtype
=
"int32"
)
if
framework
.
_non_static
_mode
():
if
in_dygraph
_mode
():
return
paddle
.
_legacy_C_ops
.
barrier
(
barrier_tensor
,
barrier_tensor
,
'ring_id'
,
ring_id
)
else
:
op_type
=
'barrier'
if
not
isinstance
(
ring_id
,
int
):
raise
ValueError
(
"The type of 'group' for barrier must be int."
)
...
...
python/paddle/distributed/communication/recv.py
浏览文件 @
983ae1d7
...
...
@@ -12,9 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import
paddle
import
paddle.distributed.communication.stream
as
stream
import
paddle.fluid.framework
as
framework
def
recv
(
tensor
,
src
=
0
,
group
=
None
,
sync_op
=
True
):
...
...
@@ -48,31 +46,10 @@ def recv(tensor, src=0, group=None, sync_op=True):
print(data)
# [7, 8, 9] (2 GPUs)
"""
if
not
framework
.
_in_legacy_dygraph
():
return
stream
.
recv
(
tensor
,
src
=
src
,
group
=
group
,
sync_op
=
sync_op
,
use_calc_stream
=
False
)
# code below will be removed after we remove the old dygraph
if
group
is
not
None
and
not
group
.
is_member
():
return
use_calc_stream
=
sync_op
gsrc
=
src
if
group
is
None
else
group
.
get_group_rank
(
src
)
ring_id
=
0
if
group
is
None
else
group
.
id
return
paddle
.
_legacy_C_ops
.
recv_v2
(
tensor
,
'use_calc_stream'
,
use_calc_stream
,
'ring_id'
,
ring_id
,
'peer'
,
src
,
'dtype'
,
tensor
.
dtype
,
'out_shape'
,
tensor
.
shape
,
)
def
irecv
(
tensor
,
src
=
None
,
group
=
None
):
"""
...
...
python/paddle/distributed/communication/reduce.py
浏览文件 @
983ae1d7
...
...
@@ -121,8 +121,6 @@ def reduce(tensor, dst, op=ReduceOp.SUM, group=None, sync_op=True):
# [[5, 7, 9], [5, 7, 9]] (2 GPUs, out for rank 0)
# [[1, 2, 3], [1, 2, 3]] (2 GPUs, out for rank 1)
"""
if
not
framework
.
_in_legacy_dygraph
():
return
stream
.
reduce
(
tensor
,
dst
=
dst
,
...
...
python/paddle/distributed/communication/reduce_scatter.py
浏览文件 @
983ae1d7
...
...
@@ -13,7 +13,6 @@
# limitations under the License.
import
paddle.distributed.communication.stream
as
stream
import
paddle.fluid.framework
as
framework
from
paddle.distributed.communication.reduce
import
ReduceOp
from
paddle.distributed.communication.stream.reduce_scatter
import
(
_reduce_scatter_base
as
_reduce_scatter_base_stream
,
...
...
@@ -62,7 +61,6 @@ def reduce_scatter(
# [8, 10] (2 GPUs, out for rank 1)
"""
if
not
framework
.
_in_legacy_dygraph
():
return
stream
.
reduce_scatter
(
tensor
,
tensor_list
,
...
...
@@ -111,7 +109,6 @@ def _reduce_scatter_base(
# [5, 7] (2 GPUs, out for rank 1)
"""
if
not
framework
.
_in_legacy_dygraph
():
return
_reduce_scatter_base_stream
(
output
,
input
,
...
...
python/paddle/distributed/communication/scatter.py
浏览文件 @
983ae1d7
...
...
@@ -12,10 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import
paddle
import
paddle.distributed.communication.stream
as
stream
import
paddle.fluid.framework
as
framework
from
paddle.distributed.communication.group
import
_get_global_group
def
scatter
(
tensor
,
tensor_list
=
None
,
src
=
0
,
group
=
None
,
sync_op
=
True
):
...
...
@@ -61,34 +58,4 @@ def scatter(tensor, tensor_list=None, src=0, group=None, sync_op=True):
# [1, 2, 3] [10, 11, 12] (2 GPUs, out for rank 0)
# [4, 5, 6] [4, 5, 6] (2 GPUs, out for rank 1)
"""
if
not
framework
.
_in_legacy_dygraph
():
return
stream
.
scatter
(
tensor
,
tensor_list
,
src
,
group
,
sync_op
)
# code below will be removed after we remove the old dygraph
if
group
is
not
None
and
not
group
.
is_member
():
return
ring_id
=
0
if
group
is
None
else
group
.
id
gsrc
=
src
if
group
is
None
else
group
.
get_group_rank
(
src
)
rank
=
_get_global_group
().
rank
if
group
is
None
else
group
.
rank
nranks
=
_get_global_group
().
nranks
if
group
is
None
else
group
.
nranks
assert
gsrc
>=
0
,
"src rank out of group, need global rank"
if
rank
!=
gsrc
:
tensor_list
=
[]
for
_
in
range
(
nranks
):
tensor_list
.
append
(
tensor
)
temp
=
paddle
.
concat
(
tensor_list
,
axis
=
0
)
use_calc_stream
=
sync_op
return
framework
.
_legacy_C_ops
.
c_scatter
(
temp
,
tensor
,
'use_calc_stream'
,
use_calc_stream
,
'ring_id'
,
ring_id
,
'nranks'
,
nranks
,
'root'
,
gsrc
,
)
python/paddle/distributed/communication/send.py
浏览文件 @
983ae1d7
...
...
@@ -12,9 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import
paddle
import
paddle.distributed.communication.stream
as
stream
import
paddle.fluid.framework
as
framework
def
send
(
tensor
,
dst
=
0
,
group
=
None
,
sync_op
=
True
):
...
...
@@ -48,29 +46,10 @@ def send(tensor, dst=0, group=None, sync_op=True):
print(data)
# [7, 8, 9] (2 GPUs)
"""
if
not
framework
.
_in_legacy_dygraph
():
return
stream
.
send
(
tensor
,
dst
=
dst
,
group
=
group
,
sync_op
=
sync_op
,
use_calc_stream
=
False
)
# code below will be removed after we remove the old dygraph
if
group
is
not
None
and
not
group
.
is_member
():
return
use_calc_stream
=
sync_op
gdst
=
dst
if
group
is
None
else
group
.
get_group_rank
(
dst
)
assert
gdst
>=
0
,
"dst rank out of group, need global rank"
ring_id
=
0
if
group
is
None
else
group
.
id
return
paddle
.
_legacy_C_ops
.
send_v2
(
tensor
,
'use_calc_stream'
,
use_calc_stream
,
'ring_id'
,
ring_id
,
'peer'
,
gdst
,
)
def
isend
(
tensor
,
dst
,
group
=
None
):
"""
...
...
python/paddle/distributed/fleet/fleet.py
浏览文件 @
983ae1d7
...
...
@@ -18,6 +18,7 @@ import os
import
paddle
from
paddle.fluid
import
compiler
from
paddle.fluid.dygraph
import
parallel_helper
from
paddle.fluid.framework
import
in_dygraph_mode
from
paddle.fluid.ir
import
apply_build_strategy
from
paddle.fluid.wrapped_decorator
import
wrap_decorator
from
paddle.framework
import
_global_flags
...
...
@@ -280,7 +281,7 @@ class Fleet:
"CUDA_VISIBLE_DEVICES shoule be set only 1 card if you use `python` to launch fleet program."
)
if
paddle
.
framework
.
_non_static
_mode
():
if
in_dygraph
_mode
():
if
self
.
worker_num
()
==
1
:
# if worker_num is 1, should construct default topology & hcg
self
.
_topology
=
tp
.
CommunicateTopology
()
...
...
@@ -1255,7 +1256,7 @@ class Fleet:
)
else
:
if
(
paddle
.
framework
.
_non_static
_mode
()
in_dygraph
_mode
()
or
self
.
_role_maker
.
_is_non_distributed
()
or
self
.
_is_collective
):
...
...
@@ -1271,12 +1272,12 @@ class Fleet:
context
[
"user_defined_strategy"
]
=
copy
.
deepcopy
(
self
.
_user_defined_strategy
)
if
paddle
.
framework
.
_non_static
_mode
():
if
in_dygraph
_mode
():
# imitate target optimizer retrieval
target_opt
=
self
.
user_defined_optimizer
self
.
_context
=
context
return
target_opt
.
minimize
(
loss
)
else
:
# cache original feed forward program
self
.
origin_main_program
=
loss
.
block
.
program
# add distributed attr
...
...
@@ -1293,18 +1294,24 @@ class Fleet:
]
=
self
.
_user_defined_strategy
.
sharding_configs
[
"pp_degree"
]
self
.
origin_main_program
.
distributed_info_
[
"sharding_degree"
]
=
self
.
_user_defined_strategy
.
sharding_configs
[
"sharding_degree"
]
]
=
self
.
_user_defined_strategy
.
sharding_configs
[
"sharding_degree"
]
context
[
"origin_main_program"
]
=
self
.
origin_main_program
context
[
"origin_main_programs"
]
=
[
self
.
origin_main_program
]
context
[
"loss"
]
=
loss
if
startup_program
is
None
:
self
.
origin_startup_program
=
(
paddle
.
static
.
default_startup_program
().
clone
(
for_test
=
False
)
paddle
.
static
.
default_startup_program
().
clone
(
for_test
=
False
)
)
startup_program
=
paddle
.
static
.
default_startup_program
()
else
:
self
.
origin_startup_program
=
startup_program
.
clone
(
for_test
=
False
)
self
.
origin_startup_program
=
startup_program
.
clone
(
for_test
=
False
)
context
[
"origin_startup_program"
]
=
startup_program
context
[
"origin_startup_programs"
]
=
[
startup_program
]
...
...
@@ -1327,12 +1334,19 @@ class Fleet:
loss
,
startup_program
,
parameter_list
,
no_grad_set
)
return
optimize_ops
,
params_grads
,
dist_startup_prog
,
dist_main_prog
return
(
optimize_ops
,
params_grads
,
dist_startup_prog
,
dist_main_prog
,
)
context
[
"user_defined_strategy"
]
=
copy
.
deepcopy
(
self
.
_user_defined_strategy
)
copy_user_defined_strategy
=
copy
.
deepcopy
(
self
.
_user_defined_strategy
)
copy_user_defined_strategy
=
copy
.
deepcopy
(
self
.
_user_defined_strategy
)
can_not_apply_optimizer_list
=
[]
# fix set collective and fleet ps gpu error
...
...
@@ -1380,7 +1394,9 @@ class Fleet:
if
copy_user_defined_strategy
.
_is_strict_auto
():
# turn on all the strategy for each optimizer
for
opt
in
distributed_optimizer_list
:
opt
.
_enable_strategy
(
copy_user_defined_strategy
,
context
)
opt
.
_enable_strategy
(
copy_user_defined_strategy
,
context
)
valid_optimizer_list
=
[]
valid_graph_optimizer_list
=
[]
...
...
@@ -1418,11 +1434,14 @@ class Fleet:
context
[
"valid_strategy"
]
=
copy
.
deepcopy
(
valid_strategy
)
logger
.
debug
(
"valid_strategy: "
+
str
(
context
[
"valid_strategy"
]))
logger
.
debug
(
"user_defined_strategy: "
+
str
(
context
[
"user_defined_strategy"
])
"user_defined_strategy: "
+
str
(
context
[
"user_defined_strategy"
])
)
applied_meta_list
=
self
.
strategy_compiler
.
_get_applied_meta_list
()
applied_graph_list
=
self
.
strategy_compiler
.
_get_applied_graph_list
()
applied_graph_list
=
(
self
.
strategy_compiler
.
_get_applied_graph_list
()
)
context
[
'applied_meta_list'
]
=
applied_meta_list
context
[
'applied_graph_list'
]
=
applied_graph_list
...
...
@@ -1435,16 +1454,24 @@ class Fleet:
optimize_ops
=
[]
params_grads
=
[]
if
self
.
_role_maker
.
_is_non_distributed
()
and
not
self
.
_is_collective
:
if
(
self
.
_role_maker
.
_is_non_distributed
()
and
not
self
.
_is_collective
):
if
self
.
_runtime_handle
is
None
:
self
.
_runtime_handle
=
RuntimeFactory
().
_create_runtime
(
context
)
self
.
_runtime_handle
=
RuntimeFactory
().
_create_runtime
(
context
)
compiled_program
=
compiler
.
CompiledProgram
(
self
.
origin_main_program
).
with_data_parallel
(
loss_name
=
loss
.
name
,
share_vars_from
=
None
)
loss
.
block
.
program
.
_graph
=
compiled_program
return
self
.
user_defined_optimizer
.
minimize
(
loss
,
startup_program
,
parameter_list
,
no_grad_set
=
no_grad_set
loss
,
startup_program
,
parameter_list
,
no_grad_set
=
no_grad_set
,
)
if
meta_optimizer
:
...
...
@@ -1452,7 +1479,10 @@ class Fleet:
"before minimize program id: "
+
str
(
id
(
loss
.
block
.
program
))
)
optimize_ops
,
params_grads
=
meta_optimizer
.
minimize
(
loss
,
startup_program
,
parameter_list
,
no_grad_set
=
no_grad_set
loss
,
startup_program
,
parameter_list
,
no_grad_set
=
no_grad_set
,
)
logger
.
debug
(
"after minimize program id: "
+
str
(
id
(
loss
.
block
.
program
))
...
...
@@ -1463,12 +1493,19 @@ class Fleet:
if
id
(
default_program
)
!=
id
(
loss
.
block
.
program
):
paddle
.
framework
.
switch_main_program
(
loss
.
block
.
program
)
logger
.
debug
(
"default program id after switch: "
+
str
(
id
(
default_program
))
"default program id after switch: "
+
str
(
id
(
default_program
))
)
else
:
optimize_ops
,
params_grads
=
self
.
user_defined_optimizer
.
minimize
(
loss
,
startup_program
,
parameter_list
,
no_grad_set
=
no_grad_set
(
optimize_ops
,
params_grads
,
)
=
self
.
user_defined_optimizer
.
minimize
(
loss
,
startup_program
,
parameter_list
,
no_grad_set
=
no_grad_set
,
)
context
[
"program_optimize_ops"
]
=
optimize_ops
...
...
@@ -1480,7 +1517,10 @@ class Fleet:
+
str
(
id
(
loss
.
block
.
program
))
)
optimize_ops
,
params_grads
=
graph_optimizer
.
minimize
(
loss
,
startup_program
,
parameter_list
,
no_grad_set
=
no_grad_set
loss
,
startup_program
,
parameter_list
,
no_grad_set
=
no_grad_set
,
)
# since we do not encourage users to use graph operations
# if a graph optimizer takes effect, mostly
...
...
@@ -1493,7 +1533,9 @@ class Fleet:
if
not
self
.
_role_maker
.
_is_heter_parameter_server_mode
:
program
=
paddle
.
static
.
default_main_program
()
opt_info
=
{}
if
program
.
_fleet_opt
is
None
else
program
.
_fleet_opt
opt_info
=
(
{}
if
program
.
_fleet_opt
is
None
else
program
.
_fleet_opt
)
opt_info
[
"mpi_size"
]
=
self
.
worker_num
()
opt_info
[
"mpi_rank"
]
=
self
.
worker_index
()
for
(
...
...
python/paddle/distributed/fleet/layers/mpu/mp_ops.py
浏览文件 @
983ae1d7
...
...
@@ -18,13 +18,7 @@ from paddle.common_ops_import import dygraph_utils
from
paddle.distributed
import
collective
from
paddle.fluid
import
core
from
paddle.fluid.data_feeder
import
check_dtype
,
check_variable_and_dtype
from
paddle.framework
import
(
LayerHelper
,
_in_legacy_dygraph
,
_varbase_creator
,
in_dygraph_mode
,
in_dynamic_mode
,
)
from
paddle.framework
import
LayerHelper
,
_varbase_creator
,
in_dygraph_mode
from
paddle.nn
import
Layer
from
....communication.reduce
import
ReduceOp
,
_get_reduce_op
...
...
@@ -69,17 +63,7 @@ def _c_identity(tensor, group=None):
return
dy
return
c_identity_eager
.
apply
(
tensor
)
elif
_in_legacy_dygraph
():
return
_legacy_C_ops
.
c_identity
(
tensor
,
'use_calc_stream'
,
True
,
'ring_id'
,
ring_id
,
'use_model_parallel'
,
True
,
)
else
:
op_type
=
'c_identity'
helper
=
LayerHelper
(
op_type
,
**
locals
())
out
=
helper
.
create_variable_for_type_inference
(
dtype
=
tensor
.
dtype
)
...
...
@@ -125,7 +109,7 @@ def _c_concat(tensor, group=None):
rank
=
group
.
rank
nranks
=
group
.
nranks
if
in_dy
namic
_mode
():
if
in_dy
graph
_mode
():
return
_legacy_C_ops
.
c_concat
(
tensor
,
'ring_id'
,
...
...
@@ -139,7 +123,7 @@ def _c_concat(tensor, group=None):
'use_model_parallel'
,
True
,
)
else
:
op_type
=
'c_concat'
helper
=
LayerHelper
(
op_type
,
**
locals
())
out
=
helper
.
create_variable_for_type_inference
(
dtype
=
tensor
.
dtype
)
...
...
@@ -191,7 +175,7 @@ def _c_split(tensor, group=None):
else
group
.
nranks
)
if
in_dy
namic
_mode
():
if
in_dy
graph
_mode
():
return
_legacy_C_ops
.
c_split
(
tensor
,
'use_calc_stream'
,
...
...
@@ -205,7 +189,7 @@ def _c_split(tensor, group=None):
'use_model_parallel'
,
True
,
)
else
:
op_type
=
'c_split'
helper
=
LayerHelper
(
op_type
,
**
locals
())
out
=
helper
.
create_variable_for_type_inference
(
dtype
=
tensor
.
dtype
)
...
...
@@ -286,20 +270,8 @@ def _mp_allreduce(
return
mp_allreduce_eager
.
apply
(
tensor
,
group
,
use_calc_stream
,
use_model_parallel
)
ring_id
=
0
if
group
is
None
else
group
.
id
if
_in_legacy_dygraph
():
if
op
==
ReduceOp
.
SUM
:
return
_legacy_C_ops
.
mp_allreduce_sum_
(
tensor
,
'use_calc_stream'
,
use_calc_stream
,
'ring_id'
,
ring_id
,
)
else
:
raise
ValueError
(
"Unknown parameter: {}."
.
format
(
op
))
ring_id
=
0
if
group
is
None
else
group
.
id
op_type
=
'mp_allreduce_sum'
helper
=
LayerHelper
(
op_type
,
**
locals
())
out
=
helper
.
create_variable_for_type_inference
(
dtype
=
tensor
.
dtype
)
...
...
@@ -337,11 +309,11 @@ def _c_lookup_table(table, index, start_index=0, name=None):
Returns:
Tensor.
"""
if
in_dy
namic
_mode
():
if
in_dy
graph
_mode
():
return
_legacy_C_ops
.
c_embedding
(
table
,
index
,
"start_index"
,
start_index
)
else
:
op_type
=
'c_embedding'
helper
=
LayerHelper
(
op_type
,
**
locals
())
dtype
=
helper
.
input_dtype
(
input_param_name
=
'table'
)
...
...
@@ -426,7 +398,7 @@ def _c_softmax_with_cross_entropy(
if
input_dims
-
1
==
label_dims
:
label
=
paddle
.
unsqueeze
(
label
,
axis
=-
1
)
if
in_dy
namic
_mode
():
if
in_dy
graph
_mode
():
softmax
,
loss
=
_legacy_C_ops
.
c_softmax_with_cross_entropy
(
logits
,
label
,
'ring_id'
,
ring_id
,
'rank'
,
rank
,
'nranks'
,
nranks
)
...
...
@@ -434,7 +406,7 @@ def _c_softmax_with_cross_entropy(
return
loss
else
:
return
loss
,
softmax
else
:
attrs
=
{
'ring_id'
:
ring_id
,
'rank'
:
rank
,
...
...
@@ -460,7 +432,7 @@ def _linear(x, weight, bias=None, name=None):
"""
Fuction Linear
"""
if
in_dy
namic
_mode
():
if
in_dy
graph
_mode
():
pre_bias
=
_varbase_creator
(
dtype
=
x
.
dtype
)
_legacy_C_ops
.
matmul
(
x
,
...
...
@@ -827,7 +799,7 @@ def split(
supported_operations
)
)
if
in_dy
namic
_mode
():
if
in_dy
graph
_mode
():
raise
ValueError
(
"paddle.distributed.split cannot be used in dynamic "
"graph mode, plese use ParallelEmbedding, ParallelRowLinear, "
...
...
python/paddle/distributed/fleet/layers/mpu/random.py
浏览文件 @
983ae1d7
...
...
@@ -20,7 +20,8 @@ import paddle
from
paddle
import
_legacy_C_ops
from
paddle.fluid
import
core
from
paddle.fluid.data_feeder
import
check_variable_and_dtype
from
paddle.framework
import
LayerHelper
,
in_dynamic_mode
from
paddle.fluid.framework
import
in_dygraph_mode
from
paddle.framework
import
LayerHelper
from
paddle.static
import
Variable
__all__
=
[]
...
...
@@ -211,7 +212,7 @@ def dropout(
)
# semantic transfer
# dygraph using tracker, doesn't need determinate seed
if
in_dy
namic
_mode
():
if
in_dy
graph
_mode
():
out
,
mask
=
_legacy_C_ops
.
dropout
(
x
,
'dropout_prob'
,
...
...
@@ -226,7 +227,7 @@ def dropout(
mode
,
)
return
out
else
:
seed
=
determinate_seed
(
rng_name
)
if
isinstance
(
p
,
Variable
)
and
not
p
.
shape
!=
[
1
]:
...
...
python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py
浏览文件 @
983ae1d7
...
...
@@ -19,10 +19,10 @@ from .meta_optimizer_base import MetaOptimizerBase
__all__
=
[]
import
paddle
from
paddle
import
framework
from
paddle.common_ops_import
import
LayerHelper
from
paddle.fluid.clip
import
GradientClipByNorm
,
append_gradient_clip_ops
from
paddle.fluid.dygraph
import
base
as
imperative_base
from
paddle.fluid.framework
import
in_dygraph_mode
from
paddle.fluid.optimizer
import
Momentum
,
Optimizer
from
paddle.framework
import
core
from
paddle.static
import
create_global_var
...
...
@@ -46,7 +46,7 @@ class DGCMomentumOptimizer(Optimizer):
grad_clip
=
None
,
name
=
None
,
):
if
framework
.
_non_static
_mode
():
if
in_dygraph
_mode
():
raise
Exception
(
"In dygraph, don't support DGCMomentumOptimizer."
)
assert
(
...
...
python/paddle/distributed/fleet/meta_parallel/pp_utils/p2p_communication.py
浏览文件 @
983ae1d7
...
...
@@ -16,8 +16,7 @@ import numpy as np
import
paddle
import
paddle.fluid.core
as
core
from
paddle
import
_legacy_C_ops
from
paddle.fluid.framework
import
_in_legacy_dygraph
,
in_dygraph_mode
from
paddle.fluid.framework
import
in_dygraph_mode
from
...utils.log_util
import
logger
from
.utils
import
number_2_dtype
,
paddle_2_number
...
...
@@ -189,21 +188,7 @@ def _partial_send_op(
tensor
,
group
,
use_calc_stream
,
ring_id
,
dst
,
nranks
,
rank_id
):
dst_rank_in_group
=
dst
if
group
is
None
else
group
.
get_group_rank
(
dst
)
if
_in_legacy_dygraph
():
return
_legacy_C_ops
.
partial_send
(
tensor
.
detach
(),
'use_calc_stream'
,
use_calc_stream
,
'ring_id'
,
ring_id
,
'peer'
,
dst_rank_in_group
,
'num'
,
nranks
,
'id'
,
rank_id
,
)
elif
in_dygraph_mode
():
if
in_dygraph_mode
():
group
=
(
paddle
.
distributed
.
collective
.
_get_default_group
()
if
group
is
None
...
...
@@ -234,11 +219,6 @@ def send_partial(
tensor
,
group
,
use_calc_stream
,
ring_id
,
dst_rank
,
nranks
,
rank_id
)
else
:
if
_in_legacy_dygraph
():
send_op
=
lambda
x
,
dst
,
group
:
paddle
.
distributed
.
send
(
x
,
dst
,
group
,
use_calc_stream
)
elif
in_dygraph_mode
():
send_op
=
paddle
.
distributed
.
isend
return
send_op
(
tensor
.
detach
(),
dst
=
dst_rank
,
group
=
group
)
...
...
@@ -247,26 +227,6 @@ def _partial_recv_op(
tensor
,
group
,
use_calc_stream
,
ring_id
,
src
,
nranks
,
rank_id
):
src_rank_in_group
=
src
if
group
is
None
else
group
.
get_group_rank
(
src
)
if
_in_legacy_dygraph
():
assert
use_calc_stream
return
_legacy_C_ops
.
partial_recv
(
tensor
.
detach
(),
'use_calc_stream'
,
use_calc_stream
,
'ring_id'
,
ring_id
,
'peer'
,
src_rank_in_group
,
'num'
,
nranks
,
'id'
,
rank_id
,
'dtype'
,
tensor
.
dtype
,
'out_shape'
,
tensor
.
shape
,
)
elif
in_dygraph_mode
():
group
=
(
paddle
.
distributed
.
collective
.
_get_default_group
()
if
group
is
None
...
...
@@ -297,7 +257,7 @@ def recv_partial(
tensor
,
group
,
use_calc_stream
,
ring_id
,
src_rank
,
nranks
,
rank_id
)
else
:
if
_in_legacy_dygraph
()
or
use_calc_stream
:
if
use_calc_stream
:
recv_op
=
paddle
.
distributed
.
recv
elif
in_dygraph_mode
():
recv_op
=
paddle
.
distributed
.
irecv
...
...
@@ -307,19 +267,6 @@ def recv_partial(
def
_partial_allgather_op
(
tensor
,
group
,
use_calc_stream
,
ring_id
,
nranks
,
rank_id
):
if
_in_legacy_dygraph
():
return
_legacy_C_ops
.
partial_allgather_
(
tensor
.
detach
(),
'use_calc_stream'
,
use_calc_stream
,
'ring_id'
,
ring_id
,
'nranks'
,
nranks
,
'rank'
,
rank_id
,
)
elif
in_dygraph_mode
():
group
=
(
paddle
.
distributed
.
collective
.
_get_default_group
()
if
group
is
None
...
...
python/paddle/distributed/fleet/optimizer.py
浏览文件 @
983ae1d7
...
...
@@ -14,8 +14,8 @@
import
copy
import
paddle
from
paddle.distributed
import
fleet
from
paddle.fluid.framework
import
in_dygraph_mode
from
.meta_optimizers
import
HeterParallelOptimizer
,
HybridParallelOptimizer
from
.utils.log_util
import
logger
...
...
@@ -74,7 +74,7 @@ def _dygraph_distributed_optimizer(optimizer, strategy=None):
def
distributed_optimizer
(
*
args
,
**
kwargs
):
if
paddle
.
framework
.
_non_static
_mode
():
if
in_dygraph
_mode
():
return
_dygraph_distributed_optimizer
(
*
args
,
**
kwargs
)
else
:
return
fleet
.
fleet
.
distributed_optimizer
(
*
args
,
**
kwargs
)
python/paddle/distributed/fleet/utils/hybrid_parallel_inference.py
浏览文件 @
983ae1d7
...
...
@@ -20,7 +20,8 @@ import paddle.distributed.fleet as fleet
# (TODO: GhostScreaming) It will be removed later.
import
paddle.fluid.core
as
core
from
paddle.framework
import
Block
,
Program
,
_non_static_mode
from
paddle.fluid.framework
import
in_dygraph_mode
from
paddle.framework
import
Block
,
Program
class
HybridParallelInferenceHelper
:
...
...
@@ -205,7 +206,7 @@ class HybridParallelInferenceHelper:
elif
core
.
is_compiled_with_cuda
():
self
.
_device
=
"gpu"
assert
self
.
_device
,
"Only gpu and npu are supported."
assert
not
_non_static
_mode
(),
"Only static mode is supported."
assert
not
in_dygraph
_mode
(),
"Only static mode is supported."
op_maker
=
core
.
op_proto_and_checker_maker
self
.
_op_role
=
op_maker
.
OpRole
...
...
python/paddle/distributed/fleet/utils/hybrid_parallel_util.py
浏览文件 @
983ae1d7
...
...
@@ -18,7 +18,6 @@ from paddle import framework
# (TODO: GhostScreaming) It will be removed later.
from
paddle.fluid
import
core
from
paddle.framework
import
(
_in_legacy_dygraph
,
_split_tensors
,
build_groups
,
in_dygraph_mode
,
...
...
@@ -215,7 +214,6 @@ def sharding_reduce_gradients(parameter_list, hcg):
sharding_nrank
=
hcg
.
get_sharding_parallel_group
().
nranks
for
param
in
parameter_list
:
if
param
.
trainable
and
(
param
.
_grad_ivar
()
is
not
None
):
if
in_dygraph_mode
():
param
.
grad
.
scale_
(
1.0
/
sharding_nrank
)
paddle
.
distributed
.
all_reduce
(
param
.
grad
,
...
...
@@ -223,32 +221,6 @@ def sharding_reduce_gradients(parameter_list, hcg):
sync_op
=
True
,
)
elif
_in_legacy_dygraph
():
g_var
=
param
.
_grad_ivar
()
# need use trace_op to allreduce
# paddle.distributed.all_reduce(
# g_var, group=hcg.get_sharding_parallel_group(), use_calc_stream=True)
paddle
.
fluid
.
framework
.
_dygraph_tracer
().
trace_op
(
type
=
"c_allreduce_sum"
,
inputs
=
{
'X'
:
g_var
},
outputs
=
{
'Out'
:
g_var
},
attrs
=
{
'ring_id'
:
hcg
.
get_sharding_parallel_group
().
id
,
'use_calc_stream'
:
True
,
},
)
# grad / sharding_rank
div_factor
=
paddle
.
to_tensor
(
sharding_nrank
,
dtype
=
g_var
.
dtype
)
paddle
.
fluid
.
framework
.
_dygraph_tracer
().
trace_op
(
type
=
"elementwise_div"
,
inputs
=
{
'X'
:
g_var
,
'Y'
:
div_factor
},
outputs
=
{
'Out'
:
g_var
},
attrs
=
{
'axis'
:
-
1
},
)
def
broadcast_sharding_parameters
(
model
,
hcg
):
# TODO TO save memory, use un-fused broadcast to avoid potentional OOM
...
...
python/paddle/distributed/models/moe/utils.py
浏览文件 @
983ae1d7
...
...
@@ -13,9 +13,8 @@
# limitations under the License.
from
paddle
import
_legacy_C_ops
from
paddle.fluid
import
core
from
paddle.fluid.data_feeder
import
check_variable_and_dtype
from
paddle.fluid.framework
import
_in_legacy_dygraph
,
in_dygraph_mode
from
paddle.fluid.framework
import
in_dygraph_mode
from
paddle.fluid.layer_helper
import
LayerHelper
...
...
@@ -43,8 +42,6 @@ def _number_count(numbers, upper_range):
"""
if
in_dygraph_mode
():
return
_legacy_C_ops
.
number_count
(
numbers
,
'upper_range'
,
upper_range
)
elif
_in_legacy_dygraph
():
return
core
.
ops
.
number_count
(
numbers
,
'upper_range'
,
upper_range
)
else
:
op_type
=
'number_count'
...
...
@@ -92,8 +89,6 @@ def _assign_pos(x, cum_count):
"""
if
in_dygraph_mode
():
return
_legacy_C_ops
.
assign_pos
(
x
,
cum_count
,
cum_count
[
-
1
])
elif
_in_legacy_dygraph
():
return
core
.
ops
.
assign_pos
(
x
,
cum_count
,
cum_count
[
-
1
])
else
:
op_type
=
'assign_pos'
...
...
@@ -129,8 +124,6 @@ def _random_routing(topk_idx, topk_value, prob, topk=2):
if
topk
==
2
:
if
in_dygraph_mode
():
return
_legacy_C_ops
.
random_routing
(
prob
,
topk_value
,
topk_idx
)
elif
_in_legacy_dygraph
():
return
core
.
ops
.
random_routing
(
prob
,
topk_value
,
topk_idx
)
else
:
raise
RuntimeError
(
"Not supporting static mode now"
)
else
:
...
...
@@ -162,10 +155,6 @@ def _limit_by_capacity(expert_count, capacity, n_worker):
return
_legacy_C_ops
.
limit_by_capacity
(
expert_count
,
capacity
,
'n_worker'
,
n_worker
)
elif
_in_legacy_dygraph
():
return
core
.
ops
.
limit_by_capacity
(
expert_count
,
capacity
,
'n_worker'
,
n_worker
)
else
:
op_type
=
'limit_by_capacity'
...
...
@@ -211,10 +200,7 @@ def _prune_gate_by_capacity(gate_idx, expert_count, n_expert, n_worker):
return
_legacy_C_ops
.
prune_gate_by_capacity
(
gate_idx
,
expert_count
,
"n_expert"
,
n_expert
,
"n_worker"
,
n_worker
)
elif
_in_legacy_dygraph
():
return
core
.
ops
.
prune_gate_by_capacity
(
gate_idx
,
expert_count
,
"n_expert"
,
n_expert
,
"n_worker"
,
n_worker
)
else
:
check_variable_and_dtype
(
gate_idx
,
'GateIdx'
,
...
...
python/paddle/distributed/utils/moe_utils.py
浏览文件 @
983ae1d7
...
...
@@ -14,7 +14,7 @@
from
paddle
import
_legacy_C_ops
from
paddle.fluid.data_feeder
import
check_variable_and_dtype
from
paddle.fluid.framework
import
_non_static
_mode
from
paddle.fluid.framework
import
in_dygraph
_mode
from
paddle.fluid.layer_helper
import
LayerHelper
...
...
@@ -103,7 +103,7 @@ def global_scatter(
return
ring_id
=
0
if
group
is
None
else
group
.
id
if
_non_static
_mode
():
if
in_dygraph
_mode
():
return
_legacy_C_ops
.
global_scatter
(
x
,
local_count
,
...
...
@@ -220,7 +220,7 @@ def global_gather(
return
ring_id
=
0
if
group
is
None
else
group
.
id
if
_non_static
_mode
():
if
in_dygraph
_mode
():
return
_legacy_C_ops
.
global_gather
(
x
,
local_count
,
...
...
python/paddle/distribution/dirichlet.py
浏览文件 @
983ae1d7
...
...
@@ -15,7 +15,7 @@
import
paddle
from
paddle.distribution
import
exponential_family
from
paddle.fluid.data_feeder
import
check_variable_and_dtype
from
paddle.fluid.framework
import
_in_legacy_dygraph
,
in_dygraph_mode
from
paddle.fluid.framework
import
in_dygraph_mode
from
paddle.fluid.layer_helper
import
LayerHelper
...
...
@@ -166,8 +166,6 @@ def _dirichlet(concentration, name=None):
if
in_dygraph_mode
():
return
paddle
.
_C_ops
.
dirichlet
(
concentration
)
elif
_in_legacy_dygraph
():
return
paddle
.
_legacy_C_ops
.
dirichlet
(
concentration
)
else
:
helper
=
LayerHelper
(
op_type
,
**
locals
())
out
=
helper
.
create_variable_for_type_inference
(
...
...
python/paddle/distribution/distribution.py
浏览文件 @
983ae1d7
...
...
@@ -24,13 +24,9 @@ import warnings
import
numpy
as
np
import
paddle
from
paddle
import
_C_ops
,
_legacy_C_ops
from
paddle
import
_C_ops
from
paddle.fluid.data_feeder
import
check_variable_and_dtype
,
convert_dtype
from
paddle.fluid.framework
import
(
_in_legacy_dygraph
,
_non_static_mode
,
in_dygraph_mode
,
)
from
paddle.fluid.framework
import
in_dygraph_mode
from
paddle.fluid.layers
import
tensor
...
...
@@ -221,7 +217,7 @@ class Distribution:
Returns:
value (Tensor): Change value's dtype if value's dtype is different from param.
"""
if
_non_static
_mode
():
if
in_dygraph
_mode
():
if
value
.
dtype
!=
param
.
dtype
and
convert_dtype
(
value
.
dtype
)
in
[
'float32'
,
'float64'
,
...
...
@@ -229,12 +225,7 @@ class Distribution:
warnings
.
warn
(
"dtype of input 'value' needs to be the same as parameters of distribution class. dtype of 'value' will be converted."
)
if
in_dygraph_mode
():
return
_C_ops
.
cast
(
value
,
param
.
dtype
)
if
_in_legacy_dygraph
():
return
_legacy_C_ops
.
cast
(
value
,
'in_dtype'
,
value
.
dtype
,
'out_dtype'
,
param
.
dtype
)
return
value
check_variable_and_dtype
(
...
...
python/paddle/distribution/uniform.py
浏览文件 @
983ae1d7
...
...
@@ -15,14 +15,10 @@
import
numpy
as
np
import
paddle
from
paddle
import
_C_ops
,
_legacy_C_ops
from
paddle
import
_C_ops
from
paddle.distribution
import
distribution
from
paddle.fluid.data_feeder
import
check_type
,
convert_dtype
from
paddle.fluid.framework
import
(
_in_legacy_dygraph
,
_non_static_mode
,
in_dygraph_mode
,
)
from
paddle.fluid.framework
import
_non_static_mode
,
in_dygraph_mode
from
paddle.fluid.layers
import
tensor
from
paddle.tensor
import
random
...
...
@@ -210,25 +206,15 @@ class Uniform(distribution.Distribution):
"""
value
=
self
.
_check_values_dtype_in_probs
(
self
.
low
,
value
)
if
_non_static
_mode
():
if
in_dygraph
_mode
():
# ensure value in [low, high]
lb_bool
=
self
.
low
<
value
ub_bool
=
value
<
self
.
high
if
in_dygraph_mode
():
lb
=
_C_ops
.
cast
(
lb_bool
,
value
.
dtype
)
ub
=
_C_ops
.
cast
(
ub_bool
,
value
.
dtype
)
return
paddle
.
log
(
lb
*
ub
)
-
paddle
.
log
(
self
.
high
-
self
.
low
)
if
_in_legacy_dygraph
():
lb
=
_legacy_C_ops
.
cast
(
lb_bool
,
'in_dtype'
,
lb_bool
.
dtype
,
'out_dtype'
,
value
.
dtype
)
ub
=
_legacy_C_ops
.
cast
(
ub_bool
,
'in_dtype'
,
ub_bool
.
dtype
,
'out_dtype'
,
value
.
dtype
)
return
paddle
.
log
(
lb
*
ub
)
-
paddle
.
log
(
self
.
high
-
self
.
low
)
else
:
name
=
self
.
name
+
'_log_prob'
lb_bool
=
self
.
low
<
value
ub_bool
=
value
<
self
.
high
...
...
@@ -249,24 +235,13 @@ class Uniform(distribution.Distribution):
"""
value
=
self
.
_check_values_dtype_in_probs
(
self
.
low
,
value
)
if
_non_static
_mode
():
if
in_dygraph
_mode
():
lb_bool
=
self
.
low
<
value
ub_bool
=
value
<
self
.
high
if
in_dygraph_mode
():
lb
=
_C_ops
.
cast
(
lb_bool
,
value
.
dtype
)
ub
=
_C_ops
.
cast
(
ub_bool
,
value
.
dtype
)
return
(
lb
*
ub
)
/
(
self
.
high
-
self
.
low
)
if
_in_legacy_dygraph
():
lb
=
_legacy_C_ops
.
cast
(
lb_bool
,
'in_dtype'
,
lb_bool
.
dtype
,
'out_dtype'
,
value
.
dtype
)
ub
=
_legacy_C_ops
.
cast
(
ub_bool
,
'in_dtype'
,
ub_bool
.
dtype
,
'out_dtype'
,
value
.
dtype
)
return
(
lb
*
ub
)
/
(
self
.
high
-
self
.
low
)
else
:
name
=
self
.
name
+
'_probs'
lb_bool
=
self
.
low
<
value
ub_bool
=
value
<
self
.
high
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录