Unverified · Commit 983ae1d7 · authored by wanghuancoder · committed by GitHub

delete legacy dygraph code in python/paddle/distributed (#49304)

* delete legacy dygraph code in python/paddle/distributed

* refine
Parent 91cdd295
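All of the hunks below apply one migration: the old dynamic-graph checks (`_non_static_mode()`, `_in_legacy_dygraph()`) and their `_legacy_C_ops` fallback branches are collapsed into a single `in_dygraph_mode()` test, with the static-graph path kept as the remaining branch. A minimal sketch of the resulting dispatch pattern, assuming a hypothetical helper name (`_make_flag_tensor` is not part of this diff) and using only imports that appear in the changed files:

# Sketch of the post-migration dispatch pattern; not code from this commit.
import paddle
from paddle.fluid.framework import in_dygraph_mode
from paddle.fluid.layers.tensor import fill_constant

def _make_flag_tensor():  # hypothetical helper, for illustration only
    if in_dygraph_mode():
        # dynamic graph: create an eager tensor directly
        return paddle.to_tensor([1], dtype="int32")
    # static graph: append a fill_constant op to the current program
    return fill_constant([0], dtype="int32", value="1")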
......@@ -31,7 +31,7 @@ from paddle.fluid.dygraph.parallel import ParallelEnv
from paddle.fluid.executor import _to_name_str, global_scope
from paddle.fluid.framework import Operator
from paddle.fluid.framework import _current_expected_place as _get_device
from paddle.fluid.framework import _non_static_mode
from paddle.fluid.framework import in_dygraph_mode
from paddle.fluid.layers.utils import flatten
from paddle.metric import Metric
from paddle.static import InputSpec
......@@ -300,7 +300,7 @@ class Engine:
return inputs_spec, labels_spec
def _prepare_data_tensor(self, inputs_spec, labels_spec, inputs, labels):
if _non_static_mode() or self._dygraph_mode:
if in_dygraph_mode() or self._dygraph_mode:
raise ValueError("Only support static graph mode.")
if inputs_spec:
......@@ -512,7 +512,7 @@ class Engine:
self._has_prepared[mode] = True
def _build(self, mode):
if _non_static_mode() or self._dygraph_mode:
if in_dygraph_mode() or self._dygraph_mode:
paddle.disable_static()
self._dygraph_mode = True
self._logger.info("Building model with 'to_static' method.")
......@@ -1713,7 +1713,7 @@ class Engine:
self._build(mode)
self._plan(mode)
else:
if _non_static_mode() or self._dygraph_mode:
if in_dygraph_mode() or self._dygraph_mode:
raise ValueError(
"Please call `prepare()` or `fit()` or `evaluate()` or `predict()` before calling `cost()`."
)
......
......@@ -17,8 +17,8 @@ from collections import OrderedDict
import paddle
import paddle.fluid.core as core
from paddle import _legacy_C_ops
from paddle.fluid.framework import in_dygraph_mode
from ...fluid.framework import _non_static_mode
from ...fluid.layers.tensor import fill_constant
from ..collective import _get_global_env, _new_ring_id
......@@ -154,7 +154,7 @@ class ProcessGroup:
)
tmp = (
paddle.to_tensor([1], dtype="int32")
if _non_static_mode()
if in_dygraph_mode()
else fill_constant([0], dtype="int32", value="1")
)
# use legacy ops
......
......@@ -18,7 +18,7 @@ import paddle
# (TODO: GhostScreaming) It will be removed later.
import paddle.fluid.core as core
from paddle.framework import _non_static_mode, in_dygraph_mode
from paddle.framework import in_dygraph_mode
from .communication.group import Group, _add_new_group, is_initialized
from .fleet.layers.mpu.mp_ops import _c_concat # noqa: F401
......@@ -301,7 +301,7 @@ def new_group(ranks=None, backend=None, timeout=_default_timeout):
# hang caused by cross-creation of new_group
tmp = (
paddle.to_tensor([1], dtype="int32")
if _non_static_mode()
if in_dygraph_mode()
else paddle.full([0], 1, dtype="int32")
)
paddle.distributed.all_reduce(tmp, sync_op=True)
......
......@@ -18,7 +18,6 @@ import pickle
import numpy as np
import paddle
import paddle.distributed as dist
import paddle.distributed.communication.stream as stream
import paddle.fluid.framework as framework
......@@ -64,39 +63,8 @@ def all_gather(tensor_list, tensor, group=None, sync_op=True):
print(tensor_list)
# [[[4, 5, 6], [4, 5, 6]], [[1, 2, 3], [1, 2, 3]]] (2 GPUs)
"""
if not framework._in_legacy_dygraph():
return stream.all_gather(tensor_list, tensor, group, sync_op)
# NOTE: uncomment code below when having fully complex support
# def convert_to_complex(list_of_tensor):
# list_of_complex = []
# for tensor in list_of_tensor:
# list_of_complex.append(paddle.as_complex(tensor))
# return list_of_complex
# is_input_complex = (tensor.dtype == paddle.complex64
# or tensor.dtype == paddle.complex128)
# if is_input_complex:
# tensor = paddle.as_real(tensor)
# code below will be removed after we remove the old dygraph
if group is not None and not group.is_member():
return
ring_id = 0 if group is None else group.id
nranks = dist.get_world_size()
out = paddle._legacy_C_ops.c_allgather(
tensor,
'use_calc_stream',
sync_op,
'ring_id',
ring_id,
'nranks',
nranks,
)
tensor_list.clear()
tensor_list.extend(paddle.split(out, nranks, 0))
def _convert_object_to_tensor(obj):
_pickler = pickle.Pickler
......
......@@ -12,9 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.distributed.communication.stream as stream
import paddle.fluid.framework as framework
from paddle.distributed.communication.reduce import ReduceOp
......@@ -57,31 +55,6 @@ def all_reduce(tensor, op=ReduceOp.SUM, group=None, sync_op=True):
print(data)
# [[5, 7, 9], [5, 7, 9]] (2 GPUs)
"""
if not framework._in_legacy_dygraph():
return stream.all_reduce(
tensor, op=op, group=group, sync_op=sync_op, use_calc_stream=False
)
# code below will be removed after we remove the old dygraph
if group is not None and not group.is_member():
return
use_calc_stream = sync_op
ring_id = 0 if group is None else group.id
if op == ReduceOp.SUM:
return paddle._legacy_C_ops.c_allreduce_sum_(
tensor, 'use_calc_stream', use_calc_stream, 'ring_id', ring_id
)
elif op == ReduceOp.MAX:
return paddle._legacy_C_ops.c_allreduce_max_(
tensor, 'use_calc_stream', use_calc_stream, 'ring_id', ring_id
)
elif op == ReduceOp.MIN:
return paddle._legacy_C_ops.c_allreduce_min_(
tensor, 'use_calc_stream', use_calc_stream, 'ring_id', ring_id
)
elif op == ReduceOp.PROD:
return paddle._legacy_C_ops.c_allreduce_prod_(
tensor, 'use_calc_stream', use_calc_stream, 'ring_id', ring_id
)
else:
raise ValueError("Unknown parameter: {}.".format(op))
......@@ -12,9 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.distributed.communication.stream as stream
import paddle.fluid.framework as framework
def alltoall(in_tensor_list, out_tensor_list, group=None, sync_op=True):
......@@ -59,23 +57,10 @@ def alltoall(in_tensor_list, out_tensor_list, group=None, sync_op=True):
# [[[1, 2, 3], [4, 5, 6]], [[13, 14, 15], [16, 17, 18]]] (2 GPUs, out for rank 0)
# [[[7, 8, 9], [10, 11, 12]], [[19, 20, 21], [22, 23, 24]]] (2 GPUs, out for rank 1)
"""
if not framework._in_legacy_dygraph():
return stream.alltoall(
out_tensor_list, in_tensor_list, group, sync_op, False
)
# code below will be removed after we remove the old dygraph
if group is not None and not group.is_member():
return
ring_id = 0 if group is None else group.id
temp = paddle.concat(in_tensor_list, axis=0)
nranks = len(in_tensor_list)
use_calc_stream = sync_op
out = paddle._legacy_C_ops.alltoall(
temp, 'use_calc_stream', use_calc_stream, 'ring_id', ring_id
)
out_tensor_list.extend(paddle.split(out, nranks, 0))
def alltoall_single(
in_tensor,
......@@ -149,7 +134,6 @@ def alltoall_single(
# output for rank 1: [[0., 0.], [0., 0.], [1., 1.], [1., 1.]]
"""
if not framework._in_legacy_dygraph():
return stream.alltoall_single(
out_tensor,
in_tensor,
......
......@@ -12,9 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.distributed.communication.stream as stream
import paddle.fluid.framework as framework
def broadcast(tensor, src, group=None, sync_op=True):
......@@ -55,7 +53,6 @@ def broadcast(tensor, src, group=None, sync_op=True):
print(data)
# [[1, 2, 3], [1, 2, 3]] (2 GPUs)
"""
if not framework._in_legacy_dygraph():
return stream.broadcast(
tensor,
src,
......@@ -63,23 +60,3 @@ def broadcast(tensor, src, group=None, sync_op=True):
sync_op=sync_op,
use_calc_stream=False,
)
# code below will be removed after we remove the old dygraph
if group is not None and not group.is_member():
return
use_calc_stream = sync_op
ring_id = 0 if group is None else group.id
gsrc = src if group is None else group.get_group_rank(src)
assert gsrc >= 0, "src rank out of group, need global rank"
return paddle._legacy_C_ops.c_broadcast(
tensor,
tensor,
'root',
gsrc,
'use_calc_stream',
use_calc_stream,
'ring_id',
ring_id,
)
......@@ -19,6 +19,7 @@ import paddle.distributed as dist
import paddle.fluid.core as core
import paddle.fluid.framework as framework
import paddle.fluid.layer_helper as layer_helper
from paddle.fluid.framework import in_dygraph_mode
class Group:
......@@ -235,9 +236,9 @@ def get_group(id=0):
def _sync_calc_stream(tensor):
if framework._non_static_mode():
if in_dygraph_mode():
return paddle._legacy_C_ops.c_sync_calc_stream(tensor, tensor)
else:
op_type = 'c_sync_calc_stream'
helper = layer_helper.LayerHelper(op_type, **locals())
helper.append_op(
......@@ -248,11 +249,11 @@ def _sync_calc_stream(tensor):
def _sync_comm_stream(tensor, ring_id=0):
if framework._non_static_mode():
if in_dygraph_mode():
return paddle._legacy_C_ops.c_sync_comm_stream(
[tensor], [tensor], 'ring_id', ring_id
)
else:
op_type = 'c_sync_comm_stream'
helper = layer_helper.LayerHelper(op_type, **locals())
helper.append_op(
......@@ -336,11 +337,11 @@ def barrier(group=None):
ring_id = 0 if group is None else group.id
barrier_tensor = paddle.full([1], 1, dtype="int32")
if framework._non_static_mode():
if in_dygraph_mode():
return paddle._legacy_C_ops.barrier(
barrier_tensor, barrier_tensor, 'ring_id', ring_id
)
else:
op_type = 'barrier'
if not isinstance(ring_id, int):
raise ValueError("The type of 'group' for barrier must be int.")
......
......@@ -12,9 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.distributed.communication.stream as stream
import paddle.fluid.framework as framework
def recv(tensor, src=0, group=None, sync_op=True):
......@@ -48,31 +46,10 @@ def recv(tensor, src=0, group=None, sync_op=True):
print(data)
# [7, 8, 9] (2 GPUs)
"""
if not framework._in_legacy_dygraph():
return stream.recv(
tensor, src=src, group=group, sync_op=sync_op, use_calc_stream=False
)
# code below will be removed after we remove the old dygraph
if group is not None and not group.is_member():
return
use_calc_stream = sync_op
gsrc = src if group is None else group.get_group_rank(src)
ring_id = 0 if group is None else group.id
return paddle._legacy_C_ops.recv_v2(
tensor,
'use_calc_stream',
use_calc_stream,
'ring_id',
ring_id,
'peer',
src,
'dtype',
tensor.dtype,
'out_shape',
tensor.shape,
)
def irecv(tensor, src=None, group=None):
"""
......
......@@ -121,8 +121,6 @@ def reduce(tensor, dst, op=ReduceOp.SUM, group=None, sync_op=True):
# [[5, 7, 9], [5, 7, 9]] (2 GPUs, out for rank 0)
# [[1, 2, 3], [1, 2, 3]] (2 GPUs, out for rank 1)
"""
if not framework._in_legacy_dygraph():
return stream.reduce(
tensor,
dst=dst,
......
......@@ -13,7 +13,6 @@
# limitations under the License.
import paddle.distributed.communication.stream as stream
import paddle.fluid.framework as framework
from paddle.distributed.communication.reduce import ReduceOp
from paddle.distributed.communication.stream.reduce_scatter import (
_reduce_scatter_base as _reduce_scatter_base_stream,
......@@ -62,7 +61,6 @@ def reduce_scatter(
# [8, 10] (2 GPUs, out for rank 1)
"""
if not framework._in_legacy_dygraph():
return stream.reduce_scatter(
tensor,
tensor_list,
......@@ -111,7 +109,6 @@ def _reduce_scatter_base(
# [5, 7] (2 GPUs, out for rank 1)
"""
if not framework._in_legacy_dygraph():
return _reduce_scatter_base_stream(
output,
input,
......
......@@ -12,10 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.distributed.communication.stream as stream
import paddle.fluid.framework as framework
from paddle.distributed.communication.group import _get_global_group
def scatter(tensor, tensor_list=None, src=0, group=None, sync_op=True):
......@@ -61,34 +58,4 @@ def scatter(tensor, tensor_list=None, src=0, group=None, sync_op=True):
# [1, 2, 3] [10, 11, 12] (2 GPUs, out for rank 0)
# [4, 5, 6] [4, 5, 6] (2 GPUs, out for rank 1)
"""
if not framework._in_legacy_dygraph():
return stream.scatter(tensor, tensor_list, src, group, sync_op)
# code below will be removed after we remove the old dygraph
if group is not None and not group.is_member():
return
ring_id = 0 if group is None else group.id
gsrc = src if group is None else group.get_group_rank(src)
rank = _get_global_group().rank if group is None else group.rank
nranks = _get_global_group().nranks if group is None else group.nranks
assert gsrc >= 0, "src rank out of group, need global rank"
if rank != gsrc:
tensor_list = []
for _ in range(nranks):
tensor_list.append(tensor)
temp = paddle.concat(tensor_list, axis=0)
use_calc_stream = sync_op
return framework._legacy_C_ops.c_scatter(
temp,
tensor,
'use_calc_stream',
use_calc_stream,
'ring_id',
ring_id,
'nranks',
nranks,
'root',
gsrc,
)
......@@ -12,9 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.distributed.communication.stream as stream
import paddle.fluid.framework as framework
def send(tensor, dst=0, group=None, sync_op=True):
......@@ -48,29 +46,10 @@ def send(tensor, dst=0, group=None, sync_op=True):
print(data)
# [7, 8, 9] (2 GPUs)
"""
if not framework._in_legacy_dygraph():
return stream.send(
tensor, dst=dst, group=group, sync_op=sync_op, use_calc_stream=False
)
# code below will be removed after we remove the old dygraph
if group is not None and not group.is_member():
return
use_calc_stream = sync_op
gdst = dst if group is None else group.get_group_rank(dst)
assert gdst >= 0, "dst rank out of group, need global rank"
ring_id = 0 if group is None else group.id
return paddle._legacy_C_ops.send_v2(
tensor,
'use_calc_stream',
use_calc_stream,
'ring_id',
ring_id,
'peer',
gdst,
)
def isend(tensor, dst, group=None):
"""
......
......@@ -18,6 +18,7 @@ import os
import paddle
from paddle.fluid import compiler
from paddle.fluid.dygraph import parallel_helper
from paddle.fluid.framework import in_dygraph_mode
from paddle.fluid.ir import apply_build_strategy
from paddle.fluid.wrapped_decorator import wrap_decorator
from paddle.framework import _global_flags
......@@ -280,7 +281,7 @@ class Fleet:
"CUDA_VISIBLE_DEVICES shoule be set only 1 card if you use `python` to launch fleet program."
)
if paddle.framework._non_static_mode():
if in_dygraph_mode():
if self.worker_num() == 1:
# if worker_num is 1, should construct default topology & hcg
self._topology = tp.CommunicateTopology()
......@@ -1255,7 +1256,7 @@ class Fleet:
)
else:
if (
paddle.framework._non_static_mode()
in_dygraph_mode()
or self._role_maker._is_non_distributed()
or self._is_collective
):
......@@ -1271,12 +1272,12 @@ class Fleet:
context["user_defined_strategy"] = copy.deepcopy(
self._user_defined_strategy
)
if paddle.framework._non_static_mode():
if in_dygraph_mode():
# imitate target optimizer retrieval
target_opt = self.user_defined_optimizer
self._context = context
return target_opt.minimize(loss)
else:
# cache original feed forward program
self.origin_main_program = loss.block.program
# add distributed attr
......@@ -1293,18 +1294,24 @@ class Fleet:
] = self._user_defined_strategy.sharding_configs["pp_degree"]
self.origin_main_program.distributed_info_[
"sharding_degree"
] = self._user_defined_strategy.sharding_configs["sharding_degree"]
] = self._user_defined_strategy.sharding_configs[
"sharding_degree"
]
context["origin_main_program"] = self.origin_main_program
context["origin_main_programs"] = [self.origin_main_program]
context["loss"] = loss
if startup_program is None:
self.origin_startup_program = (
paddle.static.default_startup_program().clone(for_test=False)
paddle.static.default_startup_program().clone(
for_test=False
)
)
startup_program = paddle.static.default_startup_program()
else:
self.origin_startup_program = startup_program.clone(for_test=False)
self.origin_startup_program = startup_program.clone(
for_test=False
)
context["origin_startup_program"] = startup_program
context["origin_startup_programs"] = [startup_program]
......@@ -1327,12 +1334,19 @@ class Fleet:
loss, startup_program, parameter_list, no_grad_set
)
return optimize_ops, params_grads, dist_startup_prog, dist_main_prog
return (
optimize_ops,
params_grads,
dist_startup_prog,
dist_main_prog,
)
context["user_defined_strategy"] = copy.deepcopy(
self._user_defined_strategy
)
copy_user_defined_strategy = copy.deepcopy(self._user_defined_strategy)
copy_user_defined_strategy = copy.deepcopy(
self._user_defined_strategy
)
can_not_apply_optimizer_list = []
# fix set collective and fleet ps gpu error
......@@ -1380,7 +1394,9 @@ class Fleet:
if copy_user_defined_strategy._is_strict_auto():
# turn on all the strategy for each optimizer
for opt in distributed_optimizer_list:
opt._enable_strategy(copy_user_defined_strategy, context)
opt._enable_strategy(
copy_user_defined_strategy, context
)
valid_optimizer_list = []
valid_graph_optimizer_list = []
......@@ -1418,11 +1434,14 @@ class Fleet:
context["valid_strategy"] = copy.deepcopy(valid_strategy)
logger.debug("valid_strategy: " + str(context["valid_strategy"]))
logger.debug(
"user_defined_strategy: " + str(context["user_defined_strategy"])
"user_defined_strategy: "
+ str(context["user_defined_strategy"])
)
applied_meta_list = self.strategy_compiler._get_applied_meta_list()
applied_graph_list = self.strategy_compiler._get_applied_graph_list()
applied_graph_list = (
self.strategy_compiler._get_applied_graph_list()
)
context['applied_meta_list'] = applied_meta_list
context['applied_graph_list'] = applied_graph_list
......@@ -1435,16 +1454,24 @@ class Fleet:
optimize_ops = []
params_grads = []
if self._role_maker._is_non_distributed() and not self._is_collective:
if (
self._role_maker._is_non_distributed()
and not self._is_collective
):
if self._runtime_handle is None:
self._runtime_handle = RuntimeFactory()._create_runtime(context)
self._runtime_handle = RuntimeFactory()._create_runtime(
context
)
compiled_program = compiler.CompiledProgram(
self.origin_main_program
).with_data_parallel(loss_name=loss.name, share_vars_from=None)
loss.block.program._graph = compiled_program
return self.user_defined_optimizer.minimize(
loss, startup_program, parameter_list, no_grad_set=no_grad_set
loss,
startup_program,
parameter_list,
no_grad_set=no_grad_set,
)
if meta_optimizer:
......@@ -1452,7 +1479,10 @@ class Fleet:
"before minimize program id: " + str(id(loss.block.program))
)
optimize_ops, params_grads = meta_optimizer.minimize(
loss, startup_program, parameter_list, no_grad_set=no_grad_set
loss,
startup_program,
parameter_list,
no_grad_set=no_grad_set,
)
logger.debug(
"after minimize program id: " + str(id(loss.block.program))
......@@ -1463,12 +1493,19 @@ class Fleet:
if id(default_program) != id(loss.block.program):
paddle.framework.switch_main_program(loss.block.program)
logger.debug(
"default program id after switch: " + str(id(default_program))
"default program id after switch: "
+ str(id(default_program))
)
else:
optimize_ops, params_grads = self.user_defined_optimizer.minimize(
loss, startup_program, parameter_list, no_grad_set=no_grad_set
(
optimize_ops,
params_grads,
) = self.user_defined_optimizer.minimize(
loss,
startup_program,
parameter_list,
no_grad_set=no_grad_set,
)
context["program_optimize_ops"] = optimize_ops
......@@ -1480,7 +1517,10 @@ class Fleet:
+ str(id(loss.block.program))
)
optimize_ops, params_grads = graph_optimizer.minimize(
loss, startup_program, parameter_list, no_grad_set=no_grad_set
loss,
startup_program,
parameter_list,
no_grad_set=no_grad_set,
)
# since we do not encourage users to use graph operations
# if a graph optimizer takes effect, mostly
......@@ -1493,7 +1533,9 @@ class Fleet:
if not self._role_maker._is_heter_parameter_server_mode:
program = paddle.static.default_main_program()
opt_info = {} if program._fleet_opt is None else program._fleet_opt
opt_info = (
{} if program._fleet_opt is None else program._fleet_opt
)
opt_info["mpi_size"] = self.worker_num()
opt_info["mpi_rank"] = self.worker_index()
for (
......
......@@ -18,13 +18,7 @@ from paddle.common_ops_import import dygraph_utils
from paddle.distributed import collective
from paddle.fluid import core
from paddle.fluid.data_feeder import check_dtype, check_variable_and_dtype
from paddle.framework import (
LayerHelper,
_in_legacy_dygraph,
_varbase_creator,
in_dygraph_mode,
in_dynamic_mode,
)
from paddle.framework import LayerHelper, _varbase_creator, in_dygraph_mode
from paddle.nn import Layer
from ....communication.reduce import ReduceOp, _get_reduce_op
......@@ -69,17 +63,7 @@ def _c_identity(tensor, group=None):
return dy
return c_identity_eager.apply(tensor)
elif _in_legacy_dygraph():
return _legacy_C_ops.c_identity(
tensor,
'use_calc_stream',
True,
'ring_id',
ring_id,
'use_model_parallel',
True,
)
else:
op_type = 'c_identity'
helper = LayerHelper(op_type, **locals())
out = helper.create_variable_for_type_inference(dtype=tensor.dtype)
......@@ -125,7 +109,7 @@ def _c_concat(tensor, group=None):
rank = group.rank
nranks = group.nranks
if in_dynamic_mode():
if in_dygraph_mode():
return _legacy_C_ops.c_concat(
tensor,
'ring_id',
......@@ -139,7 +123,7 @@ def _c_concat(tensor, group=None):
'use_model_parallel',
True,
)
else:
op_type = 'c_concat'
helper = LayerHelper(op_type, **locals())
out = helper.create_variable_for_type_inference(dtype=tensor.dtype)
......@@ -191,7 +175,7 @@ def _c_split(tensor, group=None):
else group.nranks
)
if in_dynamic_mode():
if in_dygraph_mode():
return _legacy_C_ops.c_split(
tensor,
'use_calc_stream',
......@@ -205,7 +189,7 @@ def _c_split(tensor, group=None):
'use_model_parallel',
True,
)
else:
op_type = 'c_split'
helper = LayerHelper(op_type, **locals())
out = helper.create_variable_for_type_inference(dtype=tensor.dtype)
......@@ -286,20 +270,8 @@ def _mp_allreduce(
return mp_allreduce_eager.apply(
tensor, group, use_calc_stream, use_model_parallel
)
ring_id = 0 if group is None else group.id
if _in_legacy_dygraph():
if op == ReduceOp.SUM:
return _legacy_C_ops.mp_allreduce_sum_(
tensor,
'use_calc_stream',
use_calc_stream,
'ring_id',
ring_id,
)
else:
raise ValueError("Unknown parameter: {}.".format(op))
ring_id = 0 if group is None else group.id
op_type = 'mp_allreduce_sum'
helper = LayerHelper(op_type, **locals())
out = helper.create_variable_for_type_inference(dtype=tensor.dtype)
......@@ -337,11 +309,11 @@ def _c_lookup_table(table, index, start_index=0, name=None):
Returns:
Tensor.
"""
if in_dynamic_mode():
if in_dygraph_mode():
return _legacy_C_ops.c_embedding(
table, index, "start_index", start_index
)
else:
op_type = 'c_embedding'
helper = LayerHelper(op_type, **locals())
dtype = helper.input_dtype(input_param_name='table')
......@@ -426,7 +398,7 @@ def _c_softmax_with_cross_entropy(
if input_dims - 1 == label_dims:
label = paddle.unsqueeze(label, axis=-1)
if in_dynamic_mode():
if in_dygraph_mode():
softmax, loss = _legacy_C_ops.c_softmax_with_cross_entropy(
logits, label, 'ring_id', ring_id, 'rank', rank, 'nranks', nranks
)
......@@ -434,7 +406,7 @@ def _c_softmax_with_cross_entropy(
return loss
else:
return loss, softmax
else:
attrs = {
'ring_id': ring_id,
'rank': rank,
......@@ -460,7 +432,7 @@ def _linear(x, weight, bias=None, name=None):
"""
Function Linear
"""
if in_dynamic_mode():
if in_dygraph_mode():
pre_bias = _varbase_creator(dtype=x.dtype)
_legacy_C_ops.matmul(
x,
......@@ -827,7 +799,7 @@ def split(
supported_operations
)
)
if in_dynamic_mode():
if in_dygraph_mode():
raise ValueError(
"paddle.distributed.split cannot be used in dynamic "
"graph mode, plese use ParallelEmbedding, ParallelRowLinear, "
......
......@@ -20,7 +20,8 @@ import paddle
from paddle import _legacy_C_ops
from paddle.fluid import core
from paddle.fluid.data_feeder import check_variable_and_dtype
from paddle.framework import LayerHelper, in_dynamic_mode
from paddle.fluid.framework import in_dygraph_mode
from paddle.framework import LayerHelper
from paddle.static import Variable
__all__ = []
......@@ -211,7 +212,7 @@ def dropout(
) # semantic transfer
# dygraph using tracker, doesn't need determinate seed
if in_dynamic_mode():
if in_dygraph_mode():
out, mask = _legacy_C_ops.dropout(
x,
'dropout_prob',
......@@ -226,7 +227,7 @@ def dropout(
mode,
)
return out
else:
seed = determinate_seed(rng_name)
if isinstance(p, Variable) and not p.shape != [1]:
......
......@@ -19,10 +19,10 @@ from .meta_optimizer_base import MetaOptimizerBase
__all__ = []
import paddle
from paddle import framework
from paddle.common_ops_import import LayerHelper
from paddle.fluid.clip import GradientClipByNorm, append_gradient_clip_ops
from paddle.fluid.dygraph import base as imperative_base
from paddle.fluid.framework import in_dygraph_mode
from paddle.fluid.optimizer import Momentum, Optimizer
from paddle.framework import core
from paddle.static import create_global_var
......@@ -46,7 +46,7 @@ class DGCMomentumOptimizer(Optimizer):
grad_clip=None,
name=None,
):
if framework._non_static_mode():
if in_dygraph_mode():
raise Exception("In dygraph, don't support DGCMomentumOptimizer.")
assert (
......
......@@ -16,8 +16,7 @@ import numpy as np
import paddle
import paddle.fluid.core as core
from paddle import _legacy_C_ops
from paddle.fluid.framework import _in_legacy_dygraph, in_dygraph_mode
from paddle.fluid.framework import in_dygraph_mode
from ...utils.log_util import logger
from .utils import number_2_dtype, paddle_2_number
......@@ -189,21 +188,7 @@ def _partial_send_op(
tensor, group, use_calc_stream, ring_id, dst, nranks, rank_id
):
dst_rank_in_group = dst if group is None else group.get_group_rank(dst)
if _in_legacy_dygraph():
return _legacy_C_ops.partial_send(
tensor.detach(),
'use_calc_stream',
use_calc_stream,
'ring_id',
ring_id,
'peer',
dst_rank_in_group,
'num',
nranks,
'id',
rank_id,
)
elif in_dygraph_mode():
if in_dygraph_mode():
group = (
paddle.distributed.collective._get_default_group()
if group is None
......@@ -234,11 +219,6 @@ def send_partial(
tensor, group, use_calc_stream, ring_id, dst_rank, nranks, rank_id
)
else:
if _in_legacy_dygraph():
send_op = lambda x, dst, group: paddle.distributed.send(
x, dst, group, use_calc_stream
)
elif in_dygraph_mode():
send_op = paddle.distributed.isend
return send_op(tensor.detach(), dst=dst_rank, group=group)
......@@ -247,26 +227,6 @@ def _partial_recv_op(
tensor, group, use_calc_stream, ring_id, src, nranks, rank_id
):
src_rank_in_group = src if group is None else group.get_group_rank(src)
if _in_legacy_dygraph():
assert use_calc_stream
return _legacy_C_ops.partial_recv(
tensor.detach(),
'use_calc_stream',
use_calc_stream,
'ring_id',
ring_id,
'peer',
src_rank_in_group,
'num',
nranks,
'id',
rank_id,
'dtype',
tensor.dtype,
'out_shape',
tensor.shape,
)
elif in_dygraph_mode():
group = (
paddle.distributed.collective._get_default_group()
if group is None
......@@ -297,7 +257,7 @@ def recv_partial(
tensor, group, use_calc_stream, ring_id, src_rank, nranks, rank_id
)
else:
if _in_legacy_dygraph() or use_calc_stream:
if use_calc_stream:
recv_op = paddle.distributed.recv
elif in_dygraph_mode():
recv_op = paddle.distributed.irecv
......@@ -307,19 +267,6 @@ def recv_partial(
def _partial_allgather_op(
tensor, group, use_calc_stream, ring_id, nranks, rank_id
):
if _in_legacy_dygraph():
return _legacy_C_ops.partial_allgather_(
tensor.detach(),
'use_calc_stream',
use_calc_stream,
'ring_id',
ring_id,
'nranks',
nranks,
'rank',
rank_id,
)
elif in_dygraph_mode():
group = (
paddle.distributed.collective._get_default_group()
if group is None
......
......@@ -14,8 +14,8 @@
import copy
import paddle
from paddle.distributed import fleet
from paddle.fluid.framework import in_dygraph_mode
from .meta_optimizers import HeterParallelOptimizer, HybridParallelOptimizer
from .utils.log_util import logger
......@@ -74,7 +74,7 @@ def _dygraph_distributed_optimizer(optimizer, strategy=None):
def distributed_optimizer(*args, **kwargs):
if paddle.framework._non_static_mode():
if in_dygraph_mode():
return _dygraph_distributed_optimizer(*args, **kwargs)
else:
return fleet.fleet.distributed_optimizer(*args, **kwargs)
......@@ -20,7 +20,8 @@ import paddle.distributed.fleet as fleet
# (TODO: GhostScreaming) It will be removed later.
import paddle.fluid.core as core
from paddle.framework import Block, Program, _non_static_mode
from paddle.fluid.framework import in_dygraph_mode
from paddle.framework import Block, Program
class HybridParallelInferenceHelper:
......@@ -205,7 +206,7 @@ class HybridParallelInferenceHelper:
elif core.is_compiled_with_cuda():
self._device = "gpu"
assert self._device, "Only gpu and npu are supported."
assert not _non_static_mode(), "Only static mode is supported."
assert not in_dygraph_mode(), "Only static mode is supported."
op_maker = core.op_proto_and_checker_maker
self._op_role = op_maker.OpRole
......
......@@ -18,7 +18,6 @@ from paddle import framework
# (TODO: GhostScreaming) It will be removed later.
from paddle.fluid import core
from paddle.framework import (
_in_legacy_dygraph,
_split_tensors,
build_groups,
in_dygraph_mode,
......@@ -215,7 +214,6 @@ def sharding_reduce_gradients(parameter_list, hcg):
sharding_nrank = hcg.get_sharding_parallel_group().nranks
for param in parameter_list:
if param.trainable and (param._grad_ivar() is not None):
if in_dygraph_mode():
param.grad.scale_(1.0 / sharding_nrank)
paddle.distributed.all_reduce(
param.grad,
......@@ -223,32 +221,6 @@ def sharding_reduce_gradients(parameter_list, hcg):
sync_op=True,
)
elif _in_legacy_dygraph():
g_var = param._grad_ivar()
# need use trace_op to allreduce
# paddle.distributed.all_reduce(
# g_var, group=hcg.get_sharding_parallel_group(), use_calc_stream=True)
paddle.fluid.framework._dygraph_tracer().trace_op(
type="c_allreduce_sum",
inputs={'X': g_var},
outputs={'Out': g_var},
attrs={
'ring_id': hcg.get_sharding_parallel_group().id,
'use_calc_stream': True,
},
)
# grad / sharding_rank
div_factor = paddle.to_tensor(
sharding_nrank, dtype=g_var.dtype
)
paddle.fluid.framework._dygraph_tracer().trace_op(
type="elementwise_div",
inputs={'X': g_var, 'Y': div_factor},
outputs={'Out': g_var},
attrs={'axis': -1},
)
def broadcast_sharding_parameters(model, hcg):
# TODO: To save memory, use un-fused broadcast to avoid potential OOM
......
......@@ -13,9 +13,8 @@
# limitations under the License.
from paddle import _legacy_C_ops
from paddle.fluid import core
from paddle.fluid.data_feeder import check_variable_and_dtype
from paddle.fluid.framework import _in_legacy_dygraph, in_dygraph_mode
from paddle.fluid.framework import in_dygraph_mode
from paddle.fluid.layer_helper import LayerHelper
......@@ -43,8 +42,6 @@ def _number_count(numbers, upper_range):
"""
if in_dygraph_mode():
return _legacy_C_ops.number_count(numbers, 'upper_range', upper_range)
elif _in_legacy_dygraph():
return core.ops.number_count(numbers, 'upper_range', upper_range)
else:
op_type = 'number_count'
......@@ -92,8 +89,6 @@ def _assign_pos(x, cum_count):
"""
if in_dygraph_mode():
return _legacy_C_ops.assign_pos(x, cum_count, cum_count[-1])
elif _in_legacy_dygraph():
return core.ops.assign_pos(x, cum_count, cum_count[-1])
else:
op_type = 'assign_pos'
......@@ -129,8 +124,6 @@ def _random_routing(topk_idx, topk_value, prob, topk=2):
if topk == 2:
if in_dygraph_mode():
return _legacy_C_ops.random_routing(prob, topk_value, topk_idx)
elif _in_legacy_dygraph():
return core.ops.random_routing(prob, topk_value, topk_idx)
else:
raise RuntimeError("Not supporting static mode now")
else:
......@@ -162,10 +155,6 @@ def _limit_by_capacity(expert_count, capacity, n_worker):
return _legacy_C_ops.limit_by_capacity(
expert_count, capacity, 'n_worker', n_worker
)
elif _in_legacy_dygraph():
return core.ops.limit_by_capacity(
expert_count, capacity, 'n_worker', n_worker
)
else:
op_type = 'limit_by_capacity'
......@@ -211,10 +200,7 @@ def _prune_gate_by_capacity(gate_idx, expert_count, n_expert, n_worker):
return _legacy_C_ops.prune_gate_by_capacity(
gate_idx, expert_count, "n_expert", n_expert, "n_worker", n_worker
)
elif _in_legacy_dygraph():
return core.ops.prune_gate_by_capacity(
gate_idx, expert_count, "n_expert", n_expert, "n_worker", n_worker
)
else:
check_variable_and_dtype(
gate_idx,
'GateIdx',
......
......@@ -14,7 +14,7 @@
from paddle import _legacy_C_ops
from paddle.fluid.data_feeder import check_variable_and_dtype
from paddle.fluid.framework import _non_static_mode
from paddle.fluid.framework import in_dygraph_mode
from paddle.fluid.layer_helper import LayerHelper
......@@ -103,7 +103,7 @@ def global_scatter(
return
ring_id = 0 if group is None else group.id
if _non_static_mode():
if in_dygraph_mode():
return _legacy_C_ops.global_scatter(
x,
local_count,
......@@ -220,7 +220,7 @@ def global_gather(
return
ring_id = 0 if group is None else group.id
if _non_static_mode():
if in_dygraph_mode():
return _legacy_C_ops.global_gather(
x,
local_count,
......
......@@ -15,7 +15,7 @@
import paddle
from paddle.distribution import exponential_family
from paddle.fluid.data_feeder import check_variable_and_dtype
from paddle.fluid.framework import _in_legacy_dygraph, in_dygraph_mode
from paddle.fluid.framework import in_dygraph_mode
from paddle.fluid.layer_helper import LayerHelper
......@@ -166,8 +166,6 @@ def _dirichlet(concentration, name=None):
if in_dygraph_mode():
return paddle._C_ops.dirichlet(concentration)
elif _in_legacy_dygraph():
return paddle._legacy_C_ops.dirichlet(concentration)
else:
helper = LayerHelper(op_type, **locals())
out = helper.create_variable_for_type_inference(
......
......@@ -24,13 +24,9 @@ import warnings
import numpy as np
import paddle
from paddle import _C_ops, _legacy_C_ops
from paddle import _C_ops
from paddle.fluid.data_feeder import check_variable_and_dtype, convert_dtype
from paddle.fluid.framework import (
_in_legacy_dygraph,
_non_static_mode,
in_dygraph_mode,
)
from paddle.fluid.framework import in_dygraph_mode
from paddle.fluid.layers import tensor
......@@ -221,7 +217,7 @@ class Distribution:
Returns:
value (Tensor): Change value's dtype if value's dtype is different from param.
"""
if _non_static_mode():
if in_dygraph_mode():
if value.dtype != param.dtype and convert_dtype(value.dtype) in [
'float32',
'float64',
......@@ -229,12 +225,7 @@ class Distribution:
warnings.warn(
"dtype of input 'value' needs to be the same as parameters of distribution class. dtype of 'value' will be converted."
)
if in_dygraph_mode():
return _C_ops.cast(value, param.dtype)
if _in_legacy_dygraph():
return _legacy_C_ops.cast(
value, 'in_dtype', value.dtype, 'out_dtype', param.dtype
)
return value
check_variable_and_dtype(
......
......@@ -15,14 +15,10 @@
import numpy as np
import paddle
from paddle import _C_ops, _legacy_C_ops
from paddle import _C_ops
from paddle.distribution import distribution
from paddle.fluid.data_feeder import check_type, convert_dtype
from paddle.fluid.framework import (
_in_legacy_dygraph,
_non_static_mode,
in_dygraph_mode,
)
from paddle.fluid.framework import _non_static_mode, in_dygraph_mode
from paddle.fluid.layers import tensor
from paddle.tensor import random
......@@ -210,25 +206,15 @@ class Uniform(distribution.Distribution):
"""
value = self._check_values_dtype_in_probs(self.low, value)
if _non_static_mode():
if in_dygraph_mode():
# ensure value in [low, high]
lb_bool = self.low < value
ub_bool = value < self.high
if in_dygraph_mode():
lb = _C_ops.cast(lb_bool, value.dtype)
ub = _C_ops.cast(ub_bool, value.dtype)
return paddle.log(lb * ub) - paddle.log(self.high - self.low)
if _in_legacy_dygraph():
lb = _legacy_C_ops.cast(
lb_bool, 'in_dtype', lb_bool.dtype, 'out_dtype', value.dtype
)
ub = _legacy_C_ops.cast(
ub_bool, 'in_dtype', ub_bool.dtype, 'out_dtype', value.dtype
)
return paddle.log(lb * ub) - paddle.log(self.high - self.low)
else:
name = self.name + '_log_prob'
lb_bool = self.low < value
ub_bool = value < self.high
......@@ -249,24 +235,13 @@ class Uniform(distribution.Distribution):
"""
value = self._check_values_dtype_in_probs(self.low, value)
if _non_static_mode():
if in_dygraph_mode():
lb_bool = self.low < value
ub_bool = value < self.high
if in_dygraph_mode():
lb = _C_ops.cast(lb_bool, value.dtype)
ub = _C_ops.cast(ub_bool, value.dtype)
return (lb * ub) / (self.high - self.low)
if _in_legacy_dygraph():
lb = _legacy_C_ops.cast(
lb_bool, 'in_dtype', lb_bool.dtype, 'out_dtype', value.dtype
)
ub = _legacy_C_ops.cast(
ub_bool, 'in_dtype', ub_bool.dtype, 'out_dtype', value.dtype
)
return (lb * ub) / (self.high - self.low)
else:
name = self.name + '_probs'
lb_bool = self.low < value
ub_bool = value < self.high
......