未验证 提交 6b690d89 编写于 作者: S ShenLiang 提交者: GitHub

Fix pipeline in new dygraph (#41937)

* fix utest

* fix time
上级 60356f67
...@@ -12,6 +12,32 @@ ...@@ -12,6 +12,32 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
# The file has been adapted from the file:
# https://github.com/microsoft/DeepSpeed/blob/master/deepspeed/runtime/pipe/module.py
# Git commit hash: fafc827d643b3eed611e282d909025f16be36601
# We retain the following license from the original files:
# MIT License
# Copyright (c) Microsoft Corporation.
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE
import math import math
import re import re
import glob import glob
...@@ -24,6 +50,7 @@ import paddle ...@@ -24,6 +50,7 @@ import paddle
from paddle.fluid.dygraph.layers import Layer from paddle.fluid.dygraph.layers import Layer
from ...utils.log_util import logger, layer_to_str from ...utils.log_util import logger, layer_to_str
from ..pp_utils.utils import _hp_recompute, _initialize_recompute_setting from ..pp_utils.utils import _hp_recompute, _initialize_recompute_setting
from paddle.fluid.framework import in_dygraph_mode
__all__ = [] __all__ = []
...@@ -269,15 +296,20 @@ class PipelineLayer(Layer): ...@@ -269,15 +296,20 @@ class PipelineLayer(Layer):
for key, comm in self.shared_comm.items(): for key, comm in self.shared_comm.items():
param = getattr(self.shared_layers[key], comm['weight_attr']) param = getattr(self.shared_layers[key], comm['weight_attr'])
# need use trace_op to allreduce weight # need use trace_op to allreduce weight
with paddle.framework.no_grad(): if in_dygraph_mode():
paddle.fluid.framework._dygraph_tracer().trace_op( with paddle.framework.no_grad():
type="c_allreduce_sum", paddle.distributed.all_reduce(
inputs={'X': param._grad_ivar()}, param.grad, group=comm['group'])
outputs={'Out': param._grad_ivar()}, else:
attrs={ with paddle.framework.no_grad():
'ring_id': comm['group'].id, paddle.fluid.framework._dygraph_tracer().trace_op(
'use_calc_stream': True type="c_allreduce_sum",
}) inputs={'X': param._grad_ivar()},
outputs={'Out': param._grad_ivar()},
attrs={
'ring_id': comm['group'].id,
'use_calc_stream': True
})
def _segment_network(self, seg_method): def _segment_network(self, seg_method):
logger.info("start segment network..") logger.info("start segment network..")
......
...@@ -23,6 +23,7 @@ from ..utils.hybrid_parallel_util import broadcast_sharding_parameters ...@@ -23,6 +23,7 @@ from ..utils.hybrid_parallel_util import broadcast_sharding_parameters
from ..utils.log_util import logger from ..utils.log_util import logger
from ..meta_optimizers.dygraph_optimizer import HybridParallelOptimizer, HybridParallelGradScaler from ..meta_optimizers.dygraph_optimizer import HybridParallelOptimizer, HybridParallelGradScaler
from .pp_utils import p2p_communication as p2p from .pp_utils import p2p_communication as p2p
import paddle.fluid.core as core
__all__ = [] __all__ = []
...@@ -238,9 +239,9 @@ class PipelineParallel(MetaParallelBase): ...@@ -238,9 +239,9 @@ class PipelineParallel(MetaParallelBase):
assert self._layers._loss_fn is not None, "loss function should exist to compute loss" assert self._layers._loss_fn is not None, "loss function should exist to compute loss"
labels = self._load_micro_batch(self.micro_batch_id) labels = self._load_micro_batch(self.micro_batch_id)
output_tensor = self._layers._loss_fn(output_tensor, labels) output_tensor = self._layers._loss_fn(output_tensor, labels)
assert isinstance( assert isinstance(output_tensor, (
output_tensor, paddle.Tensor paddle.Tensor, core.eager.Tensor
), "Currently, loss_fn should obtain Paddle.Tensor dtype" )), "Currently, loss_fn should obtain Paddle.Tensor dtype"
with paddle.amp.auto_cast(enable=False): with paddle.amp.auto_cast(enable=False):
if self.accumulate_steps > 1: if self.accumulate_steps > 1:
...@@ -254,31 +255,33 @@ class PipelineParallel(MetaParallelBase): ...@@ -254,31 +255,33 @@ class PipelineParallel(MetaParallelBase):
return output_tensor return output_tensor
def _backward_step(self, input_tensor, output_tensor, output_tensor_grad): def _backward_step(self, input_tensor, output_tensor, output_tensor_grad):
if self.is_last_stage: with paddle.amp.auto_cast(enable=False):
assert output_tensor_grad is None if self.is_last_stage:
if self.scaler: assert output_tensor_grad is None
paddle.autograd.backward(self.scaler.scale(output_tensor)) if self.scaler:
else: paddle.autograd.backward(self.scaler.scale(output_tensor))
paddle.autograd.backward(output_tensor) else:
else: paddle.autograd.backward(output_tensor)
if isinstance(output_tensor, tuple):
outputs = [t for t in output_tensor if not t.stop_gradient]
assert len(outputs) == len(output_tensor_grad)
paddle.autograd.backward(
tensors=outputs,
grad_tensors=[t for t in output_tensor_grad])
else:
paddle.autograd.backward(
tensors=[output_tensor], grad_tensors=[output_tensor_grad])
input_tensor_grad = None
if input_tensor is not None:
if isinstance(input_tensor, tuple):
input_tensor_grad = tuple(
[t.grad for t in input_tensor if not t.stop_gradient])
else: else:
input_tensor_grad = input_tensor.grad if isinstance(output_tensor, tuple):
return input_tensor_grad outputs = [t for t in output_tensor if not t.stop_gradient]
assert len(outputs) == len(output_tensor_grad)
paddle.autograd.backward(
tensors=outputs,
grad_tensors=[t for t in output_tensor_grad])
else:
paddle.autograd.backward(
tensors=[output_tensor],
grad_tensors=[output_tensor_grad])
input_tensor_grad = None
if input_tensor is not None:
if isinstance(input_tensor, tuple):
input_tensor_grad = tuple(
[t.grad for t in input_tensor if not t.stop_gradient])
else:
input_tensor_grad = input_tensor.grad
return input_tensor_grad
def _load_micro_batch(self, cache_id): def _load_micro_batch(self, cache_id):
inputs = self.data inputs = self.data
......
...@@ -17,6 +17,7 @@ from .utils import paddle_2_number, number_2_dtype ...@@ -17,6 +17,7 @@ from .utils import paddle_2_number, number_2_dtype
from ...utils.log_util import logger from ...utils.log_util import logger
import numpy as np import numpy as np
from paddle import _C_ops from paddle import _C_ops
import paddle.fluid.core as core
_hcg = None _hcg = None
_use_cache = False _use_cache = False
...@@ -114,7 +115,7 @@ class SendRecvMeta: ...@@ -114,7 +115,7 @@ class SendRecvMeta:
paddle.distributed.send(stop_grad, dst=1, group=group) paddle.distributed.send(stop_grad, dst=1, group=group)
def send_meta(self, tensor, group): def send_meta(self, tensor, group):
if isinstance(tensor, paddle.Tensor): if isinstance(tensor, (paddle.Tensor, core.eager.Tensor)):
tensor_type = paddle.to_tensor([0]) tensor_type = paddle.to_tensor([0])
# send tensor type # send tensor type
paddle.distributed.send(tensor_type, dst=1, group=group) paddle.distributed.send(tensor_type, dst=1, group=group)
...@@ -129,11 +130,11 @@ class SendRecvMeta: ...@@ -129,11 +130,11 @@ class SendRecvMeta:
paddle.distributed.send(nums, dst=1, group=group) paddle.distributed.send(nums, dst=1, group=group)
for d in tensor: for d in tensor:
assert isinstance(d, paddle.Tensor) assert isinstance(d, (paddle.Tensor, core.eager.Tensor))
self._send_dims_shape_dtype(d, group=group) self._send_dims_shape_dtype(d, group=group)
def set_send_message(self, tensor): def set_send_message(self, tensor):
if isinstance(tensor, paddle.Tensor): if isinstance(tensor, (paddle.Tensor, core.eager.Tensor)):
self.send_shape_message = tensor.shape self.send_shape_message = tensor.shape
self.send_dtype_message = paddle_2_number(tensor.dtype) self.send_dtype_message = paddle_2_number(tensor.dtype)
elif isinstance(tensor, tuple): elif isinstance(tensor, tuple):
......
...@@ -17,10 +17,11 @@ import contextlib ...@@ -17,10 +17,11 @@ import contextlib
import paddle import paddle
from paddle.fluid import core from paddle.fluid import core
from paddle import _C_ops from paddle import _C_ops
from paddle.autograd import PyLayer from paddle.autograd import PyLayer, EagerPyLayer
from paddle.fluid import framework from paddle.fluid import framework
from ...utils.recompute import check_recompute_necessary, detach_variable from ...utils.recompute import check_recompute_necessary, detach_variable
from ..parallel_layers.random import get_rng_state_tracker from ..parallel_layers.random import get_rng_state_tracker
from paddle.fluid.framework import in_dygraph_mode
__all__ = [] __all__ = []
...@@ -164,6 +165,138 @@ def _swith_rng_state_tracker(rng_state, tracker): ...@@ -164,6 +165,138 @@ def _swith_rng_state_tracker(rng_state, tracker):
get_rng_state_tracker().set_states_tracker(orig_cuda_rng_tracker) get_rng_state_tracker().set_states_tracker(orig_cuda_rng_tracker)
class _HPEagerRecomputeFunction(EagerPyLayer):
    """
    Eager-mode (EagerPyLayer) recompute function for hybrid parallel training.

    The forward pass runs ``run_function`` under ``paddle.no_grad()`` and only
    stores the inputs; the backward pass re-runs ``run_function`` on the
    restored inputs and back-propagates through that recomputation.

    Compared with paddle.distributed.fleet.utils.recompute, there are the following differences:
    1. In order to support PipeLineParallel, the input of recompute is modified to ensure that the input can be tuple type.
    2. Offload support for activation (inputs are moved to CPU when
       ``_recompute_offload`` is set).
    3. Support MP segmentation of activation to further reduce cuda memory
       (inputs go through ``_split_activation`` when ``_recompute_partition``
       is set).
    4. Adapt to the random state of MP (the CUDA RNG state and the RNG state
       tracker are captured in forward and restored for the recomputation).
    """

    @staticmethod
    def forward(ctx, run_function, all_outputs, *args):
        """Run ``run_function(*args)`` without recording gradients and stash
        what backward needs to recompute it.

        Args:
            ctx: PyLayer context; used to carry state to ``backward``.
            run_function: callable that is invoked as ``run_function(*args)``.
            all_outputs: list that is extended in place with the outputs, so
                the caller can read them after ``apply`` returns.
            *args: inputs of ``run_function``; tensors and non-tensors may be
                mixed.

        Returns:
            The single output tensor, or ``tuple(outputs)`` otherwise.

        Raises:
            AssertionError: if the current device string does not contain
                ``'gpu:'``.
            ValueError: if the tracer reports an AMP level other than
                O0/O1/O2.
        """
        check_recompute_necessary(args)

        # store for recomputing
        ctx.run_function = run_function

        # store the rng states
        # (captured before run_function executes so that backward can replay
        # the forward computation with the same random state)
        ctx.fwd_cuda_rng_state = paddle.get_cuda_rng_state()
        ctx.fwd_cuda_rng_state_tracker = get_rng_state_tracker(
        ).get_states_tracker()

        # save input for backward
        # ctx.inputs keeps one slot per positional argument: the value itself
        # for non-tensors, None for tensors (those go through
        # save_for_backward and are put back by index in backward).
        ctx.inputs = []
        ctx.tensor_indices = []
        ctx.tensor_shapes = []
        tensor_inputs = []

        # NOTE(review): the device is queried twice; cur_device is only used
        # in the assertion message.
        cur_device = paddle.get_device()
        assert 'gpu:' in paddle.get_device(
        ), "Recompute with RNG is not support current device: {}.".format(
            cur_device)

        # TODO support AMP
        # Record the AMP configuration that is active during forward so the
        # recomputation in backward runs under the same auto_cast settings.
        tracer = framework._dygraph_tracer()
        ctx.is_fw_autocast = False if tracer._amp_level == core.AmpLevel.O0 else True
        if tracer._amp_level == core.AmpLevel.O2:
            ctx.amp_level = 'O2'
        elif tracer._amp_level in (core.AmpLevel.O1, core.AmpLevel.O0):
            # O0 is recorded as 'O1' as well; in that case auto_cast is
            # switched off through ctx.is_fw_autocast above.
            ctx.amp_level = 'O1'
        else:
            raise ValueError("unsupported amp level: {}".format(
                tracer._amp_level))
        ctx.amp_white_list, ctx.amp_black_list = tracer._get_amp_op_list()

        # Forward without building the autograd graph; the graph is rebuilt
        # by the recomputation in backward.
        with paddle.no_grad():
            outputs = run_function(*args)

        for i, arg in enumerate(args):
            if paddle.is_tensor(arg):
                # stop_gradient is remembered and re-applied because the
                # tensor may be replaced by a split / CPU copy below.
                state = arg.stop_gradient
                if _recompute_partition:
                    # the original shape is needed to reshape after merging
                    ctx.tensor_shapes.append(arg.shape)
                    partition = _split_activation(arg.detach()).clone()
                    # TODO(shenliang03) not use calculate stream to D2H to speed
                    arg = partition.cpu() if _recompute_offload else partition
                else:
                    arg = arg.cpu() if _recompute_offload else arg
                arg.stop_gradient = state
                tensor_inputs.append(arg)
                ctx.tensor_indices.append(i)
                ctx.inputs.append(None)
            else:
                ctx.inputs.append(arg)

        ctx.save_for_backward(*tensor_inputs)

        # all_outputs is extended in place (list +=), which is how the caller
        # receives the outputs.
        if paddle.is_tensor(outputs):
            all_outputs += [outputs]
            return outputs
        else:
            all_outputs += outputs
            return tuple(outputs)

    @staticmethod
    def backward(ctx, *args):
        """Recompute the forward pass and back-propagate through it.

        Args:
            ctx: PyLayer context filled in by ``forward``.
            *args: gradients w.r.t. the forward outputs, one per output.

        Returns:
            A tuple with the gradient of every recomputation input that is a
            ``core.eager.Tensor``.

        Raises:
            AssertionError: if the number of recomputed outputs differs from
                the number of incoming gradients.
            RuntimeError: if no recomputed output has ``stop_gradient=False``.
        """
        with paddle.fluid.dygraph.guard():
            # Restore inputs
            inputs = list(ctx.inputs)
            tensor_indices = ctx.tensor_indices
            tensor_shapes = ctx.tensor_shapes
            tensors = list(ctx.saved_tensor())

            device_id = paddle.distributed.ParallelEnv().device_id
            for i, idx in enumerate(tensor_indices):
                if _recompute_partition:
                    # undo the split done in forward and restore the shape
                    state = tensors[i].stop_gradient
                    tensors[i] = _merge_activation(tensors[i]).detach(
                    ).reshape_(tensor_shapes[i])
                    tensors[i].stop_gradient = state
                # offloaded tensors are copied back to this rank's GPU
                inputs[idx] = tensors[i].cuda(
                    device_id) if _recompute_offload else tensors[i]

            tracer = framework._dygraph_tracer()
            # presumably re-enables gradient recording for the recomputation
            # below - confirm against the tracer implementation
            tracer._has_grad = True

            # need restore auto_cast state as well as w/b list
            with _swith_rng_state_tracker(ctx.fwd_cuda_rng_state,
                                          ctx.fwd_cuda_rng_state_tracker):
                with paddle.amp.auto_cast(
                        enable=ctx.is_fw_autocast,
                        custom_white_list=ctx.amp_white_list,
                        custom_black_list=ctx.amp_black_list,
                        level=ctx.amp_level):
                    # the recomputation runs on detached copies of the inputs
                    detached_inputs = detach_variable(tuple(inputs))
                    outputs = ctx.run_function(*detached_inputs)

            # normalise a single tensor output to a 1-tuple
            if isinstance(outputs, core.eager.Tensor):
                outputs = (outputs, )
            assert len(outputs) == len(args)

            forward_outputs_with_grad = []
            backward_inputs = []

            # keep only the outputs that take part in autograd, paired with
            # the gradient that was passed in for them
            for i in range(len(outputs)):
                if isinstance(
                        outputs[i],
                        core.eager.Tensor) and not outputs[i].stop_gradient:
                    forward_outputs_with_grad.append(outputs[i])
                    backward_inputs.append(args[i])

            if len(forward_outputs_with_grad) == 0:
                raise RuntimeError(
                    "none of output has stop_gradient=False, this recompute() is not necessary"
                )

            # actually backward
            paddle.autograd.backward(forward_outputs_with_grad, backward_inputs)
            # gradients are read from the detached inputs
            grads = tuple(inp._grad_ivar() for inp in detached_inputs
                          if isinstance(inp, core.eager.Tensor))
            return grads
class _HPRecomputeFunction(PyLayer): class _HPRecomputeFunction(PyLayer):
""" """
Compared with paddle.distributed.fleet.utils.recompute, there are the following differences: Compared with paddle.distributed.fleet.utils.recompute, there are the following differences:
...@@ -290,8 +423,8 @@ class _HPRecomputeFunction(PyLayer): ...@@ -290,8 +423,8 @@ class _HPRecomputeFunction(PyLayer):
# actually backward # actually backward
paddle.autograd.backward(forward_outputs_with_grad, backward_inputs) paddle.autograd.backward(forward_outputs_with_grad, backward_inputs)
grads = list(inp._grad_ivar() for inp in detached_inputs grads = tuple(inp._grad_ivar() for inp in detached_inputs
if isinstance(inp, core.VarBase)) if isinstance(inp, core.VarBase))
return grads return grads
...@@ -303,7 +436,10 @@ def _hp_recompute(function, *args): ...@@ -303,7 +436,10 @@ def _hp_recompute(function, *args):
# 3. Here, we only use float dtype to distinguish whether a gradient is needed in output tensor # 3. Here, we only use float dtype to distinguish whether a gradient is needed in output tensor
all_outputs = [] all_outputs = []
_HPRecomputeFunction.apply(function, all_outputs, *args) if in_dygraph_mode():
_HPEagerRecomputeFunction.apply(function, all_outputs, *args)
else:
_HPRecomputeFunction.apply(function, all_outputs, *args)
if len(all_outputs) == 1: if len(all_outputs) == 1:
return all_outputs[0] return all_outputs[0]
......
...@@ -1137,7 +1137,7 @@ if(WITH_DISTRIBUTE AND WITH_GPU AND WITH_NCCL) ...@@ -1137,7 +1137,7 @@ if(WITH_DISTRIBUTE AND WITH_GPU AND WITH_NCCL)
set_tests_properties(test_parallel_dygraph_control_flow PROPERTIES TIMEOUT 350) set_tests_properties(test_parallel_dygraph_control_flow PROPERTIES TIMEOUT 350)
set_tests_properties(test_parallel_dygraph_no_sync PROPERTIES TIMEOUT 300) set_tests_properties(test_parallel_dygraph_no_sync PROPERTIES TIMEOUT 300)
set_tests_properties(test_parallel_dygraph_no_sync_gradient_check PROPERTIES TIMEOUT 30) set_tests_properties(test_parallel_dygraph_no_sync_gradient_check PROPERTIES TIMEOUT 30)
set_tests_properties(test_parallel_dygraph_pipeline_parallel PROPERTIES TIMEOUT 200) set_tests_properties(test_parallel_dygraph_pipeline_parallel PROPERTIES TIMEOUT 500)
set_tests_properties(test_parallel_dygraph_tensor_parallel PROPERTIES TIMEOUT 200) set_tests_properties(test_parallel_dygraph_tensor_parallel PROPERTIES TIMEOUT 200)
set_tests_properties(test_parallel_dygraph_sharding_parallel PROPERTIES TIMEOUT 120) set_tests_properties(test_parallel_dygraph_sharding_parallel PROPERTIES TIMEOUT 120)
set_tests_properties(test_dygraph_sharding_optimizer_stage2 PROPERTIES TIMEOUT 120) set_tests_properties(test_dygraph_sharding_optimizer_stage2 PROPERTIES TIMEOUT 120)
......
...@@ -112,10 +112,11 @@ class TestDistPPTraning(unittest.TestCase): ...@@ -112,10 +112,11 @@ class TestDistPPTraning(unittest.TestCase):
with paddle.amp.auto_cast(): with paddle.amp.auto_cast():
loss_a = model_a(img, label) loss_a = model_a(img, label)
scaler_a.scale(loss_a).backward()
scaler_a.minimize(optimizer_a, loss_a) scaler_a.scale(loss_a).backward()
optimizer_a.clear_grad() scaler_a.minimize(optimizer_a, loss_a)
scheduler_a.step() optimizer_a.clear_grad()
scheduler_a.step()
with paddle.amp.auto_cast(): with paddle.amp.auto_cast():
loss_b = model_b.train_batch( loss_b = model_b.train_batch(
......
...@@ -124,12 +124,12 @@ class TestDistPPTraning(unittest.TestCase): ...@@ -124,12 +124,12 @@ class TestDistPPTraning(unittest.TestCase):
with paddle.amp.auto_cast(enable=True, level='O2'): with paddle.amp.auto_cast(enable=True, level='O2'):
loss_a = model_a(img, label) loss_a = model_a(img, label)
scaler_a.scale(loss_a).backward() scaler_a.scale(loss_a).backward()
with paddle.amp.auto_cast(enable=False): scaler_a.minimize(optimizer_a, loss_a)
scaler_a.minimize(optimizer_a, loss_a) optimizer_a.clear_grad()
optimizer_a.clear_grad() scheduler_a.step()
scheduler_a.step()
with paddle.amp.auto_cast(enable=True, level='O2'):
loss_b = model_b.train_batch( loss_b = model_b.train_batch(
[img, label], optimizer_b, scheduler_b, scaler=scaler_b) [img, label], optimizer_b, scheduler_b, scaler=scaler_b)
......
...@@ -16,6 +16,7 @@ from __future__ import print_function ...@@ -16,6 +16,7 @@ from __future__ import print_function
import unittest import unittest
import paddle.fluid as fluid import paddle.fluid as fluid
import os
from test_parallel_dygraph_dataparallel import TestMultipleGpus from test_parallel_dygraph_dataparallel import TestMultipleGpus
...@@ -23,31 +24,43 @@ from test_parallel_dygraph_dataparallel import TestMultipleGpus ...@@ -23,31 +24,43 @@ from test_parallel_dygraph_dataparallel import TestMultipleGpus
class TestHybridPipeParallel(TestMultipleGpus): class TestHybridPipeParallel(TestMultipleGpus):
def test_hybrid_parallel_pp_layer(self): def test_hybrid_parallel_pp_layer(self):
self.run_mnist_2gpu('hybrid_parallel_pp_layer.py') self.run_mnist_2gpu('hybrid_parallel_pp_layer.py')
self.run_mnist_2gpu('hybrid_parallel_pp_layer.py', eager_mode=False)
def test_hybrid_parallel_pp_tuple_inputs(self): def test_hybrid_parallel_pp_tuple_inputs(self):
self.run_mnist_2gpu('hybrid_parallel_pp_embedding.py') self.run_mnist_2gpu('hybrid_parallel_pp_embedding.py')
self.run_mnist_2gpu('hybrid_parallel_pp_embedding.py', eager_mode=False)
def test_hybrid_parallel_shared_weight(self): def test_hybrid_parallel_shared_weight(self):
self.run_mnist_2gpu('hybrid_parallel_shared_weight.py') self.run_mnist_2gpu('hybrid_parallel_shared_weight.py')
self.run_mnist_2gpu(
'hybrid_parallel_shared_weight.py', eager_mode=False)
def test_pipeline_parallel_amp(self): def test_pipeline_parallel_amp(self):
self.run_mnist_2gpu('hybrid_parallel_pp_amp.py') self.run_mnist_2gpu('hybrid_parallel_pp_amp.py')
self.run_mnist_2gpu('hybrid_parallel_pp_amp.py', eager_mode=False)
def test_pipeline_parallel_fp16(self): def test_pipeline_parallel_fp16(self):
self.run_mnist_2gpu('hybrid_parallel_pp_fp16.py') self.run_mnist_2gpu('hybrid_parallel_pp_fp16.py')
self.run_mnist_2gpu('hybrid_parallel_pp_fp16.py', eager_mode=False)
def test_hybrid_parallel_transformer(self): def test_hybrid_parallel_transformer(self):
self.run_mnist_2gpu('hybrid_parallel_pp_transformer.py') self.run_mnist_2gpu('hybrid_parallel_pp_transformer.py')
self.run_mnist_2gpu(
'hybrid_parallel_pp_transformer.py', eager_mode=False)
def test_hybrid_parallel_save_load(self): def test_hybrid_parallel_save_load(self):
self.run_mnist_2gpu('hybrid_parallel_pp_save_load.py') self.run_mnist_2gpu('hybrid_parallel_pp_save_load.py')
self.run_mnist_2gpu('hybrid_parallel_pp_save_load.py', eager_mode=False)
def test_hybrid_parallel_recompute(self): def test_hybrid_parallel_recompute(self):
self.run_mnist_2gpu('hybrid_parallel_pp_recompute.py') self.run_mnist_2gpu('hybrid_parallel_pp_recompute.py')
self.run_mnist_2gpu('hybrid_parallel_pp_recompute.py', eager_mode=False)
def test_hybrid_parallel_pp_clip_grad(self): def test_hybrid_parallel_pp_clip_grad(self):
self.run_mnist_2gpu('hybrid_parallel_pp_clip_grad.py') self.run_mnist_2gpu('hybrid_parallel_pp_clip_grad.py')
self.run_mnist_2gpu('hybrid_parallel_pp_clip_grad.py', eager_mode=False)
if __name__ == "__main__": if __name__ == "__main__":
os.environ["FLAGS_enable_eager_mode"] = "1"
unittest.main() unittest.main()
...@@ -23,7 +23,7 @@ from test_parallel_dygraph_dataparallel import TestMultipleGpus ...@@ -23,7 +23,7 @@ from test_parallel_dygraph_dataparallel import TestMultipleGpus
class TestHybridParallel(TestMultipleGpus): class TestHybridParallel(TestMultipleGpus):
def test_hybrid_parallel_mp_random(self): def test_hybrid_parallel_mp_random(self):
# self.run_mnist_2gpu('hybrid_parallel_mp_random.py') self.run_mnist_2gpu('hybrid_parallel_mp_random.py')
self.run_mnist_2gpu('hybrid_parallel_mp_random.py', eager_mode=False) self.run_mnist_2gpu('hybrid_parallel_mp_random.py', eager_mode=False)
def test_hybrid_parallel_mp_model(self): def test_hybrid_parallel_mp_model(self):
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册