未验证 提交 37216a8f 编写于 作者: H Haohongxiang 提交者: GitHub

[Dygraph] Support new apis in ProcessGroupNCCL (#43918)

* fix conflict

* new pg apis

* add docs of new apis

* update

* fix coverage

* update

* fix bug

* fix reduce scatter

* fix api

* update
Co-authored-by: NForFishes <2282912238@qq.com>
上级 02e4f1f8
......@@ -46,6 +46,7 @@ enum class CommType : std::uint8_t {
SEND = 9,
RECV = 10,
BARRIER = 11,
ALLTOALL_SINGLE = 12,
UNKNOWN = 100,
};
......@@ -143,6 +144,15 @@ class ProcessGroup {
"ProcessGroup%s does not support AllToAll", GetBackendName()));
}
virtual std::shared_ptr<ProcessGroup::Task> AllToAll_Single(
std::vector<phi::DenseTensor>&, // NOLINT
std::vector<phi::DenseTensor>&, // NOLINT
std::vector<int64_t>&,
std::vector<int64_t>&) {
PADDLE_THROW(platform::errors::InvalidArgument(
"ProcessGroup%s does not support AllToAll_Single", GetBackendName()));
}
virtual std::shared_ptr<ProcessGroup::Task> Reduce(
std::vector<phi::DenseTensor>&, // NOLINT
std::vector<phi::DenseTensor>&, // NOLINT
......@@ -159,6 +169,14 @@ class ProcessGroup {
"ProcessGroup%s does not support Scatter", GetBackendName()));
}
virtual std::shared_ptr<ProcessGroup::Task> _ReduceScatterBase(
phi::DenseTensor&, // NOLINT
phi::DenseTensor&, // NOLINT
const ReduceScatterOptions&) { // NOLINT
PADDLE_THROW(platform::errors::InvalidArgument(
"ProcessGroup%s does not support ReduceScatter", GetBackendName()));
}
protected:
const int rank_;
const int size_;
......
......@@ -85,6 +85,34 @@ bool ProcessGroupNCCL::NCCLTask::IsCompleted() {
return true;
}
void ProcessGroupNCCL::CheckSplitSizes(std::vector<int64_t>& split_sizes,
std::vector<int64_t> tensor_shape) {
int64_t len_size = split_sizes.size();
if (len_size == 0) {
PADDLE_ENFORCE_EQ(tensor_shape[0] % size_ == 0,
true,
platform::errors::InvalidArgument(
"Tensor's dim[0] must be divisible by group size "
"when split_sizes not given."));
split_sizes.insert(split_sizes.end(),
size_,
static_cast<int64_t>(tensor_shape[0] / size_));
} else {
PADDLE_ENFORCE_EQ(
len_size == size_,
true,
platform::errors::InvalidArgument(
"The length of split_sizes must be equal to group size."));
auto sum_size = std::accumulate(
split_sizes.begin(), split_sizes.end(), static_cast<int64_t>(0));
PADDLE_ENFORCE_EQ(
sum_size == tensor_shape[0],
true,
platform::errors::InvalidArgument(
"The sum of split_sizes must be equal to tensor's dim[0]."));
}
}
// TODO(sheniang03): Add timeout for wait, now timeout unused
bool ProcessGroupNCCL::NCCLTask::Wait(std::chrono::milliseconds timeout) {
SynchronizeStreams();
......@@ -637,7 +665,69 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::AllToAll(
}
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd());
},
CommType::ALLREDUCE);
CommType::ALLTOALL);
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::AllToAll_Single(
std::vector<phi::DenseTensor>& in_tensors,
std::vector<phi::DenseTensor>& out_tensors,
std::vector<int64_t>& in_sizes,
std::vector<int64_t>& out_sizes) {
PADDLE_ENFORCE_EQ(
CheckTensorsInCudaPlace(in_tensors),
true,
platform::errors::InvalidArgument("All inputs should be in CudaPlace."));
PADDLE_ENFORCE_EQ(
CheckTensorsInCudaPlace(out_tensors),
true,
platform::errors::InvalidArgument("All inputs should be in CudaPlace."));
return Collective(
in_tensors,
out_tensors,
[&](phi::DenseTensor& input,
phi::DenseTensor& output,
ncclComm_t comm,
const gpuStream_t& stream) {
PADDLE_ENFORCE_EQ(input.dtype() == output.dtype(),
true,
platform::errors::InvalidArgument(
"The dtypes of input and output must be equal."));
std::vector<int64_t> in_dims = phi::vectorize(input.dims());
std::vector<int64_t> out_dims = phi::vectorize(output.dims());
CheckSplitSizes(in_sizes, in_dims);
CheckSplitSizes(out_sizes, out_dims);
size_t in_offset = 0, out_offset = 0;
size_t in_length = 0, out_length = 0;
size_t in_row_size = input.numel() / in_dims[0];
size_t out_row_size = output.numel() / out_dims[0];
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart());
for (auto i = 0; i < size_; i++) {
in_length = in_sizes[i] * in_row_size;
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclSend(
GetPointerByOffset(input.data(), in_offset, input.dtype()),
in_length,
platform::ToNCCLDataType(input.dtype()),
i,
comm,
stream));
in_offset += in_length;
out_length = out_sizes[i] * out_row_size;
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv(
GetPointerByOffset(output.data(), out_offset, input.dtype()),
out_length,
platform::ToNCCLDataType(input.dtype()),
i,
comm,
stream));
out_offset += out_length;
}
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd());
},
CommType::ALLTOALL_SINGLE);
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Reduce(
......@@ -721,5 +811,57 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Scatter(
CommType::SCATTER);
}
std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::_ReduceScatterBase(
phi::DenseTensor& out_tensor,
phi::DenseTensor& in_tensor,
const ReduceScatterOptions& opts) {
// auto tensor = out_tensors.back();
PADDLE_ENFORCE_EQ(
out_tensor.dtype(),
in_tensor.dtype(),
platform::errors::InvalidArgument(
"Input tensor and output tensor should be same dtype."));
PADDLE_ENFORCE_EQ(
out_tensor.numel() * size_,
in_tensor.numel(),
platform::errors::InvalidArgument("input tensor must be the same size as "
"output tensor size times world_size"));
auto inputs = std::vector<phi::DenseTensor>{in_tensor};
auto outputs = std::vector<phi::DenseTensor>{out_tensor};
return Collective(
inputs,
outputs,
[&](phi::DenseTensor& input,
phi::DenseTensor& output,
ncclComm_t comm,
const gpuStream_t& stream) {
if (FLAGS_use_stream_safe_cuda_allocator) {
platform::CUDADeviceGuard cuda_guard;
cuda_guard.SetDevice(output.place());
memory::RecordStream(output.Holder(), stream);
}
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclReduceScatter(
input.data(),
output.data(),
output.numel(),
platform::ToNCCLDataType(input.dtype()),
ToNCCLRedType(opts.reduce_op),
comm,
stream));
},
CommType::REDUCE_SCATTER);
}
void ProcessGroupNCCL::GroupStart() {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart());
}
void ProcessGroupNCCL::GroupEnd() {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd());
}
} // namespace distributed
} // namespace paddle
......@@ -129,6 +129,12 @@ class ProcessGroupNCCL : public ProcessGroup {
std::vector<phi::DenseTensor>& in,
std::vector<phi::DenseTensor>& out) override;
std::shared_ptr<ProcessGroup::Task> AllToAll_Single(
std::vector<phi::DenseTensor>& in,
std::vector<phi::DenseTensor>& out,
std::vector<int64_t>& in_sizes,
std::vector<int64_t>& out_sizes) override;
std::shared_ptr<ProcessGroup::Task> Reduce(
std::vector<phi::DenseTensor>& tensors,
std::vector<phi::DenseTensor>& out_tensors,
......@@ -139,6 +145,15 @@ class ProcessGroupNCCL : public ProcessGroup {
std::vector<phi::DenseTensor>& out_tensors,
const ScatterOptions&) override;
std::shared_ptr<ProcessGroup::Task> _ReduceScatterBase(
phi::DenseTensor&, // NOLINT
phi::DenseTensor&, // NOLINT
const ReduceScatterOptions&) override;
static void GroupStart();
static void GroupEnd();
protected:
virtual std::shared_ptr<ProcessGroupNCCL::NCCLTask> CreateTask(
std::vector<Place> places,
......@@ -162,7 +177,7 @@ class ProcessGroupNCCL : public ProcessGroup {
std::set<int> used_place_ids_;
private:
void BcastNCCLId(std::vector<ncclUniqueId>& nccl_ids,
void BcastNCCLId(std::vector<ncclUniqueId>& nccl_ids, // NOLINT
int root, // NOLINT
int server_fd);
......@@ -190,6 +205,9 @@ class ProcessGroupNCCL : public ProcessGroup {
void CreateNCCLManagerCache(const std::string& places_key,
const std::vector<Place>& places);
void CheckSplitSizes(std::vector<int64_t>& split_sizes,
std::vector<int64_t> tensor_shape);
};
} // namespace distributed
......
......@@ -45,5 +45,9 @@ struct ScatterOptions {
int root_rank = 0;
};
struct ReduceScatterOptions {
ReduceOp reduce_op = ReduceOp::SUM;
};
} // namespace distributed
} // namespace paddle
......@@ -225,6 +225,30 @@ void BindDistributed(py::module *m) {
py::arg("out"),
py::call_guard<py::gil_scoped_release>())
.def(
"alltoall_single",
[](distributed::ProcessGroup &self,
py::handle py_in_tensor,
py::handle py_out_tensor,
std::vector<int64_t> in_sizes,
std::vector<int64_t> out_sizes) {
auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0);
auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0);
auto in_dense = std::dynamic_pointer_cast<phi::DenseTensor>(
in_tensor.impl());
auto out_dense = std::dynamic_pointer_cast<phi::DenseTensor>(
out_tensor.impl());
std::vector<phi::DenseTensor> in_tensors = {*in_dense};
std::vector<phi::DenseTensor> out_tensors = {*out_dense};
return self.AllToAll_Single(
in_tensors, out_tensors, in_sizes, out_sizes);
},
py::arg("in"),
py::arg("out"),
py::arg("in_sizes"),
py::arg("out_sizes"),
py::call_guard<py::gil_scoped_release>())
.def(
"reduce",
[](distributed::ProcessGroup &self,
......@@ -244,7 +268,6 @@ void BindDistributed(py::module *m) {
py::arg("dst"),
py::arg("op") = distributed::ReduceOp::SUM,
py::call_guard<py::gil_scoped_release>())
.def(
"scatter",
[](distributed::ProcessGroup &self,
......@@ -266,9 +289,30 @@ void BindDistributed(py::module *m) {
py::arg("in"),
py::arg("out"),
py::arg("src"),
py::call_guard<py::gil_scoped_release>())
.def(
"_reduce_scatter_base",
[](distributed::ProcessGroup &self,
py::handle py_out_tensor,
py::handle py_in_tensor,
distributed::ReduceOp op) {
auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0);
auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0);
distributed::ReduceScatterOptions opts;
opts.reduce_op = op;
auto dense_out = std::dynamic_pointer_cast<phi::DenseTensor>(
out_tensor.impl());
auto dense_in = std::dynamic_pointer_cast<phi::DenseTensor>(
in_tensor.impl());
return self._ReduceScatterBase(*dense_out, *dense_in, opts);
},
py::arg("out_tensor"),
py::arg("in_tensor"),
py::arg("op") = distributed::ReduceOp::SUM,
py::call_guard<py::gil_scoped_release>());
#if defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_NCCL)
auto processGroupNCCL =
py::class_<distributed::ProcessGroupNCCL,
std::shared_ptr<distributed::ProcessGroupNCCL>>(
*m, "ProcessGroupNCCL", ProcessGroup)
......@@ -283,6 +327,12 @@ void BindDistributed(py::module *m) {
py::arg("place"),
py::arg("group_id") = 0,
py::call_guard<py::gil_scoped_release>());
processGroupNCCL.def_static(
"group_start", []() { distributed::ProcessGroupNCCL::GroupStart(); });
processGroupNCCL.def_static(
"group_end", []() { distributed::ProcessGroupNCCL::GroupEnd(); });
#endif
#if defined(PADDLE_WITH_GLOO) && defined(PADDLE_WITH_PSCORE) && \
......
......@@ -41,6 +41,14 @@ from .collective import recv # noqa: F401
from .collective import get_group # noqa: F401
from .collective import send # noqa: F401
from .collective import wait # noqa: F401
from .collective import is_initialized # noqa: F401
from .collective import destroy_process_group # noqa: F401
from .collective import alltoall_single # noqa: F401
from .collective import isend # noqa: F401
from .collective import irecv # noqa: F401
from .collective import batch_isend_irecv # noqa: F401
from .collective import P2POp # noqa: F401
from .collective import reduce_scatter # noqa: F401
from .auto_parallel import shard_op # noqa: F401
from .auto_parallel import shard_tensor # noqa: F401
......@@ -59,33 +67,11 @@ from . import utils # noqa: F401
from .sharding import * # noqa: F401
__all__ = [ # noqa
"spawn",
"launch",
"scatter",
"broadcast",
"ParallelEnv",
"new_group",
"init_parallel_env",
"gloo_init_parallel_env",
"gloo_barrier",
"gloo_release",
"QueueDataset",
"split",
"CountFilterEntry",
"ShowClickEntry",
"get_world_size",
"get_group",
"all_gather",
"InMemoryDataset",
"barrier",
"all_reduce",
"alltoall",
"send",
"reduce",
"recv",
"ReduceOp",
"wait",
"get_rank",
"ProbabilityEntry",
"ParallelMode",
"spawn", "launch", "scatter", "broadcast", "ParallelEnv", "new_group",
"init_parallel_env", "gloo_init_parallel_env", "gloo_barrier",
"gloo_release", "QueueDataset", "split", "CountFilterEntry",
"ShowClickEntry", "get_world_size", "get_group", "all_gather",
"InMemoryDataset", "barrier", "all_reduce", "alltoall", "send", "reduce",
"recv", "ReduceOp", "wait", "get_rank", "ProbabilityEntry", "ParallelMode",
"is_initialized", "isend", "irecv", "reduce_scatter"
]
......@@ -42,6 +42,7 @@ from paddle.distributed.collective import _set_default_backend
from paddle.distributed.collective import _set_default_store
from paddle.distributed.collective import _new_process_group_impl
from paddle.distributed.collective import Group
from paddle.distributed.collective import _set_group_map_backend
__all__ = []
......@@ -257,6 +258,7 @@ def init_parallel_env():
name=_default_group_name)
_set_group_map_by_name(_default_group_name, group)
_set_group_map(0, group)
_set_group_map_backend(group, backend)
parallel_helper._set_parallel_ctx(True)
paddle.distributed.barrier(group=group)
......
......@@ -72,7 +72,10 @@ list(APPEND DIST_TEST_OPS test_auto_parallel_data_unshard)
list(APPEND DIST_TEST_OPS test_auto_parallel_save_load)
list(APPEND DIST_TEST_OPS test_auto_parallel_autoconvert)
list(APPEND DIST_TEST_OPS test_collective_process_group)
list(APPEND DIST_TEST_OPS test_collective_alltoall_single)
list(APPEND DIST_TEST_OPS test_eager_dist_api)
list(APPEND DIST_TEST_OPS test_collective_batch_isend_irecv)
list(APPEND DIST_TEST_OPS test_collective_reduce_scatter)
set(MIXED_DIST_TEST_OPS ${DIST_TEST_OPS})
#remove distribute unittests.
list(APPEND MIXED_DIST_TEST_OPS test_dgc_op)
......@@ -334,7 +337,11 @@ if((NOT WITH_GPU) AND (NOT WITH_ROCM))
list(REMOVE_ITEM TEST_OPS test_auto_parallel_save_load)
list(REMOVE_ITEM TEST_OPS test_auto_parallel_autoconvert)
list(REMOVE_ITEM TEST_OPS test_collective_process_group)
list(REMOVE_ITEM TEST_OPS test_collective_alltoall_single)
list(REMOVE_ITEM TEST_OPS test_eager_dist_api)
list(REMOVE_ITEM TEST_OPS test_collective_batch_isend_irecv)
list(REMOVE_ITEM TEST_OPS test_collective_reduce_scatter)
elseif(WITH_GPU)
if(${CUDNN_VERSION} VERSION_LESS 7100)
list(REMOVE_ITEM TEST_OPS test_conv2d_fusion_op)
......@@ -1569,8 +1576,10 @@ if(WITH_DISTRIBUTE
set_tests_properties(test_auto_parallel_save_load PROPERTIES TIMEOUT 120)
set_tests_properties(test_auto_parallel_autoconvert PROPERTIES TIMEOUT 120)
set_tests_properties(test_collective_process_group PROPERTIES TIMEOUT 120)
set_tests_properties(test_collective_alltoall_single PROPERTIES TIMEOUT 60)
set_tests_properties(test_eager_dist_api PROPERTIES TIMEOUT 100)
set_tests_properties(test_collective_batch_isend_irecv PROPERTIES TIMEOUT 100)
set_tests_properties(test_collective_reduce_scatter PROPERTIES TIMEOUT 100)
if(${NCCL_VERSION} VERSION_GREATER_EQUAL 2212)
set_tests_properties(test_parallel_dygraph_sparse_embedding
PROPERTIES TIMEOUT 200)
......
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import division
from __future__ import print_function
import unittest
import paddle
import numpy as np
import random
import paddle.distributed as dist
import paddle.fluid as fluid
import paddle.distributed.fleet as fleet
from paddle import framework
class TestCollectiveAllToAllSingle(unittest.TestCase):
def setUp(self):
assert not paddle.distributed.is_initialized(), \
"The distributed environment has not been initialized."
dist.init_parallel_env()
assert paddle.distributed.is_initialized(), \
"The distributed environment has been initialized."
paddle.fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True})
def test_collective_alltoall_single(self):
rank = dist.get_rank()
size = dist.get_world_size()
# case 1
input = paddle.ones([size, size], dtype='int64') * rank
output = paddle.empty([size, size], dtype='int64')
expected_output = paddle.concat(
[paddle.ones([1, size], dtype='int64') * i for i in range(size)])
group = dist.new_group([0, 1])
dist.alltoall_single(input, output, group=group)
np.testing.assert_allclose(output.numpy(), expected_output.numpy())
dist.destroy_process_group(group)
# case 2
in_split_sizes = [i + 1 for i in range(size)]
out_split_sizes = [rank + 1 for i in range(size)]
input = paddle.ones([sum(in_split_sizes), size], dtype='float32') * rank
output = paddle.empty([(rank + 1) * size, size], dtype='float32')
expected_output = paddle.concat([
paddle.ones([rank + 1, size], dtype='float32') * i
for i in range(size)
])
group = dist.new_group([0, 1])
task = dist.alltoall_single(input,
output,
in_split_sizes,
out_split_sizes,
use_calc_stream=False,
group=group)
task.wait()
np.testing.assert_allclose(output.numpy(), expected_output.numpy())
dist.destroy_process_group(group)
def tearDown(self):
dist.destroy_process_group()
assert not paddle.distributed.is_initialized(), \
"The distributed environment has been deinitialized."
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import division
from __future__ import print_function
import unittest
import paddle
import numpy as np
import random
import paddle.distributed as dist
import paddle.fluid as fluid
import paddle.distributed.fleet as fleet
from paddle import framework
class TestCollectiveBatchIsendIrecv(unittest.TestCase):
def setUp(self):
dist.init_parallel_env()
paddle.fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True})
def test_collective_batch_isend_irecv(self):
rank = dist.get_rank()
world_size = dist.get_world_size()
send_t = paddle.arange(2) + rank
# paddle.tensor([0, 1]) # Rank-0
# paddle.tensor([1, 2]) # Rank-1
recv_t = paddle.empty(shape=[2], dtype=send_t.dtype)
send_op = dist.P2POp(dist.isend, send_t, (rank + 1) % world_size)
recv_op = dist.P2POp(dist.irecv, recv_t,
(rank - 1 + world_size) % world_size)
tasks = dist.batch_isend_irecv([send_op, recv_op])
for task in tasks:
task.wait()
if rank == 0:
np.testing.assert_allclose(recv_t.numpy(), [1, 2])
elif rank == 1:
np.testing.assert_allclose(recv_t.numpy(), [0, 1])
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import division
from __future__ import print_function
import unittest
import paddle
import numpy as np
import random
import paddle.distributed as dist
import paddle.fluid as fluid
import paddle.distributed.fleet as fleet
from paddle import framework
class TestCollectiveReduceScatter(unittest.TestCase):
def setUp(self):
dist.init_parallel_env()
paddle.fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True})
def test_collective_reduce_scatter_sum(self):
rank = dist.get_rank()
world_size = dist.get_world_size()
if rank == 0:
t1 = paddle.to_tensor([0, 1])
t2 = paddle.to_tensor([2, 3])
else:
t1 = paddle.to_tensor([4, 5])
t2 = paddle.to_tensor([6, 7])
input_list = [t1, t2]
output = paddle.empty(shape=[2], dtype=input_list[0].dtype)
dist.reduce_scatter(output, input_list)
if rank == 0:
np.testing.assert_allclose(output.numpy(), [4, 6])
elif rank == 1:
np.testing.assert_allclose(output.numpy(), [8, 10])
def test_collective_reduce_scatter_max(self):
rank = dist.get_rank()
world_size = dist.get_world_size()
if rank == 0:
t1 = paddle.to_tensor([0, 1], dtype="float16")
t2 = paddle.to_tensor([2, 3], dtype="float16")
else:
t1 = paddle.to_tensor([4, 5], dtype="float16")
t2 = paddle.to_tensor([6, 7], dtype="float16")
input_list = [t1, t2]
output = paddle.empty(shape=[2], dtype=input_list[0].dtype)
dist.reduce_scatter(output, input_list, op=dist.ReduceOp.MAX)
if rank == 0:
np.testing.assert_allclose(output.numpy(), [4, 5])
elif rank == 1:
np.testing.assert_allclose(output.numpy(), [6, 7])
def test_collective_reduce_scatter_base(self):
rank = dist.get_rank()
world_size = dist.get_world_size()
input = paddle.arange(4) + rank
# [0, 1, 2, 3] # Rank-0
# [1, 2, 3, 4] # Rank-1
output = paddle.empty(shape=[2], dtype=input.dtype)
task = paddle.distributed.collective._reduce_scatter_base(
output, input, use_calc_stream=False)
task.wait()
if rank == 0:
np.testing.assert_allclose(output.numpy(), [1, 3])
elif rank == 1:
np.testing.assert_allclose(output.numpy(), [5, 7])
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import os
import unittest
import paddle.fluid as fluid
from test_parallel_dygraph_dataparallel import TestMultipleGpus
class TestCollectiveAllToAllSingle(TestMultipleGpus):
def test_collective_alltoall_single(self):
self.run_mnist_2gpu('collective_alltoall_single.py', eager_mode=True)
if __name__ == "__main__":
os.environ["FLAGS_enable_eager_mode"] = "1"
unittest.main()
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import os
import unittest
import paddle.fluid as fluid
from test_parallel_dygraph_dataparallel import TestMultipleGpus
class TestCollectiveBatchIsendIrecv(TestMultipleGpus):
def test_collective_batch_isend_irecv(self):
self.run_mnist_2gpu('collective_batch_isend_irecv.py', eager_mode=True)
if __name__ == "__main__":
os.environ["FLAGS_enable_eager_mode"] = "1"
unittest.main()
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import os
import unittest
import paddle.fluid as fluid
from test_parallel_dygraph_dataparallel import TestMultipleGpus
class TestCollectiveReduceScatter(TestMultipleGpus):
def test_collective_reduce_scatter(self):
self.run_mnist_2gpu('collective_reduce_scatter.py', eager_mode=True)
if __name__ == "__main__":
os.environ["FLAGS_enable_eager_mode"] = "1"
unittest.main()
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册