Unverified commit 07741593, authored by kuizhiqing, committed by GitHub

new group (#31682)

* new group

* ci compatible fix

* assert nccl
Parent dbeb3ea4
@@ -19,12 +19,11 @@
#include <utility>
#include <vector>
#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/platform/bkcl_helper.h"
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/gen_comm_id_helper.h"
#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/gen_comm_id_helper.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/string/split.h"
#include "paddle/fluid/string/string_helper.h"
@@ -77,7 +76,7 @@ void BKCLParallelContext::Init() {
  bkcl_ids.resize(strategy_.nrings_);
  if (strategy_.local_rank_ == 0) {
-    // generate the unique ncclid on the root worker
+    // generate the unique bkclid on the root worker
    for (size_t i = 0; i < bkcl_ids.size(); ++i) {
      auto ret = bkcl_get_unique_id(&bkcl_ids[i]);
      PADDLE_ENFORCE_EQ(BKCL_SUCCESS, ret,
@@ -99,6 +98,28 @@ void BKCLParallelContext::Init() {
}
}
+void BKCLParallelContext::InitWithRingID(int ring_id) {
+  std::vector<BKCLUniqueId> bkcl_ids;
+  bkcl_ids.resize(1);
+
+  if (strategy_.local_rank_ == 0) {
+    // generate the unique bkclid on the root worker
+    auto ret = bkcl_get_unique_id(&bkcl_ids[0]);
+    PADDLE_ENFORCE_EQ(BKCL_SUCCESS, ret,
+                      platform::errors::PreconditionNotMet(
+                          "BKCL get unique id failed [%d]", ret));
+  }
+  BcastBKCLId(bkcl_ids, 0);
+
+  int xpu_id = BOOST_GET_CONST(platform::XPUPlace, place_).device;
+  VLOG(0) << "init BKCL context nranks: " << strategy_.nranks_
+          << " local rank: " << strategy_.local_rank_ << " xpu id: " << xpu_id
+          << " ring id: " << ring_id;
+
+  // it will assign bkcl_comm in XPUDeviceContext within ring_id
+  platform::BKCLCommContext::Instance().CreateBKCLComm(
+      &bkcl_ids[0], strategy_.nranks_, strategy_.local_rank_, xpu_id, ring_id);
+}
void BKCLParallelContext::AllReduceByStream(const framework::Variable &src,
                                            framework::Variable *dst,
                                            int ring_id, bool use_calc_stream) {
......
@@ -36,6 +36,8 @@ class BKCLParallelContext : public ParallelContext {
  void Init() override;

+  void InitWithRingID(int ring_id) override;
+
  void AllReduceByStream(const framework::Variable& src,
                         framework::Variable* dst, int ring_id,
                         bool use_calc_stream) override;
......
@@ -79,6 +79,30 @@ void NCCLParallelContext::Init() {
}
}
+void NCCLParallelContext::InitWithRingID(int ring_id) {
+  std::vector<ncclUniqueId> nccl_ids;
+  nccl_ids.resize(1);
+
+  if (strategy_.local_rank_ == 0) {
+    // generate the unique ncclid on the root worker
+    platform::dynload::ncclGetUniqueId(&nccl_ids[0]);
+  }
+  BcastNCCLId(nccl_ids, 0);
+
+  int gpu_id = BOOST_GET_CONST(platform::CUDAPlace, place_).device;
+  VLOG(0) << "init nccl context nranks: " << strategy_.nranks_
+          << " local rank: " << strategy_.local_rank_ << " gpu id: " << gpu_id
+          << " ring id: " << ring_id;
+
+  // it will assign nccl_comm in CUDADeviceContext within ring_id
+  platform::NCCLCommContext::Instance().CreateNCCLComm(
+      &nccl_ids[0], strategy_.nranks_, strategy_.local_rank_, gpu_id, ring_id);
+
+  compute_events_.emplace_back(platform::CudaEventResourcePool::Instance().New(
+      BOOST_GET_CONST(platform::CUDAPlace, place_).device));
+  comm_events_.emplace_back(platform::CudaEventResourcePool::Instance().New(
+      BOOST_GET_CONST(platform::CUDAPlace, place_).device));
+}
void NCCLParallelContext::AllReduceByStream(const framework::Variable &src,
                                            framework::Variable *dst,
                                            int ring_id, bool use_calc_stream) {
......
@@ -53,6 +53,8 @@ class NCCLParallelContext : public ParallelContext {
  void Init() override;

+  void InitWithRingID(int ring_id) override;
+
  void AllReduceByStream(const framework::Variable& src,
                         framework::Variable* dst, int ring_id,
                         bool use_calc_stream) override;
......
@@ -50,6 +50,8 @@ class ParallelContext {
  virtual void Init() = 0;

+  virtual void InitWithRingID(int ring_id) = 0;
+
  virtual void AllReduceByStream(const framework::Variable& src,
                                 framework::Variable* dst, int ring_id,
                                 bool use_calc_stream) = 0;
......
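The new pure-virtual InitWithRingID lets a parallel context set up a communicator for one specific ring instead of all rings at once, which is what a dynamically created group needs. A minimal sketch of the user-facing flow this enables, mirroring the tests added below (that new_group allocates a fresh ring id and calls InitWithRingID for it is an assumption drawn from this diff):

import paddle

paddle.distributed.init_parallel_env()      # global group lives on ring_id 0
gp = paddle.distributed.new_group([0, 1])   # assumed: picks an unused ring id
                                            # and calls InitWithRingID for it
t = paddle.to_tensor([1, 2, 3])
paddle.distributed.all_reduce(t, group=gp)  # collective restricted to the group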
@@ -15,40 +15,20 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
-namespace paddle {
-namespace framework {
-class Scope;
-}  // namespace framework
-}  // namespace paddle
namespace paddle {
namespace operators {
-class CSyncCalcStreamOp : public framework::OperatorBase {
+class CSyncCalcStreamOp : public framework::OperatorWithKernel {
 public:
-  CSyncCalcStreamOp(const std::string& type,
-                    const framework::VariableNameMap& inputs,
-                    const framework::VariableNameMap& outputs,
-                    const framework::AttributeMap& attrs)
-      : OperatorBase(type, inputs, outputs, attrs) {}
-
-  void RunImpl(const framework::Scope& scope,
-               const platform::Place& place) const override {
-    PADDLE_ENFORCE_EQ(is_gpu_place(place), true,
-                      platform::errors::PreconditionNotMet(
-                          "Sync stream op can run on gpu place only for now."));
-
-#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && !defined(_WIN32)
-    auto dev_ctx = static_cast<platform::CUDADeviceContext*>(
-        platform::DeviceContextPool::Instance().Get(place));
-#ifdef PADDLE_WITH_HIP
-    PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(dev_ctx->stream()));
-#else
-    PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(dev_ctx->stream()));
-#endif
-#else
-    PADDLE_THROW(platform::errors::PreconditionNotMet(
-        "PaddlePaddle should compile with GPU."));
-#endif
-  }
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {}
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(framework::proto::VarType::FP32,
+                                   ctx.GetPlace());
+  }
};
@@ -65,10 +45,36 @@ Call calculation stream synchronization.
}
};
+template <typename T>
+class CSyncCalcStreamCudaKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && !defined(_WIN32)
+    auto place = ctx.GetPlace();
+    auto dev_ctx = static_cast<platform::CUDADeviceContext*>(
+        platform::DeviceContextPool::Instance().Get(place));
+#ifdef PADDLE_WITH_HIP
+    PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(dev_ctx->stream()));
+#else
+    PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(dev_ctx->stream()));
+#endif
+#else
+    PADDLE_THROW(platform::errors::PreconditionNotMet(
+        "PaddlePaddle should compile with GPU."));
+#endif
+  }
+};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
-REGISTER_OPERATOR(c_sync_calc_stream, ops::CSyncCalcStreamOp,
-                  ops::CSyncCalcStreamOpMaker);
+REGISTER_OP_WITHOUT_GRADIENT(c_sync_calc_stream, ops::CSyncCalcStreamOp,
+                             ops::CSyncCalcStreamOpMaker);
+
+REGISTER_OP_CUDA_KERNEL(c_sync_calc_stream,
+                        ops::CSyncCalcStreamCudaKernel<float>);
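With the op now kernel-based, dygraph can call it directly through the generated op functions; the op_passing_outs_map entry added further below lets the caller pass a tensor as its own output. A hedged sketch, assuming two processes launched with paddle.distributed.launch:

import paddle
from paddle.fluid import core

paddle.distributed.init_parallel_env()
t = paddle.to_tensor([1.0, 2.0])
# Pass t as both input and in-place output; the CUDA kernel blocks the host
# until the calculation stream has drained.
core.ops.c_sync_calc_stream(t, t)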
@@ -14,45 +14,25 @@ limitations under the License. */
#include <string>
#include "paddle/fluid/framework/op_registry.h"
-namespace paddle {
-namespace framework {
-class Scope;
-}  // namespace framework
-}  // namespace paddle
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/nccl_helper.h"
#endif
namespace paddle {
namespace operators {
-class CSyncCommStreamOp : public framework::OperatorBase {
+class CSyncCommStreamOp : public framework::OperatorWithKernel {
 public:
-  CSyncCommStreamOp(const std::string& type,
-                    const framework::VariableNameMap& inputs,
-                    const framework::VariableNameMap& outputs,
-                    const framework::AttributeMap& attrs)
-      : OperatorBase(type, inputs, outputs, attrs) {}
-
-  void RunImpl(const framework::Scope& scope,
-               const platform::Place& place) const override {
-    PADDLE_ENFORCE_EQ(is_gpu_place(place), true,
-                      platform::errors::PreconditionNotMet(
-                          "Sync stream op can run on gpu place only for now."));
-
-#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
-    int ring_id = Attr<int>("ring_id");
-    auto stream =
-        platform::NCCLCommContext::Instance().Get(ring_id, place)->stream();
-#ifdef PADDLE_WITH_RCCL
-    PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream));
-#else
-    PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream));
-#endif
-#else
-    PADDLE_THROW(platform::errors::PreconditionNotMet(
-        "PaddlePaddle should compile with GPU."));
-#endif
-  }
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {}
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(framework::proto::VarType::FP32,
+                                   ctx.GetPlace());
+  }
};
@@ -72,10 +52,38 @@ Call communication stream synchronization.
}
};
+template <typename T>
+class CSyncCommStreamCudaKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
+    auto place = ctx.GetPlace();
+    int ring_id = ctx.Attr<int>("ring_id");
+    auto stream =
+        platform::NCCLCommContext::Instance().Get(ring_id, place)->stream();
+#ifdef PADDLE_WITH_RCCL
+    PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream));
+#else
+    PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream));
+#endif
+#else
+    PADDLE_THROW(platform::errors::PreconditionNotMet(
+        "PaddlePaddle should compile with GPU."));
+#endif
+  }
+};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
-REGISTER_OPERATOR(c_sync_comm_stream, ops::CSyncCommStreamOp,
-                  ops::CSyncCommStreamOpMaker);
+REGISTER_OP_WITHOUT_GRADIENT(c_sync_comm_stream, ops::CSyncCommStreamOp,
+                             ops::CSyncCommStreamOpMaker);
+
+REGISTER_OP_CUDA_KERNEL(c_sync_comm_stream,
+                        ops::CSyncCommStreamCudaKernel<float>);
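At the Python level, paddle.distributed.wait is the entry point to these two sync ops (see the new_group test below): use_calc_stream=True presumably maps to c_sync_calc_stream, and use_calc_stream=False to c_sync_comm_stream on the group's ring. A hedged usage sketch:

import paddle

paddle.distributed.init_parallel_env()
gp = paddle.distributed.new_group([0, 1])
t = paddle.to_tensor([1.0, 2.0])
paddle.distributed.all_reduce(t, group=gp, use_calc_stream=False)
paddle.distributed.wait(t, gp, use_calc_stream=False)  # sync the comm stream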
@@ -1578,7 +1578,10 @@ void BindImperative(py::module *m_ptr) {
m, "NCCLParallelContext")
.def(py::init<const imperative::ParallelStrategy &,
const platform::CUDAPlace &>())
.def("init", [](imperative::NCCLParallelContext &self) { self.Init(); });
.def("init", [](imperative::NCCLParallelContext &self) { self.Init(); })
.def("init_with_ring_id",
&imperative::NCCLParallelContext::InitWithRingID,
py::arg("ring_id"));
#endif
#if defined(PADDLE_WITH_XPU_BKCL)
@@ -1587,7 +1590,10 @@ void BindImperative(py::module *m_ptr) {
m, "BKCLParallelContext")
.def(py::init<const imperative::ParallelStrategy &,
const platform::XPUPlace &>())
.def("init", [](imperative::BKCLParallelContext &self) { self.Init(); });
.def("init", [](imperative::BKCLParallelContext &self) { self.Init(); })
.def("init_with_ring_id",
&imperative::BKCLParallelContext::InitWithRingID,
py::arg("ring_id"));
#endif
}
......
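These bindings give the Python side enough to initialize a communicator on a single ring. A minimal sketch of how new_group could drive them; the ParallelStrategy field names are assumptions inferred from the existing init path, and ring id 2 is arbitrary:

from paddle.fluid import core

strategy = core.ParallelStrategy()
strategy.nranks = 2            # ranks inside the new group (assumed field)
strategy.local_rank = 0        # this process's rank in the group (assumed field)
strategy.trainer_endpoints = ["127.0.0.1:6170", "127.0.0.1:6171"]
strategy.current_endpoint = "127.0.0.1:6170"

ctx = core.NCCLParallelContext(strategy, core.CUDAPlace(0))
ctx.init_with_ring_id(2)       # build the NCCL comm on ring 2 only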
@@ -119,6 +119,8 @@ std::map<std::string, std::set<std::string>> op_passing_outs_map = {
{"fill_constant", {"Out"}},
{"matmul", {"Out"}},
{"c_broadcast", {"Out"}},
{"c_sync_calc_stream", {"Out"}},
{"c_sync_comm_stream", {"Out"}},
{"c_allreduce_sum", {"Out"}},
{"c_allreduce_max", {"Out"}},
{"c_allreduce_min", {"Out"}},
......
@@ -82,6 +82,7 @@ if(((NOT WITH_ROCM) AND (NOT WITH_GPU)) OR WIN32)
    LIST(REMOVE_ITEM TEST_OPS test_collective_scatter_api)
    LIST(REMOVE_ITEM TEST_OPS test_collective_barrier_api)
    LIST(REMOVE_ITEM TEST_OPS test_collective_allreduce_api)
+    LIST(REMOVE_ITEM TEST_OPS test_new_group_api)
    LIST(REMOVE_ITEM TEST_OPS test_collective_broadcast_api)
    LIST(REMOVE_ITEM TEST_OPS test_collective_allgather_api)
    LIST(REMOVE_ITEM TEST_OPS test_collective_wait)
@@ -177,6 +178,7 @@ endif()
if ((NOT WITH_NCCL) AND (NOT WITH_RCCL))
    list(REMOVE_ITEM TEST_OPS test_imperative_group)
+    LIST(REMOVE_ITEM TEST_OPS test_new_group_api)
endif()
if(((NOT WITH_ROCM) AND (NOT WITH_GPU)) OR WIN32)
@@ -518,6 +520,7 @@ if(WITH_DISTRIBUTE)
    if(WITH_GPU OR WITH_ROCM)
        bash_test_modules(test_c_comm_init_op START_BASH test_c_comm_init_op.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR})
        py_test_modules(test_launch_coverage MODULES test_launch_coverage)
+        bash_test_modules(test_new_group START_BASH test_new_group.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR})
    endif()
bash_test_modules(test_fleetrun START_BASH test_fleetrun.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR})
@@ -831,6 +834,7 @@ if((WITH_ROCM OR WITH_GPU) AND NOT WIN32)
    set_tests_properties(test_collective_allgather_api PROPERTIES TIMEOUT 120)
    set_tests_properties(test_collective_broadcast_api PROPERTIES TIMEOUT 120)
    set_tests_properties(test_collective_allreduce_api PROPERTIES TIMEOUT 120)
+    set_tests_properties(test_new_group_api PROPERTIES TIMEOUT 120)
    if(WITH_DISTRIBUTE)
        set_tests_properties(test_pipeline PROPERTIES TIMEOUT 120)
    endif()
@@ -853,6 +857,7 @@ if((WITH_ROCM OR WITH_GPU) AND NOT WIN32)
                      test_collective_barrier_api
                      test_collective_reduce_api
                      test_collective_allreduce_api
+                      test_new_group_api
                      test_collective_broadcast_api
                      test_collective_allgather_api
                      PROPERTIES LABELS "RUN_TYPE=DIST")
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import numpy as np
import argparse
import os
import sys
import signal
import time
import socket
from contextlib import closing
from six import string_types
import math
import paddle
import paddle.fluid as fluid
import paddle.fluid.profiler as profiler
import paddle.fluid.unique_name as nameGen
from paddle.fluid import core
import unittest
from multiprocessing import Process
import paddle.fluid.layers as layers
from functools import reduce
from test_collective_api_base import TestCollectiveAPIRunnerBase, runtime_main
paddle.enable_static()
class TestCollectiveAllreduceNewGroupAPI(TestCollectiveAPIRunnerBase):
    def __init__(self):
        self.global_ring_id = 0

    def get_model(self, main_prog, startup_program, rank):
        with fluid.program_guard(main_prog, startup_program):
            tindata = layers.data(
                name="tindata", shape=[10, 1000], dtype='float32')
            gp = paddle.distributed.new_group([0, 1])
            paddle.distributed.all_reduce(
                tindata, group=gp, use_calc_stream=False)
            return [tindata]


if __name__ == "__main__":
    runtime_main(TestCollectiveAllreduceNewGroupAPI, "allreduce")
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import os
import paddle
class TestNewGroupAPI(object):
    def __init__(self):
        paddle.distributed.init_parallel_env()
        d1 = np.array([1, 2, 3])
        d2 = np.array([2, 3, 4])
        self.tensor1 = paddle.to_tensor(d1)
        self.tensor2 = paddle.to_tensor(d2)

    def test_all(self):
        gp = paddle.distributed.new_group([0, 1])
        print("test new group api ok")

        tmp = np.array([0, 0, 0])
        result = paddle.to_tensor(tmp)
        paddle.distributed.scatter(
            result, [self.tensor2, self.tensor1],
            src=0,
            group=gp,
            use_calc_stream=True)
        if gp.rank == 0:
            assert np.array_equal(result, self.tensor2)
        elif gp.rank == 1:
            assert np.array_equal(result, self.tensor1)
        print("test scatter api ok")

        paddle.distributed.broadcast(
            result, src=1, group=gp, use_calc_stream=True)
        assert np.array_equal(result, self.tensor1)
        print("test broadcast api ok")

        paddle.distributed.reduce(
            result, dst=0, group=gp, use_calc_stream=True)
        if gp.rank == 0:
            assert np.array_equal(result,
                                  paddle.add(self.tensor1, self.tensor1))
        elif gp.rank == 1:
            assert np.array_equal(result, self.tensor1)
        print("test reduce api ok")

        paddle.distributed.all_reduce(result, use_calc_stream=True)
        assert np.array_equal(
            result,
            paddle.add(paddle.add(self.tensor1, self.tensor1), self.tensor1))
        print("test all_reduce api ok")

        paddle.distributed.wait(result, gp, use_calc_stream=True)
        paddle.distributed.wait(result, gp, use_calc_stream=False)
        print("test wait api ok")

        result = []
        paddle.distributed.all_gather(
            result, self.tensor1, group=gp, use_calc_stream=True)
        assert np.array_equal(result[0], self.tensor1)
        assert np.array_equal(result[1], self.tensor1)
        print("test all_gather api ok")

        paddle.distributed.barrier(group=gp)
        print("test barrier api ok")
        return


if __name__ == "__main__":
    gpt = TestNewGroupAPI()
    gpt.test_all()
#!/bin/bash
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -e
CUDA_VISIBLE_DEVICES=0,1 python -m paddle.distributed.launch --gpus=0,1 new_group.py
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
import paddle
from test_collective_api_base import TestDistBase
paddle.enable_static()
class TestCollectiveAllreduceAPI(TestDistBase):
    def _setup_config(self):
        pass

    def test_allreduce_nccl(self):
        self.check_with_place("collective_allreduce_new_group_api.py",
                              "allreduce", "nccl")


if __name__ == '__main__':
    unittest.main()