Unverified commit 027f9953 authored by WangXi, committed by GitHub

Optimize error message, include dgc, nccl, size op (#24456)

Parent a5361982
@@ -23,8 +23,8 @@ class DGCClipByNormOp : public ClipByNormOp {
  protected:
   void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("current_step"),
-                   "current_step should be set.");
+    OP_INOUT_CHECK(ctx->HasInput("current_step"), "Input", "current_step",
+                   "DGCClipByNormOp");
     return ClipByNormOp::InferShape(ctx);
   }
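For readers who have not met the macro: OP_INOUT_CHECK(expr, role, name, op) is shorthand for "enforce that input/output `name` of operator `op` exists, and report a missing variable in one uniform format", which is what lets the one-line form replace the old two-line PADDLE_ENFORCE with a hand-written message. A minimal standalone sketch of the idea, a hypothetical simplification rather than Paddle's actual definition:

    #include <cstdio>
    #include <sstream>
    #include <stdexcept>
    #include <string>

    // Simplified stand-in for Paddle's OP_INOUT_CHECK, for illustration only.
    // The real macro routes through Paddle's enforce machinery; this one just
    // builds the same style of message and throws.
    #define OP_INOUT_CHECK(found, role, name, op_type)                           \
      do {                                                                       \
        if (!(found)) {                                                          \
          std::ostringstream oss;                                                \
          oss << "NotFound: " << (role) << "(" << (name) << ") of " << (op_type) \
              << " op should not be null.";                                      \
          throw std::runtime_error(oss.str());                                   \
        }                                                                        \
      } while (0)

    int main() {
      bool has_current_step = false;  // imagine ctx->HasInput("current_step")
      try {
        OP_INOUT_CHECK(has_current_step, "Input", "current_step",
                       "DGCClipByNormOp");
      } catch (const std::exception& e) {
        std::puts(e.what());
      }
    }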
@@ -25,28 +25,21 @@ class DGCOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
   void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("U"), "Input(U) of DGCop should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("V"), "Input(V) of DGCop should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Grad"),
-                   "Input(Grad) of DGCop should not be null.");
-    PADDLE_ENFORCE_EQ(
-        ctx->HasInput("Param"), true,
-        platform::errors::NotFound("Input(Param) of DGCop is not found."));
-    PADDLE_ENFORCE(ctx->HasInput("current_step"),
-                   "Input(current_step) of DGCop should not be null.");
-    PADDLE_ENFORCE_EQ(ctx->HasInput("nranks"), true,
-                      "Input(nranks) of DGCop should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("U_out"),
-                   "Output(U_out) of DGCop should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("V_out"),
-                   "Output(V_out) of DGCop should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("k"),
-                   "Output(k) of DGCop should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("EncodeGrad"),
-                   "Output(EncodeGrad) of DGCop should not be null.");
-    PADDLE_ENFORCE_EQ(ctx->HasOutput("GatherBuff"), true,
-                      "Output(EncodeGrad) of DGCop should not be null.");
+    OP_INOUT_CHECK(ctx->HasInput("U"), "Input", "U", "DGCOp");
+    OP_INOUT_CHECK(ctx->HasInput("V"), "Input", "V", "DGCOp");
+    OP_INOUT_CHECK(ctx->HasInput("Grad"), "Input", "Grad", "DGCOp");
+    OP_INOUT_CHECK(ctx->HasInput("Param"), "Input", "Param", "DGCOp");
+    OP_INOUT_CHECK(ctx->HasInput("current_step"), "Input", "current_step",
+                   "DGCOp");
+    OP_INOUT_CHECK(ctx->HasInput("nranks"), "Input", "nranks", "DGCOp");
+    OP_INOUT_CHECK(ctx->HasOutput("U_out"), "Output", "U_out", "DGCOp");
+    OP_INOUT_CHECK(ctx->HasOutput("V_out"), "Output", "V_out", "DGCOp");
+    OP_INOUT_CHECK(ctx->HasOutput("k"), "Output", "k", "DGCOp");
+    OP_INOUT_CHECK(ctx->HasOutput("EncodeGrad"), "Output", "EncodeGrad",
+                   "DGCOp");
+    OP_INOUT_CHECK(ctx->HasOutput("GatherBuff"), "Output", "GatherBuff",
+                   "DGCOp");
   }
 
  protected:
@@ -24,14 +24,22 @@ namespace operators {
 
 inline float get_period_sparcity(const std::vector<float>& sparsity,
                                  float cur_step, float rampup_steps) {
-  PADDLE_ENFORCE_GE(static_cast<int>(cur_step), 0);
+  PADDLE_ENFORCE_GE(static_cast<int>(cur_step), 0,
+                    platform::errors::InvalidArgument(
+                        "DGC current step=%d, but it must >= 0, "
+                        "please submit issue in github",
+                        static_cast<int>(cur_step)));
 
   size_t idx = static_cast<int>(cur_step * sparsity.size() / rampup_steps);
   if (idx >= sparsity.size()) {
     idx = sparsity.size() - 1;
   }
 
-  PADDLE_ENFORCE_LT(idx, sparsity.size());
+  PADDLE_ENFORCE_LT(
+      idx, sparsity.size(),
+      platform::errors::OutOfRange(
+          "sparsity index out of bounds. idx=%d >= sparsity.size=%d", idx,
+          sparsity.size()));
   return sparsity[idx];
 }
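The function above maps training progress to one entry of the user-supplied sparsity schedule and clamps the index once cur_step exceeds rampup_steps. A self-contained sketch of the same arithmetic (plain assert stands in for PADDLE_ENFORCE; the schedule values are the DGC paper's usual warm-up, used here only as sample data, and the name is spelled "sparsity" here while the source keeps the historical "sparcity"):

    #include <cassert>
    #include <cstdio>
    #include <vector>

    float get_period_sparsity(const std::vector<float>& sparsity, float cur_step,
                              float rampup_steps) {
      assert(static_cast<int>(cur_step) >= 0);
      size_t idx = static_cast<size_t>(cur_step * sparsity.size() / rampup_steps);
      if (idx >= sparsity.size()) idx = sparsity.size() - 1;  // clamp past ramp-up
      return sparsity[idx];
    }

    int main() {
      std::vector<float> sched = {0.75f, 0.9375f, 0.984375f, 0.996f, 0.999f};
      for (float step : {0.f, 30.f, 60.f, 90.f, 120.f, 500.f}) {
        std::printf("step=%5.0f  sparsity=%f\n", step,
                    get_period_sparsity(sched, step, 100.f));
      }
    }

Running this shows the schedule stepping through its entries during the 100-step ramp-up and then sticking at the last (highest) sparsity, which is exactly what the clamp is for.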
@@ -55,7 +63,10 @@ class DGCOpKernel : public framework::OpKernel<T> {
     // nranks
     auto nranks_tensor = ctx.Input<framework::Tensor>("nranks");
     const int nranks = static_cast<const int>(*nranks_tensor->data<float>());
-    PADDLE_ENFORCE_GT(nranks, 1, "DGC is not useful when num_trainers <= 1");
+    PADDLE_ENFORCE_GT(nranks, 1,
+                      platform::errors::PreconditionNotMet(
+                          "DGC is not useful when num_trainers <= 1. Please "
+                          "use multi card or multi machine GPU"));
 
     // regularization
     auto p = ctx.Input<framework::Tensor>("Param");
@@ -105,8 +116,10 @@ class DGCOpKernel : public framework::OpKernel<T> {
         1 - get_period_sparcity(
                 sparsity, static_cast<float>(*current_step - rampup_begin_step),
                 rampup_step);
-    PADDLE_ENFORCE_GE(ratio, 0.0);
-    PADDLE_ENFORCE_LT(ratio, 1.0);
+    PADDLE_ENFORCE_GE(ratio, 0.0, platform::errors::InvalidArgument(
+                                      "DGC sparsity ratio must >= 0"));
+    PADDLE_ENFORCE_LT(ratio, 1.0, platform::errors::InvalidArgument(
+                                      "DGC sparsity ratio must < 1"));
     int k = static_cast<int>(g->numel() * ratio);
 
     VLOG(10) << "m:" << m << ", use_nesterov:" << use_nesterov
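The shape shared by all of these rewrites: the comparison macros (PADDLE_ENFORCE_GE, _LT, _GT, ...) now take a typed error object from platform::errors as their final argument instead of a bare format string, so the error category survives into the report. A hedged sketch of how such a macro can sit on top of a typed-error builder; the names and plumbing here are invented for illustration, not Paddle's real machinery:

    #include <cstdio>
    #include <stdexcept>
    #include <string>

    // Hypothetical stand-in for platform::errors::InvalidArgument; the real
    // builders also carry an error-code enum and variadic formatting.
    struct Error {
      std::string type, msg;
    };
    inline Error InvalidArgument(const std::string& msg) {
      return Error{"InvalidArgument", msg};
    }

    #define ENFORCE_GE(a, b, err)                                \
      do {                                                       \
        if (!((a) >= (b))) {                                     \
          const Error e_ = (err);                                \
          throw std::runtime_error(e_.type + ": " + e_.msg);     \
        }                                                        \
      } while (0)

    int main() {
      double ratio = -0.1;
      try {
        ENFORCE_GE(ratio, 0.0, InvalidArgument("DGC sparsity ratio must >= 0"));
      } catch (const std::exception& e) {
        std::puts(e.what());  // InvalidArgument: DGC sparsity ratio must >= 0
      }
    }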
@@ -31,12 +31,15 @@ class NCCLInitOp : public framework::OperatorBase {
  private:
   void RunImpl(const framework::Scope &scope,
                const platform::Place &place) const override {
-    PADDLE_ENFORCE_NOT_NULL(scope.FindVar(Input(kParallelScopes)),
-                            "Can not find variable '%s' in the scope.",
-                            kParallelScopes);
+    PADDLE_ENFORCE_NOT_NULL(
+        scope.FindVar(Input(kParallelScopes)),
+        platform::errors::NotFound("Can not find variable '%s' in the scope.",
+                                   kParallelScopes));
     const auto &name = Output("Communicator");
-    PADDLE_ENFORCE_NOT_NULL(scope.FindVar(name),
-                            "Can not find variable '%s' in the scope.", name);
+    PADDLE_ENFORCE_NOT_NULL(
+        scope.FindVar(name),
+        platform::errors::NotFound(
+            "Output(%s) is needed for ncclInit operator.", name));
     // A parallel do may not use all the gpus. For example, the batch size is 7
     // in the last batch while we have 8 gpu. In this case, parallel_do will
     // create 7 parallel scopes, so should ncclInitOp create 7 gpu peers
@@ -46,11 +49,9 @@ class NCCLInitOp : public framework::OperatorBase {
     for (int i = 0; i < static_cast<int>(parallel_scopes.size()); ++i) {
       gpus[i] = i;
     }
-    PADDLE_ENFORCE(!gpus.empty(), "NCCL init with 0 gpus.");
-
-    if (scope.FindVar(name) == nullptr) {
-      PADDLE_THROW("Output(Communicator) is needed for ncclInit operator.");
-    }
+    PADDLE_ENFORCE_EQ(!gpus.empty(), true,
+                      platform::errors::PreconditionNotMet(
+                          "gpus is empty, NCCL must init with gpus"));
 
     platform::Communicator *comm =
         scope.FindVar(name)->GetMutable<platform::Communicator>();
@@ -92,17 +93,17 @@ class NCCLAllReduceOp : public framework::OperatorWithKernel {
  protected:
   void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   " Input(X) of AllReduce op input should not be NULL");
-    PADDLE_ENFORCE(
-        ctx->HasInput("Communicator"),
-        " Input(Communicator) of AllReduce op input should not be NULL");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   " Output(Out) of AllReduce op output should not be NULL");
+    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "NCCLAllReduce");
+    OP_INOUT_CHECK(ctx->HasInput("Communicator"), "Input", "Communicator",
+                   "NCCLAllReduce");
+    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "NCCLAllReduce");
 
     std::string reduction = ctx->Attrs().Get<std::string>("reduction");
-    PADDLE_ENFORCE((reduction == "ncclSum" || reduction == "ncclProd" ||
-                    reduction == "ncclMin" || reduction == "ncclMax"),
-                   "invalid reduction.");
+    PADDLE_ENFORCE_EQ(
+        (reduction == "ncclSum" || reduction == "ncclProd" ||
+         reduction == "ncclMin" || reduction == "ncclMax"),
+        true, platform::errors::InvalidArgument("invalid nccl reduction."));
 
     auto x_dims = ctx->GetInputsDim("X");
     ctx->SetOutputsDim("Out", x_dims);
@@ -137,18 +138,17 @@ class NCCLReduceOp : public framework::OperatorWithKernel {
  protected:
   void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   " Input(X) of Reduce op input should not be NULL");
-    PADDLE_ENFORCE(
-        ctx->HasInput("Communicator"),
-        " Input(Communicator) of Reduce op input should not be NULL");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   " Input(X) of Reduce op input should not be NULL");
+    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "NCCLReduce");
+    OP_INOUT_CHECK(ctx->HasInput("Communicator"), "Input", "Communicator",
+                   "NCCLReduce");
+    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "NCCLReduce");
 
     std::string reduction = ctx->Attrs().Get<std::string>("reduction");
-    PADDLE_ENFORCE((reduction == "ncclSum" || reduction == "ncclProd" ||
-                    reduction == "ncclMin" || reduction == "ncclMax"),
-                   "invalid reduction.");
+    PADDLE_ENFORCE_EQ(
+        (reduction == "ncclSum" || reduction == "ncclProd" ||
+         reduction == "ncclMin" || reduction == "ncclMax"),
+        true, platform::errors::InvalidArgument("invalid nccl reduction."));
 
     auto x_dims = ctx->GetInputsDim("X");
     ctx->SetOutputsDim("Out", x_dims);
@@ -188,15 +188,16 @@ class NCCLBcastOp : public framework::OperatorWithKernel {
  protected:
   void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   " Input(X) of Bcast op input should not be NULL");
-    PADDLE_ENFORCE(ctx->HasInput("Communicator"),
-                   " Input(Communicator) of Bcast op input should not be NULL");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   " Output(Out) of Bcast op output should not be NULL");
+    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "NCCLBcast");
+    OP_INOUT_CHECK(ctx->HasInput("Communicator"), "Input", "Communicator",
+                   "NCCLBcast");
+    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "NCCLBcast");
 
     int root = ctx->Attrs().Get<int>("root");
-    PADDLE_ENFORCE(root != platform::kInvalidGPUId, "Bcast root must be set.");
+    PADDLE_ENFORCE_EQ(
+        root != platform::kInvalidGPUId, true,
+        platform::errors::InvalidArgument("Bcast root must be set."));
 
     auto x_dims = ctx->GetInputsDim("X");
     ctx->SetOutputsDim("Out", x_dims);
@@ -10,6 +10,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <functional>
+#include <unordered_map>
 
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
@@ -37,29 +38,35 @@ class NCCLTypeWrapper<double> {
   static const ncclDataType_t type = ncclDouble;
 };
 
+static ncclRedOp_t str_to_nccl_red_type(std::string reduction) {
+  static const std::unordered_map<std::string, ncclRedOp_t> str_to_type = {
+      {"ncclSum", ncclSum},
+      {"ncclMin", ncclMin},
+      {"ncclMax", ncclMax},
+      {"ncclProd", ncclProd},
+  };
+  auto it = str_to_type.find(reduction);
+  PADDLE_ENFORCE_EQ(it != str_to_type.end(), true,
+                    platform::errors::InvalidArgument(
+                        "Invalid nccl reduction. Must be ncclMin | ncclMax | "
+                        "ncclProd | ncclSum"));
+  return it->second;
+}
+
 template <typename T>
 class NCCLAllReduceKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
-                   "This kernel only runs on GPU device.");
+    PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true,
+                      platform::errors::PreconditionNotMet(
+                          "This kernel only runs on GPU device."));
     auto* x = ctx.Input<LoDTensor>("X");
     auto* out = ctx.Output<LoDTensor>("Out");
    auto* comm = ctx.Input<Communicator>("Communicator");
     std::string reduction = ctx.Attr<std::string>("reduction");
 
-    ncclRedOp_t reduction_op_ = ncclSum;
-    if (reduction == "ncclMin") {
-      reduction_op_ = ncclMin;
-    } else if (reduction == "ncclMax") {
-      reduction_op_ = ncclMax;
-    } else if (reduction == "ncclSum") {
-      reduction_op_ = ncclSum;
-    } else if (reduction == "ncclProd") {
-      reduction_op_ = ncclProd;
-    } else {
-      PADDLE_THROW("Invalid reduction. default ncclSum.");
-    }
+    auto reduction_op_ = str_to_nccl_red_type(reduction);
 
     // device id
     int gpu_id =
         BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()).GetDeviceId();
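Replacing the repeated if/else chains with a single str_to_nccl_red_type helper both deduplicates the three kernels and turns an unknown string into a typed InvalidArgument error. The table-lookup idiom in isolation, with a stand-in enum since ncclRedOp_t comes from the NCCL headers:

    #include <cstdio>
    #include <stdexcept>
    #include <string>
    #include <unordered_map>

    enum RedOp { kSum, kMin, kMax, kProd };  // stand-in for ncclRedOp_t

    RedOp str_to_red_type(const std::string& reduction) {
      // Built once on first call (thread-safe since C++11); a lookup
      // replaces a four-way if/else chain.
      static const std::unordered_map<std::string, RedOp> str_to_type = {
          {"ncclSum", kSum}, {"ncclMin", kMin},
          {"ncclMax", kMax}, {"ncclProd", kProd},
      };
      auto it = str_to_type.find(reduction);
      if (it == str_to_type.end()) {
        throw std::invalid_argument(
            "Invalid nccl reduction. Must be ncclMin | ncclMax | ncclProd | "
            "ncclSum");
      }
      return it->second;
    }

    int main() {
      std::printf("ncclProd -> %d\n", str_to_red_type("ncclProd"));
      try {
        str_to_red_type("ncclAvg");
      } catch (const std::exception& e) {
        std::puts(e.what());
      }
    }

One side effect worth noting: the old chain silently mapped only four known strings and threw otherwise; the table keeps that behavior but states the accepted values in the error message itself.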
@@ -67,7 +74,7 @@ class NCCLAllReduceKernel : public framework::OpKernel<T> {
     VLOG(3) << "gpu : "
             << " invoke allreduce. send " << x->numel() << " recv "
             << out->numel();
-    PADDLE_ENFORCE(platform::dynload::ncclAllReduce(
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce(
         x->data<T>(), out->mutable_data<T>(ctx.GetPlace()), out->numel(),
         NCCLTypeWrapper<T>::type, reduction_op_, comm->comms().at(idx),
         ctx.cuda_device_context().stream()));
@@ -81,26 +88,17 @@ template <typename T>
 class NCCLReduceKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
-                   "This kernel only runs on GPU device.");
+    PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true,
+                      platform::errors::InvalidArgument(
+                          "This kernel only runs on GPU device."));
     auto x = ctx.Input<LoDTensor>("X");  // x0, x1, x2
     auto out = ctx.Output<LoDTensor>("Out");
     auto* comm = ctx.Input<Communicator>("Communicator");
     int root = ctx.Attr<int>("root");
     std::string reduction = ctx.Attr<std::string>("reduction");
 
-    ncclRedOp_t reduction_op_ = ncclSum;
-    if (reduction == "ncclMin") {
-      reduction_op_ = ncclMin;
-    } else if (reduction == "ncclMax") {
-      reduction_op_ = ncclMax;
-    } else if (reduction == "ncclSum") {
-      reduction_op_ = ncclSum;
-    } else if (reduction == "ncclProd") {
-      reduction_op_ = ncclProd;
-    } else {
-      PADDLE_THROW("Invalid reduction. default ncclSum.");
-    }
+    auto reduction_op_ = str_to_nccl_red_type(reduction);
 
     // device id
     int gpu_id =
         BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()).GetDeviceId();
@@ -113,7 +111,7 @@ class NCCLReduceKernel : public framework::OpKernel<T> {
     }
     VLOG(3) << "gpu : " << gpu_id << " invoke reduce. send " << x->numel()
             << " recv " << out->numel();
-    PADDLE_ENFORCE(platform::dynload::ncclReduce(
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclReduce(
         x->data<T>(), recvbuffer, x->numel(), NCCLTypeWrapper<T>::type,
         reduction_op_, root, comm->comms().at(idx),
         ctx.cuda_device_context().stream()));
@@ -126,8 +124,9 @@ template <typename T>
 class NCCLBcastKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
-                   "This kernel only runs on GPU device.");
+    PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true,
+                      platform::errors::InvalidArgument(
+                          "This kernel only runs on GPU device."));
     int root = ctx.Attr<int>("root");
     auto* comm = ctx.Input<Communicator>("Communicator");
     // device id
@@ -137,7 +136,7 @@ class NCCLBcastKernel : public framework::OpKernel<T> {
     if (idx == root) {
       auto* x = ctx.Input<LoDTensor>("X");
       VLOG(3) << "gpu : " << gpu_id << " invoke Bcast. send " << x->numel();
-      PADDLE_ENFORCE(platform::dynload::ncclBcast(
+      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclBcast(
           reinterpret_cast<void*>(const_cast<T*>(x->data<T>())), x->numel(),
           NCCLTypeWrapper<T>::type, root, comm->comms().at(idx),
           ctx.cuda_device_context().stream()));
@@ -146,7 +145,7 @@ class NCCLBcastKernel : public framework::OpKernel<T> {
       auto* out = ctx.Output<LoDTensor>("Out");
       VLOG(3) << "gpu : " << gpu_id << " invoke Bcast. recv buffer "
               << framework::product(out->dims());
-      PADDLE_ENFORCE(platform::dynload::ncclBcast(
+      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclBcast(
          out->mutable_data<T>(ctx.GetPlace()), out->numel(),
           NCCLTypeWrapper<T>::type, root, comm->comms().at(idx),
           ctx.cuda_device_context().stream()));
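PADDLE_ENFORCE_CUDA_SUCCESS differs from the comparison macros: it checks a library status code (cudaError_t, ncclResult_t, ...) rather than a boolean, and can fold the text of the failing call into the error via the preprocessor stringizer. A hedged sketch of that shape, with a fake status type standing in for ncclResult_t:

    #include <cstdio>
    #include <sstream>
    #include <stdexcept>

    // Stand-in for ncclResult_t / cudaError_t style status codes.
    enum FakeStatus { kSuccess = 0, kSystemError = 2 };

    // Sketch of the PADDLE_ENFORCE_CUDA_SUCCESS shape: evaluate the call,
    // check its status, and report the call's source text on failure.
    #define ENFORCE_STATUS_SUCCESS(call)                                     \
      do {                                                                   \
        FakeStatus s_ = (call);                                              \
        if (s_ != kSuccess) {                                                \
          std::ostringstream oss;                                            \
          oss << "ExternalError: " << #call << " failed with status " << s_; \
          throw std::runtime_error(oss.str());                               \
        }                                                                    \
      } while (0)

    FakeStatus fakeAllReduce() { return kSystemError; }

    int main() {
      try {
        ENFORCE_STATUS_SUCCESS(fakeAllReduce());
      } catch (const std::exception& e) {
        std::puts(e.what());  // ExternalError: fakeAllReduce() failed with status 2
      }
    }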
@@ -45,9 +45,8 @@ class NCCLTester : public ::testing::Test {
  public:
   void SetUp() override {
     int count = p::GetCUDADeviceCount();
-    if (count <= 1) {
-      LOG(WARNING)
-          << "Cannot test multi-gpu nccl, because the CUDA device count is "
+    if (count <= 0) {
+      LOG(WARNING) << "Cannot test gpu nccl, because the CUDA device count is "
                    << count;
       exit(0);
     }
@@ -114,8 +113,9 @@ class NCCLTester : public ::testing::Test {
     lk.unlock();
 
-    PADDLE_ENFORCE(send_tensor->numel() == f::product(kDims),
-                   "Tensor numel not match!");
+    PADDLE_ENFORCE_EQ(
+        send_tensor->numel(), f::product(kDims),
+        paddle::platform::errors::InvalidArgument("Tensor numel not match!"));
 
     auto op = f::OpRegistry::CreateOp(*op1);
@@ -126,6 +126,10 @@ class NCCLTester : public ::testing::Test {
     VLOG(1) << "Device : " << gpu_id << " finished " << op_desc.Type();
   }
 
+  void testNcclReduceOp();
+  void testNcclAllReduceOp();
+  void testNcclBcastOp();
+
  public:
   std::vector<p::DeviceContext *> dev_ctxs_;
   f::Scope g_scope_;
@@ -133,13 +137,7 @@ class NCCLTester : public ::testing::Test {
   std::vector<int> gpu_list_;
 };
 
-// ncclInitOp with desc
-TEST_F(NCCLTester, ncclInitOp) {}
-
-// ncclAllReduceOp with desc
-// TODO(helin): https://github.com/PaddlePaddle/Paddle/issues/9367
-/*
-TEST_F(NCCLTester, ncclAllReduceOp) {
+void NCCLTester::testNcclAllReduceOp() {
   std::unique_ptr<f::OpDesc> op2(new f::OpDesc);
   op2->SetType("ncclAllReduce");
   op2->SetInput("X", {"st"});
@@ -186,10 +184,8 @@ TEST_F(NCCLTester, ncclAllReduceOp) {
       }
     }
   }
-*/
 
-// ncclReduceOp with desc
-TEST_F(NCCLTester, ncclReduceOp) {
+void NCCLTester::testNcclReduceOp() {
   std::unique_ptr<f::OpDesc> op2(new f::OpDesc);
   const int kRoot = 0;
   op2->SetType("ncclReduce");
@@ -236,10 +232,7 @@ TEST_F(NCCLTester, ncclReduceOp) {
   }
 }
 
-// ncclBcastOp with desc
-// TODO(helin): https://github.com/PaddlePaddle/Paddle/issues/9540
-/*
-TEST_F(NCCLTester, ncclBcastOp) {
+void NCCLTester::testNcclBcastOp() {
   std::unique_ptr<f::OpDesc> op2(new f::OpDesc);
   const int kRoot = 0;
   op2->SetType("ncclBcast");
@@ -263,13 +256,17 @@ TEST_F(NCCLTester, ncclBcastOp) {
     ths[i].join();
   }
 
-  const int idx = 1;
+  const int idx = gpu_list_.size() - 1;
   float result = GetGPUData(kRoot);
 
   p::CPUPlace cpu_place;
   p::CUDAPlace gpu_place(gpu_list_[idx]);
 
-  auto &recv_tensor = dev_scopes[idx]->FindVar("rt")->Get<f::LoDTensor>();
+  std::string rt_str = "rt";
+  if (idx == kRoot) {
+    rt_str = "st";
+  }
+  auto &recv_tensor = dev_scopes[idx]->FindVar(rt_str)->Get<f::LoDTensor>();
   auto *rt = recv_tensor.data<float>();
   auto *result_tensor = dev_scopes[idx]->Var("ct")->GetMutable<f::LoDTensor>();
   result_tensor->Resize(kDims);
@@ -284,4 +281,20 @@ TEST_F(NCCLTester, ncclBcastOp) {
     ASSERT_NEAR(ct[j], result, 1e-5);
   }
 }
-*/
+
+// ncclInitOp with desc
+TEST_F(NCCLTester, ncclInitOp) {}
+
+TEST_F(NCCLTester, ncclOp) {
+  // Serial execution is required for the same nccl comm.
+
+  // ncclAllReduceOp with desc
+  // TODO(helin): https://github.com/PaddlePaddle/Paddle/issues/9367
+  testNcclReduceOp();
+
+  testNcclAllReduceOp();
+
+  // ncclBcastOp with desc
+  // TODO(helin): https://github.com/PaddlePaddle/Paddle/issues/9540
+  testNcclBcastOp();
+}
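Folding the three formerly separate (and partly commented-out) test cases into one TEST_F is deliberate: per the comment in the diff, they share the same NCCL communicator, so they must run serially rather than as independent test cases. A sketch of the pattern in isolation (link with gtest_main; all names here are illustrative, not Paddle's):

    #include <gtest/gtest.h>

    // Helpers live on the fixture and a single TEST_F runs them in a fixed
    // order, so the shared resource (for NCCL, the communicator) is never
    // exercised by two test cases at once.
    class CommTester : public ::testing::Test {
     protected:
      void TestReduce() { EXPECT_EQ(shared_comm_, 42); }
      void TestAllReduce() { EXPECT_EQ(shared_comm_, 42); }
      void TestBcast() { EXPECT_EQ(shared_comm_, 42); }
      int shared_comm_ = 42;  // stands in for the per-fixture NCCL comm
    };

    TEST_F(CommTester, AllCollectivesInOrder) {
      TestReduce();
      TestAllReduce();
      TestBcast();
    }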
@@ -23,10 +23,9 @@ class SizeOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
   void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Input"),
-                   "Input (Input) of Size op should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output (Out) of Size op should not be null.");
+    OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "Size");
+    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Size");
     ctx->SetOutputDim("Out", {1});
   }
 };
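For context on the fixed {1} output shape: the Size op reports the element count of its input, which is just the product of the input's dimensions, so the output is always a single scalar regardless of the input's rank. A minimal sketch of that computation:

    #include <cstdio>
    #include <functional>
    #include <numeric>
    #include <vector>

    // numel: the value the Size op writes into its 1-element output tensor.
    int64_t numel(const std::vector<int64_t>& dims) {
      return std::accumulate(dims.begin(), dims.end(), int64_t{1},
                             std::multiplies<int64_t>());
    }

    int main() {
      std::printf("%lld\n", static_cast<long long>(numel({3, 4, 5})));  // 60
    }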