Unverified commit 8c40ebd1 authored by Yi Liu, committed by GitHub

Enhance error messages of checkpoint_notify_op, fake_init_op, gen_nccl_id_op and listen_and_serv_op (#24554) (#24844)

test=develop
Parent 343687c3
......@@ -49,7 +49,10 @@ class CheckpointNotifyOp : public framework::OperatorBase {
VLOG(3) << "checkpoint notify sending lookup table: " << lookup_table_name
<< " and dir:" << dir << " to " << epmap[i];
}
-    PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient");
+    PADDLE_ENFORCE_EQ(
+        rpc_client->Wait(), true,
+        platform::errors::Fatal("Failed to notify checkpoint."
+                                " An internal error occurred in RPCClient."));
}
};
......
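The change above sets the template for the rest of the commit: the untyped PADDLE_ENFORCE(cond, msg) becomes a typed comparison macro plus a structured platform::errors class, so failures report both operands and an error category. A minimal sketch of the pattern, assuming only paddle/fluid/platform/enforce.h; Ping() is a hypothetical stand-in for rpc_client->Wait():

// Sketch only: assumes paddle/fluid/platform/enforce.h is available.
// Ping() is a hypothetical stand-in for rpc_client->Wait().
#include "paddle/fluid/platform/enforce.h"

namespace paddle {

bool Ping() { return true; }

void CheckPing() {
  // On failure the macro reports both operands (what Ping() returned and
  // the expected value) together with the structured Fatal error below,
  // instead of a bare free-form message string.
  PADDLE_ENFORCE_EQ(Ping(), true,
                    platform::errors::Fatal(
                        "Failed to notify checkpoint."
                        " An internal error occurred in RPCClient."));
}

}  // namespace paddle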
......@@ -19,8 +19,7 @@ namespace operators {
class FakeInitInferShape : public framework::InferShapeBase {
public:
void operator()(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of FakeInitOp should not be null.");
+    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "FakeInit");
auto &shape = ctx->Attrs().Get<std::vector<int64_t>>("shape");
ctx->SetOutputDim("Out", framework::make_ddim(shape));
}
......
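OP_INOUT_CHECK is the framework's shorthand for these null checks: its arguments are the condition, whether the variable is an Input or an Output, the variable name, and the operator type, so every operator reports a missing tensor in the same format. A minimal InferShape sketch, assuming the Paddle framework headers that define the macro; "MyOp" and its variable names are hypothetical:

// Sketch only: assumes the Paddle framework headers that define
// OP_INOUT_CHECK; "MyOp" and its variable names are hypothetical.
class MyOpInferShape : public framework::InferShapeBase {
 public:
  void operator()(framework::InferShapeContext *ctx) const override {
    // Each macro expands to a typed enforce whose message names the
    // missing Input/Output and the operator, in a uniform format.
    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "MyOp");
    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "MyOp");
  }
};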
......@@ -44,9 +44,15 @@ class GenNCCLIdOp : public framework::OperatorBase {
std::vector<std::string> trainers =
Attr<std::vector<std::string>>("trainers");
-    PADDLE_ENFORCE(
-        trainer_id >= 0 && trainer_id < static_cast<int>(trainers.size()),
-        "trainer_id:%d must be in trainers.size range", trainer_id);
+    PADDLE_ENFORCE_GE(trainer_id, 0,
+                      platform::errors::InvalidArgument(
+                          "trainer_id %d is less than 0. Its "
+                          "valid range is [0, trainer_size)",
+                          trainer_id));
+    PADDLE_ENFORCE_LT(
+        trainer_id, static_cast<int>(trainers.size()),
+        platform::errors::OutOfRange("trainer_id %d is out of range. Its valid "
+                                     "range is [0, trainer_size)",
+                                     trainer_id));
std::string endpoint = trainers[trainer_id];
framework::Scope& local_scope = scope.NewScope();
......@@ -58,12 +64,20 @@ class GenNCCLIdOp : public framework::OperatorBase {
int inter_trainer_id = -1;
int exter_trainer_id = -1;
if (use_hierarchical_allreduce) {
-      PADDLE_ENFORCE(trainers.size() > 1, "trainers.size():%llu < 1",
-                     trainers.size());
-      PADDLE_ENFORCE(inter_nranks > 1, "inter_nranks:%d < 1", inter_nranks);
-      PADDLE_ENFORCE((trainers.size() % inter_nranks == 0),
-                     "trainers.size():%llu mod inter_nranks:%d != 0",
-                     trainers.size(), inter_nranks);
+      PADDLE_ENFORCE_GT(
+          trainers.size(), 1,
+          platform::errors::PreconditionNotMet(
+              "The number of collective trainers %llu <= 1", trainers.size()));
+      PADDLE_ENFORCE_GT(
+          inter_nranks, 1,
+          platform::errors::PreconditionNotMet(
+              "inter_nranks %d <= 1 while in hierarchical allreduce mode",
+              inter_nranks));
+      PADDLE_ENFORCE_EQ(
+          trainers.size() % inter_nranks, 0,
+          platform::errors::PreconditionNotMet(
+              "The number of trainers %llu mod inter_nranks %d is not equal to 0",
+              trainers.size(), inter_nranks));
inter_trainer_id = trainer_id % inter_nranks;
......@@ -106,10 +120,16 @@ class GenNCCLIdOp : public framework::OperatorBase {
return;
}
-  PADDLE_ENFORCE(trainers.size() % inter_nranks == 0,
-                 "enpoints.size:%llu mod inter_nranks:%d should ==0",
-                 trainers.size(), inter_nranks);
-  PADDLE_ENFORCE(inter_nranks > 1, "inter_nranks:%d must > 1", inter_nranks);
+  PADDLE_ENFORCE_EQ(
+      trainers.size() % inter_nranks, 0,
+      platform::errors::PreconditionNotMet(
+          "The number of trainers %llu mod inter_nranks %d is not equal to 0",
+          trainers.size(), inter_nranks));
+  PADDLE_ENFORCE_GT(
+      inter_nranks, 1,
+      platform::errors::PreconditionNotMet(
+          "inter_nranks %d <= 1 while in hierarchical allreduce mode",
+          inter_nranks));
// hierarchical inter ncclid
if (inter_trainer_id == 0) {
......@@ -156,10 +176,11 @@ class GenNCCLIdOp : public framework::OperatorBase {
const std::string& nccl_id_name,
const std::vector<std::string>& endpoint_list) const {
auto var = scope->FindVar(nccl_id_name);
-    PADDLE_ENFORCE_NOT_NULL(var, "can't find nccl_id_var_name:%s",
-                            nccl_id_name);
+    PADDLE_ENFORCE_NOT_NULL(
+        var, platform::errors::NotFound("Variable with name %s is not found",
+                                        nccl_id_name.c_str()));
auto id = var->GetMutable<ncclUniqueId>();
-    PADDLE_ENFORCE(platform::dynload::ncclGetUniqueId(id));
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclGetUniqueId(id));
distributed::RPCClient* client =
distributed::RPCClient::GetInstance<RPCCLIENT_T>(0);
......
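Two things change in gen_nccl_id_op: the null check gains a NotFound error class, and the raw NCCL call is wrapped in PADDLE_ENFORCE_CUDA_SUCCESS, which checks the ncclResult_t status code instead of collapsing it to a boolean. A minimal sketch of the latter, assuming Paddle's enforce.h and the platform::dynload NCCL wrappers already included by this operator:

// Sketch only: assumes paddle/fluid/platform/enforce.h and the
// platform::dynload NCCL wrappers available in this file.
ncclUniqueId nccl_id;
// ncclGetUniqueId returns ncclResult_t; on anything but ncclSuccess the
// macro throws a Paddle exception carrying the NCCL error string.
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclGetUniqueId(&nccl_id));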
......@@ -315,8 +315,9 @@ void ListenAndServOp::CacheVarsType(const std::vector<std::string> &varnames,
const framework::Scope &scope) const {
for (const auto &varname : varnames) {
auto var = scope.FindVar(varname);
-    PADDLE_ENFORCE(var != nullptr,
-                   "Received var should be initialized in the received scope.");
+    PADDLE_ENFORCE_NOT_NULL(
+        var, platform::errors::PreconditionNotMet(
+                 "Received var is not initialized in the received scope."));
if (var->IsType<framework::SelectedRows>()) {
sparse_vars_.push_back(varname);
} else if (var->IsType<framework::LoDTensor>() ||
......@@ -344,7 +345,9 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
auto pserver_id = Attr<int>("pserver_id");
auto inputs = Inputs("X");
-  PADDLE_ENFORCE(!rpc_service_);
+  PADDLE_ENFORCE_EQ(rpc_service_, nullptr,
+                    platform::errors::PreconditionNotMet(
+                        "RPC service has been created unexpectedly."));
std::string endpoint = Attr<std::string>("endpoint");
int checkpoint_block_id = Attr<int>(kCheckpointBlockId);
int lr_decay_block_id = Attr<int>(kLRDecayBlockId);
......@@ -390,8 +393,10 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
auto optimize_blocks =
Attr<std::vector<framework::BlockDesc *>>(kOptimizeBlocks);
-  PADDLE_ENFORCE(optimize_blocks.size() >= 1,
-                 "optimize blocks should be 1 at least on the pserver side.");
+  PADDLE_ENFORCE_GE(optimize_blocks.size(), 1,
+                    platform::errors::PreconditionNotMet(
+                        "optimize blocks is less than 1. Optimize blocks "
+                        "should be at least 1 on the pserver side."));
auto *program = optimize_blocks[0]->Program();
framework::Executor executor(dev_place);
......
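The listen_and_serv_op changes follow the same taxonomy: PADDLE_ENFORCE_NOT_NULL for pointer checks and platform::errors::PreconditionNotMet for server-state invariants. A minimal sketch of the pointer case, assuming Paddle's enforce.h and a framework::Scope in scope; varname is an illustrative name:

// Sketch only: assumes paddle/fluid/platform/enforce.h and a
// framework::Scope named `scope`; `varname` is an illustrative name.
auto *var = scope.FindVar(varname);
// NOT_NULL prints the expression that was null; the structured error
// states the violated invariant (received vars must arrive initialized).
PADDLE_ENFORCE_NOT_NULL(
    var, platform::errors::PreconditionNotMet(
             "Received var %s is not initialized in the received scope.",
             varname));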
......@@ -49,6 +49,7 @@ if(WIN32)
LIST(REMOVE_ITEM TEST_OPS test_trainer_desc)
LIST(REMOVE_ITEM TEST_OPS test_multiprocess_reader_exception)
LIST(REMOVE_ITEM TEST_OPS test_avoid_twice_initialization)
+    LIST(REMOVE_ITEM TEST_OPS test_checkpoint_notify_op)
endif()
if (NOT ${WITH_GPU})
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function

import unittest
import paddle.fluid as fluid


class TestCheckpointNotifyOp(unittest.TestCase):
    def test_checkpoint_notify_op(self):
        program = fluid.Program()
        attrs = {}
        attrs['epmap'] = []
        attrs['dir'] = ''
        attrs['lookup_table'] = ''
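        # An empty 'epmap' means there is no endpoint to notify, so the op
        # sends no RPC and the run below is a pure smoke test on CPU.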
        program.current_block().append_op(
            type='checkpoint_notify', inputs={}, outputs={}, attrs=attrs)

        exe = fluid.Executor(fluid.CPUPlace())
        exe.run(program)


if __name__ == '__main__':
    unittest.main()