未验证 提交 35074963 编写于 作者: L Leo Chen 提交者: GitHub

Refine error msg in paddle/fluid/framework/details [part 2] (#27429)

* refine broadcast_op_handle

* refine some error messages

* refine some files

* fix bug

* fix bug

* fix bug

* follow comments

* follow comments
上级 162b4d6c
...@@ -76,7 +76,7 @@ void AllReduceOpHandle::AllReduceImpl( ...@@ -76,7 +76,7 @@ void AllReduceOpHandle::AllReduceImpl(
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
"The NoDummyInputSize should be equal " "The NoDummyInputSize should be equal "
"to the number of places, but got NoDummyInputSize is " "to the number of places, but got NoDummyInputSize is "
"%d and the number of place is %d.", "%d and the number of places is %d.",
in_var_handles.size(), num_places)); in_var_handles.size(), num_places));
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
in_var_handles.size(), out_var_handles.size(), in_var_handles.size(), out_var_handles.size(),
...@@ -89,7 +89,7 @@ void AllReduceOpHandle::AllReduceImpl( ...@@ -89,7 +89,7 @@ void AllReduceOpHandle::AllReduceImpl(
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
"The number of local scopes should be equal " "The number of local scopes should be equal "
"to the number of places, but got the number of local scopes is " "to the number of places, but got the number of local scopes is "
"%d and the number of place is %d.", "%d and the number of places is %d.",
in_var_handles.size(), num_places)); in_var_handles.size(), num_places));
std::vector<const void *> lod_tensor_data; std::vector<const void *> lod_tensor_data;
......
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
// limitations under the License. // limitations under the License.
#include "paddle/fluid/framework/details/broadcast_op_handle.h" #include "paddle/fluid/framework/details/broadcast_op_handle.h"
#include "paddle/fluid/framework/details/container_cast.h" #include "paddle/fluid/framework/details/container_cast.h"
#include "paddle/fluid/framework/details/variable_visitor.h" #include "paddle/fluid/framework/details/variable_visitor.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
...@@ -31,10 +32,15 @@ void BroadcastOpHandle::RunImpl() { ...@@ -31,10 +32,15 @@ void BroadcastOpHandle::RunImpl() {
auto out_var_handles = DynamicCast<VarHandle>(outputs_); auto out_var_handles = DynamicCast<VarHandle>(outputs_);
PADDLE_ENFORCE_EQ(in_var_handles.size(), 1UL, PADDLE_ENFORCE_EQ(in_var_handles.size(), 1UL,
"The number of input should be one."); platform::errors::PreconditionNotMet(
PADDLE_ENFORCE_EQ( "The number of inputs should be 1, but got %d.",
out_var_handles.size(), places_.size(), in_var_handles.size()));
"The number of output should equal to the number of places."); PADDLE_ENFORCE_EQ(out_var_handles.size(), places_.size(),
platform::errors::PreconditionNotMet(
"The number of outputs and the number of places should "
"be equal, but got the number of outputs is %d and the "
"number of places is %d.",
out_var_handles.size(), places_.size()));
VarHandle *in_var_handle = in_var_handles[0]; VarHandle *in_var_handle = in_var_handles[0];
...@@ -47,7 +53,9 @@ void BroadcastOpHandle::BroadcastOneVar( ...@@ -47,7 +53,9 @@ void BroadcastOpHandle::BroadcastOneVar(
const std::vector<Scope *> &var_scopes) { const std::vector<Scope *> &var_scopes) {
auto *in_var = auto *in_var =
var_scopes.at(in_var_handle.scope_idx())->FindVar(in_var_handle.name()); var_scopes.at(in_var_handle.scope_idx())->FindVar(in_var_handle.name());
PADDLE_ENFORCE_NOT_NULL(in_var); PADDLE_ENFORCE_NOT_NULL(
in_var, platform::errors::NotFound("Variable %s is not found in scopes.",
in_var_handle.name()));
Tensor &in_tensor = VariableVisitor::GetMutableTensor(in_var); Tensor &in_tensor = VariableVisitor::GetMutableTensor(in_var);
if (UNLIKELY(!in_tensor.IsInitialized())) { if (UNLIKELY(!in_tensor.IsInitialized())) {
VLOG(3) << "in var " << in_var_handle.name() << "not inited, return!"; VLOG(3) << "in var " << in_var_handle.name() << "not inited, return!";
...@@ -103,7 +111,7 @@ void BroadcastOpHandle::BroadcastOneVar( ...@@ -103,7 +111,7 @@ void BroadcastOpHandle::BroadcastOneVar(
broadcast_calls.emplace_back( broadcast_calls.emplace_back(
[send_recv_buffer, numel, type, root_id, &nccl_ctx] { [send_recv_buffer, numel, type, root_id, &nccl_ctx] {
PADDLE_ENFORCE(platform::dynload::ncclBcast( PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclBcast(
send_recv_buffer, numel, static_cast<ncclDataType_t>(type), send_recv_buffer, numel, static_cast<ncclDataType_t>(type),
root_id, nccl_ctx.comm_, nccl_ctx.stream())); root_id, nccl_ctx.comm_, nccl_ctx.stream()));
}); });
...@@ -131,7 +139,8 @@ void BroadcastOpHandle::BroadcastOneVar( ...@@ -131,7 +139,8 @@ void BroadcastOpHandle::BroadcastOneVar(
nccl_ctxs_->DevCtx(p)->Wait(); nccl_ctxs_->DevCtx(p)->Wait();
} }
#else #else
PADDLE_THROW("CUDA is not enabled."); PADDLE_THROW(
platform::errors::PreconditionNotMet("Not compiled with NCCL."));
#endif #endif
} }
} }
...@@ -154,10 +163,13 @@ void BroadcastOpHandle::InitOutputValue( ...@@ -154,10 +163,13 @@ void BroadcastOpHandle::InitOutputValue(
auto t_out_p = out_var_handle->place(); auto t_out_p = out_var_handle->place();
auto *out_var = var_scopes.at(out_var_handle->scope_idx()) auto *out_var = var_scopes.at(out_var_handle->scope_idx())
->FindVar(out_var_handle->name()); ->FindVar(out_var_handle->name());
PADDLE_ENFORCE_NOT_NULL(out_var); PADDLE_ENFORCE_NOT_NULL(out_var, platform::errors::NotFound(
"Variable %s is not found in scopes.",
out_var_handle->name()));
if (is_gpu_place(in_tensor.place())) { if (is_gpu_place(in_tensor.place())) {
PADDLE_ENFORCE(platform::is_gpu_place(t_out_p), PADDLE_ENFORCE_EQ(platform::is_gpu_place(t_out_p), true,
"Places of input and output must be all on GPU."); platform::errors::PreconditionNotMet(
"Places of input and output must be all on GPU."));
} else { } else {
t_out_p = platform::CPUPlace(); t_out_p = platform::CPUPlace();
} }
......
...@@ -79,7 +79,8 @@ struct TestBroadcastOpHandle { ...@@ -79,7 +79,8 @@ struct TestBroadcastOpHandle {
} }
nccl_ctxs_.reset(new platform::NCCLContextMap(place_list_)); nccl_ctxs_.reset(new platform::NCCLContextMap(place_list_));
#else #else
PADDLE_THROW("CUDA is not support."); PADDLE_THROW(
platform::errors::PreconditionNotMet("Not compiled with NCCL."));
#endif #endif
} else { } else {
int count = 8; int count = 8;
...@@ -113,7 +114,8 @@ struct TestBroadcastOpHandle { ...@@ -113,7 +114,8 @@ struct TestBroadcastOpHandle {
op_handle_ = new BroadcastOpHandle(nodes_.back().get(), local_scopes_, op_handle_ = new BroadcastOpHandle(nodes_.back().get(), local_scopes_,
place_list_, nccl_ctxs_.get()); place_list_, nccl_ctxs_.get());
#else #else
PADDLE_THROW("CUDA is not support."); PADDLE_THROW(
platform::errors::PreconditionNotMet("Not compiled with NCCL."));
#endif #endif
} else { } else {
#if defined(PADDLE_WITH_NCCL) #if defined(PADDLE_WITH_NCCL)
...@@ -171,7 +173,9 @@ struct TestBroadcastOpHandle { ...@@ -171,7 +173,9 @@ struct TestBroadcastOpHandle {
float val_scalar = 0.0) { float val_scalar = 0.0) {
auto var = param_scopes_[input_scope_idx]->FindVar(varname); auto var = param_scopes_[input_scope_idx]->FindVar(varname);
PADDLE_ENFORCE_NOT_NULL(var); PADDLE_ENFORCE_NOT_NULL(
var, platform::errors::NotFound("Variable %s is not found in scope.",
varname));
auto lod_tensor = var->GetMutable<f::LoDTensor>(); auto lod_tensor = var->GetMutable<f::LoDTensor>();
std::vector<float> send_vector(static_cast<size_t>(f::product(kDims))); std::vector<float> send_vector(static_cast<size_t>(f::product(kDims)));
for (size_t k = 0; k < send_vector.size(); ++k) { for (size_t k = 0; k < send_vector.size(); ++k) {
...@@ -194,7 +198,9 @@ struct TestBroadcastOpHandle { ...@@ -194,7 +198,9 @@ struct TestBroadcastOpHandle {
} }
auto var = param_scopes_[input_scope_idx]->FindVar(varname); auto var = param_scopes_[input_scope_idx]->FindVar(varname);
PADDLE_ENFORCE_NOT_NULL(var); PADDLE_ENFORCE_NOT_NULL(
var, platform::errors::NotFound("Variable %s is not found in scope.",
varname));
auto selected_rows = var->GetMutable<f::SelectedRows>(); auto selected_rows = var->GetMutable<f::SelectedRows>();
auto value = selected_rows->mutable_value(); auto value = selected_rows->mutable_value();
value->mutable_data<float>(kDims, place_list_[input_scope_idx]); value->mutable_data<float>(kDims, place_list_[input_scope_idx]);
...@@ -211,13 +217,24 @@ struct TestBroadcastOpHandle { ...@@ -211,13 +217,24 @@ struct TestBroadcastOpHandle {
const std::vector<float>& send_vector, const std::vector<float>& send_vector,
const std::vector<int64_t>& rows, int height) { const std::vector<int64_t>& rows, int height) {
auto var = param_scopes_[input_scope_idx]->FindVar(varname); auto var = param_scopes_[input_scope_idx]->FindVar(varname);
PADDLE_ENFORCE_NOT_NULL(var); PADDLE_ENFORCE_NOT_NULL(
var, platform::errors::NotFound("Variable %s is not found in scope.",
varname));
auto& selected_rows = var->Get<f::SelectedRows>(); auto& selected_rows = var->Get<f::SelectedRows>();
auto rt = selected_rows.value(); auto rt = selected_rows.value();
PADDLE_ENFORCE_EQ(selected_rows.height(), height, "height is not equal."); PADDLE_ENFORCE_EQ(selected_rows.height(), height,
platform::errors::InvalidArgument(
"The height of SelectedRows is not equal to "
"the expected, expect %d, but got %ld.",
height, selected_rows.height()));
for (size_t k = 0; k < selected_rows.rows().size(); ++k) { for (size_t k = 0; k < selected_rows.rows().size(); ++k) {
PADDLE_ENFORCE_EQ(selected_rows.rows()[k], rows[k]); PADDLE_ENFORCE_EQ(
selected_rows.rows()[k], rows[k],
platform::errors::InvalidArgument(
"The item at position %zu of rows of SelectedRows "
"is not equal to the expected, expect %ld, but got %ld.",
k, rows[k], selected_rows.rows()[k]));
} }
p::CPUPlace cpu_place; p::CPUPlace cpu_place;
...@@ -235,9 +252,15 @@ struct TestBroadcastOpHandle { ...@@ -235,9 +252,15 @@ struct TestBroadcastOpHandle {
framework::Scope* scope) { framework::Scope* scope) {
p::CPUPlace cpu_place; p::CPUPlace cpu_place;
auto var = scope->FindVar(varname); auto var = scope->FindVar(varname);
PADDLE_ENFORCE_NOT_NULL(var); PADDLE_ENFORCE_NOT_NULL(
var, platform::errors::NotFound("Variable %s is not found in scope.",
varname));
auto tensor = var->Get<f::LoDTensor>(); auto tensor = var->Get<f::LoDTensor>();
PADDLE_ENFORCE_EQ(tensor.lod(), lod, "lod is not equal."); PADDLE_ENFORCE_EQ(tensor.lod(), lod,
platform::errors::InvalidArgument(
"The LoD of tensor is not equal to "
"the expected, expect %s, but got %s.",
lod, tensor.lod()));
f::Tensor result_tensor; f::Tensor result_tensor;
f::TensorCopySync(tensor, cpu_place, &result_tensor); f::TensorCopySync(tensor, cpu_place, &result_tensor);
float* ct = result_tensor.mutable_data<float>(cpu_place); float* ct = result_tensor.mutable_data<float>(cpu_place);
......
...@@ -235,7 +235,8 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { ...@@ -235,7 +235,8 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
AppendPass("reduce_mode_multi_devices_pass").get(); AppendPass("reduce_mode_multi_devices_pass").get();
break; break;
default: default:
PADDLE_THROW("Unknown reduce strategy."); PADDLE_THROW(
platform::errors::Unimplemented("Unknown reduce strategy."));
} }
} }
multi_devices_pass->SetNotOwned<const BuildStrategy>("strategy", multi_devices_pass->SetNotOwned<const BuildStrategy>("strategy",
......
...@@ -12,11 +12,12 @@ ...@@ -12,11 +12,12 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/framework/details/eager_deletion_op_handle.h"
#include <memory> #include <memory>
#include <unordered_set> #include <unordered_set>
#include <utility> #include <utility>
#include "paddle/fluid/framework/details/eager_deletion_op_handle.h"
#include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h" #include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h"
#include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/scope.h"
...@@ -47,15 +48,19 @@ EagerDeletionOpHandle::EagerDeletionOpHandle( ...@@ -47,15 +48,19 @@ EagerDeletionOpHandle::EagerDeletionOpHandle(
if (dynamic_cast<StreamGarbageCollector *>(gc_)) { if (dynamic_cast<StreamGarbageCollector *>(gc_)) {
platform::CUDADeviceGuard guard( platform::CUDADeviceGuard guard(
BOOST_GET_CONST(platform::CUDAPlace, place).device); BOOST_GET_CONST(platform::CUDAPlace, place).device);
PADDLE_ENFORCE(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming)); PADDLE_ENFORCE_CUDA_SUCCESS(
PADDLE_ENFORCE_NOT_NULL(event_); cudaEventCreateWithFlags(&event_, cudaEventDisableTiming));
PADDLE_ENFORCE_NOT_NULL(event_, platform::errors::InvalidArgument(
"The CUDA event created is NULL."));
} }
} }
#endif #endif
PADDLE_ENFORCE_NE(vars.empty(), true, platform::errors::InvalidArgument( PADDLE_ENFORCE_NE(vars.empty(), true,
"Variable names are empty.")); platform::errors::InvalidArgument(
"The variables to be deleted are empty."));
for (auto *var : var_infos_) { for (auto *var : var_infos_) {
PADDLE_ENFORCE_NOT_NULL(var); PADDLE_ENFORCE_NOT_NULL(var, platform::errors::InvalidArgument(
"The memory optimization info is NULL."));
} }
} }
...@@ -64,7 +69,7 @@ EagerDeletionOpHandle::~EagerDeletionOpHandle() { ...@@ -64,7 +69,7 @@ EagerDeletionOpHandle::~EagerDeletionOpHandle() {
if (event_) { if (event_) {
auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, dev_ctx_->GetPlace()); auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, dev_ctx_->GetPlace());
platform::CUDADeviceGuard guard(gpu_place.device); platform::CUDADeviceGuard guard(gpu_place.device);
PADDLE_ENFORCE(cudaEventDestroy(event_)); PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(event_));
} }
#endif #endif
} }
...@@ -78,12 +83,17 @@ void EagerDeletionOpHandle::InitCUDA() { ...@@ -78,12 +83,17 @@ void EagerDeletionOpHandle::InitCUDA() {
} }
void EagerDeletionOpHandle::CallOnce() { void EagerDeletionOpHandle::CallOnce() {
PADDLE_ENFORCE(vars_.empty(), "vars_ must be initialized here"); PADDLE_ENFORCE_EQ(
vars_.empty(), true,
platform::errors::InvalidArgument(
"The variables to be deleted should be initialized here."));
Scope *exec_scope = local_exec_scopes_[0]; Scope *exec_scope = local_exec_scopes_[0];
for (auto *var_info : var_infos_) { for (auto *var_info : var_infos_) {
auto *var = exec_scope->FindVar(var_info->Name()); auto *var = exec_scope->FindVar(var_info->Name());
PADDLE_ENFORCE_NOT_NULL(var, "Variable %s should not be nullptr", PADDLE_ENFORCE_NOT_NULL(
var_info->Name()); var, platform::errors::NotFound(
"The variable(%s) to be deleted is not found in scope.",
var_info->Name()));
vars_.emplace_back(var); vars_.emplace_back(var);
} }
} }
...@@ -119,8 +129,9 @@ void EagerDeletionOpHandle::RunImpl() { ...@@ -119,8 +129,9 @@ void EagerDeletionOpHandle::RunImpl() {
garbages.emplace_back(t.MoveMemoryHolder()); garbages.emplace_back(t.MoveMemoryHolder());
} }
} else { } else {
PADDLE_THROW("Type %s of %s is not supported eager deletion", PADDLE_THROW(platform::errors::Unimplemented(
framework::ToTypeName(var->Type()), var_info->Name()); "The variable(%s) of type %s is not supported in eager deletion.",
var_info->Name(), framework::ToTypeName(var->Type())));
} }
} }
...@@ -137,8 +148,9 @@ void EagerDeletionOpHandle::ClearGarbages( ...@@ -137,8 +148,9 @@ void EagerDeletionOpHandle::ClearGarbages(
auto callback_stream = auto callback_stream =
reinterpret_cast<StreamGarbageCollector *>(gc_)->stream(); reinterpret_cast<StreamGarbageCollector *>(gc_)->stream();
auto callback_func = [=]() { auto callback_func = [=]() {
PADDLE_ENFORCE(cudaEventRecord(event_, compute_stream)); PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(event_, compute_stream));
PADDLE_ENFORCE(cudaStreamWaitEvent(callback_stream, event_, 0)); PADDLE_ENFORCE_CUDA_SUCCESS(
cudaStreamWaitEvent(callback_stream, event_, 0));
}; };
gc_->Add(std::move(*garbages), callback_func); gc_->Add(std::move(*garbages), callback_func);
} else { } else {
......
...@@ -12,8 +12,10 @@ ...@@ -12,8 +12,10 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/framework/details/fused_all_reduce_op_handle.h" #include "paddle/fluid/framework/details/fused_all_reduce_op_handle.h"
#include <algorithm> #include <algorithm>
#include <utility> #include <utility>
#include "paddle/fluid/framework/details/container_cast.h" #include "paddle/fluid/framework/details/container_cast.h"
#include "paddle/fluid/framework/details/reduce_and_gather.h" #include "paddle/fluid/framework/details/reduce_and_gather.h"
#include "paddle/fluid/framework/details/variable_visitor.h" #include "paddle/fluid/framework/details/variable_visitor.h"
...@@ -56,10 +58,20 @@ void FusedAllReduceOpHandle::RunImpl() { ...@@ -56,10 +58,20 @@ void FusedAllReduceOpHandle::RunImpl() {
size_t place_num = places_.size(); size_t place_num = places_.size();
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
in_var_handles.size(), place_num * num_of_all_reduce_, in_var_handles.size(), place_num * num_of_all_reduce_,
"The NoDummyInputSize should be equal to the number of places."); platform::errors::PreconditionNotMet(
"The number of input variable handles should be equal to the number "
"of places multiplied by the number of all reduce handles, "
"but got the number of input variable handles is %d, the "
"number of places is %d, and the number of all reduce handles "
"is %d.",
in_var_handles.size(), place_num, num_of_all_reduce_));
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
in_var_handles.size(), out_var_handles.size(), in_var_handles.size(), out_var_handles.size(),
"The NoDummyInputSize and NoDummyOutputSize should be equal."); platform::errors::PreconditionNotMet(
"The number of input variable handles should be equal to the number "
"of output variable handles, but got the number of input variable "
"handles is %d, and the number of output variable handles is %d.",
in_var_handles.size(), out_var_handles.size()));
// Note: some gradient op doesn't have CUDAKernel, so the gradients of // Note: some gradient op doesn't have CUDAKernel, so the gradients of
// those op are in CPUPlace, in this case, the all reduce should not be fused. // those op are in CPUPlace, in this case, the all reduce should not be fused.
...@@ -106,7 +118,13 @@ void FusedAllReduceOpHandle::FusedAllReduceFunc( ...@@ -106,7 +118,13 @@ void FusedAllReduceOpHandle::FusedAllReduceFunc(
dtype = ele_dtype; dtype = ele_dtype;
} }
PADDLE_ENFORCE_EQ(ele_dtype, dtype); PADDLE_ENFORCE_EQ(
ele_dtype, dtype,
platform::errors::InvalidArgument(
"The DataType of grad tensors of fused_all_reduce_op_handle "
"must be consistent. The current dtype is %s, but the "
"previous dtype is %s.",
DataTypeToString(ele_dtype), DataTypeToString(dtype)));
// Check whether the address space is contiguous. // Check whether the address space is contiguous.
std::sort( std::sort(
...@@ -130,16 +148,29 @@ void FusedAllReduceOpHandle::FusedAllReduceFunc( ...@@ -130,16 +148,29 @@ void FusedAllReduceOpHandle::FusedAllReduceFunc(
"input[%d] address: 0X%02x. The offset: %d", "input[%d] address: 0X%02x. The offset: %d",
k - 1, g_tensor.at(k - 1).first, cur_address, g_tensor.at(k).first, k, k - 1, g_tensor.at(k - 1).first, cur_address, g_tensor.at(k).first, k,
next_address, k, infer_next_address, offset); next_address, k, infer_next_address, offset);
PADDLE_ENFORCE_EQ(infer_next_address, next_address, PADDLE_ENFORCE_EQ(
"The address is not consistent."); infer_next_address, next_address,
platform::errors::InvalidArgument(
"The inferred address of the next tensor should be equal to the "
"real address of the next tensor. But got inferred address is %p "
"and real address is %p.",
infer_next_address, next_address));
} }
} }
if (!FLAGS_skip_fused_all_reduce_check) { if (!FLAGS_skip_fused_all_reduce_check) {
for (size_t scope_idx = 0; scope_idx < place_num; ++scope_idx) { for (size_t scope_idx = 0; scope_idx < place_num; ++scope_idx) {
for (size_t j = 1; j < num_of_all_reduce_; ++j) { for (size_t j = 1; j < num_of_all_reduce_; ++j) {
PADDLE_ENFORCE_EQ(grads_tensor.at(0).at(j).first, PADDLE_ENFORCE_EQ(
grads_tensor.at(scope_idx).at(j).first); grads_tensor.at(0).at(j).first,
grads_tensor.at(scope_idx).at(j).first,
platform::errors::InvalidArgument(
"The variable name of grad tensors of "
"fused_all_reduce_op_handle "
"must be consistent. The current name is %s, but the "
"previous name is %s.",
grads_tensor.at(0).at(j).first,
grads_tensor.at(scope_idx).at(j).first));
} }
} }
} }
...@@ -167,7 +198,9 @@ bool FusedAllReduceOpHandle::InputIsInDifferentPlace( ...@@ -167,7 +198,9 @@ bool FusedAllReduceOpHandle::InputIsInDifferentPlace(
for (size_t j = 0; j < in_var_handles.size(); j += place_num) { for (size_t j = 0; j < in_var_handles.size(); j += place_num) {
auto var_name = in_var_handles[j]->name(); auto var_name = in_var_handles[j]->name();
auto var = local_scope->FindVar(var_name); auto var = local_scope->FindVar(var_name);
PADDLE_ENFORCE_NOT_NULL(var, "%s is not found in local scope.", var_name); PADDLE_ENFORCE_NOT_NULL(
var, platform::errors::NotFound(
"The variable '%s' is not found in local scope.", var_name));
auto &lod_tensor = var->Get<LoDTensor>(); auto &lod_tensor = var->Get<LoDTensor>();
if (!is_same_place(lod_tensor.place(), places_.at(scope_idx))) { if (!is_same_place(lod_tensor.place(), places_.at(scope_idx))) {
return true; return true;
...@@ -185,14 +218,24 @@ void FusedAllReduceOpHandle::GetGradLoDTensor( ...@@ -185,14 +218,24 @@ void FusedAllReduceOpHandle::GetGradLoDTensor(
size_t place_num = places_.size(); size_t place_num = places_.size();
for (size_t j = 0; j < in_var_handles.size(); j += place_num) { for (size_t j = 0; j < in_var_handles.size(); j += place_num) {
auto var_name = in_var_handles[j]->name(); auto var_name = in_var_handles[j]->name();
PADDLE_ENFORCE_EQ(var_name, out_var_handles[j]->name()); PADDLE_ENFORCE_EQ(
var_name, out_var_handles[j]->name(),
platform::errors::InvalidArgument(
"The name of input variable should be equal "
"to the name of output variable. But got the name of input "
"variable is %s and the name of output variable is %s.",
var_name, out_var_handles[j]->name()));
auto var = local_scope->FindVar(var_name); auto var = local_scope->FindVar(var_name);
PADDLE_ENFORCE_NOT_NULL(var, "%s is not found in local scope.", var_name); PADDLE_ENFORCE_NOT_NULL(
var, platform::errors::NotFound(
"The variable '%s' is not found in local scope.", var_name));
auto &lod_tensor = var->Get<LoDTensor>(); auto &lod_tensor = var->Get<LoDTensor>();
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
platform::is_same_place(lod_tensor.place(), places_.at(scope_idx)), platform::is_same_place(lod_tensor.place(), places_.at(scope_idx)),
true, "%s(%d) is not in the right place.", var_name, scope_idx); true, platform::errors::InvalidArgument(
"The variable '%s' at scope %d is not in the right place.",
var_name, scope_idx));
grad_tensor->emplace_back(std::make_pair(var_name, &lod_tensor)); grad_tensor->emplace_back(std::make_pair(var_name, &lod_tensor));
} }
} }
...@@ -204,16 +247,26 @@ void FusedAllReduceOpHandle::GetDTypeAndNumel( ...@@ -204,16 +247,26 @@ void FusedAllReduceOpHandle::GetDTypeAndNumel(
size_t size_of_dtype = 0; size_t size_of_dtype = 0;
for (size_t i = 0; i < grad_tensor.size(); ++i) { for (size_t i = 0; i < grad_tensor.size(); ++i) {
// Get dtype // Get dtype
auto ele_type = grad_tensor.at(i).second->type(); auto ele_dtype = grad_tensor.at(i).second->type();
if (i == 0) { if (i == 0) {
*dtype = ele_type; *dtype = ele_dtype;
size_of_dtype = framework::SizeOfType(ele_type); size_of_dtype = framework::SizeOfType(ele_dtype);
} }
PADDLE_ENFORCE_EQ(ele_type, *dtype); PADDLE_ENFORCE_EQ(
ele_dtype, *dtype,
platform::errors::InvalidArgument(
"The DataType of grad tensors of fused_all_reduce_op_handle "
"must be consistent. The current dtype is %s, but the "
"previous dtype is %s.",
DataTypeToString(ele_dtype), DataTypeToString(*dtype)));
// Get element number // Get element number
int64_t len = grad_tensor.at(i).second->numel(); int64_t len = grad_tensor.at(i).second->numel();
PADDLE_ENFORCE_GT(len, 0); PADDLE_ENFORCE_GT(
len, 0, platform::errors::InvalidArgument(
"The size of grad tensors of fused_all_reduce_op_handle "
"must be > 0, but got %d.",
len));
*numel += *numel +=
platform::Alignment(len * size_of_dtype, places_[0]) / size_of_dtype; platform::Alignment(len * size_of_dtype, places_[0]) / size_of_dtype;
} }
......
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
// limitations under the License. // limitations under the License.
#include "paddle/fluid/framework/details/fused_broadcast_op_handle.h" #include "paddle/fluid/framework/details/fused_broadcast_op_handle.h"
#include "paddle/fluid/framework/details/container_cast.h" #include "paddle/fluid/framework/details/container_cast.h"
#include "paddle/fluid/framework/details/variable_visitor.h" #include "paddle/fluid/framework/details/variable_visitor.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
...@@ -32,7 +33,15 @@ void FusedBroadcastOpHandle::RunImpl() { ...@@ -32,7 +33,15 @@ void FusedBroadcastOpHandle::RunImpl() {
WaitInputVarGenerated(); WaitInputVarGenerated();
size_t place_num = places_.size(); size_t place_num = places_.size();
PADDLE_ENFORCE_EQ(in_var_handles.size() * place_num, out_var_handles.size()); PADDLE_ENFORCE_EQ(
in_var_handles.size() * place_num, out_var_handles.size(),
platform::errors::PreconditionNotMet(
"The number of input variable handles multiplied by the number "
"of places should be equal to the number of output variable handles, "
"but got the number of input variable handles is %d, the "
"number of places is %d, and the number of output variable handles "
"is %d.",
in_var_handles.size(), place_num, out_var_handles.size()));
for (size_t i = 0; i < in_var_handles.size(); ++i) { for (size_t i = 0; i < in_var_handles.size(); ++i) {
BroadcastOneVar( BroadcastOneVar(
......
...@@ -13,8 +13,10 @@ ...@@ -13,8 +13,10 @@
// limitations under the License. // limitations under the License.
#include "paddle/fluid/framework/details/fused_broadcast_op_handle.h" #include "paddle/fluid/framework/details/fused_broadcast_op_handle.h"
#include <memory> #include <memory>
#include <unordered_map> #include <unordered_map>
#include "gtest/gtest.h" #include "gtest/gtest.h"
#include "paddle/fluid/framework/details/broadcast_op_handle_test.h" #include "paddle/fluid/framework/details/broadcast_op_handle_test.h"
#include "paddle/fluid/framework/details/op_handle_base.h" #include "paddle/fluid/framework/details/op_handle_base.h"
...@@ -58,7 +60,8 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle { ...@@ -58,7 +60,8 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle {
op_handle_ = new FusedBroadcastOpHandle( op_handle_ = new FusedBroadcastOpHandle(
nodes_.back().get(), local_scopes_, place_list_, nccl_ctxs_.get()); nodes_.back().get(), local_scopes_, place_list_, nccl_ctxs_.get());
#else #else
PADDLE_THROW("CUDA is not supported."); PADDLE_THROW(
platform::errors::PreconditionNotMet("Not compiled with CUDA."));
#endif #endif
} else { } else {
#if defined(PADDLE_WITH_NCCL) #if defined(PADDLE_WITH_NCCL)
......
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
// limitations under the License. // limitations under the License.
#include "paddle/fluid/framework/details/gather_op_handle.h" #include "paddle/fluid/framework/details/gather_op_handle.h"
#include "paddle/fluid/framework/details/container_cast.h" #include "paddle/fluid/framework/details/container_cast.h"
#include "paddle/fluid/framework/details/variable_visitor.h" #include "paddle/fluid/framework/details/variable_visitor.h"
...@@ -32,13 +33,20 @@ void GatherOpHandle::RunImpl() { ...@@ -32,13 +33,20 @@ void GatherOpHandle::RunImpl() {
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
in_var_handles.size(), places_.size(), in_var_handles.size(), places_.size(),
"The number of output should equal to the number of places."); platform::errors::InvalidArgument(
"The number of input variables should be equal "
"to the number of places, but got the number of input variables is "
"%d and the number of places is %d.",
in_var_handles.size(), places_.size()));
VarHandle *out_var_handle; VarHandle *out_var_handle;
{ {
auto out_var_handles = DynamicCast<VarHandle>(this->Outputs()); auto out_var_handles = DynamicCast<VarHandle>(this->Outputs());
PADDLE_ENFORCE_EQ(out_var_handles.size(), 1, PADDLE_ENFORCE_EQ(
"The number of output should be one."); out_var_handles.size(), 1,
platform::errors::InvalidArgument(
"The number of output variables should be 1, but got %d.",
out_var_handles.size()));
out_var_handle = out_var_handles.front(); out_var_handle = out_var_handles.front();
} }
...@@ -47,10 +55,14 @@ void GatherOpHandle::RunImpl() { ...@@ -47,10 +55,14 @@ void GatherOpHandle::RunImpl() {
auto in_0_handle = in_var_handles[0]; auto in_0_handle = in_var_handles[0];
auto pre_in_var = auto pre_in_var =
var_scopes.at(in_0_handle->scope_idx())->FindVar(in_0_handle->name()); var_scopes.at(in_0_handle->scope_idx())->FindVar(in_0_handle->name());
PADDLE_ENFORCE_NOT_NULL(pre_in_var); PADDLE_ENFORCE_NOT_NULL(
pre_in_var,
platform::errors::NotFound("The variable '%s' is not found in the scope.",
in_0_handle->name()));
PADDLE_ENFORCE(pre_in_var->IsType<framework::SelectedRows>(), PADDLE_ENFORCE_EQ(pre_in_var->IsType<framework::SelectedRows>(), true,
"Currently, gather_op only can gather SelectedRows."); platform::errors::Unimplemented(
"Currently, gather_op only supports SelectedRows."));
// Wait input done, this Wait is asynchronous operation // Wait input done, this Wait is asynchronous operation
WaitInputVarGenerated(); WaitInputVarGenerated();
...@@ -63,7 +75,10 @@ void GatherOpHandle::RunImpl() { ...@@ -63,7 +75,10 @@ void GatherOpHandle::RunImpl() {
for (auto *in_handle : in_var_handles) { for (auto *in_handle : in_var_handles) {
auto *in_var = auto *in_var =
var_scopes.at(in_handle->scope_idx())->FindVar(in_handle->name()); var_scopes.at(in_handle->scope_idx())->FindVar(in_handle->name());
PADDLE_ENFORCE_NOT_NULL(in_var); PADDLE_ENFORCE_NOT_NULL(
in_var,
platform::errors::NotFound(
"The variable '%s' is not found in the scope.", in_handle->name()));
VariableVisitor::EnforceShapeAndDTypeEQ(*in_var, *pre_in_var); VariableVisitor::EnforceShapeAndDTypeEQ(*in_var, *pre_in_var);
auto &in_sr_value = in_var->Get<framework::SelectedRows>(); auto &in_sr_value = in_var->Get<framework::SelectedRows>();
...@@ -76,15 +91,19 @@ void GatherOpHandle::RunImpl() { ...@@ -76,15 +91,19 @@ void GatherOpHandle::RunImpl() {
// NOTE: The Places of all input tensor must be all on CPU or all on GPU. // NOTE: The Places of all input tensor must be all on CPU or all on GPU.
platform::Place t_out_p = out_var_handle->place(); platform::Place t_out_p = out_var_handle->place();
if (platform::is_gpu_place(pre_in_value.place())) { if (platform::is_gpu_place(pre_in_value.place())) {
PADDLE_ENFORCE(platform::is_gpu_place(t_out_p), PADDLE_ENFORCE_EQ(platform::is_gpu_place(t_out_p), true,
"Places of input and output must be all on GPU."); platform::errors::PreconditionNotMet(
"Places of input and output must be all on GPU."));
} else { } else {
t_out_p = platform::CPUPlace(); t_out_p = platform::CPUPlace();
} }
auto out_var = var_scopes.at(out_var_handle->scope_idx()) auto out_var = var_scopes.at(out_var_handle->scope_idx())
->FindVar(out_var_handle->name()); ->FindVar(out_var_handle->name());
PADDLE_ENFORCE_NOT_NULL(out_var); PADDLE_ENFORCE_NOT_NULL(
out_var,
platform::errors::NotFound("The variable '%s' is not found in the scope.",
out_var_handle->name()));
auto out_value = out_var->GetMutable<framework::SelectedRows>(); auto out_value = out_var->GetMutable<framework::SelectedRows>();
out_value->set_height(pre_in_value.height()); out_value->set_height(pre_in_value.height());
out_value->set_rows(out_rows); out_value->set_rows(out_rows);
......
...@@ -13,8 +13,10 @@ ...@@ -13,8 +13,10 @@
// limitations under the License. // limitations under the License.
#include "paddle/fluid/framework/details/gather_op_handle.h" #include "paddle/fluid/framework/details/gather_op_handle.h"
#include <memory> #include <memory>
#include <unordered_map> #include <unordered_map>
#include "gtest/gtest.h" #include "gtest/gtest.h"
namespace paddle { namespace paddle {
...@@ -60,7 +62,8 @@ struct TestGatherOpHandle { ...@@ -60,7 +62,8 @@ struct TestGatherOpHandle {
ctxs_.emplace_back(new p::CUDADeviceContext(p)); ctxs_.emplace_back(new p::CUDADeviceContext(p));
} }
#else #else
PADDLE_THROW("CUDA is not support."); PADDLE_THROW(
platform::errors::PreconditionNotMet("Not compiled with CUDA."));
#endif #endif
} else { } else {
int count = 8; int count = 8;
...@@ -141,7 +144,9 @@ struct TestGatherOpHandle { ...@@ -141,7 +144,9 @@ struct TestGatherOpHandle {
for (size_t input_scope_idx = 0; input_scope_idx < gpu_list_.size(); for (size_t input_scope_idx = 0; input_scope_idx < gpu_list_.size();
++input_scope_idx) { ++input_scope_idx) {
auto in_var = param_scopes_.at(input_scope_idx)->FindVar("input"); auto in_var = param_scopes_.at(input_scope_idx)->FindVar("input");
PADDLE_ENFORCE_NOT_NULL(in_var); PADDLE_ENFORCE_NOT_NULL(
in_var, platform::errors::NotFound(
"The variable '%s' is not found in the scope.", "input"));
auto in_selected_rows = in_var->GetMutable<f::SelectedRows>(); auto in_selected_rows = in_var->GetMutable<f::SelectedRows>();
auto value = in_selected_rows->mutable_value(); auto value = in_selected_rows->mutable_value();
value->mutable_data<float>(kDims, gpu_list_[input_scope_idx]); value->mutable_data<float>(kDims, gpu_list_[input_scope_idx]);
...@@ -155,7 +160,9 @@ struct TestGatherOpHandle { ...@@ -155,7 +160,9 @@ struct TestGatherOpHandle {
} }
auto out_var = param_scopes_.at(output_scope_idx)->FindVar("out"); auto out_var = param_scopes_.at(output_scope_idx)->FindVar("out");
PADDLE_ENFORCE_NOT_NULL(out_var); PADDLE_ENFORCE_NOT_NULL(
out_var, platform::errors::NotFound(
"The variable '%s' is not found in the scope.", "out"));
auto out_selected_rows = out_var->GetMutable<f::SelectedRows>(); auto out_selected_rows = out_var->GetMutable<f::SelectedRows>();
auto in_var = param_scopes_.at(output_scope_idx)->FindVar("input"); auto in_var = param_scopes_.at(output_scope_idx)->FindVar("input");
...@@ -173,9 +180,19 @@ struct TestGatherOpHandle { ...@@ -173,9 +180,19 @@ struct TestGatherOpHandle {
auto& out_select_rows = out_var->Get<f::SelectedRows>(); auto& out_select_rows = out_var->Get<f::SelectedRows>();
auto rt = out_select_rows.value(); auto rt = out_select_rows.value();
PADDLE_ENFORCE_EQ(out_select_rows.height(), height, "height is not equal."); PADDLE_ENFORCE_EQ(out_select_rows.height(), height,
platform::errors::InvalidArgument(
"The height of SelectedRows is not equal to "
"the expected, expect %d, but got %d.",
height, out_select_rows.height()));
for (size_t k = 0; k < out_select_rows.rows().size(); ++k) { for (size_t k = 0; k < out_select_rows.rows().size(); ++k) {
PADDLE_ENFORCE_EQ(out_select_rows.rows()[k], rows[k % rows.size()]); PADDLE_ENFORCE_EQ(
out_select_rows.rows()[k], rows[k % rows.size()],
platform::errors::InvalidArgument(
"The item at position %d of rows of SelectedRows is not equal to "
"the expected, expect %d, but got %d.",
k, rows[k % rows.size()], out_select_rows.rows()[k]));
} }
f::Tensor result_tensor; f::Tensor result_tensor;
...@@ -207,6 +224,7 @@ TEST(GatherTester, TestGPUGatherTestSelectedRows) { ...@@ -207,6 +224,7 @@ TEST(GatherTester, TestGPUGatherTestSelectedRows) {
test_op.TestGatherSelectedRows(input_scope_idx); test_op.TestGatherSelectedRows(input_scope_idx);
} }
#endif #endif
} // namespace details } // namespace details
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -46,14 +46,17 @@ class NCCLOpHandleBase : public OpHandleBase { ...@@ -46,14 +46,17 @@ class NCCLOpHandleBase : public OpHandleBase {
} }
virtual ~NCCLOpHandleBase() { virtual ~NCCLOpHandleBase() {
for (auto& ev : inter_events_) { for (auto& ev : inter_events_) {
PADDLE_ENFORCE(cudaEventDestroy(ev.second)); PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(ev.second));
} }
for (auto& ev : exter_events_) { for (auto& ev : exter_events_) {
PADDLE_ENFORCE(cudaEventDestroy(ev.second)); PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(ev.second));
} }
} }
void SetRunEnv(int run_order, bool use_hierarchical_allreduce) { void SetRunEnv(int run_order, bool use_hierarchical_allreduce) {
PADDLE_ENFORCE(run_order >= 0, "run_order must >= 0"); PADDLE_ENFORCE_GE(
run_order, 0,
platform::errors::InvalidArgument(
"The argument run_order must be >= 0, but got %d.", run_order));
run_order_ = run_order; run_order_ = run_order;
use_hierarchical_allreduce_ = use_hierarchical_allreduce; use_hierarchical_allreduce_ = use_hierarchical_allreduce;
...@@ -74,8 +77,11 @@ class NCCLOpHandleBase : public OpHandleBase { ...@@ -74,8 +77,11 @@ class NCCLOpHandleBase : public OpHandleBase {
return; return;
} }
PADDLE_ENFORCE(places_.size() == 1, PADDLE_ENFORCE_EQ(places_.size(), 1,
"HierarchicalAllReduce run one proc with one card mode."); platform::errors::InvalidArgument(
"HierarchicalAllReduce can only run "
"one proccess with one card mode, but got %d cards.",
places_.size()));
for (auto& p : places_) { for (auto& p : places_) {
auto ctxs = nccl_ctxs_->GetHierarchicalInterCtx(run_order); auto ctxs = nccl_ctxs_->GetHierarchicalInterCtx(run_order);
...@@ -88,11 +94,11 @@ class NCCLOpHandleBase : public OpHandleBase { ...@@ -88,11 +94,11 @@ class NCCLOpHandleBase : public OpHandleBase {
continue; continue;
} }
PADDLE_ENFORCE(cudaSetDevice(dev_id)); PADDLE_ENFORCE_CUDA_SUCCESS(cudaSetDevice(dev_id));
PADDLE_ENFORCE(cudaEventCreateWithFlags(&inter_events_[dev_id], PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventCreateWithFlags(
cudaEventDisableTiming)); &inter_events_[dev_id], cudaEventDisableTiming));
PADDLE_ENFORCE(cudaEventCreateWithFlags(&exter_events_[dev_id], PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventCreateWithFlags(
cudaEventDisableTiming)); &exter_events_[dev_id], cudaEventDisableTiming));
VLOG(10) << "Create events on dev_id:" << dev_id VLOG(10) << "Create events on dev_id:" << dev_id
<< ", inter_event:" << &inter_events_[dev_id] << ", inter_event:" << &inter_events_[dev_id]
<< ", exter_event:" << &exter_events_[dev_id]; << ", exter_event:" << &exter_events_[dev_id];
...@@ -102,7 +108,10 @@ class NCCLOpHandleBase : public OpHandleBase { ...@@ -102,7 +108,10 @@ class NCCLOpHandleBase : public OpHandleBase {
void FlatNCCLAllReduce(platform::Place place, const void* sendbuff, void FlatNCCLAllReduce(platform::Place place, const void* sendbuff,
void* recvbuff, size_t count, ncclDataType_t datatype, void* recvbuff, size_t count, ncclDataType_t datatype,
ncclRedOp_t op) { ncclRedOp_t op) {
PADDLE_ENFORCE(run_order_ >= 0, "run_order must > 0"); PADDLE_ENFORCE_GE(
run_order_, 0,
platform::errors::InvalidArgument(
"The argument run_order_ must be >= 0, but got %d.", run_order_));
auto flat_nccl_ctxs = nccl_ctxs_->GetFlatCtx(run_order_); auto flat_nccl_ctxs = nccl_ctxs_->GetFlatCtx(run_order_);
int dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device; int dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device;
auto& nccl_ctx = flat_nccl_ctxs->at(dev_id); auto& nccl_ctx = flat_nccl_ctxs->at(dev_id);
...@@ -113,14 +122,17 @@ class NCCLOpHandleBase : public OpHandleBase { ...@@ -113,14 +122,17 @@ class NCCLOpHandleBase : public OpHandleBase {
<< ", dev_id:" << dev_id << ", dtype:" << datatype << ", dev_id:" << dev_id << ", dtype:" << datatype
<< ", place:" << place; << ", place:" << place;
PADDLE_ENFORCE(platform::dynload::ncclAllReduce( PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce(
sendbuff, recvbuff, count, datatype, op, comm, stream)); sendbuff, recvbuff, count, datatype, op, comm, stream));
} }
void NCCLAllReduce(platform::Place place, const void* sendbuff, void NCCLAllReduce(platform::Place place, const void* sendbuff,
void* recvbuff, size_t count, ncclDataType_t datatype, void* recvbuff, size_t count, ncclDataType_t datatype,
ncclRedOp_t op) { ncclRedOp_t op) {
PADDLE_ENFORCE(run_order_ >= 0, "run_order must > 0"); PADDLE_ENFORCE_GE(
run_order_, 0,
platform::errors::InvalidArgument(
"The argument run_order_ must be >= 0, but got %d.", run_order_));
if (!use_hierarchical_allreduce_) { if (!use_hierarchical_allreduce_) {
FlatNCCLAllReduce(place, sendbuff, recvbuff, count, datatype, op); FlatNCCLAllReduce(place, sendbuff, recvbuff, count, datatype, op);
return; return;
...@@ -132,7 +144,10 @@ class NCCLOpHandleBase : public OpHandleBase { ...@@ -132,7 +144,10 @@ class NCCLOpHandleBase : public OpHandleBase {
void HierarchicalAllReduce(platform::Place place, const void* sendbuff, void HierarchicalAllReduce(platform::Place place, const void* sendbuff,
void* recvbuff, size_t count, void* recvbuff, size_t count,
ncclDataType_t datatype, ncclRedOp_t op) { ncclDataType_t datatype, ncclRedOp_t op) {
PADDLE_ENFORCE(run_order_ >= 0, "run_order must > 0"); PADDLE_ENFORCE_GE(
run_order_, 0,
platform::errors::InvalidArgument(
"The argument run_order_ must be >= 0, but got %d.", run_order_));
InterReduce(place, sendbuff, recvbuff, count, datatype, op); InterReduce(place, sendbuff, recvbuff, count, datatype, op);
// When a trainer is not in exter allreduce ring // When a trainer is not in exter allreduce ring
// they need not to call this. // they need not to call this.
...@@ -157,14 +172,13 @@ class NCCLOpHandleBase : public OpHandleBase { ...@@ -157,14 +172,13 @@ class NCCLOpHandleBase : public OpHandleBase {
<< ", dtype:" << datatype << ", place:" << place << ", dtype:" << datatype << ", place:" << place
<< ", stream:" << stream; << ", stream:" << stream;
PADDLE_ENFORCE(platform::dynload::ncclReduce( PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclReduce(
sendbuff, recvbuff, count, datatype, ncclSum, 0, comm, stream)); sendbuff, recvbuff, count, datatype, ncclSum, 0, comm, stream));
cudaEventRecord(inter_events_.at(dev_id), stream); cudaEventRecord(inter_events_.at(dev_id), stream);
if (FLAGS_sync_nccl_allreduce) { if (FLAGS_sync_nccl_allreduce) {
PADDLE_ENFORCE(cudaStreamSynchronize(stream), PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream));
"sync HierarchicalAllReduce inter stream error");
} }
} }
...@@ -172,7 +186,9 @@ class NCCLOpHandleBase : public OpHandleBase { ...@@ -172,7 +186,9 @@ class NCCLOpHandleBase : public OpHandleBase {
void* recvbuff, size_t count, ncclDataType_t datatype, void* recvbuff, size_t count, ncclDataType_t datatype,
ncclRedOp_t op) { ncclRedOp_t op) {
auto nccl_ctxs = nccl_ctxs_->GetHierarchicalExterCtx(run_order_); auto nccl_ctxs = nccl_ctxs_->GetHierarchicalExterCtx(run_order_);
PADDLE_ENFORCE(nccl_ctxs_, "can't get exter %d nccl_ctxs", run_order_); PADDLE_ENFORCE_NOT_NULL(
nccl_ctxs_, platform::errors::NotFound(
"Can't get exter %d nccl contexts.", run_order_));
int dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device; int dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device;
auto& nccl_ctx = nccl_ctxs->at(dev_id); auto& nccl_ctx = nccl_ctxs->at(dev_id);
auto stream = nccl_ctx.stream(); auto stream = nccl_ctx.stream();
...@@ -185,14 +201,13 @@ class NCCLOpHandleBase : public OpHandleBase { ...@@ -185,14 +201,13 @@ class NCCLOpHandleBase : public OpHandleBase {
cudaStreamWaitEvent(stream, inter_events_.at(dev_id), 0); cudaStreamWaitEvent(stream, inter_events_.at(dev_id), 0);
PADDLE_ENFORCE(platform::dynload::ncclAllReduce( PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce(
sendbuff, recvbuff, count, datatype, op, comm, stream)); sendbuff, recvbuff, count, datatype, op, comm, stream));
cudaEventRecord(exter_events_.at(dev_id), stream); cudaEventRecord(exter_events_.at(dev_id), stream);
if (FLAGS_sync_nccl_allreduce) { if (FLAGS_sync_nccl_allreduce) {
PADDLE_ENFORCE(cudaStreamSynchronize(stream), PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream));
"sync HierarchicalAllReduce exter stream error");
} }
} }
...@@ -210,8 +225,8 @@ class NCCLOpHandleBase : public OpHandleBase { ...@@ -210,8 +225,8 @@ class NCCLOpHandleBase : public OpHandleBase {
<< ", stream:" << stream; << ", stream:" << stream;
cudaStreamWaitEvent(stream, exter_events_.at(dev_id), 0); cudaStreamWaitEvent(stream, exter_events_.at(dev_id), 0);
PADDLE_ENFORCE(platform::dynload::ncclBcast(sendbuff, count, datatype, 0, PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclBcast(
comm, stream)); sendbuff, count, datatype, 0, comm, stream));
} }
protected: protected:
......
...@@ -47,8 +47,8 @@ void OpHandleBase::InitCUDA() { ...@@ -47,8 +47,8 @@ void OpHandleBase::InitCUDA() {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
for (auto &p : dev_ctxes_) { for (auto &p : dev_ctxes_) {
int dev_id = BOOST_GET_CONST(platform::CUDAPlace, p.first).device; int dev_id = BOOST_GET_CONST(platform::CUDAPlace, p.first).device;
PADDLE_ENFORCE(cudaSetDevice(dev_id)); PADDLE_ENFORCE_CUDA_SUCCESS(cudaSetDevice(dev_id));
PADDLE_ENFORCE( PADDLE_ENFORCE_CUDA_SUCCESS(
cudaEventCreateWithFlags(&events_[dev_id], cudaEventDisableTiming)); cudaEventCreateWithFlags(&events_[dev_id], cudaEventDisableTiming));
} }
if (IsMultiDeviceTransfer() && dev_ctxes_.size() > 0) { if (IsMultiDeviceTransfer() && dev_ctxes_.size() > 0) {
...@@ -62,17 +62,22 @@ void OpHandleBase::InitCUDA() { ...@@ -62,17 +62,22 @@ void OpHandleBase::InitCUDA() {
} }
} }
} else { } else {
PADDLE_ENFORCE_EQ(dev_ctxes_.size(), 1UL, PADDLE_ENFORCE_EQ(
"%s should have only one dev_ctx.", Name()); dev_ctxes_.size(), 1UL,
platform::errors::InvalidArgument(
"Operator %s should have only one dev_ctx, but got %d.", Name(),
dev_ctxes_.size()));
auto &place = dev_ctxes_.begin()->first; auto &place = dev_ctxes_.begin()->first;
int dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device; int dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device;
for (auto &out_var : outputs_) { for (auto &out_var : outputs_) {
auto *out_var_handle = dynamic_cast<VarHandle *>(out_var); auto *out_var_handle = dynamic_cast<VarHandle *>(out_var);
if (out_var_handle) { if (out_var_handle) {
PADDLE_ENFORCE(platform::is_same_place(place, out_var_handle->place()), PADDLE_ENFORCE_EQ(
platform::is_same_place(place, out_var_handle->place()), true,
platform::errors::InvalidArgument(
"The place of output(%s) is not consistent with the " "The place of output(%s) is not consistent with the "
"place of current op(%s).", "place of current op(%s).",
out_var_handle->Name(), Name()); out_var_handle->Name(), Name()));
out_var_handle->SetGenerateEvent(events_.at(dev_id)); out_var_handle->SetGenerateEvent(events_.at(dev_id));
} }
} }
...@@ -86,7 +91,10 @@ void OpHandleBase::Run(bool use_cuda) { ...@@ -86,7 +91,10 @@ void OpHandleBase::Run(bool use_cuda) {
InitCUDA(); InitCUDA();
} }
#else #else
PADDLE_ENFORCE(!use_cuda); PADDLE_ENFORCE_EQ(use_cuda, false,
platform::errors::InvalidArgument(
"Argument use_cuda should be false when Paddle is not "
"compiled with CUDA."));
#endif #endif
// skip running current op, used with inplace_addto_op_pass // skip running current op, used with inplace_addto_op_pass
...@@ -100,17 +108,20 @@ void OpHandleBase::Run(bool use_cuda) { ...@@ -100,17 +108,20 @@ void OpHandleBase::Run(bool use_cuda) {
void OpHandleBase::RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx) { void OpHandleBase::RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx) {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
PADDLE_ENFORCE_NOT_NULL(waited_ctx); PADDLE_ENFORCE_NOT_NULL(waited_ctx, platform::errors::InvalidArgument(
"Argument waited_ctx is NULL."));
if (platform::is_cpu_place(waited_ctx->GetPlace()) || events_.empty()) { if (platform::is_cpu_place(waited_ctx->GetPlace()) || events_.empty()) {
for (auto &dev_ctx : dev_ctxes_) { for (auto &dev_ctx : dev_ctxes_) {
PADDLE_ENFORCE_NOT_NULL(dev_ctx.second); PADDLE_ENFORCE_NOT_NULL(
dev_ctx.second,
platform::errors::InvalidArgument("The device context is NULL."));
dev_ctx.second->Wait(); dev_ctx.second->Wait();
} }
} else { } else {
auto stream = auto stream =
static_cast<platform::CUDADeviceContext *>(waited_ctx)->stream(); static_cast<platform::CUDADeviceContext *>(waited_ctx)->stream();
for (auto &ev : events_) { for (auto &ev : events_) {
PADDLE_ENFORCE(cudaStreamWaitEvent(stream, ev.second, 0)); PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamWaitEvent(stream, ev.second, 0));
} }
} }
#else #else
...@@ -145,10 +156,11 @@ void OpHandleBase::WaitInputVarGenerated() { ...@@ -145,10 +156,11 @@ void OpHandleBase::WaitInputVarGenerated() {
auto stream = auto stream =
static_cast<platform::CUDADeviceContext *>(dev_ctxes_.at(place)) static_cast<platform::CUDADeviceContext *>(dev_ctxes_.at(place))
->stream(); ->stream();
PADDLE_ENFORCE( PADDLE_ENFORCE_CUDA_SUCCESS(
cudaStreamWaitEvent(stream, in_var_handle->GetEvent(), 0)); cudaStreamWaitEvent(stream, in_var_handle->GetEvent(), 0));
#else #else
PADDLE_THROW("Doesn't compile the GPU."); PADDLE_THROW(
platform::errors::PreconditionNotMet("Not compiled with CUDA."));
#endif #endif
} }
// There are nothing to do when the place is CPUPlace. // There are nothing to do when the place is CPUPlace.
...@@ -169,10 +181,11 @@ void OpHandleBase::WaitInputVarGenerated(const platform::Place &place) { ...@@ -169,10 +181,11 @@ void OpHandleBase::WaitInputVarGenerated(const platform::Place &place) {
auto stream = static_cast<platform::CUDADeviceContext *>( auto stream = static_cast<platform::CUDADeviceContext *>(
dev_ctxes_.at(in_var_handle->place())) dev_ctxes_.at(in_var_handle->place()))
->stream(); ->stream();
PADDLE_ENFORCE( PADDLE_ENFORCE_CUDA_SUCCESS(
cudaStreamWaitEvent(stream, in_var_handle->GetEvent(), 0)); cudaStreamWaitEvent(stream, in_var_handle->GetEvent(), 0));
#else #else
PADDLE_THROW("Doesn't compile the GPU."); PADDLE_THROW(
platform::errors::PreconditionNotMet("Not compiled with CUDA."));
#endif #endif
} }
// There are nothing to do when the place is CPUPlace. // There are nothing to do when the place is CPUPlace.
...@@ -242,7 +255,9 @@ void OpHandleBase::SetLocalExecScopes( ...@@ -242,7 +255,9 @@ void OpHandleBase::SetLocalExecScopes(
auto scopes = GetLocalScopes(); auto scopes = GetLocalScopes();
for (auto *scope : scopes) { for (auto *scope : scopes) {
auto iter = scope_map.find(scope); auto iter = scope_map.find(scope);
PADDLE_ENFORCE(iter != scope_map.end(), "Local scope not found"); PADDLE_ENFORCE_NE(
iter, scope_map.end(),
platform::errors::NotFound("Local scope not found in scope map."));
local_exec_scopes_.emplace_back(iter->second); local_exec_scopes_.emplace_back(iter->second);
} }
} }
......
...@@ -21,6 +21,7 @@ limitations under the License. */ ...@@ -21,6 +21,7 @@ limitations under the License. */
#include <unordered_map> #include <unordered_map>
#include <unordered_set> #include <unordered_set>
#include <vector> #include <vector>
#include "paddle/fluid/framework/grad_op_desc_maker.h" #include "paddle/fluid/framework/grad_op_desc_maker.h"
#include "paddle/fluid/framework/inplace_op_inference.h" #include "paddle/fluid/framework/inplace_op_inference.h"
#include "paddle/fluid/framework/no_need_buffer_vars_inference.h" #include "paddle/fluid/framework/no_need_buffer_vars_inference.h"
...@@ -186,19 +187,20 @@ struct OpInfoFiller<T, kOpProtoAndCheckerMaker> { ...@@ -186,19 +187,20 @@ struct OpInfoFiller<T, kOpProtoAndCheckerMaker> {
void operator()(const char* op_type, OpInfo* info) const { void operator()(const char* op_type, OpInfo* info) const {
PADDLE_ENFORCE_EQ(info->proto_, nullptr, PADDLE_ENFORCE_EQ(info->proto_, nullptr,
platform::errors::AlreadyExists( platform::errors::AlreadyExists(
"OpProto of %s has been registered", op_type)); "OpProto of %s has been registered.", op_type));
PADDLE_ENFORCE_EQ(info->checker_, nullptr, PADDLE_ENFORCE_EQ(info->checker_, nullptr,
platform::errors::AlreadyExists( platform::errors::AlreadyExists(
"OpAttrChecker of %s has been registered", op_type)); "OpAttrChecker of %s has been registered.", op_type));
info->proto_ = new proto::OpProto; info->proto_ = new proto::OpProto;
info->checker_ = new OpAttrChecker(); info->checker_ = new OpAttrChecker();
T maker; T maker;
maker(info->proto_, info->checker_); maker(info->proto_, info->checker_);
info->proto_->set_type(op_type); info->proto_->set_type(op_type);
PADDLE_ENFORCE( PADDLE_ENFORCE_EQ(
info->proto_->IsInitialized(), info->proto_->IsInitialized(), true,
"Fail to initialize %s's OpProto, because %s is not initialized", platform::errors::PreconditionNotMet(
op_type, info->proto_->InitializationErrorString()); "Fail to initialize %s's OpProto, because %s is not initialized.",
op_type, info->proto_->InitializationErrorString()));
} }
}; };
......
...@@ -16,6 +16,7 @@ ...@@ -16,6 +16,7 @@
#include <algorithm> #include <algorithm>
#include <map> #include <map>
#include <vector> #include <vector>
#include "paddle/fluid/framework/details/reduce_and_gather.h" #include "paddle/fluid/framework/details/reduce_and_gather.h"
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/selected_rows.h"
...@@ -32,9 +33,13 @@ struct ReduceLoDTensor { ...@@ -32,9 +33,13 @@ struct ReduceLoDTensor {
template <typename T> template <typename T>
void apply() const { void apply() const {
PADDLE_ENFORCE(!src_tensors_.empty()); PADDLE_ENFORCE_NE(src_tensors_.empty(), true,
platform::errors::InvalidArgument(
"The number of tensors to be reduced is 0."));
auto &t0 = *src_tensors_[0]; auto &t0 = *src_tensors_[0];
PADDLE_ENFORCE_NE(t0.numel(), 0); PADDLE_ENFORCE_NE(t0.numel(), 0,
platform::errors::InvalidArgument(
"The size of first tensor to be reduced is 0."));
dst_tensor_.Resize(t0.dims()); dst_tensor_.Resize(t0.dims());
T *dst = dst_tensor_.mutable_data<T>(platform::CPUPlace()); T *dst = dst_tensor_.mutable_data<T>(platform::CPUPlace());
...@@ -45,8 +50,19 @@ struct ReduceLoDTensor { ...@@ -45,8 +50,19 @@ struct ReduceLoDTensor {
continue; continue;
} }
PADDLE_ENFORCE_EQ(t.dims(), t0.dims()); PADDLE_ENFORCE_EQ(t.dims(), t0.dims(),
PADDLE_ENFORCE_EQ(t.type(), t0.type()); platform::errors::InvalidArgument(
"The shape of tensors to be reduced must be "
"consistent. The shape of current tensor is %s, "
"but the shape of the first tensor is %s.",
t.dims(), t0.dims()));
PADDLE_ENFORCE_EQ(t.type(), t0.type(),
platform::errors::InvalidArgument(
"The type of tensors to be reduced must be "
"consistent. The type of current tensor is %s, "
"but the type of the first tensor is %s.",
t.type(), t0.type()));
std::transform(t.data<T>(), t.data<T>() + t.numel(), dst, dst, std::transform(t.data<T>(), t.data<T>() + t.numel(), dst, dst,
[](T a, T b) -> T { return a + b; }); [](T a, T b) -> T { return a + b; });
} }
...@@ -88,7 +104,9 @@ struct GatherLocalSelectedRowsFunctor { ...@@ -88,7 +104,9 @@ struct GatherLocalSelectedRowsFunctor {
in_places_(in_places), in_places_(in_places),
out_place_(out_place), out_place_(out_place),
dst_selected_rows_(dst_selected_rows) { dst_selected_rows_(dst_selected_rows) {
PADDLE_ENFORCE_EQ(src_selected_rows.empty(), false); PADDLE_ENFORCE_NE(src_selected_rows.empty(), true,
platform::errors::InvalidArgument(
"The number of selected_rows to be gathered is 0."));
std::vector<int64_t> out_rows; std::vector<int64_t> out_rows;
......
...@@ -13,7 +13,9 @@ ...@@ -13,7 +13,9 @@
// limitations under the License. // limitations under the License.
#include "paddle/fluid/framework/details/reduce_op_handle.h" #include "paddle/fluid/framework/details/reduce_op_handle.h"
#include <memory> #include <memory>
#include "paddle/fluid/framework/details/container_cast.h" #include "paddle/fluid/framework/details/container_cast.h"
#include "paddle/fluid/framework/details/reduce_and_gather.h" #include "paddle/fluid/framework/details/reduce_and_gather.h"
#include "paddle/fluid/framework/details/variable_visitor.h" #include "paddle/fluid/framework/details/variable_visitor.h"
...@@ -116,8 +118,15 @@ void ReduceOpHandle::GatherSelectedRows( ...@@ -116,8 +118,15 @@ void ReduceOpHandle::GatherSelectedRows(
merged_dev_ctx->Wait(); merged_dev_ctx->Wait();
scope->EraseVars(std::vector<std::string>{gathered_var_name}); scope->EraseVars(std::vector<std::string>{gathered_var_name});
PADDLE_ENFORCE(client->Gather(vars, &remote, *merged_dev_ctx, scope)); PADDLE_ENFORCE_EQ(
PADDLE_ENFORCE(remote.size() == vars.size()); client->Gather(vars, &remote, *merged_dev_ctx, scope), true,
platform::errors::PreconditionNotMet("Gather SelectedRows failed."));
PADDLE_ENFORCE_EQ(remote.size(), vars.size(),
platform::errors::PreconditionNotMet(
"The number of remotes should be equal to the number "
"of variables to be gathered, but got the number of "
"remotes is %d and the number of variables is %d.",
remote.size(), vars.size()));
// 4. merged local selected rows. // 4. merged local selected rows.
std::vector<const SelectedRows *> all; std::vector<const SelectedRows *> all;
...@@ -151,14 +160,19 @@ void ReduceOpHandle::RunImpl() { ...@@ -151,14 +160,19 @@ void ReduceOpHandle::RunImpl() {
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
in_var_handles.size(), places_.size(), in_var_handles.size(), places_.size(),
"The number of output should equal to the number of places."); platform::errors::InvalidArgument(
"The number of inputs should equal to the number of places, but got "
"the number of inputs is %d and the number of places is %d.",
in_var_handles.size(), places_.size()));
VarHandle *out_var_handle; VarHandle *out_var_handle;
{ {
auto out_var_handles = DynamicCast<VarHandle>(outputs_); auto out_var_handles = DynamicCast<VarHandle>(outputs_);
PADDLE_ENFORCE_EQ(out_var_handles.size(), 1UL, PADDLE_ENFORCE_EQ(out_var_handles.size(), 1UL,
"The number of output should be one."); platform::errors::InvalidArgument(
"The number of output should be one, but got %d.",
out_var_handles.size()));
out_var_handle = out_var_handles.front(); out_var_handle = out_var_handles.front();
} }
...@@ -168,7 +182,10 @@ void ReduceOpHandle::RunImpl() { ...@@ -168,7 +182,10 @@ void ReduceOpHandle::RunImpl() {
auto pre_in_var = auto pre_in_var =
var_scopes.at(in_0_handle->scope_idx())->FindVar(in_0_handle->name()); var_scopes.at(in_0_handle->scope_idx())->FindVar(in_0_handle->name());
PADDLE_ENFORCE_NOT_NULL(pre_in_var);
PADDLE_ENFORCE_NOT_NULL(pre_in_var, platform::errors::NotFound(
"Variable %s is not found in scope.",
in_0_handle->name()));
// NOTE: The Places of all input tensor must be all on CPU or all on GPU. // NOTE: The Places of all input tensor must be all on CPU or all on GPU.
std::vector<platform::Place> in_places; // used to get dev_ctx std::vector<platform::Place> in_places; // used to get dev_ctx
...@@ -176,21 +193,29 @@ void ReduceOpHandle::RunImpl() { ...@@ -176,21 +193,29 @@ void ReduceOpHandle::RunImpl() {
in_places.emplace_back(in_handle->place()); in_places.emplace_back(in_handle->place());
auto in_var = auto in_var =
var_scopes.at(in_handle->scope_idx())->FindVar(in_handle->name()); var_scopes.at(in_handle->scope_idx())->FindVar(in_handle->name());
PADDLE_ENFORCE_NOT_NULL(in_var);
PADDLE_ENFORCE_NOT_NULL(
in_var, platform::errors::NotFound("Variable %s is not found in scope.",
in_handle->name()));
VariableVisitor::EnforceShapeAndDTypeEQ(*pre_in_var, *in_var); VariableVisitor::EnforceShapeAndDTypeEQ(*pre_in_var, *in_var);
} }
auto out_var = var_scopes.at(out_var_handle->scope_idx()) auto out_var = var_scopes.at(out_var_handle->scope_idx())
->FindVar(out_var_handle->name()); ->FindVar(out_var_handle->name());
PADDLE_ENFORCE_NOT_NULL(out_var);
PADDLE_ENFORCE_NOT_NULL(
out_var, platform::errors::NotFound("Variable %s is not found in scope.",
out_var_handle->name()));
// NOTE: The tensors' Place of input and output must be all on GPU or all on // NOTE: The tensors' Place of input and output must be all on GPU or all on
// CPU. // CPU.
auto in_p = VariableVisitor::GetMutableTensor(pre_in_var).place(); auto in_p = VariableVisitor::GetMutableTensor(pre_in_var).place();
platform::Place t_out_p; platform::Place t_out_p;
if (platform::is_gpu_place(in_p)) { if (platform::is_gpu_place(in_p)) {
PADDLE_ENFORCE(platform::is_gpu_place(out_var_handle->place()), PADDLE_ENFORCE_EQ(platform::is_gpu_place(out_var_handle->place()), true,
"Places of input and output must be all on GPU."); platform::errors::PreconditionNotMet(
"Places of input and output must be all on GPU."));
t_out_p = out_var_handle->place(); t_out_p = out_var_handle->place();
} else { } else {
t_out_p = platform::CPUPlace(); t_out_p = platform::CPUPlace();
...@@ -229,7 +254,10 @@ void ReduceOpHandle::RunImpl() { ...@@ -229,7 +254,10 @@ void ReduceOpHandle::RunImpl() {
in_selected_rows, in_places, dev_ctxes_, out_var_handle, t_out_p, in_selected_rows, in_places, dev_ctxes_, out_var_handle, t_out_p,
out_var->GetMutable<framework::SelectedRows>()); out_var->GetMutable<framework::SelectedRows>());
} else { } else {
PADDLE_THROW("only support double or float when gather SelectedRows"); PADDLE_THROW(platform::errors::Unimplemented(
"Only support double or float when gather SelectedRows, but got "
"%s.",
framework::DataTypeToString(in_selected_rows[0]->value().type())));
} }
#endif #endif
}); });
...@@ -292,7 +320,7 @@ void ReduceOpHandle::RunImpl() { ...@@ -292,7 +320,7 @@ void ReduceOpHandle::RunImpl() {
size_t numel = static_cast<size_t>(lod_tensor.numel()); size_t numel = static_cast<size_t>(lod_tensor.numel());
all_reduce_calls.emplace_back( all_reduce_calls.emplace_back(
[buffer, recvbuffer, type, numel, root_id, &nccl_ctx] { [buffer, recvbuffer, type, numel, root_id, &nccl_ctx] {
PADDLE_ENFORCE(platform::dynload::ncclReduce( PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclReduce(
buffer, recvbuffer, numel, static_cast<ncclDataType_t>(type), buffer, recvbuffer, numel, static_cast<ncclDataType_t>(type),
ncclSum, root_id, nccl_ctx.comm_, nccl_ctx.stream())); ncclSum, root_id, nccl_ctx.comm_, nccl_ctx.stream()));
}); });
...@@ -306,10 +334,13 @@ void ReduceOpHandle::RunImpl() { ...@@ -306,10 +334,13 @@ void ReduceOpHandle::RunImpl() {
} }
}); });
#else #else
PADDLE_THROW("CUDA is not enabled."); PADDLE_THROW(
platform::errors::PreconditionNotMet("Not compiled with CUDA."));
#endif #endif
} else { } else {
PADDLE_THROW("Place should be CPUPlace or CUDAPlace."); PADDLE_THROW(platform::errors::InvalidArgument(
"The place of tensor should be CPUPlace or CUDAPlace, but got %s.",
lod_tensors[0]->place()));
} }
} }
} }
......
...@@ -13,7 +13,9 @@ ...@@ -13,7 +13,9 @@
// limitations under the License. // limitations under the License.
#include "paddle/fluid/framework/details/reduce_op_handle.h" #include "paddle/fluid/framework/details/reduce_op_handle.h"
#include <unordered_map> #include <unordered_map>
#include "gtest/gtest.h" #include "gtest/gtest.h"
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
...@@ -69,7 +71,8 @@ struct TestReduceOpHandle { ...@@ -69,7 +71,8 @@ struct TestReduceOpHandle {
} }
nccl_ctxs_.reset(new platform::NCCLContextMap(gpu_list_)); nccl_ctxs_.reset(new platform::NCCLContextMap(gpu_list_));
#else #else
PADDLE_THROW("CUDA is not support."); PADDLE_THROW(
platform::errors::PreconditionNotMet("Not compiled with NCLL."));
#endif #endif
} else { } else {
int count = 8; int count = 8;
...@@ -103,7 +106,8 @@ struct TestReduceOpHandle { ...@@ -103,7 +106,8 @@ struct TestReduceOpHandle {
op_handle_.reset(new ReduceOpHandle(nodes.back().get(), local_scopes_, op_handle_.reset(new ReduceOpHandle(nodes.back().get(), local_scopes_,
gpu_list_, nccl_ctxs_.get())); gpu_list_, nccl_ctxs_.get()));
#else #else
PADDLE_THROW("CUDA is not support."); PADDLE_THROW(
platform::errors::PreconditionNotMet("Not compiled with NCLL."));
#endif #endif
} else { } else {
#if defined(PADDLE_WITH_NCCL) #if defined(PADDLE_WITH_NCCL)
...@@ -164,7 +168,10 @@ struct TestReduceOpHandle { ...@@ -164,7 +168,10 @@ struct TestReduceOpHandle {
for (size_t input_scope_idx = 0; input_scope_idx < gpu_list_.size(); for (size_t input_scope_idx = 0; input_scope_idx < gpu_list_.size();
++input_scope_idx) { ++input_scope_idx) {
auto in_var = param_scopes_[input_scope_idx]->FindVar("input"); auto in_var = param_scopes_[input_scope_idx]->FindVar("input");
PADDLE_ENFORCE_NOT_NULL(in_var);
PADDLE_ENFORCE_NOT_NULL(
in_var, platform::errors::NotFound(
"Variable %s is not found in scope.", "input"));
auto in_selected_rows = in_var->GetMutable<f::SelectedRows>(); auto in_selected_rows = in_var->GetMutable<f::SelectedRows>();
auto value = in_selected_rows->mutable_value(); auto value = in_selected_rows->mutable_value();
value->mutable_data<float>(kDims, gpu_list_[input_scope_idx]); value->mutable_data<float>(kDims, gpu_list_[input_scope_idx]);
...@@ -178,7 +185,9 @@ struct TestReduceOpHandle { ...@@ -178,7 +185,9 @@ struct TestReduceOpHandle {
} }
auto out_var = param_scopes_[output_scope_idx]->FindVar("out"); auto out_var = param_scopes_[output_scope_idx]->FindVar("out");
PADDLE_ENFORCE_NOT_NULL(out_var); PADDLE_ENFORCE_NOT_NULL(out_var,
platform::errors::NotFound(
"Variable %s is not found in scope.", "out"));
auto out_selected_rows = out_var->GetMutable<f::SelectedRows>(); auto out_selected_rows = out_var->GetMutable<f::SelectedRows>();
auto in_var = param_scopes_[output_scope_idx]->FindVar("input"); auto in_var = param_scopes_[output_scope_idx]->FindVar("input");
...@@ -196,9 +205,18 @@ struct TestReduceOpHandle { ...@@ -196,9 +205,18 @@ struct TestReduceOpHandle {
auto &out_select_rows = out_var->Get<f::SelectedRows>(); auto &out_select_rows = out_var->Get<f::SelectedRows>();
auto rt = out_select_rows.value(); auto rt = out_select_rows.value();
PADDLE_ENFORCE_EQ(out_select_rows.height(), height, "height is not equal."); PADDLE_ENFORCE_EQ(out_select_rows.height(), height,
platform::errors::InvalidArgument(
"The height of SelectedRows is not equal to "
"the expected, expect %d, but got %d.",
height, out_select_rows.height()));
for (size_t k = 0; k < out_select_rows.rows().size(); ++k) { for (size_t k = 0; k < out_select_rows.rows().size(); ++k) {
PADDLE_ENFORCE_EQ(out_select_rows.rows()[k], rows[k % rows.size()]); PADDLE_ENFORCE_EQ(
out_select_rows.rows()[k], rows[k % rows.size()],
platform::errors::InvalidArgument(
"The item at position %d of rows of SelectedRows is not equal to "
"the expected, expect %d, but got %d.",
k, rows[k % rows.size()], out_select_rows.rows()[k]));
} }
f::Tensor result_tensor; f::Tensor result_tensor;
...@@ -208,7 +226,7 @@ struct TestReduceOpHandle { ...@@ -208,7 +226,7 @@ struct TestReduceOpHandle {
for (int64_t j = 0; j < f::product(result_tensor.dims()); ++j) { for (int64_t j = 0; j < f::product(result_tensor.dims()); ++j) {
ASSERT_NEAR(ct[j], send_vector[j % send_vector.size()], 1e-5); ASSERT_NEAR(ct[j], send_vector[j % send_vector.size()], 1e-5);
} }
} }
void TestReduceLodTensors(size_t output_scope_idx) { void TestReduceLodTensors(size_t output_scope_idx) {
std::vector<float> send_vector(static_cast<size_t>(f::product(kDims))); std::vector<float> send_vector(static_cast<size_t>(f::product(kDims)));
...@@ -220,7 +238,9 @@ struct TestReduceOpHandle { ...@@ -220,7 +238,9 @@ struct TestReduceOpHandle {
for (size_t input_scope_idx = 0; input_scope_idx < gpu_list_.size(); for (size_t input_scope_idx = 0; input_scope_idx < gpu_list_.size();
++input_scope_idx) { ++input_scope_idx) {
auto in_var = param_scopes_[input_scope_idx]->FindVar("input"); auto in_var = param_scopes_[input_scope_idx]->FindVar("input");
PADDLE_ENFORCE_NOT_NULL(in_var); PADDLE_ENFORCE_NOT_NULL(
in_var, platform::errors::NotFound(
"Variable %s is not found in scope.", "input"));
auto in_lod_tensor = in_var->GetMutable<f::LoDTensor>(); auto in_lod_tensor = in_var->GetMutable<f::LoDTensor>();
in_lod_tensor->mutable_data<float>(kDims, gpu_list_[input_scope_idx]); in_lod_tensor->mutable_data<float>(kDims, gpu_list_[input_scope_idx]);
in_lod_tensor->set_lod(lod); in_lod_tensor->set_lod(lod);
...@@ -230,7 +250,9 @@ struct TestReduceOpHandle { ...@@ -230,7 +250,9 @@ struct TestReduceOpHandle {
} }
auto out_var = param_scopes_[output_scope_idx]->FindVar("out"); auto out_var = param_scopes_[output_scope_idx]->FindVar("out");
PADDLE_ENFORCE_NOT_NULL(out_var); PADDLE_ENFORCE_NOT_NULL(out_var,
platform::errors::NotFound(
"Variable %s is not found in scope.", "out"));
auto out_lodtensor = out_var->GetMutable<f::LoDTensor>(); auto out_lodtensor = out_var->GetMutable<f::LoDTensor>();
auto in_var = param_scopes_[output_scope_idx]->FindVar("input"); auto in_var = param_scopes_[output_scope_idx]->FindVar("input");
...@@ -254,7 +276,7 @@ struct TestReduceOpHandle { ...@@ -254,7 +276,7 @@ struct TestReduceOpHandle {
ASSERT_NEAR(ct[j], send_vector[j] * gpu_list_.size(), 1e-5); ASSERT_NEAR(ct[j], send_vector[j] * gpu_list_.size(), 1e-5);
} }
} }
}; };
TEST(ReduceTester, TestCPUReduceTestSelectedRows) { TEST(ReduceTester, TestCPUReduceTestSelectedRows) {
TestReduceOpHandle test_op; TestReduceOpHandle test_op;
......
...@@ -111,12 +111,11 @@ void ShareTensorBufferFunctor::CallOnce() { ...@@ -111,12 +111,11 @@ void ShareTensorBufferFunctor::CallOnce() {
auto *out_var = exec_scope_->FindVar(out_var_names_[i]); auto *out_var = exec_scope_->FindVar(out_var_names_[i]);
PADDLE_ENFORCE_NOT_NULL( PADDLE_ENFORCE_NOT_NULL(
in_var, platform::errors::NotFound( in_var, platform::errors::NotFound(
"The input variable(%s)to be inplaced should not be NULL.", "The variable(%s) to be inplaced is not found in scope.",
in_var_infos_[i]->Name())); in_var_infos_[i]->Name()));
PADDLE_ENFORCE_NOT_NULL( PADDLE_ENFORCE_NOT_NULL(
out_var, out_var, platform::errors::NotFound(
platform::errors::NotFound( "The variable(%s) to be inplaced is not found in scope.",
"The output variable(%s) to be inplaced should not be NULL.",
out_var_names_[i])); out_var_names_[i]));
PADDLE_ENFORCE_NE( PADDLE_ENFORCE_NE(
in_var, out_var, in_var, out_var,
......
...@@ -12,8 +12,10 @@ ...@@ -12,8 +12,10 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/framework/details/sparse_all_reduce_op_handle.h" #include "paddle/fluid/framework/details/sparse_all_reduce_op_handle.h"
#include <algorithm> #include <algorithm>
#include <utility> #include <utility>
#include "dgc/dgc.h" #include "dgc/dgc.h"
#include "paddle/fluid/framework/details/container_cast.h" #include "paddle/fluid/framework/details/container_cast.h"
#include "paddle/fluid/framework/details/reduce_and_gather.h" #include "paddle/fluid/framework/details/reduce_and_gather.h"
...@@ -38,18 +40,23 @@ SparseAllReduceOpHandle::SparseAllReduceOpHandle( ...@@ -38,18 +40,23 @@ SparseAllReduceOpHandle::SparseAllReduceOpHandle(
is_encoded_(is_encoded), is_encoded_(is_encoded),
nranks_(nranks) { nranks_(nranks) {
// TODO(gongwb) :polish them! // TODO(gongwb) :polish them!
PADDLE_ENFORCE_EQ(is_encoded, true); PADDLE_ENFORCE_EQ(is_encoded, true, platform::errors::InvalidArgument(
"The argument is_encoded is false."));
VLOG(1) << "Use dgc allreduce mode" VLOG(1) << "Use dgc allreduce mode"
<< ", nranks:" << nranks_; << ", nranks:" << nranks_;
PADDLE_ENFORCE_GT(local_scopes_.size(), 0); PADDLE_ENFORCE_GT(local_scopes_.size(), 0,
platform::errors::PreconditionNotMet(
"The number of local scope should be > 0, but got %zu.",
local_scopes_.size()));
auto nranks_name = g_dgc_nranks; auto nranks_name = g_dgc_nranks;
for (size_t i = 0; i < local_scopes_.size(); ++i) { for (size_t i = 0; i < local_scopes_.size(); ++i) {
auto *local_scope = local_scopes_[i]; auto *local_scope = local_scopes_[i];
auto nranks_var = local_scope->FindVar(nranks_name); auto nranks_var = local_scope->FindVar(nranks_name);
if (nranks_var == nullptr) {
PADDLE_THROW("not find nranks_var:%s", nranks_name); PADDLE_ENFORCE_NOT_NULL(
} nranks_var, platform::errors::NotFound(
"Variable %s is not found in scope.", nranks_name));
float *dgc_nranks = nranks_var->GetMutable<LoDTensor>()->data<float>(); float *dgc_nranks = nranks_var->GetMutable<LoDTensor>()->data<float>();
*dgc_nranks = nranks; *dgc_nranks = nranks;
...@@ -64,10 +71,18 @@ void SparseAllReduceOpHandle::RunImplEncoded() { ...@@ -64,10 +71,18 @@ void SparseAllReduceOpHandle::RunImplEncoded() {
auto out_var_handles = DynamicCast<VarHandle>(this->Outputs()); auto out_var_handles = DynamicCast<VarHandle>(this->Outputs());
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
in_var_handles.size(), places_.size(), in_var_handles.size(), places_.size(),
"The NoDummyInputSize should be equal to the number of places."); platform::errors::PreconditionNotMet(
"The number of input variables should be equal to the number of "
"places, but got the number of input variables is %zu and the "
"number of places is %zu.",
in_var_handles.size(), places_.size()));
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
in_var_handles.size(), out_var_handles.size(), in_var_handles.size(), out_var_handles.size(),
"The NoDummyInputSize and NoDummyOutputSize should be equal."); platform::errors::PreconditionNotMet(
"The number of input variables should be equal to the number of "
"output variables, but got the number of input variables is %zu and "
"the number of output variables is %zu.",
in_var_handles.size(), out_var_handles.size()));
std::vector<const LoDTensor *> ins; std::vector<const LoDTensor *> ins;
std::vector<LoDTensor *> gathers; std::vector<LoDTensor *> gathers;
...@@ -80,14 +95,17 @@ void SparseAllReduceOpHandle::RunImplEncoded() { ...@@ -80,14 +95,17 @@ void SparseAllReduceOpHandle::RunImplEncoded() {
auto encode_var_name = original_name + g_dgc_encoded; auto encode_var_name = original_name + g_dgc_encoded;
auto *in_var = local_scope->FindVar(encode_var_name); auto *in_var = local_scope->FindVar(encode_var_name);
PADDLE_ENFORCE_NOT_NULL(in_var, "%s should not be null", encode_var_name); PADDLE_ENFORCE_NOT_NULL(
in_var, platform::errors::NotFound("Variable %s is not found in scope.",
encode_var_name));
auto &in = in_var->Get<LoDTensor>(); auto &in = in_var->Get<LoDTensor>();
ins.emplace_back(&in); ins.emplace_back(&in);
auto gather_var_name = original_name + g_dgc_gather; auto gather_var_name = original_name + g_dgc_gather;
auto *gather_var = local_scope->FindVar(gather_var_name); auto *gather_var = local_scope->FindVar(gather_var_name);
PADDLE_ENFORCE_NOT_NULL(gather_var, "%s should not be null", PADDLE_ENFORCE_NOT_NULL(
gather_var_name); gather_var, platform::errors::NotFound(
"Variable %s is not found in scope.", gather_var_name));
auto *gather = gather_var->GetMutable<LoDTensor>(); auto *gather = gather_var->GetMutable<LoDTensor>();
gathers.emplace_back(gather); gathers.emplace_back(gather);
...@@ -100,14 +118,26 @@ void SparseAllReduceOpHandle::RunImplEncoded() { ...@@ -100,14 +118,26 @@ void SparseAllReduceOpHandle::RunImplEncoded() {
} }
} }
PADDLE_ENFORCE(platform::is_gpu_place(ins[0]->place())); PADDLE_ENFORCE_EQ(
PADDLE_ENFORCE(platform::is_gpu_place(outs[0]->place())); platform::is_gpu_place(ins[0]->place()), true,
PADDLE_ENFORCE(nccl_ctxs_, "nccl_ctxs should not be nullptr."); platform::errors::InvalidArgument(
"The place of input variable should be CUDAPlace, but got %s.",
ins[0]->place()));
PADDLE_ENFORCE_EQ(
platform::is_gpu_place(outs[0]->place()), true,
platform::errors::InvalidArgument(
"The place of output variable should be CUDAPlace, but got %s.",
outs[0]->place()));
PADDLE_ENFORCE_NOT_NULL(nccl_ctxs_, platform::errors::PreconditionNotMet(
"The nccl contexts are NULL."));
int dtype = -1; int dtype = -1;
size_t in_numel = 0; size_t in_numel = 0;
size_t out_numel = 0; size_t out_numel = 0;
PADDLE_ENFORCE(nranks_ > 1); PADDLE_ENFORCE_GT(
nranks_, 1,
platform::errors::PreconditionNotMet(
"The number of ranks should be > 1, but got %d.", nranks_));
std::vector<std::function<void()>> all_gather_calls; std::vector<std::function<void()>> all_gather_calls;
std::vector<std::function<void()>> sparse_reduce_calls; std::vector<std::function<void()>> sparse_reduce_calls;
...@@ -123,8 +153,16 @@ void SparseAllReduceOpHandle::RunImplEncoded() { ...@@ -123,8 +153,16 @@ void SparseAllReduceOpHandle::RunImplEncoded() {
dtype = (dtype == -1) ? platform::ToNCCLDataType(in.type()) : dtype; dtype = (dtype == -1) ? platform::ToNCCLDataType(in.type()) : dtype;
in_numel = (in_numel == 0) ? static_cast<size_t>(in.numel()) : in_numel; in_numel = (in_numel == 0) ? static_cast<size_t>(in.numel()) : in_numel;
PADDLE_ENFORCE(in_numel % 2 == 0); PADDLE_ENFORCE_EQ(in_numel % 2, 0,
PADDLE_ENFORCE(in_numel / 2 == static_cast<size_t>(k)); platform::errors::InvalidArgument(
"The number of elements of input variable should be "
"even, but got %zu.",
in_numel));
PADDLE_ENFORCE_EQ(in_numel / 2, static_cast<size_t>(k),
platform::errors::InvalidArgument(
"The number of elements of input variable should be "
"twice the k value (%zu), but got %zu.",
static_cast<size_t>(k), in_numel));
out_numel = (out_numel == 0) ? static_cast<size_t>(out.numel()) : out_numel; out_numel = (out_numel == 0) ? static_cast<size_t>(out.numel()) : out_numel;
int dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device; int dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device;
...@@ -154,7 +192,8 @@ void SparseAllReduceOpHandle::RunImplEncoded() { ...@@ -154,7 +192,8 @@ void SparseAllReduceOpHandle::RunImplEncoded() {
PADDLE_ENFORCE_EQ(paddle::communication::dgc::sparseReduce( PADDLE_ENFORCE_EQ(paddle::communication::dgc::sparseReduce(
gather_buff, k, out_tensor_buf, gather_buff, k, out_tensor_buf,
static_cast<int>(out_numel), nranks_, stream), static_cast<int>(out_numel), nranks_, stream),
true); true, platform::errors::Unavailable(
"Calling sparseReduce() failed."));
}); });
} }
...@@ -187,11 +226,16 @@ void SparseAllReduceOpHandle::SparseAllReduceFunc( ...@@ -187,11 +226,16 @@ void SparseAllReduceOpHandle::SparseAllReduceFunc(
int SparseAllReduceOpHandle::GetKValue(const std::string &grad_name) { int SparseAllReduceOpHandle::GetKValue(const std::string &grad_name) {
auto original_name = paddle::framework::GradOriginalVarName(grad_name); auto original_name = paddle::framework::GradOriginalVarName(grad_name);
auto var_name = original_name + g_dgc_k; auto var_name = original_name + g_dgc_k;
PADDLE_ENFORCE(local_scopes_.size() > 0); PADDLE_ENFORCE_GT(local_scopes_.size(), 0,
platform::errors::PreconditionNotMet(
"The number of local scope should be > 0, but got %zu.",
local_scopes_.size()));
auto *scope = local_exec_scopes_[0]; auto *scope = local_exec_scopes_[0];
auto var = scope->FindVar(var_name); auto var = scope->FindVar(var_name);
PADDLE_ENFORCE_NOT_NULL(var); PADDLE_ENFORCE_NOT_NULL(
var, platform::errors::NotFound("Variable %s is not found in scope.",
var_name));
auto tensor = var->Get<LoDTensor>().data<float>(); auto tensor = var->Get<LoDTensor>().data<float>();
return *tensor; return *tensor;
} }
...@@ -202,15 +246,22 @@ bool SparseAllReduceOpHandle::IsEncoded() { ...@@ -202,15 +246,22 @@ bool SparseAllReduceOpHandle::IsEncoded() {
} }
auto counter_name = g_dgc_counter_name; auto counter_name = g_dgc_counter_name;
auto step_name = g_dgc_rampup_begin_step; auto step_name = g_dgc_rampup_begin_step;
PADDLE_ENFORCE(local_scopes_.size() > 0);
PADDLE_ENFORCE_GT(local_scopes_.size(), 0,
platform::errors::PreconditionNotMet(
"The number of local scope should be > 0, but got %zu.",
local_scopes_.size()));
auto *local_scope = local_exec_scopes_[0]; auto *local_scope = local_exec_scopes_[0];
auto count_var = local_scope->FindVar(counter_name); auto count_var = local_scope->FindVar(counter_name);
auto step_var = local_scope->FindVar(step_name); auto step_var = local_scope->FindVar(step_name);
if (count_var == nullptr || step_var == nullptr) {
PADDLE_THROW("not find count_var:%s or step_var:%s", counter_name, PADDLE_ENFORCE_NOT_NULL(
step_var); count_var, platform::errors::NotFound(
} "Variable %s is not found in scope.", counter_name));
PADDLE_ENFORCE_NOT_NULL(
step_var, platform::errors::NotFound("Variable %s is not found in scope.",
step_name));
float count = *count_var->Get<LoDTensor>().data<float>(); float count = *count_var->Get<LoDTensor>().data<float>();
float step = *step_var->Get<LoDTensor>().data<float>(); float step = *step_var->Get<LoDTensor>().data<float>();
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册